From 12bd3377c840ef51f24b1db2e82bc59b26cea7f9 Mon Sep 17 00:00:00 2001 From: AmeenP Date: Fri, 8 May 2026 02:53:39 -0700 Subject: [PATCH 1/6] Add Dynamo renderer transport selector --- pyproject.toml | 5 ++++ tests/test_renderer_client.py | 42 ++++++++++++++++++++++++++++ uv.lock | 12 +++----- verifiers/clients/renderer_client.py | 3 ++ verifiers/types.py | 2 ++ 5 files changed, 56 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fcbf27566..9fb43812f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -132,6 +132,11 @@ prime-sandboxes = false renderers = false openenv-core = false +[tool.uv.sources] +# Pinned to renderers PR #11 until the next PyPI release lands; drop after. +# 1f3de65 = Dynamo chat nvext transport for token-in /chat/completions. +renderers = { git = "https://github.com/PrimeIntellect-ai/renderers.git", rev = "1f3de65" } + [tool.uv.extra-build-dependencies] flash-attn = [{ requirement = "torch", match-runtime = true }] diff --git a/tests/test_renderer_client.py b/tests/test_renderer_client.py index 9608c50a5..0877b282f 100644 --- a/tests/test_renderer_client.py +++ b/tests/test_renderer_client.py @@ -206,6 +206,48 @@ async def test_renderer_client_rejects_empty_dict_native_response(): await client.raise_from_native_response({}) +@pytest.mark.asyncio +async def test_get_native_response_passes_renderer_transport_to_generate( + monkeypatch: pytest.MonkeyPatch, +): + client = object.__new__(RendererClient) + client._client = object() + client._config = vf.ClientConfig( + client_type="renderer", + renderer_transport="dynamo_chat_nvext", + ) + renderer = _BridgeRenderer() + calls = [] + + client._get_renderer_or_pool = lambda model: renderer # type: ignore[method-assign] + + async def fake_get_incremental_prompt_ids(**kwargs): + return [10, 20] + + async def fake_generate(**kwargs): + calls.append(kwargs) + return {"content": "ok"} + + monkeypatch.setattr( + "verifiers.clients.renderer_client._get_incremental_prompt_ids", 
+ fake_get_incremental_prompt_ids, + ) + monkeypatch.setattr("verifiers.clients.renderer_client.generate", fake_generate) + + response = await client.get_native_response( + prompt=[{"role": "user", "content": "hi"}], + model="test-model", + sampling_args={"temperature": 0.1}, + tools=None, + state={"trajectory": []}, + ) + + assert response == {"content": "ok"} + assert len(calls) == 1 + assert calls[0]["transport"] == "dynamo_chat_nvext" + assert calls[0]["prompt_ids"] == [10, 20] + + @pytest.mark.asyncio async def test_from_native_response_uses_request_id_and_token_lengths(): """vLLM's /inference/v1/generate returns ``request_id`` (not ``id``) and diff --git a/uv.lock b/uv.lock index 11a8be1be..d4079186f 100644 --- a/uv.lock +++ b/uv.lock @@ -5263,8 +5263,8 @@ wheels = [ [[package]] name = "renderers" -version = "0.1.8.dev0" -source = { registry = "https://pypi.org/simple" } +version = "0.1.7" +source = { git = "https://github.com/PrimeIntellect-ai/renderers.git?rev=1f3de65#1f3de6506208233217f61671ebe81a00b0f5082a" } dependencies = [ { name = "jinja2" }, { name = "numpy" }, @@ -5273,10 +5273,6 @@ dependencies = [ { name = "tiktoken" }, { name = "transformers" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/50/de/a445036157af3367c6a962c13333427c83c08926934c541886eb87f9dcdf/renderers-0.1.8.dev0.tar.gz", hash = "sha256:71eef7bfa3d3f5849ba070d38cd89a1f6387ca7710824f2e50d8c05c9b1048b9", size = 210667, upload-time = "2026-05-12T17:48:45.352Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e7/33/936a38c7f20fbe096b751842ffc6ef254c9eb2223153aa860a122ce9a834/renderers-0.1.8.dev0-py3-none-any.whl", hash = "sha256:09bb35233f67599519c0ff6edfad469f0836a55a6b78e039cd8e7b5e527bdcb3", size = 98617, upload-time = "2026-05-12T17:48:44.222Z" }, -] [[package]] name = "requests" @@ -6690,7 +6686,7 @@ requires-dist = [ { name = "pyzmq", specifier = ">=27.1.0" }, { name = "reasoning-gym", marker = "extra == 'rg'" }, { name = "regex", specifier = 
"<2026.4.4" }, - { name = "renderers", marker = "extra == 'renderers'", specifier = ">=0.1.8.dev0" }, + { name = "renderers", marker = "extra == 'renderers'", git = "https://github.com/PrimeIntellect-ai/renderers.git?rev=1f3de65" }, { name = "requests" }, { name = "requests", marker = "extra == 'rl'" }, { name = "rich" }, @@ -6721,7 +6717,7 @@ dev = [ { name = "pytest-xdist", specifier = ">=3.8.0" }, { name = "python-dotenv", specifier = ">=1.0.0" }, { name = "reasoning-gym" }, - { name = "renderers", specifier = ">=0.1.8.dev0" }, + { name = "renderers", git = "https://github.com/PrimeIntellect-ai/renderers.git?rev=1f3de65" }, { name = "ruff" }, { name = "stagehand", specifier = ">=3.0.0" }, { name = "textarena" }, diff --git a/verifiers/clients/renderer_client.py b/verifiers/clients/renderer_client.py index 6ffdcf6cd..79f27f52f 100644 --- a/verifiers/clients/renderer_client.py +++ b/verifiers/clients/renderer_client.py @@ -589,6 +589,9 @@ async def get_native_response( or sampling_params.pop("cache_salt", None), priority=args.get("priority") or sampling_params.pop("priority", None), extra_headers=args.get("extra_headers"), + transport=getattr( + self._config, "renderer_transport", "prime_vllm_generate" + ), ) async def raise_from_native_response(self, response: dict[str, Any]) -> None: diff --git a/verifiers/types.py b/verifiers/types.py index 4d8e965f9..c4190aee7 100644 --- a/verifiers/types.py +++ b/verifiers/types.py @@ -56,6 +56,7 @@ "openai_responses", "anthropic_messages", ] +RendererTransport = Literal["prime_vllm_generate", "dynamo_chat_nvext"] MessageType = Literal["chat", "completion"] # deprecated @@ -1015,6 +1016,7 @@ class ClientConfig(BaseModel): renderer: str = "auto" renderer_model_name: str | None = None renderer_pool_size: int | None = None + renderer_transport: RendererTransport = "prime_vllm_generate" tool_parser: str | None = None reasoning_parser: str | None = None preserve_all_thinking: bool = False From 
496dba44fce004834614e5c18b7bdfbcc9621785 Mon Sep 17 00:00:00 2001 From: AmeenP Date: Fri, 8 May 2026 03:11:42 -0700 Subject: [PATCH 2/6] deps: update renderers Dynamo transport pin --- pyproject.toml | 2 +- uv.lock | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9fb43812f..4bb1a4aae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -135,7 +135,7 @@ openenv-core = false [tool.uv.sources] # Pinned to renderers PR #11 until the next PyPI release lands; drop after. # 1f3de65 = Dynamo chat nvext transport for token-in /chat/completions. -renderers = { git = "https://github.com/PrimeIntellect-ai/renderers.git", rev = "1f3de65" } +renderers = { git = "https://github.com/PrimeIntellect-ai/renderers.git", rev = "7ca1ab3" } [tool.uv.extra-build-dependencies] flash-attn = [{ requirement = "torch", match-runtime = true }] diff --git a/uv.lock b/uv.lock index d4079186f..6dc0ba121 100644 --- a/uv.lock +++ b/uv.lock @@ -5264,7 +5264,7 @@ wheels = [ [[package]] name = "renderers" version = "0.1.7" -source = { git = "https://github.com/PrimeIntellect-ai/renderers.git?rev=1f3de65#1f3de6506208233217f61671ebe81a00b0f5082a" } +source = { git = "https://github.com/PrimeIntellect-ai/renderers.git?rev=7ca1ab3#7ca1ab357f3ae2262ad10ffe757670739a8ec2c5" } dependencies = [ { name = "jinja2" }, { name = "numpy" }, @@ -6686,7 +6686,7 @@ requires-dist = [ { name = "pyzmq", specifier = ">=27.1.0" }, { name = "reasoning-gym", marker = "extra == 'rg'" }, { name = "regex", specifier = "<2026.4.4" }, - { name = "renderers", marker = "extra == 'renderers'", git = "https://github.com/PrimeIntellect-ai/renderers.git?rev=1f3de65" }, + { name = "renderers", marker = "extra == 'renderers'", git = "https://github.com/PrimeIntellect-ai/renderers.git?rev=7ca1ab3" }, { name = "requests" }, { name = "requests", marker = "extra == 'rl'" }, { name = "rich" }, @@ -6717,7 +6717,7 @@ dev = [ { name = "pytest-xdist", specifier = ">=3.8.0" }, { name = 
"python-dotenv", specifier = ">=1.0.0" }, { name = "reasoning-gym" }, - { name = "renderers", git = "https://github.com/PrimeIntellect-ai/renderers.git?rev=1f3de65" }, + { name = "renderers", git = "https://github.com/PrimeIntellect-ai/renderers.git?rev=7ca1ab3" }, { name = "ruff" }, { name = "stagehand", specifier = ">=3.0.0" }, { name = "textarena" }, From edae1af8dae074550dfd922f9050384b5e96fdf0 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Tue, 5 May 2026 10:23:09 -0700 Subject: [PATCH 3/6] feat(tito): add dynamo_chat_nvext transport to OpenAIChatCompletionsTokenClient The verifiers TITO client previously only spoke vLLM's TITO surface (/v1/chat/completions/tokens for the final POST, /tokenize for bridge tokenization). Dynamo bis/dynamo-rl serves neither route, so multi-turn TITO against Dynamo silently degraded to MITO every turn-2+ via the existing fallback path. This commit teaches the TITO client to read ClientConfig.renderer_transport (same field RendererClient consults) and route accordingly: - prime_vllm_generate (default): unchanged - posts to /v1/chat/completions/tokens and uses /tokenize over HTTP. - dynamo_chat_nvext: bridge tokenize runs locally via the renderers package (zero RTTs); final POST goes to /v1/chat/completions with placeholder messages + nvext.token_data carrying the stitched prompt_ids + explicit stop_token_ids from renderer.get_stop_token_ids(). Wire shape matches what RendererClient already produces for the same transport, so a Dynamo deployment validated against renderer mode automatically accepts TITO traffic too. Adds two unit tests that assert the dynamo-transport wire shape end-to- end via a recording client + stub renderer (no real tokenizer download). 
--- ...st_openai_chat_completions_token_client.py | 159 +++++++++++++ .../openai_chat_completions_token_client.py | 220 +++++++++++++++++- 2 files changed, 376 insertions(+), 3 deletions(-) diff --git a/tests/test_openai_chat_completions_token_client.py b/tests/test_openai_chat_completions_token_client.py index 3c90e11a7..9e972c78c 100644 --- a/tests/test_openai_chat_completions_token_client.py +++ b/tests/test_openai_chat_completions_token_client.py @@ -274,3 +274,162 @@ async def fake_get_prompt_ids( # noqa: ANN001 assert len(recording_client.calls) == 1 assert recording_client.calls[0]["path"] == "/chat/completions/tokens" assert recording_client.calls[0]["body"]["tokens"] == [10, 20] + + +# --------------------------------------------------------------------------- +# dynamo_chat_nvext transport (Dynamo bis/dynamo-rl) +# --------------------------------------------------------------------------- + +class _StubRenderer: + """Renderer stand-in for the dynamo_chat_nvext transport tests. + + Returns deterministic ids so we can assert on body shape without pulling + in a real HuggingFace tokenizer download. ``render_ids`` returns a + fixed sequence; ``get_stop_token_ids`` returns a marker pair. + """ + + def __init__(self) -> None: + self.render_calls: list[dict[str, Any]] = [] + + def render_ids( + self, + messages, + *, + tools=None, + add_generation_prompt: bool = False, + ) -> list[int]: + self.render_calls.append( + { + "messages": messages, + "tools": tools, + "add_generation_prompt": add_generation_prompt, + } + ) + # Encode the call shape into ids so tests can disambiguate the two + # bridge tokenize calls without a real tokenizer. + return [42, len(messages), int(add_generation_prompt)] + + def get_stop_token_ids(self) -> list[int]: + return [99, 100] + + +class _DynamoTestClient(OpenAIChatCompletionsTokenClient): + """Dynamo-transport TITO client with a stubbed renderer. 
+ + Subclass override is the cleanest way to inject the stub without going + through ``ClientConfig`` (which would require a real ``api_base_url`` + and ``setup_client`` to construct the AsyncOpenAI). The recording + client captures the eventual ``self.client.post(...)`` call. + """ + + _stub_renderer: _StubRenderer + + def __init__(self, recording_client) -> None: + super().__init__(recording_client) + self._stub_renderer = _StubRenderer() + + @property + def renderer_transport(self) -> str: # type: ignore[override] + return "dynamo_chat_nvext" + + def _get_renderer(self, model: str): # type: ignore[override] + return self._stub_renderer + + +@pytest.mark.asyncio +async def test_local_tokenize_uses_renderer_under_dynamo_transport(): + """Bridge tokenize must NOT hit any HTTP route under dynamo_chat_nvext. + + Goes straight through ``_local_tokenize`` -> ``renderer.render_ids``. + The recording client would record any errant POST; we assert it sees + none. + """ + recording_client = _RecordingClient() + client = _DynamoTestClient(recording_client) + + ids_full = await client.tokenize( + messages=[{"role": "user", "content": "u"}], + tools=None, + model="test-model", + ) + ids_base = await client.tokenize( + messages=[{"role": "user", "content": "u"}], + tools=None, + model="test-model", + extra_kwargs={"add_generation_prompt": False}, + ) + + # Both calls hit the renderer, neither hit the wire. + assert recording_client.calls == [] + assert client._stub_renderer.render_calls[0]["add_generation_prompt"] is True + assert client._stub_renderer.render_calls[1]["add_generation_prompt"] is False + # And the stub encodes that into the returned ids' last element. + assert ids_full[-1] == 1 + assert ids_base[-1] == 0 + + +@pytest.mark.asyncio +async def test_get_native_response_uses_dynamo_chat_nvext_under_transport( + monkeypatch: pytest.MonkeyPatch, +): + """Dynamo transport must POST to /chat/completions with nvext.token_data. 
+ + Mirrors test_get_native_response_uses_token_route_when_prompt_ids_available + but for the new transport. + """ + recording_client = _RecordingClient() + client = _DynamoTestClient(recording_client) + + async def fake_get_prompt_ids(self, state, prompt_messages, oai_tools): # noqa: ANN001 + return [10, 20, 30] + + monkeypatch.setattr( + OpenAIChatCompletionsTokenClient, "get_prompt_ids", fake_get_prompt_ids + ) + + state = cast( + State, + { + "model": "test-model", + "trajectory": [ + _make_step( + prompt=[{"role": "user", "content": "u1"}], + completion=[{"role": "assistant", "content": "a1"}], + prompt_ids=[1], + completion_ids=[2], + ) + ], + }, + ) + prompt = cast(Any, [{"role": "user", "content": "u2"}]) + + response = await client.get_native_response( + prompt=prompt, + model="test-model", + sampling_args={"max_completion_tokens": 16, "temperature": 0.5}, + tools=None, + state=state, + ) + + assert response["ok"] is True + assert len(recording_client.calls) == 1 + call = recording_client.calls[0] + + # Wire-shape assertions: route, nvext.token_data, stop_token_ids, + # placeholder messages, sampling fields promoted. + assert call["path"] == "/chat/completions" + body = call["body"] + assert body["nvext"]["token_data"] == [10, 20, 30] + assert body["nvext"]["extra_fields"] == ["completion_token_ids"] + assert body["stop_token_ids"] == [99, 100] + assert body["messages"] == [{"role": "user", "content": "(token-in mode)"}] + assert body["max_completion_tokens"] == 16 + assert body["temperature"] == 0.5 + assert body["logprobs"] is True + assert body["stream"] is False + + # No /chat/completions/tokens, no /tokenize for the dynamo transport. 
+ assert all( + c["path"] != "/chat/completions/tokens" and not c["path"].endswith("/tokenize") + for c in recording_client.calls + ) diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py index d3b773ce6..d7abbb7bd 100644 --- a/verifiers/clients/openai_chat_completions_token_client.py +++ b/verifiers/clients/openai_chat_completions_token_client.py @@ -1,3 +1,4 @@ +import asyncio from collections.abc import Mapping from typing import Any, Optional, cast @@ -19,7 +20,11 @@ OpenAITool, handle_openai_overlong_prompt, ) -from verifiers.types import SamplingArgs, State +from verifiers.types import RendererTransport, SamplingArgs, State + +# Fallback transport used when no ClientConfig is attached or it carries no +# renderer_transport value: the vLLM token routes, i.e. the pre-existing behavior. +_DEFAULT_TRANSPORT: RendererTransport = "prime_vllm_generate" def _has_multimodal_content(messages) -> bool: @@ -64,7 +69,22 @@ class TokenizeResponse(BaseModel): class OpenAIChatCompletionsTokenClient(OpenAIChatCompletionsClient): - """Wrapper for custom vLLM route /v1/chat/completions/tokens via AsyncOpenAI client.""" + """Token-in/token-out chat client. + + Two transports share this class: + + * ``prime_vllm_generate`` (default): the historical TITO surface that + posts to vLLM's ``/v1/chat/completions/tokens`` and uses the engine's + ``/tokenize`` for bridge-token computation. This is what vanilla vLLM + ``>=0.20`` exposes. + * ``dynamo_chat_nvext``: posts pre-tokenized prompts to Dynamo's standard + ``/v1/chat/completions`` route with ``nvext.token_data`` carrying the + stitched ``prompt_ids``. Bridge tokenization runs locally via the + ``renderers`` package (no ``/tokenize`` round-trip) since Dynamo + doesn't expose vLLM's token routes. Selection is via + ``ClientConfig.renderer_transport``; same field the renderer client + consults so a single config option drives both clients consistently. 
+ """ @property def token_client(self) -> AsyncOpenAI: @@ -74,6 +94,51 @@ def token_client(self) -> AsyncOpenAI: base_url = base_url[:-3] return self.client.with_options(base_url=base_url) + @property + def renderer_transport(self) -> RendererTransport: + """Wire-shape selector. ``ClientConfig.renderer_transport`` if set, + else the default vLLM TITO shape. Mirrors the same field used by + ``RendererClient`` so backend selection stays in one place.""" + return cast( + RendererTransport, + getattr(self._config, "renderer_transport", _DEFAULT_TRANSPORT) + if self._config is not None + else _DEFAULT_TRANSPORT, + ) + + def _get_renderer(self, model: str): + """Lazy, per-model renderer cache. Used only by the ``dynamo_chat_nvext`` + transport for client-side tokenization and stop-token resolution. + + Loaded on first use and reused across calls so we pay the + ``AutoTokenizer.from_pretrained`` cost once. The renderer's + underlying tokenizer is HuggingFace fast-tokenizer-backed, so the + wrapping ``asyncio.to_thread`` calls in ``tokenize()`` get real + parallelism (the Rust encode releases the GIL). + """ + cache: dict[str, Any] = self.__dict__.setdefault("_renderer_cache", {}) + if model in cache: + return cache[model] + try: + from renderers import create_renderer # type: ignore[import-not-found] + from transformers import AutoTokenizer # type: ignore[import-not-found] + except ImportError as exc: # pragma: no cover - dependency surface + raise ImportError( + "OpenAIChatCompletionsTokenClient with renderer_transport=" + "'dynamo_chat_nvext' requires the 'renderers' and 'transformers' " + "packages. Install via `pip install verifiers[renderers]` or add " + "renderers + transformers to your environment." 
+ ) from exc + tokenizer = AutoTokenizer.from_pretrained(model) + renderer_name = ( + getattr(self._config, "renderer", "auto") + if self._config is not None + else "auto" + ) + renderer = create_renderer(tokenizer, renderer=renderer_name or "auto") + cache[model] = renderer + return renderer + @handle_openai_overlong_prompt async def get_native_response( self, @@ -139,6 +204,16 @@ def normalize_sampling_args(sampling_args: SamplingArgs): prompt, model, sampling_args, tools, extra_headers=extra_headers ) + if self.renderer_transport == "dynamo_chat_nvext": + return await self._post_dynamo_chat_nvext( + prompt=prompt, + prompt_ids=prompt_ids, + model=model, + tools=tools, + sampling_args=sampling_args, + extra_headers=extra_headers, + ) + extra_body = sampling_args.pop("extra_body", {}) body = dict( model=model, @@ -156,6 +231,85 @@ def normalize_sampling_args(sampling_args: SamplingArgs): options={"headers": extra_headers} if extra_headers else {}, ) + async def _post_dynamo_chat_nvext( + self, + prompt: OpenAIChatMessages, + prompt_ids: list[int], + model: str, + tools: list[OpenAITool] | None, + sampling_args: dict, + extra_headers: Mapping[str, str] | None, + ) -> OpenAIChatResponse: + """Post stitched prompt_ids to Dynamo's chat-completions route. + + Wire shape mirrors what ``RendererClient`` produces for + ``dynamo_chat_nvext`` (placeholder messages + ``nvext.token_data`` + + explicit ``stop_token_ids``) so a Dynamo deployment validated against + renderer-mode automatically accepts TITO-mode traffic too. The + engine ignores ``messages`` when ``nvext.token_data`` is present, so + the placeholder body stays small regardless of trajectory length. 
+ """ + renderer = self._get_renderer(model) + stop_token_ids = list(renderer.get_stop_token_ids()) + + extra_body = dict(sampling_args.pop("extra_body", {}) or {}) + + nvext: dict[str, Any] = { + "token_data": prompt_ids, + "extra_fields": ["completion_token_ids"], + } + priority = sampling_args.get("priority", extra_body.get("priority")) + if priority is not None: + nvext["agent_hints"] = {"priority": priority} + + body: dict[str, Any] = { + "model": model, + "messages": [{"role": "user", "content": "(token-in mode)"}], + "stream": False, + "logprobs": True, + "stop_token_ids": stop_token_ids, + "nvext": nvext, + } + if tools: + body["tools"] = tools + + # Promote sampling fields that Dynamo's chat-completions surface + # accepts directly. Anything else stays in extra_body and rides as + # an unrecognized passthrough field (validate.rs:104 allowlist). + promotable = ( + "max_completion_tokens", + "max_tokens", + "temperature", + "top_p", + "top_k", + "min_p", + "seed", + "n", + "repetition_penalty", + "min_tokens", + ) + for key in promotable: + value = sampling_args.get(key, extra_body.get(key)) + if value is not None: + body[key] = value + + # Pass any remaining unhandled extra_body keys straight through (e.g. + # cache_salt, return_token_ids). Dynamo's PASSTHROUGH_EXTRA_FIELDS + # allowlist accepts these without rejection. + passthrough = { + k: v + for k, v in extra_body.items() + if k not in promotable and v is not None and k not in body + } + body.update(passthrough) + + return await self.client.post( + "/chat/completions", + body=body, + cast_to=ChatCompletion, + options={"headers": extra_headers} if extra_headers else {}, + ) + async def get_prompt_ids( self, state: State, @@ -370,9 +524,27 @@ async def tokenize( extra_kwargs: dict | None = None, **kwargs, ) -> list[int]: - """Tokenize messages using the vLLM /tokenize API.""" + """Tokenize messages. + + ``dynamo_chat_nvext`` transport: tokenizes locally via the + ``renderers`` package, no network call. 
Runs on a worker thread so + the event loop stays free; HuggingFace fast tokenizers release the + GIL during the Rust encode pass. + + Default transport: posts to vLLM's ``/tokenize`` route on the + host root. + """ if extra_kwargs is None: extra_kwargs = {} + + if self.renderer_transport == "dynamo_chat_nvext": + return await self._local_tokenize( + messages=messages, + tools=tools, + model=model, + extra_kwargs=extra_kwargs, + ) + if isinstance(messages, str): body = dict( model=model, @@ -393,3 +565,45 @@ async def tokenize( "/tokenize", body=body, cast_to=TokenizeResponse ) return tokenize_response.tokens + + async def _local_tokenize( + self, + messages: str | OpenAIChatMessages, + tools: list[OpenAITool] | None, + model: str, + extra_kwargs: dict, + ) -> list[int]: + """Local in-process tokenization for the dynamo transport. + + Bridge tokenization under TITO calls this twice per turn (once for + ``add_generation_prompt=True`` and once for ``False``). Both calls + go through the same renderer, so the chat-template + tool-call + normalization is consistent with whatever Dynamo's worker would + produce server-side. + """ + renderer = self._get_renderer(model) + + def _render() -> list[int]: + if isinstance(messages, str): + tokenizer = getattr(renderer, "tokenizer", None) + if tokenizer is None: + raise RuntimeError( + "Renderer for model %r does not expose a tokenizer; " + "cannot tokenize a raw string under dynamo_chat_nvext." + % model + ) + # Strip BOS for parity with vLLM /tokenize (which never + # prepends a BOS for raw-prompt tokenize requests). 
+ encoded = tokenizer(messages, add_special_tokens=False) + return list(encoded["input_ids"]) + + add_generation_prompt = bool(extra_kwargs.get("add_generation_prompt", True)) + return list( + renderer.render_ids( + cast(Any, list(messages)), + tools=cast(Any, tools), + add_generation_prompt=add_generation_prompt, + ) + ) + + return await asyncio.to_thread(_render) From 775f3179aa93a4eaa37fd58261a7dd5f9df2fa65 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Tue, 5 May 2026 11:42:27 -0700 Subject: [PATCH 4/6] fix(tito): make get_prompt_ids prefix-match work for vf.Message input OpenAIChatCompletionsTokenClient.get_prompt_ids' prefix-match between the prompt_messages caller-input and the trajectory step messages was asymmetric: - prompt_messages went straight through normalize_for_comparison (which picks up vf.AssistantMessage.model_dump's exhaustive view, including thinking_blocks=None and other defaulted fields). - step_messages went through to_native_prompt FIRST, which produces the slimmer OpenAI-format dict that omits thinking_blocks entirely. The two normalized forms then never compared equal whenever the caller handed the client Pydantic vf.Message types -- the form MultiTurnEnv produces after maybe_normalize_messages -- so the prefix match always returned None and TITO silently fell back to MITO every turn-2+. Probe-3 and the upstream test suite both used raw dict input, so the asymmetry only showed up under real orchestrator rollouts. Fix: drop None-valued keys in normalize_for_comparison. Both sides land on the same shape regardless of whether they came in as Pydantic or as plain OpenAI dicts. Validated end-to-end against bis-dev/5/always-continue-tito (multi-turn TITO + Dynamo bis/dynamo-rl smoke): 348 /v1/chat/completions, 21 SIDECAR-SKIP-TOKENIZE markers, 0 fallback warnings. Same SIDECAR token-prefix appears across turns, confirming the engine reuses prior-turn ids verbatim. 
The existing 6 unit tests (4 vanilla TITO + 2 dynamo_chat_nvext) all still pass; their dict-shaped input always normalized to the same shape on both sides, so the symmetric drop-None doesn't change them. --- tests/test_openai_chat_completions_token_client.py | 1 + .../openai_chat_completions_token_client.py | 14 +++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/test_openai_chat_completions_token_client.py b/tests/test_openai_chat_completions_token_client.py index 9e972c78c..0d0b5e651 100644 --- a/tests/test_openai_chat_completions_token_client.py +++ b/tests/test_openai_chat_completions_token_client.py @@ -280,6 +280,7 @@ async def fake_get_prompt_ids( # noqa: ANN001 # dynamo_chat_nvext transport (Dynamo bis/dynamo-rl) # --------------------------------------------------------------------------- + class _StubRenderer: """Renderer stand-in for the dynamo_chat_nvext transport tests. diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py index d7abbb7bd..8b916b546 100644 --- a/verifiers/clients/openai_chat_completions_token_client.py +++ b/verifiers/clients/openai_chat_completions_token_client.py @@ -343,6 +343,13 @@ def normalize_for_comparison(value: Any) -> Any: # prefix-match equality is unaffected. if normalized.get("content") == "": normalized["content"] = None + # Drop None-valued keys so model_dump's exhaustive view (which + # carries e.g. thinking_blocks=None on AssistantMessage) is + # equivalent to to_native_prompt's slimmer view (which omits + # the field entirely). Without this, vf.Message-shaped input + # never matches the to_native_prompt-normalized step messages, + # which breaks the prefix match for MultiTurnEnv rollouts. 
+ normalized = {k: v for k, v in normalized.items() if v is not None} return normalized if isinstance(value, list): return [normalize_for_comparison(item) for item in value] @@ -589,15 +596,16 @@ def _render() -> list[int]: if tokenizer is None: raise RuntimeError( "Renderer for model %r does not expose a tokenizer; " - "cannot tokenize a raw string under dynamo_chat_nvext." - % model + "cannot tokenize a raw string under dynamo_chat_nvext." % model ) # Strip BOS for parity with vLLM /tokenize (which never # prepends a BOS for raw-prompt tokenize requests). encoded = tokenizer(messages, add_special_tokens=False) return list(encoded["input_ids"]) - add_generation_prompt = bool(extra_kwargs.get("add_generation_prompt", True)) + add_generation_prompt = bool( + extra_kwargs.get("add_generation_prompt", True) + ) return list( renderer.render_ids( cast(Any, list(messages)), From 0bece1fa67beb3cef8c180abfcfdfc7605dc2593 Mon Sep 17 00:00:00 2001 From: AmeenP Date: Fri, 15 May 2026 05:39:48 -0700 Subject: [PATCH 5/6] fix(dynamo): align renderer engine_data token flow Signed-off-by: AmeenP --- pyproject.toml | 4 +- ...st_openai_chat_completions_token_client.py | 66 ++++++++++++++++++- tests/test_renderer_client.py | 4 +- uv.lock | 8 +-- .../clients/openai_chat_completions_client.py | 38 ++++++++++- .../openai_chat_completions_token_client.py | 55 +++++++++++----- verifiers/clients/renderer_client.py | 12 +++- 7 files changed, 155 insertions(+), 32 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4bb1a4aae..c49f1be3b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -134,8 +134,8 @@ openenv-core = false [tool.uv.sources] # Pinned to renderers PR #11 until the next PyPI release lands; drop after. -# 1f3de65 = Dynamo chat nvext transport for token-in /chat/completions. -renderers = { git = "https://github.com/PrimeIntellect-ai/renderers.git", rev = "7ca1ab3" } +# 17005dd = Dynamo chat nvext transport with engine_data response support. 
+renderers = { git = "https://github.com/PrimeIntellect-ai/renderers.git", rev = "17005dd" } [tool.uv.extra-build-dependencies] flash-attn = [{ requirement = "torch", match-runtime = true }] diff --git a/tests/test_openai_chat_completions_token_client.py b/tests/test_openai_chat_completions_token_client.py index 0d0b5e651..6d1dee485 100644 --- a/tests/test_openai_chat_completions_token_client.py +++ b/tests/test_openai_chat_completions_token_client.py @@ -1,3 +1,4 @@ +from types import SimpleNamespace from typing import Any, cast import pytest @@ -381,7 +382,9 @@ async def test_get_native_response_uses_dynamo_chat_nvext_under_transport( recording_client = _RecordingClient() client = _DynamoTestClient(recording_client) - async def fake_get_prompt_ids(self, state, prompt_messages, oai_tools): # noqa: ANN001 + async def fake_get_prompt_ids( # noqa: ANN001 + self, state, prompt_messages, oai_tools, chat_template_kwargs=None + ): return [10, 20, 30] monkeypatch.setattr( @@ -407,7 +410,17 @@ async def fake_get_prompt_ids(self, state, prompt_messages, oai_tools): # noqa: response = await client.get_native_response( prompt=prompt, model="test-model", - sampling_args={"max_completion_tokens": 16, "temperature": 0.5}, + sampling_args={ + "max_completion_tokens": 16, + "temperature": 0.5, + "extra_body": { + "nvext": { + "extra_fields": ["timing"], + "cache_salt": "ckpt-42", + }, + "cache_salt": "top-level-salt", + }, + }, tools=None, state=state, ) @@ -421,7 +434,9 @@ async def fake_get_prompt_ids(self, state, prompt_messages, oai_tools): # noqa: assert call["path"] == "/chat/completions" body = call["body"] assert body["nvext"]["token_data"] == [10, 20, 30] - assert body["nvext"]["extra_fields"] == ["completion_token_ids"] + assert body["nvext"]["extra_fields"] == ["timing", "engine_data"] + assert body["nvext"]["cache_salt"] == "ckpt-42" + assert body["cache_salt"] == "top-level-salt" assert body["stop_token_ids"] == [99, 100] assert body["messages"] == [{"role": "user", 
"content": "(token-in mode)"}] assert body["max_completion_tokens"] == 16 @@ -434,3 +449,48 @@ async def fake_get_prompt_ids(self, state, prompt_messages, oai_tools): # noqa: c["path"] != "/chat/completions/tokens" and not c["path"].endswith("/tokenize") for c in recording_client.calls ) + + +@pytest.mark.asyncio +async def test_from_native_response_grafts_dynamo_engine_data_tokens(): + client = OpenAIChatCompletionsClient(_NoopClient()) + message = SimpleNamespace( + content="ok", + tool_calls=None, + model_dump=lambda: {}, + ) + response = SimpleNamespace( + id="chatcmpl-test", + created=0, + model="test-model", + usage=SimpleNamespace( + prompt_tokens=3, + completion_tokens=2, + total_tokens=5, + ), + nvext={ + "engine_data": { + "prompt_token_ids": [1, 2, 3], + "completion_token_ids": [4, 5], + }, + }, + choices=[ + SimpleNamespace( + finish_reason="stop", + message=message, + logprobs={ + "content": [ + {"logprob": -0.1}, + {"logprob": -0.2}, + ] + }, + ) + ], + ) + + parsed = await client.from_native_response(cast(Any, response)) + + assert parsed.message.tokens is not None + assert parsed.message.tokens.prompt_ids == [1, 2, 3] + assert parsed.message.tokens.completion_ids == [4, 5] + assert parsed.message.tokens.completion_logprobs == [-0.1, -0.2] diff --git a/tests/test_renderer_client.py b/tests/test_renderer_client.py index 0877b282f..625fb46f9 100644 --- a/tests/test_renderer_client.py +++ b/tests/test_renderer_client.py @@ -222,7 +222,7 @@ async def test_get_native_response_passes_renderer_transport_to_generate( client._get_renderer_or_pool = lambda model: renderer # type: ignore[method-assign] async def fake_get_incremental_prompt_ids(**kwargs): - return [10, 20] + return RenderedTokens(token_ids=[10, 20]) async def fake_generate(**kwargs): calls.append(kwargs) @@ -244,7 +244,7 @@ async def fake_generate(**kwargs): assert response == {"content": "ok"} assert len(calls) == 1 - assert calls[0]["transport"] == "dynamo_chat_nvext" + assert 
calls[0]["transport"] == "dynamo" assert calls[0]["prompt_ids"] == [10, 20] diff --git a/uv.lock b/uv.lock index 6dc0ba121..6f655bf16 100644 --- a/uv.lock +++ b/uv.lock @@ -5263,8 +5263,8 @@ wheels = [ [[package]] name = "renderers" -version = "0.1.7" -source = { git = "https://github.com/PrimeIntellect-ai/renderers.git?rev=7ca1ab3#7ca1ab357f3ae2262ad10ffe757670739a8ec2c5" } +version = "0.1.8.dev12+g17005dd" +source = { git = "https://github.com/PrimeIntellect-ai/renderers.git?rev=17005dd#17005dd79f031f7993fb8e0b26b52aff346ad07e" } dependencies = [ { name = "jinja2" }, { name = "numpy" }, @@ -6686,7 +6686,7 @@ requires-dist = [ { name = "pyzmq", specifier = ">=27.1.0" }, { name = "reasoning-gym", marker = "extra == 'rg'" }, { name = "regex", specifier = "<2026.4.4" }, - { name = "renderers", marker = "extra == 'renderers'", git = "https://github.com/PrimeIntellect-ai/renderers.git?rev=7ca1ab3" }, + { name = "renderers", marker = "extra == 'renderers'", git = "https://github.com/PrimeIntellect-ai/renderers.git?rev=17005dd" }, { name = "requests" }, { name = "requests", marker = "extra == 'rl'" }, { name = "rich" }, @@ -6717,7 +6717,7 @@ dev = [ { name = "pytest-xdist", specifier = ">=3.8.0" }, { name = "python-dotenv", specifier = ">=1.0.0" }, { name = "reasoning-gym" }, - { name = "renderers", git = "https://github.com/PrimeIntellect-ai/renderers.git?rev=7ca1ab3" }, + { name = "renderers", git = "https://github.com/PrimeIntellect-ai/renderers.git?rev=17005dd" }, { name = "ruff" }, { name = "stagehand", specifier = ">=3.0.0" }, { name = "textarena" }, diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py index 7fc287397..8fa2c634d 100644 --- a/verifiers/clients/openai_chat_completions_client.py +++ b/verifiers/clients/openai_chat_completions_client.py @@ -443,8 +443,44 @@ def parse_finish_reason(response: OpenAIChatResponse) -> FinishReason: case _: return None + def _graft_engine_data(response: 
OpenAIChatResponse) -> None: + nvext = getattr(response, "nvext", None) + if nvext is None and hasattr(response, "model_dump"): + nvext = response.model_dump().get("nvext") + if not isinstance(nvext, dict): + return + + choice = response.choices[0] + engine_data = nvext.get("engine_data") + completion_token_ids_top = nvext.get("completion_token_ids") + prompt_token_ids_top = nvext.get("prompt_token_ids") + + completion_token_ids: list[int] | None = None + prompt_token_ids: list[int] | None = None + if isinstance(engine_data, dict): + if engine_data.get("completion_token_ids") is not None: + completion_token_ids = list(engine_data["completion_token_ids"]) + if engine_data.get("prompt_token_ids") is not None: + prompt_token_ids = list(engine_data["prompt_token_ids"]) + if completion_token_ids is None and completion_token_ids_top is not None: + completion_token_ids = list(completion_token_ids_top) + if prompt_token_ids is None and prompt_token_ids_top is not None: + prompt_token_ids = list(prompt_token_ids_top) + + if ( + getattr(choice, "token_ids", None) is None + and completion_token_ids is not None + ): + object.__setattr__(choice, "token_ids", completion_token_ids) + if ( + getattr(response, "prompt_token_ids", None) is None + and prompt_token_ids is not None + ): + object.__setattr__(response, "prompt_token_ids", prompt_token_ids) + def parse_tokens(response: OpenAIChatResponse) -> ResponseTokens | None: assert len(response.choices) == 1, "Response should always have one choice" + _graft_engine_data(response) choice = response.choices[0] if not hasattr(choice, "token_ids"): return None @@ -482,7 +518,7 @@ def parse_tokens(response: OpenAIChatResponse) -> ResponseTokens | None: logprobs_content = response.choices[0].logprobs["content"] completion_logprobs = [token["logprob"] for token in logprobs_content] - choice_extra = choice.model_extra or {} + choice_extra = getattr(choice, "model_extra", None) or {} routed_experts = 
parse_routed_experts(choice_extra.get("routed_experts")) return ResponseTokens( prompt_ids=prompt_ids, diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py index 8b916b546..40dd8ebd7 100644 --- a/verifiers/clients/openai_chat_completions_token_client.py +++ b/verifiers/clients/openai_chat_completions_token_client.py @@ -120,8 +120,8 @@ def _get_renderer(self, model: str): if model in cache: return cache[model] try: - from renderers import create_renderer # type: ignore[import-not-found] - from transformers import AutoTokenizer # type: ignore[import-not-found] + from renderers import create_renderer + from transformers import AutoTokenizer except ImportError as exc: # pragma: no cover - dependency surface raise ImportError( "OpenAIChatCompletionsTokenClient with renderer_transport=" @@ -153,12 +153,35 @@ def normalize_sampling_args(sampling_args: SamplingArgs): if "max_tokens" in sampling_args: sampling_args["max_completion_tokens"] = sampling_args.pop("max_tokens") sampling_args["logprobs"] = True - extra_body = dict(return_token_ids=True) - if "extra_body" in sampling_args: - sampling_args["extra_body"] = { - **sampling_args["extra_body"], - **extra_body, + + if self.renderer_transport == "dynamo_chat_nvext": + extra_body: dict[str, Any] = { + "nvext": {"extra_fields": ["engine_data"]} } + else: + extra_body = {"return_token_ids": True} + + if "extra_body" in sampling_args: + merged = {**sampling_args["extra_body"]} + if "nvext" in merged and "nvext" in extra_body: + merged_nvext = merged.get("nvext") + extra_nvext = extra_body.get("nvext") + base = ( + dict(merged_nvext) if isinstance(merged_nvext, Mapping) else {} + ) + inc = dict(extra_nvext) if isinstance(extra_nvext, Mapping) else {} + base_extra_fields = list(base.get("extra_fields") or []) + inc_extra_fields = list(inc.get("extra_fields") or []) + extra_fields = list( + dict.fromkeys(base_extra_fields + inc_extra_fields) + ) + 
merged["nvext"] = {**base, **inc, "extra_fields": extra_fields} + sampling_args["extra_body"] = { + **{k: v for k, v in extra_body.items() if k != "nvext"}, + **merged, + } + else: + sampling_args["extra_body"] = {**merged, **extra_body} else: sampling_args["extra_body"] = extra_body return {k: v for k, v in sampling_args.items() if v is not None} @@ -242,22 +265,18 @@ async def _post_dynamo_chat_nvext( ) -> OpenAIChatResponse: """Post stitched prompt_ids to Dynamo's chat-completions route. - Wire shape mirrors what ``RendererClient`` produces for - ``dynamo_chat_nvext`` (placeholder messages + ``nvext.token_data`` + - explicit ``stop_token_ids``) so a Dynamo deployment validated against - renderer-mode automatically accepts TITO-mode traffic too. The - engine ignores ``messages`` when ``nvext.token_data`` is present, so - the placeholder body stays small regardless of trajectory length. + The engine sees ``nvext.token_data`` and skips tokenization. Response + token IDs come back through ``nvext.engine_data.completion_token_ids`` + and are grafted onto the standard token fields by + ``OpenAIChatCompletionsClient.from_native_response``. 
""" renderer = self._get_renderer(model) stop_token_ids = list(renderer.get_stop_token_ids()) extra_body = dict(sampling_args.pop("extra_body", {}) or {}) - nvext: dict[str, Any] = { - "token_data": prompt_ids, - "extra_fields": ["completion_token_ids"], - } + nvext = dict(extra_body.pop("nvext", None) or {}) + nvext["token_data"] = prompt_ids priority = sampling_args.get("priority", extra_body.get("priority")) if priority is not None: nvext["agent_hints"] = {"priority": priority} @@ -287,6 +306,8 @@ async def _post_dynamo_chat_nvext( "n", "repetition_penalty", "min_tokens", + "top_logprobs", + "stop", ) for key in promotable: value = sampling_args.get(key, extra_body.get(key)) diff --git a/verifiers/clients/renderer_client.py b/verifiers/clients/renderer_client.py index 79f27f52f..03579f753 100644 --- a/verifiers/clients/renderer_client.py +++ b/verifiers/clients/renderer_client.py @@ -28,6 +28,7 @@ ) from renderers import ToolCall as RendererToolCall from renderers import ToolCallFunction +from renderers.client import RendererTransport as RenderersTransport from renderers.client import generate from verifiers.clients.client import Client @@ -576,6 +577,13 @@ async def get_native_response( prompt_ids = None multi_modal_data = None + renderer_transport = getattr( + self._config, "renderer_transport", "prime_vllm_generate" + ) + transport: RenderersTransport = ( + "dynamo" if renderer_transport == "dynamo_chat_nvext" else "vllm" + ) + return await generate( client=self.client, renderer=renderer, @@ -589,9 +597,7 @@ async def get_native_response( or sampling_params.pop("cache_salt", None), priority=args.get("priority") or sampling_params.pop("priority", None), extra_headers=args.get("extra_headers"), - transport=getattr( - self._config, "renderer_transport", "prime_vllm_generate" - ), + transport=transport, ) async def raise_from_native_response(self, response: dict[str, Any]) -> None: From 9485194fdd03792f658d5a0c689a5eb1e05473a3 Mon Sep 17 00:00:00 2001 From: 
AmeenP Date: Fri, 15 May 2026 07:14:41 -0700 Subject: [PATCH 6/6] Use canonical renderer transport names --- ...st_openai_chat_completions_token_client.py | 10 +++--- tests/test_renderer_client.py | 2 +- .../openai_chat_completions_token_client.py | 34 +++++++++++-------- verifiers/clients/renderer_client.py | 8 ++--- verifiers/types.py | 19 +++++++++-- 5 files changed, 45 insertions(+), 28 deletions(-) diff --git a/tests/test_openai_chat_completions_token_client.py b/tests/test_openai_chat_completions_token_client.py index 6d1dee485..a424a321c 100644 --- a/tests/test_openai_chat_completions_token_client.py +++ b/tests/test_openai_chat_completions_token_client.py @@ -278,12 +278,12 @@ async def fake_get_prompt_ids( # noqa: ANN001 # --------------------------------------------------------------------------- -# dynamo_chat_nvext transport (Dynamo bis/dynamo-rl) +# dynamo transport (Dynamo bis/dynamo-rl) # --------------------------------------------------------------------------- class _StubRenderer: - """Renderer stand-in for the dynamo_chat_nvext transport tests. + """Renderer stand-in for the dynamo transport tests. Returns deterministic ids so we can assert on body shape without pulling in a real HuggingFace tokenizer download. ``render_ids`` returns a @@ -332,7 +332,7 @@ def __init__(self, recording_client) -> None: @property def renderer_transport(self) -> str: # type: ignore[override] - return "dynamo_chat_nvext" + return "dynamo" def _get_renderer(self, model: str): # type: ignore[override] return self._stub_renderer @@ -340,7 +340,7 @@ def _get_renderer(self, model: str): # type: ignore[override] @pytest.mark.asyncio async def test_local_tokenize_uses_renderer_under_dynamo_transport(): - """Bridge tokenize must NOT hit any HTTP route under dynamo_chat_nvext. + """Bridge tokenize must NOT hit any HTTP route under dynamo. Goes straight through ``_local_tokenize`` -> ``renderer.render_ids``. 
The recording client would record any errant POST; we assert it sees @@ -371,7 +371,7 @@ async def test_local_tokenize_uses_renderer_under_dynamo_transport(): @pytest.mark.asyncio -async def test_get_native_response_uses_dynamo_chat_nvext_under_transport( +async def test_get_native_response_uses_dynamo_under_transport( monkeypatch: pytest.MonkeyPatch, ): """Dynamo transport must POST to /chat/completions with nvext.token_data. diff --git a/tests/test_renderer_client.py b/tests/test_renderer_client.py index 625fb46f9..7519ee175 100644 --- a/tests/test_renderer_client.py +++ b/tests/test_renderer_client.py @@ -214,7 +214,7 @@ async def test_get_native_response_passes_renderer_transport_to_generate( client._client = object() client._config = vf.ClientConfig( client_type="renderer", - renderer_transport="dynamo_chat_nvext", + renderer_transport="dynamo", ) renderer = _BridgeRenderer() calls = [] diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py index 40dd8ebd7..e75f046cb 100644 --- a/verifiers/clients/openai_chat_completions_token_client.py +++ b/verifiers/clients/openai_chat_completions_token_client.py @@ -20,11 +20,16 @@ OpenAITool, handle_openai_overlong_prompt, ) -from verifiers.types import RendererTransport, SamplingArgs, State +from verifiers.types import ( + RendererTransport, + SamplingArgs, + State, + normalize_renderer_transport, +) # Sentinel returned by transports that don't tokenize over HTTP. Lets callers # route around the legacy /tokenize body shape without changing the signature. 
-_DEFAULT_TRANSPORT: RendererTransport = "prime_vllm_generate" +_DEFAULT_TRANSPORT: RendererTransport = "vllm" def _has_multimodal_content(messages) -> bool: @@ -73,11 +78,11 @@ class OpenAIChatCompletionsTokenClient(OpenAIChatCompletionsClient): Two transports share this class: - * ``prime_vllm_generate`` (default): the historical TITO surface that + * ``vllm`` (default): the historical TITO surface that posts to vLLM's ``/v1/chat/completions/tokens`` and uses the engine's ``/tokenize`` for bridge-token computation. This is what vanilla vLLM ``>=0.20`` exposes. - * ``dynamo_chat_nvext``: posts pre-tokenized prompts to Dynamo's standard + * ``dynamo``: posts pre-tokenized prompts to Dynamo's standard ``/v1/chat/completions`` route with ``nvext.token_data`` carrying the stitched ``prompt_ids``. Bridge tokenization runs locally via the ``renderers`` package (no ``/tokenize`` round-trip) since Dynamo @@ -99,15 +104,14 @@ def renderer_transport(self) -> RendererTransport: """Wire-shape selector. ``ClientConfig.renderer_transport`` if set, else the default vLLM TITO shape. Mirrors the same field used by ``RendererClient`` so backend selection stays in one place.""" - return cast( - RendererTransport, + return normalize_renderer_transport( getattr(self._config, "renderer_transport", _DEFAULT_TRANSPORT) if self._config is not None else _DEFAULT_TRANSPORT, ) def _get_renderer(self, model: str): - """Lazy, per-model renderer cache. Used only by the ``dynamo_chat_nvext`` + """Lazy, per-model renderer cache. Used only by the ``dynamo`` transport for client-side tokenization and stop-token resolution. 
Loaded on first use and reused across calls so we pay the @@ -125,7 +129,7 @@ def _get_renderer(self, model: str): except ImportError as exc: # pragma: no cover - dependency surface raise ImportError( "OpenAIChatCompletionsTokenClient with renderer_transport=" - "'dynamo_chat_nvext' requires the 'renderers' and 'transformers' " + "'dynamo' requires the 'renderers' and 'transformers' " "packages. Install via `pip install verifiers[renderers]` or add " "renderers + transformers to your environment." ) from exc @@ -154,7 +158,7 @@ def normalize_sampling_args(sampling_args: SamplingArgs): sampling_args["max_completion_tokens"] = sampling_args.pop("max_tokens") sampling_args["logprobs"] = True - if self.renderer_transport == "dynamo_chat_nvext": + if self.renderer_transport == "dynamo": extra_body: dict[str, Any] = { "nvext": {"extra_fields": ["engine_data"]} } @@ -227,8 +231,8 @@ def normalize_sampling_args(sampling_args: SamplingArgs): prompt, model, sampling_args, tools, extra_headers=extra_headers ) - if self.renderer_transport == "dynamo_chat_nvext": - return await self._post_dynamo_chat_nvext( + if self.renderer_transport == "dynamo": + return await self._post_dynamo_chat_completions( prompt=prompt, prompt_ids=prompt_ids, model=model, @@ -254,7 +258,7 @@ def normalize_sampling_args(sampling_args: SamplingArgs): options={"headers": extra_headers} if extra_headers else {}, ) - async def _post_dynamo_chat_nvext( + async def _post_dynamo_chat_completions( self, prompt: OpenAIChatMessages, prompt_ids: list[int], @@ -554,7 +558,7 @@ async def tokenize( ) -> list[int]: """Tokenize messages. - ``dynamo_chat_nvext`` transport: tokenizes locally via the + ``dynamo`` transport: tokenizes locally via the ``renderers`` package, no network call. Runs on a worker thread so the event loop stays free; HuggingFace fast tokenizers release the GIL during the Rust encode pass. 
@@ -565,7 +569,7 @@ async def tokenize( if extra_kwargs is None: extra_kwargs = {} - if self.renderer_transport == "dynamo_chat_nvext": + if self.renderer_transport == "dynamo": return await self._local_tokenize( messages=messages, tools=tools, @@ -617,7 +621,7 @@ def _render() -> list[int]: if tokenizer is None: raise RuntimeError( "Renderer for model %r does not expose a tokenizer; " - "cannot tokenize a raw string under dynamo_chat_nvext." % model + "cannot tokenize a raw string under dynamo." % model ) # Strip BOS for parity with vLLM /tokenize (which never # prepends a BOS for raw-prompt tokenize requests). diff --git a/verifiers/clients/renderer_client.py b/verifiers/clients/renderer_client.py index 03579f753..6ba3605f1 100644 --- a/verifiers/clients/renderer_client.py +++ b/verifiers/clients/renderer_client.py @@ -53,6 +53,7 @@ ToolMessage, Usage, UserMessage, + normalize_renderer_transport, ) from verifiers.utils.client_utils import setup_openai_client @@ -577,11 +578,8 @@ async def get_native_response( prompt_ids = None multi_modal_data = None - renderer_transport = getattr( - self._config, "renderer_transport", "prime_vllm_generate" - ) - transport: RenderersTransport = ( - "dynamo" if renderer_transport == "dynamo_chat_nvext" else "vllm" + transport: RenderersTransport = normalize_renderer_transport( + getattr(self._config, "renderer_transport", None) ) return await generate( diff --git a/verifiers/types.py b/verifiers/types.py index c4190aee7..08db8fa68 100644 --- a/verifiers/types.py +++ b/verifiers/types.py @@ -56,10 +56,20 @@ "openai_responses", "anthropic_messages", ] -RendererTransport = Literal["prime_vllm_generate", "dynamo_chat_nvext"] +RendererTransport = Literal["vllm", "dynamo"] MessageType = Literal["chat", "completion"] # deprecated +def normalize_renderer_transport(value: object) -> RendererTransport: + if value is None: + return "vllm" + if not isinstance(value, str): + raise ValueError("renderer_transport must be a string") + if value in 
("vllm", "dynamo"): + return cast(RendererTransport, value) + raise ValueError("renderer_transport must be 'vllm' or 'dynamo'") + + # Provider-agnostic message + response types class CustomBaseModel(BaseModel): """Allow extras and dict-like attribute access.""" @@ -1016,7 +1026,7 @@ class ClientConfig(BaseModel): renderer: str = "auto" renderer_model_name: str | None = None renderer_pool_size: int | None = None - renderer_transport: RendererTransport = "prime_vllm_generate" + renderer_transport: RendererTransport = "vllm" tool_parser: str | None = None reasoning_parser: str | None = None preserve_all_thinking: bool = False @@ -1043,6 +1053,11 @@ class ClientConfig(BaseModel): def validate_extra_headers(cls, value: object) -> dict[str, str]: return _validate_extra_headers_value(value) + @field_validator("renderer_transport", mode="before") + @classmethod + def validate_renderer_transport(cls, value: object) -> RendererTransport: + return normalize_renderer_transport(value) + @field_validator("endpoint_configs", mode="before") @classmethod def validate_non_recursive_endpoints(cls, value):