From 196476a7cc1842652e54f3baf452da5641b8ed41 Mon Sep 17 00:00:00 2001 From: AmeenP Date: Fri, 8 May 2026 02:50:10 -0700 Subject: [PATCH 1/6] feat: add Dynamo chat nvext transport --- renderers/__init__.py | 2 + renderers/client.py | 114 ++++++++++++++++++++++++++++++------------ tests/test_client.py | 68 ++++++++++++++++++++++++- 3 files changed, 152 insertions(+), 32 deletions(-) diff --git a/renderers/__init__.py b/renderers/__init__.py index 62bc666..9c0f0c4 100644 --- a/renderers/__init__.py +++ b/renderers/__init__.py @@ -36,6 +36,7 @@ reject_assistant_in_extension, trim_to_turn_close, ) +from renderers.client import RendererTransport from renderers.deepseek_v3 import DeepSeekV3Renderer from renderers.default import DefaultRenderer from renderers.glm5 import GLM5Renderer @@ -80,6 +81,7 @@ "RenderedTokens", "Renderer", "RendererPool", + "RendererTransport", "TextPart", "ThinkingPart", "ToolCall", diff --git a/renderers/client.py b/renderers/client.py index 902e148..879425a 100644 --- a/renderers/client.py +++ b/renderers/client.py @@ -14,7 +14,7 @@ import asyncio import base64 import logging -from typing import Any, cast +from typing import Any, Literal, cast import numpy as np from openai import AsyncOpenAI, BadRequestError @@ -28,6 +28,8 @@ ToolSpec, ) +RendererTransport = Literal["vllm", "dynamo"] + _request_logger = logging.getLogger("renderers.client") @@ -58,12 +60,13 @@ async def generate( cache_salt: str | None = None, priority: int | None = None, extra_headers: dict[str, str] | None = None, + transport: RendererTransport = "vllm", ) -> dict[str, Any]: - """Tokenize messages, call vLLM /inference/v1/generate, parse the response. + """Tokenize messages, call the selected token-in backend, parse response. - ``sampling_params`` is forwarded to vLLM verbatim. Two fields are always - set by us and override caller values: ``stop_token_ids`` (from the - renderer) and ``logprobs=1`` (we always emit completion_logprobs). Pass + ``sampling_params`` is forwarded to the selected token-in backend. Two + fields are always set by us and override caller values: stop token IDs + from the renderer and ``logprobs=1`` (we always emit completion_logprobs). Pass ``prompt_ids`` to skip rendering and use a prebuilt token sequence — pair it with ``multi_modal_data`` when the prebuilt prompt has image / video placeholders that need engine-side mm payload. @@ -101,31 +104,65 @@ def _prepare(): sp["logprobs"] = 1 sp.setdefault("skip_special_tokens", False) - body: dict[str, Any] = { - "model": model, - "token_ids": prompt_ids, - "sampling_params": sp, - } - features = ( - _build_mm_features(renderer, mm_data) - if mm_data and not mm_data.is_empty() - else None - ) - if features is not None: - body["features"] = features - if cache_salt is not None: - body["cache_salt"] = cache_salt - if priority is not None: - body["priority"] = priority - - # /inference/v1/generate is mounted at the server root, not under /v1 - # like the OpenAI-compatible endpoints. Build an absolute URL so the - # AsyncOpenAI client doesn't prepend its automatic /v1. - base = str(client.base_url).rstrip("/").removesuffix("/v1") - endpoint = f"{base}/inference/v1/generate" + if transport == "vllm": + body: dict[str, Any] = { + "model": model, + "token_ids": prompt_ids, + "sampling_params": sp, + } + features = ( + _build_mm_features(renderer, mm_data) + if mm_data and not mm_data.is_empty() + else None + ) + if features is not None: + body["features"] = features + if cache_salt is not None: + body["cache_salt"] = cache_salt + if priority is not None: + body["priority"] = priority + + # /inference/v1/generate is mounted at the server root, not under /v1 + # like the OpenAI-compatible endpoints. Build an absolute URL so the + # AsyncOpenAI client doesn't prepend its automatic /v1. + base = str(client.base_url).rstrip("/").removesuffix("/v1") + endpoint = f"{base}/inference/v1/generate" + elif transport == "dynamo": + nvext: dict[str, Any] = { + "token_data": prompt_ids, + "extra_fields": ["completion_token_ids"], + } + if priority is not None: + nvext["agent_hints"] = {"priority": priority} + + body = { + "model": model, + "messages": [{"role": "user", "content": "(token-in mode)"}], + "stream": False, + "logprobs": True, + "stop": stop_token_ids, + "nvext": nvext, + } + if cache_salt is not None: + body["cache_salt"] = cache_salt + + passthrough = dict(sp) + passthrough.pop("stop_token_ids", None) + passthrough.pop("stop", None) + passthrough.pop("logprobs", None) + passthrough.pop("skip_special_tokens", None) + max_tokens = passthrough.pop("max_tokens", None) + if max_tokens is not None: + body["max_completion_tokens"] = max_tokens + body.update({k: v for k, v in passthrough.items() if v is not None}) + endpoint = "/chat/completions" + else: + raise ValueError(f"Unsupported renderer transport: {transport}") + _request_logger.debug( - "POST %s prompt_len=%d max_tokens=%s", + "POST %s transport=%s prompt_len=%d max_tokens=%s", endpoint, + transport, len(prompt_ids), sp.get("max_tokens"), ) @@ -147,7 +184,23 @@ def _prepare(): raise choice = (data.get("choices") or [{}])[0] - completion_ids = choice.get("token_ids") or [] + if transport == "dynamo": + completion_ids = ( + choice.get("token_ids") + or choice.get("nvext", {}).get("completion_token_ids") + or data.get("nvext", {}).get("completion_token_ids") + or [] + ) + raw_re = ( + choice.get("routed_experts") + or choice.get("nvext", {}).get("routed_experts") + or data.get("nvext", {}).get("routed_experts") + ) + request_id = data.get("id") or data.get("request_id") or "" + else: + completion_ids = choice.get("token_ids") or [] + raw_re = choice.get("routed_experts") + request_id = data.get("request_id") or "" parsed = await _maybe_offload( renderer, lambda: renderer.parse_response(completion_ids, tools=tools) @@ -159,7 +212,6 @@ def _prepare(): completion_logprobs = [float(c.get("logprob") or 0.0) for c in content_lp or []] routed_experts = None - raw_re = choice.get("routed_experts") if isinstance(raw_re, dict) and "data" in raw_re and "shape" in raw_re: routed_experts = ( np.frombuffer(base64.b85decode(raw_re["data"]), dtype=np.int32) @@ -183,7 +235,7 @@ def _prepare(): finish_reason = "tool_calls" return { - "request_id": data.get("request_id") or "", + "request_id": request_id, "prompt_ids": list(prompt_ids), "completion_ids": list(completion_ids), "completion_logprobs": completion_logprobs, diff --git a/tests/test_client.py b/tests/test_client.py index d724234..eb2d3f2 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -54,14 +54,17 @@ class _FakeClient: URL off ``client.base_url``, so we expose one that includes the /v1 suffix the OpenAI SDK normally appends.""" - def __init__(self): + def __init__(self, response=None): self.calls = [] self.base_url = "http://fake-host:8000/v1" + self.response = response async def post(self, path, *, cast_to=dict, body=None, options=None): self.calls.append( {"path": path, "cast_to": cast_to, "body": body, "options": options} ) + if self.response is not None: + return self.response routed_experts = np.array([[[1]], [[2]]], dtype=np.int32) return { "request_id": "gen-test", @@ -291,3 +294,66 @@ def test_generate_serializes_multimodal_features_for_qwen3_vl(): # Items are base64 strings (encode_mm_kwargs_item output). for item in features["kwargs_data"]["image"]: assert isinstance(item, str) and len(item) > 0 + + +def test_generate_can_use_dynamo_transport(): + client = _FakeClient( + response={ + "id": "chatcmpl-test", + "model": "test-model", + "nvext": {"completion_token_ids": [7, 8]}, + "choices": [ + { + "logprobs": { + "content": [ + {"token": "token_id:7", "logprob": -0.1}, + {"token": "token_id:8", "logprob": -0.2}, + ] + }, + "finish_reason": "stop", + } + ], + } + ) + + result = asyncio.run( + generate( + client=client, + renderer=_FakeRenderer(), + messages=[{"role": "user", "content": "hi"}], + model="test-model", + tools=[{"type": "function", "function": {"name": "echo"}}], + sampling_params={ + "temperature": 0.3, + "max_tokens": 7, + "min_tokens": 2, + "stop": "caller-stop", + }, + priority=4, + cache_salt="ckpt-42", + transport="dynamo", + ) + ) + + assert client.calls[0]["path"] == "/chat/completions" + assert client.calls[0]["body"] == { + "model": "test-model", + "messages": [{"role": "user", "content": "(token-in mode)"}], + "stream": False, + "logprobs": True, + "stop": [99], + "nvext": { + "token_data": [1, 2, 3], + "extra_fields": ["completion_token_ids"], + "agent_hints": {"priority": 4}, + }, + "cache_salt": "ckpt-42", + "max_completion_tokens": 7, + "temperature": 0.3, + "min_tokens": 2, + } + assert result["request_id"] == "chatcmpl-test" + assert result["prompt_ids"] == [1, 2, 3] + assert result["completion_ids"] == [7, 8] + assert result["completion_logprobs"] == [-0.1, -0.2] + assert result["finish_reason"] == "tool_calls" From 0b89024bcbe5bd57c69dbb55aecd82b6ddc5f143 Mon Sep 17 00:00:00 2001 From: AmeenP Date: Wed, 13 May 2026 16:10:48 -0700 Subject: [PATCH 2/6] fix: omit empty Dynamo stop ids --- renderers/client.py | 3 ++- tests/test_client.py | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/renderers/client.py b/renderers/client.py index 879425a..a4daab7 100644 --- a/renderers/client.py +++ b/renderers/client.py @@ -140,9 +140,10 @@ def _prepare(): "messages": [{"role": "user", "content": "(token-in mode)"}], "stream": False, "logprobs": True, - "stop": stop_token_ids, "nvext": nvext, } + if stop_token_ids: + body["stop"] = stop_token_ids if cache_salt is not None: body["cache_salt"] = cache_salt diff --git a/tests/test_client.py b/tests/test_client.py index eb2d3f2..b3899f2 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -49,6 +49,11 @@ def parse_response( ) +class _NoStopRenderer(_FakeRenderer): + def get_stop_token_ids(self): + return [] + + class _FakeClient: """Mocks AsyncOpenAI's `.post()`. The renderer client builds an absolute URL off ``client.base_url``, so we expose one that includes the /v1 suffix @@ -357,3 +362,34 @@ def test_generate_can_use_dynamo_transport(): assert result["completion_ids"] == [7, 8] assert result["completion_logprobs"] == [-0.1, -0.2] assert result["finish_reason"] == "tool_calls" + + +def test_generate_dynamo_omits_empty_stop_token_ids(): + client = _FakeClient( + response={ + "id": "chatcmpl-test", + "model": "test-model", + "nvext": {"completion_token_ids": [7, 8]}, + "choices": [{"finish_reason": "stop"}], + } + ) + + asyncio.run( + generate( + client=client, + renderer=_NoStopRenderer(), + messages=[{"role": "user", "content": "hi"}], + model="test-model", + tools=[{"type": "function", "function": {"name": "echo"}}], + sampling_params={ + "max_tokens": 7, + "stop": "caller-stop", + "stop_token_ids": [123], + }, + transport="dynamo", + ) + ) + + body = client.calls[0]["body"] + assert "stop" not in body + assert "stop_token_ids" not in body From 69a78bd779a78989c3b809dc685ded6f49ebd278 Mon Sep 17 00:00:00 2001 From: AmeenP Date: Fri, 15 May 2026 04:48:26 -0700 Subject: [PATCH 3/6] fix: align Dynamo renderer transport naming Signed-off-by: AmeenP --- renderers/client.py | 52 ++++++++++++++++++++++++++++---------- tests/test_client.py | 60 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 96 insertions(+), 16 deletions(-) diff --git a/renderers/client.py b/renderers/client.py index a4daab7..42aadf8 100644 --- a/renderers/client.py +++ b/renderers/client.py @@ -1,7 +1,9 @@ -"""Renderer-based generate client for vLLM 0.20's /inference/v1/generate. +"""Renderer-based generate client for vLLM 0.20 and Dynamo token-in routes. - messages → Renderer.render_ids() → token IDs → POST /inference/v1/generate - → completion tokens → Renderer.parse_response() → structured message +Two transports are selected per call: + + "prime_vllm_generate" → POST /inference/v1/generate + "dynamo_chat_nvext" → POST /chat/completions with nvext.token_data When a RendererPool is passed instead of a single Renderer, the sync tokenization and parsing work is offloaded to threads for parallel execution across rollouts. @@ -28,7 +30,7 @@ ToolSpec, ) -RendererTransport = Literal["vllm", "dynamo"] +RendererTransport = Literal["prime_vllm_generate", "dynamo_chat_nvext"] _request_logger = logging.getLogger("renderers.client") @@ -60,7 +62,7 @@ async def generate( cache_salt: str | None = None, priority: int | None = None, extra_headers: dict[str, str] | None = None, - transport: RendererTransport = "vllm", + transport: RendererTransport = "prime_vllm_generate", ) -> dict[str, Any]: """Tokenize messages, call the selected token-in backend, parse response. @@ -104,7 +106,7 @@ def _prepare(): sp["logprobs"] = 1 sp.setdefault("skip_special_tokens", False) - if transport == "vllm": + if transport == "prime_vllm_generate": body: dict[str, Any] = { "model": model, "token_ids": prompt_ids, @@ -127,7 +129,12 @@ def _prepare(): # AsyncOpenAI client doesn't prepend its automatic /v1. base = str(client.base_url).rstrip("/").removesuffix("/v1") endpoint = f"{base}/inference/v1/generate" - elif transport == "dynamo": + elif transport == "dynamo_chat_nvext": + if mm_data and not mm_data.is_empty(): + raise NotImplementedError( + "Multimodal renderers are not yet supported on the " + "dynamo_chat_nvext transport." + ) nvext: dict[str, Any] = { "token_data": prompt_ids, "extra_fields": ["completion_token_ids"], @@ -142,10 +149,12 @@ def _prepare(): "logprobs": True, "nvext": nvext, } + if tools: + body["tools"] = tools if stop_token_ids: body["stop"] = stop_token_ids if cache_salt is not None: - body["cache_salt"] = cache_salt + nvext["cache_salt"] = cache_salt passthrough = dict(sp) passthrough.pop("stop_token_ids", None) @@ -185,17 +194,25 @@ def _prepare(): raise choice = (data.get("choices") or [{}])[0] - if transport == "dynamo": + if transport == "dynamo_chat_nvext": + choice_nvext = choice.get("nvext") or {} + response_nvext = data.get("nvext") or {} + choice_engine_data = choice_nvext.get("engine_data") or {} + response_engine_data = response_nvext.get("engine_data") or {} completion_ids = ( choice.get("token_ids") - or choice.get("nvext", {}).get("completion_token_ids") - or data.get("nvext", {}).get("completion_token_ids") + or choice_nvext.get("completion_token_ids") + or response_nvext.get("completion_token_ids") + or choice_engine_data.get("completion_token_ids") + or response_engine_data.get("completion_token_ids") or [] ) raw_re = ( choice.get("routed_experts") - or choice.get("nvext", {}).get("routed_experts") - or data.get("nvext", {}).get("routed_experts") + or choice_nvext.get("routed_experts") + or response_nvext.get("routed_experts") + or choice_engine_data.get("routed_experts") + or response_engine_data.get("routed_experts") ) request_id = data.get("id") or data.get("request_id") or "" else: @@ -211,6 +228,15 @@ def _prepare(): raw_logprobs = choice.get("logprobs") or {} content_lp = raw_logprobs.get("content") if isinstance(raw_logprobs, dict) else None completion_logprobs = [float(c.get("logprob") or 0.0) for c in content_lp or []] + if not completion_logprobs and transport == "dynamo_chat_nvext": + choice_nvext = choice.get("nvext") or {} + response_nvext = data.get("nvext") or {} + engine_logprobs = ( + (choice_nvext.get("engine_data") or {}).get("completion_logprobs") + or (response_nvext.get("engine_data") or {}).get("completion_logprobs") + or [] + ) + completion_logprobs = [float(logprob) for logprob in engine_logprobs] routed_experts = None if isinstance(raw_re, dict) and "data" in raw_re and "shape" in raw_re: diff --git a/tests/test_client.py b/tests/test_client.py index b3899f2..1979a24 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -2,8 +2,10 @@ import base64 import numpy as np +import pytest from renderers.base import ( + MultiModalData, ParsedResponse, ParsedToolCall, RenderedTokens, @@ -336,7 +338,7 @@ def test_generate_can_use_dynamo_transport(): }, priority=4, cache_salt="ckpt-42", - transport="dynamo", + transport="dynamo_chat_nvext", ) ) @@ -347,12 +349,13 @@ def test_generate_can_use_dynamo_transport(): "stream": False, "logprobs": True, "stop": [99], + "tools": [{"type": "function", "function": {"name": "echo"}}], "nvext": { "token_data": [1, 2, 3], "extra_fields": ["completion_token_ids"], "agent_hints": {"priority": 4}, + "cache_salt": "ckpt-42", }, - "cache_salt": "ckpt-42", "max_completion_tokens": 7, "temperature": 0.3, "min_tokens": 2, @@ -386,10 +389,61 @@ def test_generate_dynamo_omits_empty_stop_token_ids(): "stop": "caller-stop", "stop_token_ids": [123], }, - transport="dynamo", + transport="dynamo_chat_nvext", ) ) body = client.calls[0]["body"] assert "stop" not in body assert "stop_token_ids" not in body + + +def test_generate_dynamo_reads_engine_data_fallback(): + client = _FakeClient( + response={ + "id": "chatcmpl-test", + "model": "test-model", + "nvext": { + "engine_data": { + "completion_token_ids": [7, 8], + "completion_logprobs": [-0.3, -0.4], + } + }, + "choices": [{"finish_reason": "stop"}], + } + ) + + result = asyncio.run( + generate( + client=client, + renderer=_FakeRenderer(), + messages=[{"role": "user", "content": "hi"}], + model="test-model", + prompt_ids=[1, 2, 3], + transport="dynamo_chat_nvext", + ) + ) + + assert result["completion_ids"] == [7, 8] + assert result["completion_logprobs"] == [-0.3, -0.4] + + +class _MultiModalRenderer(_FakeRenderer): + def render(self, messages, *, tools=None, add_generation_prompt=False): + return RenderedTokens( + token_ids=[1, 2, 3], + multi_modal_data=MultiModalData(mm_hashes={"image": ["aaa"]}), + ) + + +def test_generate_dynamo_rejects_multimodal_sidecar(): + with pytest.raises(NotImplementedError, match="dynamo_chat_nvext"): + asyncio.run( + generate( + client=_FakeClient(), + renderer=_MultiModalRenderer(), + messages=[{"role": "user", "content": "hi"}], + model="test-model", + transport="dynamo_chat_nvext", + ) + ) From 310b48ae017a881e11a2cc73ebd9018bb5559f46 Mon Sep 17 00:00:00 2001 From: AmeenP Date: Fri, 15 May 2026 05:07:56 -0700 Subject: [PATCH 4/6] fix: restore renderer transport names Signed-off-by: AmeenP --- renderers/client.py | 19 +++++++++---------- tests/test_client.py | 10 +++++----- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/renderers/client.py b/renderers/client.py index 42aadf8..080d01b 100644 --- a/renderers/client.py +++ b/renderers/client.py @@ -2,8 +2,8 @@ Two transports are selected per call: - "prime_vllm_generate" → POST /inference/v1/generate - "dynamo_chat_nvext" → POST /chat/completions with nvext.token_data + "vllm" → POST /inference/v1/generate + "dynamo" → POST /chat/completions with nvext.token_data When a RendererPool is passed instead of a single Renderer, the sync tokenization and parsing work is offloaded to threads for parallel execution across rollouts. @@ -30,7 +30,7 @@ ToolSpec, ) -RendererTransport = Literal["prime_vllm_generate", "dynamo_chat_nvext"] +RendererTransport = Literal["vllm", "dynamo"] _request_logger = logging.getLogger("renderers.client") @@ -62,7 +62,7 @@ async def generate( cache_salt: str | None = None, priority: int | None = None, extra_headers: dict[str, str] | None = None, - transport: RendererTransport = "prime_vllm_generate", + transport: RendererTransport = "vllm", ) -> dict[str, Any]: """Tokenize messages, call the selected token-in backend, parse response. @@ -106,7 +106,7 @@ def _prepare(): sp["logprobs"] = 1 sp.setdefault("skip_special_tokens", False) - if transport == "prime_vllm_generate": + if transport == "vllm": body: dict[str, Any] = { "model": model, "token_ids": prompt_ids, @@ -129,11 +129,10 @@ def _prepare(): # AsyncOpenAI client doesn't prepend its automatic /v1. base = str(client.base_url).rstrip("/").removesuffix("/v1") endpoint = f"{base}/inference/v1/generate" - elif transport == "dynamo_chat_nvext": + elif transport == "dynamo": if mm_data and not mm_data.is_empty(): raise NotImplementedError( - "Multimodal renderers are not yet supported on the " - "dynamo_chat_nvext transport." + "Multimodal renderers are not yet supported on the dynamo transport." ) nvext: dict[str, Any] = { "token_data": prompt_ids, @@ -194,7 +193,7 @@ def _prepare(): raise choice = (data.get("choices") or [{}])[0] - if transport == "dynamo_chat_nvext": + if transport == "dynamo": choice_nvext = choice.get("nvext") or {} response_nvext = data.get("nvext") or {} choice_engine_data = choice_nvext.get("engine_data") or {} @@ -228,7 +227,7 @@ def _prepare(): raw_logprobs = choice.get("logprobs") or {} content_lp = raw_logprobs.get("content") if isinstance(raw_logprobs, dict) else None completion_logprobs = [float(c.get("logprob") or 0.0) for c in content_lp or []] - if not completion_logprobs and transport == "dynamo_chat_nvext": + if not completion_logprobs and transport == "dynamo": choice_nvext = choice.get("nvext") or {} response_nvext = data.get("nvext") or {} engine_logprobs = ( diff --git a/tests/test_client.py b/tests/test_client.py index 1979a24..50f6ad3 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -338,7 +338,7 @@ def test_generate_can_use_dynamo_transport(): }, priority=4, cache_salt="ckpt-42", - transport="dynamo_chat_nvext", + transport="dynamo", ) ) @@ -389,7 +389,7 @@ def test_generate_dynamo_omits_empty_stop_token_ids(): "stop": "caller-stop", "stop_token_ids": [123], }, - transport="dynamo_chat_nvext", + transport="dynamo", ) ) @@ -420,7 +420,7 @@ def test_generate_dynamo_reads_engine_data_fallback(): messages=[{"role": "user", "content": "hi"}], model="test-model", prompt_ids=[1, 2, 3], - transport="dynamo_chat_nvext", + transport="dynamo", ) ) @@ -437,13 +437,13 @@ def render(self, messages, *, tools=None, add_generation_prompt=False): def test_generate_dynamo_rejects_multimodal_sidecar(): - with pytest.raises(NotImplementedError, match="dynamo_chat_nvext"): + with pytest.raises(NotImplementedError, match="dynamo transport"): asyncio.run( generate( client=_FakeClient(), renderer=_MultiModalRenderer(), messages=[{"role": "user", "content": "hi"}], model="test-model", - transport="dynamo_chat_nvext", + transport="dynamo", ) ) From e2cfa59e83e9eaaf2094d5d00e42e8d85ac6ac92 Mon Sep 17 00:00:00 2001 From: AmeenP Date: Fri, 15 May 2026 05:20:19 -0700 Subject: [PATCH 5/6] fix: use rl-sdk Dynamo nvext wire shape Signed-off-by: AmeenP --- renderers/client.py | 4 ++-- tests/test_client.py | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/renderers/client.py b/renderers/client.py index 080d01b..93bcbc3 100644 --- a/renderers/client.py +++ b/renderers/client.py @@ -136,7 +136,7 @@ def _prepare(): ) nvext: dict[str, Any] = { "token_data": prompt_ids, - "extra_fields": ["completion_token_ids"], + "extra_fields": ["engine_data"], } if priority is not None: nvext["agent_hints"] = {"priority": priority} @@ -151,7 +151,7 @@ def _prepare(): if tools: body["tools"] = tools if stop_token_ids: - body["stop"] = stop_token_ids + body["stop_token_ids"] = stop_token_ids if cache_salt is not None: nvext["cache_salt"] = cache_salt diff --git a/tests/test_client.py b/tests/test_client.py index 50f6ad3..3ade6f5 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -308,7 +308,11 @@ def test_generate_can_use_dynamo_transport(): response={ "id": "chatcmpl-test", "model": "test-model", - "nvext": {"completion_token_ids": [7, 8]}, + "nvext": { + "engine_data": { + "completion_token_ids": [7, 8], + } + }, "choices": [ { "logprobs": { @@ -348,11 +352,11 @@ def test_generate_can_use_dynamo_transport(): "messages": [{"role": "user", "content": "(token-in mode)"}], "stream": False, "logprobs": True, - "stop": [99], + "stop_token_ids": [99], "tools": [{"type": "function", "function": {"name": "echo"}}], "nvext": { "token_data": [1, 2, 3], - "extra_fields": ["completion_token_ids"], + "extra_fields": ["engine_data"], "agent_hints": {"priority": 4}, "cache_salt": "ckpt-42", }, @@ -372,7 +376,11 @@ def test_generate_dynamo_omits_empty_stop_token_ids(): response={ "id": "chatcmpl-test", "model": "test-model", - "nvext": {"completion_token_ids": [7, 8]}, + "nvext": { + "engine_data": { + "completion_token_ids": [7, 8], + } + }, "choices": [{"finish_reason": "stop"}], } ) From 17005dd79f031f7993fb8e0b26b52aff346ad07e Mon Sep 17 00:00:00 2001 From: AmeenP Date: Fri, 15 May 2026 05:36:57 -0700 Subject: [PATCH 6/6] fix: make renderers git builds ignore invalid dev tags Signed-off-by: AmeenP --- pyproject.toml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f511b65..2ff5ddb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,15 +30,19 @@ dependencies = [ source = "vcs" # Tags look like ``renderers-v0.1.8`` (prefix matches the publish.yml # release contract); strip the prefix to get a PEP 440 version. The -# regex accepts any PEP 440-valid suffix after the prefix so we can -# tag pre-releases like ``renderers-v0.2.0rc1`` later if needed. -tag-pattern = '^renderers-v(?P.+)$' +# The regex intentionally ignores historical ``.devN`` tags except +# ``.dev0``. hatch-vcs uses distance-from-tag to calculate dev versions, +# so anchoring at ``.dev1`` or later makes git dependency builds fail. +tag-pattern = '^renderers-v(?P\d+\.\d+\.\d+(?:(?:a|b|rc)\d+|\.dev0)?)$' # Used when building from a context without VCS metadata (e.g. an # sdist consumed by a downstream that doesn't ship .git). Real # builds from a checkout get the resolved version; this fallback # only fires when the resolver has nothing to go on. fallback-version = "0.0.0" +[tool.hatch.version.raw-options] +git_describe_command = "git describe --dirty --tags --long --match renderers-v[0-9]* --exclude renderers-v*.dev[1-9]*" + [tool.hatch.build.hooks.vcs] # Write the resolved version to a Python file so it can be inspected # at runtime via ``renderers.__version__`` without re-parsing the