From 3443fdff00c52a79831f7a790543062851179a0f Mon Sep 17 00:00:00 2001 From: eligotts <78387377+eligotts@users.noreply.github.com> Date: Fri, 22 May 2026 18:48:55 -0700 Subject: [PATCH] Pass renderer chat template kwargs from sampling --- tests/test_renderer_client.py | 54 ++++++++++++++++++++++++++++ verifiers/clients/renderer_client.py | 47 +++++++++++++++++++----- 2 files changed, 93 insertions(+), 8 deletions(-) diff --git a/tests/test_renderer_client.py b/tests/test_renderer_client.py index b959b26fb..aabcebe38 100644 --- a/tests/test_renderer_client.py +++ b/tests/test_renderer_client.py @@ -1,3 +1,4 @@ +import asyncio from functools import lru_cache from unittest.mock import patch @@ -46,6 +47,7 @@ def test_renderer_client_honors_configured_renderer_name(): size=1, tool_parser=None, reasoning_parser=None, + chat_template_kwargs={}, preserve_all_thinking=False, preserve_thinking_between_tool_calls=False, ) @@ -77,11 +79,63 @@ def test_renderer_client_uses_renderer_model_name_override(): size=1, tool_parser=None, reasoning_parser=None, + chat_template_kwargs={}, preserve_all_thinking=False, preserve_thinking_between_tool_calls=False, ) +def test_renderer_client_consumes_sampling_chat_template_kwargs(): + RendererClient._shared_pools.clear() + + client = object.__new__(RendererClient) + client._renderer = None + client._pool_size = 1 + client._config = vf.ClientConfig(client_type="renderer", renderer="qwen3") + client._client = object() # type: ignore[attr-defined] + + sentinel_pool = RendererPool.__new__(RendererPool) + captured: dict = {} + + async def _fake_generate(**kwargs): + captured.update(kwargs) + return {"content": "ok"} + + with ( + patch( + "verifiers.clients.renderer_client.create_renderer_pool", + return_value=sentinel_pool, + ) as create_pool_mock, + patch("verifiers.clients.renderer_client.generate", side_effect=_fake_generate), + ): + response = asyncio.run( + client.get_native_response( + prompt=[{"role": "user", "content": "hi"}], + model="Qwen/Qwen3-8B", + sampling_args={ + "extra_body": { + "chat_template_kwargs": {"enable_thinking": False}, + "top_k": 20, + } + }, + tools=None, + ) + ) + + assert response == {"content": "ok"} + create_pool_mock.assert_called_once_with( + "Qwen/Qwen3-8B", + renderer="qwen3", + size=1, + tool_parser=None, + reasoning_parser=None, + chat_template_kwargs={"enable_thinking": False}, + preserve_all_thinking=False, + preserve_thinking_between_tool_calls=False, + ) + assert captured["sampling_params"] == {"top_k": 20} + + # Provenance: Eli's review on PR #1068, comment 3150580768. # "RendererClient parses the GPT-OSS assistant tool call into ToolCall(name=...), # but ToolEnv returns ToolMessage with only content/tool_call_id, and diff --git a/verifiers/clients/renderer_client.py b/verifiers/clients/renderer_client.py index 10600d178..aebaaa350 100644 --- a/verifiers/clients/renderer_client.py +++ b/verifiers/clients/renderer_client.py @@ -405,6 +405,23 @@ def _parse_finish_reason(raw: str | None) -> FinishReason: return None +def _freeze_json_like(value: Any) -> Any: + if isinstance(value, Mapping): + return tuple(sorted((str(k), _freeze_json_like(v)) for k, v in value.items())) + if isinstance(value, list): + return tuple(_freeze_json_like(v) for v in value) + return value + + +def _pop_chat_template_kwargs(sampling_params: dict[str, Any]) -> dict[str, Any]: + raw = sampling_params.pop("chat_template_kwargs", None) + if raw is None: + return {} + if not isinstance(raw, Mapping): + raise ValueError("extra_body.chat_template_kwargs must be a mapping") + return dict(raw) + + class RendererClient( Client[AsyncOpenAI, list[RendererMessage], dict[str, Any], ToolSpec] ): @@ -418,13 +435,22 @@ class RendererClient( """ # Cache key is (renderer_model_name, renderer_name, tool_parser, - # reasoning_parser, pool_size, preserve_all_thinking, - # preserve_thinking_between_tool_calls) so that different parser configs, - # pool sizes, or preserve-thinking bindings for the same model don't - # collide. + # reasoning_parser, pool_size, chat_template_kwargs, + # preserve_all_thinking, preserve_thinking_between_tool_calls) so that + # different parser configs, pool sizes, template kwargs, or + # preserve-thinking bindings for the same model don't collide. _shared_pools: ClassVar[ dict[ - tuple[str, str, str | None, str | None, int, bool, bool], + tuple[ + str, + str, + str | None, + str | None, + int, + Any, + bool, + bool, + ], RendererPool, ] ] = {} @@ -451,7 +477,9 @@ async def close(self) -> None: # ── Renderer management ───────────────────────────────────────── - def _get_renderer_or_pool(self, model: str) -> Renderer | RendererPool: + def _get_renderer_or_pool( + self, model: str, chat_template_kwargs: dict[str, Any] | None = None + ) -> Renderer | RendererPool: if self._renderer is not None: return self._renderer @@ -473,12 +501,14 @@ def _get_renderer_or_pool(self, model: str) -> Renderer | RendererPool: if self._config is not None else False ) + renderer_chat_template_kwargs = dict(chat_template_kwargs or {}) cache_key = ( renderer_model, renderer_name, tool_parser, reasoning_parser, self._pool_size, + _freeze_json_like(renderer_chat_template_kwargs), preserve_all_thinking, preserve_thinking_between_tool_calls, ) @@ -491,6 +521,7 @@ def _get_renderer_or_pool(self, model: str) -> Renderer | RendererPool: size=self._pool_size, tool_parser=tool_parser, reasoning_parser=reasoning_parser, + chat_template_kwargs=renderer_chat_template_kwargs, preserve_all_thinking=preserve_all_thinking, preserve_thinking_between_tool_calls=preserve_thinking_between_tool_calls, ) @@ -528,10 +559,10 @@ async def get_native_response( tools: list[ToolSpec] | None = None, **kwargs: Any, ) -> dict[str, Any]: - renderer = self._get_renderer_or_pool(model) - args = dict(sampling_args) sampling_params: dict[str, Any] = dict(args.pop("extra_body", None) or {}) + chat_template_kwargs = _pop_chat_template_kwargs(sampling_params) + renderer = self._get_renderer_or_pool(model, chat_template_kwargs) for key in ( "temperature", "top_p",