From 5f653df9acecebb4cb99e3d70aff60969f35d4b3 Mon Sep 17 00:00:00 2001
From: Sebastian Mueller
Date: Mon, 25 May 2026 13:15:20 +0530
Subject: [PATCH] feat(client): RENDERERS_MAX_PROMPT_LEN env override for
 pre-flight cap

Adds an env-var escape hatch for the pre-flight overflow check in
`_resolve_max_prompt_len`. When `RENDERERS_MAX_PROMPT_LEN` is set to a
positive integer, that value is returned directly and `/v1/models` is
not queried.

Motivation: routers/gateways whose `/v1/models` handler is broken
(observed with vllm-router v0.1.22 under
`--intra-node-data-parallel-size` > 1) silently disable the pre-flight
via the cached-`None` path, which lets overlong prompts reach the engine
and crash the orchestrator with a raw `ValueError`. Operators who know
the real cap can now set the env var to restore pre-flight without
touching the broken endpoint.

Invalid values (non-integer, <= 0) are logged once per distinct value
and ignored, falling back to the existing auto-discovery path. The
override is read before the cache on every `_resolve_max_prompt_len`
call, so logging unconditionally would repeat the warning per call.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 renderers/client.py  | 52 ++++++++++++++++++++++++++++++++
 tests/test_client.py | 70 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 122 insertions(+)

diff --git a/renderers/client.py b/renderers/client.py
index 0c63c0e..98a5b5c 100644
--- a/renderers/client.py
+++ b/renderers/client.py
@@ -14,6 +14,7 @@
 import asyncio
 import json
 import logging
+import os
 from collections.abc import Mapping
 from typing import Any, cast
 
@@ -66,6 +67,48 @@ def __init__(self, *, prompt_len: int, max_prompt_len: int) -> None:
 _max_prompt_len_lock = asyncio.Lock()
 
 
+_MAX_PROMPT_LEN_ENV_VAR = "RENDERERS_MAX_PROMPT_LEN"
+# Raw override values already reported as invalid. The override is read on
+# every ``_resolve_max_prompt_len`` call (before the cache), so without this
+# a bad value would emit the same warning on each call.
+_warned_max_prompt_len_env_values: set[str] = set()
+
+
+def _max_prompt_len_from_env() -> int | None:
+    """Read the ``RENDERERS_MAX_PROMPT_LEN`` env-var override.
+
+    Returns the parsed positive int, or ``None`` if unset/invalid. When
+    set, ``_resolve_max_prompt_len`` returns this value without querying
+    the engine — useful when the engine's ``/v1/models`` endpoint is
+    broken (e.g., a misconfigured router) but the operator knows the
+    real ``max_model_len``.
+
+    An invalid value is warned about once per distinct raw value.
+    """
+    raw = os.environ.get(_MAX_PROMPT_LEN_ENV_VAR)
+    if not raw:
+        return None
+    try:
+        value: int | None = int(raw)
+    except ValueError:
+        value = None
+    if value is not None and value > 0:
+        return value
+    if raw not in _warned_max_prompt_len_env_values:
+        _warned_max_prompt_len_env_values.add(raw)
+        if value is None:
+            _request_logger.warning(
+                "%s is set to %r, which is not a valid integer; ignoring.",
+                _MAX_PROMPT_LEN_ENV_VAR,
+                raw,
+            )
+        else:
+            _request_logger.warning(
+                "%s=%d is not positive; ignoring.", _MAX_PROMPT_LEN_ENV_VAR, value
+            )
+    return None
+
+
 async def _resolve_max_prompt_len(client: AsyncOpenAI, model: str) -> int | None:
     """Discover ``max_model_len`` from the engine via ``GET /v1/models``.
 
@@ -80,7 +123,16 @@ async def _resolve_max_prompt_len(client: AsyncOpenAI, model: str) -> int | None
     Any exception during lookup (network error, non-JSON body, attribute
     miss on a mock client in tests) is treated as "unknown cap": cached
     ``None`` so we don't retry on every call.
+
+    The ``RENDERERS_MAX_PROMPT_LEN`` env var overrides auto-discovery
+    entirely: when set to a positive integer, that value is returned and
+    no ``/v1/models`` request is issued. Use this when the engine's model
+    card is unreachable (e.g., a router whose ``/v1/models`` handler is
+    broken) but the operator knows the engine's real cap.
     """
+    override = _max_prompt_len_from_env()
+    if override is not None:
+        return override
     key = (str(getattr(client, "base_url", "")), model)
     if key in _max_prompt_len_cache:
         return _max_prompt_len_cache[key]
diff --git a/tests/test_client.py b/tests/test_client.py
index 1cc1000..1fe7bc0 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -502,3 +502,73 @@ def test_generate_caches_max_prompt_len_lookup_failure():
     assert len(client.calls) == 1
     assert result["prompt_ids"] == list(range(10))
     assert _max_prompt_len_cache[("http://no-models:8000/v1", "test-model")] is None
+
+
+def test_generate_uses_env_max_prompt_len_override(monkeypatch):
+    """``RENDERERS_MAX_PROMPT_LEN`` overrides auto-discovery entirely: the
+    pre-flight uses the env value and ``/v1/models`` is never queried,
+    even when the engine would have returned a different cap."""
+    from renderers.client import OverlongPromptError, _max_prompt_len_cache
+
+    class _ClientWithModels(_FakeClient):
+        def __init__(self):
+            super().__init__()
+            self.base_url = "http://disco-host:8000/v1"
+            self.models_calls = 0
+
+        async def get(self, path, *, cast_to):
+            self.models_calls += 1
+            return {"object": "list", "data": [{"id": "test-model", "max_model_len": 999}]}
+
+    _max_prompt_len_cache.clear()
+    monkeypatch.setenv("RENDERERS_MAX_PROMPT_LEN", "4")
+
+    client = _ClientWithModels()
+    with pytest.raises(OverlongPromptError) as excinfo:
+        asyncio.run(
+            generate(
+                client=client,
+                renderer=_LongRenderer(),
+                messages=[{"role": "user", "content": "hi"}],
+                model="test-model",
+            )
+        )
+
+    assert excinfo.value.max_prompt_len == 4, "env override beats engine card"
+    assert excinfo.value.prompt_len == 10
+    assert client.models_calls == 0, "env override must skip /v1/models query"
+    assert client.calls == [], "request must not be dispatched on pre-flight fail"
+
+
+@pytest.mark.parametrize("bad_value", ["not-an-int", "0", "-5"])
+def test_generate_env_override_invalid_falls_back_to_auto_discovery(monkeypatch, bad_value):
+    """A non-integer or non-positive ``RENDERERS_MAX_PROMPT_LEN`` is ignored
+    (with a warning) and auto-discovery proceeds normally."""
+    from renderers.client import OverlongPromptError, _max_prompt_len_cache
+
+    class _ClientWithModels(_FakeClient):
+        def __init__(self):
+            super().__init__()
+            self.base_url = "http://disco-host-2:8000/v1"
+            self.models_calls = 0
+
+        async def get(self, path, *, cast_to):
+            self.models_calls += 1
+            return {"object": "list", "data": [{"id": "test-model", "max_model_len": 4}]}
+
+    _max_prompt_len_cache.clear()
+    monkeypatch.setenv("RENDERERS_MAX_PROMPT_LEN", bad_value)
+
+    client = _ClientWithModels()
+    with pytest.raises(OverlongPromptError) as excinfo:
+        asyncio.run(
+            generate(
+                client=client,
+                renderer=_LongRenderer(),
+                messages=[{"role": "user", "content": "hi"}],
+                model="test-model",
+            )
+        )
+
+    assert excinfo.value.max_prompt_len == 4, "auto-discovered cap should win"
+    assert client.models_calls == 1, "invalid env override must not skip /v1/models"