Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions renderers/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import asyncio
import json
import logging
import os
from collections.abc import Mapping
from typing import Any, cast

Expand Down Expand Up @@ -66,6 +67,38 @@ def __init__(self, *, prompt_len: int, max_prompt_len: int) -> None:
_max_prompt_len_lock = asyncio.Lock()


_MAX_PROMPT_LEN_ENV_VAR = "RENDERERS_MAX_PROMPT_LEN"


# Raw env-var values that have already been reported as unusable. Keyed on the
# raw string so that a corrected (or differently wrong) value is still
# validated and reported, but the same bad value is only logged once.
_warned_max_prompt_len_values: set[str] = set()


def _max_prompt_len_from_env() -> int | None:
    """Read the ``RENDERERS_MAX_PROMPT_LEN`` env-var override.

    Returns the parsed positive int, or ``None`` if unset/invalid. When
    set, ``_resolve_max_prompt_len`` returns this value without querying
    the engine — useful when the engine's ``/v1/models`` endpoint is
    broken (e.g., a misconfigured router) but the operator knows the
    real ``max_model_len``.

    The environment is re-read on every call, so changing the variable at
    runtime takes effect immediately. An unusable value (non-integer or
    non-positive) is ignored and a warning is logged only the first time
    that particular raw value is seen; later calls with the same value
    return ``None`` silently so a simple typo cannot flood the log.
    """
    raw = os.environ.get(_MAX_PROMPT_LEN_ENV_VAR)
    if not raw:
        # Unset or empty: no override requested, nothing to warn about.
        return None

    try:
        value = int(raw)
    except ValueError:
        value = None

    if value is not None and value > 0:
        return value

    # Unusable value: report it once per distinct raw string, then stay quiet.
    if raw not in _warned_max_prompt_len_values:
        _warned_max_prompt_len_values.add(raw)
        if value is None:
            _request_logger.warning(
                "%s is set to %r, which is not a valid integer; ignoring.",
                _MAX_PROMPT_LEN_ENV_VAR,
                raw,
            )
        else:
            _request_logger.warning(
                "%s=%d is not positive; ignoring.", _MAX_PROMPT_LEN_ENV_VAR, value
            )
    return None


async def _resolve_max_prompt_len(client: AsyncOpenAI, model: str) -> int | None:
"""Discover ``max_model_len`` from the engine via ``GET /v1/models``.

Expand All @@ -80,7 +113,16 @@ async def _resolve_max_prompt_len(client: AsyncOpenAI, model: str) -> int | None
Any exception during lookup (network error, non-JSON body, attribute
miss on a mock client in tests) is treated as "unknown cap": cached
``None`` so we don't retry on every call.

The ``RENDERERS_MAX_PROMPT_LEN`` env var overrides auto-discovery
entirely: when set to a positive integer, that value is returned and
no ``/v1/models`` request is issued. Use this when the engine's model
card is unreachable (e.g., a router whose ``/v1/models`` handler is
broken) but the operator knows the engine's real cap.
"""
override = _max_prompt_len_from_env()
if override is not None:
return override
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Invalid env var logs warning on every call

Medium Severity

_max_prompt_len_from_env() is called on every invocation of _resolve_max_prompt_len, which runs on every generate() call. When the env var is set to an invalid value (e.g. a typo like "4096O"), the warning is logged on every single call — potentially thousands of times per second — even though the auto-discovery fallback result is properly cached. The auto-discovery path deliberately caches failures to avoid "retry on every call," but the env var parse has no such caching, creating unbounded log spam for a simple operator typo.

Additional Locations (1)
Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 5f653df. Configure here.

key = (str(getattr(client, "base_url", "")), model)
if key in _max_prompt_len_cache:
return _max_prompt_len_cache[key]
Expand Down
69 changes: 69 additions & 0 deletions tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,3 +502,72 @@ def test_generate_caches_max_prompt_len_lookup_failure():
assert len(client.calls) == 1
assert result["prompt_ids"] == list(range(10))
assert _max_prompt_len_cache[("http://no-models:8000/v1", "test-model")] is None


def test_generate_uses_env_max_prompt_len_override(monkeypatch):
    """A valid ``RENDERERS_MAX_PROMPT_LEN`` replaces auto-discovery.

    The pre-flight length check must use the env value and never issue a
    ``/v1/models`` lookup, even though the fake engine below advertises a
    different ``max_model_len``.
    """
    from renderers.client import OverlongPromptError, _max_prompt_len_cache

    class _CountingModelsClient(_FakeClient):
        """Fake client that counts how often the model card is fetched."""

        def __init__(self):
            super().__init__()
            self.models_calls = 0
            self.base_url = "http://disco-host:8000/v1"

        async def get(self, path, *, cast_to):
            self.models_calls += 1
            card = {"id": "test-model", "max_model_len": 999}
            return {"object": "list", "data": [card]}

    # Start from a clean cache so an earlier test cannot supply the cap.
    monkeypatch.setenv("RENDERERS_MAX_PROMPT_LEN", "4")
    _max_prompt_len_cache.clear()

    client = _CountingModelsClient()
    user_turn = {"role": "user", "content": "hi"}

    with pytest.raises(OverlongPromptError) as raised:
        asyncio.run(
            generate(
                client=client,
                renderer=_LongRenderer(),
                messages=[user_turn],
                model="test-model",
            )
        )

    error = raised.value
    assert error.max_prompt_len == 4, "env override beats engine card"
    assert error.prompt_len == 10
    assert client.models_calls == 0, "env override must skip /v1/models query"
    assert client.calls == [], "request must not be dispatched on pre-flight fail"


@pytest.mark.parametrize("raw_value", ["not-an-int", "0", "-5"])
def test_generate_env_override_invalid_falls_back_to_auto_discovery(
    monkeypatch, raw_value
):
    """A non-integer or non-positive ``RENDERERS_MAX_PROMPT_LEN`` is ignored
    and auto-discovery proceeds normally.

    Parametrized so that both rejection paths are exercised: a value that
    does not parse as an integer, and values that parse but are zero or
    negative.
    """
    from renderers.client import OverlongPromptError, _max_prompt_len_cache

    class _ClientWithModels(_FakeClient):
        """Fake client whose model card advertises a cap of 4 tokens."""

        def __init__(self):
            super().__init__()
            self.base_url = "http://disco-host-2:8000/v1"
            self.models_calls = 0

        async def get(self, path, *, cast_to):
            self.models_calls += 1
            return {"object": "list", "data": [{"id": "test-model", "max_model_len": 4}]}

    # Clear the cache so every parametrized case performs its own lookup.
    _max_prompt_len_cache.clear()
    monkeypatch.setenv("RENDERERS_MAX_PROMPT_LEN", raw_value)

    client = _ClientWithModels()
    with pytest.raises(OverlongPromptError) as excinfo:
        asyncio.run(
            generate(
                client=client,
                renderer=_LongRenderer(),
                messages=[{"role": "user", "content": "hi"}],
                model="test-model",
            )
        )

    assert excinfo.value.max_prompt_len == 4, "auto-discovered cap should win"
    assert client.models_calls == 1, "invalid env override must not skip /v1/models"
Loading