33 changes: 22 additions & 11 deletions src/inference_endpoint/commands/benchmark/execute.py
@@ -179,27 +179,38 @@ def enable_streaming(self) -> bool:
 
 
 def _check_tokenizer_exists(model_name: str) -> bool:
-    """Check if a HuggingFace tokenizer exists for the model (API only, no download).
-
-    Returns True if the model repo exists and has tokenizer files, False otherwise.
-    This function is a probe — it never loads or downloads the tokenizer itself.
-    Downstream consumers that need tokenization (e.g. the MetricsAggregator
-    subprocess for ISL/OSL/TPOT, Harmony transforms for prompt preprocessing,
-    and any future plugin with its own tokenization need) each load their own
-    instance as required.
+    """Check if a tokenizer exists for the model (local dir or HF repo, no download).
+
+    Returns True if a tokenizer is available, False otherwise. This function is
+    a probe — it never loads or downloads the tokenizer itself. Downstream
+    consumers that need tokenization (e.g. the MetricsAggregator subprocess
+    for ISL/OSL/TPOT, Harmony transforms for prompt preprocessing, and any
+    future plugin with its own tokenization need) each load their own instance
+    as required.
+
+    ``model_name`` may be a local checkpoint directory (e.g. an NVFP4 snapshot
+    cached under ``/root/.cache/huggingface/hub/...``) or an HF repo ID. Local
+    directories are probed directly; otherwise we ask the HF Hub for the file
+    listing.
     """
     try:
-        info = model_info(model_name)
-        # Check for tokenizer files in the repo
-        siblings = {s.rfilename for s in (info.siblings or [])}
+        local_path = Path(model_name)
+        if local_path.is_dir():
+            siblings = {p.name for p in local_path.iterdir() if p.is_file()}
+        else:
+            info = model_info(model_name)
+            siblings = {s.rfilename for s in (info.siblings or [])}
 
         has_tokenizer = (
             "tokenizer_config.json" in siblings or "tokenizer.json" in siblings
         )
 
         if has_tokenizer:
             logger.info(f"Tokenizer available for model: {model_name}")
         else:
             logger.warning(f"Model {model_name} found but has no tokenizer files")
         return has_tokenizer
 
     except ImportError:
         # huggingface_hub not installed — fall back to assuming it works
         logger.info(
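For context on what the revised docstring means by consumers loading their own tokenizer instance, here is a minimal consumer-side sketch. It is illustrative only: it assumes the transformers package is available and uses a made-up local checkpoint path. AutoTokenizer.from_pretrained accepts both forms the probe handles, a local directory or an HF repo ID.

# Consumer-side sketch (illustrative only): the probe answers "is a tokenizer
# there?" without downloading anything; each consumer then loads its own
# instance, as the docstring describes.
from transformers import AutoTokenizer

model_name = "/models/nvfp4-snapshot"  # hypothetical local checkpoint dir

if _check_tokenizer_exists(model_name):
    # e.g. inside the MetricsAggregator subprocess, for ISL/OSL/TPOT counting
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    isl = len(tokenizer.encode("An example prompt"))
else:
    # No tokenizer files found; callers decide how to degrade gracefully.
    tokenizer = None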
9 changes: 8 additions & 1 deletion src/inference_endpoint/load_generator/session.py
@@ -191,8 +191,15 @@ def issue(self, sample_index: int) -> str | None:
         prompt_data: PromptData
         if isinstance(data, dict):
             token_ids = data.get("input_tokens") or data.get("token_ids")
+            # Multimodal datasets store ``prompt`` as a list of OpenAI content
+            # parts (e.g. [{"type": "text", ...}, {"type": "image_url", ...}])
+            # which the HTTP adapter handles directly. `PromptData.text` is only
+            # meaningful for ISL reporting on text-only prompts.
+            # Therefore, setting `text=None` for non-string prompts
+            # means that ISL reporting will be unavailable for multimodal samples.
+            prompt = data.get("prompt")
             prompt_data = PromptData(
-                text=data.get("prompt"),
+                text=prompt if isinstance(prompt, str) else None,
                 token_ids=tuple(token_ids) if token_ids is not None else None,
             )
         else:
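To make the new comment concrete, below is a hypothetical multimodal sample of the shape it describes; the dict keys mirror the snippet above, the values are invented.

# Illustrative only: a multimodal dataset row whose "prompt" is a list of
# OpenAI content parts rather than a plain string.
data = {
    "prompt": [
        {"type": "text", "text": "What is shown in this image?"},
        {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
    ],
    "input_tokens": None,
}

prompt = data.get("prompt")
# Mirrors the branch above: only plain-string prompts populate `text`, so ISL
# reporting is skipped for this sample while the content parts are passed
# through to the HTTP adapter untouched.
text = prompt if isinstance(prompt, str) else None
assert text is None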
36 changes: 26 additions & 10 deletions src/inference_endpoint/openai/types.py
@@ -108,21 +108,30 @@ class ChatCompletionRequest(
 class ChatCompletionResponseMessage(
     msgspec.Struct, frozen=True, kw_only=True, omit_defaults=True, gc=False
 ): # type: ignore[call-arg]
-    """Response message from OpenAI."""
+    """Response message from OpenAI.
+
+    ``content`` and ``refusal`` are nullable per the OpenAI spec and vLLM
+    routinely omits them (e.g. when the model returns no text or no refusal
+    block), so they default to ``None`` to allow successful decoding.
+    """
 
     role: str
-    content: str | None
-    refusal: str | None
+    content: str | None = None
+    refusal: str | None = None
 
 
 class ChatCompletionChoice(
     msgspec.Struct, frozen=True, kw_only=True, omit_defaults=True, gc=False
 ): # type: ignore[call-arg]
-    """A single choice in the completion response."""
+    """A single choice in the completion response.
+
+    ``finish_reason`` may be omitted in non-final SSE chunks; default to
+    ``None`` so decoding intermediate frames does not fail.
+    """
 
     index: int
     message: ChatCompletionResponseMessage
-    finish_reason: str | None
+    finish_reason: str | None = None
 
 
 class CompletionUsage(
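A quick illustration of why the new defaults matter: per the docstrings above, a choice from a non-final SSE frame can arrive without finish_reason, content, or refusal. A minimal sketch, assuming these structs are importable from inference_endpoint.openai.types (inferred from the file path):

# Sketch (import path is an assumption): a payload that omits finish_reason,
# content, and refusal still decodes because each of those fields now
# defaults to None.
import msgspec

from inference_endpoint.openai.types import ChatCompletionChoice

raw = b'{"index": 0, "message": {"role": "assistant"}}'
choice = msgspec.json.decode(raw, type=ChatCompletionChoice)
print(choice.message.content, choice.finish_reason)  # None None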
Expand All @@ -142,15 +151,22 @@ class ChatCompletionResponse(
omit_defaults=False,
gc=False,
): # type: ignore[call-arg]
"""OpenAI chat completion response."""
"""OpenAI chat completion response.

Most servers (vLLM, Dynamo, etc.) legitimately omit a number of these
fields — e.g. ``usage`` is only emitted on the final SSE chunk,
``system_fingerprint`` is rarely populated, and ``created``/``model``
can be missing in some response variants. All of these get safe
defaults so the decoder accepts whatever the server sends.
"""

id: str
object: str = "chat.completion"
created: int
model: str
created: int = 0
model: str = ""
choices: list[ChatCompletionChoice]
usage: CompletionUsage | None
system_fingerprint: str | None
usage: CompletionUsage | None = None
system_fingerprint: str | None = None


# ============================================================================
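At the response level, a stripped-down payload of the kind the docstring describes, with usage, system_fingerprint, created, and model all absent, decodes cleanly as well. Again a minimal sketch under the same import assumption:

# Sketch (same import assumption as above): a minimal vLLM-style response
# missing usage, system_fingerprint, created, and model.
import msgspec

from inference_endpoint.openai.types import ChatCompletionResponse

raw = b"""
{
  "id": "chatcmpl-123",
  "object": "chat.completion",
  "choices": [
    {"index": 0, "message": {"role": "assistant", "content": "Hello!"}}
  ]
}
"""

resp = msgspec.json.decode(raw, type=ChatCompletionResponse)
print(resp.created, repr(resp.model), resp.usage)  # 0 '' None
print(resp.choices[0].finish_reason)               # None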