diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py
index 73c3427f..34a9fd40 100644
--- a/src/inference_endpoint/commands/benchmark/execute.py
+++ b/src/inference_endpoint/commands/benchmark/execute.py
@@ -179,27 +179,38 @@ def enable_streaming(self) -> bool:
 
 
 def _check_tokenizer_exists(model_name: str) -> bool:
-    """Check if a HuggingFace tokenizer exists for the model (API only, no download).
-
-    Returns True if the model repo exists and has tokenizer files, False otherwise.
-    This function is a probe — it never loads or downloads the tokenizer itself.
-    Downstream consumers that need tokenization (e.g. the MetricsAggregator
-    subprocess for ISL/OSL/TPOT, Harmony transforms for prompt preprocessing,
-    and any future plugin with its own tokenization need) each load their own
-    instance as required.
+    """Check if a tokenizer exists for the model (local dir or HF repo, no download).
+
+    Returns True if a tokenizer is available, False otherwise. This function is
+    a probe — it never loads or downloads the tokenizer itself. Downstream
+    consumers that need tokenization (e.g. the MetricsAggregator subprocess
+    for ISL/OSL/TPOT, Harmony transforms for prompt preprocessing, and any
+    future plugin with its own tokenization need) each load their own instance
+    as required.
+
+    ``model_name`` may be a local checkpoint directory (e.g. an NVFP4 snapshot
+    cached under ``/root/.cache/huggingface/hub/...``) or an HF repo ID. Local
+    directories are probed directly; otherwise we ask the HF Hub for the file
+    listing.
     """
     try:
-        info = model_info(model_name)
-        # Check for tokenizer files in the repo
-        siblings = {s.rfilename for s in (info.siblings or [])}
+        local_path = Path(model_name)
+        if local_path.is_dir():
+            siblings = {p.name for p in local_path.iterdir() if p.is_file()}
+        else:
+            info = model_info(model_name)
+            siblings = {s.rfilename for s in (info.siblings or [])}
+
         has_tokenizer = (
             "tokenizer_config.json" in siblings or "tokenizer.json" in siblings
         )
+
         if has_tokenizer:
             logger.info(f"Tokenizer available for model: {model_name}")
         else:
             logger.warning(f"Model {model_name} found but has no tokenizer files")
         return has_tokenizer
+
     except ImportError:
         # huggingface_hub not installed — fall back to assuming it works
         logger.info(
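
A minimal, self-contained sketch of the probe path added above, for trying it
outside the codebase. It assumes only that `huggingface_hub` is installed; the
function name `probe_tokenizer` is illustrative, not the project's API:

    from pathlib import Path

    from huggingface_hub import model_info

    def probe_tokenizer(model_name: str) -> bool:
        local_path = Path(model_name)
        if local_path.is_dir():
            # Local checkpoint dir: read the file names straight off disk.
            filenames = {p.name for p in local_path.iterdir() if p.is_file()}
        else:
            # HF repo ID: ask the Hub for the repo's file listing (no download).
            filenames = {s.rfilename for s in (model_info(model_name).siblings or [])}
        return "tokenizer_config.json" in filenames or "tokenizer.json" in filenames

    # probe_tokenizer("gpt2") should return True (the repo ships tokenizer
    # files); an empty local directory returns False.
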
diff --git a/src/inference_endpoint/load_generator/session.py b/src/inference_endpoint/load_generator/session.py
index 1d0a63ec..3be480cb 100644
--- a/src/inference_endpoint/load_generator/session.py
+++ b/src/inference_endpoint/load_generator/session.py
@@ -191,8 +191,15 @@ def issue(self, sample_index: int) -> str | None:
         prompt_data: PromptData
         if isinstance(data, dict):
             token_ids = data.get("input_tokens") or data.get("token_ids")
+            # Multimodal datasets store ``prompt`` as a list of OpenAI content
+            # parts (e.g. [{"type": "text", ...}, {"type": "image_url", ...}]),
+            # which the HTTP adapter handles directly. ``PromptData.text`` is
+            # only meaningful for ISL reporting on text-only prompts, so we
+            # set ``text=None`` for non-string prompts; ISL reporting is thus
+            # unavailable for multimodal samples.
+            prompt = data.get("prompt")
             prompt_data = PromptData(
-                text=data.get("prompt"),
+                text=prompt if isinstance(prompt, str) else None,
                 token_ids=tuple(token_ids) if token_ids is not None else None,
             )
         else:
diff --git a/src/inference_endpoint/openai/types.py b/src/inference_endpoint/openai/types.py
index 2f697d76..875656fa 100644
--- a/src/inference_endpoint/openai/types.py
+++ b/src/inference_endpoint/openai/types.py
@@ -108,21 +108,30 @@ class ChatCompletionRequest(
 class ChatCompletionResponseMessage(
     msgspec.Struct, frozen=True, kw_only=True, omit_defaults=True, gc=False
 ):  # type: ignore[call-arg]
-    """Response message from OpenAI."""
+    """Response message from OpenAI.
+
+    ``content`` and ``refusal`` are nullable per the OpenAI spec and vLLM
+    routinely omits them (e.g. when the model returns no text or no refusal
+    block), so they default to ``None`` to allow successful decoding.
+    """
 
     role: str
-    content: str | None
-    refusal: str | None
+    content: str | None = None
+    refusal: str | None = None
 
 
 class ChatCompletionChoice(
     msgspec.Struct, frozen=True, kw_only=True, omit_defaults=True, gc=False
 ):  # type: ignore[call-arg]
-    """A single choice in the completion response."""
+    """A single choice in the completion response.
+
+    ``finish_reason`` may be omitted in non-final SSE chunks; default to
+    ``None`` so decoding intermediate frames does not fail.
+    """
 
     index: int
     message: ChatCompletionResponseMessage
-    finish_reason: str | None
+    finish_reason: str | None = None
 
 
 class CompletionUsage(
@@ -142,15 +151,22 @@ class ChatCompletionResponse(
     omit_defaults=False,
     gc=False,
 ):  # type: ignore[call-arg]
-    """OpenAI chat completion response."""
+    """OpenAI chat completion response.
+
+    Most servers (vLLM, Dynamo, etc.) legitimately omit a number of these
+    fields — e.g. ``usage`` is only emitted on the final SSE chunk,
+    ``system_fingerprint`` is rarely populated, and ``created``/``model``
+    can be missing in some response variants. All of these get safe
+    defaults so the decoder accepts whatever the server sends.
+    """
 
     id: str
     object: str = "chat.completion"
-    created: int
-    model: str
+    created: int = 0
+    model: str = ""
     choices: list[ChatCompletionChoice]
-    usage: CompletionUsage | None
-    system_fingerprint: str | None
+    usage: CompletionUsage | None = None
+    system_fingerprint: str | None = None
 
 
 # ============================================================================
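
A minimal reproduction of the decoding behaviour these defaults enable. The
structs below are stripped-down stand-ins for the real classes in types.py,
not the project's own definitions:

    import msgspec

    class Message(
        msgspec.Struct, frozen=True, kw_only=True, omit_defaults=True, gc=False
    ):
        role: str
        content: str | None = None
        refusal: str | None = None

    class Choice(
        msgspec.Struct, frozen=True, kw_only=True, omit_defaults=True, gc=False
    ):
        index: int
        message: Message
        finish_reason: str | None = None

    # A vLLM-style intermediate frame: no finish_reason, no refusal.
    payload = b'{"index": 0, "message": {"role": "assistant", "content": "hi"}}'
    choice = msgspec.json.decode(payload, type=Choice)
    assert choice.finish_reason is None  # omitted field falls back to its default
    assert choice.message.refusal is None
    # Without the "= None" defaults, msgspec.json.decode raises ValidationError
    # ("Object missing required field `refusal`") on the same payload.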