From b7835294818c37991ab80c606b8f0ba11e1b78d0 Mon Sep 17 00:00:00 2001 From: Bolin Sun Date: Tue, 12 May 2026 08:42:47 -0700 Subject: [PATCH 1/3] fix: support Q3VL multimodal benchmark on vLLM endpoints --- .../commands/benchmark/execute.py | 33 +++++++++++------ .../load_generator/session.py | 8 ++++- src/inference_endpoint/openai/types.py | 36 +++++++++++++------ 3 files changed, 55 insertions(+), 22 deletions(-) diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py index 73c3427f..34a9fd40 100644 --- a/src/inference_endpoint/commands/benchmark/execute.py +++ b/src/inference_endpoint/commands/benchmark/execute.py @@ -179,27 +179,38 @@ def enable_streaming(self) -> bool: def _check_tokenizer_exists(model_name: str) -> bool: - """Check if a HuggingFace tokenizer exists for the model (API only, no download). - - Returns True if the model repo exists and has tokenizer files, False otherwise. - This function is a probe — it never loads or downloads the tokenizer itself. - Downstream consumers that need tokenization (e.g. the MetricsAggregator - subprocess for ISL/OSL/TPOT, Harmony transforms for prompt preprocessing, - and any future plugin with its own tokenization need) each load their own - instance as required. + """Check if a tokenizer exists for the model (local dir or HF repo, no download). + + Returns True if a tokenizer is available, False otherwise. This function is + a probe — it never loads or downloads the tokenizer itself. Downstream + consumers that need tokenization (e.g. the MetricsAggregator subprocess + for ISL/OSL/TPOT, Harmony transforms for prompt preprocessing, and any + future plugin with its own tokenization need) each load their own instance + as required. + + ``model_name`` may be a local checkpoint directory (e.g. an NVFP4 snapshot + cached under ``/root/.cache/huggingface/hub/...``) or an HF repo ID. Local + directories are probed directly; otherwise we ask the HF Hub for the file + listing. """ try: - info = model_info(model_name) - # Check for tokenizer files in the repo - siblings = {s.rfilename for s in (info.siblings or [])} + local_path = Path(model_name) + if local_path.is_dir(): + siblings = {p.name for p in local_path.iterdir() if p.is_file()} + else: + info = model_info(model_name) + siblings = {s.rfilename for s in (info.siblings or [])} + has_tokenizer = ( "tokenizer_config.json" in siblings or "tokenizer.json" in siblings ) + if has_tokenizer: logger.info(f"Tokenizer available for model: {model_name}") else: logger.warning(f"Model {model_name} found but has no tokenizer files") return has_tokenizer + except ImportError: # huggingface_hub not installed — fall back to assuming it works logger.info( diff --git a/src/inference_endpoint/load_generator/session.py b/src/inference_endpoint/load_generator/session.py index 1d0a63ec..7b20ccdf 100644 --- a/src/inference_endpoint/load_generator/session.py +++ b/src/inference_endpoint/load_generator/session.py @@ -191,8 +191,14 @@ def issue(self, sample_index: int) -> str | None: prompt_data: PromptData if isinstance(data, dict): token_ids = data.get("input_tokens") or data.get("token_ids") + # Multimodal datasets store ``prompt`` as a list of OpenAI content + # parts (e.g. [{"type": "text", ...}, {"type": "image_url", ...}]) + # which the HTTP adapter handles directly. PromptData.text is only + # meaningful for ISL on text-only prompts, so coerce non-strings + # to None and rely on token_ids when the dataset pre-tokenizes. + prompt = data.get("prompt") prompt_data = PromptData( - text=data.get("prompt"), + text=prompt if isinstance(prompt, str) else None, token_ids=tuple(token_ids) if token_ids is not None else None, ) else: diff --git a/src/inference_endpoint/openai/types.py b/src/inference_endpoint/openai/types.py index 2f697d76..875656fa 100644 --- a/src/inference_endpoint/openai/types.py +++ b/src/inference_endpoint/openai/types.py @@ -108,21 +108,30 @@ class ChatCompletionRequest( class ChatCompletionResponseMessage( msgspec.Struct, frozen=True, kw_only=True, omit_defaults=True, gc=False ): # type: ignore[call-arg] - """Response message from OpenAI.""" + """Response message from OpenAI. + + ``content`` and ``refusal`` are nullable per the OpenAI spec and vLLM + routinely omits them (e.g. when the model returns no text or no refusal + block), so they default to ``None`` to allow successful decoding. + """ role: str - content: str | None - refusal: str | None + content: str | None = None + refusal: str | None = None class ChatCompletionChoice( msgspec.Struct, frozen=True, kw_only=True, omit_defaults=True, gc=False ): # type: ignore[call-arg] - """A single choice in the completion response.""" + """A single choice in the completion response. + + ``finish_reason`` may be omitted in non-final SSE chunks; default to + ``None`` so decoding intermediate frames does not fail. + """ index: int message: ChatCompletionResponseMessage - finish_reason: str | None + finish_reason: str | None = None class CompletionUsage( @@ -142,15 +151,22 @@ class ChatCompletionResponse( omit_defaults=False, gc=False, ): # type: ignore[call-arg] - """OpenAI chat completion response.""" + """OpenAI chat completion response. + + Most servers (vLLM, Dynamo, etc.) legitimately omit a number of these + fields — e.g. ``usage`` is only emitted on the final SSE chunk, + ``system_fingerprint`` is rarely populated, and ``created``/``model`` + can be missing in some response variants. All of these get safe + defaults so the decoder accepts whatever the server sends. + """ id: str object: str = "chat.completion" - created: int - model: str + created: int = 0 + model: str = "" choices: list[ChatCompletionChoice] - usage: CompletionUsage | None - system_fingerprint: str | None + usage: CompletionUsage | None = None + system_fingerprint: str | None = None # ============================================================================ From 8f1b4fe70c4fdd84053e1e1e834601cdecf0ce00 Mon Sep 17 00:00:00 2001 From: Bolin Sun Date: Tue, 12 May 2026 13:37:23 -0400 Subject: [PATCH 2/3] Update src/inference_endpoint/load_generator/session.py Co-authored-by: Shang Wang --- src/inference_endpoint/load_generator/session.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/inference_endpoint/load_generator/session.py b/src/inference_endpoint/load_generator/session.py index 7b20ccdf..d03bc67c 100644 --- a/src/inference_endpoint/load_generator/session.py +++ b/src/inference_endpoint/load_generator/session.py @@ -193,9 +193,10 @@ def issue(self, sample_index: int) -> str | None: token_ids = data.get("input_tokens") or data.get("token_ids") # Multimodal datasets store ``prompt`` as a list of OpenAI content # parts (e.g. [{"type": "text", ...}, {"type": "image_url", ...}]) - # which the HTTP adapter handles directly. PromptData.text is only - # meaningful for ISL on text-only prompts, so coerce non-strings - # to None and rely on token_ids when the dataset pre-tokenizes. + # which the HTTP adapter handles directly. `PromptData.text` is only + # meaningful for ISL reporting on text-only prompts. + # Therefore, setting `text=None` for non-string prompts + # means that ISL reporting will be unavailable for multimodal samples. prompt = data.get("prompt") prompt_data = PromptData( text=prompt if isinstance(prompt, str) else None, From 21965e76d5badc3662073082f3eaf20dca7d9253 Mon Sep 17 00:00:00 2001 From: Bolin Sun Date: Tue, 12 May 2026 16:17:11 -0400 Subject: [PATCH 3/3] precommit --- src/inference_endpoint/load_generator/session.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/inference_endpoint/load_generator/session.py b/src/inference_endpoint/load_generator/session.py index d03bc67c..3be480cb 100644 --- a/src/inference_endpoint/load_generator/session.py +++ b/src/inference_endpoint/load_generator/session.py @@ -195,7 +195,7 @@ def issue(self, sample_index: int) -> str | None: # parts (e.g. [{"type": "text", ...}, {"type": "image_url", ...}]) # which the HTTP adapter handles directly. `PromptData.text` is only # meaningful for ISL reporting on text-only prompts. - # Therefore, setting `text=None` for non-string prompts + # Therefore, setting `text=None` for non-string prompts # means that ISL reporting will be unavailable for multimodal samples. prompt = data.get("prompt") prompt_data = PromptData(