From b7835294818c37991ab80c606b8f0ba11e1b78d0 Mon Sep 17 00:00:00 2001
From: Bolin Sun <bolins@nvidia.com>
Date: Tue, 12 May 2026 08:42:47 -0700
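Subject: [PATCH 1/3] fix: support Q3VL multimodal benchmark on vLLM endpoints

- execute.py: let _check_tokenizer_exists probe a local checkpoint
  directory directly instead of always querying the HF Hub.
- session.py: pass PromptData.text=None when ``prompt`` is not a string
  (multimodal content-part lists).
- openai/types.py: give optional chat-completion response fields defaults
  so responses that omit them still decode.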

---
 .../commands/benchmark/execute.py             | 33 +++++++++++------
 .../load_generator/session.py                 |  8 ++++-
 src/inference_endpoint/openai/types.py        | 36 +++++++++++++------
 3 files changed, 55 insertions(+), 22 deletions(-)

diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py
index 73c3427f..34a9fd40 100644
--- a/src/inference_endpoint/commands/benchmark/execute.py
+++ b/src/inference_endpoint/commands/benchmark/execute.py
@@ -179,27 +179,38 @@ def enable_streaming(self) -> bool:
 
 
 def _check_tokenizer_exists(model_name: str) -> bool:
-    """Check if a HuggingFace tokenizer exists for the model (API only, no download).
-
-    Returns True if the model repo exists and has tokenizer files, False otherwise.
-    This function is a probe — it never loads or downloads the tokenizer itself.
-    Downstream consumers that need tokenization (e.g. the MetricsAggregator
-    subprocess for ISL/OSL/TPOT, Harmony transforms for prompt preprocessing,
-    and any future plugin with its own tokenization need) each load their own
-    instance as required.
+    """Check if a tokenizer exists for the model (local dir or HF repo, no download).
+
+    Returns True if a tokenizer is available, False otherwise. This function is
+    a probe — it never loads or downloads the tokenizer itself. Downstream
+    consumers that need tokenization (e.g. the MetricsAggregator subprocess
+    for ISL/OSL/TPOT, Harmony transforms for prompt preprocessing, and any
+    future plugin with its own tokenization need) each load their own instance
+    as required.
+
+    ``model_name`` may be a local checkpoint directory (e.g. an NVFP4 snapshot
+    cached under ``/root/.cache/huggingface/hub/...``) or an HF repo ID. Local
+    directories are probed directly; otherwise we ask the HF Hub for the file
+    listing.
     """
     try:
-        info = model_info(model_name)
-        # Check for tokenizer files in the repo
-        siblings = {s.rfilename for s in (info.siblings or [])}
+        local_path = Path(model_name)
+        if local_path.is_dir():
+            siblings = {p.name for p in local_path.iterdir() if p.is_file()}
+        else:
+            info = model_info(model_name)
+            siblings = {s.rfilename for s in (info.siblings or [])}
+
         has_tokenizer = (
             "tokenizer_config.json" in siblings or "tokenizer.json" in siblings
         )
+
         if has_tokenizer:
             logger.info(f"Tokenizer available for model: {model_name}")
         else:
             logger.warning(f"Model {model_name} found but has no tokenizer files")
         return has_tokenizer
+
     except ImportError:
         # huggingface_hub not installed — fall back to assuming it works
         logger.info(
diff --git a/src/inference_endpoint/load_generator/session.py b/src/inference_endpoint/load_generator/session.py
index 1d0a63ec..7b20ccdf 100644
--- a/src/inference_endpoint/load_generator/session.py
+++ b/src/inference_endpoint/load_generator/session.py
@@ -191,8 +191,14 @@ def issue(self, sample_index: int) -> str | None:
         prompt_data: PromptData
         if isinstance(data, dict):
             token_ids = data.get("input_tokens") or data.get("token_ids")
+            # Multimodal datasets store ``prompt`` as a list of OpenAI content
+            # parts (e.g. [{"type": "text", ...}, {"type": "image_url", ...}])
+            # which the HTTP adapter handles directly. PromptData.text is only
+            # meaningful for ISL on text-only prompts, so coerce non-strings
+            # to None and rely on token_ids when the dataset pre-tokenizes.
+            prompt = data.get("prompt")
             prompt_data = PromptData(
-                text=data.get("prompt"),
+                text=prompt if isinstance(prompt, str) else None,
                 token_ids=tuple(token_ids) if token_ids is not None else None,
             )
         else:
diff --git a/src/inference_endpoint/openai/types.py b/src/inference_endpoint/openai/types.py
index 2f697d76..875656fa 100644
--- a/src/inference_endpoint/openai/types.py
+++ b/src/inference_endpoint/openai/types.py
@@ -108,21 +108,30 @@ class ChatCompletionRequest(
 class ChatCompletionResponseMessage(
     msgspec.Struct, frozen=True, kw_only=True, omit_defaults=True, gc=False
 ):  # type: ignore[call-arg]
-    """Response message from OpenAI."""
+    """Response message from OpenAI.
+
+    ``content`` and ``refusal`` are nullable per the OpenAI spec, and vLLM
+    routinely omits them (e.g. when the model returns no text or no refusal
+    block), so they default to ``None`` to allow successful decoding.
+    """
 
     role: str
-    content: str | None
-    refusal: str | None
+    content: str | None = None
+    refusal: str | None = None
 
 
 class ChatCompletionChoice(
     msgspec.Struct, frozen=True, kw_only=True, omit_defaults=True, gc=False
 ):  # type: ignore[call-arg]
-    """A single choice in the completion response."""
+    """A single choice in the completion response.
+
+    ``finish_reason`` is nullable and may be omitted by some servers; default
+    to ``None`` so decoding does not fail when the field is absent.
+    """
 
     index: int
     message: ChatCompletionResponseMessage
-    finish_reason: str | None
+    finish_reason: str | None = None
 
 
 class CompletionUsage(
@@ -142,15 +151,22 @@ class ChatCompletionResponse(
     omit_defaults=False,
     gc=False,
 ):  # type: ignore[call-arg]
-    """OpenAI chat completion response."""
+    """OpenAI chat completion response.
+
+    Most servers (vLLM, Dynamo, etc.) legitimately omit a number of these
+    fields — e.g. ``usage`` is only emitted on the final SSE chunk,
+    ``system_fingerprint`` is rarely populated, and ``created``/``model``
+    can be missing in some response variants. Those fields get defaults
+    (``0``, an empty string, or ``None``); ``id`` and ``choices`` stay required.
+    """
 
     id: str
     object: str = "chat.completion"
-    created: int
-    model: str
+    created: int = 0
+    model: str = ""
     choices: list[ChatCompletionChoice]
-    usage: CompletionUsage | None
-    system_fingerprint: str | None
+    usage: CompletionUsage | None = None
+    system_fingerprint: str | None = None
 
 
 # ============================================================================

From 8f1b4fe70c4fdd84053e1e1e834601cdecf0ce00 Mon Sep 17 00:00:00 2001
From: Bolin Sun <bolin.sun@centml.ai>
Date: Tue, 12 May 2026 13:37:23 -0400
Subject: [PATCH 2/3] docs: clarify multimodal ISL comment in load_generator/session.py

Co-authored-by: Shang Wang <shangw@nvidia.com>
---
 src/inference_endpoint/load_generator/session.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/inference_endpoint/load_generator/session.py b/src/inference_endpoint/load_generator/session.py
index 7b20ccdf..d03bc67c 100644
--- a/src/inference_endpoint/load_generator/session.py
+++ b/src/inference_endpoint/load_generator/session.py
@@ -193,9 +193,10 @@ def issue(self, sample_index: int) -> str | None:
             token_ids = data.get("input_tokens") or data.get("token_ids")
             # Multimodal datasets store ``prompt`` as a list of OpenAI content
             # parts (e.g. [{"type": "text", ...}, {"type": "image_url", ...}])
-            # which the HTTP adapter handles directly. PromptData.text is only
-            # meaningful for ISL on text-only prompts, so coerce non-strings
-            # to None and rely on token_ids when the dataset pre-tokenizes.
+            # which the HTTP adapter handles directly. `PromptData.text` is only
+            # meaningful for ISL reporting on text-only prompts.
+            # Therefore, setting `text=None` for non-string prompts 
+            # means that ISL reporting will be unavailable for multimodal samples.
             prompt = data.get("prompt")
             prompt_data = PromptData(
                 text=prompt if isinstance(prompt, str) else None,

From 21965e76d5badc3662073082f3eaf20dca7d9253 Mon Sep 17 00:00:00 2001
From: Bolin Sun <bolins@nvidia.com>
Date: Tue, 12 May 2026 16:17:11 -0400
Subject: [PATCH 3/3] style: strip trailing whitespace flagged by pre-commit

---
 src/inference_endpoint/load_generator/session.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/inference_endpoint/load_generator/session.py b/src/inference_endpoint/load_generator/session.py
index d03bc67c..3be480cb 100644
--- a/src/inference_endpoint/load_generator/session.py
+++ b/src/inference_endpoint/load_generator/session.py
@@ -195,7 +195,7 @@ def issue(self, sample_index: int) -> str | None:
             # parts (e.g. [{"type": "text", ...}, {"type": "image_url", ...}])
             # which the HTTP adapter handles directly. `PromptData.text` is only
             # meaningful for ISL reporting on text-only prompts.
-            # Therefore, setting `text=None` for non-string prompts 
+            # Therefore, setting `text=None` for non-string prompts
             # means that ISL reporting will be unavailable for multimodal samples.
             prompt = data.get("prompt")
             prompt_data = PromptData(