19 changes: 10 additions & 9 deletions docs/dataset_manager/DESIGN.md
@@ -103,15 +103,16 @@ Transforms are composed in order; each receives the output of the previous.
Registered in `dataset.py` under `Dataset.PREDEFINED`. Referenced by name in rulesets and YAML
configs. Each predefined dataset ships with default transforms for supported model families.

-| Name                        | Source        | Notes                                      |
-| --------------------------- | ------------- | ------------------------------------------ |
-| `aime25`                    | AIME 2025     | Math reasoning                             |
-| `gpqa`                      | GPQA Diamond  | Science QA                                 |
-| `cnndailymail`              | CNN/DailyMail | Summarization                              |
-| `open_orca`                 | OpenOrca      | General instruction                        |
-| `livecodebench`             | LiveCodeBench | Code generation; requires additional setup |
-| `shopify_product_catalogue` | Shopify       | E-commerce Q&A (q3vl)                      |
-| `random`                    | Synthetic     | Generated prompts for throughput testing   |
+| Name                           | Source        | Notes                                                 |
+| ------------------------------ | ------------- | ----------------------------------------------------- |
+| `aime25`                       | AIME 2025     | Math reasoning                                        |
+| `gpqa`                         | GPQA Diamond  | Science QA                                            |
+| `cnndailymail`                 | CNN/DailyMail | Summarization                                         |
+| `open_orca`                    | OpenOrca      | General instruction                                   |
+| `livecodebench`                | LiveCodeBench | Code generation; requires additional setup            |
+| `shopify_product_catalogue`    | Shopify       | E-commerce Q&A (q3vl)                                 |
+| `shopify_product_catalogue_8k` | Shopify       | 8k sample variant of Shopify product catalogue (q3vl) |
+| `random`                       | Synthetic     | Generated prompts for throughput testing              |
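The `dataset_id` keyword argument visible in the class definitions later in this diff suggests registration into `Dataset.PREDEFINED` happens via `__init_subclass__`. A minimal sketch of that mechanism, written under that assumption (the real `Dataset` base in `dataset.py` may differ in detail):

```python
from __future__ import annotations


class Dataset:
    """Toy stand-in for the real Dataset base class in dataset.py."""

    PREDEFINED: dict[str, type] = {}

    def __init_subclass__(cls, dataset_id: str | None = None, **kwargs) -> None:
        super().__init_subclass__(**kwargs)
        if dataset_id is not None:
            # Record the id on the class and register it for lookup by name.
            cls.DATASET_ID = dataset_id
            Dataset.PREDEFINED[dataset_id] = cls


class ShopifyProductCatalogue8k(Dataset, dataset_id="shopify_product_catalogue_8k"):
    pass


# Rulesets and YAML configs can then reference the class purely by name:
print(Dataset.PREDEFINED["shopify_product_catalogue_8k"].DATASET_ID)
# shopify_product_catalogue_8k
```

Abstract helpers (like the `BaseShopifyProductCatalogue` added in this PR) can simply omit `dataset_id`, so they never enter the registry.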

## Preset System

@@ -0,0 +1,49 @@
# Interactive Benchmark - Qwen3-VL-235B-A22B on Shopify Product Catalogue 8k
# Use this for interactive testing and development with the smaller 8k dataset
name: "interactive-qwen3-vl-235b-a22b-shopify-8k-benchmark"
version: "1.0"
type: "online"
timeout: 1800 # 30 minutes for quick interactive runs

model_params:
name: "Qwen/Qwen3-VL-235B-A22B-Instruct"
top_p: 1
max_new_tokens: 150
streaming: "on" # Required for TTFT/TPOT measurement

datasets:
- name: shopify_product_catalogue_8k::q3vl
type: "performance"
- name: shopify_product_catalogue_8k::q3vl
type: "accuracy"
accuracy_config:
eval_method: "shopify_category_f1"
ground_truth: "ground_truth_category"
extractor: "identity_extractor"
num_repeats: 1

settings:
runtime:
min_duration_ms: 600000 # 10 minutes
scheduler_random_seed: 42
dataloader_random_seed: 42

load_pattern:
type: "poisson"
target_qps: 5

client:
num_workers: 5
transport:
type: zmq
recv_buffer_size: 16777216
send_buffer_size: 16777216
max_connections: 1000
worker_initialization_timeout: 120

endpoint_config:
endpoints:
- "http://localhost:8000"
api_key: null

report_dir: results/qwen3_vl_235b_a22b_shopify_8k_benchmark_interactive/
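As a quick sanity check on the interactive config above: assuming `min_duration_ms` bounds the issuing window and the Poisson process holds its target rate in expectation, the expected request count fits comfortably inside the 8k dataset.

```python
# Back-of-the-envelope check on the interactive config; the request count
# equals target_qps * duration only in expectation for Poisson arrivals.
min_duration_ms = 600_000  # settings.runtime.min_duration_ms (10 minutes)
target_qps = 5             # load_pattern.target_qps
dataset_size = 8_000       # shopify_product_catalogue_8k sample count

expected_requests = int(target_qps * (min_duration_ms / 1_000))
print(expected_requests)  # 3000 -- fits in the 8k dataset without sample reuse
```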
@@ -7,7 +7,6 @@ timeout: 14400 # Perf + acc run takes over 3 hours, consider limit n_samples_to_

model_params:
name: "Qwen/Qwen3-VL-235B-A22B-Instruct"
-  temperature: 0
top_p: 1
max_new_tokens: 150

@@ -25,15 +24,15 @@ datasets:
settings:
runtime:
min_duration_ms: 600000 # 10 minutes
-    n_samples_to_issue: 100 # Limit queries for testing (remove or increase for full run)
+    n_samples_to_issue: 48289 # Full dataset size for this benchmark; lower this (e.g., 1000 or 100) for quick tests, or remove for an unconstrained full run
scheduler_random_seed: 42 # For Poisson/distribution sampling
dataloader_random_seed: 42 # For dataset shuffling

load_pattern:
type: "max_throughput"

client:
-  num_workers: 2
+  num_workers: 5
transport:
type: zmq
recv_buffer_size: 16777216
@@ -6,14 +6,20 @@ timeout: 14400

model_params:
name: "Qwen/Qwen3-VL-235B-A22B-Instruct"
-  temperature: 0
top_p: 1
max_new_tokens: 150
streaming: "on" # Required for TTFT/TPOT measurement

datasets:
> **Collaborator:** Officially, were we calling this scenario "online" or "server"? (I vaguely recall that it's called "server".)
>
> If so, I would suggest renaming this file to `server_qwen3_...` for the sake of consistency.

> **Collaborator (author):** I was thinking about this naming too. Regular MLPerf calls it "server", but I have seen other endpoint examples calling it "online". Let me change it to "server", since we are submitting to regular MLPerf anyway.
- name: shopify_product_catalogue::q3vl
type: "performance"
- name: shopify_product_catalogue::q3vl
type: "accuracy"
accuracy_config:
eval_method: "shopify_category_f1"
ground_truth: "ground_truth_category"
extractor: "identity_extractor"
num_repeats: 1

settings:
runtime:
@@ -24,10 +30,10 @@ settings:

load_pattern:
type: "poisson"
-  target_qps: 6.5
+  target_qps: 6

client:
-  num_workers: 2
+  num_workers: 5
transport:
type: zmq
recv_buffer_size: 16777216
6 changes: 5 additions & 1 deletion src/inference_endpoint/dataset_manager/__init__.py
@@ -27,7 +27,10 @@
from .predefined.livecodebench import LiveCodeBench
from .predefined.open_orca import OpenOrca
from .predefined.random import RandomDataset
-from .predefined.shopify_product_catalogue import ShopifyProductCatalogue
+from .predefined.shopify_product_catalogue import (
+    ShopifyProductCatalogue,
+    ShopifyProductCatalogue8k,
+)
from .transforms import (
AddStaticColumns,
ColumnFilter,
@@ -58,4 +61,5 @@
"CNNDailyMail",
"RandomDataset",
"ShopifyProductCatalogue",
"ShopifyProductCatalogue8k",
]
@@ -13,14 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-"""Shopify product catalogue dataset for multimodal product taxonomy classification."""
+"""Shopify product catalogue datasets for multimodal product taxonomy classification."""

import base64
import json
from abc import ABC
from io import BytesIO
from logging import getLogger
from pathlib import Path
-from typing import Any
+from typing import Any, ClassVar

import pandas as pd
from datasets import load_dataset
@@ -66,16 +67,12 @@ def _process_sample_to_row(sample: dict[str, Any]) -> dict[str, Any]:
}


-class ShopifyProductCatalogue(
-    Dataset,
-    dataset_id="shopify_product_catalogue",
-):
-    """Shopify product catalogue: multimodal benchmark for product taxonomy classification.
+class BaseShopifyProductCatalogue(Dataset, ABC):
+    """Abstract base class for Shopify product catalogue datasets.

-    Reference: https://huggingface.co/datasets/Shopify/product-catalogue
-
-    Each sample includes product image, title, description, and candidate categories.
-    Compatible with OpenAI multimodal adapter (prompt/system with vision content).
+    Contains shared logic for downloading and processing product catalogue
+    data from HuggingFace. Subclasses only need to define REPO_ID,
+    dataset_id, and optionally DEFAULT_SPLITS.
     """

COLUMN_NAMES = [
@@ -91,7 +88,11 @@ class ShopifyProductCatalogue(

PRESETS = presets

-    REPO_ID = "Shopify/product-catalogue"
+    REPO_ID: str
+    """HuggingFace dataset repository ID. Must be set by subclass."""
+
+    DEFAULT_SPLITS: ClassVar[list[str]] = ["train", "test"]
+    """Default splits to load when split is not specified. Override in subclass if needed."""

@classmethod
def generate(
@@ -110,7 +111,7 @@ def generate(

Args:
datasets_dir: Directory to save transformed dataset.
-        split: Splits to load (e.g. ["train", "test"]). Defaults to ["train", "test"].
+        split: Splits to load (e.g. ["train", "test"]). Defaults to DEFAULT_SPLITS.
force: Regenerate even if file exists.
token: HuggingFace token for gated datasets.
revision: Dataset revision/branch. Defaults to "main".
@@ -120,7 +121,7 @@
product_image_format, potential_product_categories.
"""
if split is None:
-            split = ["train", "test"]
+            split = cls.DEFAULT_SPLITS
split_key = "+".join(split)
filename = f"{cls.DATASET_ID}_{split_key}.parquet"
dst_path = datasets_dir / cls.DATASET_ID / split_key / filename
@@ -138,9 +139,7 @@
load_options["revision"] = revision

ds = load_dataset(cls.REPO_ID, split=split_key, **load_options)
-        logger.info(
-            f"Loaded {len(ds)} samples from Shopify product catalogue ({split_key})"
-        )
+        logger.info(f"Loaded {len(ds)} samples from {cls.REPO_ID} ({split_key})")

all_rows: list[dict[str, Any]] = []
for i in tqdm(
@@ -158,4 +157,42 @@
return df


-__all__ = ["ProductMetadata", "ShopifyProductCatalogue"]
class ShopifyProductCatalogue(
BaseShopifyProductCatalogue,
dataset_id="shopify_product_catalogue",
):
"""Shopify product catalogue: multimodal benchmark for product taxonomy classification.

Reference: https://huggingface.co/datasets/Shopify/product-catalogue

Each sample includes product image, title, description, and candidate categories.
Compatible with OpenAI multimodal adapter (prompt/system with vision content).
"""

REPO_ID = "Shopify/product-catalogue"


class ShopifyProductCatalogue8k(
BaseShopifyProductCatalogue,
dataset_id="shopify_product_catalogue_8k",
):
"""Shopify product catalogue 8k: 8,000 sample variant for product taxonomy classification.

Reference: https://huggingface.co/datasets/nvidia/Shopify-product-catalogue-8k

Each sample includes product image, title, description, and candidate categories.
Compatible with OpenAI multimodal adapter (prompt/system with vision content).

Note: This dataset only has a train split (no test split).
"""

REPO_ID = "nvidia/Shopify-product-catalogue-8k"

DEFAULT_SPLITS = ["train"]


__all__ = [
"ProductMetadata",
"ShopifyProductCatalogue",
"ShopifyProductCatalogue8k",
]
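For reference, the parquet cache location that `generate()` computes can be reproduced standalone. This is an illustrative helper, not part of the module; `PurePosixPath` stands in for the real `Path` argument so the result is platform-independent.

```python
from pathlib import PurePosixPath


def parquet_path(
    datasets_dir: PurePosixPath, dataset_id: str, splits: list[str]
) -> PurePosixPath:
    # Mirrors the path construction in generate(): the requested splits are
    # joined into a split_key, which names both the subdirectory and the file.
    split_key = "+".join(splits)
    filename = f"{dataset_id}_{split_key}.parquet"
    return datasets_dir / dataset_id / split_key / filename


# The 8k variant defaults to DEFAULT_SPLITS = ["train"]:
print(parquet_path(PurePosixPath("/data"), "shopify_product_catalogue_8k", ["train"]))
# /data/shopify_product_catalogue_8k/train/shopify_product_catalogue_8k_train.parquet
```

Because the 8k dataset has no test split, passing the base class default of `["train", "test"]` would produce a `train+test` split key that `load_dataset` cannot satisfy, which is why the subclass overrides `DEFAULT_SPLITS`.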