Merged
Changes from all commits (33 commits)
5c26459  feat(metrics): add registry, samplers, and snapshot wire schema (nv-alicheng, May 5, 2026)
186fc18  feat(metrics): add MetricsPublisher and MetricsSnapshotSubscriber (nv-alicheng, May 5, 2026)
6ca9b65  refactor(metrics): wire pub/sub into aggregator, remove KVStore + mmap (nv-alicheng, May 5, 2026)
76e51f6  refactor(load_generator): emit ERROR before COMPLETE for failed queries (nv-alicheng, May 5, 2026)
953c3b7  docs(agents): update AGENTS.md for metrics pub/sub refactor (nv-alicheng, May 5, 2026)
2e804c0  fix(metrics): address P0 review-council findings (nv-alicheng, May 6, 2026)
236928b  test(metrics): rewrite skipped suites on registry/snapshot fixtures (nv-alicheng, May 5, 2026)
22f5912  test(templates): unblock TestTemplateIntegration without HF_TOKEN (nv-alicheng, May 5, 2026)
e13bbee  docs(agents): add reference-hygiene rules + clean up violations (nv-alicheng, May 6, 2026)
11860df  refactor(metrics_table): encapsulate in-flight task access (nv-alicheng, May 11, 2026)
0bdd391  refactor(metrics): rename refresh_hz → publish_interval_s (seconds) (nv-alicheng, May 11, 2026)
d4e2655  fix(report): numerically stable variance for integer-aggregate series (nv-alicheng, May 11, 2026)
37ff68f  fix(metrics): drain / shutdown correctness for cancel + SIGTERM (nv-alicheng, May 11, 2026)
3ef6936  fix(metrics): pre-check HDR `high >= 2*low` before HdrHistogram ctor (nv-alicheng, May 11, 2026)
d5cfee2  fix(metrics): guard publisher.start against double-STARTED (nv-alicheng, May 11, 2026)
c8d3860  fix(metrics): structured logging around aggregator subprocess crash (nv-alicheng, May 11, 2026)
ac4c3dd  docs(session): document publish-order invariant for ERROR-before-COMP… (nv-alicheng, May 11, 2026)
8169ec6  feat(metrics): --drain-timeout flag, default bumped to 60s (nv-alicheng, May 11, 2026)
5d68890  feat(metrics): add SessionState.INITIALIZE for pre-START phase (nv-alicheng, May 11, 2026)
2a15269  test(registry): cover SeriesSampler internal boundaries (nv-alicheng, May 11, 2026)
bb432ea  feat(metrics): add SessionState.INTERRUPTED for signal-handler shutdown (nv-alicheng, May 12, 2026)
c432fda  refactor(metrics): final snapshot = JSON file; pub/sub = TUI signal (nv-alicheng, May 12, 2026)
a97ac4a  fix(metrics): duplicate STARTED → log error + preserve session_start (nv-alicheng, May 12, 2026)
b26ce9f  fix(metrics): scrub NaN/Inf to None in snapshot_to_dict; allow_nan=False (nv-alicheng, May 12, 2026)
8fc4eed  fix(metrics): SIGTERM refresh duration; SIGINT no-op (nv-alicheng, May 12, 2026)
5c2e866  test(execute): cover _load_final_snapshot_from_disk + Report fallback (nv-alicheng, May 12, 2026)
d247158  fix(metrics): doc cleanup + contract enforcement (p50, output_dir) (nv-alicheng, May 12, 2026)
f73f767  fix(metrics): robustness — KeyboardInterrupt, finalize, .tmp cleanup (nv-alicheng, May 12, 2026)
8f14e9e  chore(tests): rename tests/datasets/ → tests/assets/datasets/ (nv-alicheng, May 12, 2026)
924be7a  test(integration): use local char-tokenizer fixture, drop HF Hub depe… (nv-alicheng, May 12, 2026)
a95ec22  test(integration): bump worker_initialization_timeout to 120s in CI (nv-alicheng, May 12, 2026)
8fe86bf  fix(metrics): drain pending count, SIGTERM task GC, NaN display (nv-alicheng, May 13, 2026)
8c6840c  style(tests): hoist lazy imports to top of test_report_builder (nv-alicheng, May 13, 2026)
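Commit d4e2655 above switches the report builder to a numerically stable variance for integer-aggregate series. The PR's exact implementation is not shown in this excerpt; a standard choice for this problem is Welford's single-pass algorithm, sketched below under that assumption (function name is illustrative):

```python
import math


def welford_variance(values):
    """Single-pass population variance via Welford's algorithm.

    Avoids the catastrophic cancellation of the naive
    sum(x^2) - sum(x)^2 / n formula, which loses precision on
    large integer aggregates.
    """
    count = 0
    mean = 0.0
    m2 = 0.0  # running sum of squared deviations from the mean
    for x in values:
        count += 1
        delta = x - mean
        mean += delta / count
        m2 += delta * (x - mean)
    return m2 / count if count else math.nan
```

The update keeps `mean` and `m2` centered as it goes, so intermediate values stay on the order of the data rather than the order of `sum(x^2)`.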
145 changes: 117 additions & 28 deletions AGENTS.md

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions README.md
@@ -40,13 +40,13 @@ uv run inference-endpoint probe \
uv run inference-endpoint benchmark offline \
--endpoints http://your-endpoint:8000 \
--model Qwen/Qwen3-8B \
- --dataset tests/datasets/dummy_1k.jsonl
+ --dataset tests/assets/datasets/dummy_1k.jsonl

# Run online benchmark (sustained QPS)
uv run inference-endpoint benchmark online \
--endpoints http://your-endpoint:8000 \
--model Qwen/Qwen3-8B \
- --dataset tests/datasets/dummy_1k.jsonl \
+ --dataset tests/assets/datasets/dummy_1k.jsonl \
--load-pattern poisson \
--target-qps 100
```
@@ -59,7 +59,7 @@ uv run python -m inference_endpoint.testing.echo_server --port 8765 &
uv run inference-endpoint benchmark offline \
--endpoints http://localhost:8765 \
--model test-model \
- --dataset tests/datasets/dummy_1k.jsonl
+ --dataset tests/assets/datasets/dummy_1k.jsonl
pkill -f echo_server
```

10 changes: 5 additions & 5 deletions docs/CLI_QUICK_REFERENCE.md
@@ -13,13 +13,13 @@ Command-line reference for all `inference-endpoint` subcommands, flags, load pat
inference-endpoint benchmark offline \
--endpoints URL \
--model Qwen/Qwen3-8B \
- --dataset tests/datasets/dummy_1k.jsonl
+ --dataset tests/assets/datasets/dummy_1k.jsonl

# Online (sustained QPS - requires --load-pattern, --target-qps)
inference-endpoint benchmark online \
--endpoints URL \
--model Qwen/Qwen3-8B \
- --dataset tests/datasets/dummy_1k.jsonl \
+ --dataset tests/assets/datasets/dummy_1k.jsonl \
--load-pattern poisson \
--target-qps 100

@@ -35,14 +35,14 @@ inference-endpoint benchmark offline \
inference-endpoint benchmark offline \
--endpoints URL \
--model Qwen/Qwen3-8B \
- --dataset tests/datasets/dummy_1k.jsonl \
+ --dataset tests/assets/datasets/dummy_1k.jsonl \
--report-dir my_benchmark_report

# YAML-based
inference-endpoint benchmark from-config --config test.yaml
```

- **Default Test Dataset:** Use `tests/datasets/dummy_1k.jsonl` (1000 samples) for local testing.
+ **Default Test Dataset:** Use `tests/assets/datasets/dummy_1k.jsonl` (1000 samples) for local testing.

**Dataset format:** `--dataset [perf|acc:]<path>[,key=value...]` — TOML-style dotted paths. Type prefix is optional (defaults to `perf`):
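The `[perf|acc:]<path>[,key=value...]` spec above can be parsed mechanically. The sketch below is a hypothetical illustration of that grammar only — the CLI's actual parser, its function names, and its edge-case handling may differ:

```python
def parse_dataset_spec(spec: str):
    """Illustrative parser for '[perf|acc:]<path>[,key=value...]'.

    Returns (dataset_type, path, options) where options holds the
    TOML-style dotted keys, e.g. {'parser.prompt': 'question'}.
    """
    dataset_type = "perf"  # type prefix is optional; defaults to perf
    if spec.startswith(("perf:", "acc:")):
        dataset_type, spec = spec.split(":", 1)
    # Path comes first; any remaining comma-separated items are key=value pairs.
    path, *pairs = spec.split(",")
    options = dict(pair.split("=", 1) for pair in pairs)
    return dataset_type, path, options
```

For example, `acc:data.jsonl,parser.prompt=question` would yield type `acc`, path `data.jsonl`, and one dotted option.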

@@ -200,7 +200,7 @@ inference-endpoint benchmark offline \
inference-endpoint benchmark offline \
--endpoints http://localhost:8000 \
--model Qwen/Qwen3-8B \
- --dataset tests/datasets/dummy_1k.jsonl
+ --dataset tests/assets/datasets/dummy_1k.jsonl
```

### Production Benchmark
24 changes: 12 additions & 12 deletions docs/LOCAL_TESTING.md
@@ -6,7 +6,7 @@ How to run and test the CLI locally using the built-in echo server and the inclu

### 1. Prepare Test Environment

- **Dataset:** The repo includes `tests/datasets/dummy_1k.jsonl` (1000 samples)
+ **Dataset:** The repo includes `tests/assets/datasets/dummy_1k.jsonl` (1000 samples)
**Format:** Automatically inferred from the file extension. Common local formats include `jsonl`, `json`, `csv`, `parquet`, and HuggingFace datasets.

### 2. Start the Echo Server
@@ -74,14 +74,14 @@ Waiting for 5 responses...
uv run inference-endpoint -v benchmark offline \
--endpoints http://localhost:8765 \
--model Qwen/Qwen3-8B \
- --dataset tests/datasets/dummy_1k.jsonl \
+ --dataset tests/assets/datasets/dummy_1k.jsonl \
--duration 0

# Production test with custom params and report generation
uv run inference-endpoint -v benchmark offline \
--endpoints http://localhost:8765 \
--model Qwen/Qwen3-8B \
- --dataset tests/datasets/dummy_1k.jsonl \
+ --dataset tests/assets/datasets/dummy_1k.jsonl \
--num-samples 5000 \
--workers 4 \
--report-dir benchmark_report
@@ -114,7 +114,7 @@ Cleaning up...
uv run inference-endpoint -v benchmark online \
--endpoints http://localhost:8765 \
--model Qwen/Qwen3-8B \
- --dataset tests/datasets/dummy_1k.jsonl \
+ --dataset tests/assets/datasets/dummy_1k.jsonl \
--duration 0 \
--load-pattern poisson \
--target-qps 100 \
@@ -154,7 +154,7 @@ uv run inference-endpoint validate-yaml --config offline_template.yaml
uv run inference-endpoint benchmark offline \
--endpoints http://localhost:8765 \
--model Qwen/Qwen3-8B \
- --dataset tests/datasets/ds_samples.jsonl \
+ --dataset tests/assets/datasets/ds_samples.jsonl \
-v
```

@@ -246,7 +246,7 @@ uv run inference-endpoint probe --endpoints http://localhost:8000 --model Qwen/Q
uv run inference-endpoint -v benchmark offline \
--endpoints http://localhost:8000 \
--model Qwen/Qwen3-8B \
- --dataset tests/datasets/dummy_1k.jsonl \
+ --dataset tests/assets/datasets/dummy_1k.jsonl \
--workers 4 \
--report-dir benchmark_report

@@ -261,14 +261,14 @@ pkill -f echo_server
uv run inference-endpoint benchmark offline \
--endpoints http://localhost:8765 \
--model Qwen/Qwen3-8B \
- --dataset tests/datasets/dummy_1k.jsonl \
+ --dataset tests/assets/datasets/dummy_1k.jsonl \
--report-dir offline_report

# Online (Poisson distribution)
uv run inference-endpoint benchmark online \
--endpoints http://localhost:8765 \
--model Qwen/Qwen3-8B \
- --dataset tests/datasets/dummy_1k.jsonl \
+ --dataset tests/assets/datasets/dummy_1k.jsonl \
--load-pattern poisson \
--target-qps 500 \
--report-dir online_report
@@ -277,21 +277,21 @@ uv run inference-endpoint benchmark online \
uv run inference-endpoint benchmark offline \
--endpoints http://localhost:8765 \
--model Qwen/Qwen3-8B \
- --dataset tests/datasets/dummy_1k.jsonl \
+ --dataset tests/assets/datasets/dummy_1k.jsonl \
--num-samples 500

# Force streaming on for offline mode (to test TTFT metrics)
uv run inference-endpoint benchmark offline \
--endpoints http://localhost:8765 \
--model Qwen/Qwen3-8B \
- --dataset tests/datasets/dummy_1k.jsonl \
+ --dataset tests/assets/datasets/dummy_1k.jsonl \
--streaming on

# Concurrency mode (fixed concurrent requests)
uv run inference-endpoint benchmark online \
--endpoints http://localhost:8765 \
--model Qwen/Qwen3-8B \
- --dataset tests/datasets/dummy_1k.jsonl \
+ --dataset tests/assets/datasets/dummy_1k.jsonl \
--load-pattern concurrency \
--concurrency 32
```
@@ -317,7 +317,7 @@ uv run inference-endpoint benchmark online \
- Use `-v` for INFO logging, `-vv` for DEBUG
- Echo server mirrors prompts back - perfect for quick testing without real inference
- Press `Ctrl+C` to gracefully interrupt benchmarks
- - Default test dataset: `tests/datasets/dummy_1k.jsonl` (1000 samples)
+ - Default test dataset: `tests/assets/datasets/dummy_1k.jsonl` (1000 samples)

**Advanced:**

2 changes: 1 addition & 1 deletion examples/02_ServerBenchmarking/README.md
@@ -49,7 +49,7 @@ enroot start -e HF_TOKEN=$HF_TOKEN -m $HF_HOME:/root/.cache/huggingface vllm+vll
Once the server is up and running, we can send requests to the endpoint by passing in the endpoint address and model name:

```
- uv run inference-endpoint benchmark offline --endpoints http://localhost:8000 --dataset tests/datasets/dummy_1k.jsonl --model ${MODEL_NAME}
+ uv run inference-endpoint benchmark offline --endpoints http://localhost:8000 --dataset tests/assets/datasets/dummy_1k.jsonl --model ${MODEL_NAME}
```

# Using a config file
5 changes: 4 additions & 1 deletion pyproject.toml
@@ -63,6 +63,9 @@ dependencies = [
"sentencepiece==0.2.1",
"protobuf==7.34.1",
"openai_harmony==0.0.8",
+ # HDR Histogram for live percentile/histogram approximations in the
+ # metrics aggregator (PyPI: hdrhistogram, importable as hdrh.histogram).
+ "hdrhistogram==0.10.3",
# Color support for cross-platform terminals
"colorama==0.4.6",
# Fix pytz-2024 import warning
@@ -131,7 +134,7 @@ Issues = "https://github.com/mlperf/inference-endpoint/issues"
target-version = "py312"
line-length = 88
exclude = [
- "tests/datasets/*",
+ "tests/assets/datasets/*",
"src/inference_endpoint/openai/openai_types_gen.py",
"src/inference_endpoint/openai/openapi.yaml",
"datasets/*",
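The new `hdrhistogram` dependency backs the live percentile approximations, and commit 3ef6936 pre-checks the constructor's `high >= 2*low` requirement before building the histogram. A minimal usage sketch — the bounds, units, and variable names here are illustrative, not the PR's actual configuration:

```python
from hdrh.histogram import HdrHistogram

# Track latencies from 1 us to 60 s (stored as microseconds) at
# 2 significant figures. HdrHistogram's constructor requires
# highest_trackable >= 2 * lowest_trackable, hence the pre-check
# that commit 3ef6936 adds before calling it.
low, high = 1, 60_000_000
assert high >= 2 * low, "HdrHistogram ctor would reject these bounds"

hist = HdrHistogram(low, high, 2)
for latency_us in (1_200, 3_400, 250_000):
    hist.record_value(latency_us)

# Percentiles are approximate: values are bucketed to the requested
# number of significant figures, trading precision for O(1) memory.
p50 = hist.get_value_at_percentile(50)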
4 changes: 2 additions & 2 deletions scripts/create_dummy_dataset.py
@@ -36,7 +36,7 @@ def create_dummy_dataset(num_samples: int = 1000, output_path: str = None):

Args:
num_samples: Number of samples to generate
- output_path: Output file path (default: tests/datasets/dummy_1k.jsonl)
+ output_path: Output file path (default: tests/assets/datasets/dummy_1k.jsonl)
"""
# Create varied prompts
prompt_templates = [
@@ -122,7 +122,7 @@ def main():
"--output",
"-o",
type=str,
- help="Output file path (default: tests/datasets/dummy_1k.jsonl)",
+ help="Output file path (default: tests/assets/datasets/dummy_1k.jsonl)",
)

args = parser.parse_args()
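For reference, the dataset this script produces is plain JSONL with a `text_input` field per row — the key the perf parser template maps prompts from. A minimal stand-in, assuming only that format (the real script draws from varied prompt templates; the helper name and prompt text below are illustrative):

```python
import json
from pathlib import Path


def write_dummy_dataset(path: str, num_samples: int = 1000) -> None:
    """Write num_samples JSONL rows, one {'text_input': ...} per line."""
    out = Path(path)
    out.parent.mkdir(parents=True, exist_ok=True)
    with out.open("w") as f:
        for i in range(num_samples):
            f.write(json.dumps({"text_input": f"Dummy prompt {i}"}) + "\n")
```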
4 changes: 2 additions & 2 deletions scripts/regenerate_templates.py
@@ -74,14 +74,14 @@
PERF_DATASET = {
"name": "perf",
"type": "performance",
- "path": "<DATASET_PATH eg: tests/datasets/dummy_1k.jsonl>",
+ "path": "<DATASET_PATH eg: tests/assets/datasets/dummy_1k.jsonl>",
"parser": {"prompt": "text_input"},
}

ACC_DATASET = {
"name": "accuracy",
"type": "accuracy",
"path": "<DATASET_PATH eg: tests/datasets/ds_samples.jsonl>",
"path": "<DATASET_PATH eg: tests/assets/datasets/ds_samples.jsonl>",
"eval_method": "exact_match",
"parser": {"prompt": "question", "system": "system_prompt"},
"accuracy_config": {