Merged

58 commits
cfcaa84
feat: switch evals to 8k1k and restart server with native max context
Oseltamivir Mar 7, 2026
8d98b94
fix: cap gen_kwargs max_tokens to leave room for prompt
Oseltamivir Mar 7, 2026
4452868
fix: reserve 30% of context for prompt in eval gen_kwargs
Oseltamivir Mar 7, 2026
83cd7de
change eval calling
Oseltamivir Mar 9, 2026
161e7f6
fix: eval-only server wait and PEP 668 pip install
Oseltamivir Mar 9, 2026
73b6846
change gsm8k to 8-shot
Oseltamivir Mar 9, 2026
83e185d
refactor: decouple eval concurrency, cap gen tokens, fix eval config
Oseltamivir Mar 10, 2026
aad5336
decouple
Oseltamivir Mar 10, 2026
375f14e
fix: uninstall torchvision before lm_eval to fix ATOM container impor…
Oseltamivir Mar 11, 2026
1cdc72a
add gpqa
Oseltamivir Mar 11, 2026
49b0b90
fix: pass multiple eval tasks as separate args for older lm-eval compat
Oseltamivir Mar 11, 2026
cc59d50
fix: run eval tasks sequentially for cross-version lm-eval compat
Oseltamivir Mar 11, 2026
c8b5858
fix: use directory-based task discovery for multi-eval in single lm_e…
Oseltamivir Mar 11, 2026
ffee6c5
gsm8k only
Oseltamivir Mar 12, 2026
c577ca2
Merge branch 'main' into eval-8k1k-server-restart
Oseltamivir Mar 12, 2026
dc25ccd
fix: cap max_gen_tokens to server's max_model_len to avoid request re…
Oseltamivir Mar 12, 2026
285b662
fix: add eval context length override to remaining 18 scripts
Oseltamivir Mar 12, 2026
2d9d7ba
fix: default EVAL_TASKS_DIR to utils/evals directory, not single yaml…
Oseltamivir Mar 13, 2026
1ca2173
fix: reduce eval context multiplier to 5x and increase request timeou…
Oseltamivir Mar 13, 2026
7458eea
Merge branch 'main' into eval-8k1k-server-restart
Oseltamivir Mar 13, 2026
6999263
test other prompt
Oseltamivir Mar 15, 2026
ba45203
pr
Oseltamivir Mar 15, 2026
826035c
test evals
Oseltamivir Mar 15, 2026
34bc7c4
resolve claude issues
Oseltamivir Mar 15, 2026
4978aed
torchvision
Oseltamivir Mar 16, 2026
c038e1b
make stuff neater, ready for merge
Oseltamivir Mar 16, 2026
29d69fa
resolve issues, add --no-evals, change default to flag-less
Oseltamivir Mar 18, 2026
327fd6d
ctxt len
Oseltamivir Mar 18, 2026
6350b6b
resolve claude
Oseltamivir Mar 18, 2026
9ae1ae4
Merge branch 'main' into eval-8k1k-server-restart
Oseltamivir Mar 18, 2026
dec6f60
h200 change
Oseltamivir Mar 18, 2026
b08e063
final touches
Oseltamivir Mar 19, 2026
f04881d
test normal perf-changelog
Oseltamivir Mar 19, 2026
86764fa
Merge branch 'main' into eval-8k1k-server-restart
Oseltamivir Mar 19, 2026
c17619f
test normal perf-changelog
Oseltamivir Mar 20, 2026
d30f807
all evals
Oseltamivir Mar 20, 2026
766a742
remove pycache
Oseltamivir Mar 20, 2026
5d5dd7b
argmax error
Oseltamivir Mar 20, 2026
f54b09c
Merge branch 'main' into eval-8k1k-server-restart
Oseltamivir Mar 20, 2026
b9551f6
merge main
Oseltamivir Mar 21, 2026
f0fff18
blocking rm
Oseltamivir Mar 21, 2026
8bf41f0
Merge branch 'main' into eval-8k1k-server-restart
Oseltamivir Mar 21, 2026
38b80a8
standardize
Oseltamivir Mar 22, 2026
5beca55
reduce ctxt OOM
Oseltamivir Mar 22, 2026
4dcfc92
reduce ctxt OOM
Oseltamivir Mar 22, 2026
f74ad43
block size
Oseltamivir Mar 22, 2026
04c9e88
EACCES
Oseltamivir Mar 22, 2026
2f3a788
final
Oseltamivir Mar 23, 2026
99f5d21
remove sudo rm
Oseltamivir Mar 24, 2026
c23031b
Merge branch 'main' into eval-8k1k-server-restart
Oseltamivir Mar 24, 2026
ac655d3
Merge branch 'main' into eval-8k1k-server-restart
Oseltamivir Mar 26, 2026
bec4dba
Fix newline at end of file in perf-changelog.yaml
Oseltamivir Mar 26, 2026
f4c332b
Merge main
Oseltamivir Mar 27, 2026
87f711e
Merge branch 'main' into eval-8k1k-server-restart
Oseltamivir Mar 27, 2026
ed3b2fc
eliminate 1k8k
Oseltamivir Mar 28, 2026
194d52a
Merge branch 'main' into eval-8k1k-server-restart
Oseltamivir Mar 28, 2026
f5fd82c
changelog
Oseltamivir Mar 28, 2026
ecb4717
final
Oseltamivir Mar 28, 2026
286 changes: 0 additions & 286 deletions .github/configs/amd-master.yaml

Large diffs are not rendered by default.

1,133 changes: 102 additions & 1,031 deletions .github/configs/nvidia-master.yaml

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions .github/workflows/README.md
@@ -40,7 +40,7 @@ usage: generate_sweep_configs.py full-sweep
[--precision PRECISION [PRECISION ...]]
[--framework FRAMEWORK [FRAMEWORK ...]]
[--runner-type RUNNER_TYPE [RUNNER_TYPE ...]]
[--seq-lens {1k1k,1k8k,8k1k} [{1k1k,1k8k,8k1k} ...]]
[--seq-lens {1k1k,8k1k} [{1k1k,8k1k} ...]]
[--step-size STEP_SIZE]
[--max-conc MAX_CONC]
[--max-tp MAX_TP]
@@ -62,9 +62,9 @@ full-sweep --config-files .github/configs/nvidia-master.yaml
full-sweep --single-node --model-prefix gptoss --runner-type b200 --seq-lens 1k1k --config-files .github/configs/nvidia-master.yaml
```

**Test all single-node fp8 precision configs for 1k8k workloads:**
**Test all single-node fp8 precision configs for 8k1k workloads:**
```
full-sweep --single-node --precision fp8 --seq-lens 1k8k --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml
full-sweep --single-node --precision fp8 --seq-lens 8k1k --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml
```

**Test all single-node TRT configs on H200 runners:**
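The README diff above narrows `--seq-lens` from three choices to two, dropping `1k8k`. A minimal argparse sketch of how that documented interface could look — only the flag name and its choices come from the README; the parser structure, defaults, and other flags here are illustrative assumptions:

```python
import argparse

# Hypothetical reconstruction of the documented full-sweep CLI surface.
# Only --seq-lens and its choices (1k1k, 8k1k; 1k8k removed) are taken
# from the README above -- everything else is illustrative.
parser = argparse.ArgumentParser(prog="generate_sweep_configs.py")
subparsers = parser.add_subparsers(dest="subcommand", required=True)
full_sweep = subparsers.add_parser("full-sweep")
full_sweep.add_argument("--seq-lens", nargs="+",
                        choices=["1k1k", "8k1k"], default=["1k1k"])
full_sweep.add_argument("--config-files", nargs="+", required=True)

args = parser.parse_args([
    "full-sweep", "--seq-lens", "8k1k",
    "--config-files", ".github/configs/nvidia-master.yaml",
])
```

With `choices` set this way, a stale `--seq-lens 1k8k` invocation fails at parse time instead of silently producing an empty sweep.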
3 changes: 3 additions & 0 deletions .github/workflows/benchmark-multinode-tmpl.yml
@@ -96,6 +96,8 @@ env:
CONC_LIST: ${{ join(fromJson(inputs.conc-list), ' ') }}
SPEC_DECODING: ${{ inputs.spec-decoding }}
DISAGG: ${{ inputs.disagg }}
PYTHONDONTWRITEBYTECODE: '1'
PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache

PREFILL_NUM_WORKERS: ${{ inputs.prefill-num-worker }}
PREFILL_TP: ${{ inputs.prefill-tp }}
@@ -142,6 +144,7 @@ jobs:
token: ${{ secrets.REPO_PAT }}
fetch-depth: 0
ref: ${{ inputs.ref || github.ref }}
clean: false

- name: Launch multi-node job script
env:
60 changes: 42 additions & 18 deletions .github/workflows/benchmark-tmpl.yml
@@ -54,6 +54,11 @@ on:
type: boolean
required: true
default: false
eval-only:
description: "Run only evals (skip throughput benchmark)"
type: boolean
required: false
default: false
random-range-ratio:
required: false
type: string
@@ -83,6 +88,9 @@ env:
SPEC_DECODING: ${{ inputs.spec-decoding }}
DISAGG: ${{ inputs.disagg }}
RUN_EVAL: ${{ inputs.run-eval }}
EVAL_ONLY: ${{ inputs.eval-only }}
PYTHONDONTWRITEBYTECODE: '1'
PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache

permissions:
contents: read
@@ -91,7 +99,7 @@ jobs:
benchmark:
runs-on: ${{ inputs.runner }}
timeout-minutes: 300
name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} | disagg-${{ inputs.disagg }} spec-${{ inputs.spec-decoding }} conc-${{ inputs.conc }}${{ inputs.run-eval && ' | eval' || '' }}"
name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} | disagg-${{ inputs.disagg }} spec-${{ inputs.spec-decoding }} conc-${{ inputs.conc }}${{ inputs.eval-only && ' | eval-only' || (inputs.run-eval && ' | eval' || '') }}"
steps:
- name: Resource cleanup (pre-run)
run: &resource-cleanup |
@@ -123,13 +131,14 @@ jobs:
sleep 5
done
fi
fi
fi

- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
token: ${{ secrets.REPO_PAT }}
fetch-depth: 0
ref: ${{ inputs.ref || github.ref }}
clean: false

- name: Launch job script
env:
@@ -145,28 +154,42 @@ jobs:
echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV

bash ./runners/launch_${RUNNER_NAME%%_*}.sh
FOUND_RESULT_FILE=
for i in {1..10}; do
if [ -f "$RESULT_FILENAME.json" ]; then
FOUND_RESULT_FILE=true
break

if [ "${{ inputs.eval-only }}" = "true" ]; then
echo "Eval-only mode: skipping benchmark result file check"
# Verify eval produced results
if ! ls results*.json 1>/dev/null 2>&1; then
echo "Eval-only run failed: no results*.json files found." >&2
exit 1
fi
echo "Waiting for result file... (attempt $i)"
sleep 1
done
# Verify eval scores meet per-benchmark minimum thresholds
python3 utils/evals/validate_scores.py
else
FOUND_RESULT_FILE=
for i in {1..10}; do
if [ -f "$RESULT_FILENAME.json" ]; then
FOUND_RESULT_FILE=true
break
fi
echo "Waiting for result file... (attempt $i)"
sleep 1
done

if [ -z "$FOUND_RESULT_FILE" ]; then
echo "Run failed: Benchmark result $RESULT_FILENAME.json not found." >&2
exit 1
if [ -z "$FOUND_RESULT_FILE" ]; then
echo "Run failed: Benchmark result $RESULT_FILENAME.json not found." >&2
exit 1
fi
fi

- name: Process result
if: ${{ !inputs.eval-only }}
env:
RUNNER_TYPE: ${{ inputs.runner }}
run: |
python3 utils/process_result.py

- name: Upload result
if: ${{ !inputs.eval-only }}
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: bmk_${{ env.RESULT_FILENAME }}
@@ -176,35 +199,36 @@ jobs:
if: always()
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: server_logs_${{ env.RESULT_FILENAME }}
name: ${{ inputs.eval-only && 'eval_server_logs_' || 'server_logs_' }}${{ env.RESULT_FILENAME }}
path: server.log
if-no-files-found: ignore

- name: Upload GPU metrics
if: always()
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: gpu_metrics_${{ env.RESULT_FILENAME }}
name: ${{ inputs.eval-only && 'eval_gpu_metrics_' || 'gpu_metrics_' }}${{ env.RESULT_FILENAME }}
path: gpu_metrics.csv
if-no-files-found: ignore

- name: Upload eval results (if any)
if: ${{ env.RUN_EVAL == 'true' }}
if: ${{ always() && (env.RUN_EVAL == 'true' || inputs.eval-only) }}
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }}
path: |
meta_env.json
results*.json
sample*.jsonl
if-no-files-found: ignore
if-no-files-found: ${{ inputs.eval-only && 'error' || 'ignore' }}

- name: Cleanup eval outputs (post-upload)
if: ${{ env.RUN_EVAL == 'true' }}
if: ${{ always() && (env.RUN_EVAL == 'true' || inputs.eval-only) }}
run: |
rm -f meta_env.json || true
# Remove any eval results JSONs that were moved into workspace
rm -f results*.json || true
rm -f sample*.jsonl || true

- name: Resource cleanup (post-run)
if: always()
7 changes: 3 additions & 4 deletions .github/workflows/claude.yml
@@ -97,7 +97,7 @@ jobs:
The `generate-cli-command` input accepts arguments for `generate_sweep_configs.py`. Usage: `generate_sweep_configs.py` `[-h]` `{full-sweep,runner-model-sweep,test-config}`

**Subcommand reference:**
- `full-sweep`: Use this subcommand with filter flags like `--model-prefix`, `--framework`, `--precision`, `--runner-type`, `--min-conc`, `--max-conc`, `--seq-len`. This is the primary subcommand for running benchmarks.
- `full-sweep`: Use this subcommand with filter flags like `--model-prefix`, `--framework`, `--precision`, `--runner-type`, `--min-conc`, `--max-conc`, `--seq-lens`. This is the primary subcommand for running benchmarks.
- `test-config`: Use this subcommand ONLY when prompted to with 'test-config'. Uses the flags `--config-files` and `--config-keys`, does NOT accept any other arguments.

Examples:
@@ -119,7 +119,7 @@ jobs:

**Specify concurrency and sequence length:**
```
generate-cli-command: "full-sweep --config-files .github/configs/nvidia-master.yaml --single-node --model-prefix dsr1 --min-conc 4 --max-conc 4 --seq-len 1k1k"
generate-cli-command: "full-sweep --config-files .github/configs/nvidia-master.yaml --single-node --model-prefix dsr1 --min-conc 4 --max-conc 4 --seq-lens 1k1k"
```

**Test specific config keys (MUST USE `--conc`):**
@@ -130,7 +130,7 @@ jobs:
**IMPORTANT: Keep runs precise and efficient:**
- Use `full-sweep` with filter flags to narrow down the benchmark scope - "full-sweep" does NOT mean running everything
- When using `full-sweep`, you must use `--min-conc` and `--max-conc` together to specify a single concurrency value. Unless prompted otherwise, use `--min-conc 4 --max-conc 4`
- When using `full-sweep`, you can use `--seq-len` to specify a single sequence length (choices: 1k1k, 1k8k, 8k1k). Unless prompted otherwise, use `--seq-len 1k1k`
- When using `full-sweep`, you can use `--seq-lens` to specify sequence lengths (choices: 1k1k, 8k1k). Unless prompted otherwise, use `--seq-lens 1k1k`
- Use `test-config` ONLY when given specific config keys to test - Use `--config-files`, `--config-keys`, and `--conc` flags ONLY
- Always filter by specific models, frameworks, precision, conc, or config keys when possible

@@ -291,4 +291,3 @@ jobs:
# Then use $EP in the vllm serve command
```
This ensures the script respects the `ep` setting in the master config YAML's search-space.

42 changes: 38 additions & 4 deletions .github/workflows/e2e-tests.yml
@@ -37,6 +37,7 @@ jobs:
outputs:
single-node-config: ${{ steps.get-jobs.outputs.single-node-config }}
multi-node-config: ${{ steps.get-jobs.outputs.multi-node-config }}
eval-config: ${{ steps.get-jobs.outputs.eval-config }}
steps:
- name: Checkout code (ref)
if: ${{ inputs.ref && inputs.ref != '' }}
@@ -53,10 +54,12 @@ jobs:
pip install pydantic
CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py \
${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }})
SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x]))")
SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and not x.get('run-eval', False)]))")
MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x]))")
EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('run-eval', False)]))")
echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT
echo "multi-node-config=$MULTI" >> $GITHUB_OUTPUT
echo "eval-config=$EVALS" >> $GITHUB_OUTPUT

test-sweep-multi-node:
needs: get-jobs
@@ -123,7 +126,38 @@ jobs:
conc: ${{ matrix.config.conc }}
spec-decoding: ${{ matrix.config.spec-decoding }}
disagg: ${{ matrix.config.disagg }}
run-eval: ${{ matrix.config.run-eval }}
run-eval: false
ref: ${{ inputs.ref }}

test-sweep-evals:
needs: get-jobs
if: ${{ needs.get-jobs.outputs.eval-config != '[]' }}
uses: ./.github/workflows/benchmark-tmpl.yml
name: eval /
strategy:
fail-fast: false
matrix:
config: ${{ fromJson(needs.get-jobs.outputs.eval-config) }}
secrets: inherit
with:
exp-name: ${{ matrix.config.exp-name }}
isl: ${{ matrix.config.isl }}
osl: ${{ matrix.config.osl }}
max-model-len: ${{ matrix.config.max-model-len }}
runner: ${{ matrix.config.runner }}
image: ${{ matrix.config.image }}
model: ${{ matrix.config.model }}
model-prefix: ${{ matrix.config.model-prefix }}
framework: ${{ matrix.config.framework }}
precision: ${{ matrix.config.precision }}
tp: ${{ matrix.config.tp }}
ep: ${{ matrix.config.ep }}
dp-attn: ${{ matrix.config.dp-attn }}
conc: ${{ matrix.config.conc }}
spec-decoding: ${{ matrix.config.spec-decoding }}
disagg: ${{ matrix.config.disagg }}
run-eval: true
eval-only: true
ref: ${{ inputs.ref }}

collect-results:
@@ -135,8 +169,8 @@ jobs:
result-prefix: "bmk"

collect-evals:
needs: [test-sweep-multi-node, test-sweep-single-node]
if: ${{ always() }}
needs: [test-sweep-evals]
if: ${{ always() && needs.test-sweep-evals.result != 'skipped' }}
uses: ./.github/workflows/collect-evals.yml
secrets: inherit

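The three inline `python3 -c` filters added to the get-jobs step partition one generated config list three ways. The same logic as a standalone function — the function name is made up for illustration, but the predicates are copied from the workflow:

```python
def split_jobs(configs):
    """Partition sweep configs as the get-jobs step does: multi-node
    entries carry a 'prefill' key; the remaining single-node entries
    are split on their run-eval flag into eval and benchmark jobs."""
    multi = [c for c in configs if "prefill" in c]
    evals = [c for c in configs
             if "prefill" not in c and c.get("run-eval", False)]
    single = [c for c in configs
              if "prefill" not in c and not c.get("run-eval", False)]
    return single, multi, evals
```

This split is what lets the new `test-sweep-evals` job run eval-flagged configs through the benchmark template with `eval-only: true`, while plain single-node configs now always pass `run-eval: false`.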
5 changes: 4 additions & 1 deletion .github/workflows/profile.yml
@@ -35,6 +35,8 @@ env:
HF_HUB_CACHE: '/mnt/hf_hub_cache/'
RANDOM_RANGE_RATIO: '0.8'
PERFETTO_RELAY_URL: https://semianalysisai.github.io/InferenceX-trace-storage
PYTHONDONTWRITEBYTECODE: '1'
PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache

jobs:
get-jobs:
@@ -87,7 +89,7 @@ jobs:
- name: Fail if no matching entries
if: ${{ steps.filter.outputs.count == '0' }}
run: |
echo "No entries produced for config-key=${{ inputs.config-key }}, seq-lens=${{ inputs.seq-lens }}, conc=${{ inputs.conc }}." >&2
echo "No entries produced for config-key=${{ inputs.config-key }}, conc=${{ inputs.conc }}." >&2
exit 1

profile:
@@ -153,6 +155,7 @@ jobs:
with:
fetch-depth: 0
ref: ${{ inputs.ref || github.ref }}
clean: false

- name: Launch + Profile (single-node sglang/vllm)
id: run