diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index b9560803e..a5986e32a 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -12,10 +12,6 @@ dsr1-fp4-mi355x-sglang: search-space: - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -35,11 +31,6 @@ dsr1-fp4-mi355x-atom: search-space: - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 } - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 4, ep: 1, conc-start: 128, conc-end: 256 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: @@ -61,11 +52,6 @@ dsr1-fp4-mi355x-atom-mtp: search-space: - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - # - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -85,10 +71,6 @@ dsr1-fp8-mi300x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -107,10 +89,6 @@ dsr1-fp8-mi325x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -129,10 +107,6 @@ dsr1-fp8-mi355x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -152,10 +126,6 @@ qwen3.5-bf16-mi355x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -174,10 +144,6 @@ qwen3.5-bf16-mi300x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -196,10 +162,6 @@ qwen3.5-bf16-mi325x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -218,10 +180,6 @@ qwen3.5-fp8-mi325x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -240,10 +198,6 @@ qwen3.5-fp8-mi355x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -262,10 +216,6 @@ qwen3.5-fp8-mi300x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -284,10 +234,6 @@ glm5-fp8-mi355x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -306,10 +252,6 @@ kimik2.5-int4-mi355x-vllm: 
osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -328,10 +270,6 @@ kimik2.5-int4-mi325x-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -351,12 +289,6 @@ kimik2.5-fp4-mi355x-vllm: search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -378,12 +310,6 @@ minimaxm2.5-fp8-mi355x-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } - isl: 8192 osl: 1024 search-space: @@ -405,11 +331,6 @@ minimaxm2.5-fp8-mi300x-vllm: search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -430,11 +351,6 @@ minimaxm2.5-fp8-mi325x-vllm: search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -457,13 +373,6 @@ gptoss-fp4-mi300x-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 8192 osl: 1024 search-space: @@ -488,13 +397,6 @@ gptoss-fp4-mi325x-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 64, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -518,12 +420,6 @@ gptoss-fp4-mi355x-vllm: - { tp: 1, conc-start: 4, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 8 } - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 8 } - - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 8192 osl: 1024 search-space: @@ -545,11 +441,6 @@ gptoss-fp4-mi355x-atom: search-space: - { tp: 1, conc-start: 16, conc-end: 128 } - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 16, conc-end: 128 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -570,10 +461,6 @@ dsr1-fp8-mi355x-atom: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 128 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 
search-space: @@ -593,10 +480,6 @@ dsr1-fp8-mi355x-atom-mtp: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -911,129 +794,6 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=2" - # FIXME(billishyahao): disable 1k8k for now - # - isl: 1024 - # osl: 8192 - # search-space: - # # MTP configurations - # # "Top of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8) - # - spec-decoding: "mtp" - # conc-list: [ 2048 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 1 - # tp: 1 - # ep: 16 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=1" - - - # # "Middle of curve" (1 prefill worker at DEP8 and 2 decode workers each at DEP8) - # - spec-decoding: "mtp" - # conc-list: [ 256, 512, 1024 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 2 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=1" - - - # # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - # - spec-decoding: "mtp" - # conc-list: [ 32, 64, 128 ] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=1" - - # decode: - # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=1" - - # # non-MTP configurations - # # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16) - # - spec-decoding: "none" - # conc-list: [ 2048 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 1 - # tp: 1 - # ep: 16 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=0" - - # # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8) - # - spec-decoding: "none" - # conc-list: [ 256, 512, 1024 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 2 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=0" - - # # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - # - spec-decoding: "none" - # conc-list: [ 32, 64, 128 ] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=0" - dsr1-fp4-mi355x-sglang-disagg: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 @@ -1453,49 +1213,3 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" - - # FIXME(billishyahao): disable FP4 1k8k for now - # - isl: 1024 - # osl: 8192 - # search-space: - # # MTP configurations - # # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - # - spec-decoding: "mtp" - # conc-list: [ 32, 64, 128 ] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=1" - - # decode: - # 
num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=1" - - # # non-MTP configurations - # # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - # - spec-decoding: "none" - # conc-list: [ 32, 64, 128 ] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=0" - diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c76aee8f6..33751270b 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1663,11 +1663,6 @@ dsr1-fp4-b200-sglang: search-space: - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: @@ -1694,17 +1689,6 @@ dsr1-fp4-b200-trt: - { tp: 8, conc-start: 4, conc-end: 4 } - { tp: 8, ep: 8, conc-start: 64, conc-end: 64 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 } - - isl: 1024 - osl: 8192 - search-space: - # low concurrency cases use TP only - # concurrency 64 uses TP & EP - # high concurrency cases use TP & EP & DP-ATTN - - { tp: 4, conc-start: 4, conc-end: 16 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } - - { tp: 8, conc-start: 4, conc-end: 4 } - - { tp: 8, ep: 8, conc-start: 64, conc-end: 64 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 } - isl: 8192 osl: 1024 search-space: @@ -1737,17 +1721,6 @@ dsr1-fp4-b200-trt-mtp: - { tp: 8, conc-start: 128, conc-end: 128, spec-decoding: mtp } - { tp: 8, ep: 8, conc-start: 32, conc-end: 128, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 64, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - # TP=4 configurations - - { tp: 4, conc-start: 16, conc-end: 16, spec-decoding: mtp } - - { tp: 4, ep: 4, conc-start: 8, conc-end: 8, spec-decoding: mtp } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } - # TP=8 configurations - - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 32, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -1772,10 +1745,6 @@ dsr1-fp8-b200-sglang: osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -1795,10 +1764,6 @@ qwen3.5-bf16-b200-sglang: osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -1818,11 +1783,6 @@ qwen3.5-fp8-b200-sglang: search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 16 } - { tp: 4, ep: 4, conc-start: 16, conc-end: 128 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 16 } - - { tp: 4, ep: 4, conc-start: 16, conc-end: 128 } - isl: 8192 osl: 1024 search-space: @@ -1842,10 +1802,6 @@ glm5-fp8-b200-sglang: osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 } - - 
isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: @@ -1864,10 +1820,6 @@ qwen3.5-fp8-b200-sglang-mtp: osl: 1024 search-space: - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -1886,10 +1838,6 @@ kimik2.5-int4-b200-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -1908,10 +1856,6 @@ kimik2.5-int4-h200-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -1931,10 +1875,6 @@ kimik2.5-fp4-b200-vllm: search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -1954,10 +1894,6 @@ dsr1-fp8-b200-sglang-mtp: osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -1978,11 +1914,6 @@ dsr1-fp8-b200-trt: - { tp: 8, ep: 1, conc-start: 64, conc-end: 128 } - { tp: 4, ep: 1, conc-start: 8, conc-end: 16 } - { tp: 8, ep: 1, conc-start: 4, conc-end: 8 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256} - - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: @@ -2007,13 +1938,6 @@ dsr1-fp8-b200-trt-mtp: # If CONC == 256, then TP8, EP8, DP_ATTN=true - { tp: 8, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - # mostly TP8 - # If CONC >= 128, then TP8, EP8, DP_ATTN=true - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -2033,10 +1957,6 @@ dsr1-fp8-h200-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -2055,10 +1975,6 @@ qwen3.5-fp8-h200-sglang: osl: 1024 search-space: - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -2077,10 +1993,6 @@ glm5-fp8-h200-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -2101,11 +2013,6 @@ dsr1-fp8-h200-trt: # If CONC > 64, then DP_ATTN=true search-space: - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - # If CONC > 64, then DP_ATTN=true - search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 # If CONC > 32, then DP_ATTN=true @@ -2129,12 +2036,6 @@ dsr1-fp8-h200-trt-mtp: # If CONC >= 128, then DP_ATTN=true, MTP=1 - { tp: 8, ep: 8, conc-start: 4, conc-end: 
64, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - # If CONC >= 256, then DP_ATTN=true, MTP=1 - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -3149,14 +3050,6 @@ gptoss-fp4-b200-trt: - { tp: 4, conc-start: 4, conc-end: 4 } - { tp: 8, conc-start: 4, conc-end: 4 } # Low ==> high TP from Left to Right of pareto - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 256, conc-end: 256} - - { tp: 2, conc-start: 128, conc-end: 256} - - { tp: 4, conc-start: 4, conc-end: 256} - - { tp: 8, conc-start: 4, conc-end: 4} - # Low ==> high TP from Left to Right of pareto - isl: 8192 osl: 1024 search-space: @@ -3181,13 +3074,6 @@ gptoss-fp4-b200-vllm: - { tp: 2, conc-start: 4, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 8 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 2, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 8 } - isl: 8192 osl: 1024 search-space: @@ -3210,11 +3096,6 @@ minimaxm2.5-fp8-b200-vllm: search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -3236,12 +3117,6 @@ gptoss-fp4-h100-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -3263,11 +3138,6 @@ minimaxm2.5-fp8-h100-vllm: search-space: # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -3345,67 +3215,6 @@ dsr1-fp8-h100-dynamo-sglang: tp: 16 ep: 16 dp-attn: true - - isl: 1024 - osl: 8192 - search-space: - # # STP: Max throughput TEP (1 prefill, 2 decode) - # - conc-list: [1, 2, 4, 8, 16, 32] - # prefill: - # num-worker: 1 - # tp: 16 - # ep: 1 - # dp-attn: false - # additional-settings: - # - "CONFIG_FILE=recipes/h100/1k8k/stp/h100-fp8-1p2d-max-tp.yaml" - # decode: - # num-worker: 2 - # tp: 16 - # ep: 1 - # dp-attn: false - # # STP: Max throughput DEP (1 prefill, 1 decode, dp-attention) - # - conc-list: [1, 2, 4, 8] - # prefill: - # num-worker: 1 - # tp: 16 - # ep: 1 - # dp-attn: false - # additional-settings: - # - "CONFIG_FILE=recipes/h100/1k8k/stp/h100-fp8-1p1d-max-dep.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # MTP: Max throughput TEP (1 prefill, 2 decode) - - spec-decoding: "mtp" - conc-list: [1, 2, 4, 8, 16, 32, 64] - prefill: - num-worker: 1 - tp: 16 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h100/1k8k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 1 - dp-attn: false - # MTP: Max throughput DEP (1 prefill, 1 decode, dp-attention) - - spec-decoding: "mtp" - conc-list: [1, 2, 4, 8, 16, 32, 64] - prefill: - 
num-worker: 1 - tp: 16 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h100/1k8k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - isl: 8192 osl: 1024 search-space: @@ -3485,13 +3294,6 @@ gptoss-fp4-h200-trt: - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } - isl: 8192 osl: 1024 search-space: @@ -3516,13 +3318,6 @@ gptoss-fp4-h200-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 4, conc-end: 4 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -3544,10 +3339,6 @@ minimaxm2.5-fp8-h200-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 128 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: @@ -3744,8 +3535,8 @@ dsr1-fp4-gb200-dynamo-trt: ep: 32 dp-attn: true - - isl: 1024 - osl: 8192 + - isl: 8192 + osl: 1024 search-space: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" @@ -3756,342 +3547,132 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen7_tep8_batch4_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen7_tep8_batch4_eplb0_mtp3.yaml" - decode: - num-worker: 7 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [ 7 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" decode: - num-worker: 7 + num-worker: 4 tp: 8 ep: 8 dp-attn: false - spec-decoding: "mtp" - conc-list: [ 128 ] + conc-list: [ 180 ] prefill: - num-worker: 1 + num-worker: 3 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - spec-decoding: "mtp" - conc-list: [ 512 ] + conc-list: [ 1229 ] prefill: - num-worker: 1 + num-worker: 7 tp: 4 ep: 4 dp-attn: true additional-settings: - 
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml" decode: num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 3072 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen3_dep16_batch64_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen3_dep16_batch64_eplb0_mtp3.yaml" - decode: - num-worker: 3 tp: 16 ep: 16 dp-attn: true - spec-decoding: "mtp" - conc-list: [ 6144 ] + conc-list: [ 666 ] prefill: - num-worker: 1 + num-worker: 8 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen3_dep16_batch128_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen3_dep16_batch128_eplb0_mtp1.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml" decode: - num-worker: 3 - tp: 16 - ep: 16 + num-worker: 1 + tp: 32 + ep: 32 dp-attn: true - spec-decoding: "mtp" - conc-list: [ 8192 ] + conc-list: [ 4301 ] prefill: - num-worker: 1 + num-worker: 11 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch256_eplb288_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch256_eplb288_mtp1.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml" decode: num-worker: 1 - tp: 32 - ep: 32 + tp: 16 + ep: 16 dp-attn: true # Non-MTP configurations (default spec_decoding="none") - - conc-list: [ 5 ] + - conc-list: [ 12, 44, 76 ] prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml" decode: - num-worker: 7 + num-worker: 4 tp: 8 ep: 8 dp-attn: false - - conc-list: [ 60 ] + - conc-list: [ 5 ] prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen15_tep4_batch4_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen15_tep4_batch4_eplb0_mtp0.yaml" + # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" decode: - num-worker: 15 - tp: 4 - ep: 4 + num-worker: 4 + tp: 8 + ep: 8 dp-attn: false - - conc-list: [ 135 ] + - conc-list: [ 333 ] prefill: - num-worker: 1 + num-worker: 2 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen15_tep4_batch8_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen15_tep4_batch8_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml" decode: - num-worker: 15 - tp: 4 - ep: 4 - dp-attn: false - - conc-list: [ 563 ] - prefill: num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [ 1229 ] + prefill: + num-worker: 7 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - - conc-list: [ 2048 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch64_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch64_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [ 4096 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch128_eplb288_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch128_eplb288_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [ 8192 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch256_eplb288_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch256_eplb288_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations (spec_decoding="mtp") - - spec-decoding: "mtp" - conc-list: [ 4, 8, 12, 24, 48 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [ 180 ] - prefill: - num-worker: 3 - tp: 4 - ep: 4 - dp-attn: true - 
additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 1229 ] - prefill: - num-worker: 7 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 666 ] - prefill: - num-worker: 8 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 4301 ] - prefill: - num-worker: 11 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - # Non-MTP configurations (default spec_decoding="none") - - conc-list: [ 12, 44, 76 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [ 5 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [ 333 ] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [ 1229 ] - prefill: - num-worker: 7 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [ 2253 ] + - conc-list: [ 2253 ] prefill: num-worker: 8 tp: 4 @@ -4339,156 +3920,6 @@ dsr1-fp8-gb200-dynamo-trt: tp: 8 ep: 8 dp-attn: false - # 1k8k MTP configs - - isl: 1024 - osl: 8192 - search-space: - - spec-decoding: "mtp" - conc-list: [8192] - 
prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen2_dep16_batch256_eplb0_mtp1_8192.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen2_dep16_batch256_eplb0_mtp1_8192.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [2152] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen2_dep16_batch64_eplb0_mtp1_2152.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen2_dep16_batch64_eplb0_mtp1_2152.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [564] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_564.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_564.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [72] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen1_dep32_batch2_eplb0_mtp3_72.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen1_dep32_batch2_eplb0_mtp3_72.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [4, 8] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen4_tep8_batch2_eplb0_mtp3_8.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen4_tep8_batch2_eplb0_mtp3_8.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - # 1k8k STP configs - - conc-list: [8192] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen2_dep16_batch256_eplb0_mtp0_8192.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen2_dep16_batch256_eplb0_mtp0_8192.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [2048] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen1_dep32_batch64_eplb0_mtp0_2048.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen1_dep32_batch64_eplb0_mtp0_2048.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [564] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0_564.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0_564.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [36] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [4] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false # 8k1k MTP configs - isl: 8192 osl: 1024 @@ -5079,343 +4510,164 @@ dsr1-fp4-gb300-dynamo-trt: - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml" decode: num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [333] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [5] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [8, 12, 24, 48] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [2253] - prefill: - num-worker: 3 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [1229] - prefill: - num-worker: 3 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - # Non-MTP configurations (default spec_decoding="none") - - conc-list: [5] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [12, 48, 96, 192] - 
prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [8192] - prefill: - num-worker: 2 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [1229] - prefill: - num-worker: 2 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [4301] - prefill: - num-worker: 3 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 + tp: 4 + ep: 4 dp-attn: true - - conc-list: [2253] + - spec-decoding: "mtp" + conc-list: [333] prefill: - num-worker: 3 + num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - - isl: 1024 - osl: 8192 - search-space: - # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" - conc-list: [7] + conc-list: [5] prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" decode: - num-worker: 7 + num-worker: 4 tp: 8 ep: 8 dp-attn: false - spec-decoding: "mtp" - conc-list: [63] + conc-list: [8, 12, 24, 48] prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen7_tep8_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen7_tep8_batch8_eplb0_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml 
+ - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" decode: - num-worker: 7 + num-worker: 4 tp: 8 ep: 8 dp-attn: false - spec-decoding: "mtp" - conc-list: [563] + conc-list: [2253] prefill: - num-worker: 1 + num-worker: 3 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml" decode: num-worker: 1 - tp: 32 - ep: 32 + tp: 16 + ep: 16 dp-attn: true - spec-decoding: "mtp" - conc-list: [2088] + conc-list: [1229] prefill: - num-worker: 1 + num-worker: 3 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen1_dep32_batch64_eplb288_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen1_dep32_batch64_eplb288_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - - spec-decoding: "mtp" - conc-list: [8192] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen2_dep16_batch256_eplb256_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen2_dep16_batch256_eplb256_mtp1.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [16384] + # Non-MTP configurations (default spec_decoding="none") + - conc-list: [5] prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen4_dep8_batch512_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen4_dep8_batch512_eplb0_mtp1.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 4 tp: 8 ep: 8 - dp-attn: true - # STP configurations (no spec_decoding) - - conc-list: [7] + dp-attn: false + - conc-list: [12, 48, 96, 192] prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" decode: - num-worker: 7 + num-worker: 4 tp: 8 ep: 8 dp-attn: false - - conc-list: [60] + - conc-list: [8192] prefill: - num-worker: 1 + num-worker: 2 tp: 2 ep: 2 dp-attn: true additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen15_tep4_batch4_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen15_tep4_batch4_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml" decode: - num-worker: 15 - tp: 4 - ep: 4 - dp-attn: false - - conc-list: [245] - prefill: num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 7 tp: 8 ep: 8 - dp-attn: false - - conc-list: [1024] + dp-attn: true + - conc-list: [1229] prefill: - num-worker: 1 + num-worker: 2 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - - conc-list: [4096] + - conc-list: [4301] prefill: - num-worker: 1 + num-worker: 3 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen1_dep32_batch128_eplb288_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen1_dep32_batch128_eplb288_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml" decode: num-worker: 1 - tp: 32 - ep: 32 + tp: 16 + ep: 16 dp-attn: true - - conc-list: [8192] + - conc-list: [2253] prefill: - num-worker: 1 + num-worker: 3 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen1_dep32_batch256_eplb288_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen1_dep32_batch256_eplb288_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 32 @@ -6184,187 +5436,6 @@ dsr1-fp8-gb300-dynamo-trt: tp: 8 ep: 8 dp-attn: true - - isl: 1024 - osl: 8192 - search-space: - # MTP configurations (spec_decoding="mtp") - - spec-decoding: "mtp" - conc-list: [4] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_4.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_4.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - 
conc-list: [16] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_16.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_16.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [141] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_141.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_141.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [544] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_544.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_544.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [2048] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen2_dep16_batch64_eplb0_mtp1_2048.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen2_dep16_batch64_eplb0_mtp1_2048.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [8192] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp1_8192.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp1_8192.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: true - # STP configurations (no spec_decoding) - - conc-list: [4] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [36] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [282] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_282.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_282.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [1024] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1024.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1024.yaml"
-        decode:
-          num-worker: 1
-          tp: 32
-          ep: 32
-          dp-attn: true
-      - conc-list: [4096]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 4
-          dp-attn: true
-          additional-settings:
-            # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen2_dep16_batch128_eplb0_mtp0_4096.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen2_dep16_batch128_eplb0_mtp0_4096.yaml"
-        decode:
-          num-worker: 2
-          tp: 16
-          ep: 16
-          dp-attn: true
-      - conc-list: [8192]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 4
-          dp-attn: true
-          additional-settings:
-            # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen2_dep16_batch256_eplb0_mtp0_8192.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen2_dep16_batch256_eplb0_mtp0_8192.yaml"
-        decode:
-          num-worker: 2
-          tp: 16
-          ep: 16
-          dp-attn: true
-
-
 gptoss-fp4-gb200-dynamo-trt:
   image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.7.0.post2
   model: openai/gpt-oss-120b
diff --git a/.github/workflows/README.md b/.github/workflows/README.md
index 37e64b8ed..de0a3dcab 100644
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@@ -40,7 +40,7 @@ usage: generate_sweep_configs.py full-sweep
                              [--precision PRECISION [PRECISION ...]]
                              [--framework FRAMEWORK [FRAMEWORK ...]]
                              [--runner-type RUNNER_TYPE [RUNNER_TYPE ...]]
-                             [--seq-lens {1k1k,1k8k,8k1k} [{1k1k,1k8k,8k1k} ...]]
+                             [--seq-lens {1k1k,8k1k} [{1k1k,8k1k} ...]]
                              [--step-size STEP_SIZE]
                              [--max-conc MAX_CONC]
                              [--max-tp MAX_TP]
@@ -62,9 +62,9 @@ full-sweep --config-files .github/configs/nvidia-master.yaml
 full-sweep --single-node --model-prefix gptoss --runner-type b200 --seq-lens 1k1k --config-files .github/configs/nvidia-master.yaml
 ```
 
-**Test all single-node fp8 precision configs for 1k8k workloads:**
+**Test all single-node fp8 precision configs for 8k1k workloads:**
 ```
-full-sweep --single-node --precision fp8 --seq-lens 1k8k --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml
+full-sweep --single-node --precision fp8 --seq-lens 8k1k --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml
 ```
 
 **Test all single-node TRT configs on H200 runners:**
diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
index db1734c9f..e2cda146b 100644
--- a/.github/workflows/benchmark-multinode-tmpl.yml
+++ b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -96,6 +96,8 @@ env:
   CONC_LIST: ${{ join(fromJson(inputs.conc-list), ' ') }}
   SPEC_DECODING: ${{ inputs.spec-decoding }}
   DISAGG: ${{ inputs.disagg }}
+  PYTHONDONTWRITEBYTECODE: '1'
+  PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache
 
   PREFILL_NUM_WORKERS: ${{ inputs.prefill-num-worker }}
   PREFILL_TP: ${{ inputs.prefill-tp }}
@@ -142,6 +144,7 @@ jobs:
           token: ${{ secrets.REPO_PAT }}
           fetch-depth: 0
           ref: ${{ inputs.ref || github.ref }}
+          clean: false
 
       - name: Launch multi-node job script
        env:
diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index 16b587657..797505eec 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -54,6 +54,11 @@ on:
         type: boolean
         required: true
         default: false
+      eval-only:
+        description: "Run only evals (skip throughput benchmark)"
+        type: boolean
+        required: false
+        default: false
       random-range-ratio:
         required: false
         type: string
@@ -83,6 +88,9 @@ env:
   SPEC_DECODING: ${{ inputs.spec-decoding }}
   DISAGG: ${{ inputs.disagg }}
   RUN_EVAL: ${{ inputs.run-eval }}
+  EVAL_ONLY: ${{ inputs.eval-only }}
+  PYTHONDONTWRITEBYTECODE: '1'
+  PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache
 
 permissions:
   contents: read
@@ -91,7 +99,7 @@ jobs:
   benchmark:
     runs-on: ${{ inputs.runner }}
     timeout-minutes: 300
-    name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} | disagg-${{ inputs.disagg }} spec-${{ inputs.spec-decoding }} conc-${{ inputs.conc }}${{ inputs.run-eval && ' | eval' || '' }}"
+    name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} | disagg-${{ inputs.disagg }} spec-${{ inputs.spec-decoding }} conc-${{ inputs.conc }}${{ inputs.eval-only && ' | eval-only' || (inputs.run-eval && ' | eval' || '') }}"
     steps:
       - name: Resource cleanup (pre-run)
         run: &resource-cleanup |
@@ -123,13 +131,14 @@
             sleep 5
           done
         fi
-      fi
+      fi
 
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
         with:
           token: ${{ secrets.REPO_PAT }}
           fetch-depth: 0
           ref: ${{ inputs.ref || github.ref }}
+          clean: false
 
       - name: Launch job script
         env:
@@ -145,28 +154,42 @@
           echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV
           bash ./runners/launch_${RUNNER_NAME%%_*}.sh
 
-          FOUND_RESULT_FILE=
-          for i in {1..10}; do
-            if [ -f "$RESULT_FILENAME.json" ]; then
-              FOUND_RESULT_FILE=true
-              break
+
+          if [ "${{ inputs.eval-only }}" = "true" ]; then
+            echo "Eval-only mode: skipping benchmark result file check"
+            # Verify eval produced results
+            if ! ls results*.json 1>/dev/null 2>&1; then
+              echo "Eval-only run failed: no results*.json files found." >&2
+              exit 1
             fi
-            echo "Waiting for result file... (attempt $i)"
-            sleep 1
-          done
+            # Verify eval scores meet per-benchmark minimum thresholds
+            python3 utils/evals/validate_scores.py
+          else
+            FOUND_RESULT_FILE=
+            for i in {1..10}; do
+              if [ -f "$RESULT_FILENAME.json" ]; then
+                FOUND_RESULT_FILE=true
+                break
+              fi
+              echo "Waiting for result file... (attempt $i)"
+              sleep 1
+            done
 
-          if [ -z "$FOUND_RESULT_FILE" ]; then
-            echo "Run failed: Benchmark result $RESULT_FILENAME.json not found." >&2
-            exit 1
+            if [ -z "$FOUND_RESULT_FILE" ]; then
+              echo "Run failed: Benchmark result $RESULT_FILENAME.json not found." >&2
+              exit 1
+            fi
           fi
 
       - name: Process result
+        if: ${{ !inputs.eval-only }}
         env:
           RUNNER_TYPE: ${{ inputs.runner }}
         run: |
           python3 utils/process_result.py
 
      - name: Upload result
+        if: ${{ !inputs.eval-only }}
         uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
         with:
           name: bmk_${{ env.RESULT_FILENAME }}
@@ -176,7 +199,7 @@
         if: always()
         uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
         with:
-          name: server_logs_${{ env.RESULT_FILENAME }}
+          name: ${{ inputs.eval-only && 'eval_server_logs_' || 'server_logs_' }}${{ env.RESULT_FILENAME }}
           path: server.log
           if-no-files-found: ignore
 
@@ -184,12 +207,12 @@
         if: always()
         uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
         with:
-          name: gpu_metrics_${{ env.RESULT_FILENAME }}
+          name: ${{ inputs.eval-only && 'eval_gpu_metrics_' || 'gpu_metrics_' }}${{ env.RESULT_FILENAME }}
           path: gpu_metrics.csv
           if-no-files-found: ignore
 
       - name: Upload eval results (if any)
-        if: ${{ env.RUN_EVAL == 'true' }}
+        if: ${{ always() && (env.RUN_EVAL == 'true' || inputs.eval-only) }}
         uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
         with:
           name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }}
@@ -197,14 +220,15 @@
             meta_env.json
             results*.json
             sample*.jsonl
-          if-no-files-found: ignore
+          if-no-files-found: ${{ inputs.eval-only && 'error' || 'ignore' }}
 
       - name: Cleanup eval outputs (post-upload)
-        if: ${{ env.RUN_EVAL == 'true' }}
+        if: ${{ always() && (env.RUN_EVAL == 'true' || inputs.eval-only) }}
         run: |
           rm -f meta_env.json || true
           # Remove any eval results JSONs that were moved into workspace
           rm -f results*.json || true
+          rm -f sample*.jsonl || true
 
       - name: Resource cleanup (post-run)
         if: always()
diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml
index 1be4b1b98..b5b474471 100644
--- a/.github/workflows/claude.yml
+++ b/.github/workflows/claude.yml
@@ -97,7 +97,7 @@ jobs:
           The `generate-cli-command` input accepts arguments for `generate_sweep_configs.py`. Usage: `generate_sweep_configs.py` `[-h]` `{full-sweep,runner-model-sweep,test-config}`
 
           **Subcommand reference:**
-          - `full-sweep`: Use this subcommand with filter flags like `--model-prefix`, `--framework`, `--precision`, `--runner-type`, `--min-conc`, `--max-conc`, `--seq-len`. This is the primary subcommand for running benchmarks.
+          - `full-sweep`: Use this subcommand with filter flags like `--model-prefix`, `--framework`, `--precision`, `--runner-type`, `--min-conc`, `--max-conc`, `--seq-lens`. This is the primary subcommand for running benchmarks.
           - `test-config`: Use this subcommand ONLY when prompted to with 'test-config'. Uses the flags `--config-files` and `--config-keys`, does NOT accept any other arguments.
Examples: @@ -119,7 +119,7 @@ jobs: **Specify concurrency and sequence length:** ``` - generate-cli-command: "full-sweep --config-files .github/configs/nvidia-master.yaml --single-node --model-prefix dsr1 --min-conc 4 --max-conc 4 --seq-len 1k1k" + generate-cli-command: "full-sweep --config-files .github/configs/nvidia-master.yaml --single-node --model-prefix dsr1 --min-conc 4 --max-conc 4 --seq-lens 1k1k" ``` **Test specific config keys (MUST USE `--conc`):** @@ -130,7 +130,7 @@ jobs: **IMPORTANT: Keep runs precise and efficient:** - Use `full-sweep` with filter flags to narrow down the benchmark scope - "full-sweep" does NOT mean running everything - When using `full-sweep`, you must use `--min-conc` and `--max-conc` together to specify a single concurrency value. Unless prompted otherwise, use `--min-conc 4 --max-conc 4` - - When using `full-sweep`, you can use `--seq-len` to specify a single sequence length (choices: 1k1k, 1k8k, 8k1k). Unless prompted otherwise, use `--seq-len 1k1k` + - When using `full-sweep`, you can use `--seq-lens` to specify sequence lengths (choices: 1k1k, 8k1k). Unless prompted otherwise, use `--seq-lens 1k1k` - Use `test-config` ONLY when given specific config keys to test - Use `--config-files`, `--config-keys`, and `--conc` flags ONLY - Always filter by specific models, frameworks, precision, conc, or config keys when possible @@ -291,4 +291,3 @@ jobs: # Then use $EP in the vllm serve command ``` This ensures the script respects the `ep` setting in the master config YAML's search-space. - diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 505bb515d..d6ecf76b0 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -37,6 +37,7 @@ jobs: outputs: single-node-config: ${{ steps.get-jobs.outputs.single-node-config }} multi-node-config: ${{ steps.get-jobs.outputs.multi-node-config }} + eval-config: ${{ steps.get-jobs.outputs.eval-config }} steps: - name: Checkout code (ref) if: ${{ inputs.ref && inputs.ref != '' }} @@ -53,10 +54,12 @@ jobs: pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py \ ${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }}) - SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x]))") + SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and not x.get('run-eval', False)]))") MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x]))") + EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('run-eval', False)]))") echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT echo "multi-node-config=$MULTI" >> $GITHUB_OUTPUT + echo "eval-config=$EVALS" >> $GITHUB_OUTPUT test-sweep-multi-node: needs: get-jobs @@ -123,7 +126,38 @@ jobs: conc: ${{ matrix.config.conc }} spec-decoding: ${{ matrix.config.spec-decoding }} disagg: ${{ matrix.config.disagg }} - run-eval: ${{ matrix.config.run-eval }} + run-eval: false + ref: ${{ inputs.ref }} + + test-sweep-evals: + needs: get-jobs + if: ${{ needs.get-jobs.outputs.eval-config != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + name: eval / + strategy: + fail-fast: false + matrix: + config: ${{ 
fromJson(needs.get-jobs.outputs.eval-config) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + run-eval: true + eval-only: true ref: ${{ inputs.ref }} collect-results: @@ -135,8 +169,8 @@ jobs: result-prefix: "bmk" collect-evals: - needs: [test-sweep-multi-node, test-sweep-single-node] - if: ${{ always() }} + needs: [test-sweep-evals] + if: ${{ always() && needs.test-sweep-evals.result != 'skipped' }} uses: ./.github/workflows/collect-evals.yml secrets: inherit diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml index d72f54b8f..64e4ea531 100644 --- a/.github/workflows/profile.yml +++ b/.github/workflows/profile.yml @@ -35,6 +35,8 @@ env: HF_HUB_CACHE: '/mnt/hf_hub_cache/' RANDOM_RANGE_RATIO: '0.8' PERFETTO_RELAY_URL: https://semianalysisai.github.io/InferenceX-trace-storage + PYTHONDONTWRITEBYTECODE: '1' + PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache jobs: get-jobs: @@ -87,7 +89,7 @@ jobs: - name: Fail if no matching entries if: ${{ steps.filter.outputs.count == '0' }} run: | - echo "No entries produced for config-key=${{ inputs.config-key }}, seq-lens=${{ inputs.seq-lens }}, conc=${{ inputs.conc }}." >&2 + echo "No entries produced for config-key=${{ inputs.config-key }}, conc=${{ inputs.conc }}." 
>&2 exit 1 profile: @@ -153,6 +155,7 @@ jobs: with: fetch-depth: 0 ref: ${{ inputs.ref || github.ref }} + clean: false - name: Launch + Profile (single-node sglang/vllm) id: run diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index 683056747..4d61a918c 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -106,18 +106,6 @@ jobs: decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} - sweep-multi-node-1k8k: - needs: setup - if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['1k8k']) != 'null' }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: multi-node 1k8k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.setup.outputs.search-space-config).multi_node['1k8k'] }} - secrets: inherit - with: *multi-node-inputs - sweep-multi-node-8k1k: needs: setup if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['8k1k']) != 'null' }} @@ -159,38 +147,54 @@ jobs: disagg: ${{ matrix.config.disagg }} run-eval: ${{ matrix.config.run-eval }} - sweep-single-node-1k8k: + sweep-single-node-8k1k: needs: setup - if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).single_node['1k8k']) != 'null' }} + if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).single_node['8k1k']) != 'null' }} uses: ./.github/workflows/benchmark-tmpl.yml - name: single-node 1k8k / + name: single-node 8k1k / strategy: fail-fast: false matrix: - config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['1k8k'] }} + config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['8k1k'] }} secrets: inherit with: *single-node-inputs - sweep-single-node-8k1k: + sweep-evals: needs: setup - if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).single_node['8k1k']) != 'null' }} + if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).evals) != '[]' && toJson(fromJson(needs.setup.outputs.search-space-config).evals) != 'null' }} uses: ./.github/workflows/benchmark-tmpl.yml - name: single-node 8k1k / + name: eval / strategy: fail-fast: false matrix: - config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['8k1k'] }} + config: ${{ fromJson(needs.setup.outputs.search-space-config).evals }} secrets: inherit - with: *single-node-inputs + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + run-eval: true + eval-only: true collect-results: needs: [ sweep-single-node-1k1k, - sweep-single-node-1k8k, sweep-single-node-8k1k, sweep-multi-node-1k1k, - sweep-multi-node-1k8k, sweep-multi-node-8k1k, setup, ] @@ -201,17 +205,8 @@ jobs: result-prefix: "bmk" collect-evals: - needs: - [ - sweep-single-node-1k1k, - sweep-single-node-1k8k, - sweep-single-node-8k1k, - sweep-multi-node-1k1k, - sweep-multi-node-1k8k, - sweep-multi-node-8k1k, - setup, - ] - if: ${{ always() && 
needs.setup.result != 'skipped' }} + needs: [sweep-evals, setup] + if: ${{ always() && needs.setup.result != 'skipped' && needs.sweep-evals.result != 'skipped' }} uses: ./.github/workflows/collect-evals.yml secrets: inherit @@ -221,10 +216,12 @@ jobs: runs-on: ubuntu-latest steps: - name: Extract and save changelog metadata - env: - CONFIG_JSON: ${{ needs.setup.outputs.search-space-config }} run: | - echo "$CONFIG_JSON" | jq '.changelog_metadata' > changelog_metadata.json + cat <<'CONFIGEOF' > _full_config.json + ${{ needs.setup.outputs.search-space-config }} + CONFIGEOF + jq '.changelog_metadata' _full_config.json > changelog_metadata.json + rm -f _full_config.json - name: Upload changelog artifact uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 diff --git a/AGENTS.md b/AGENTS.md index 6bb4a86c8..94c28e334 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -75,27 +75,27 @@ python -m pytest matrix_logic/ -v ```bash # Full sweep with all configs python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml + --config-files .github/configs/nvidia-master.yaml # Filter by model prefix (dsr1 or gptoss) python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ - --model dsr1 + --config-files .github/configs/nvidia-master.yaml \ + --model-prefix dsr1 # Filter by framework (sglang, trt, vllm, atom, dynamo-trt, dynamo-sglang) python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ + --config-files .github/configs/nvidia-master.yaml \ --framework sglang # Filter by precision (fp4, fp8) python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ + --config-files .github/configs/nvidia-master.yaml \ --precision fp8 # Filter by runner type (b200, h100, h200, gb200, mi300x, mi325x, mi355x) python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ - --runner b200 + --config-files .github/configs/nvidia-master.yaml \ + --runner-type b200 ``` ### Processing Results @@ -140,7 +140,6 @@ When working with benchmark configurations, use these valid values: **Sequence Lengths (ISL/OSL)**: - `1k1k` - 1024 input / 1024 output -- `1k8k` - 1024 input / 8192 output - `8k1k` - 8192 input / 1024 output ## Code Conventions @@ -266,7 +265,7 @@ dsr1-fp8-h200-dynamo-sglang: **7. Validate configuration:** ```bash python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ + --config-files .github/configs/nvidia-master.yaml \ --framework dynamo-sglang ``` @@ -296,18 +295,18 @@ When upgrading Docker images in benchmark scripts and master configs .yaml: ## Evals (Accuracy Validation) -Evals run optional accuracy checks after throughput benchmarks to ensure model outputs aren't degraded by inference optimizations. +Evals run optional accuracy checks to ensure model outputs aren't degraded by inference optimizations. They can run alongside benchmarks or independently in eval-only mode. ### When Evals Run -Evals are **off by default** (`RUN_EVAL=false`). When enabled, they run for two representative points per configuration group: +Evals are **off by default** (`RUN_EVAL=false`). 
When enabled, they run at two concurrency levels per configuration group: -- **Lowest TP with highest concurrency** per (model, runner, framework, precision, ISL, OSL, spec-decoding) -- **Highest TP with highest concurrency** per (model, runner, framework, precision, ISL, OSL, spec-decoding) +- **Highest concurrency** per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn) +- **Lower-median concurrency** per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn) This selection logic is in `mark_eval_entries()` in `utils/matrix_logic/generate_sweep_configs.py`. -**Note**: Evals only run on `1k8k` sequence length. +**Note**: Evals only run on `8k1k` sequence length. ### Eval Framework: lm-eval @@ -316,30 +315,42 @@ The default eval framework is [lm-evaluation-harness](https://github.com/Eleuthe ### Running Evals via CLI ```bash -# Generate configs with evals marked (in addition to all configs) +# Generate configs (evals marked by default on 8k1k subset) python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ - --run-evals + --config-files .github/configs/nvidia-master.yaml + +# Generate throughput-only configs (skip evals) +python utils/matrix_logic/generate_sweep_configs.py full-sweep \ + --config-files .github/configs/nvidia-master.yaml \ + --no-evals # Generate ONLY the eval subset (excludes non-eval configs) python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ + --config-files .github/configs/nvidia-master.yaml \ --evals-only ``` ### Eval Integration in Benchmark Scripts -All benchmark scripts in `benchmarks/` follow this pattern: +All benchmark scripts in `benchmarks/` follow one of two flows: ```bash +# Combined mode (benchmark + eval): # 1. Start server # 2. wait_for_server_ready # 3. run_benchmark_serving (throughput) # 4. Conditionally run evals: if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC - append_lm_eval_summary # Writes meta_env.json and moves artifacts + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary fi + +# Eval-only mode (EVAL_ONLY=true): +# 1. Compute expanded context via compute_eval_context_length +# 2. Start server with expanded context (--context-length or --max-model-len) +# 3. wait_for_server_ready +# 4. run_benchmark_serving returns immediately (skipped) +# 5. 
run_eval + append_lm_eval_summary ``` ### Key Eval Functions in `benchmarks/benchmark_lib.sh` @@ -351,6 +362,8 @@ fi | `append_lm_eval_summary` | Writes `meta_env.json` and moves eval artifacts to workspace | | `_install_lm_eval_deps` | Installs lm-eval dependencies | | `_patch_lm_eval` | Patches lm-eval for reasoning tokens and TRT compatibility | +| `compute_eval_context_length` | Computes eval context length (5x benchmark context, capped at model native max) | +| `get_native_max_context_length` | Extracts model's native max context length from HF config | ### Eval Results Collection @@ -390,16 +403,18 @@ cat ./evals/agg_eval_all.json | jq '[.[] | select(.hw == "B200")]' | Variable | Default | Description | |----------|---------|-------------| -| `RUN_EVAL` | `false` | Enable eval after throughput | +| `RUN_EVAL` | `false` | Enable eval after throughput benchmark | +| `EVAL_ONLY` | `false` | Skip throughput, only run evals (set by workflow) | | `EVAL_FRAMEWORK` | `lm-eval` | Eval framework to use | -| `EVAL_TASK` | `gsm8k` | Task definition file (without `.yaml`) | -| `NUM_FEWSHOT` | `2` | Number of few-shot examples | +| `EVAL_TASKS_DIR` | `utils/evals/gsm8k.yaml` | Path to lm-eval task YAML | | `EVAL_RESULT_DIR` | `/tmp/eval_out-*` | Output directory for eval results | +| `EVAL_MAX_MODEL_LEN` | `16384` | Max context for eval (set by `compute_eval_context_length`) | +| `EVAL_CONCURRENT_REQUESTS` | `64` | Concurrent requests during eval | ### Adding a New Eval Task 1. Create a task YAML in `utils/evals/` (follow lm-eval task format) -2. Set `EVAL_TASK=<task-name>` when running benchmarks +2. Set `EVAL_TASKS_DIR=utils/evals/<task-name>.yaml` when running benchmarks 3. Update `utils/collect_eval_results.py` if new metrics need extraction ### lm-eval Patches diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index f69d3c418..535313252 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -2,6 +2,13 @@ # Shared benchmarking utilities for InferenceMAX +# Keep Python bytecode out of the mounted workspace. Benchmark jobs often run as +# root inside containers, and root-owned cache directories break future checkout +# cleanup on self-hosted runners. +export PYTHONDONTWRITEBYTECODE=1 +export PYTHONPYCACHEPREFIX="${PYTHONPYCACHEPREFIX:-/tmp/inferencex-pycache}" +mkdir -p "$PYTHONPYCACHEPREFIX" 2>/dev/null || true + # -------------------------------- # GPU monitoring helpers # -------------------------------- @@ -174,6 +181,12 @@ wait_for_server_ready() { # --trust-remote-code: Optional flag to trust remote code from HuggingFace # --server-pid: Optional server process ID to monitor during benchmark run_benchmark_serving() { + # In eval-only mode, skip the throughput benchmark entirely. + if [ "${EVAL_ONLY}" = "true" ]; then + echo "EVAL_ONLY mode: skipping throughput benchmark" + return 0 + fi + set +x local model="" local port="" @@ -486,6 +499,10 @@ move_profile_trace_for_relay() { # ------------------------------ _install_lm_eval_deps() { + # torchvision causes circular imports in ATOM; TRT-LLM/SGLang need it at module level.
+ if [[ "${IMAGE:-}" == *atom* ]]; then + python3 -m pip uninstall -y torchvision 2>/dev/null || true + fi python3 -m pip install -q --no-cache-dir --break-system-packages "lm-eval[api]" || true local lm_eval_ref="b315ef3b05176acc9732bb7fdec116abe1ecc476" if command -v git >/dev/null 2>&1; then @@ -574,26 +591,74 @@ PY export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}" } +get_native_max_context_length() { + local model_path="$1" + python3 -c " +from transformers import AutoConfig +config = AutoConfig.from_pretrained('${model_path}', trust_remote_code=True) +for attr in ['max_position_embeddings', 'max_sequence_length', 'seq_length', 'n_positions']: + if hasattr(config, attr): + print(getattr(config, attr)) + break +else: + print(0) +" +} + +# Compute the context length for eval-only mode. +# Uses 5x the benchmark context capped at the model's native max. +# Sets EVAL_MAX_MODEL_LEN (needed by run_lm_eval). +# Echoes the computed value for scripts to capture. +# +# Usage: local ctx=$(compute_eval_context_length "$MODEL" "${current_ctx}") +compute_eval_context_length() { + local model="$1" + local benchmark_ctx="${2:-0}" + local native_max + native_max=$(get_native_max_context_length "$model") + native_max="${native_max:-0}" + + if [ "$benchmark_ctx" -eq 0 ] 2>/dev/null; then + benchmark_ctx="${native_max:-0}" + fi + local eval_ctx=$(( benchmark_ctx * 1 )) + if [ "$native_max" -gt 0 ] 2>/dev/null && [ "$eval_ctx" -gt "$native_max" ]; then + eval_ctx="$native_max" + fi + # If eval_ctx is still 0 (both benchmark_ctx and native_max were 0), fall back + if [ "$eval_ctx" -le 0 ] 2>/dev/null; then + echo "WARN: compute_eval_context_length could not determine context length for $model" >&2 + eval_ctx="${MAX_MODEL_LEN:-16384}" + fi + EVAL_MAX_MODEL_LEN="$eval_ctx" + echo "$eval_ctx" +} + +# Convenience wrapper: compute eval context from ISL/OSL and export EVAL_MAX_MODEL_LEN. +# Call directly (not in a subshell) so the export persists. +# Scripts then wire $EVAL_MAX_MODEL_LEN into whichever server variable they need. +setup_eval_context() { + EVAL_MAX_MODEL_LEN=$(compute_eval_context_length "$MODEL" "$((ISL + OSL + 200))") + export EVAL_MAX_MODEL_LEN +} + run_lm_eval() { local port="${PORT:-8888}" - local task="${EVAL_TASK:-gsm8k}" - local num_fewshot="${NUM_FEWSHOT:-2}" + local tasks_dir="${EVAL_TASKS_DIR:-utils/evals/gsm8k.yaml}" local results_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}" - local gen_max_tokens=16384 + local eval_context_len="${EVAL_MAX_MODEL_LEN:-16384}" local temperature=0 local top_p=1 - local concurrent_requests=32 + local concurrent_requests="${EVAL_CONCURRENT_REQUESTS:-64}" while [[ $# -gt 0 ]]; do case $1 in --port) port="$2"; shift 2 ;; - --task) task="$2"; shift 2 ;; - --num-fewshot) num_fewshot="$2"; shift 2 ;; + --task) tasks_dir="$2"; shift 2 ;; --results-dir) results_dir="$2"; shift 2 ;; - --gen-max-tokens) gen_max_tokens="$2"; shift 2 ;; + --gen-max-tokens) eval_context_len="$2"; shift 2 ;; --temperature) temperature="$2"; shift 2 ;; --top-p) top_p="$2"; shift 2 ;; - --concurrent-requests) concurrent_requests="$2"; shift 2 ;; *) echo "Unknown parameter: $1"; return 1 ;; esac done @@ -606,16 +671,23 @@ run_lm_eval() { export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY} MODEL_NAME=${MODEL_NAME:-$MODEL} # Prefer MODEL_NAME, else MODEL + # Cap output tokens: must fit within context window (leave room for input), + # and avoid excessive KV cache reservation per request on TRT. + local max_output_tokens=$(( eval_context_len > 4096 ? 
eval_context_len - 4096 : eval_context_len / 2 )) + if [ "$max_output_tokens" -gt 16384 ]; then + max_output_tokens=16384 + fi + echo "Eval budget: eval_context_len=${eval_context_len}, max_output_tokens=${max_output_tokens}" + # Export for append_lm_eval_summary to pick up export EVAL_RESULT_DIR="$results_dir" set -x python3 -m lm_eval --model local-chat-completions --apply_chat_template \ - --tasks "utils/evals/${task}.yaml" \ - --num_fewshot "${num_fewshot}" \ + --tasks "${tasks_dir}" \ --output_path "${results_dir}" \ --log_samples \ - --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=5,num_concurrent=${concurrent_requests},timeout=600,tokenized_requests=False,max_length=${gen_max_tokens}" \ - --gen_kwargs "max_tokens=8192,temperature=${temperature},top_p=${top_p}" + --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=5,num_concurrent=${concurrent_requests},timeout=1800,tokenized_requests=False,max_length=${eval_context_len}" \ + --gen_kwargs "max_tokens=${max_output_tokens},temperature=${temperature},top_p=${top_p}" local eval_exit=$? set +x return $eval_exit @@ -623,8 +695,15 @@ run_lm_eval() { append_lm_eval_summary() { local results_dir="${EVAL_RESULT_DIR}" + if [ -z "${results_dir}" ]; then + echo "WARN: EVAL_RESULT_DIR is empty; skipping artifact collection" >&2 + return 1 + fi local out_dir="${results_dir}" - mkdir -p "$out_dir" || true + if [ ! -d "${out_dir}" ]; then + echo "WARN: EVAL_RESULT_DIR='${out_dir}' does not exist; skipping artifact collection" >&2 + return 1 + fi # Write minimal meta for collectors that expect it local meta_json="${out_dir}/meta_env.json" @@ -672,13 +751,13 @@ META # Move eval artifacts into PWD (no new directories in workspace) if [ -f "${meta_json}" ]; then - mv -f "${meta_json}" ./ || true + mv -f "${meta_json}" ./ || echo "WARN: failed to move ${meta_json}" >&2 fi if [ -d "${out_dir}" ]; then while IFS= read -r -d '' jf; do base=$(basename "$jf") if [ "$base" != "meta_env.json" ]; then - mv -f "$jf" ./ || true + mv -f "$jf" ./ || echo "WARN: failed to move ${jf}" >&2 fi done < <(find "${out_dir}" -type f -name "*.json*" -print0 2>/dev/null) fi @@ -706,8 +785,23 @@ run_eval() { esac done + # Compute EVAL_MAX_MODEL_LEN if not already set by the calling script + if [ -z "${EVAL_MAX_MODEL_LEN:-}" ]; then + compute_eval_context_length "$MODEL" "${MAX_MODEL_LEN:-0}" > /dev/null + fi + + local eval_rc=0 case "$framework" in - lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" ;; - *) echo "Unknown framework '${framework}'"; return 1 ;; + lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" || eval_rc=$? 
;; + *) echo "Unknown framework '${framework}'"; eval_rc=1 ;; esac + + if [ "$eval_rc" -ne 0 ]; then + echo "ERROR: run_eval failed with exit code $eval_rc" >&2 + if [ "${EVAL_ONLY}" = "true" ]; then + echo "Eval-only mode: failing after artifact collection" >&2 + return "$eval_rc" + fi + fi + return $eval_rc } diff --git a/benchmarks/single_node/dsr1_fp4_b200.sh b/benchmarks/single_node/dsr1_fp4_b200.sh index d98fb8e2b..d88941628 100644 --- a/benchmarks/single_node/dsr1_fp4_b200.sh +++ b/benchmarks/single_node/dsr1_fp4_b200.sh @@ -31,6 +31,11 @@ else fi echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -40,7 +45,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0. --cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \ --chunked-prefill-size 16384 \ --ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ ---enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 > $SERVER_LOG 2>&1 & +--enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -63,7 +68,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp4_b200_trt.sh b/benchmarks/single_node/dsr1_fp4_b200_trt.sh index 036c2998e..7a9706d30 100644 --- a/benchmarks/single_node/dsr1_fp4_b200_trt.sh +++ b/benchmarks/single_node/dsr1_fp4_b200_trt.sh @@ -77,6 +77,12 @@ MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 )) MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? 
MAX_NUM_TOKENS : 8192 )) +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi + if [[ "$PIECEWISE_CUDA_GRAPHS" == "true" ]]; then # [2^i for i in range(8)] + [i for i in range(256, max_num_tokens, 256)] + [max_num_tokens] capture_tokens=(1 2 4 8 16 32 64 128) @@ -120,7 +126,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp4_b200_trt_mtp.sh b/benchmarks/single_node/dsr1_fp4_b200_trt_mtp.sh index 2a0320e53..59e5a3930 100644 --- a/benchmarks/single_node/dsr1_fp4_b200_trt_mtp.sh +++ b/benchmarks/single_node/dsr1_fp4_b200_trt_mtp.sh @@ -76,10 +76,6 @@ if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then elif [[ $CONC == 128 && $DP_ATTENTION == "false" ]]; then PIECEWISE_CUDA_GRAPHS="true" fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC == 64 ]]; then - PIECEWISE_CUDA_GRAPHS="true" - fi fi if [[ "$PIECEWISE_CUDA_GRAPHS" == "true" ]]; then @@ -101,6 +97,12 @@ fi # end of set of configs using piecewise_cuda_graphs # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi + set -x # Launch TRT-LLM server mpirun -n 1 --oversubscribe --allow-run-as-root \ @@ -134,7 +136,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp4_mi355x.sh b/benchmarks/single_node/dsr1_fp4_mi355x.sh index 58c1118eb..578a6c810 100644 --- a/benchmarks/single_node/dsr1_fp4_mi355x.sh +++ b/benchmarks/single_node/dsr1_fp4_mi355x.sh @@ -30,6 +30,11 @@ fi SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -44,7 +49,7 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --max-prefill-tokens=$PREFILL_SIZE \ --cuda-graph-max-bs=128 \ --attention-backend aiter \ ---kv-cache-dtype fp8_e4m3 > $SERVER_LOG 2>&1 & +--kv-cache-dtype fp8_e4m3 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -65,7 +70,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp4_mi355x_atom.sh b/benchmarks/single_node/dsr1_fp4_mi355x_atom.sh index 08f579244..31554fc22 100644 --- a/benchmarks/single_node/dsr1_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/dsr1_fp4_mi355x_atom.sh @@ -31,6 +31,11 @@ else CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 " fi +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN=" --max-model-len $EVAL_MAX_MODEL_LEN " +fi + if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" else @@ -69,7 +74,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp4_mi355x_atom_mtp.sh b/benchmarks/single_node/dsr1_fp4_mi355x_atom_mtp.sh index af1ab6aa4..1d557684e 100644 --- a/benchmarks/single_node/dsr1_fp4_mi355x_atom_mtp.sh +++ b/benchmarks/single_node/dsr1_fp4_mi355x_atom_mtp.sh @@ -31,6 +31,11 @@ else CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 " fi +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN=" --max-model-len $EVAL_MAX_MODEL_LEN " +fi + if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" else @@ -72,7 +77,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_b200.sh b/benchmarks/single_node/dsr1_fp8_b200.sh index 7b4be6b2b..e6d8a0e9c 100644 --- a/benchmarks/single_node/dsr1_fp8_b200.sh +++ b/benchmarks/single_node/dsr1_fp8_b200.sh @@ -63,6 +63,11 @@ else fi echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -72,7 +77,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \ --mem-fraction-static $MEM_FRAC_STATIC --kv-cache-dtype fp8_e4m3 --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \ --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL --disable-radix-cache \ ---attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8 > $SERVER_LOG 2>&1 & +--attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
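For reference, the context expansion that `setup_eval_context` wires into `EVAL_CONTEXT_ARGS` above reduces to a few lines of arithmetic. A minimal Python sketch, assuming the behavior documented for `compute_eval_context_length` (5x the benchmark context, capped at the model's native window, with a 16384 fallback when neither is known); the bash in `benchmarks/benchmark_lib.sh` remains authoritative:

```python
# Sketch of the documented compute_eval_context_length rule (illustrative only).
def eval_context_length(benchmark_ctx: int, native_max: int, fallback: int = 16384) -> int:
    if benchmark_ctx <= 0:        # no benchmark context known: start from the native window
        benchmark_ctx = native_max
    eval_ctx = benchmark_ctx * 5  # expand so eval generations are not truncated
    if native_max > 0:
        eval_ctx = min(eval_ctx, native_max)  # never exceed the model's native window
    return eval_ctx if eval_ctx > 0 else fallback

# e.g. an 8k1k benchmark: ISL + OSL + 200 = 8192 + 1024 + 200 = 9416
assert eval_context_length(9416, 163840) == 47080  # native max here is illustrative
```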
@@ -95,7 +100,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_b200_mtp.sh b/benchmarks/single_node/dsr1_fp8_b200_mtp.sh index b5e499ecc..781869bcc 100755 --- a/benchmarks/single_node/dsr1_fp8_b200_mtp.sh +++ b/benchmarks/single_node/dsr1_fp8_b200_mtp.sh @@ -56,6 +56,11 @@ SPECULATIVE_EAGLE_TOPK=1 SGLANG_ENABLE_SPEC_V2=1 +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -85,7 +90,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ --speculative-num-steps $SPECULATIVE_NUM_STEPS \ --speculative-num-draft-tokens $SPECULATIVE_DRAFT_TOKENS \ --speculative-eagle-topk $SPECULATIVE_EAGLE_TOPK \ - > $SERVER_LOG 2>&1 & + $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -109,7 +114,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_b200_trt.sh b/benchmarks/single_node/dsr1_fp8_b200_trt.sh index 8df439973..139aae669 100644 --- a/benchmarks/single_node/dsr1_fp8_b200_trt.sh +++ b/benchmarks/single_node/dsr1_fp8_b200_trt.sh @@ -37,14 +37,6 @@ if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then PIECEWISE_CUDA_GRAPHS="true" DELAY_BATCHING="true" fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -ge 256 ]]; then - CUDA_GRAPH_MAX_BATCH_SIZE=$(( $CONC / 8 )) - MOE_BACKEND="DEEPGEMM" - KV_CACHE_FREE_MEM_FRACTION=0.7 - elif [[ $CONC -ge 128 ]]; then - PIECEWISE_CUDA_GRAPHS="true" - fi elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then if [[ $CONC -ge 64 ]]; then PIECEWISE_CUDA_GRAPHS="true" @@ -100,6 +92,12 @@ MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 )) MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? 
MAX_NUM_TOKENS : 8192 )) +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi + if [[ "$PIECEWISE_CUDA_GRAPHS" == "true" ]]; then # [2^i for i in range(8)] + [i for i in range(256, max_num_tokens, 256)] + [max_num_tokens] capture_tokens=(1 2 4 8 16 32 64 128) @@ -146,7 +144,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_b200_trt_mtp.sh b/benchmarks/single_node/dsr1_fp8_b200_trt_mtp.sh index c60388848..79f84f8a1 100644 --- a/benchmarks/single_node/dsr1_fp8_b200_trt_mtp.sh +++ b/benchmarks/single_node/dsr1_fp8_b200_trt_mtp.sh @@ -45,10 +45,6 @@ if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then if [[ $CONC -le 4 ]]; then PIECEWISE_CUDA_GRAPHS="false" fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -le 8 ]]; then - PIECEWISE_CUDA_GRAPHS="false" - fi elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then if [[ $CONC -le 16 ]]; then PIECEWISE_CUDA_GRAPHS="false" @@ -89,7 +85,15 @@ attention_dp_config: EOF fi +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + MAX_NUM_TOKENS=$(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 )) +if [ "${EVAL_ONLY}" = "true" ]; then + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi # prep PW CUDA config per the documentation if [[ "$PIECEWISE_CUDA_GRAPHS" == "true" ]]; then @@ -104,10 +108,9 @@ if [[ "$PIECEWISE_CUDA_GRAPHS" == "true" ]]; then cat << EOF >> $EXTRA_CONFIG_FILE torch_compile_config: capture_num_tokens: [${CAPTURE_TOKENS_LIST%, }] - enable_piecewise_cuda_graph: true + enable_piecewise_cuda_graph: true EOF fi - # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -144,7 +147,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_h200.sh b/benchmarks/single_node/dsr1_fp8_h200.sh index fde2cfede..c820d180b 100644 --- a/benchmarks/single_node/dsr1_fp8_h200.sh +++ b/benchmarks/single_node/dsr1_fp8_h200.sh @@ -15,7 +15,7 @@ if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi -pip3 install --user sentencepiece +pip3 install --user --break-system-packages sentencepiece hf download "$MODEL" SERVER_LOG=/workspace/server.log @@ -26,6 +26,12 @@ start_gpu_monitor export TORCH_CUDA_ARCH_LIST="9.0" +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi + set -x if [[ $ISL -eq 1024 && $OSL -eq 1024 ]]; then PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL \ @@ -35,7 +41,7 @@ if [[ $ISL -eq 1024 && $OSL -eq 1024 ]]; then --chunked-prefill-size 32768 --max-prefill-tokens 32768 --mem-fraction-static 0.82 \ --attention-backend flashinfer --stream-interval 10 \ --decode-log-interval 1 \ - > $SERVER_LOG 2>&1 & + $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & else PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL \ --host 0.0.0.0 --port $PORT --trust-remote-code \ @@ -44,7 +50,7 @@ else --chunked-prefill-size 
32768 --max-prefill-tokens 32768 --mem-fraction-static 0.82 \ --attention-backend flashinfer --stream-interval 10 \ --decode-log-interval 1 \ - > $SERVER_LOG 2>&1 & + $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & fi SERVER_PID=$! @@ -66,7 +72,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_h200_trt.sh b/benchmarks/single_node/dsr1_fp8_h200_trt.sh index 5d98aa75e..383b86065 100644 --- a/benchmarks/single_node/dsr1_fp8_h200_trt.sh +++ b/benchmarks/single_node/dsr1_fp8_h200_trt.sh @@ -64,6 +64,12 @@ MAX_NUM_TOKENS=$(( (CONC + ISL + 64 + 63) / 64 * 64 )) MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 )) +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi + # Launch TRT-LLM server PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ trtllm-serve $MODEL --port=$PORT \ @@ -94,7 +100,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_h200_trt_mtp.sh b/benchmarks/single_node/dsr1_fp8_h200_trt_mtp.sh index 0ecd48f02..9d0010903 100644 --- a/benchmarks/single_node/dsr1_fp8_h200_trt_mtp.sh +++ b/benchmarks/single_node/dsr1_fp8_h200_trt_mtp.sh @@ -80,6 +80,11 @@ fi MAX_NUM_TOKENS=$(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 )) +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -116,7 +121,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_mi300x.sh b/benchmarks/single_node/dsr1_fp8_mi300x.sh index 41731427e..a5f161960 100644 --- a/benchmarks/single_node/dsr1_fp8_mi300x.sh +++ b/benchmarks/single_node/dsr1_fp8_mi300x.sh @@ -36,6 +36,11 @@ export SGLANG_AITER_MLA_PERSIST=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -50,7 +55,7 @@ python3 -m sglang.launch_server \ --max-prefill-tokens=131072 \ --kv-cache-dtype fp8_e4m3 \ --attention-backend aiter \ ---disable-radix-cache > $SERVER_LOG 2>&1 & +--disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
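The output-token budget that `run_lm_eval` derives from `EVAL_MAX_MODEL_LEN` (the `max_output_tokens` arithmetic in `benchmark_lib.sh` above) is easiest to see with concrete numbers; the same expression in Python:

```python
# Same budget rule as run_lm_eval: leave ~4k tokens of headroom for the prompt,
# halve tiny windows, and cap at 16384 to bound per-request KV reservation on TRT.
def max_output_tokens(eval_context_len: int) -> int:
    budget = eval_context_len - 4096 if eval_context_len > 4096 else eval_context_len // 2
    return min(budget, 16384)

assert max_output_tokens(47080) == 16384  # large window hits the hard cap
assert max_output_tokens(10240) == 6144   # 10240 - 4096
assert max_output_tokens(4096) == 2048    # small window: half for output
```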
@@ -71,7 +76,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_mi325x.sh b/benchmarks/single_node/dsr1_fp8_mi325x.sh index 6870fe060..ae1e930f0 100644 --- a/benchmarks/single_node/dsr1_fp8_mi325x.sh +++ b/benchmarks/single_node/dsr1_fp8_mi325x.sh @@ -29,6 +29,12 @@ export SGLANG_AITER_MLA_PERSIST=1 # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi + set -x python3 -m sglang.launch_server \ --model-path=$MODEL --host=0.0.0.0 --port=$PORT --trust-remote-code \ @@ -41,7 +47,7 @@ python3 -m sglang.launch_server \ --kv-cache-dtype fp8_e4m3 \ --attention-backend aiter \ --disable-radix-cache \ -> $SERVER_LOG 2>&1 & +$EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -62,7 +68,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_mi355x.sh b/benchmarks/single_node/dsr1_fp8_mi355x.sh index 1d00957e4..d629437cf 100644 --- a/benchmarks/single_node/dsr1_fp8_mi355x.sh +++ b/benchmarks/single_node/dsr1_fp8_mi355x.sh @@ -27,6 +27,11 @@ export ROCM_QUICK_REDUCE_QUANTIZATION=INT4 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -42,7 +47,7 @@ python3 -m sglang.launch_server \ --num-continuous-decode-steps 4 \ --max-prefill-tokens 196608 \ --kv-cache-dtype fp8_e4m3 \ - --cuda-graph-max-bs "$CONC" > $SERVER_LOG 2>&1 & + --cuda-graph-max-bs "$CONC" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
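The eval-selection rule described in AGENTS.md above (highest plus lower-median concurrency per configuration group) lives in `mark_eval_entries()` in `utils/matrix_logic/generate_sweep_configs.py`. A Python sketch of that rule; the grouping keys and the exact reading of "lower-median" here are assumptions for illustration, not the real implementation:

```python
from collections import defaultdict

# Illustrative sketch only; mark_eval_entries() is authoritative.
GROUP_KEYS = ("model", "runner", "framework", "precision",
              "isl", "osl", "spec-decoding", "dp-attn")

def mark_eval_entries(entries: list[dict]) -> None:
    groups = defaultdict(list)
    for entry in entries:
        groups[tuple(entry.get(k) for k in GROUP_KEYS)].append(entry)
    for group in groups.values():
        concs = sorted({e["conc"] for e in group})
        picks = {concs[-1], concs[(len(concs) - 1) // 2]}  # highest + lower-median
        for e in group:
            e["run-eval"] = e["conc"] in picks
```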
@@ -63,7 +68,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_mi355x_atom.sh b/benchmarks/single_node/dsr1_fp8_mi355x_atom.sh index 08f579244..31554fc22 100644 --- a/benchmarks/single_node/dsr1_fp8_mi355x_atom.sh +++ b/benchmarks/single_node/dsr1_fp8_mi355x_atom.sh @@ -31,6 +31,11 @@ else CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 " fi +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN=" --max-model-len $EVAL_MAX_MODEL_LEN " +fi + if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" else @@ -69,7 +74,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_mi355x_atom_mtp.sh b/benchmarks/single_node/dsr1_fp8_mi355x_atom_mtp.sh index dfb8fafdc..86381bc52 100644 --- a/benchmarks/single_node/dsr1_fp8_mi355x_atom_mtp.sh +++ b/benchmarks/single_node/dsr1_fp8_mi355x_atom_mtp.sh @@ -31,6 +31,11 @@ else CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 " fi +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN=" --max-model-len $EVAL_MAX_MODEL_LEN " +fi + if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" else @@ -71,7 +76,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/glm5_fp8_b200.sh b/benchmarks/single_node/glm5_fp8_b200.sh index 5d09645c8..4ca4a215d 100755 --- a/benchmarks/single_node/glm5_fp8_b200.sh +++ b/benchmarks/single_node/glm5_fp8_b200.sh @@ -30,6 +30,11 @@ PORT=${PORT:-8888} echo "EP_SIZE: $EP_SIZE, CONC: $CONC, ISL: $ISL, OSL: $OSL" +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -49,7 +54,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. --chunked-prefill-size 32768 --max-prefill-tokens 32768 \ --enable-flashinfer-allreduce-fusion --disable-radix-cache \ --stream-interval 30 \ ---model-loader-extra-config '{"enable_multithread_load": true}' > $SERVER_LOG 2>&1 & +--model-loader-extra-config '{"enable_multithread_load": true}' $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
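The three inline `python3 -c` filters in the `e2e-tests.yml` `get-jobs` step above partition one generated config list into single-node, multi-node, and eval-only matrices. Expanded into readable form (functionally the same predicates; the one-liners in the workflow are authoritative):

```python
import json, sys

# Same partition as the get-jobs step: multi-node entries carry a "prefill"
# block; eval entries are single-node entries the generator marked run-eval.
configs = json.load(sys.stdin)

single = [c for c in configs if "prefill" not in c and not c.get("run-eval", False)]
multi  = [c for c in configs if "prefill" in c]
evals  = [c for c in configs if "prefill" not in c and c.get("run-eval", False)]

json.dump({"single-node-config": single,
           "multi-node-config": multi,
           "eval-config": evals}, sys.stdout)
```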
@@ -72,7 +77,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/glm5_fp8_h200.sh b/benchmarks/single_node/glm5_fp8_h200.sh index 9194bb870..7a985645f 100644 --- a/benchmarks/single_node/glm5_fp8_h200.sh +++ b/benchmarks/single_node/glm5_fp8_h200.sh @@ -22,6 +22,12 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi + # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -36,7 +42,7 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.85 \ --served-model-name glm-5-fp8 \ --trust-remote-code \ - > "$SERVER_LOG" 2>&1 & + $EVAL_CONTEXT_ARGS > "$SERVER_LOG" 2>&1 & SERVER_PID=$! @@ -60,7 +66,7 @@ run_benchmark_serving \ # Server accepts glm-5-fp8 (--served-model-name); lm-eval must use that model name if [ "${RUN_EVAL}" = "true" ]; then export MODEL_NAME=glm-5-fp8 - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/glm5_fp8_mi355x.sh b/benchmarks/single_node/glm5_fp8_mi355x.sh index ee11463ce..3d82fd856 100755 --- a/benchmarks/single_node/glm5_fp8_mi355x.sh +++ b/benchmarks/single_node/glm5_fp8_mi355x.sh @@ -30,6 +30,11 @@ export SAFETENSORS_FAST_GPU=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -44,7 +49,7 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.85 \ --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \ --nsa-prefill-backend tilelang \ - --nsa-decode-backend tilelang > $SERVER_LOG 2>&1 & + --nsa-decode-backend tilelang $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
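In eval-only mode, `benchmark-tmpl.yml` above gates success on `utils/evals/validate_scores.py`, described as checking each eval score against a per-benchmark minimum. A hypothetical sketch of that shape; the result-file keys and the threshold value below are assumptions for illustration, not the real script:

```python
import glob, json, sys

# Hypothetical threshold table and lm-eval result keys (assumptions only).
MIN_SCORES = {"gsm8k": 0.90}

failed = False
for path in glob.glob("results*.json"):
    with open(path) as f:
        results = json.load(f).get("results", {})
    for task, metrics in results.items():
        floor = MIN_SCORES.get(task)
        score = metrics.get("exact_match,strict-match")
        if floor is not None and score is not None and score < floor:
            print(f"{path}: {task} score {score:.3f} below minimum {floor}", file=sys.stderr)
            failed = True
sys.exit(1 if failed else 0)
```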
@@ -65,7 +70,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/gptoss_fp4_b200.sh b/benchmarks/single_node/gptoss_fp4_b200.sh
index 46fccca6a..f6a6f72e9 100644
--- a/benchmarks/single_node/gptoss_fp4_b200.sh
+++ b/benchmarks/single_node/gptoss_fp4_b200.sh
@@ -26,7 +26,12 @@ if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
 elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then
     CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200))
 else
-    CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240}
+    CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240}
+fi
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
 fi
 
 cat > config.yaml << EOF
@@ -77,7 +82,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/gptoss_fp4_b200_trt.sh b/benchmarks/single_node/gptoss_fp4_b200_trt.sh
index 42fa96a94..c9ba2752c 100644
--- a/benchmarks/single_node/gptoss_fp4_b200_trt.sh
+++ b/benchmarks/single_node/gptoss_fp4_b200_trt.sh
@@ -78,6 +78,12 @@ set -x
 
 MAX_NUM_TOKENS=20000
 
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+    MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN"
+fi
+
 # Launch TRT-LLM server
 mpirun -n 1 --oversubscribe --allow-run-as-root \
     trtllm-serve $MODEL --port=$PORT \
@@ -109,7 +115,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC ))
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/gptoss_fp4_h100.sh b/benchmarks/single_node/gptoss_fp4_h100.sh
index d3831ab06..8d0e773a2 100644
--- a/benchmarks/single_node/gptoss_fp4_h100.sh
+++ b/benchmarks/single_node/gptoss_fp4_h100.sh
@@ -17,11 +17,18 @@ fi
 
 hf download "$MODEL"
 
+MAX_MODEL_LEN=10240
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
+
 cat > config.yaml << EOF
 no-enable-prefix-caching: true
 max-cudagraph-capture-size: 2048
 max-num-batched-tokens: 8192
-max-model-len: 10240
+max-model-len: $MAX_MODEL_LEN
 EOF
 
 export PYTHONNOUSERSITE=1
@@ -60,7 +67,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/gptoss_fp4_h200.sh b/benchmarks/single_node/gptoss_fp4_h200.sh
index fe4aa5d28..2a9359b96 100644
--- a/benchmarks/single_node/gptoss_fp4_h200.sh
+++ b/benchmarks/single_node/gptoss_fp4_h200.sh
@@ -29,7 +29,12 @@ if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
 elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then
     CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200))
 else
-    CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240}
+    CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240}
+fi
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
 fi
 
 # Create config.yaml
@@ -71,7 +76,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/gptoss_fp4_h200_trt.sh b/benchmarks/single_node/gptoss_fp4_h200_trt.sh
index a96b311d8..41dede14b 100644
--- a/benchmarks/single_node/gptoss_fp4_h200_trt.sh
+++ b/benchmarks/single_node/gptoss_fp4_h200_trt.sh
@@ -8,6 +8,7 @@ check_env_vars \
     CONC \
     ISL \
     OSL \
+    MAX_MODEL_LEN \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME \
     DP_ATTENTION \
@@ -48,10 +49,19 @@ print_iter_log: true
 stream_interval: 20
 EOF
 
+MAX_NUM_TOKENS=20000
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+    MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN"
+fi
+
 PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \
 trtllm-serve $MODEL \
 --max_batch_size $CONC \
---max_num_tokens 20000 \
+--max_num_tokens $MAX_NUM_TOKENS \
+--max_seq_len=$MAX_MODEL_LEN \
 --backend pytorch \
 --extra_llm_api_options gptoss-config.yml \
 --ep_size=$EP_SIZE \
@@ -82,7 +92,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/gptoss_fp4_mi300x.sh b/benchmarks/single_node/gptoss_fp4_mi300x.sh
index f71aeb090..56a7823cf 100644
--- a/benchmarks/single_node/gptoss_fp4_mi300x.sh
+++ b/benchmarks/single_node/gptoss_fp4_mi300x.sh
@@ -42,6 +42,10 @@ FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -73,7 +77,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/gptoss_fp4_mi325x.sh b/benchmarks/single_node/gptoss_fp4_mi325x.sh
index f71aeb090..56a7823cf 100644
--- a/benchmarks/single_node/gptoss_fp4_mi325x.sh
+++ b/benchmarks/single_node/gptoss_fp4_mi325x.sh
@@ -42,6 +42,10 @@ FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -73,7 +77,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/gptoss_fp4_mi355x.sh b/benchmarks/single_node/gptoss_fp4_mi355x.sh
index f23949739..37cb358ba 100644
--- a/benchmarks/single_node/gptoss_fp4_mi355x.sh
+++ b/benchmarks/single_node/gptoss_fp4_mi355x.sh
@@ -43,6 +43,10 @@ FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -74,7 +78,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/gptoss_fp4_mi355x_atom.sh b/benchmarks/single_node/gptoss_fp4_mi355x_atom.sh
index cf71cbb3b..76bc87c0c 100644
--- a/benchmarks/single_node/gptoss_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/gptoss_fp4_mi355x_atom.sh
@@ -31,6 +31,11 @@ else
     CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 "
 fi
 
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    CALCULATED_MAX_MODEL_LEN=" --max-model-len $EVAL_MAX_MODEL_LEN "
+fi
+
 if [ "$EP_SIZE" -gt 1 ]; then
     EP=" --enable-expert-parallel"
 else
@@ -70,7 +75,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/kimik2.5_fp4_b200.sh b/benchmarks/single_node/kimik2.5_fp4_b200.sh
index d08e23bb2..4818f246e 100644
--- a/benchmarks/single_node/kimik2.5_fp4_b200.sh
+++ b/benchmarks/single_node/kimik2.5_fp4_b200.sh
@@ -26,6 +26,10 @@ export PYTHONNOUSERSITE=1
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -63,7 +67,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh
index 967003232..c680529e2 100755
--- a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh
+++ b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh
@@ -31,6 +31,11 @@ fi
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
+
 # If the machine runs a MEC FW older than 177, RCCL
 # cannot reclaim some memory.
 # Disable that features to avoid crashes.
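The vLLM- and atom-based scripts above take the second flavor of the same change: rather than appending a flag string, they overwrite the length variable (`MAX_MODEL_LEN` or `CALCULATED_MAX_MODEL_LEN`) before the server command reads it. A condensed sketch, with an illustrative serve line (only the override block is taken from this diff):

```bash
if [ "${EVAL_ONLY}" = "true" ]; then
    setup_eval_context
    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"   # replaces the benchmark-sized value
fi

# Illustrative only; the real scripts pass many more flags.
vllm serve "$MODEL" --tensor-parallel-size "$TP" \
    --max-model-len "$MAX_MODEL_LEN" --port "$PORT"
```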
@@ -93,7 +98,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/kimik2.5_int4_b200.sh b/benchmarks/single_node/kimik2.5_int4_b200.sh
index 6468cc05c..df4c63f6b 100755
--- a/benchmarks/single_node/kimik2.5_int4_b200.sh
+++ b/benchmarks/single_node/kimik2.5_int4_b200.sh
@@ -26,6 +26,10 @@ export VLLM_USE_FLASHINFER_MOE_INT4=1
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -64,7 +68,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/kimik2.5_int4_h200.sh b/benchmarks/single_node/kimik2.5_int4_h200.sh
index 473a1bd73..766fe74a0 100755
--- a/benchmarks/single_node/kimik2.5_int4_h200.sh
+++ b/benchmarks/single_node/kimik2.5_int4_h200.sh
@@ -25,6 +25,11 @@ export PYTHONNOUSERSITE=1
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
+
 # following https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html recipe
 
 # Start GPU monitoring (power, temperature, clocks every second)
@@ -65,7 +70,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/kimik2.5_int4_mi325x.sh b/benchmarks/single_node/kimik2.5_int4_mi325x.sh
index 1a42035a0..a05baddeb 100755
--- a/benchmarks/single_node/kimik2.5_int4_mi325x.sh
+++ b/benchmarks/single_node/kimik2.5_int4_mi325x.sh
@@ -28,6 +28,10 @@ PORT=${PORT:-8888}
 # following AMD andy luo's recipe
 # https://x.com/linluo77/status/2017024513595301985
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -64,7 +68,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/kimik2.5_int4_mi355x.sh b/benchmarks/single_node/kimik2.5_int4_mi355x.sh
index 420f8044a..5e40da700 100755
--- a/benchmarks/single_node/kimik2.5_int4_mi355x.sh
+++ b/benchmarks/single_node/kimik2.5_int4_mi355x.sh
@@ -26,6 +26,10 @@ fi
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -61,7 +65,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_b200.sh b/benchmarks/single_node/minimaxm2.5_fp8_b200.sh
index 3fb39c375..5ea1b8657 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_b200.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_b200.sh
@@ -33,6 +33,10 @@ else
     EP=" "
 fi
 
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -66,7 +70,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h100.sh b/benchmarks/single_node/minimaxm2.5_fp8_h100.sh
index 22fbbab8d..0f024ea9f 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_h100.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_h100.sh
@@ -26,6 +26,11 @@ export PYTHONNOUSERSITE=1
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
+
 if [ "$EP_SIZE" -gt 1 ]; then
     EP=" --enable-expert-parallel"
 else
@@ -66,7 +71,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/minimaxm2.5_fp8_h200.sh
index 03caf30b7..84e73b65c 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_h200.sh
@@ -22,6 +22,11 @@ hf download "$MODEL"
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
+
 if [ "$EP_SIZE" -ge 1 ]; then
     EP=" --enable-expert-parallel"
 else
@@ -60,7 +65,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh
index 0fd12d9ed..d03f57c9b 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh
@@ -28,6 +28,10 @@ export VLLM_ROCM_USE_AITER=1
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -61,7 +65,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh
index 6c9a2ef6b..21abc2e50 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh
@@ -30,6 +30,10 @@ export VLLM_ROCM_USE_AITER=1
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -63,7 +67,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
index 5400ece04..adfb959cf 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
@@ -29,6 +29,11 @@ export VLLM_ROCM_USE_AITER=1
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
+
 if [ "$EP_SIZE" -gt 1 ]; then
     EP=" --enable-expert-parallel"
 else
@@ -68,7 +73,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/qwen3.5_bf16_b200.sh b/benchmarks/single_node/qwen3.5_bf16_b200.sh
index 38785a104..86ce6b66f 100755
--- a/benchmarks/single_node/qwen3.5_bf16_b200.sh
+++ b/benchmarks/single_node/qwen3.5_bf16_b200.sh
@@ -41,6 +41,10 @@ MAX_PREFILL_TOKENS=32768
 CUDA_GRAPH_MAX_BATCH_SIZE=$CONC
 MAX_RUNNING_REQUESTS=128
 CONTEXT_LENGTH=$((ISL + OSL + 20))
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN"
+fi
 
 echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 
@@ -79,7 +83,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/qwen3.5_bf16_mi300x.sh b/benchmarks/single_node/qwen3.5_bf16_mi300x.sh
index ea10647d6..aa74785fe 100755
--- a/benchmarks/single_node/qwen3.5_bf16_mi300x.sh
+++ b/benchmarks/single_node/qwen3.5_bf16_mi300x.sh
@@ -20,6 +20,11 @@ hf download "$MODEL"
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -31,7 +36,7 @@ python3 -m sglang.launch_server \
     --port $PORT \
     --tensor-parallel-size $TP \
     --trust-remote-code \
-    --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 &
+    --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
@@ -52,7 +57,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/qwen3.5_bf16_mi325x.sh b/benchmarks/single_node/qwen3.5_bf16_mi325x.sh
index ea10647d6..aa74785fe 100644
--- a/benchmarks/single_node/qwen3.5_bf16_mi325x.sh
+++ b/benchmarks/single_node/qwen3.5_bf16_mi325x.sh
@@ -20,6 +20,11 @@ hf download "$MODEL"
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -31,7 +36,7 @@ python3 -m sglang.launch_server \
     --port $PORT \
     --tensor-parallel-size $TP \
     --trust-remote-code \
-    --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 &
+    --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
@@ -52,7 +57,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh
index f77390707..701695def 100755
--- a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh
+++ b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh
@@ -20,6 +20,11 @@ hf download "$MODEL"
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -30,7 +35,7 @@ python3 -m sglang.launch_server \
     --port $PORT \
     --tensor-parallel-size $TP \
     --trust-remote-code \
-    --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 &
+    --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
@@ -51,7 +56,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/qwen3.5_fp8_b200.sh b/benchmarks/single_node/qwen3.5_fp8_b200.sh
index 39b020ecc..36e5d579d 100755
--- a/benchmarks/single_node/qwen3.5_fp8_b200.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_b200.sh
@@ -41,6 +41,10 @@ MAX_PREFILL_TOKENS=32768
 CUDA_GRAPH_MAX_BATCH_SIZE=$CONC
 MAX_RUNNING_REQUESTS=128
 CONTEXT_LENGTH=$((ISL + OSL + 20))
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN"
+fi
 
 if [[ $TP -eq 8 ]]; then
     EXTRA_ARGS="--enable-flashinfer-allreduce-fusion"
@@ -87,7 +91,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh
index 1270c76a6..87933b166 100755
--- a/benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh
@@ -48,6 +48,10 @@ SPECULATIVE_EAGLE_TOPK=1
 
 echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -88,7 +92,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/qwen3.5_fp8_h200.sh b/benchmarks/single_node/qwen3.5_fp8_h200.sh
index 2ae26b771..636a8ee92 100644
--- a/benchmarks/single_node/qwen3.5_fp8_h200.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_h200.sh
@@ -23,6 +23,10 @@ hf download "$MODEL"
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
 MAX_SEQ_LEN=$((ISL + OSL + 20))
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_SEQ_LEN="$EVAL_MAX_MODEL_LEN"
+fi
 
 echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN"
 
@@ -76,7 +80,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/qwen3.5_fp8_mi300x.sh b/benchmarks/single_node/qwen3.5_fp8_mi300x.sh
index 0640a20ab..7bff57b61 100755
--- a/benchmarks/single_node/qwen3.5_fp8_mi300x.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_mi300x.sh
@@ -20,6 +20,11 @@ hf download "$MODEL"
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -32,7 +37,7 @@ python3 -m sglang.launch_server \
     --port $PORT \
     --tensor-parallel-size $TP \
     --trust-remote-code \
-    --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 &
+    --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
@@ -53,7 +58,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/qwen3.5_fp8_mi325x.sh b/benchmarks/single_node/qwen3.5_fp8_mi325x.sh
index 0640a20ab..7bff57b61 100755
--- a/benchmarks/single_node/qwen3.5_fp8_mi325x.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_mi325x.sh
@@ -20,6 +20,11 @@ hf download "$MODEL"
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -32,7 +37,7 @@ python3 -m sglang.launch_server \
     --port $PORT \
     --tensor-parallel-size $TP \
     --trust-remote-code \
-    --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 &
+    --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
@@ -53,7 +58,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh
index f77390707..701695def 100644
--- a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh
@@ -20,6 +20,11 @@ hf download "$MODEL"
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -30,7 +35,7 @@ python3 -m sglang.launch_server \
     --port $PORT \
     --tensor-parallel-size $TP \
     --trust-remote-code \
-    --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 &
+    --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
@@ -51,7 +56,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 6b0689cbf..81fc5f100 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -999,7 +999,7 @@
   - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh"
   - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914
- 
+
 - config-keys:
   - glm5-fp8-b200-sglang
   description:
@@ -1129,3 +1129,63 @@
   description:
   - "Disable prefix caching (--no-enable-prefix-caching) for all MiniMax benchmarks using random datasets"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/966
+
+- config-keys:
+  # NVIDIA single-node
+  - dsr1-fp4-b200-sglang
+  - dsr1-fp4-b200-trt
+  - dsr1-fp4-b200-trt-mtp
+  - dsr1-fp8-b200-sglang
+  - dsr1-fp8-b200-sglang-mtp
+  - dsr1-fp8-b200-trt
+  - dsr1-fp8-b200-trt-mtp
+  - dsr1-fp8-h200-sglang
+  - dsr1-fp8-h200-trt
+  - dsr1-fp8-h200-trt-mtp
+  - glm5-fp8-b200-sglang
+  - glm5-fp8-h200-sglang
+  - gptoss-fp4-b200-trt
+  - gptoss-fp4-b200-vllm
+  - gptoss-fp4-h100-vllm
+  - gptoss-fp4-h200-trt
+  - gptoss-fp4-h200-vllm
+  - kimik2.5-fp4-b200-vllm
+  - kimik2.5-int4-b200-vllm
+  - kimik2.5-int4-h200-vllm
+  - minimaxm2.5-fp8-b200-vllm
+  - minimaxm2.5-fp8-h100-vllm
+  - minimaxm2.5-fp8-h200-vllm
+  - qwen3.5-bf16-b200-sglang
+  - qwen3.5-fp8-b200-sglang
+  - qwen3.5-fp8-b200-sglang-mtp
+  - qwen3.5-fp8-h200-sglang
+  # AMD single-node
+  - dsr1-fp4-mi355x-atom
+  - dsr1-fp4-mi355x-atom-mtp
+  - dsr1-fp4-mi355x-sglang
+  - dsr1-fp8-mi325x-sglang
+  - dsr1-fp8-mi300x-sglang
+  - dsr1-fp8-mi355x-atom
+  - dsr1-fp8-mi355x-atom-mtp
+  - dsr1-fp8-mi355x-sglang
+  - glm5-fp8-mi355x-sglang
+  - gptoss-fp4-mi300x-vllm
+  - gptoss-fp4-mi325x-vllm
+  - gptoss-fp4-mi355x-atom
+  - gptoss-fp4-mi355x-vllm
+  - kimik2.5-fp4-mi355x-vllm
+  - kimik2.5-int4-mi325x-vllm
+  - kimik2.5-int4-mi355x-vllm
+  - minimaxm2.5-fp8-mi300x-vllm
+  - minimaxm2.5-fp8-mi325x-vllm
+  - minimaxm2.5-fp8-mi355x-vllm
+  - qwen3.5-bf16-mi300x-sglang
+  - qwen3.5-bf16-mi325x-sglang
+  - qwen3.5-bf16-mi355x-sglang
+  - qwen3.5-fp8-mi300x-sglang
+  - qwen3.5-fp8-mi325x-sglang
+  - qwen3.5-fp8-mi355x-sglang
+  description:
+  - "Separate evals, change to 8k1k, fail loudly, 5-shot, top of curve & middle of curve"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/911
+  evals-only: true
diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh
index 022fd7cb2..f8c614936 100644
--- a/runners/launch_b200-dgxc.sh
+++ b/runners/launch_b200-dgxc.sh
@@ -33,7 +33,7 @@ docker run --rm --init --network host --name $server_name \
 -e NCCL_GRAPH_REGISTER=0 \
 -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
 -e PROFILE -e SGLANG_TORCH_PROFILER_DIR -e VLLM_TORCH_PROFILER_DIR -e VLLM_RPC_TIMEOUT \
--e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e RUN_EVAL -e RUNNER_TYPE \
+-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e RUN_EVAL -e EVAL_ONLY -e RUNNER_TYPE \
 --entrypoint=/bin/bash \
 $(echo "$IMAGE" | sed 's/#/\//') \
 benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh"
diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh
index 223264914..5100419b9 100644
--- a/runners/launch_h100-cr.sh
+++ b/runners/launch_h100-cr.sh
@@ -10,7 +10,7 @@ docker run --rm --network=host --name=$server_name \
 --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
 -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
 -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
--e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e RUNNER_TYPE -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \
+-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e EVAL_ONLY -e RUNNER_TYPE -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \
 -e PROFILE -e SGLANG_TORCH_PROFILER_DIR -e VLLM_TORCH_PROFILER_DIR -e VLLM_RPC_TIMEOUT \
 -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
 --entrypoint=/bin/bash \
diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md
index c3dddfcc6..e32d6d988 100644
--- a/utils/evals/EVALS.md
+++ b/utils/evals/EVALS.md
@@ -6,7 +6,7 @@ Quick graded QnA which measures model performance. Examples of test suites:
 - **gpqa**: Graduate level, Google-Proof multiple choice questions
 
 ## When?
-At highest concurrency for highest TP and lowest TP, per GPU per model only for 1k8k. Logic is defined in `mark_eval_entries` of `utils/matrix-logic/generate_sweep_configs.py`
+At the highest and median concurrency levels (all TPs), per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn), only for 8k1k. In eval-only mode, the server starts with expanded context length. In combined mode (RUN_EVAL=true), evals run against the same server used for throughput benchmarks. Logic is defined in `mark_eval_entries` of `utils/matrix_logic/generate_sweep_configs.py`
 
 ## Why?
 To verify how model outputs are affected by throughput optimizations.
@@ -15,7 +15,7 @@ To verify how model outputs are affected by throughput optimizations.
 - If there was a tradeoff in accuracy for performance
 
 ## How?
-- `run_eval`, definined in `benchmarks/benchmark_lib.sh`, is called in `benchmarks/*`. EleutherAI/lm-evaluation-harness(lmeval), using the same endpoint as the throughput benchmark. JSON results are processed and converted to a table with `utils/collect_eval_results.py`.
+- `run_eval`, defined in `benchmarks/benchmark_lib.sh`, is called in `benchmarks/*`. It runs EleutherAI/lm-evaluation-harness (lmeval) against the running server's OpenAI-compatible endpoint. In eval-only mode (`EVAL_ONLY=true`), the server is started once with expanded context length (up to 5x benchmark context, capped at model native max). JSON results are processed and converted to a table with `utils/collect_eval_results.py`.
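A hypothetical sketch of what `setup_eval_context` computes, following the rule stated above (expand to up to 5x the benchmark context, capped at the model's native maximum); the real helper lives in `benchmarks/benchmark_lib.sh` and may differ in detail, and `NATIVE_MAX_MODEL_LEN` is an assumed variable name:

```bash
setup_eval_context() {
    local bench_len=$((ISL + OSL))      # benchmark-sized context
    local expanded=$((bench_len * 5))   # headroom for few-shot eval prompts
    # Cap at the model's native limit (NATIVE_MAX_MODEL_LEN is assumed here;
    # if unset, no cap is applied).
    if [ "$expanded" -gt "${NATIVE_MAX_MODEL_LEN:-$expanded}" ]; then
        expanded="$NATIVE_MAX_MODEL_LEN"
    fi
    export EVAL_MAX_MODEL_LEN="$expanded"
}
```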
 
 ## Misc
 Following files are task definitions from lmeval, more info on changes within the files
diff --git a/utils/evals/gsm8k.yaml b/utils/evals/gsm8k.yaml
index fb0f0a829..e748119cd 100644
--- a/utils/evals/gsm8k.yaml
+++ b/utils/evals/gsm8k.yaml
@@ -9,7 +9,7 @@ output_type: generate_until
 training_split: train
 fewshot_split: train
 test_split: test
-doc_to_text: "Question: {{question}}\nEnd your answer with: #### \nAnswer:"
+doc_to_text: "Question: {{question}}\nEnd your response with the answer on the last line, formatted as: #### [number]\nAnswer:"
 doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}"
 metric_list:
 - metric: exact_match
diff --git a/utils/evals/thresholds.json b/utils/evals/thresholds.json
new file mode 100644
index 000000000..8ea0b71c0
--- /dev/null
+++ b/utils/evals/thresholds.json
@@ -0,0 +1,4 @@
+{
+  "gsm8k": 0.85,
+  "gpqa_diamond_cot_n_shot": 0.30
+}
diff --git a/utils/evals/validate_scores.py b/utils/evals/validate_scores.py
new file mode 100644
index 000000000..85433ec4b
--- /dev/null
+++ b/utils/evals/validate_scores.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+"""Validate eval scores against minimum thresholds.
+
+Reads lm-eval results JSON files and checks that scored metrics meet the
+required minimum. Thresholds are configured per-task in a JSON config file
+(default: utils/evals/thresholds.json).
+
+Usage:
+    python3 utils/evals/validate_scores.py
+    python3 utils/evals/validate_scores.py --thresholds my_thresholds.json
+    python3 utils/evals/validate_scores.py --min-score 0.90  # flat threshold, no config
+"""
+import argparse
+import glob
+import json
+import sys
+from pathlib import Path
+
+
+def load_thresholds(path: str) -> dict[str, float]:
+    """Load thresholds config. Returns {task_name: min_score}."""
+    with open(path) as f:
+        return json.load(f)
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Validate eval scores")
+    parser.add_argument(
+        "--min-score", type=float, default=0.85,
+        help="Fallback minimum score when no threshold config matches (default: 0.85)",
+    )
+    parser.add_argument(
+        "--thresholds", default=None,
+        help="Path to thresholds JSON config (default: utils/evals/thresholds.json)",
+    )
+    parser.add_argument(
+        "--metric-prefix", default="exact_match,",
+        help="Only check metrics whose name starts with this prefix (default: 'exact_match,')",
+    )
+    parser.add_argument(
+        "--results-glob", default="results*.json",
+        help="Glob pattern for result files (default: 'results*.json')",
+    )
+    args = parser.parse_args()
+
+    # Load thresholds config
+    thresholds = {}
+    thresholds_path = args.thresholds
+    if thresholds_path is None:
+        default_path = Path(__file__).parent / "thresholds.json"
+        if default_path.exists():
+            thresholds_path = str(default_path)
+    if thresholds_path:
+        try:
+            thresholds = load_thresholds(thresholds_path)
+            print(f"Loaded thresholds from {thresholds_path}")
+        except (json.JSONDecodeError, OSError) as e:
+            print(f"WARN: could not load thresholds from {thresholds_path}: {e}", file=sys.stderr)
+
+    failed = False
+    checked = 0
+
+    for f in sorted(glob.glob(args.results_glob)):
+        with open(f) as fh:
+            data = json.load(fh)
+        for task, metrics in data.get("results", {}).items():
+            min_score = thresholds.get(task, args.min_score)
+            for name, val in metrics.items():
+                if not name.startswith(args.metric_prefix) or "stderr" in name:
+                    continue
+                if not isinstance(val, (int, float)):
+                    continue
+                checked += 1
+                if val < min_score:
+                    print(
+                        f"FAIL: {task} {name} = {val:.4f} (< {min_score})",
+                        file=sys.stderr,
+                    )
+                    failed = True
+                else:
+                    print(f"PASS: {task} {name} = {val:.4f} (>= {min_score})")
+
+    if checked == 0:
+        print("WARN: no metrics matched prefix '{}'".format(args.metric_prefix), file=sys.stderr)
+
+    return 1 if (failed or checked == 0) else 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py
index a31071f65..2a336c960 100644
--- a/utils/matrix_logic/generate_sweep_configs.py
+++ b/utils/matrix_logic/generate_sweep_configs.py
@@ -17,7 +17,6 @@
 seq_len_stoi = {
     "1k1k": (1024, 1024),
-    "1k8k": (1024, 8192),
     "8k1k": (8192, 1024)
 }
@@ -35,25 +34,19 @@ def seq_len_to_str(isl: int, osl: int) -> str:
 
 def mark_eval_entries(matrix_values: list[dict]) -> list[dict]:
     """Eval selection policy (single-node only):
-    - Only consider 1k8k (isl=1024, osl=8192).
-    - For each unique (model, runner, framework, precision, isl, osl, spec-decoding):
-        - Mark highest TP with highest conc
-        - Mark lowest TP with highest conc
-    - Grouping includes spec-decoding so MTP (mtp) and non-MTP (none) are treated
-      independently.
+    - Only consider 8k1k (isl=8192, osl=1024).
+    - For each unique (model, runner, framework, precision, isl, osl, spec-decoding, dp-attn):
+        - Mark all entries at the highest CONC (all TPs)
+        - Mark all entries at the median CONC (all TPs)
     """
     from collections import defaultdict
 
-    # Only run evals on 1k8k
-    target_isl, target_osl = seq_len_stoi["1k8k"]
-    # Group entries by (model, runner, framework, precision, isl, osl)
+    # Only run evals on 8k1k
+    target_isl, target_osl = seq_len_stoi["8k1k"]
+    # Group entries by (model, runner, framework, precision, isl, osl, spec-decoding, dp-attn).
     # Only include entries that have a top-level TP (i.e., single-node schema).
-    # This avoids relying on structural hints like prefill/decode which may be
-    # reused by future single-node disaggregated modes.
     groups = defaultdict(list)
     for i, entry in enumerate(matrix_values):
-        # Skip entries without a top-level TP field
         if Fields.TP.value not in entry:
             continue
@@ -72,32 +65,19 @@
         )
         groups[key].append((i, entry))
 
-    # For each group, find highest TP/highest conc and lowest TP/highest conc
+    # For each group, select entries at highest CONC and median CONC (all TPs)
     eval_indices = set()
     for key, entries in groups.items():
         if not entries:
             continue
-        # Find min and max TP values
-        min_tp = min(e[Fields.TP.value] for _, e in entries)
-        max_tp = max(e[Fields.TP.value] for _, e in entries)
-
-        # Find highest conc for highest TP
-        highest_tp_entries = [(i, e) for i, e in entries if e[Fields.TP.value] == max_tp]
-        if highest_tp_entries:
-            max_conc_highest_tp = max(e[Fields.CONC.value] for _, e in highest_tp_entries)
-            for i, e in highest_tp_entries:
-                if e[Fields.CONC.value] == max_conc_highest_tp:
-                    eval_indices.add(i)
-
-        # Find highest conc for lowest TP (only if different from max_tp)
-        if min_tp != max_tp:
-            lowest_tp_entries = [(i, e) for i, e in entries if e[Fields.TP.value] == min_tp]
-            if lowest_tp_entries:
-                max_conc_lowest_tp = max(e[Fields.CONC.value] for _, e in lowest_tp_entries)
-                for i, e in lowest_tp_entries:
-                    if e[Fields.CONC.value] == max_conc_lowest_tp:
-                        eval_indices.add(i)
+        conc_values = sorted(set(e[Fields.CONC.value] for _, e in entries))
+        median_conc = conc_values[len(conc_values) // 2]
+        target_concs = {conc_values[-1], median_conc}
+
+        for i, e in entries:
+            if e[Fields.CONC.value] in target_concs:
+                eval_indices.add(i)
 
     # Mark the selected entries
     for i, entry in enumerate(matrix_values):
@@ -742,9 +722,9 @@
     )
     eval_group = parent_parser.add_mutually_exclusive_group()
     eval_group.add_argument(
-        '--run-evals',
+        '--no-evals',
         action='store_true',
-        help='When specified, run evals on a subset of configs (in addition to all configs).'
+        help='When specified, skip evals (throughput benchmarks only).'
     )
     eval_group.add_argument(
         '--evals-only',
@@ -949,10 +929,9 @@
     else:
         parser.error(f"Unknown command: {args.command}")
 
-    # Handle eval options (mutually exclusive)
-    if args.run_evals or args.evals_only:
+    # Handle eval options (mutually exclusive: --no-evals or --evals-only)
+    if not args.no_evals:
         matrix_values = mark_eval_entries(matrix_values)
-
     # IF --evals-only is specified, filter to only eval entries
     if args.evals_only:
         matrix_values = [e for e in matrix_values if e.get(Fields.RUN_EVAL.value, False)]
diff --git a/utils/matrix_logic/test_generate_sweep_configs.py b/utils/matrix_logic/test_generate_sweep_configs.py
index 84ecddd3d..1fecdd487 100644
--- a/utils/matrix_logic/test_generate_sweep_configs.py
+++ b/utils/matrix_logic/test_generate_sweep_configs.py
@@ -158,13 +158,11 @@ class TestSeqLenMappings:
     def test_seq_len_stoi_values(self):
         """Verify seq_len_stoi has expected mappings."""
         assert seq_len_stoi["1k1k"] == (1024, 1024)
-        assert seq_len_stoi["1k8k"] == (1024, 8192)
         assert seq_len_stoi["8k1k"] == (8192, 1024)
 
     def test_seq_len_itos_reverse_mapping(self):
         """Verify seq_len_itos is reverse of stoi."""
         assert seq_len_itos[(1024, 1024)] == "1k1k"
-        assert seq_len_itos[(1024, 8192)] == "1k8k"
         assert seq_len_itos[(8192, 1024)] == "8k1k"
 
 
@@ -174,7 +172,6 @@ class TestSeqLenToStr:
     def test_known_sequence_lengths(self):
         """Known sequence lengths should return short name."""
         assert seq_len_to_str(1024, 1024) == "1k1k"
-        assert seq_len_to_str(1024, 8192) == "1k8k"
         assert seq_len_to_str(8192, 1024) == "8k1k"
 
     def test_unknown_sequence_lengths(self):
diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py
index ad7658176..697d97de6 100644
--- a/utils/matrix_logic/validation.py
+++ b/utils/matrix_logic/validation.py
@@ -360,6 +360,7 @@ class ChangelogMatrixEntry(BaseModel):
     ] = Field(default_factory=dict)
     multi_node: dict[str, list[MultiNodeMatrixEntry]
                      ] = Field(default_factory=dict)
+    evals: list[SingleNodeMatrixEntry] = Field(default_factory=list)
 
     changelog_metadata: ChangelogMetadata
diff --git a/utils/process_changelog.py b/utils/process_changelog.py
index d17fc3729..7da19d030 100644
--- a/utils/process_changelog.py
+++ b/utils/process_changelog.py
@@ -81,6 +81,7 @@ def main():
     final_results = {
         "single_node": defaultdict(list),
         "multi_node": defaultdict(list),
+        "evals": [],
         "changelog_metadata": {
            "base_ref": args.base_ref,
            "head_ref": args.head_ref,
@@ -88,56 +89,82 @@
        },
     }
 
-    all_results = []
-    # Deduplicate repeated configs, if for some reason a config key appears multiple times
-    # in one commit, we don't want to run that config two times (there will just be twice as many
-    # data points for that config, which is not useful)
-    all_configs_to_run = set()
+    all_benchmark_results = []
+    all_eval_results = []
+    # Deduplicate repeated configs separately for benchmarks and evals.
+    # An evals-only entry should not prevent a later regular entry from
+    # generating benchmarks for the same config, and vice versa.
+    benchmark_configs_seen = set()
+    eval_configs_seen = set()
 
     for entry_data in changelog_data:
         entry = ChangelogEntry.model_validate(entry_data)
-        configs_to_run = get_config_keys_from_master(
+        all_configs = get_config_keys_from_master(
             entry.config_keys, load_config_files(MASTER_CONFIGS)
         )
-        # Skip configs already processed
-        configs_to_run = [c for c in configs_to_run if c not in all_configs_to_run]
-        if not configs_to_run:
-            continue
-        all_configs_to_run.update(configs_to_run)
-
-        # Use --evals-only if specified in changelog entry, otherwise --run-evals
-        eval_flag = "--evals-only" if entry.evals_only else "--run-evals"
-
-        try:
-            result = subprocess.run(
-                [
+        if not entry.evals_only:
+            # Generate benchmark entries (no evals)
+            benchmark_configs = [c for c in all_configs if c not in benchmark_configs_seen]
+            if benchmark_configs:
+                benchmark_configs_seen.update(benchmark_configs)
+                base_cmd = [
                     "python3",
                     GENERATE_SWEEPS_PY_SCRIPT,
                     "test-config",
                     "--config-keys",
-                    *configs_to_run,
+                    *benchmark_configs,
                     "--config-files",
                     *MASTER_CONFIGS,
-                    eval_flag
-                ],
-                capture_output=True,
-                text=True,
-                check=True,
-            )
-        except subprocess.CalledProcessError as e:
-            print(e.stderr)
-            raise
-
-        all_results.extend(json.loads(result.stdout))
-
-    for result in all_results:
+                    "--no-evals",
+                ]
+                try:
+                    result = subprocess.run(
+                        base_cmd,
+                        capture_output=True,
+                        text=True,
+                        check=True,
+                    )
+                except subprocess.CalledProcessError as e:
+                    print(e.stderr)
+                    raise
+                all_benchmark_results.extend(json.loads(result.stdout))
+
+        # Generate eval entries separately
+        eval_configs = [c for c in all_configs if c not in eval_configs_seen]
+        if eval_configs:
+            eval_configs_seen.update(eval_configs)
+            base_cmd = [
+                "python3",
+                GENERATE_SWEEPS_PY_SCRIPT,
+                "test-config",
+                "--config-keys",
+                *eval_configs,
+                "--config-files",
+                *MASTER_CONFIGS,
+                "--evals-only",
+            ]
+            try:
+                eval_result = subprocess.run(
+                    base_cmd,
+                    capture_output=True,
+                    text=True,
+                    check=True,
+                )
+            except subprocess.CalledProcessError as e:
+                print(e.stderr)
+                raise
+            all_eval_results.extend(json.loads(eval_result.stdout))
+
+    for result in all_benchmark_results:
         seq_len_str = seq_len_to_str(result["isl"], result["osl"])
         if "prefill" in result and result["prefill"] is not None:
             final_results["multi_node"][seq_len_str].append(result)
         else:
             final_results["single_node"][seq_len_str].append(result)
 
+    final_results["evals"] = all_eval_results
+
     # Validate final results structure
     validated = ChangelogMatrixEntry.model_validate(final_results)
     print(validated.model_dump_json(by_alias=True))
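For a concrete view of the new selection policy: within one (model, runner, framework, precision, 8k1k, spec-decoding, dp-attn) group whose sweep covers concurrencies 4, 8, 16, 32 and 64, `mark_eval_entries` now marks every entry at 64 (the highest) and every entry at 16 (the median of the five sorted values; with an even count the upper-middle value is taken), across all TPs. The threshold gate can then be run against whatever lm-eval wrote; the invocations below use only flags defined in `validate_scores.py`'s argparse (file names are illustrative):

```bash
# Default: per-task thresholds from utils/evals/thresholds.json,
# falling back to --min-score for tasks not listed there.
python3 utils/evals/validate_scores.py

# Raise the fallback threshold used for unlisted tasks.
python3 utils/evals/validate_scores.py --min-score 0.90

# Custom config and a non-default results location.
python3 utils/evals/validate_scores.py \
    --thresholds my_thresholds.json \
    --results-glob 'eval_results/results*.json'
```

The script exits non-zero if any checked metric falls below its threshold, or if no metric matched `--metric-prefix` at all, so a CI step that runs it fails loudly rather than silently passing on empty results.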