diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index b9560803e..a5986e32a 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -12,10 +12,6 @@ dsr1-fp4-mi355x-sglang: search-space: - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -35,11 +31,6 @@ dsr1-fp4-mi355x-atom: search-space: - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 } - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 4, ep: 1, conc-start: 128, conc-end: 256 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: @@ -61,11 +52,6 @@ dsr1-fp4-mi355x-atom-mtp: search-space: - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - # - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -85,10 +71,6 @@ dsr1-fp8-mi300x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -107,10 +89,6 @@ dsr1-fp8-mi325x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -129,10 +107,6 @@ dsr1-fp8-mi355x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -152,10 +126,6 @@ qwen3.5-bf16-mi355x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -174,10 +144,6 @@ qwen3.5-bf16-mi300x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -196,10 +162,6 @@ qwen3.5-bf16-mi325x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -218,10 +180,6 @@ qwen3.5-fp8-mi325x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -240,10 +198,6 @@ qwen3.5-fp8-mi355x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -262,10 +216,6 @@ qwen3.5-fp8-mi300x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -284,10 +234,6 @@ glm5-fp8-mi355x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -306,10 +252,6 @@ kimik2.5-int4-mi355x-vllm: 
osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -328,10 +270,6 @@ kimik2.5-int4-mi325x-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -351,12 +289,6 @@ kimik2.5-fp4-mi355x-vllm: search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -378,12 +310,6 @@ minimaxm2.5-fp8-mi355x-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } - isl: 8192 osl: 1024 search-space: @@ -405,11 +331,6 @@ minimaxm2.5-fp8-mi300x-vllm: search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -430,11 +351,6 @@ minimaxm2.5-fp8-mi325x-vllm: search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -457,13 +373,6 @@ gptoss-fp4-mi300x-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 8192 osl: 1024 search-space: @@ -488,13 +397,6 @@ gptoss-fp4-mi325x-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 64, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -518,12 +420,6 @@ gptoss-fp4-mi355x-vllm: - { tp: 1, conc-start: 4, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 8 } - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 8 } - - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 8192 osl: 1024 search-space: @@ -545,11 +441,6 @@ gptoss-fp4-mi355x-atom: search-space: - { tp: 1, conc-start: 16, conc-end: 128 } - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 16, conc-end: 128 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -570,10 +461,6 @@ dsr1-fp8-mi355x-atom: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 128 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 
search-space: @@ -593,10 +480,6 @@ dsr1-fp8-mi355x-atom-mtp: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -911,129 +794,6 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=2" - # FIXME(billishyahao): disable 1k8k for now - # - isl: 1024 - # osl: 8192 - # search-space: - # # MTP configurations - # # "Top of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8) - # - spec-decoding: "mtp" - # conc-list: [ 2048 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 1 - # tp: 1 - # ep: 16 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=1" - - - # # "Middle of curve" (1 prefill worker at DEP8 and 2 decode workers each at DEP8) - # - spec-decoding: "mtp" - # conc-list: [ 256, 512, 1024 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 2 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=1" - - - # # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - # - spec-decoding: "mtp" - # conc-list: [ 32, 64, 128 ] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=1" - - # decode: - # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=1" - - # # non-MTP configurations - # # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16) - # - spec-decoding: "none" - # conc-list: [ 2048 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 1 - # tp: 1 - # ep: 16 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=0" - - # # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8) - # - spec-decoding: "none" - # conc-list: [ 256, 512, 1024 ] - # prefill: - # num-worker: 1 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 2 - # tp: 1 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=0" - - # # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - # - spec-decoding: "none" - # conc-list: [ 32, 64, 128 ] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=0" - dsr1-fp4-mi355x-sglang-disagg: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 @@ -1453,49 +1213,3 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" - - # FIXME(billishyahao): disable FP4 1k8k for now - # - isl: 1024 - # osl: 8192 - # search-space: - # # MTP configurations - # # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - # - spec-decoding: "mtp" - # conc-list: [ 32, 64, 128 ] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=1" - - # decode: - # 
num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=1" - - # # non-MTP configurations - # # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - # - spec-decoding: "none" - # conc-list: [ 32, 64, 128 ] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "PREFILL_NODES=1" - # decode: - # num-worker: 2 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_NODES=2" - # - "DECODE_MTP_SIZE=0" - diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c76aee8f6..33751270b 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1663,11 +1663,6 @@ dsr1-fp4-b200-sglang: search-space: - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: @@ -1694,17 +1689,6 @@ dsr1-fp4-b200-trt: - { tp: 8, conc-start: 4, conc-end: 4 } - { tp: 8, ep: 8, conc-start: 64, conc-end: 64 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 } - - isl: 1024 - osl: 8192 - search-space: - # low concurrency cases use TP only - # concurrency 64 uses TP & EP - # high concurrency cases use TP & EP & DP-ATTN - - { tp: 4, conc-start: 4, conc-end: 16 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } - - { tp: 8, conc-start: 4, conc-end: 4 } - - { tp: 8, ep: 8, conc-start: 64, conc-end: 64 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 } - isl: 8192 osl: 1024 search-space: @@ -1737,17 +1721,6 @@ dsr1-fp4-b200-trt-mtp: - { tp: 8, conc-start: 128, conc-end: 128, spec-decoding: mtp } - { tp: 8, ep: 8, conc-start: 32, conc-end: 128, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 64, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - # TP=4 configurations - - { tp: 4, conc-start: 16, conc-end: 16, spec-decoding: mtp } - - { tp: 4, ep: 4, conc-start: 8, conc-end: 8, spec-decoding: mtp } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } - # TP=8 configurations - - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 32, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -1772,10 +1745,6 @@ dsr1-fp8-b200-sglang: osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -1795,10 +1764,6 @@ qwen3.5-bf16-b200-sglang: osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -1818,11 +1783,6 @@ qwen3.5-fp8-b200-sglang: search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 16 } - { tp: 4, ep: 4, conc-start: 16, conc-end: 128 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 16 } - - { tp: 4, ep: 4, conc-start: 16, conc-end: 128 } - isl: 8192 osl: 1024 search-space: @@ -1842,10 +1802,6 @@ glm5-fp8-b200-sglang: osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 } - - 
isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: @@ -1864,10 +1820,6 @@ qwen3.5-fp8-b200-sglang-mtp: osl: 1024 search-space: - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -1886,10 +1838,6 @@ kimik2.5-int4-b200-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -1908,10 +1856,6 @@ kimik2.5-int4-h200-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -1931,10 +1875,6 @@ kimik2.5-fp4-b200-vllm: search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -1954,10 +1894,6 @@ dsr1-fp8-b200-sglang-mtp: osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -1978,11 +1914,6 @@ dsr1-fp8-b200-trt: - { tp: 8, ep: 1, conc-start: 64, conc-end: 128 } - { tp: 4, ep: 1, conc-start: 8, conc-end: 16 } - { tp: 8, ep: 1, conc-start: 4, conc-end: 8 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256} - - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: @@ -2007,13 +1938,6 @@ dsr1-fp8-b200-trt-mtp: # If CONC == 256, then TP8, EP8, DP_ATTN=true - { tp: 8, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - # mostly TP8 - # If CONC >= 128, then TP8, EP8, DP_ATTN=true - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -2033,10 +1957,6 @@ dsr1-fp8-h200-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -2055,10 +1975,6 @@ qwen3.5-fp8-h200-sglang: osl: 1024 search-space: - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -2077,10 +1993,6 @@ glm5-fp8-h200-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -2101,11 +2013,6 @@ dsr1-fp8-h200-trt: # If CONC > 64, then DP_ATTN=true search-space: - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - # If CONC > 64, then DP_ATTN=true - search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 # If CONC > 32, then DP_ATTN=true @@ -2129,12 +2036,6 @@ dsr1-fp8-h200-trt-mtp: # If CONC >= 128, then DP_ATTN=true, MTP=1 - { tp: 8, ep: 8, conc-start: 4, conc-end: 
64, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } - - isl: 1024 - osl: 8192 - search-space: - # If CONC >= 256, then DP_ATTN=true, MTP=1 - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: @@ -3149,14 +3050,6 @@ gptoss-fp4-b200-trt: - { tp: 4, conc-start: 4, conc-end: 4 } - { tp: 8, conc-start: 4, conc-end: 4 } # Low ==> high TP from Left to Right of pareto - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 256, conc-end: 256} - - { tp: 2, conc-start: 128, conc-end: 256} - - { tp: 4, conc-start: 4, conc-end: 256} - - { tp: 8, conc-start: 4, conc-end: 4} - # Low ==> high TP from Left to Right of pareto - isl: 8192 osl: 1024 search-space: @@ -3181,13 +3074,6 @@ gptoss-fp4-b200-vllm: - { tp: 2, conc-start: 4, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 8 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 2, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 8 } - isl: 8192 osl: 1024 search-space: @@ -3210,11 +3096,6 @@ minimaxm2.5-fp8-b200-vllm: search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -3236,12 +3117,6 @@ gptoss-fp4-h100-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -3263,11 +3138,6 @@ minimaxm2.5-fp8-h100-vllm: search-space: # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -3345,67 +3215,6 @@ dsr1-fp8-h100-dynamo-sglang: tp: 16 ep: 16 dp-attn: true - - isl: 1024 - osl: 8192 - search-space: - # # STP: Max throughput TEP (1 prefill, 2 decode) - # - conc-list: [1, 2, 4, 8, 16, 32] - # prefill: - # num-worker: 1 - # tp: 16 - # ep: 1 - # dp-attn: false - # additional-settings: - # - "CONFIG_FILE=recipes/h100/1k8k/stp/h100-fp8-1p2d-max-tp.yaml" - # decode: - # num-worker: 2 - # tp: 16 - # ep: 1 - # dp-attn: false - # # STP: Max throughput DEP (1 prefill, 1 decode, dp-attention) - # - conc-list: [1, 2, 4, 8] - # prefill: - # num-worker: 1 - # tp: 16 - # ep: 1 - # dp-attn: false - # additional-settings: - # - "CONFIG_FILE=recipes/h100/1k8k/stp/h100-fp8-1p1d-max-dep.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # MTP: Max throughput TEP (1 prefill, 2 decode) - - spec-decoding: "mtp" - conc-list: [1, 2, 4, 8, 16, 32, 64] - prefill: - num-worker: 1 - tp: 16 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h100/1k8k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 1 - dp-attn: false - # MTP: Max throughput DEP (1 prefill, 1 decode, dp-attention) - - spec-decoding: "mtp" - conc-list: [1, 2, 4, 8, 16, 32, 64] - prefill: - 
num-worker: 1 - tp: 16 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h100/1k8k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - isl: 8192 osl: 1024 search-space: @@ -3485,13 +3294,6 @@ gptoss-fp4-h200-trt: - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } - isl: 8192 osl: 1024 search-space: @@ -3516,13 +3318,6 @@ gptoss-fp4-h200-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 4, conc-end: 4 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -3544,10 +3339,6 @@ minimaxm2.5-fp8-h200-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 128 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 search-space: @@ -3744,8 +3535,8 @@ dsr1-fp4-gb200-dynamo-trt: ep: 32 dp-attn: true - - isl: 1024 - osl: 8192 + - isl: 8192 + osl: 1024 search-space: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" @@ -3756,342 +3547,132 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen7_tep8_batch4_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen7_tep8_batch4_eplb0_mtp3.yaml" - decode: - num-worker: 7 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [ 7 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" decode: - num-worker: 7 + num-worker: 4 tp: 8 ep: 8 dp-attn: false - spec-decoding: "mtp" - conc-list: [ 128 ] + conc-list: [ 180 ] prefill: - num-worker: 1 + num-worker: 3 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - spec-decoding: "mtp" - conc-list: [ 512 ] + conc-list: [ 1229 ] prefill: - num-worker: 1 + num-worker: 7 tp: 4 ep: 4 dp-attn: true additional-settings: - 
# https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml" decode: num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 3072 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen3_dep16_batch64_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen3_dep16_batch64_eplb0_mtp3.yaml" - decode: - num-worker: 3 tp: 16 ep: 16 dp-attn: true - spec-decoding: "mtp" - conc-list: [ 6144 ] + conc-list: [ 666 ] prefill: - num-worker: 1 + num-worker: 8 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen3_dep16_batch128_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen3_dep16_batch128_eplb0_mtp1.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml" decode: - num-worker: 3 - tp: 16 - ep: 16 + num-worker: 1 + tp: 32 + ep: 32 dp-attn: true - spec-decoding: "mtp" - conc-list: [ 8192 ] + conc-list: [ 4301 ] prefill: - num-worker: 1 + num-worker: 11 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch256_eplb288_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch256_eplb288_mtp1.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml" decode: num-worker: 1 - tp: 32 - ep: 32 + tp: 16 + ep: 16 dp-attn: true # Non-MTP configurations (default spec_decoding="none") - - conc-list: [ 5 ] + - conc-list: [ 12, 44, 76 ] prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml" decode: - num-worker: 7 + num-worker: 4 tp: 8 ep: 8 dp-attn: false - - conc-list: [ 60 ] + - conc-list: [ 5 ] prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen15_tep4_batch4_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen15_tep4_batch4_eplb0_mtp0.yaml" + # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" decode: - num-worker: 15 - tp: 4 - ep: 4 + num-worker: 4 + tp: 8 + ep: 8 dp-attn: false - - conc-list: [ 135 ] + - conc-list: [ 333 ] prefill: - num-worker: 1 + num-worker: 2 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen15_tep4_batch8_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen15_tep4_batch8_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml" decode: - num-worker: 15 - tp: 4 - ep: 4 - dp-attn: false - - conc-list: [ 563 ] - prefill: num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [ 1229 ] + prefill: + num-worker: 7 tp: 4 ep: 4 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - - conc-list: [ 2048 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch64_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch64_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [ 4096 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch128_eplb288_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch128_eplb288_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [ 8192 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch256_eplb288_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch256_eplb288_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations (spec_decoding="mtp") - - spec-decoding: "mtp" - conc-list: [ 4, 8, 12, 24, 48 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [ 180 ] - prefill: - num-worker: 3 - tp: 4 - ep: 4 - dp-attn: true - 
additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 1229 ] - prefill: - num-worker: 7 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 666 ] - prefill: - num-worker: 8 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 4301 ] - prefill: - num-worker: 11 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - # Non-MTP configurations (default spec_decoding="none") - - conc-list: [ 12, 44, 76 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [ 5 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [ 333 ] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [ 1229 ] - prefill: - num-worker: 7 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [ 2253 ] + - conc-list: [ 2253 ] prefill: num-worker: 8 tp: 4 @@ -4339,156 +3920,6 @@ dsr1-fp8-gb200-dynamo-trt: tp: 8 ep: 8 dp-attn: false - # 1k8k MTP configs - - isl: 1024 - osl: 8192 - search-space: - - spec-decoding: "mtp" - conc-list: [8192] - 
prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen2_dep16_batch256_eplb0_mtp1_8192.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen2_dep16_batch256_eplb0_mtp1_8192.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [2152] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen2_dep16_batch64_eplb0_mtp1_2152.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen2_dep16_batch64_eplb0_mtp1_2152.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [564] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_564.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_564.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [72] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen1_dep32_batch2_eplb0_mtp3_72.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen1_dep32_batch2_eplb0_mtp3_72.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [4, 8] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen4_tep8_batch2_eplb0_mtp3_8.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/mtp/ctx1_gen4_tep8_batch2_eplb0_mtp3_8.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - # 1k8k STP configs - - conc-list: [8192] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen2_dep16_batch256_eplb0_mtp0_8192.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen2_dep16_batch256_eplb0_mtp0_8192.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [2048] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen1_dep32_batch64_eplb0_mtp0_2048.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen1_dep32_batch64_eplb0_mtp0_2048.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [564] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0_564.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0_564.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [36] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [4] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k8k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false # 8k1k MTP configs - isl: 8192 osl: 1024 @@ -5079,343 +4510,164 @@ dsr1-fp4-gb300-dynamo-trt: - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml" decode: num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [333] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [5] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [8, 12, 24, 48] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [2253] - prefill: - num-worker: 3 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [1229] - prefill: - num-worker: 3 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - # Non-MTP configurations (default spec_decoding="none") - - conc-list: [5] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [12, 48, 96, 192] - 
prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [8192] - prefill: - num-worker: 2 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [1229] - prefill: - num-worker: 2 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [4301] - prefill: - num-worker: 3 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 + tp: 4 + ep: 4 dp-attn: true - - conc-list: [2253] + - spec-decoding: "mtp" + conc-list: [333] prefill: - num-worker: 3 + num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - - isl: 1024 - osl: 8192 - search-space: - # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" - conc-list: [7] + conc-list: [5] prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" decode: - num-worker: 7 + num-worker: 4 tp: 8 ep: 8 dp-attn: false - spec-decoding: "mtp" - conc-list: [63] + conc-list: [8, 12, 24, 48] prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen7_tep8_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen7_tep8_batch8_eplb0_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml 
+ - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" decode: - num-worker: 7 + num-worker: 4 tp: 8 ep: 8 dp-attn: false - spec-decoding: "mtp" - conc-list: [563] + conc-list: [2253] prefill: - num-worker: 1 + num-worker: 3 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml" decode: num-worker: 1 - tp: 32 - ep: 32 + tp: 16 + ep: 16 dp-attn: true - spec-decoding: "mtp" - conc-list: [2088] + conc-list: [1229] prefill: - num-worker: 1 + num-worker: 3 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen1_dep32_batch64_eplb288_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen1_dep32_batch64_eplb288_mtp3.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - - spec-decoding: "mtp" - conc-list: [8192] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen2_dep16_batch256_eplb256_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen2_dep16_batch256_eplb256_mtp1.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [16384] + # Non-MTP configurations (default spec_decoding="none") + - conc-list: [5] prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen4_dep8_batch512_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/mtp/ctx1_gen4_dep8_batch512_eplb0_mtp1.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 4 tp: 8 ep: 8 - dp-attn: true - # STP configurations (no spec_decoding) - - conc-list: [7] + dp-attn: false + - conc-list: [12, 48, 96, 192] prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" decode: - num-worker: 7 + num-worker: 4 tp: 8 ep: 8 dp-attn: false - - conc-list: [60] + - conc-list: [8192] prefill: - num-worker: 1 + num-worker: 2 tp: 2 ep: 2 dp-attn: true additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen15_tep4_batch4_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen15_tep4_batch4_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml" decode: - num-worker: 15 - tp: 4 - ep: 4 - dp-attn: false - - conc-list: [245] - prefill: num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 7 tp: 8 ep: 8 - dp-attn: false - - conc-list: [1024] + dp-attn: true + - conc-list: [1229] prefill: - num-worker: 1 + num-worker: 2 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - - conc-list: [4096] + - conc-list: [4301] prefill: - num-worker: 1 + num-worker: 3 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen1_dep32_batch128_eplb288_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen1_dep32_batch128_eplb288_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml" decode: num-worker: 1 - tp: 32 - ep: 32 + tp: 16 + ep: 16 dp-attn: true - - conc-list: [8192] + - conc-list: [2253] prefill: - num-worker: 1 + num-worker: 3 tp: 2 ep: 2 dp-attn: true additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen1_dep32_batch256_eplb288_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k8k/stp/ctx1_gen1_dep32_batch256_eplb288_mtp0.yaml" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 32 @@ -6184,187 +5436,6 @@ dsr1-fp8-gb300-dynamo-trt: tp: 8 ep: 8 dp-attn: true - - isl: 1024 - osl: 8192 - search-space: - # MTP configurations (spec_decoding="mtp") - - spec-decoding: "mtp" - conc-list: [4] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_4.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_4.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - 
conc-list: [16] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_16.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_16.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [141] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_141.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_141.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [544] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_544.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_544.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [2048] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen2_dep16_batch64_eplb0_mtp1_2048.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen2_dep16_batch64_eplb0_mtp1_2048.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [8192] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp1_8192.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp1_8192.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: true - # STP configurations (no spec_decoding) - - conc-list: [4] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [36] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [282] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_282.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_282.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [1024] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1024.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1024.yaml"
-        decode:
-          num-worker: 1
-          tp: 32
-          ep: 32
-          dp-attn: true
-      - conc-list: [4096]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 4
-          dp-attn: true
-          additional-settings:
-            # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen2_dep16_batch128_eplb0_mtp0_4096.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen2_dep16_batch128_eplb0_mtp0_4096.yaml"
-        decode:
-          num-worker: 2
-          tp: 16
-          ep: 16
-          dp-attn: true
-      - conc-list: [8192]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 4
-          dp-attn: true
-          additional-settings:
-            # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen2_dep16_batch256_eplb0_mtp0_8192.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k8k/stp/ctx1_gen2_dep16_batch256_eplb0_mtp0_8192.yaml"
-        decode:
-          num-worker: 2
-          tp: 16
-          ep: 16
-          dp-attn: true
-
-
 gptoss-fp4-gb200-dynamo-trt:
   image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.7.0.post2
   model: openai/gpt-oss-120b
diff --git a/.github/workflows/README.md b/.github/workflows/README.md
index 37e64b8ed..de0a3dcab 100644
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@@ -40,7 +40,7 @@ usage: generate_sweep_configs.py full-sweep
                              [--precision PRECISION [PRECISION ...]]
                              [--framework FRAMEWORK [FRAMEWORK ...]]
                              [--runner-type RUNNER_TYPE [RUNNER_TYPE ...]]
-                             [--seq-lens {1k1k,1k8k,8k1k} [{1k1k,1k8k,8k1k} ...]]
+                             [--seq-lens {1k1k,8k1k} [{1k1k,8k1k} ...]]
                              [--step-size STEP_SIZE]
                              [--max-conc MAX_CONC]
                              [--max-tp MAX_TP]
@@ -62,9 +62,9 @@ full-sweep --config-files .github/configs/nvidia-master.yaml
 full-sweep --single-node --model-prefix gptoss --runner-type b200 --seq-lens 1k1k --config-files .github/configs/nvidia-master.yaml
 ```
 
-**Test all single-node fp8 precision configs for 1k8k workloads:**
+**Test all single-node fp8 precision configs for 8k1k workloads:**
 ```
-full-sweep --single-node --precision fp8 --seq-lens 1k8k --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml
+full-sweep --single-node --precision fp8 --seq-lens 8k1k --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml
 ```
 
 **Test all single-node TRT configs on H200 runners:**
diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
index db1734c9f..e2cda146b 100644
--- a/.github/workflows/benchmark-multinode-tmpl.yml
+++ b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -96,6 +96,8 @@ env:
   CONC_LIST: ${{ join(fromJson(inputs.conc-list), ' ') }}
   SPEC_DECODING: ${{ inputs.spec-decoding }}
   DISAGG: ${{ inputs.disagg }}
+  PYTHONDONTWRITEBYTECODE: '1'
+  PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache
 
   PREFILL_NUM_WORKERS: ${{ inputs.prefill-num-worker }}
   PREFILL_TP: ${{ inputs.prefill-tp }}
@@ -142,6 +144,7 @@ jobs:
           token: ${{ secrets.REPO_PAT }}
           fetch-depth: 0
           ref: ${{ inputs.ref || github.ref }}
+          clean: false
 
       - name: Launch multi-node job script
        env:
diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index 16b587657..797505eec 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -54,6 +54,11 @@ on:
         type: boolean
         required: true
         default: false
+      eval-only:
+        description: "Run only evals (skip throughput benchmark)"
+        type: boolean
+        required: false
+        default: false
       random-range-ratio:
         required: false
         type: string
@@ -83,6 +88,9 @@ env:
   SPEC_DECODING: ${{ inputs.spec-decoding }}
   DISAGG: ${{ inputs.disagg }}
   RUN_EVAL: ${{ inputs.run-eval }}
+  EVAL_ONLY: ${{ inputs.eval-only }}
+  PYTHONDONTWRITEBYTECODE: '1'
+  PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache
 
 permissions:
   contents: read
@@ -91,7 +99,7 @@ jobs:
   benchmark:
     runs-on: ${{ inputs.runner }}
     timeout-minutes: 300
-    name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} | disagg-${{ inputs.disagg }} spec-${{ inputs.spec-decoding }} conc-${{ inputs.conc }}${{ inputs.run-eval && ' | eval' || '' }}"
+    name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} | disagg-${{ inputs.disagg }} spec-${{ inputs.spec-decoding }} conc-${{ inputs.conc }}${{ inputs.eval-only && ' | eval-only' || (inputs.run-eval && ' | eval' || '') }}"
     steps:
       - name: Resource cleanup (pre-run)
         run: &resource-cleanup |
@@ -123,13 +131,14 @@
             sleep 5
           done
         fi
-      fi
+      fi
 
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
         with:
           token: ${{ secrets.REPO_PAT }}
           fetch-depth: 0
           ref: ${{ inputs.ref || github.ref }}
+          clean: false
 
       - name: Launch job script
         env:
@@ -145,28 +154,42 @@
           echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV
           bash ./runners/launch_${RUNNER_NAME%%_*}.sh
 
-          FOUND_RESULT_FILE=
-          for i in {1..10}; do
-            if [ -f "$RESULT_FILENAME.json" ]; then
-              FOUND_RESULT_FILE=true
-              break
+
+          if [ "${{ inputs.eval-only }}" = "true" ]; then
+            echo "Eval-only mode: skipping benchmark result file check"
+            # Verify eval produced results
+            if ! ls results*.json 1>/dev/null 2>&1; then
+              echo "Eval-only run failed: no results*.json files found." >&2
+              exit 1
             fi
-            echo "Waiting for result file... (attempt $i)"
-            sleep 1
-          done
+            # Verify eval scores meet per-benchmark minimum thresholds
+            python3 utils/evals/validate_scores.py
+          else
+            FOUND_RESULT_FILE=
+            for i in {1..10}; do
+              if [ -f "$RESULT_FILENAME.json" ]; then
+                FOUND_RESULT_FILE=true
+                break
+              fi
+              echo "Waiting for result file... (attempt $i)"
+              sleep 1
+            done
 
-          if [ -z "$FOUND_RESULT_FILE" ]; then
-            echo "Run failed: Benchmark result $RESULT_FILENAME.json not found." >&2
-            exit 1
+            if [ -z "$FOUND_RESULT_FILE" ]; then
+              echo "Run failed: Benchmark result $RESULT_FILENAME.json not found." >&2
+              exit 1
+            fi
           fi
 
       - name: Process result
+        if: ${{ !inputs.eval-only }}
         env:
           RUNNER_TYPE: ${{ inputs.runner }}
         run: |
           python3 utils/process_result.py
 
      - name: Upload result
+        if: ${{ !inputs.eval-only }}
         uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
         with:
           name: bmk_${{ env.RESULT_FILENAME }}
@@ -176,7 +199,7 @@
         if: always()
         uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
         with:
-          name: server_logs_${{ env.RESULT_FILENAME }}
+          name: ${{ inputs.eval-only && 'eval_server_logs_' || 'server_logs_' }}${{ env.RESULT_FILENAME }}
           path: server.log
           if-no-files-found: ignore
 
@@ -184,12 +207,12 @@
         if: always()
         uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
         with:
-          name: gpu_metrics_${{ env.RESULT_FILENAME }}
+          name: ${{ inputs.eval-only && 'eval_gpu_metrics_' || 'gpu_metrics_' }}${{ env.RESULT_FILENAME }}
           path: gpu_metrics.csv
           if-no-files-found: ignore
 
       - name: Upload eval results (if any)
-        if: ${{ env.RUN_EVAL == 'true' }}
+        if: ${{ always() && (env.RUN_EVAL == 'true' || inputs.eval-only) }}
         uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
         with:
           name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }}
@@ -197,14 +220,15 @@
             meta_env.json
             results*.json
             sample*.jsonl
-          if-no-files-found: ignore
+          if-no-files-found: ${{ inputs.eval-only && 'error' || 'ignore' }}
 
       - name: Cleanup eval outputs (post-upload)
-        if: ${{ env.RUN_EVAL == 'true' }}
+        if: ${{ always() && (env.RUN_EVAL == 'true' || inputs.eval-only) }}
         run: |
           rm -f meta_env.json || true
           # Remove any eval results JSONs that were moved into workspace
           rm -f results*.json || true
+          rm -f sample*.jsonl || true
 
       - name: Resource cleanup (post-run)
         if: always()
diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml
index 1be4b1b98..b5b474471 100644
--- a/.github/workflows/claude.yml
+++ b/.github/workflows/claude.yml
@@ -97,7 +97,7 @@ jobs:
           The `generate-cli-command` input accepts arguments for `generate_sweep_configs.py`. Usage: `generate_sweep_configs.py` `[-h]` `{full-sweep,runner-model-sweep,test-config}`
 
           **Subcommand reference:**
-          - `full-sweep`: Use this subcommand with filter flags like `--model-prefix`, `--framework`, `--precision`, `--runner-type`, `--min-conc`, `--max-conc`, `--seq-len`. This is the primary subcommand for running benchmarks.
+          - `full-sweep`: Use this subcommand with filter flags like `--model-prefix`, `--framework`, `--precision`, `--runner-type`, `--min-conc`, `--max-conc`, `--seq-lens`. This is the primary subcommand for running benchmarks.
           - `test-config`: Use this subcommand ONLY when prompted to with 'test-config'. Uses the flags `--config-files` and `--config-keys`, does NOT accept any other arguments.
Examples: @@ -119,7 +119,7 @@ jobs: **Specify concurrency and sequence length:** ``` - generate-cli-command: "full-sweep --config-files .github/configs/nvidia-master.yaml --single-node --model-prefix dsr1 --min-conc 4 --max-conc 4 --seq-len 1k1k" + generate-cli-command: "full-sweep --config-files .github/configs/nvidia-master.yaml --single-node --model-prefix dsr1 --min-conc 4 --max-conc 4 --seq-lens 1k1k" ``` **Test specific config keys (MUST USE `--conc`):** @@ -130,7 +130,7 @@ jobs: **IMPORTANT: Keep runs precise and efficient:** - Use `full-sweep` with filter flags to narrow down the benchmark scope - "full-sweep" does NOT mean running everything - When using `full-sweep`, you must use `--min-conc` and `--max-conc` together to specify a single concurrency value. Unless prompted otherwise, use `--min-conc 4 --max-conc 4` - - When using `full-sweep`, you can use `--seq-len` to specify a single sequence length (choices: 1k1k, 1k8k, 8k1k). Unless prompted otherwise, use `--seq-len 1k1k` + - When using `full-sweep`, you can use `--seq-lens` to specify sequence lengths (choices: 1k1k, 8k1k). Unless prompted otherwise, use `--seq-lens 1k1k` - Use `test-config` ONLY when given specific config keys to test - Use `--config-files`, `--config-keys`, and `--conc` flags ONLY - Always filter by specific models, frameworks, precision, conc, or config keys when possible @@ -291,4 +291,3 @@ jobs: # Then use $EP in the vllm serve command ``` This ensures the script respects the `ep` setting in the master config YAML's search-space. - diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 505bb515d..d6ecf76b0 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -37,6 +37,7 @@ jobs: outputs: single-node-config: ${{ steps.get-jobs.outputs.single-node-config }} multi-node-config: ${{ steps.get-jobs.outputs.multi-node-config }} + eval-config: ${{ steps.get-jobs.outputs.eval-config }} steps: - name: Checkout code (ref) if: ${{ inputs.ref && inputs.ref != '' }} @@ -53,10 +54,12 @@ jobs: pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py \ ${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }}) - SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x]))") + SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and not x.get('run-eval', False)]))") MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x]))") + EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('run-eval', False)]))") echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT echo "multi-node-config=$MULTI" >> $GITHUB_OUTPUT + echo "eval-config=$EVALS" >> $GITHUB_OUTPUT test-sweep-multi-node: needs: get-jobs @@ -123,7 +126,38 @@ jobs: conc: ${{ matrix.config.conc }} spec-decoding: ${{ matrix.config.spec-decoding }} disagg: ${{ matrix.config.disagg }} - run-eval: ${{ matrix.config.run-eval }} + run-eval: false + ref: ${{ inputs.ref }} + + test-sweep-evals: + needs: get-jobs + if: ${{ needs.get-jobs.outputs.eval-config != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + name: eval / + strategy: + fail-fast: false + matrix: + config: ${{ 
fromJson(needs.get-jobs.outputs.eval-config) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + run-eval: true + eval-only: true ref: ${{ inputs.ref }} collect-results: @@ -135,8 +169,8 @@ jobs: result-prefix: "bmk" collect-evals: - needs: [test-sweep-multi-node, test-sweep-single-node] - if: ${{ always() }} + needs: [test-sweep-evals] + if: ${{ always() && needs.test-sweep-evals.result != 'skipped' }} uses: ./.github/workflows/collect-evals.yml secrets: inherit diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml index d72f54b8f..64e4ea531 100644 --- a/.github/workflows/profile.yml +++ b/.github/workflows/profile.yml @@ -35,6 +35,8 @@ env: HF_HUB_CACHE: '/mnt/hf_hub_cache/' RANDOM_RANGE_RATIO: '0.8' PERFETTO_RELAY_URL: https://semianalysisai.github.io/InferenceX-trace-storage + PYTHONDONTWRITEBYTECODE: '1' + PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache jobs: get-jobs: @@ -87,7 +89,7 @@ jobs: - name: Fail if no matching entries if: ${{ steps.filter.outputs.count == '0' }} run: | - echo "No entries produced for config-key=${{ inputs.config-key }}, seq-lens=${{ inputs.seq-lens }}, conc=${{ inputs.conc }}." >&2 + echo "No entries produced for config-key=${{ inputs.config-key }}, conc=${{ inputs.conc }}." 
>&2 exit 1 profile: @@ -153,6 +155,7 @@ jobs: with: fetch-depth: 0 ref: ${{ inputs.ref || github.ref }} + clean: false - name: Launch + Profile (single-node sglang/vllm) id: run diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index 683056747..4d61a918c 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -106,18 +106,6 @@ jobs: decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} - sweep-multi-node-1k8k: - needs: setup - if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['1k8k']) != 'null' }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: multi-node 1k8k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.setup.outputs.search-space-config).multi_node['1k8k'] }} - secrets: inherit - with: *multi-node-inputs - sweep-multi-node-8k1k: needs: setup if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['8k1k']) != 'null' }} @@ -159,38 +147,54 @@ jobs: disagg: ${{ matrix.config.disagg }} run-eval: ${{ matrix.config.run-eval }} - sweep-single-node-1k8k: + sweep-single-node-8k1k: needs: setup - if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).single_node['1k8k']) != 'null' }} + if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).single_node['8k1k']) != 'null' }} uses: ./.github/workflows/benchmark-tmpl.yml - name: single-node 1k8k / + name: single-node 8k1k / strategy: fail-fast: false matrix: - config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['1k8k'] }} + config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['8k1k'] }} secrets: inherit with: *single-node-inputs - sweep-single-node-8k1k: + sweep-evals: needs: setup - if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).single_node['8k1k']) != 'null' }} + if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).evals) != '[]' && toJson(fromJson(needs.setup.outputs.search-space-config).evals) != 'null' }} uses: ./.github/workflows/benchmark-tmpl.yml - name: single-node 8k1k / + name: eval / strategy: fail-fast: false matrix: - config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['8k1k'] }} + config: ${{ fromJson(needs.setup.outputs.search-space-config).evals }} secrets: inherit - with: *single-node-inputs + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + run-eval: true + eval-only: true collect-results: needs: [ sweep-single-node-1k1k, - sweep-single-node-1k8k, sweep-single-node-8k1k, sweep-multi-node-1k1k, - sweep-multi-node-1k8k, sweep-multi-node-8k1k, setup, ] @@ -201,17 +205,8 @@ jobs: result-prefix: "bmk" collect-evals: - needs: - [ - sweep-single-node-1k1k, - sweep-single-node-1k8k, - sweep-single-node-8k1k, - sweep-multi-node-1k1k, - sweep-multi-node-1k8k, - sweep-multi-node-8k1k, - setup, - ] - if: ${{ always() && 
needs.setup.result != 'skipped' }} + needs: [sweep-evals, setup] + if: ${{ always() && needs.setup.result != 'skipped' && needs.sweep-evals.result != 'skipped' }} uses: ./.github/workflows/collect-evals.yml secrets: inherit @@ -221,10 +216,12 @@ jobs: runs-on: ubuntu-latest steps: - name: Extract and save changelog metadata - env: - CONFIG_JSON: ${{ needs.setup.outputs.search-space-config }} run: | - echo "$CONFIG_JSON" | jq '.changelog_metadata' > changelog_metadata.json + cat <<'CONFIGEOF' > _full_config.json + ${{ needs.setup.outputs.search-space-config }} + CONFIGEOF + jq '.changelog_metadata' _full_config.json > changelog_metadata.json + rm -f _full_config.json - name: Upload changelog artifact uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 diff --git a/AGENTS.md b/AGENTS.md index 6bb4a86c8..94c28e334 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -75,27 +75,27 @@ python -m pytest matrix_logic/ -v ```bash # Full sweep with all configs python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml + --config-files .github/configs/nvidia-master.yaml # Filter by model prefix (dsr1 or gptoss) python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ - --model dsr1 + --config-files .github/configs/nvidia-master.yaml \ + --model-prefix dsr1 # Filter by framework (sglang, trt, vllm, atom, dynamo-trt, dynamo-sglang) python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ + --config-files .github/configs/nvidia-master.yaml \ --framework sglang # Filter by precision (fp4, fp8) python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ + --config-files .github/configs/nvidia-master.yaml \ --precision fp8 # Filter by runner type (b200, h100, h200, gb200, mi300x, mi325x, mi355x) python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ - --runner b200 + --config-files .github/configs/nvidia-master.yaml \ + --runner-type b200 ``` ### Processing Results @@ -140,7 +140,6 @@ When working with benchmark configurations, use these valid values: **Sequence Lengths (ISL/OSL)**: - `1k1k` - 1024 input / 1024 output -- `1k8k` - 1024 input / 8192 output - `8k1k` - 8192 input / 1024 output ## Code Conventions @@ -266,7 +265,7 @@ dsr1-fp8-h200-dynamo-sglang: **7. Validate configuration:** ```bash python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ + --config-files .github/configs/nvidia-master.yaml \ --framework dynamo-sglang ``` @@ -296,18 +295,18 @@ When upgrading Docker images in benchmark scripts and master configs .yaml: ## Evals (Accuracy Validation) -Evals run optional accuracy checks after throughput benchmarks to ensure model outputs aren't degraded by inference optimizations. +Evals run optional accuracy checks to ensure model outputs aren't degraded by inference optimizations. They can run alongside benchmarks or independently in eval-only mode. ### When Evals Run -Evals are **off by default** (`RUN_EVAL=false`). When enabled, they run for two representative points per configuration group: +Evals are **off by default** (`RUN_EVAL=false`). 
When enabled, they run at two concurrency levels per configuration group: -- **Lowest TP with highest concurrency** per (model, runner, framework, precision, ISL, OSL, spec-decoding) -- **Highest TP with highest concurrency** per (model, runner, framework, precision, ISL, OSL, spec-decoding) +- **Highest concurrency** per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn) +- **Lower-median concurrency** per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn) This selection logic is in `mark_eval_entries()` in `utils/matrix_logic/generate_sweep_configs.py`. -**Note**: Evals only run on `1k8k` sequence length. +**Note**: Evals only run on `8k1k` sequence length. ### Eval Framework: lm-eval @@ -316,30 +315,42 @@ The default eval framework is [lm-evaluation-harness](https://github.com/Eleuthe ### Running Evals via CLI ```bash -# Generate configs with evals marked (in addition to all configs) +# Generate configs (evals marked by default on 8k1k subset) python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ - --run-evals + --config-files .github/configs/nvidia-master.yaml + +# Generate throughput-only configs (skip evals) +python utils/matrix_logic/generate_sweep_configs.py full-sweep \ + --config-files .github/configs/nvidia-master.yaml \ + --no-evals # Generate ONLY the eval subset (excludes non-eval configs) python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ + --config-files .github/configs/nvidia-master.yaml \ --evals-only ``` ### Eval Integration in Benchmark Scripts -All benchmark scripts in `benchmarks/` follow this pattern: +All benchmark scripts in `benchmarks/` follow one of two flows: ```bash +# Combined mode (benchmark + eval): # 1. Start server # 2. wait_for_server_ready # 3. run_benchmark_serving (throughput) # 4. Conditionally run evals: if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC - append_lm_eval_summary # Writes meta_env.json and moves artifacts + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary fi + +# Eval-only mode (EVAL_ONLY=true): +# 1. Compute expanded context via compute_eval_context_length +# 2. Start server with expanded context (--context-length or --max-model-len) +# 3. wait_for_server_ready +# 4. run_benchmark_serving returns immediately (skipped) +# 5. 
run_eval + append_lm_eval_summary ``` ### Key Eval Functions in `benchmarks/benchmark_lib.sh` @@ -351,6 +362,8 @@ fi | `append_lm_eval_summary` | Writes `meta_env.json` and moves eval artifacts to workspace | | `_install_lm_eval_deps` | Installs lm-eval dependencies | | `_patch_lm_eval` | Patches lm-eval for reasoning tokens and TRT compatibility | +| `compute_eval_context_length` | Computes eval context length (5x benchmark context, capped at model native max) | +| `get_native_max_context_length` | Extracts model's native max context length from HF config | ### Eval Results Collection @@ -390,16 +403,18 @@ cat ./evals/agg_eval_all.json | jq '[.[] | select(.hw == "B200")]' | Variable | Default | Description | |----------|---------|-------------| -| `RUN_EVAL` | `false` | Enable eval after throughput | +| `RUN_EVAL` | `false` | Enable eval after throughput benchmark | +| `EVAL_ONLY` | `false` | Skip throughput, only run evals (set by workflow) | | `EVAL_FRAMEWORK` | `lm-eval` | Eval framework to use | -| `EVAL_TASK` | `gsm8k` | Task definition file (without `.yaml`) | -| `NUM_FEWSHOT` | `2` | Number of few-shot examples | +| `EVAL_TASKS_DIR` | `utils/evals/gsm8k.yaml` | Path to lm-eval task YAML | | `EVAL_RESULT_DIR` | `/tmp/eval_out-*` | Output directory for eval results | +| `EVAL_MAX_MODEL_LEN` | `16384` | Max context for eval (set by `compute_eval_context_length`) | +| `EVAL_CONCURRENT_REQUESTS` | `64` | Concurrent requests during eval | ### Adding a New Eval Task 1. Create a task YAML in `utils/evals/` (follow lm-eval task format) -2. Set `EVAL_TASK=<task-name>` when running benchmarks +2. Set `EVAL_TASKS_DIR=utils/evals/<task-name>.yaml` when running benchmarks 3. Update `utils/collect_eval_results.py` if new metrics need extraction ### lm-eval Patches diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index f69d3c418..535313252 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -2,6 +2,13 @@ # Shared benchmarking utilities for InferenceMAX +# Keep Python bytecode out of the mounted workspace. Benchmark jobs often run as +# root inside containers, and root-owned cache directories break future checkout +# cleanup on self-hosted runners. +export PYTHONDONTWRITEBYTECODE=1 +export PYTHONPYCACHEPREFIX="${PYTHONPYCACHEPREFIX:-/tmp/inferencex-pycache}" +mkdir -p "$PYTHONPYCACHEPREFIX" 2>/dev/null || true + # -------------------------------- # GPU monitoring helpers # -------------------------------- @@ -174,6 +181,12 @@ wait_for_server_ready() { # --trust-remote-code: Optional flag to trust remote code from HuggingFace # --server-pid: Optional server process ID to monitor during benchmark run_benchmark_serving() { + # In eval-only mode, skip the throughput benchmark entirely. + if [ "${EVAL_ONLY}" = "true" ]; then + echo "EVAL_ONLY mode: skipping throughput benchmark" + return 0 + fi + set +x local model="" local port="" @@ -486,6 +499,10 @@ move_profile_trace_for_relay() { # ------------------------------ _install_lm_eval_deps() { + # torchvision causes circular imports in ATOM; TRT-LLM/SGLang need it at module level.
+ if [[ "${IMAGE:-}" == *atom* ]]; then + python3 -m pip uninstall -y torchvision 2>/dev/null || true + fi python3 -m pip install -q --no-cache-dir --break-system-packages "lm-eval[api]" || true local lm_eval_ref="b315ef3b05176acc9732bb7fdec116abe1ecc476" if command -v git >/dev/null 2>&1; then @@ -574,26 +591,74 @@ PY export PYTHONPATH="${patch_dir}:${PYTHONPATH:-}" } +get_native_max_context_length() { + local model_path="$1" + python3 -c " +from transformers import AutoConfig +config = AutoConfig.from_pretrained('${model_path}', trust_remote_code=True) +for attr in ['max_position_embeddings', 'max_sequence_length', 'seq_length', 'n_positions']: + if hasattr(config, attr): + print(getattr(config, attr)) + break +else: + print(0) +" +} + +# Compute the context length for eval-only mode. +# Uses 5x the benchmark context capped at the model's native max. +# Sets EVAL_MAX_MODEL_LEN (needed by run_lm_eval). +# Echoes the computed value for scripts to capture. +# +# Usage: local ctx=$(compute_eval_context_length "$MODEL" "${current_ctx}") +compute_eval_context_length() { + local model="$1" + local benchmark_ctx="${2:-0}" + local native_max + native_max=$(get_native_max_context_length "$model") + native_max="${native_max:-0}" + + if [ "$benchmark_ctx" -eq 0 ] 2>/dev/null; then + benchmark_ctx="${native_max:-0}" + fi + local eval_ctx=$(( benchmark_ctx * 1 )) + if [ "$native_max" -gt 0 ] 2>/dev/null && [ "$eval_ctx" -gt "$native_max" ]; then + eval_ctx="$native_max" + fi + # If eval_ctx is still 0 (both benchmark_ctx and native_max were 0), fall back + if [ "$eval_ctx" -le 0 ] 2>/dev/null; then + echo "WARN: compute_eval_context_length could not determine context length for $model" >&2 + eval_ctx="${MAX_MODEL_LEN:-16384}" + fi + EVAL_MAX_MODEL_LEN="$eval_ctx" + echo "$eval_ctx" +} + +# Convenience wrapper: compute eval context from ISL/OSL and export EVAL_MAX_MODEL_LEN. +# Call directly (not in a subshell) so the export persists. +# Scripts then wire $EVAL_MAX_MODEL_LEN into whichever server variable they need. +setup_eval_context() { + EVAL_MAX_MODEL_LEN=$(compute_eval_context_length "$MODEL" "$((ISL + OSL + 200))") + export EVAL_MAX_MODEL_LEN +} + run_lm_eval() { local port="${PORT:-8888}" - local task="${EVAL_TASK:-gsm8k}" - local num_fewshot="${NUM_FEWSHOT:-2}" + local tasks_dir="${EVAL_TASKS_DIR:-utils/evals/gsm8k.yaml}" local results_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}" - local gen_max_tokens=16384 + local eval_context_len="${EVAL_MAX_MODEL_LEN:-16384}" local temperature=0 local top_p=1 - local concurrent_requests=32 + local concurrent_requests="${EVAL_CONCURRENT_REQUESTS:-64}" while [[ $# -gt 0 ]]; do case $1 in --port) port="$2"; shift 2 ;; - --task) task="$2"; shift 2 ;; - --num-fewshot) num_fewshot="$2"; shift 2 ;; + --task) tasks_dir="$2"; shift 2 ;; --results-dir) results_dir="$2"; shift 2 ;; - --gen-max-tokens) gen_max_tokens="$2"; shift 2 ;; + --gen-max-tokens) eval_context_len="$2"; shift 2 ;; --temperature) temperature="$2"; shift 2 ;; --top-p) top_p="$2"; shift 2 ;; - --concurrent-requests) concurrent_requests="$2"; shift 2 ;; *) echo "Unknown parameter: $1"; return 1 ;; esac done @@ -606,16 +671,23 @@ run_lm_eval() { export OPENAI_API_KEY=${OPENAI_API_KEY:-EMPTY} MODEL_NAME=${MODEL_NAME:-$MODEL} # Prefer MODEL_NAME, else MODEL + # Cap output tokens: must fit within context window (leave room for input), + # and avoid excessive KV cache reservation per request on TRT. + local max_output_tokens=$(( eval_context_len > 4096 ? 
eval_context_len - 4096 : eval_context_len / 2 )) + if [ "$max_output_tokens" -gt 16384 ]; then + max_output_tokens=16384 + fi + echo "Eval budget: eval_context_len=${eval_context_len}, max_output_tokens=${max_output_tokens}" + # Export for append_lm_eval_summary to pick up export EVAL_RESULT_DIR="$results_dir" set -x python3 -m lm_eval --model local-chat-completions --apply_chat_template \ - --tasks "utils/evals/${task}.yaml" \ - --num_fewshot "${num_fewshot}" \ + --tasks "${tasks_dir}" \ --output_path "${results_dir}" \ --log_samples \ - --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=5,num_concurrent=${concurrent_requests},timeout=600,tokenized_requests=False,max_length=${gen_max_tokens}" \ - --gen_kwargs "max_tokens=8192,temperature=${temperature},top_p=${top_p}" + --model_args "model=${MODEL_NAME},base_url=${openai_chat_base},api_key=${OPENAI_API_KEY},eos_string=,max_retries=5,num_concurrent=${concurrent_requests},timeout=1800,tokenized_requests=False,max_length=${eval_context_len}" \ + --gen_kwargs "max_tokens=${max_output_tokens},temperature=${temperature},top_p=${top_p}" local eval_exit=$? set +x return $eval_exit @@ -623,8 +695,15 @@ run_lm_eval() { append_lm_eval_summary() { local results_dir="${EVAL_RESULT_DIR}" + if [ -z "${results_dir}" ]; then + echo "WARN: EVAL_RESULT_DIR is empty; skipping artifact collection" >&2 + return 1 + fi local out_dir="${results_dir}" - mkdir -p "$out_dir" || true + if [ ! -d "${out_dir}" ]; then + echo "WARN: EVAL_RESULT_DIR='${out_dir}' does not exist; skipping artifact collection" >&2 + return 1 + fi # Write minimal meta for collectors that expect it local meta_json="${out_dir}/meta_env.json" @@ -672,13 +751,13 @@ META # Move eval artifacts into PWD (no new directories in workspace) if [ -f "${meta_json}" ]; then - mv -f "${meta_json}" ./ || true + mv -f "${meta_json}" ./ || echo "WARN: failed to move ${meta_json}" >&2 fi if [ -d "${out_dir}" ]; then while IFS= read -r -d '' jf; do base=$(basename "$jf") if [ "$base" != "meta_env.json" ]; then - mv -f "$jf" ./ || true + mv -f "$jf" ./ || echo "WARN: failed to move ${jf}" >&2 fi done < <(find "${out_dir}" -type f -name "*.json*" -print0 2>/dev/null) fi @@ -706,8 +785,23 @@ run_eval() { esac done + # Compute EVAL_MAX_MODEL_LEN if not already set by the calling script + if [ -z "${EVAL_MAX_MODEL_LEN:-}" ]; then + compute_eval_context_length "$MODEL" "${MAX_MODEL_LEN:-0}" > /dev/null + fi + + local eval_rc=0 case "$framework" in - lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" ;; - *) echo "Unknown framework '${framework}'"; return 1 ;; + lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" || eval_rc=$? 
;; + *) echo "Unknown framework '${framework}'"; eval_rc=1 ;; esac + + if [ "$eval_rc" -ne 0 ]; then + echo "ERROR: run_eval failed with exit code $eval_rc" >&2 + if [ "${EVAL_ONLY}" = "true" ]; then + echo "Eval-only mode: failing after artifact collection" >&2 + return "$eval_rc" + fi + fi + return $eval_rc } diff --git a/benchmarks/single_node/dsr1_fp4_b200.sh b/benchmarks/single_node/dsr1_fp4_b200.sh index d98fb8e2b..d88941628 100644 --- a/benchmarks/single_node/dsr1_fp4_b200.sh +++ b/benchmarks/single_node/dsr1_fp4_b200.sh @@ -31,6 +31,11 @@ else fi echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -40,7 +45,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0. --cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \ --chunked-prefill-size 16384 \ --ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ ---enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 > $SERVER_LOG 2>&1 & +--enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -63,7 +68,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp4_b200_trt.sh b/benchmarks/single_node/dsr1_fp4_b200_trt.sh index 036c2998e..7a9706d30 100644 --- a/benchmarks/single_node/dsr1_fp4_b200_trt.sh +++ b/benchmarks/single_node/dsr1_fp4_b200_trt.sh @@ -77,6 +77,12 @@ MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 )) MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? 
MAX_NUM_TOKENS : 8192 )) +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi + if [[ "$PIECEWISE_CUDA_GRAPHS" == "true" ]]; then # [2^i for i in range(8)] + [i for i in range(256, max_num_tokens, 256)] + [max_num_tokens] capture_tokens=(1 2 4 8 16 32 64 128) @@ -120,7 +126,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp4_b200_trt_mtp.sh b/benchmarks/single_node/dsr1_fp4_b200_trt_mtp.sh index 2a0320e53..59e5a3930 100644 --- a/benchmarks/single_node/dsr1_fp4_b200_trt_mtp.sh +++ b/benchmarks/single_node/dsr1_fp4_b200_trt_mtp.sh @@ -76,10 +76,6 @@ if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then elif [[ $CONC == 128 && $DP_ATTENTION == "false" ]]; then PIECEWISE_CUDA_GRAPHS="true" fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC == 64 ]]; then - PIECEWISE_CUDA_GRAPHS="true" - fi fi if [[ "$PIECEWISE_CUDA_GRAPHS" == "true" ]]; then @@ -101,6 +97,12 @@ fi # end of set of configs using piecewise_cuda_graphs # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi + set -x # Launch TRT-LLM server mpirun -n 1 --oversubscribe --allow-run-as-root \ @@ -134,7 +136,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp4_mi355x.sh b/benchmarks/single_node/dsr1_fp4_mi355x.sh index 58c1118eb..578a6c810 100644 --- a/benchmarks/single_node/dsr1_fp4_mi355x.sh +++ b/benchmarks/single_node/dsr1_fp4_mi355x.sh @@ -30,6 +30,11 @@ fi SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -44,7 +49,7 @@ python3 -m sglang.launch_server --model-path=$MODEL --trust-remote-code \ --max-prefill-tokens=$PREFILL_SIZE \ --cuda-graph-max-bs=128 \ --attention-backend aiter \ ---kv-cache-dtype fp8_e4m3 > $SERVER_LOG 2>&1 & +--kv-cache-dtype fp8_e4m3 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -65,7 +70,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp4_mi355x_atom.sh b/benchmarks/single_node/dsr1_fp4_mi355x_atom.sh index 08f579244..31554fc22 100644 --- a/benchmarks/single_node/dsr1_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/dsr1_fp4_mi355x_atom.sh @@ -31,6 +31,11 @@ else CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 " fi +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN=" --max-model-len $EVAL_MAX_MODEL_LEN " +fi + if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" else @@ -69,7 +74,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp4_mi355x_atom_mtp.sh b/benchmarks/single_node/dsr1_fp4_mi355x_atom_mtp.sh index af1ab6aa4..1d557684e 100644 --- a/benchmarks/single_node/dsr1_fp4_mi355x_atom_mtp.sh +++ b/benchmarks/single_node/dsr1_fp4_mi355x_atom_mtp.sh @@ -31,6 +31,11 @@ else CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 " fi +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN=" --max-model-len $EVAL_MAX_MODEL_LEN " +fi + if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" else @@ -72,7 +77,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_b200.sh b/benchmarks/single_node/dsr1_fp8_b200.sh index 7b4be6b2b..e6d8a0e9c 100644 --- a/benchmarks/single_node/dsr1_fp8_b200.sh +++ b/benchmarks/single_node/dsr1_fp8_b200.sh @@ -63,6 +63,11 @@ else fi echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -72,7 +77,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \ --mem-fraction-static $MEM_FRAC_STATIC --kv-cache-dtype fp8_e4m3 --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \ --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL --disable-radix-cache \ ---attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8 > $SERVER_LOG 2>&1 & +--attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
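For reference, the context expansion that `setup_eval_context` wires into `EVAL_CONTEXT_ARGS` above reduces to a few lines of arithmetic. A minimal Python sketch, assuming the behavior documented for `compute_eval_context_length` (5x the benchmark context, capped at the model's native window, with a 16384 fallback when neither is known); the bash in `benchmarks/benchmark_lib.sh` remains authoritative:

```python
# Sketch of the documented compute_eval_context_length rule (illustrative only).
def eval_context_length(benchmark_ctx: int, native_max: int, fallback: int = 16384) -> int:
    if benchmark_ctx <= 0:        # no benchmark context known: start from the native window
        benchmark_ctx = native_max
    eval_ctx = benchmark_ctx * 5  # expand so eval generations are not truncated
    if native_max > 0:
        eval_ctx = min(eval_ctx, native_max)  # never exceed the model's native window
    return eval_ctx if eval_ctx > 0 else fallback

# e.g. an 8k1k benchmark: ISL + OSL + 200 = 8192 + 1024 + 200 = 9416
assert eval_context_length(9416, 163840) == 47080  # native max here is illustrative
```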
@@ -95,7 +100,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_b200_mtp.sh b/benchmarks/single_node/dsr1_fp8_b200_mtp.sh index b5e499ecc..781869bcc 100755 --- a/benchmarks/single_node/dsr1_fp8_b200_mtp.sh +++ b/benchmarks/single_node/dsr1_fp8_b200_mtp.sh @@ -56,6 +56,11 @@ SPECULATIVE_EAGLE_TOPK=1 SGLANG_ENABLE_SPEC_V2=1 +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -85,7 +90,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ --speculative-num-steps $SPECULATIVE_NUM_STEPS \ --speculative-num-draft-tokens $SPECULATIVE_DRAFT_TOKENS \ --speculative-eagle-topk $SPECULATIVE_EAGLE_TOPK \ - > $SERVER_LOG 2>&1 & + $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -109,7 +114,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_b200_trt.sh b/benchmarks/single_node/dsr1_fp8_b200_trt.sh index 8df439973..139aae669 100644 --- a/benchmarks/single_node/dsr1_fp8_b200_trt.sh +++ b/benchmarks/single_node/dsr1_fp8_b200_trt.sh @@ -37,14 +37,6 @@ if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then PIECEWISE_CUDA_GRAPHS="true" DELAY_BATCHING="true" fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -ge 256 ]]; then - CUDA_GRAPH_MAX_BATCH_SIZE=$(( $CONC / 8 )) - MOE_BACKEND="DEEPGEMM" - KV_CACHE_FREE_MEM_FRACTION=0.7 - elif [[ $CONC -ge 128 ]]; then - PIECEWISE_CUDA_GRAPHS="true" - fi elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then if [[ $CONC -ge 64 ]]; then PIECEWISE_CUDA_GRAPHS="true" @@ -100,6 +92,12 @@ MAX_NUM_TOKENS=$(( ($CONC+$ISL+64+63)/64*64 )) MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? 
MAX_NUM_TOKENS : 8192 )) +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi + if [[ "$PIECEWISE_CUDA_GRAPHS" == "true" ]]; then # [2^i for i in range(8)] + [i for i in range(256, max_num_tokens, 256)] + [max_num_tokens] capture_tokens=(1 2 4 8 16 32 64 128) @@ -146,7 +144,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_b200_trt_mtp.sh b/benchmarks/single_node/dsr1_fp8_b200_trt_mtp.sh index c60388848..79f84f8a1 100644 --- a/benchmarks/single_node/dsr1_fp8_b200_trt_mtp.sh +++ b/benchmarks/single_node/dsr1_fp8_b200_trt_mtp.sh @@ -45,10 +45,6 @@ if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then if [[ $CONC -le 4 ]]; then PIECEWISE_CUDA_GRAPHS="false" fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -le 8 ]]; then - PIECEWISE_CUDA_GRAPHS="false" - fi elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then if [[ $CONC -le 16 ]]; then PIECEWISE_CUDA_GRAPHS="false" @@ -89,7 +85,15 @@ attention_dp_config: EOF fi +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + MAX_NUM_TOKENS=$(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 )) +if [ "${EVAL_ONLY}" = "true" ]; then + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi # prep PW CUDA config per the documentation if [[ "$PIECEWISE_CUDA_GRAPHS" == "true" ]]; then @@ -104,10 +108,9 @@ if [[ "$PIECEWISE_CUDA_GRAPHS" == "true" ]]; then cat << EOF >> $EXTRA_CONFIG_FILE torch_compile_config: capture_num_tokens: [${CAPTURE_TOKENS_LIST%, }] - enable_piecewise_cuda_graph: true + enable_piecewise_cuda_graph: true EOF fi - # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -144,7 +147,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_h200.sh b/benchmarks/single_node/dsr1_fp8_h200.sh index fde2cfede..c820d180b 100644 --- a/benchmarks/single_node/dsr1_fp8_h200.sh +++ b/benchmarks/single_node/dsr1_fp8_h200.sh @@ -15,7 +15,7 @@ if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi -pip3 install --user sentencepiece +pip3 install --user --break-system-packages sentencepiece hf download "$MODEL" SERVER_LOG=/workspace/server.log @@ -26,6 +26,12 @@ start_gpu_monitor export TORCH_CUDA_ARCH_LIST="9.0" +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi + set -x if [[ $ISL -eq 1024 && $OSL -eq 1024 ]]; then PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL \ @@ -35,7 +41,7 @@ if [[ $ISL -eq 1024 && $OSL -eq 1024 ]]; then --chunked-prefill-size 32768 --max-prefill-tokens 32768 --mem-fraction-static 0.82 \ --attention-backend flashinfer --stream-interval 10 \ --decode-log-interval 1 \ - > $SERVER_LOG 2>&1 & + $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & else PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL \ --host 0.0.0.0 --port $PORT --trust-remote-code \ @@ -44,7 +50,7 @@ else --chunked-prefill-size 
32768 --max-prefill-tokens 32768 --mem-fraction-static 0.82 \ --attention-backend flashinfer --stream-interval 10 \ --decode-log-interval 1 \ - > $SERVER_LOG 2>&1 & + $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & fi SERVER_PID=$! @@ -66,7 +72,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_h200_trt.sh b/benchmarks/single_node/dsr1_fp8_h200_trt.sh index 5d98aa75e..383b86065 100644 --- a/benchmarks/single_node/dsr1_fp8_h200_trt.sh +++ b/benchmarks/single_node/dsr1_fp8_h200_trt.sh @@ -64,6 +64,12 @@ MAX_NUM_TOKENS=$(( (CONC + ISL + 64 + 63) / 64 * 64 )) MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 )) MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 )) +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi + # Launch TRT-LLM server PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ trtllm-serve $MODEL --port=$PORT \ @@ -94,7 +100,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_h200_trt_mtp.sh b/benchmarks/single_node/dsr1_fp8_h200_trt_mtp.sh index 0ecd48f02..9d0010903 100644 --- a/benchmarks/single_node/dsr1_fp8_h200_trt_mtp.sh +++ b/benchmarks/single_node/dsr1_fp8_h200_trt_mtp.sh @@ -80,6 +80,11 @@ fi MAX_NUM_TOKENS=$(( ((MTP+1)*MAX_BATCH_SIZE+ISL+64+63)/64*64 )) +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" + MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -116,7 +121,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_mi300x.sh b/benchmarks/single_node/dsr1_fp8_mi300x.sh index 41731427e..a5f161960 100644 --- a/benchmarks/single_node/dsr1_fp8_mi300x.sh +++ b/benchmarks/single_node/dsr1_fp8_mi300x.sh @@ -36,6 +36,11 @@ export SGLANG_AITER_MLA_PERSIST=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -50,7 +55,7 @@ python3 -m sglang.launch_server \ --max-prefill-tokens=131072 \ --kv-cache-dtype fp8_e4m3 \ --attention-backend aiter \ ---disable-radix-cache > $SERVER_LOG 2>&1 & +--disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
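The output-token budget that `run_lm_eval` derives from `EVAL_MAX_MODEL_LEN` (the `max_output_tokens` arithmetic in `benchmark_lib.sh` above) is easiest to see with concrete numbers; the same expression in Python:

```python
# Same budget rule as run_lm_eval: leave ~4k tokens of headroom for the prompt,
# halve tiny windows, and cap at 16384 to bound per-request KV reservation on TRT.
def max_output_tokens(eval_context_len: int) -> int:
    budget = eval_context_len - 4096 if eval_context_len > 4096 else eval_context_len // 2
    return min(budget, 16384)

assert max_output_tokens(47080) == 16384  # large window hits the hard cap
assert max_output_tokens(10240) == 6144   # 10240 - 4096
assert max_output_tokens(4096) == 2048    # small window: half for output
```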
@@ -71,7 +76,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_mi325x.sh b/benchmarks/single_node/dsr1_fp8_mi325x.sh index 6870fe060..ae1e930f0 100644 --- a/benchmarks/single_node/dsr1_fp8_mi325x.sh +++ b/benchmarks/single_node/dsr1_fp8_mi325x.sh @@ -29,6 +29,12 @@ export SGLANG_AITER_MLA_PERSIST=1 # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi + set -x python3 -m sglang.launch_server \ --model-path=$MODEL --host=0.0.0.0 --port=$PORT --trust-remote-code \ @@ -41,7 +47,7 @@ python3 -m sglang.launch_server \ --kv-cache-dtype fp8_e4m3 \ --attention-backend aiter \ --disable-radix-cache \ -> $SERVER_LOG 2>&1 & +$EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -62,7 +68,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_mi355x.sh b/benchmarks/single_node/dsr1_fp8_mi355x.sh index 1d00957e4..d629437cf 100644 --- a/benchmarks/single_node/dsr1_fp8_mi355x.sh +++ b/benchmarks/single_node/dsr1_fp8_mi355x.sh @@ -27,6 +27,11 @@ export ROCM_QUICK_REDUCE_QUANTIZATION=INT4 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -42,7 +47,7 @@ python3 -m sglang.launch_server \ --num-continuous-decode-steps 4 \ --max-prefill-tokens 196608 \ --kv-cache-dtype fp8_e4m3 \ - --cuda-graph-max-bs "$CONC" > $SERVER_LOG 2>&1 & + --cuda-graph-max-bs "$CONC" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
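The eval-selection rule described in AGENTS.md above (highest plus lower-median concurrency per configuration group) lives in `mark_eval_entries()` in `utils/matrix_logic/generate_sweep_configs.py`. A Python sketch of that rule; the grouping keys and the exact reading of "lower-median" here are assumptions for illustration, not the real implementation:

```python
from collections import defaultdict

# Illustrative sketch only; mark_eval_entries() is authoritative.
GROUP_KEYS = ("model", "runner", "framework", "precision",
              "isl", "osl", "spec-decoding", "dp-attn")

def mark_eval_entries(entries: list[dict]) -> None:
    groups = defaultdict(list)
    for entry in entries:
        groups[tuple(entry.get(k) for k in GROUP_KEYS)].append(entry)
    for group in groups.values():
        concs = sorted({e["conc"] for e in group})
        picks = {concs[-1], concs[(len(concs) - 1) // 2]}  # highest + lower-median
        for e in group:
            e["run-eval"] = e["conc"] in picks
```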
@@ -63,7 +68,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_mi355x_atom.sh b/benchmarks/single_node/dsr1_fp8_mi355x_atom.sh index 08f579244..31554fc22 100644 --- a/benchmarks/single_node/dsr1_fp8_mi355x_atom.sh +++ b/benchmarks/single_node/dsr1_fp8_mi355x_atom.sh @@ -31,6 +31,11 @@ else CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 " fi +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN=" --max-model-len $EVAL_MAX_MODEL_LEN " +fi + if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" else @@ -69,7 +74,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/dsr1_fp8_mi355x_atom_mtp.sh b/benchmarks/single_node/dsr1_fp8_mi355x_atom_mtp.sh index dfb8fafdc..86381bc52 100644 --- a/benchmarks/single_node/dsr1_fp8_mi355x_atom_mtp.sh +++ b/benchmarks/single_node/dsr1_fp8_mi355x_atom_mtp.sh @@ -31,6 +31,11 @@ else CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 " fi +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN=" --max-model-len $EVAL_MAX_MODEL_LEN " +fi + if [ "$EP_SIZE" -gt 1 ]; then EP=" --enable-expert-parallel" else @@ -71,7 +76,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/glm5_fp8_b200.sh b/benchmarks/single_node/glm5_fp8_b200.sh index 5d09645c8..4ca4a215d 100755 --- a/benchmarks/single_node/glm5_fp8_b200.sh +++ b/benchmarks/single_node/glm5_fp8_b200.sh @@ -30,6 +30,11 @@ PORT=${PORT:-8888} echo "EP_SIZE: $EP_SIZE, CONC: $CONC, ISL: $ISL, OSL: $OSL" +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -49,7 +54,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. --chunked-prefill-size 32768 --max-prefill-tokens 32768 \ --enable-flashinfer-allreduce-fusion --disable-radix-cache \ --stream-interval 30 \ ---model-loader-extra-config '{"enable_multithread_load": true}' > $SERVER_LOG 2>&1 & +--model-loader-extra-config '{"enable_multithread_load": true}' $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
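The three inline `python3 -c` filters in the `e2e-tests.yml` `get-jobs` step above partition one generated config list into single-node, multi-node, and eval-only matrices. Expanded into readable form (functionally the same predicates; the one-liners in the workflow are authoritative):

```python
import json, sys

# Same partition as the get-jobs step: multi-node entries carry a "prefill"
# block; eval entries are single-node entries the generator marked run-eval.
configs = json.load(sys.stdin)

single = [c for c in configs if "prefill" not in c and not c.get("run-eval", False)]
multi  = [c for c in configs if "prefill" in c]
evals  = [c for c in configs if "prefill" not in c and c.get("run-eval", False)]

json.dump({"single-node-config": single,
           "multi-node-config": multi,
           "eval-config": evals}, sys.stdout)
```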
@@ -72,7 +77,7 @@ run_benchmark_serving \ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/glm5_fp8_h200.sh b/benchmarks/single_node/glm5_fp8_h200.sh index 9194bb870..7a985645f 100644 --- a/benchmarks/single_node/glm5_fp8_h200.sh +++ b/benchmarks/single_node/glm5_fp8_h200.sh @@ -22,6 +22,12 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi + # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -36,7 +42,7 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.85 \ --served-model-name glm-5-fp8 \ --trust-remote-code \ - > "$SERVER_LOG" 2>&1 & + $EVAL_CONTEXT_ARGS > "$SERVER_LOG" 2>&1 & SERVER_PID=$! @@ -60,7 +66,7 @@ run_benchmark_serving \ # Server accepts glm-5-fp8 (--served-model-name); lm-eval must use that model name if [ "${RUN_EVAL}" = "true" ]; then export MODEL_NAME=glm-5-fp8 - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + run_eval --framework lm-eval --port "$PORT" append_lm_eval_summary fi diff --git a/benchmarks/single_node/glm5_fp8_mi355x.sh b/benchmarks/single_node/glm5_fp8_mi355x.sh index ee11463ce..3d82fd856 100755 --- a/benchmarks/single_node/glm5_fp8_mi355x.sh +++ b/benchmarks/single_node/glm5_fp8_mi355x.sh @@ -30,6 +30,11 @@ export SAFETENSORS_FAST_GPU=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -44,7 +49,7 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.85 \ --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \ --nsa-prefill-backend tilelang \ - --nsa-decode-backend tilelang > $SERVER_LOG 2>&1 & + --nsa-decode-backend tilelang $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
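In eval-only mode, `benchmark-tmpl.yml` above gates success on `utils/evals/validate_scores.py`, described as checking each eval score against a per-benchmark minimum. A hypothetical sketch of that shape; the result-file keys and the threshold value below are assumptions for illustration, not the real script:

```python
import glob, json, sys

# Hypothetical threshold table and lm-eval result keys (assumptions only).
MIN_SCORES = {"gsm8k": 0.90}

failed = False
for path in glob.glob("results*.json"):
    with open(path) as f:
        results = json.load(f).get("results", {})
    for task, metrics in results.items():
        floor = MIN_SCORES.get(task)
        score = metrics.get("exact_match,strict-match")
        if floor is not None and score is not None and score < floor:
            print(f"{path}: {task} score {score:.3f} below minimum {floor}", file=sys.stderr)
            failed = True
sys.exit(1 if failed else 0)
```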
@@ -65,7 +70,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/gptoss_fp4_b200.sh b/benchmarks/single_node/gptoss_fp4_b200.sh
index 46fccca6a..f6a6f72e9 100644
--- a/benchmarks/single_node/gptoss_fp4_b200.sh
+++ b/benchmarks/single_node/gptoss_fp4_b200.sh
@@ -26,7 +26,12 @@ if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
 elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then
     CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200))
 else
-    CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240}
+    CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240}
+fi
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
 fi
 
 cat > config.yaml << EOF
@@ -77,7 +82,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/gptoss_fp4_b200_trt.sh b/benchmarks/single_node/gptoss_fp4_b200_trt.sh
index 42fa96a94..c9ba2752c 100644
--- a/benchmarks/single_node/gptoss_fp4_b200_trt.sh
+++ b/benchmarks/single_node/gptoss_fp4_b200_trt.sh
@@ -78,6 +78,12 @@ set -x
 
 MAX_NUM_TOKENS=20000
 
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+    MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN"
+fi
+
 # Launch TRT-LLM server
 mpirun -n 1 --oversubscribe --allow-run-as-root \
     trtllm-serve $MODEL --port=$PORT \
@@ -109,7 +115,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $(( $CONC ))
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/gptoss_fp4_h100.sh b/benchmarks/single_node/gptoss_fp4_h100.sh
index d3831ab06..8d0e773a2 100644
--- a/benchmarks/single_node/gptoss_fp4_h100.sh
+++ b/benchmarks/single_node/gptoss_fp4_h100.sh
@@ -17,11 +17,18 @@ fi
 
 hf download "$MODEL"
 
+MAX_MODEL_LEN=10240
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
+
 cat > config.yaml << EOF
 no-enable-prefix-caching: true
 max-cudagraph-capture-size: 2048
 max-num-batched-tokens: 8192
-max-model-len: 10240
+max-model-len: $MAX_MODEL_LEN
 EOF
 
 export PYTHONNOUSERSITE=1
@@ -60,7 +67,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/gptoss_fp4_h200.sh b/benchmarks/single_node/gptoss_fp4_h200.sh
index fe4aa5d28..2a9359b96 100644
--- a/benchmarks/single_node/gptoss_fp4_h200.sh
+++ b/benchmarks/single_node/gptoss_fp4_h200.sh
@@ -29,7 +29,12 @@ if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
 elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then
     CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200))
 else
-    CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240}
+    CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240}
+fi
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
 fi
 
 # Create config.yaml
@@ -71,7 +76,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/gptoss_fp4_h200_trt.sh b/benchmarks/single_node/gptoss_fp4_h200_trt.sh
index a96b311d8..41dede14b 100644
--- a/benchmarks/single_node/gptoss_fp4_h200_trt.sh
+++ b/benchmarks/single_node/gptoss_fp4_h200_trt.sh
@@ -8,6 +8,7 @@ check_env_vars \
     CONC \
     ISL \
     OSL \
+    MAX_MODEL_LEN \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME \
     DP_ATTENTION \
@@ -48,10 +49,19 @@ print_iter_log: true
 stream_interval: 20
 EOF
 
+MAX_NUM_TOKENS=20000
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+    MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN"
+fi
+
 PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \
 trtllm-serve $MODEL \
 --max_batch_size $CONC \
---max_num_tokens 20000 \
+--max_num_tokens $MAX_NUM_TOKENS \
+--max_seq_len=$MAX_MODEL_LEN \
 --backend pytorch \
 --extra_llm_api_options gptoss-config.yml \
 --ep_size=$EP_SIZE \
@@ -82,7 +92,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/gptoss_fp4_mi300x.sh b/benchmarks/single_node/gptoss_fp4_mi300x.sh
index f71aeb090..56a7823cf 100644
--- a/benchmarks/single_node/gptoss_fp4_mi300x.sh
+++ b/benchmarks/single_node/gptoss_fp4_mi300x.sh
@@ -42,6 +42,10 @@ FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -73,7 +77,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/gptoss_fp4_mi325x.sh b/benchmarks/single_node/gptoss_fp4_mi325x.sh
index f71aeb090..56a7823cf 100644
--- a/benchmarks/single_node/gptoss_fp4_mi325x.sh
+++ b/benchmarks/single_node/gptoss_fp4_mi325x.sh
@@ -42,6 +42,10 @@ FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -73,7 +77,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/gptoss_fp4_mi355x.sh b/benchmarks/single_node/gptoss_fp4_mi355x.sh
index f23949739..37cb358ba 100644
--- a/benchmarks/single_node/gptoss_fp4_mi355x.sh
+++ b/benchmarks/single_node/gptoss_fp4_mi355x.sh
@@ -43,6 +43,10 @@ FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -74,7 +78,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/gptoss_fp4_mi355x_atom.sh b/benchmarks/single_node/gptoss_fp4_mi355x_atom.sh
index cf71cbb3b..76bc87c0c 100644
--- a/benchmarks/single_node/gptoss_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/gptoss_fp4_mi355x_atom.sh
@@ -31,6 +31,11 @@ else
     CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 "
 fi
 
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    CALCULATED_MAX_MODEL_LEN=" --max-model-len $EVAL_MAX_MODEL_LEN "
+fi
+
 if [ "$EP_SIZE" -gt 1 ]; then
     EP=" --enable-expert-parallel"
 else
@@ -70,7 +75,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/kimik2.5_fp4_b200.sh b/benchmarks/single_node/kimik2.5_fp4_b200.sh
index d08e23bb2..4818f246e 100644
--- a/benchmarks/single_node/kimik2.5_fp4_b200.sh
+++ b/benchmarks/single_node/kimik2.5_fp4_b200.sh
@@ -26,6 +26,10 @@ export PYTHONNOUSERSITE=1
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -63,7 +67,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh
index 967003232..c680529e2 100755
--- a/benchmarks/single_node/kimik2.5_fp4_mi355x.sh
+++ b/benchmarks/single_node/kimik2.5_fp4_mi355x.sh
@@ -31,6 +31,11 @@ fi
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
+
 # If the machine runs a MEC FW older than 177, RCCL
 # cannot reclaim some memory.
 # Disable that features to avoid crashes.
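The vLLM- and atom-based scripts above take the second flavor of the same change: rather than appending a flag string, they overwrite the length variable (`MAX_MODEL_LEN` or `CALCULATED_MAX_MODEL_LEN`) before the server command reads it. A condensed sketch, with an illustrative serve line (only the override block is taken from this diff):

```bash
if [ "${EVAL_ONLY}" = "true" ]; then
    setup_eval_context
    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"   # replaces the benchmark-sized value
fi

# Illustrative only; the real scripts pass many more flags.
vllm serve "$MODEL" --tensor-parallel-size "$TP" \
    --max-model-len "$MAX_MODEL_LEN" --port "$PORT"
```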
@@ -93,7 +98,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/kimik2.5_int4_b200.sh b/benchmarks/single_node/kimik2.5_int4_b200.sh
index 6468cc05c..df4c63f6b 100755
--- a/benchmarks/single_node/kimik2.5_int4_b200.sh
+++ b/benchmarks/single_node/kimik2.5_int4_b200.sh
@@ -26,6 +26,10 @@ export VLLM_USE_FLASHINFER_MOE_INT4=1
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -64,7 +68,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/kimik2.5_int4_h200.sh b/benchmarks/single_node/kimik2.5_int4_h200.sh
index 473a1bd73..766fe74a0 100755
--- a/benchmarks/single_node/kimik2.5_int4_h200.sh
+++ b/benchmarks/single_node/kimik2.5_int4_h200.sh
@@ -25,6 +25,11 @@ export PYTHONNOUSERSITE=1
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
+
 # following https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html recipe
 
 # Start GPU monitoring (power, temperature, clocks every second)
@@ -65,7 +70,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/kimik2.5_int4_mi325x.sh b/benchmarks/single_node/kimik2.5_int4_mi325x.sh
index 1a42035a0..a05baddeb 100755
--- a/benchmarks/single_node/kimik2.5_int4_mi325x.sh
+++ b/benchmarks/single_node/kimik2.5_int4_mi325x.sh
@@ -28,6 +28,10 @@ PORT=${PORT:-8888}
 # following AMD andy luo's recipe
 # https://x.com/linluo77/status/2017024513595301985
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -64,7 +68,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/kimik2.5_int4_mi355x.sh b/benchmarks/single_node/kimik2.5_int4_mi355x.sh
index 420f8044a..5e40da700 100755
--- a/benchmarks/single_node/kimik2.5_int4_mi355x.sh
+++ b/benchmarks/single_node/kimik2.5_int4_mi355x.sh
@@ -26,6 +26,10 @@ fi
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -61,7 +65,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_b200.sh b/benchmarks/single_node/minimaxm2.5_fp8_b200.sh
index 3fb39c375..5ea1b8657 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_b200.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_b200.sh
@@ -33,6 +33,10 @@ else
     EP=" "
 fi
 
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -66,7 +70,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h100.sh b/benchmarks/single_node/minimaxm2.5_fp8_h100.sh
index 22fbbab8d..0f024ea9f 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_h100.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_h100.sh
@@ -26,6 +26,11 @@ export PYTHONNOUSERSITE=1
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
+
 if [ "$EP_SIZE" -gt 1 ]; then
     EP=" --enable-expert-parallel"
 else
@@ -66,7 +71,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/minimaxm2.5_fp8_h200.sh
index 03caf30b7..84e73b65c 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_h200.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_h200.sh
@@ -22,6 +22,11 @@ hf download "$MODEL"
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
+
 if [ "$EP_SIZE" -ge 1 ]; then
     EP=" --enable-expert-parallel"
 else
@@ -60,7 +65,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh
index 0fd12d9ed..d03f57c9b 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_mi300x.sh
@@ -28,6 +28,10 @@ export VLLM_ROCM_USE_AITER=1
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -61,7 +65,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh
index 6c9a2ef6b..21abc2e50 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh
@@ -30,6 +30,10 @@ export VLLM_ROCM_USE_AITER=1
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -63,7 +67,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
index 5400ece04..adfb959cf 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
@@ -29,6 +29,11 @@ export VLLM_ROCM_USE_AITER=1
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
+
 if [ "$EP_SIZE" -gt 1 ]; then
     EP=" --enable-expert-parallel"
 else
@@ -68,7 +73,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/qwen3.5_bf16_b200.sh b/benchmarks/single_node/qwen3.5_bf16_b200.sh
index 38785a104..86ce6b66f 100755
--- a/benchmarks/single_node/qwen3.5_bf16_b200.sh
+++ b/benchmarks/single_node/qwen3.5_bf16_b200.sh
@@ -41,6 +41,10 @@ MAX_PREFILL_TOKENS=32768
 CUDA_GRAPH_MAX_BATCH_SIZE=$CONC
 MAX_RUNNING_REQUESTS=128
 CONTEXT_LENGTH=$((ISL + OSL + 20))
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN"
+fi
 
 echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 
@@ -79,7 +83,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/qwen3.5_bf16_mi300x.sh b/benchmarks/single_node/qwen3.5_bf16_mi300x.sh
index ea10647d6..aa74785fe 100755
--- a/benchmarks/single_node/qwen3.5_bf16_mi300x.sh
+++ b/benchmarks/single_node/qwen3.5_bf16_mi300x.sh
@@ -20,6 +20,11 @@ hf download "$MODEL"
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -31,7 +36,7 @@ python3 -m sglang.launch_server \
     --port $PORT \
     --tensor-parallel-size $TP \
     --trust-remote-code \
-    --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 &
+    --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
@@ -52,7 +57,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/qwen3.5_bf16_mi325x.sh b/benchmarks/single_node/qwen3.5_bf16_mi325x.sh
index ea10647d6..aa74785fe 100644
--- a/benchmarks/single_node/qwen3.5_bf16_mi325x.sh
+++ b/benchmarks/single_node/qwen3.5_bf16_mi325x.sh
@@ -20,6 +20,11 @@ hf download "$MODEL"
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -31,7 +36,7 @@ python3 -m sglang.launch_server \
     --port $PORT \
     --tensor-parallel-size $TP \
     --trust-remote-code \
-    --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 &
+    --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
@@ -52,7 +57,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh
index f77390707..701695def 100755
--- a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh
+++ b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh
@@ -20,6 +20,11 @@ hf download "$MODEL"
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -30,7 +35,7 @@ python3 -m sglang.launch_server \
     --port $PORT \
     --tensor-parallel-size $TP \
     --trust-remote-code \
-    --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 &
+    --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
@@ -51,7 +56,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/qwen3.5_fp8_b200.sh b/benchmarks/single_node/qwen3.5_fp8_b200.sh
index 39b020ecc..36e5d579d 100755
--- a/benchmarks/single_node/qwen3.5_fp8_b200.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_b200.sh
@@ -41,6 +41,10 @@ MAX_PREFILL_TOKENS=32768
 CUDA_GRAPH_MAX_BATCH_SIZE=$CONC
 MAX_RUNNING_REQUESTS=128
 CONTEXT_LENGTH=$((ISL + OSL + 20))
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN"
+fi
 
 if [[ $TP -eq 8 ]]; then
     EXTRA_ARGS="--enable-flashinfer-allreduce-fusion"
@@ -87,7 +91,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh
index 1270c76a6..87933b166 100755
--- a/benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_b200_mtp.sh
@@ -48,6 +48,10 @@ SPECULATIVE_EAGLE_TOPK=1
 
 echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -88,7 +92,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/qwen3.5_fp8_h200.sh b/benchmarks/single_node/qwen3.5_fp8_h200.sh
index 2ae26b771..636a8ee92 100644
--- a/benchmarks/single_node/qwen3.5_fp8_h200.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_h200.sh
@@ -23,6 +23,10 @@ hf download "$MODEL"
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
 MAX_SEQ_LEN=$((ISL + OSL + 20))
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_SEQ_LEN="$EVAL_MAX_MODEL_LEN"
+fi
 
 echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN"
 
@@ -76,7 +80,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/qwen3.5_fp8_mi300x.sh b/benchmarks/single_node/qwen3.5_fp8_mi300x.sh
index 0640a20ab..7bff57b61 100755
--- a/benchmarks/single_node/qwen3.5_fp8_mi300x.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_mi300x.sh
@@ -20,6 +20,11 @@ hf download "$MODEL"
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -32,7 +37,7 @@ python3 -m sglang.launch_server \
     --port $PORT \
     --tensor-parallel-size $TP \
     --trust-remote-code \
-    --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 &
+    --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
@@ -53,7 +58,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/qwen3.5_fp8_mi325x.sh b/benchmarks/single_node/qwen3.5_fp8_mi325x.sh
index 0640a20ab..7bff57b61 100755
--- a/benchmarks/single_node/qwen3.5_fp8_mi325x.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_mi325x.sh
@@ -20,6 +20,11 @@ hf download "$MODEL"
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -32,7 +37,7 @@ python3 -m sglang.launch_server \
     --port $PORT \
     --tensor-parallel-size $TP \
     --trust-remote-code \
-    --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 &
+    --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
@@ -53,7 +58,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh
index f77390707..701695def 100644
--- a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh
@@ -20,6 +20,11 @@ hf download "$MODEL"
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -30,7 +35,7 @@ python3 -m sglang.launch_server \
     --port $PORT \
     --tensor-parallel-size $TP \
     --trust-remote-code \
-    --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 &
+    --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
@@ -51,7 +56,7 @@ run_benchmark_serving \
 
 # After throughput, run evaluation only if RUN_EVAL is true
 if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    run_eval --framework lm-eval --port "$PORT"
     append_lm_eval_summary
 fi
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 6b0689cbf..81fc5f100 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -999,7 +999,7 @@
   - "Benchmark script: benchmarks/single_node/glm5_fp8_h200.sh"
   - "Tool-call-parser glm47, reasoning-parser glm45, mem-fraction-static 0.85"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/914
- 
+
 - config-keys:
   - glm5-fp8-b200-sglang
   description:
@@ -1129,3 +1129,63 @@
   description:
   - "Disable prefix caching (--no-enable-prefix-caching) for all MiniMax benchmarks using random datasets"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/966
+
+- config-keys:
+  # NVIDIA single-node
+  - dsr1-fp4-b200-sglang
+  - dsr1-fp4-b200-trt
+  - dsr1-fp4-b200-trt-mtp
+  - dsr1-fp8-b200-sglang
+  - dsr1-fp8-b200-sglang-mtp
+  - dsr1-fp8-b200-trt
+  - dsr1-fp8-b200-trt-mtp
+  - dsr1-fp8-h200-sglang
+  - dsr1-fp8-h200-trt
+  - dsr1-fp8-h200-trt-mtp
+  - glm5-fp8-b200-sglang
+  - glm5-fp8-h200-sglang
+  - gptoss-fp4-b200-trt
+  - gptoss-fp4-b200-vllm
+  - gptoss-fp4-h100-vllm
+  - gptoss-fp4-h200-trt
+  - gptoss-fp4-h200-vllm
+  - kimik2.5-fp4-b200-vllm
+  - kimik2.5-int4-b200-vllm
+  - kimik2.5-int4-h200-vllm
+  - minimaxm2.5-fp8-b200-vllm
+  - minimaxm2.5-fp8-h100-vllm
+  - minimaxm2.5-fp8-h200-vllm
+  - qwen3.5-bf16-b200-sglang
+  - qwen3.5-fp8-b200-sglang
+  - qwen3.5-fp8-b200-sglang-mtp
+  - qwen3.5-fp8-h200-sglang
+  # AMD single-node
+  - dsr1-fp4-mi355x-atom
+  - dsr1-fp4-mi355x-atom-mtp
+  - dsr1-fp4-mi355x-sglang
+  - dsr1-fp8-mi325x-sglang
+  - dsr1-fp8-mi300x-sglang
+  - dsr1-fp8-mi355x-atom
+  - dsr1-fp8-mi355x-atom-mtp
+  - dsr1-fp8-mi355x-sglang
+  - glm5-fp8-mi355x-sglang
+  - gptoss-fp4-mi300x-vllm
+  - gptoss-fp4-mi325x-vllm
+  - gptoss-fp4-mi355x-atom
+  - gptoss-fp4-mi355x-vllm
+  - kimik2.5-fp4-mi355x-vllm
+  - kimik2.5-int4-mi325x-vllm
+  - kimik2.5-int4-mi355x-vllm
+  - minimaxm2.5-fp8-mi300x-vllm
+  - minimaxm2.5-fp8-mi325x-vllm
+  - minimaxm2.5-fp8-mi355x-vllm
+  - qwen3.5-bf16-mi300x-sglang
+  - qwen3.5-bf16-mi325x-sglang
+  - qwen3.5-bf16-mi355x-sglang
+  - qwen3.5-fp8-mi300x-sglang
+  - qwen3.5-fp8-mi325x-sglang
+  - qwen3.5-fp8-mi355x-sglang
+  description:
+  - "Separate evals, change to 8k1k, fail loudly, 5-shot, top of curve & middle of curve"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/911
+  evals-only: true
diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh
index 022fd7cb2..f8c614936 100644
--- a/runners/launch_b200-dgxc.sh
+++ b/runners/launch_b200-dgxc.sh
@@ -33,7 +33,7 @@ docker run --rm --init --network host --name $server_name \
 -e NCCL_GRAPH_REGISTER=0 \
 -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
 -e PROFILE -e SGLANG_TORCH_PROFILER_DIR -e VLLM_TORCH_PROFILER_DIR -e VLLM_RPC_TIMEOUT \
--e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e RUN_EVAL -e RUNNER_TYPE \
+-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e RUN_EVAL -e EVAL_ONLY -e RUNNER_TYPE \
 --entrypoint=/bin/bash \
 $(echo "$IMAGE" | sed 's/#/\//') \
 benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh"
diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh
index 223264914..5100419b9 100644
--- a/runners/launch_h100-cr.sh
+++ b/runners/launch_h100-cr.sh
@@ -10,7 +10,7 @@ docker run --rm --network=host --name=$server_name \
 --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
 -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
 -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
--e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e RUNNER_TYPE -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \
+-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e EVAL_ONLY -e RUNNER_TYPE -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \
 -e PROFILE -e SGLANG_TORCH_PROFILER_DIR -e VLLM_TORCH_PROFILER_DIR -e VLLM_RPC_TIMEOUT \
 -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
 --entrypoint=/bin/bash \
diff --git a/utils/evals/EVALS.md b/utils/evals/EVALS.md
index c3dddfcc6..e32d6d988 100644
--- a/utils/evals/EVALS.md
+++ b/utils/evals/EVALS.md
@@ -6,7 +6,7 @@ Quick graded QnA which measures model performance. Examples of test suites:
 - **gpqa**: Graduate level, Google-Proof multiple choice questions
 
 ## When?
-At highest concurrency for highest TP and lowest TP, per GPU per model only for 1k8k. Logic is defined in `mark_eval_entries` of `utils/matrix-logic/generate_sweep_configs.py`
+At the highest and median concurrency levels (all TPs), per (model, runner, framework, precision, ISL, OSL, spec-decoding, dp-attn), only for 8k1k. In eval-only mode, the server starts with expanded context length. In combined mode (RUN_EVAL=true), evals run against the same server used for throughput benchmarks. Logic is defined in `mark_eval_entries` of `utils/matrix_logic/generate_sweep_configs.py`
 
 ## Why?
 To verify how model outputs are affected by throughput optimizations.
@@ -15,7 +15,7 @@ To verify how model outputs are affected by throughput optimizations.
 - If there was a tradeoff in accuracy for performance
 
 ## How?
-- `run_eval`, definined in `benchmarks/benchmark_lib.sh`, is called in `benchmarks/*`. EleutherAI/lm-evaluation-harness(lmeval), using the same endpoint as the throughput benchmark. JSON results are processed and converted to a table with `utils/collect_eval_results.py`.
+- `run_eval`, defined in `benchmarks/benchmark_lib.sh`, is called in `benchmarks/*`. It runs EleutherAI/lm-evaluation-harness (lmeval) against the running server's OpenAI-compatible endpoint. In eval-only mode (`EVAL_ONLY=true`), the server is started once with expanded context length (up to 5x benchmark context, capped at model native max). JSON results are processed and converted to a table with `utils/collect_eval_results.py`.
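A hypothetical sketch of what `setup_eval_context` computes, following the rule stated above (expand to up to 5x the benchmark context, capped at the model's native maximum); the real helper lives in `benchmarks/benchmark_lib.sh` and may differ in detail, and `NATIVE_MAX_MODEL_LEN` is an assumed variable name:

```bash
setup_eval_context() {
    local bench_len=$((ISL + OSL))      # benchmark-sized context
    local expanded=$((bench_len * 5))   # headroom for few-shot eval prompts
    # Cap at the model's native limit (NATIVE_MAX_MODEL_LEN is assumed here;
    # if unset, no cap is applied).
    if [ "$expanded" -gt "${NATIVE_MAX_MODEL_LEN:-$expanded}" ]; then
        expanded="$NATIVE_MAX_MODEL_LEN"
    fi
    export EVAL_MAX_MODEL_LEN="$expanded"
}
```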
 
 ## Misc
 Following files are task definitions from lmeval, more info on changes within the files
diff --git a/utils/evals/gsm8k.yaml b/utils/evals/gsm8k.yaml
index fb0f0a829..e748119cd 100644
--- a/utils/evals/gsm8k.yaml
+++ b/utils/evals/gsm8k.yaml
@@ -9,7 +9,7 @@ output_type: generate_until
 training_split: train
 fewshot_split: train
 test_split: test
-doc_to_text: "Question: {{question}}\nEnd your answer with: #### \nAnswer:"
+doc_to_text: "Question: {{question}}\nEnd your response with the answer on the last line, formatted as: #### [number]\nAnswer:"
 doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}"
 metric_list:
 - metric: exact_match
diff --git a/utils/evals/thresholds.json b/utils/evals/thresholds.json
new file mode 100644
index 000000000..8ea0b71c0
--- /dev/null
+++ b/utils/evals/thresholds.json
@@ -0,0 +1,4 @@
+{
+  "gsm8k": 0.85,
+  "gpqa_diamond_cot_n_shot": 0.30
+}
diff --git a/utils/evals/validate_scores.py b/utils/evals/validate_scores.py
new file mode 100644
index 000000000..85433ec4b
--- /dev/null
+++ b/utils/evals/validate_scores.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+"""Validate eval scores against minimum thresholds.
+
+Reads lm-eval results JSON files and checks that scored metrics meet the
+required minimum. Thresholds are configured per-task in a JSON config file
+(default: utils/evals/thresholds.json).
+
+Usage:
+    python3 utils/evals/validate_scores.py
+    python3 utils/evals/validate_scores.py --thresholds my_thresholds.json
+    python3 utils/evals/validate_scores.py --min-score 0.90  # flat threshold, no config
+"""
+import argparse
+import glob
+import json
+import sys
+from pathlib import Path
+
+
+def load_thresholds(path: str) -> dict[str, float]:
+    """Load thresholds config. Returns {task_name: min_score}."""
+    with open(path) as f:
+        return json.load(f)
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Validate eval scores")
+    parser.add_argument(
+        "--min-score", type=float, default=0.85,
+        help="Fallback minimum score when no threshold config matches (default: 0.85)",
+    )
+    parser.add_argument(
+        "--thresholds", default=None,
+        help="Path to thresholds JSON config (default: utils/evals/thresholds.json)",
+    )
+    parser.add_argument(
+        "--metric-prefix", default="exact_match,",
+        help="Only check metrics whose name starts with this prefix (default: 'exact_match,')",
+    )
+    parser.add_argument(
+        "--results-glob", default="results*.json",
+        help="Glob pattern for result files (default: 'results*.json')",
+    )
+    args = parser.parse_args()
+
+    # Load thresholds config
+    thresholds = {}
+    thresholds_path = args.thresholds
+    if thresholds_path is None:
+        default_path = Path(__file__).parent / "thresholds.json"
+        if default_path.exists():
+            thresholds_path = str(default_path)
+    if thresholds_path:
+        try:
+            thresholds = load_thresholds(thresholds_path)
+            print(f"Loaded thresholds from {thresholds_path}")
+        except (json.JSONDecodeError, OSError) as e:
+            print(f"WARN: could not load thresholds from {thresholds_path}: {e}", file=sys.stderr)
+
+    failed = False
+    checked = 0
+
+    for f in sorted(glob.glob(args.results_glob)):
+        with open(f) as fh:
+            data = json.load(fh)
+        for task, metrics in data.get("results", {}).items():
+            min_score = thresholds.get(task, args.min_score)
+            for name, val in metrics.items():
+                if not name.startswith(args.metric_prefix) or "stderr" in name:
+                    continue
+                if not isinstance(val, (int, float)):
+                    continue
+                checked += 1
+                if val < min_score:
+                    print(
+                        f"FAIL: {task} {name} = {val:.4f} (< {min_score})",
+                        file=sys.stderr,
+                    )
+                    failed = True
+                else:
+                    print(f"PASS: {task} {name} = {val:.4f} (>= {min_score})")
+
+    if checked == 0:
+        print("WARN: no metrics matched prefix '{}'".format(args.metric_prefix), file=sys.stderr)
+
+    return 1 if (failed or checked == 0) else 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py
index a31071f65..2a336c960 100644
--- a/utils/matrix_logic/generate_sweep_configs.py
+++ b/utils/matrix_logic/generate_sweep_configs.py
@@ -17,7 +17,6 @@
 seq_len_stoi = {
     "1k1k": (1024, 1024),
-    "1k8k": (1024, 8192),
     "8k1k": (8192, 1024)
 }
@@ -35,25 +34,19 @@ def seq_len_to_str(isl: int, osl: int) -> str:
 
 def mark_eval_entries(matrix_values: list[dict]) -> list[dict]:
     """Eval selection policy (single-node only):
-    - Only consider 1k8k (isl=1024, osl=8192).
-    - For each unique (model, runner, framework, precision, isl, osl, spec-decoding):
-        - Mark highest TP with highest conc
-        - Mark lowest TP with highest conc
-    - Grouping includes spec-decoding so MTP (mtp) and non-MTP (none) are treated
-      independently.
+    - Only consider 8k1k (isl=8192, osl=1024).
+    - For each unique (model, runner, framework, precision, isl, osl, spec-decoding, dp-attn):
+        - Mark all entries at the highest CONC (all TPs)
+        - Mark all entries at the median CONC (all TPs)
     """
     from collections import defaultdict
 
-    # Only run evals on 1k8k
-    target_isl, target_osl = seq_len_stoi["1k8k"]
-    # Group entries by (model, runner, framework, precision, isl, osl)
+    # Only run evals on 8k1k
+    target_isl, target_osl = seq_len_stoi["8k1k"]
+    # Group entries by (model, runner, framework, precision, isl, osl, spec-decoding, dp-attn).
     # Only include entries that have a top-level TP (i.e., single-node schema).
-    # This avoids relying on structural hints like prefill/decode which may be
-    # reused by future single-node disaggregated modes.
     groups = defaultdict(list)
     for i, entry in enumerate(matrix_values):
-        # Skip entries without a top-level TP field
         if Fields.TP.value not in entry:
             continue
@@ -72,32 +65,19 @@
         )
         groups[key].append((i, entry))
 
-    # For each group, find highest TP/highest conc and lowest TP/highest conc
+    # For each group, select entries at highest CONC and median CONC (all TPs)
     eval_indices = set()
     for key, entries in groups.items():
         if not entries:
             continue
-        # Find min and max TP values
-        min_tp = min(e[Fields.TP.value] for _, e in entries)
-        max_tp = max(e[Fields.TP.value] for _, e in entries)
-
-        # Find highest conc for highest TP
-        highest_tp_entries = [(i, e) for i, e in entries if e[Fields.TP.value] == max_tp]
-        if highest_tp_entries:
-            max_conc_highest_tp = max(e[Fields.CONC.value] for _, e in highest_tp_entries)
-            for i, e in highest_tp_entries:
-                if e[Fields.CONC.value] == max_conc_highest_tp:
-                    eval_indices.add(i)
-
-        # Find highest conc for lowest TP (only if different from max_tp)
-        if min_tp != max_tp:
-            lowest_tp_entries = [(i, e) for i, e in entries if e[Fields.TP.value] == min_tp]
-            if lowest_tp_entries:
-                max_conc_lowest_tp = max(e[Fields.CONC.value] for _, e in lowest_tp_entries)
-                for i, e in lowest_tp_entries:
-                    if e[Fields.CONC.value] == max_conc_lowest_tp:
-                        eval_indices.add(i)
+        conc_values = sorted(set(e[Fields.CONC.value] for _, e in entries))
+        median_conc = conc_values[len(conc_values) // 2]
+        target_concs = {conc_values[-1], median_conc}
+
+        for i, e in entries:
+            if e[Fields.CONC.value] in target_concs:
+                eval_indices.add(i)
 
     # Mark the selected entries
     for i, entry in enumerate(matrix_values):
@@ -742,9 +722,9 @@
     )
     eval_group = parent_parser.add_mutually_exclusive_group()
     eval_group.add_argument(
-        '--run-evals',
+        '--no-evals',
         action='store_true',
-        help='When specified, run evals on a subset of configs (in addition to all configs).'
+        help='When specified, skip evals (throughput benchmarks only).'
     )
     eval_group.add_argument(
         '--evals-only',
@@ -949,10 +929,9 @@
     else:
         parser.error(f"Unknown command: {args.command}")
 
-    # Handle eval options (mutually exclusive)
-    if args.run_evals or args.evals_only:
+    # Handle eval options (mutually exclusive: --no-evals or --evals-only)
+    if not args.no_evals:
         matrix_values = mark_eval_entries(matrix_values)
-
     # IF --evals-only is specified, filter to only eval entries
     if args.evals_only:
         matrix_values = [e for e in matrix_values if e.get(Fields.RUN_EVAL.value, False)]
diff --git a/utils/matrix_logic/test_generate_sweep_configs.py b/utils/matrix_logic/test_generate_sweep_configs.py
index 84ecddd3d..1fecdd487 100644
--- a/utils/matrix_logic/test_generate_sweep_configs.py
+++ b/utils/matrix_logic/test_generate_sweep_configs.py
@@ -158,13 +158,11 @@ class TestSeqLenMappings:
     def test_seq_len_stoi_values(self):
         """Verify seq_len_stoi has expected mappings."""
         assert seq_len_stoi["1k1k"] == (1024, 1024)
-        assert seq_len_stoi["1k8k"] == (1024, 8192)
         assert seq_len_stoi["8k1k"] == (8192, 1024)
 
     def test_seq_len_itos_reverse_mapping(self):
         """Verify seq_len_itos is reverse of stoi."""
         assert seq_len_itos[(1024, 1024)] == "1k1k"
-        assert seq_len_itos[(1024, 8192)] == "1k8k"
         assert seq_len_itos[(8192, 1024)] == "8k1k"
 
 
@@ -174,7 +172,6 @@ class TestSeqLenToStr:
     def test_known_sequence_lengths(self):
         """Known sequence lengths should return short name."""
         assert seq_len_to_str(1024, 1024) == "1k1k"
-        assert seq_len_to_str(1024, 8192) == "1k8k"
         assert seq_len_to_str(8192, 1024) == "8k1k"
 
     def test_unknown_sequence_lengths(self):
diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py
index ad7658176..697d97de6 100644
--- a/utils/matrix_logic/validation.py
+++ b/utils/matrix_logic/validation.py
@@ -360,6 +360,7 @@ class ChangelogMatrixEntry(BaseModel):
     ] = Field(default_factory=dict)
     multi_node: dict[str, list[MultiNodeMatrixEntry]
                      ] = Field(default_factory=dict)
+    evals: list[SingleNodeMatrixEntry] = Field(default_factory=list)
 
     changelog_metadata: ChangelogMetadata
diff --git a/utils/process_changelog.py b/utils/process_changelog.py
index d17fc3729..7da19d030 100644
--- a/utils/process_changelog.py
+++ b/utils/process_changelog.py
@@ -81,6 +81,7 @@ def main():
     final_results = {
         "single_node": defaultdict(list),
         "multi_node": defaultdict(list),
+        "evals": [],
         "changelog_metadata": {
            "base_ref": args.base_ref,
            "head_ref": args.head_ref,
@@ -88,56 +89,82 @@
        },
     }
 
-    all_results = []
-    # Deduplicate repeated configs, if for some reason a config key appears multiple times
-    # in one commit, we don't want to run that config two times (there will just be twice as many
-    # data points for that config, which is not useful)
-    all_configs_to_run = set()
+    all_benchmark_results = []
+    all_eval_results = []
+    # Deduplicate repeated configs separately for benchmarks and evals.
+    # An evals-only entry should not prevent a later regular entry from
+    # generating benchmarks for the same config, and vice versa.
+    benchmark_configs_seen = set()
+    eval_configs_seen = set()
 
     for entry_data in changelog_data:
         entry = ChangelogEntry.model_validate(entry_data)
-        configs_to_run = get_config_keys_from_master(
+        all_configs = get_config_keys_from_master(
             entry.config_keys, load_config_files(MASTER_CONFIGS)
         )
-        # Skip configs already processed
-        configs_to_run = [c for c in configs_to_run if c not in all_configs_to_run]
-        if not configs_to_run:
-            continue
-        all_configs_to_run.update(configs_to_run)
-
-        # Use --evals-only if specified in changelog entry, otherwise --run-evals
-        eval_flag = "--evals-only" if entry.evals_only else "--run-evals"
-
-        try:
-            result = subprocess.run(
-                [
+        if not entry.evals_only:
+            # Generate benchmark entries (no evals)
+            benchmark_configs = [c for c in all_configs if c not in benchmark_configs_seen]
+            if benchmark_configs:
+                benchmark_configs_seen.update(benchmark_configs)
+                base_cmd = [
                     "python3",
                     GENERATE_SWEEPS_PY_SCRIPT,
                     "test-config",
                     "--config-keys",
-                    *configs_to_run,
+                    *benchmark_configs,
                     "--config-files",
                     *MASTER_CONFIGS,
-                    eval_flag
-                ],
-                capture_output=True,
-                text=True,
-                check=True,
-            )
-        except subprocess.CalledProcessError as e:
-            print(e.stderr)
-            raise
-
-        all_results.extend(json.loads(result.stdout))
-
-    for result in all_results:
+                    "--no-evals",
+                ]
+                try:
+                    result = subprocess.run(
+                        base_cmd,
+                        capture_output=True,
+                        text=True,
+                        check=True,
+                    )
+                except subprocess.CalledProcessError as e:
+                    print(e.stderr)
+                    raise
+                all_benchmark_results.extend(json.loads(result.stdout))
+
+        # Generate eval entries separately
+        eval_configs = [c for c in all_configs if c not in eval_configs_seen]
+        if eval_configs:
+            eval_configs_seen.update(eval_configs)
+            base_cmd = [
+                "python3",
+                GENERATE_SWEEPS_PY_SCRIPT,
+                "test-config",
+                "--config-keys",
+                *eval_configs,
+                "--config-files",
+                *MASTER_CONFIGS,
+                "--evals-only",
+            ]
+            try:
+                eval_result = subprocess.run(
+                    base_cmd,
+                    capture_output=True,
+                    text=True,
+                    check=True,
+                )
+            except subprocess.CalledProcessError as e:
+                print(e.stderr)
+                raise
+            all_eval_results.extend(json.loads(eval_result.stdout))
+
+    for result in all_benchmark_results:
         seq_len_str = seq_len_to_str(result["isl"], result["osl"])
         if "prefill" in result and result["prefill"] is not None:
             final_results["multi_node"][seq_len_str].append(result)
         else:
             final_results["single_node"][seq_len_str].append(result)
 
+    final_results["evals"] = all_eval_results
+
     # Validate final results structure
     validated = ChangelogMatrixEntry.model_validate(final_results)
     print(validated.model_dump_json(by_alias=True))
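For a concrete view of the new selection policy: within one (model, runner, framework, precision, 8k1k, spec-decoding, dp-attn) group whose sweep covers concurrencies 4, 8, 16, 32 and 64, `mark_eval_entries` now marks every entry at 64 (the highest) and every entry at 16 (the median of the five sorted values; with an even count the upper-middle value is taken), across all TPs. The threshold gate can then be run against whatever lm-eval wrote; the invocations below use only flags defined in `validate_scores.py`'s argparse (file names are illustrative):

```bash
# Default: per-task thresholds from utils/evals/thresholds.json,
# falling back to --min-score for tasks not listed there.
python3 utils/evals/validate_scores.py

# Raise the fallback threshold used for unlisted tasks.
python3 utils/evals/validate_scores.py --min-score 0.90

# Custom config and a non-default results location.
python3 utils/evals/validate_scores.py \
    --thresholds my_thresholds.json \
    --results-glob 'eval_results/results*.json'
```

The script exits non-zero if any checked metric falls below its threshold, or if no metric matched `--metric-prefix` at all, so a CI step that runs it fails loudly rather than silently passing on empty results.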