From dd77d6f52f8076a67f4bd73dc69484b88b53b976 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Sat, 28 Mar 2026 08:09:16 +0000 Subject: [PATCH 1/4] Disable prefix caching for qwen3.5 & glm5 AMD benchmarks Add --disable-radix-cache to SGLang server launch command in all 7 benchmark scripts listed in #968, rebased on latest main. Closes #968 Co-authored-by: functionstackx --- benchmarks/single_node/glm5_fp8_mi355x.sh | 3 ++- benchmarks/single_node/qwen3.5_bf16_mi300x.sh | 3 ++- benchmarks/single_node/qwen3.5_bf16_mi325x.sh | 3 ++- benchmarks/single_node/qwen3.5_bf16_mi355x.sh | 3 ++- benchmarks/single_node/qwen3.5_fp8_mi300x.sh | 3 ++- benchmarks/single_node/qwen3.5_fp8_mi325x.sh | 3 ++- benchmarks/single_node/qwen3.5_fp8_mi355x.sh | 3 ++- 7 files changed, 14 insertions(+), 7 deletions(-) diff --git a/benchmarks/single_node/glm5_fp8_mi355x.sh b/benchmarks/single_node/glm5_fp8_mi355x.sh index 3d82fd856..ae8416f05 100755 --- a/benchmarks/single_node/glm5_fp8_mi355x.sh +++ b/benchmarks/single_node/glm5_fp8_mi355x.sh @@ -49,7 +49,8 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.85 \ --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \ --nsa-prefill-backend tilelang \ - --nsa-decode-backend tilelang $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --nsa-decode-backend tilelang \ + --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/qwen3.5_bf16_mi300x.sh b/benchmarks/single_node/qwen3.5_bf16_mi300x.sh index aa74785fe..8aca9860a 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi300x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi300x.sh @@ -36,7 +36,8 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 \ + --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/qwen3.5_bf16_mi325x.sh b/benchmarks/single_node/qwen3.5_bf16_mi325x.sh index aa74785fe..8aca9860a 100644 --- a/benchmarks/single_node/qwen3.5_bf16_mi325x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi325x.sh @@ -36,7 +36,8 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 \ + --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh index 701695def..d1a4782e8 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh @@ -35,7 +35,8 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 \ + --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/qwen3.5_fp8_mi300x.sh b/benchmarks/single_node/qwen3.5_fp8_mi300x.sh index 7bff57b61..00cc9cf91 100755 --- a/benchmarks/single_node/qwen3.5_fp8_mi300x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi300x.sh @@ -37,7 +37,8 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 \ + --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/qwen3.5_fp8_mi325x.sh b/benchmarks/single_node/qwen3.5_fp8_mi325x.sh index 7bff57b61..00cc9cf91 100755 --- a/benchmarks/single_node/qwen3.5_fp8_mi325x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi325x.sh @@ -37,7 +37,8 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 \ + --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh index 701695def..d1a4782e8 100644 --- a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh @@ -35,7 +35,8 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 \ + --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! From 3fbe29ffb5094476094fb114452591e5a21077f9 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Sat, 28 Mar 2026 08:55:06 +0000 Subject: [PATCH 2/4] Add perf-changelog.yaml entry for --disable-radix-cache on qwen3.5 & glm5 AMD benchmarks Co-authored-by: functionstackx --- perf-changelog.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 81fc5f100..3040448fe 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,3 +1,15 @@ +- config-keys: + - qwen3.5-bf16-mi300x-sglang + - qwen3.5-bf16-mi325x-sglang + - qwen3.5-bf16-mi355x-sglang + - qwen3.5-fp8-mi300x-sglang + - qwen3.5-fp8-mi325x-sglang + - qwen3.5-fp8-mi355x-sglang + - glm5-fp8-mi355x-sglang + description: + - "Add --disable-radix-cache to SGLang server launch command for all qwen3.5 and glm5 AMD benchmark scripts" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/970 + - config-keys: - minimaxm2.5-fp8-h100-vllm - minimaxm2.5-fp8-h200-vllm From 9d8f97c66e8b498360266526dfae8b6857c2119d Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Sun, 29 Mar 2026 00:04:47 +0000 Subject: [PATCH 3/4] Remove MI355X changes, keep --disable-radix-cache for MI300X & MI325X only Reverted glm5_fp8_mi355x.sh, qwen3.5_bf16_mi355x.sh, qwen3.5_fp8_mi355x.sh to their original state and updated perf-changelog.yaml accordingly. Co-authored-by: functionstackx --- benchmarks/single_node/glm5_fp8_mi355x.sh | 3 +-- benchmarks/single_node/qwen3.5_bf16_mi355x.sh | 3 +-- benchmarks/single_node/qwen3.5_fp8_mi355x.sh | 3 +-- perf-changelog.yaml | 5 +---- 4 files changed, 4 insertions(+), 10 deletions(-) diff --git a/benchmarks/single_node/glm5_fp8_mi355x.sh b/benchmarks/single_node/glm5_fp8_mi355x.sh index ae8416f05..3d82fd856 100755 --- a/benchmarks/single_node/glm5_fp8_mi355x.sh +++ b/benchmarks/single_node/glm5_fp8_mi355x.sh @@ -49,8 +49,7 @@ python3 -m sglang.launch_server \ --mem-fraction-static 0.85 \ --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \ --nsa-prefill-backend tilelang \ - --nsa-decode-backend tilelang \ - --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --nsa-decode-backend tilelang $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh index d1a4782e8..701695def 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh @@ -35,8 +35,7 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --mem-fraction-static 0.8 \ - --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh index d1a4782e8..701695def 100644 --- a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh @@ -35,8 +35,7 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --mem-fraction-static 0.8 \ - --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 3040448fe..509a944b8 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,13 +1,10 @@ - config-keys: - qwen3.5-bf16-mi300x-sglang - qwen3.5-bf16-mi325x-sglang - - qwen3.5-bf16-mi355x-sglang - qwen3.5-fp8-mi300x-sglang - qwen3.5-fp8-mi325x-sglang - - qwen3.5-fp8-mi355x-sglang - - glm5-fp8-mi355x-sglang description: - - "Add --disable-radix-cache to SGLang server launch command for all qwen3.5 and glm5 AMD benchmark scripts" + - "Add --disable-radix-cache to SGLang server launch command for qwen3.5 MI300X and MI325X benchmark scripts" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/970 - config-keys: From f646f6bdf07177bda41e84d6b7d9ca07b9372fe1 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Sun, 29 Mar 2026 00:17:10 +0000 Subject: [PATCH 4/4] Move perf-changelog entry to end of file Co-authored-by: functionstackx --- perf-changelog.yaml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 509a944b8..b3720abc8 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,12 +1,3 @@ -- config-keys: - - qwen3.5-bf16-mi300x-sglang - - qwen3.5-bf16-mi325x-sglang - - qwen3.5-fp8-mi300x-sglang - - qwen3.5-fp8-mi325x-sglang - description: - - "Add --disable-radix-cache to SGLang server launch command for qwen3.5 MI300X and MI325X benchmark scripts" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/970 - - config-keys: - minimaxm2.5-fp8-h100-vllm - minimaxm2.5-fp8-h200-vllm @@ -1198,3 +1189,12 @@ - "Separate evals, change to 8k1k, fail loudly, 5-shot, top of curve & middle of curve" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/911 evals-only: true + +- config-keys: + - qwen3.5-bf16-mi300x-sglang + - qwen3.5-bf16-mi325x-sglang + - qwen3.5-fp8-mi300x-sglang + - qwen3.5-fp8-mi325x-sglang + description: + - "Add --disable-radix-cache to SGLang server launch command for qwen3.5 MI300X and MI325X benchmark scripts" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/970