From 924f6941903b19646ea0c528a8e1c91af4f7cec6 Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Mon, 16 Mar 2026 12:22:47 -0500 Subject: [PATCH 01/11] Enhance safety and logging in launch_mi355x-amds.sh - Introduced a safe_sudo_rm function to prevent path injection when removing files under the workspace. - Updated BENCHMARK_LOGS_DIR to ensure it is always under GITHUB_WORKSPACE, adding error handling for invalid paths. - Replaced direct sudo rm calls with safe_sudo_rm to improve security. - Modified the log collection script to use a temporary file, ensuring cleanup after execution. - Added checks to ensure workspace paths are absolute and valid, enhancing robustness of the script. --- runners/launch_mi355x-amds.sh | 168 +++++++++++++++++++++++----------- 1 file changed, 113 insertions(+), 55 deletions(-) diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index f2471466b..692d7d0a0 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -1,5 +1,16 @@ #!/usr/bin/env bash +# Sudo rm only for paths under workspace; guards against path injection / escaping +safe_sudo_rm() { + local target="$1" + local workspace="${2:-$GITHUB_WORKSPACE}" + if [[ -z "$workspace" || -z "$target" ]]; then return 0; fi + if [[ "$workspace" != /* ]]; then return 0; fi + if [[ "$target" != "$workspace"/* ]]; then return 0; fi + if [[ "$target" == *".."* ]]; then return 0; fi + sudo rm -rf "$target" 2>/dev/null || true +} + scancel_sync() { local jobid=$1 local timeout=${2:-600} @@ -46,10 +57,14 @@ if [[ "$IS_MULTINODE" == "true" ]]; then export ISL="$ISL" export OSL="$OSL" - # Logs go to BENCHMARK_LOGS_DIR (NFS-accessible, outside the repo tree) + # Logs go to BENCHMARK_LOGS_DIR (must be under workspace - no host modification outside) export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$GITHUB_WORKSPACE/benchmark_logs}" + if [[ -z "$GITHUB_WORKSPACE" || -z "$BENCHMARK_LOGS_DIR" ]] || [[ "$BENCHMARK_LOGS_DIR" != "$GITHUB_WORKSPACE"/* ]]; then + echo "ERROR: BENCHMARK_LOGS_DIR must be under GITHUB_WORKSPACE. Got BENCHMARK_LOGS_DIR=$BENCHMARK_LOGS_DIR" >&2 + exit 1 + fi mkdir -p "$BENCHMARK_LOGS_DIR" - sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true + safe_sudo_rm "$BENCHMARK_LOGS_DIR/logs" "$GITHUB_WORKSPACE" SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi355x_${FRAMEWORK}.sh" if [[ "$FRAMEWORK" == "sglang-disagg" ]]; then @@ -101,14 +116,16 @@ if [[ "$IS_MULTINODE" == "true" ]]; then # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement # Find the latest log directory that contains the data - cat > collect_latest_results.py <<'PY' + COLLECT_SCRIPT=$(mktemp) + trap "rm -f '$COLLECT_SCRIPT'" EXIT + cat > "$COLLECT_SCRIPT" <<'PY' import os, sys sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) for path in sorted([f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: print(path) PY - LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1) + LOGS_DIR=$(python3 "$COLLECT_SCRIPT" "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1) if [ -z "$LOGS_DIR" ]; then echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" exit 1 @@ -117,15 +134,13 @@ PY echo "Found logs directory: $LOGS_DIR" ls -la "$LOGS_DIR" - # Result JSON are contained within the result directory - for result_file in $(find $LOGS_DIR -type f); do - # result_file should directly be isl_ISL_osl_OSL_concurrency_CONC_req_rate_R_gpus_N_ctx_M_gen_N.json - file_name=$(basename $result_file) - if [ -f $result_file ]; then - # Copy the result file to workspace with a unique name + # Result JSON are contained within the result directory (copy only into workspace) + for result_file in $(find "$LOGS_DIR" -type f); do + file_name=$(basename "$result_file") + if [[ -f "$result_file" && -n "$GITHUB_WORKSPACE" && "$GITHUB_WORKSPACE" == /* ]]; then WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}" echo "Found result file ${result_file}. Copying it to ${WORKSPACE_RESULT_FILE}" - cp $result_file $WORKSPACE_RESULT_FILE + cp "$result_file" "$WORKSPACE_RESULT_FILE" fi done @@ -136,10 +151,10 @@ PY set -x echo "Canceled the slurm job $JOB_ID" - sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true + safe_sudo_rm "$BENCHMARK_LOGS_DIR/logs" "$GITHUB_WORKSPACE" - # Upload logs as artifact if running in GitHub Actions - if [[ -n "${GITHUB_ACTIONS:-}" ]]; then + # Upload logs as artifact if running in GitHub Actions (workspace only) + if [[ -n "${GITHUB_ACTIONS:-}" && -n "$GITHUB_WORKSPACE" && "$GITHUB_WORKSPACE" == /* ]]; then ARTIFACT_DIR="$GITHUB_WORKSPACE/benchmark_artifacts" mkdir -p "$ARTIFACT_DIR" cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$ARTIFACT_DIR/" 2>/dev/null || true @@ -148,53 +163,96 @@ PY else - export HF_HUB_CACHE_MOUNT="/docker/huggingface/hub" + export HF_HUB_CACHE_MOUNT="${HF_HUB_CACHE_MOUNT:-/docker/huggingface/hub}" export PORT_OFFSET=${RUNNER_NAME: -1} export PORT=$(( 8888 + ${PORT_OFFSET} )) FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "atom" ]] && printf '_atom' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') + BENCHMARK_SCRIPT="benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi355x${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh" + WORKSPACE="${GITHUB_WORKSPACE:-$(pwd)}" + if [[ -z "$WORKSPACE" || "$WORKSPACE" != /* ]]; then + echo "ERROR: WORKSPACE must be an absolute path. Got: $WORKSPACE" >&2 + exit 1 + fi + mkdir -p "$HF_HUB_CACHE_MOUNT" + + if command -v salloc &>/dev/null && command -v srun &>/dev/null && command -v squeue &>/dev/null; then + # SLURM path: allocate via salloc, run via srun with enroot/squash container + PARTITION="compute" + SQUASH_FILE="/var/lib/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + LOCK_FILE="${SQUASH_FILE}.lock" + + set -x + salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=128 --time=180 --no-shell --job-name="$RUNNER_NAME" + JOB_ID=$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1) + if [[ -z "$JOB_ID" ]]; then + echo "ERROR: salloc failed or no job found for $RUNNER_NAME. Check partition=$PARTITION and GPU availability." >&2 + exit 1 + fi - PARTITION="compute" - SQUASH_FILE="/var/lib/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" - LOCK_FILE="${SQUASH_FILE}.lock" - - set -x - salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=128 --time=180 --no-shell --job-name="$RUNNER_NAME" - JOB_ID=$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1) - - # Remove leftover bmk-server from previous run so we can reuse the name (targeted cleanup only) - srun --jobid=$JOB_ID bash -c "docker rm -f bmk-server 2>/dev/null || true" - - # Use flock to serialize concurrent imports to the same squash file - srun --jobid=$JOB_ID bash -c " - exec 9>\"$LOCK_FILE\" - flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } - if [[ \"$FRAMEWORK\" == \"atom\" ]]; then - rm -f \"$SQUASH_FILE\" + srun --jobid=$JOB_ID bash -c "docker rm -f bmk-server 2>/dev/null || true" + + # Note: This block runs on the compute node and modifies /var/lib/squash (enroot cache). + # Only squash files under /var/lib/squash/ are touched - no user data or workspace. + srun --jobid=$JOB_ID bash -c " + exec 9>\"$LOCK_FILE\" + flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } + if [[ \"$SQUASH_FILE\" != /var/lib/squash/* ]]; then exit 1; fi + if [[ \"$FRAMEWORK\" == \"atom\" ]]; then + rm -f \"$SQUASH_FILE\" + fi + if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then + echo 'Squash file already exists and is valid, skipping import' + else + rm -f \"$SQUASH_FILE\" + enroot import -o \"$SQUASH_FILE\" docker://$IMAGE + fi + " + + export VLLM_CACHE_ROOT="/it-share/gharunners/.cache/vllm" + + srun --jobid=$JOB_ID \ + --container-image=$SQUASH_FILE \ + --container-mounts=$WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ + --container-mount-home \ + --container-writable \ + --container-workdir=/workspace/ \ + --no-container-entrypoint --export=ALL \ + bash "$BENCHMARK_SCRIPT" + + scancel $JOB_ID + + # Remove gpucore temp files only within workspace (no host modification outside) + if ls "$WORKSPACE"/gpucore.* 1> /dev/null 2>&1; then + echo "gpucore files exist. not good" + for f in "$WORKSPACE"/gpucore.*; do + [[ -e "$f" ]] && safe_sudo_rm "$f" "$WORKSPACE" + done fi - if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then - echo 'Squash file already exists and is valid, skipping import' - else - rm -f \"$SQUASH_FILE\" - enroot import -o \"$SQUASH_FILE\" docker://$IMAGE + else + # Non-SLURM path: run directly with Docker (no salloc/srun) + if ! command -v docker &>/dev/null; then + echo "ERROR: Neither SLURM nor Docker found. Install SLURM (for cluster) or Docker (for standalone)." >&2 + exit 1 fi - " - - export VLLM_CACHE_ROOT="/it-share/gharunners/.cache/vllm" - - srun --jobid=$JOB_ID \ - --container-image=$SQUASH_FILE \ - --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ - --container-mount-home \ - --container-writable \ - --container-workdir=/workspace/ \ - --no-container-entrypoint --export=ALL \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi355x${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh - - scancel $JOB_ID - - if ls gpucore.* 1> /dev/null 2>&1; then - echo "gpucore files exist. not good" - rm -f gpucore.* + echo "SLURM not available; using Docker directly." + + server_name="bmk-server" + docker rm -f "$server_name" 2>/dev/null || true + + set -x + docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ + --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ + --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ + -v "$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE" \ + -v "$WORKSPACE:/workspace/" -w /workspace/ \ + -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e EP_SIZE -e DP_ATTENTION -e CONC \ + -e MAX_MODEL_LEN -e PORT=$PORT -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ + -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL -e RUNNER_TYPE \ + -e PROFILE -e SGLANG_TORCH_PROFILER_DIR -e VLLM_TORCH_PROFILER_DIR -e VLLM_RPC_TIMEOUT \ + -e SPEC_DECODING -e DISAGG \ + --entrypoint=/bin/bash \ + "$IMAGE" \ + "$BENCHMARK_SCRIPT" fi fi From 54a1177b90b68bdf6fbfea2851c585f4fa7f832c Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Wed, 18 Mar 2026 12:10:43 -0500 Subject: [PATCH 02/11] Add aiter and sglang configuration options to workflows - Introduced new inputs for aiter and sglang repositories and references in benchmark-tmpl.yml and e2e-tests.yml. - Updated environment variables in both workflows to utilize the new inputs. - Modified launch_mi355x-amds.sh to conditionally reinstall aiter and sglang based on the new environment variables before running benchmarks. --- .github/workflows/benchmark-tmpl.yml | 20 ++++++++ .github/workflows/e2e-tests.yml | 38 ++++++++++++++- .../single_node/patch_sgl_components.sh | 47 +++++++++++++++++++ runners/launch_mi355x-amds.sh | 9 +++- 4 files changed, 111 insertions(+), 3 deletions(-) create mode 100644 benchmarks/single_node/patch_sgl_components.sh diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index c1c74ffe3..e55de11d0 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -62,6 +62,22 @@ on: description: "Git ref (branch/sha) to checkout" required: false type: string + aiter-remote: + description: "Repo URL to reinstall aiter from" + required: false + type: string + aiter-ref: + description: "Branch or commit for aiter" + required: false + type: string + sglang-remote: + description: "Repo URL to reinstall sglang from" + required: false + type: string + sglang-ref: + description: "Branch or commit for sglang" + required: false + type: string env: RANDOM_RANGE_RATIO: 0.8 @@ -83,6 +99,10 @@ env: SPEC_DECODING: ${{ inputs.spec-decoding }} DISAGG: ${{ inputs.disagg }} RUN_EVAL: ${{ inputs.run-eval }} + AITER_REMOTE: ${{ inputs.aiter-remote }} + AITER_REF: ${{ inputs.aiter-ref }} + SGLANG_REMOTE: ${{ inputs.sglang-remote }} + SGLANG_REF: ${{ inputs.sglang-ref }} permissions: contents: read diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 0f26c7c41..dee5ab153 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -16,6 +16,22 @@ on: description: "Ref (branch/sha) to checkout for generating configs" required: false type: string + aiter-repo: + description: "Repo URL to reinstall aiter from (e.g. https://github.com/user/aiter.git)" + required: false + type: string + aiter-ref: + description: "Branch or commit for aiter" + required: false + type: string + sglang-repo: + description: "Repo URL to reinstall sglang from (e.g. https://github.com/user/sglang)" + required: false + type: string + sglang-ref: + description: "Branch or commit for sglang" + required: false + type: string workflow_call: inputs: generate-cli-command: @@ -30,6 +46,22 @@ on: description: "Ref (branch/sha) to checkout for generating configs" required: false type: string + aiter-repo: + description: "Repo URL to reinstall aiter from" + required: false + type: string + aiter-ref: + description: "Branch or commit for aiter" + required: false + type: string + sglang-repo: + description: "Repo URL to reinstall sglang from" + required: false + type: string + sglang-ref: + description: "Branch or commit for sglang" + required: false + type: string jobs: get-jobs: @@ -124,7 +156,11 @@ jobs: spec-decoding: ${{ matrix.config.spec-decoding }} disagg: ${{ matrix.config.disagg }} run-eval: ${{ matrix.config.run-eval }} - ref: ${{ inputs.ref }} + ref: ${{ inputs.ref || github.event.inputs.ref }} + aiter-remote: ${{ inputs.aiter-repo || github.event.inputs['aiter-repo'] }} + aiter-ref: ${{ inputs.aiter-ref || github.event.inputs['aiter-ref'] }} + sglang-remote: ${{ inputs.sglang-repo || github.event.inputs['sglang-repo'] }} + sglang-ref: ${{ inputs.sglang-ref || github.event.inputs['sglang-ref'] }} collect-results: needs: [test-sweep-multi-node, test-sweep-single-node] diff --git a/benchmarks/single_node/patch_sgl_components.sh b/benchmarks/single_node/patch_sgl_components.sh new file mode 100644 index 000000000..8b7961d47 --- /dev/null +++ b/benchmarks/single_node/patch_sgl_components.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +# Reinstall aiter and sglang from custom remote/ref. +# Expects AITER_REMOTE, AITER_REF, SGLANG_REMOTE, SGLANG_REF from env. +# If no refs are provided, skip entirely and use default packages in container. + +set -e + +if [[ -z "$AITER_REF" && -z "$SGLANG_REF" ]]; then + echo "No patch refs provided; using default packages in container." + exit 0 +fi + +work_dir="/sgl-workspace" +aiter_remote="${AITER_REMOTE:-https://github.com/zhentaocc/aiter.git}" +aiter_ref="${AITER_REF:-mi35_qwen35_image}" +sglang_remote="${SGLANG_REMOTE:-https://github.com/sgl-project/sglang}" +sglang_ref="${SGLANG_REF:-mi35_qwen35_image}" + +if [[ ! -d "$work_dir" ]]; then + echo "$work_dir not found; assuming image ships correct versions." + exit 0 +fi + +if [[ -n "$AITER_REF" ]]; then + pip uninstall amd-aiter -y + cd "$work_dir" + rm -rf aiter + git clone --recursive "$aiter_remote" aiter + cd aiter + git fetch origin "$aiter_ref" 2>/dev/null || git fetch origin + git checkout "$aiter_ref" 2>/dev/null || git reset --hard "origin/$aiter_ref" 2>/dev/null || git reset --hard "$aiter_ref" + rm -rf aiter/jit/*.so 2>/dev/null || true + PREBUILD_KERNELS=0 python setup.py develop + echo "aiter ($aiter_ref) installed from $aiter_remote" +else + echo "AITER_REF not set; using default aiter in container" +fi + +if [[ -n "$SGLANG_REF" ]] && [[ -d "$work_dir/sglang/.git" ]]; then + cd "$work_dir/sglang" + git remote set-url origin "$sglang_remote" 2>/dev/null || git remote add origin "$sglang_remote" + git fetch origin "$sglang_ref" 2>/dev/null || git fetch origin + git checkout "$sglang_ref" 2>/dev/null || git reset --hard "origin/$sglang_ref" 2>/dev/null || git reset --hard "$sglang_ref" + echo "sglang ($sglang_ref) from $sglang_remote" +elif [[ -z "$SGLANG_REF" ]]; then + echo "SGLANG_REF not set; using default sglang in container" +fi diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 692d7d0a0..73eb009dc 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -211,6 +211,8 @@ else export VLLM_CACHE_ROOT="/it-share/gharunners/.cache/vllm" + # Reinstall aiter/sglang in /sgl-workspace when AITER_REF or SGLANG_REF are set, then run benchmark + RUN_CMD="bash benchmarks/single_node/patch_sgl_components.sh && exec bash $BENCHMARK_SCRIPT" srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ --container-mounts=$WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ @@ -218,7 +220,7 @@ else --container-writable \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ - bash "$BENCHMARK_SCRIPT" + bash -c "$RUN_CMD" scancel $JOB_ID @@ -241,6 +243,8 @@ else docker rm -f "$server_name" 2>/dev/null || true set -x + # Reinstall aiter/sglang in /sgl-workspace when AITER_REF or SGLANG_REF are set, then run benchmark + RUN_CMD="bash benchmarks/single_node/patch_sgl_components.sh && exec bash $BENCHMARK_SCRIPT" docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ @@ -251,8 +255,9 @@ else -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL -e RUNNER_TYPE \ -e PROFILE -e SGLANG_TORCH_PROFILER_DIR -e VLLM_TORCH_PROFILER_DIR -e VLLM_RPC_TIMEOUT \ -e SPEC_DECODING -e DISAGG \ + -e AITER_REMOTE -e AITER_REF -e SGLANG_REMOTE -e SGLANG_REF \ --entrypoint=/bin/bash \ "$IMAGE" \ - "$BENCHMARK_SCRIPT" + -c "$RUN_CMD" fi fi From 609236c26b479b714ebfaacebff0221af8a63abd Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Wed, 18 Mar 2026 12:53:02 -0500 Subject: [PATCH 03/11] Refactor aiter installation process in patch_sgl_components.sh - Changed the directory navigation for aiter to ensure it operates within the correct path. - Updated the git commands to set the remote URL or add it if it doesn't exist, improving the installation flow. - Added cleanup commands to remove unnecessary build and distribution files, enhancing the installation process's efficiency. --- benchmarks/single_node/patch_sgl_components.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/single_node/patch_sgl_components.sh b/benchmarks/single_node/patch_sgl_components.sh index 8b7961d47..a8800693d 100644 --- a/benchmarks/single_node/patch_sgl_components.sh +++ b/benchmarks/single_node/patch_sgl_components.sh @@ -23,13 +23,13 @@ fi if [[ -n "$AITER_REF" ]]; then pip uninstall amd-aiter -y - cd "$work_dir" - rm -rf aiter - git clone --recursive "$aiter_remote" aiter - cd aiter + cd "$work_dir/aiter" + git remote set-url origin "$aiter_remote" 2>/dev/null || git remote add origin "$aiter_remote" git fetch origin "$aiter_ref" 2>/dev/null || git fetch origin git checkout "$aiter_ref" 2>/dev/null || git reset --hard "origin/$aiter_ref" 2>/dev/null || git reset --hard "$aiter_ref" rm -rf aiter/jit/*.so 2>/dev/null || true + rm -rf aiter/jit/build 2>/dev/null || true + rm -rf aiter/jit/dist 2>/dev/null || true PREBUILD_KERNELS=0 python setup.py develop echo "aiter ($aiter_ref) installed from $aiter_remote" else From 16b7634d20e53e37925e3ef0a54bd9778ec2a7ef Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Wed, 18 Mar 2026 21:28:24 -0500 Subject: [PATCH 04/11] Add git submodule initialization for composable_kernel in patch_sgl_components.sh - Added a command to update and initialize the 3rdparty/composable_kernel submodule during the aiter installation process. - This change enhances the setup flow by ensuring all necessary components are properly initialized before installation. --- benchmarks/single_node/patch_sgl_components.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/single_node/patch_sgl_components.sh b/benchmarks/single_node/patch_sgl_components.sh index a8800693d..fcd6b6e61 100644 --- a/benchmarks/single_node/patch_sgl_components.sh +++ b/benchmarks/single_node/patch_sgl_components.sh @@ -30,6 +30,7 @@ if [[ -n "$AITER_REF" ]]; then rm -rf aiter/jit/*.so 2>/dev/null || true rm -rf aiter/jit/build 2>/dev/null || true rm -rf aiter/jit/dist 2>/dev/null || true + git submodule update --init --force 3rdparty/composable_kernel 2>/dev/null || true PREBUILD_KERNELS=0 python setup.py develop echo "aiter ($aiter_ref) installed from $aiter_remote" else From da6a41074cc3ca4133f5bfdf0cd192e2405e151f Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Wed, 18 Mar 2026 21:29:47 -0500 Subject: [PATCH 05/11] Update qwen3.5 benchmark script to enable fused QK norm ROPE and set environment variable - Added environment variable SGLANG_FUSED_QK_NORM_ROPE_CACHE_PTS_QUANT_SHUFFLE to optimize performance. - Included the --enable-fused-qk-norm-rope flag in the sglang launch command. - Adjusted directory navigation to ensure proper execution context. --- benchmarks/single_node/qwen3.5_bf16_mi355x.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh index f77390707..87df79324 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh @@ -23,6 +23,9 @@ PORT=${PORT:-8888} # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor +export SGLANG_FUSED_QK_NORM_ROPE_CACHE_PTS_QUANT_SHUFFLE=1 + +cd /sgl-workspace/sglang python3 -m sglang.launch_server \ --attention-backend triton \ --model-path $MODEL \ @@ -30,10 +33,11 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ + --enable-fused-qk-norm-rope \ --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 & SERVER_PID=$! - +cd - # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" From d0f348d5b84d38a12e1aff7c3756a095f57e1e02 Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Wed, 18 Mar 2026 21:29:47 -0500 Subject: [PATCH 06/11] Update qwen3.5 image version in amd-master.yaml - Bumped the image version for qwen3.5 benchmarks from v0.5.8 to v0.5.9 for both mi355x and mi325x configurations, ensuring compatibility with the latest updates. --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 5551860f2..895396b55 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -140,7 +140,7 @@ dsr1-fp8-mi355x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-bf16-mi355x-sglang: - image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260215 + image: rocm/sgl-dev:v0.5.9-rocm720-mi35x-20260227 model: Qwen/Qwen3.5-397B-A17B model-prefix: qwen3.5 runner: mi355x @@ -228,7 +228,7 @@ qwen3.5-fp8-mi325x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-fp8-mi355x-sglang: - image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218 + image: rocm/sgl-dev:v0.5.9-rocm720-mi35x-20260227 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi355x From 9554b487a98e0696a5c89ebdb22ec7a77dd6652f Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Wed, 18 Mar 2026 21:31:12 -0500 Subject: [PATCH 07/11] Enable fused QK norm ROPE in qwen3.5 benchmark script - Set the environment variable SGLANG_FUSED_QK_NORM_ROPE_CACHE_PTS_QUANT_SHUFFLE to optimize performance. - Added the --enable-fused-qk-norm-rope flag to the sglang launch command for enhanced functionality. --- benchmarks/single_node/qwen3.5_fp8_mi355x.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh index f77390707..1ff52e1cc 100644 --- a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh @@ -20,6 +20,8 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +export SGLANG_FUSED_QK_NORM_ROPE_CACHE_PTS_QUANT_SHUFFLE=1 + # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -30,6 +32,7 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ + --enable-fused-qk-norm-rope \ --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 & SERVER_PID=$! From 6313809dae9d4927a55ee7318ae6a676ca315d63 Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Fri, 20 Mar 2026 02:19:53 -0500 Subject: [PATCH 08/11] Add interactive benchmark script and result summarization tool for Qwen3.5 - Introduced `qwen3.5_bf16_mi355x_interactive.sh` for running interactive benchmarks with customizable options for tensor parallel size, sequence lengths, and concurrency. - Added `summarize_interactive_results.py` to process benchmark results and output them in a markdown table format, enhancing result analysis and reporting capabilities. --- .../qwen3.5_bf16_mi355x_interactive.sh | 174 ++++++++++ utils/summarize_interactive_results.py | 318 ++++++++++++++++++ 2 files changed, 492 insertions(+) create mode 100755 benchmarks/single_node/qwen3.5_bf16_mi355x_interactive.sh create mode 100644 utils/summarize_interactive_results.py diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x_interactive.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x_interactive.sh new file mode 100755 index 000000000..f0036d1db --- /dev/null +++ b/benchmarks/single_node/qwen3.5_bf16_mi355x_interactive.sh @@ -0,0 +1,174 @@ +#!/usr/bin/env bash +# +# Interactive benchmark script for Qwen3.5-397B-A17B on MI355X. +# Runs inside Docker with CLI args for TP, sequence lengths, and concurrency. +# +# Usage: +# ./qwen3.5_bf16_mi355x_interactive.sh [OPTIONS] +# +# Options: +# -tp N Tensor parallel size (default: 8) +# -sl ISL,OSL ... Space-separated isl,osl pairs (default: 1024,1024 8192,1024 1024,8192) +# -conc N N ... Space-separated concurrency values (default: 8) +# -result-dir DIR Output directory (default: /workspace/) +# +# Examples: +# ./qwen3.5_bf16_mi355x_interactive.sh +# ./qwen3.5_bf16_mi355x_interactive.sh -tp 8 -sl 1024,1024 8192,1024 -conc 8 16 32 +# ./qwen3.5_bf16_mi355x_interactive.sh -result-dir /workspace/results + +set -e + +# Defaults +TP=8 +SL_LIST=("1024,1024" "8192,1024" "1024,8192") +CONC_LIST=(8) +RESULT_DIR="/workspace/" + +usage() { + echo "Usage: $0 [OPTIONS]" + echo "" + echo "Options:" + echo " -tp N Tensor parallel size (default: 8)" + echo " -sl ISL,OSL ... Space-separated isl,osl pairs (default: 1024,1024 8192,1024 1024,8192)" + echo " -conc N N ... Space-separated concurrency values (default: 8)" + echo " -result-dir DIR Output directory (default: /workspace/)" + echo "" + echo "Examples:" + echo " $0" + echo " $0 -tp 8 -sl 1024,1024 8192,1024 -conc 8 16 32" + echo " $0 -result-dir /workspace/results" + exit 1 +} + +# Parse CLI args +while [[ $# -gt 0 ]]; do + case $1 in + -tp) + TP="$2" + shift 2 + ;; + -sl) + shift + SL_LIST=() + while [[ $# -gt 0 && ! "$1" =~ ^- ]]; do + SL_LIST+=("$1") + shift + done + [[ ${#SL_LIST[@]} -eq 0 ]] && SL_LIST=("1024,1024" "8192,1024" "1024,8192") + ;; + -conc) + shift + CONC_LIST=() + while [[ $# -gt 0 && ! "$1" =~ ^- ]]; do + CONC_LIST+=("$1") + shift + done + [[ ${#CONC_LIST[@]} -eq 0 ]] && CONC_LIST=(8) + ;; + -result-dir) + RESULT_DIR="$2" + shift 2 + ;; + -h|--help) + usage + ;; + *) + echo "Unknown option: $1" + usage + ;; + esac +done + +# Validate TP +if ! [[ "$TP" =~ ^[0-9]+$ ]]; then + echo "Error: -tp must be a positive integer, got: $TP" + exit 1 +fi + +# Validate each sl pair (isl,osl) +for pair in "${SL_LIST[@]}"; do + if ! [[ "$pair" =~ ^[0-9]+,[0-9]+$ ]]; then + echo "Error: -sl pair must be isl,osl (e.g. 1024,1024), got: $pair" + exit 1 + fi +done + +# Validate each conc +for c in "${CONC_LIST[@]}"; do + if ! [[ "$c" =~ ^[0-9]+$ ]]; then + echo "Error: -conc values must be positive integers, got: $c" + exit 1 + fi +done + +# Ensure result dir exists and ends with / +[[ "${RESULT_DIR}" != */ ]] && RESULT_DIR="${RESULT_DIR}/" +mkdir -p "$RESULT_DIR" + +# Set optional env vars +export MODEL="${MODEL:-Qwen/Qwen3.5-397B-A17B}" +export RANDOM_RANGE_RATIO="${RANDOM_RANGE_RATIO:-0.1}" + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars MODEL RANDOM_RANGE_RATIO + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +echo "Config: TP=$TP, SL=${SL_LIST[*]}, CONC=${CONC_LIST[*]}, RESULT_DIR=$RESULT_DIR" + +hf download "$MODEL" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +# Start GPU monitoring +start_gpu_monitor + +python3 -m sglang.launch_server \ + --attention-backend triton \ + --model-path $MODEL \ + --host=0.0.0.0 \ + --port $PORT \ + --tensor-parallel-size $TP \ + --trust-remote-code \ + --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# Loop over (isl,osl) and conc +for sl_pair in "${SL_LIST[@]}"; do + IFS=',' read -r ISL OSL <<< "$sl_pair" + for CONC in "${CONC_LIST[@]}"; do + RESULT_FILENAME="result_TP${TP}_CONC${CONC}_ISL${ISL}_OSL${OSL}.json" + echo "Running: ISL=$ISL OSL=$OSL CONC=$CONC -> $RESULT_FILENAME" + + run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir "$RESULT_DIR" + + if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary + fi + done +done + +stop_gpu_monitor +echo "Done. Results in $RESULT_DIR" +echo "" +echo "To summarize results into a markdown table, run (from repo root):" +echo " python3 utils/summarize_interactive_results.py $RESULT_DIR -o summary.md" diff --git a/utils/summarize_interactive_results.py b/utils/summarize_interactive_results.py new file mode 100644 index 000000000..bccd8bbe6 --- /dev/null +++ b/utils/summarize_interactive_results.py @@ -0,0 +1,318 @@ +#!/usr/bin/env python3 +""" +Process raw benchmark result JSONs from the interactive script and summarize into a markdown table. + +Follows the same processing flow as benchmark-tmpl.yml and e2e-tests.yml: +1. Process each result_*.json (raw benchmark_serving output) into agg format +2. Collect all combinations +3. Output markdown table via tabulate + +Usage: + python utils/summarize_interactive_results.py [options] + +Examples: + python utils/summarize_interactive_results.py /workspace/ + python utils/summarize_interactive_results.py ./results -o summary.md + python utils/summarize_interactive_results.py ./results --hw mi355x --model-prefix qwen3.5 +""" +import argparse +import json +import re +import sys +from pathlib import Path +from typing import Any, Optional + +try: + from tabulate import tabulate +except ImportError: + tabulate = None + + +# Filename pattern: result_TP{TP}_CONC{CONC}_ISL{ISL}_OSL{OSL}.json +RESULT_PATTERN = re.compile( + r"result_TP(\d+)_CONC(\d+)_ISL(\d+)_OSL(\d+)\.json", re.IGNORECASE +) + + +def parse_result_filename(path: Path) -> Optional[tuple[int, int, int, int]]: + """Extract TP, CONC, ISL, OSL from filename. Returns (tp, conc, isl, osl) or None.""" + m = RESULT_PATTERN.match(path.name) + if m: + return int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(4)) + return None + + +def process_raw_result( + raw: dict[str, Any], + tp: int, + conc: int, + isl: int, + osl: int, + *, + hw: str = "mi355x", + model_prefix: str = "qwen3.5", + framework: str = "sglang", + precision: str = "bf16", + image: str = "", +) -> dict[str, Any]: + """ + Convert raw benchmark_serving JSON to agg format (same schema as process_result.py output). + """ + model_id = raw.get("model_id", "unknown") + total_tput = float(raw.get("total_token_throughput", 0)) + output_tput = float(raw.get("output_throughput", 0)) + max_conc = int(raw.get("max_concurrency", conc)) + + data: dict[str, Any] = { + "hw": hw, + "conc": max_conc, + "image": image, + "model": model_id, + "infmax_model_prefix": model_prefix, + "framework": framework, + "precision": precision, + "spec_decoding": "none", + "disagg": False, + "isl": isl, + "osl": osl, + "is_multinode": False, + "tp": tp, + "ep": 1, + "dp_attention": False, + "tput_per_gpu": total_tput / tp, + "output_tput_per_gpu": output_tput / tp, + "input_tput_per_gpu": (total_tput - output_tput) / tp, + } + + # Convert *_ms to seconds (process_result logic) + for key, value in raw.items(): + if key.endswith("_ms") and isinstance(value, (int, float)): + data[key.replace("_ms", "")] = float(value) / 1000.0 + if "tpot" in key and isinstance(value, (int, float)) and value > 0: + data[key.replace("_ms", "").replace("tpot", "intvty")] = 1000.0 / float(value) + + # Ensure required fields for summarize (fallbacks if raw uses different keys) + if "median_ttft" not in data and "median_ttft_ms" in raw: + data["median_ttft"] = float(raw["median_ttft_ms"]) / 1000.0 + if "median_tpot" not in data and "median_tpot_ms" in raw: + data["median_tpot"] = float(raw["median_tpot_ms"]) / 1000.0 + if "median_e2el" not in data and "median_e2el_ms" in raw: + data["median_e2el"] = float(raw["median_e2el_ms"]) / 1000.0 + if "median_intvty" not in data and "median_tpot_ms" in raw: + data["median_intvty"] = 1000.0 / float(raw["median_tpot_ms"]) + data.setdefault("median_ttft", 0.0) + data.setdefault("median_tpot", 0.0) + data.setdefault("median_e2el", 0.0) + data.setdefault("median_intvty", 0.0) + + return data + + +def load_and_process( + result_dir: Path, + *, + hw: str = "mi355x", + model_prefix: str = "qwen3.5", + framework: str = "sglang", + precision: str = "bf16", + image: str = "", +) -> list[dict[str, Any]]: + """Load all result_*.json files, process, and return agg records.""" + agg_results: list[dict[str, Any]] = [] + + for path in sorted(result_dir.rglob("*.json")): + parsed = parse_result_filename(path) + if not parsed: + continue + + tp, conc, isl, osl = parsed + try: + with open(path) as f: + raw = json.load(f) + except (json.JSONDecodeError, OSError) as e: + print(f"Warning: Skipping {path}: {e}", file=sys.stderr) + continue + + if "total_token_throughput" not in raw or "model_id" not in raw: + print(f"Warning: Skipping {path}: missing required fields", file=sys.stderr) + continue + + agg = process_raw_result( + raw, + tp, + conc, + isl, + osl, + hw=hw, + model_prefix=model_prefix, + framework=framework, + precision=precision, + image=image, + ) + agg_results.append(agg) + + return agg_results + + +def to_markdown_table(results: list[dict[str, Any]]) -> str: + """Format results as markdown table (same columns as summarize.py single-node).""" + if not results: + return "No results to display.\n" + + results.sort( + key=lambda r: ( + r["infmax_model_prefix"], + r["hw"], + r["framework"], + r["precision"], + r["isl"], + r["osl"], + r["tp"], + r["ep"], + r["conc"], + ) + ) + + headers = [ + "Model", + "Served Model", + "Hardware", + "Framework", + "Precision", + "ISL", + "OSL", + "TP", + "EP", + "DP Attention", + "Conc", + "TTFT (ms)", + "TPOT (ms)", + "Interactivity (tok/s/user)", + "E2EL (s)", + "TPUT per GPU", + "Output TPUT per GPU", + "Input TPUT per GPU", + ] + + rows = [ + [ + r["infmax_model_prefix"], + r["model"], + r["hw"].upper(), + r["framework"].upper(), + r["precision"].upper(), + r["isl"], + r["osl"], + r["tp"], + r["ep"], + r["dp_attention"], + r["conc"], + f"{r['median_ttft'] * 1000:.4f}", + f"{r['median_tpot'] * 1000:.4f}", + f"{r['median_intvty']:.4f}", + f"{r['median_e2el']:.4f}", + f"{r['tput_per_gpu']:.4f}", + f"{r['output_tput_per_gpu']:.4f}", + f"{r['input_tput_per_gpu']:.4f}", + ] + for r in results + ] + + if tabulate: + return tabulate(rows, headers=headers, tablefmt="github") + # Fallback: simple markdown table + sep = "|" + "|".join(["---"] * len(headers)) + "|" + header_row = "|" + "|".join(headers) + "|" + data_rows = "\n".join("|" + "|".join(str(c) for c in row) + "|" for row in rows) + return f"{header_row}\n{sep}\n{data_rows}" + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Process interactive benchmark results and output markdown table" + ) + parser.add_argument( + "result_dir", + type=Path, + help="Directory containing result_TP*_CONC*_ISL*_OSL*.json files", + ) + parser.add_argument( + "-o", + "--output", + type=Path, + default=None, + help="Write markdown to file (default: stdout)", + ) + parser.add_argument( + "--hw", + type=str, + default="mi355x", + help="Hardware label (default: mi355x)", + ) + parser.add_argument( + "--model-prefix", + type=str, + default="qwen3.5", + help="Model prefix (default: qwen3.5)", + ) + parser.add_argument( + "--framework", + type=str, + default="sglang", + help="Framework (default: sglang)", + ) + parser.add_argument( + "--precision", + type=str, + default="bf16", + help="Precision (default: bf16)", + ) + parser.add_argument( + "--image", + type=str, + default="", + help="Docker image (optional)", + ) + parser.add_argument( + "--agg-json", + type=Path, + default=None, + help="Also write aggregated JSON to file (for collect_results compatibility)", + ) + + args = parser.parse_args() + + if not args.result_dir.is_dir(): + print(f"Error: {args.result_dir} is not a directory", file=sys.stderr) + sys.exit(1) + + results = load_and_process( + args.result_dir, + hw=args.hw, + model_prefix=args.model_prefix, + framework=args.framework, + precision=args.precision, + image=args.image, + ) + + if not results: + print("No result files found matching result_TP*_CONC*_ISL*_OSL*.json", file=sys.stderr) + sys.exit(1) + + md = to_markdown_table(results) + full_output = f"## Interactive Benchmark Results\n\n{md}\n" + + if args.output: + args.output.write_text(full_output) + print(f"Wrote summary to {args.output}", file=sys.stderr) + else: + print(full_output) + + if args.agg_json: + with open(args.agg_json, "w") as f: + json.dump(results, f, indent=2) + print(f"Wrote aggregated JSON to {args.agg_json}", file=sys.stderr) + + +if __name__ == "__main__": + main() From eb9298a51887c947f74068a66f2a8828036d09f0 Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Fri, 20 Mar 2026 03:35:05 -0500 Subject: [PATCH 09/11] Update qwen3.5 benchmark script for improved configuration and logging - Changed RESULT_DIR and SERVER_LOG paths to a user-specific directory for better organization. - Added new parameters for memory fraction, chunked prefill size, max prefill tokens, and context length to enhance server launch configuration. - Introduced dynamic SCHEDULER_RECV_INTERVAL based on concurrency level to optimize performance during benchmarking. --- .../qwen3.5_bf16_mi355x_interactive.sh | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x_interactive.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x_interactive.sh index f0036d1db..eb1e6cac1 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi355x_interactive.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi355x_interactive.sh @@ -23,7 +23,7 @@ set -e TP=8 SL_LIST=("1024,1024" "8192,1024" "1024,8192") CONC_LIST=(8) -RESULT_DIR="/workspace/" +RESULT_DIR="/home/zhenchen/projects/InferenceX/" usage() { echo "Usage: $0 [OPTIONS]" @@ -37,7 +37,7 @@ usage() { echo "Examples:" echo " $0" echo " $0 -tp 8 -sl 1024,1024 8192,1024 -conc 8 16 32" - echo " $0 -result-dir /workspace/results" + echo " $0 -result-dir /home/zhenchen/projects/InferenceX/results" exit 1 } @@ -122,20 +122,36 @@ echo "Config: TP=$TP, SL=${SL_LIST[*]}, CONC=${CONC_LIST[*]}, RESULT_DIR=$RESULT hf download "$MODEL" -SERVER_LOG=/workspace/server.log +SERVER_LOG=/home/zhenchen/projects/InferenceX/server.log PORT=${PORT:-8888} +MEM_FRAC_STATIC=0.82 +CHUNKED_PREFILL_SIZE=32768 +MAX_PREFILL_TOKENS=32768 +CUDA_GRAPH_MAX_BATCH_SIZE=$CONC +MAX_RUNNING_REQUESTS=128 +CONTEXT_LENGTH=$((ISL + OSL + 20)) # Start GPU monitoring start_gpu_monitor -python3 -m sglang.launch_server \ +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ --attention-backend triton \ --model-path $MODEL \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 & + --mem-fraction-static $MEM_FRAC_STATIC \ + --chunked-prefill-size $CHUNKED_PREFILL_SIZE \ + --max-prefill-tokens $MAX_PREFILL_TOKENS \ + --cuda-graph-max-batch-size $CUDA_GRAPH_MAX_BATCH_SIZE \ + --max-running-requests $MAX_RUNNING_REQUESTS \ + --enable-aiter-allreduce-fusion \ + --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ + --tokenizer-worker-num 6 \ + --stream-interval 30 \ + --context-length $CONTEXT_LENGTH > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -148,6 +164,12 @@ for sl_pair in "${SL_LIST[@]}"; do RESULT_FILENAME="result_TP${TP}_CONC${CONC}_ISL${ISL}_OSL${OSL}.json" echo "Running: ISL=$ISL OSL=$OSL CONC=$CONC -> $RESULT_FILENAME" + if [[ $CONC -ge 16 ]]; then + SCHEDULER_RECV_INTERVAL=30 + else + SCHEDULER_RECV_INTERVAL=10 + fi + run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ From 7a864e746cf05671c587863d3dfc39d1e399fa7e Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Fri, 20 Mar 2026 11:25:51 -0500 Subject: [PATCH 10/11] Enhance qwen3.5 benchmark script with additional configuration options - Introduced new parameters for memory fraction, chunked prefill size, max prefill tokens, and context length to improve server launch settings. - Implemented dynamic SCHEDULER_RECV_INTERVAL based on concurrency level to optimize request handling. - Updated the launch command to include new flags for enhanced functionality and performance during benchmarking. --- benchmarks/single_node/qwen3.5_bf16_mi355x.sh | 28 ++++++++++++++-- benchmarks/single_node/qwen3.5_fp8_mi355x.sh | 32 ++++++++++++++++--- 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh index 87df79324..daa9b1f37 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh @@ -19,6 +19,19 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +MEM_FRAC_STATIC=0.82 +CHUNKED_PREFILL_SIZE=32768 +MAX_PREFILL_TOKENS=32768 +CUDA_GRAPH_MAX_BATCH_SIZE=$CONC +MAX_RUNNING_REQUESTS=128 +CONTEXT_LENGTH=$((ISL + OSL + 20)) + +# Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls. +if [[ $CONC -ge 16 ]]; then + SCHEDULER_RECV_INTERVAL=30 +else + SCHEDULER_RECV_INTERVAL=10 +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -26,15 +39,24 @@ start_gpu_monitor export SGLANG_FUSED_QK_NORM_ROPE_CACHE_PTS_QUANT_SHUFFLE=1 cd /sgl-workspace/sglang -python3 -m sglang.launch_server \ +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ --attention-backend triton \ --model-path $MODEL \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --enable-fused-qk-norm-rope \ - --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 & + --mem-fraction-static $MEM_FRAC_STATIC \ + --chunked-prefill-size $CHUNKED_PREFILL_SIZE \ + --max-prefill-tokens $MAX_PREFILL_TOKENS \ + --cuda-graph-max-batch-size $CUDA_GRAPH_MAX_BATCH_SIZE \ + --max-running-requests $MAX_RUNNING_REQUESTS \ + --enable-aiter-allreduce-fusion \ + --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ + --tokenizer-worker-num 6 \ + --stream-interval 30 \ + --context-length $CONTEXT_LENGTH > $SERVER_LOG 2>&1 & SERVER_PID=$! cd - diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh index 1ff52e1cc..b6258a67c 100644 --- a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh @@ -19,21 +19,45 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +MEM_FRAC_STATIC=0.82 +CHUNKED_PREFILL_SIZE=32768 +MAX_PREFILL_TOKENS=32768 +CUDA_GRAPH_MAX_BATCH_SIZE=$CONC +MAX_RUNNING_REQUESTS=128 +CONTEXT_LENGTH=$((ISL + OSL + 20)) -export SGLANG_FUSED_QK_NORM_ROPE_CACHE_PTS_QUANT_SHUFFLE=1 +# Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls. +if [[ $CONC -ge 16 ]]; then + SCHEDULER_RECV_INTERVAL=30 +else + SCHEDULER_RECV_INTERVAL=10 +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor -python3 -m sglang.launch_server \ +export SGLANG_FUSED_QK_NORM_ROPE_CACHE_PTS_QUANT_SHUFFLE=1 + +cd /sgl-workspace/sglang +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ --attention-backend triton \ --model-path $MODEL \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --enable-fused-qk-norm-rope \ - --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 & + --mem-fraction-static $MEM_FRAC_STATIC \ + --chunked-prefill-size $CHUNKED_PREFILL_SIZE \ + --max-prefill-tokens $MAX_PREFILL_TOKENS \ + --cuda-graph-max-batch-size $CUDA_GRAPH_MAX_BATCH_SIZE \ + --max-running-requests $MAX_RUNNING_REQUESTS \ + --enable-aiter-allreduce-fusion \ + --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ + --tokenizer-worker-num 6 \ + --stream-interval 30 \ + --kv-cache-dtype fp8_e4m3 \ + --context-length $CONTEXT_LENGTH > $SERVER_LOG 2>&1 & SERVER_PID=$! From 6407c08533544a05b8e74fc1c986c15c2f335e7a Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Fri, 20 Mar 2026 11:30:19 -0500 Subject: [PATCH 11/11] Update qwen3.5 benchmark scripts to use shortened flag for CUDA graph batch size - Changed the flag from --cuda-graph-max-batch-size to --cuda-graph-max-bs for consistency and brevity in both bf16 and fp8 benchmark scripts. --- benchmarks/single_node/qwen3.5_bf16_mi355x.sh | 2 +- benchmarks/single_node/qwen3.5_fp8_mi355x.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh index daa9b1f37..d98818839 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh @@ -50,7 +50,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ --mem-fraction-static $MEM_FRAC_STATIC \ --chunked-prefill-size $CHUNKED_PREFILL_SIZE \ --max-prefill-tokens $MAX_PREFILL_TOKENS \ - --cuda-graph-max-batch-size $CUDA_GRAPH_MAX_BATCH_SIZE \ + --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE \ --max-running-requests $MAX_RUNNING_REQUESTS \ --enable-aiter-allreduce-fusion \ --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh index b6258a67c..44c38b35d 100644 --- a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh @@ -50,7 +50,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ --mem-fraction-static $MEM_FRAC_STATIC \ --chunked-prefill-size $CHUNKED_PREFILL_SIZE \ --max-prefill-tokens $MAX_PREFILL_TOKENS \ - --cuda-graph-max-batch-size $CUDA_GRAPH_MAX_BATCH_SIZE \ + --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE \ --max-running-requests $MAX_RUNNING_REQUESTS \ --enable-aiter-allreduce-fusion \ --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \