From 924f6941903b19646ea0c528a8e1c91af4f7cec6 Mon Sep 17 00:00:00 2001
From: "Chen, Todd" <zhenchen@amd.com>
Date: Mon, 16 Mar 2026 12:22:47 -0500
Subject: [PATCH 01/11] Enhance safety and logging in launch_mi355x-amds.sh

- Introduced a safe_sudo_rm function to prevent path injection when removing files under the workspace.
- Updated BENCHMARK_LOGS_DIR to ensure it is always under GITHUB_WORKSPACE, adding error handling for invalid paths.
- Replaced direct sudo rm calls with safe_sudo_rm to improve security.
- Modified the log collection script to use a temporary file, ensuring cleanup after execution.
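- Added checks to ensure workspace paths are absolute and valid, enhancing robustness of the script.
- Made HF_HUB_CACHE_MOUNT overridable from the environment (the default is still /docker/huggingface/hub).
- Single-node runs now fail with an explicit error when no SLURM job ID is found after salloc.
- Added a Docker fallback for single-node runs on hosts where salloc/srun/squeue are not available.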
---
 runners/launch_mi355x-amds.sh | 168 +++++++++++++++++++++++-----------
 1 file changed, 113 insertions(+), 55 deletions(-)

diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
index f2471466b..692d7d0a0 100644
--- a/runners/launch_mi355x-amds.sh
+++ b/runners/launch_mi355x-amds.sh
@@ -1,5 +1,16 @@
 #!/usr/bin/env bash
 
+# Sudo rm only for paths under workspace; guards against path injection / escaping
+safe_sudo_rm() {
+    local target="$1"
+    local workspace="${2:-$GITHUB_WORKSPACE}"
+    if [[ -z "$workspace" || -z "$target" ]]; then return 0; fi
+    if [[ "$workspace" != /* ]]; then return 0; fi
+    if [[ "$target" != "$workspace"/* ]]; then return 0; fi
+    if [[ "$target" == *".."* ]]; then return 0; fi
+    sudo rm -rf "$target" 2>/dev/null || true
+}
+
 scancel_sync() {
     local jobid=$1
     local timeout=${2:-600}
@@ -46,10 +57,14 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
     export ISL="$ISL"
     export OSL="$OSL"
 
-    # Logs go to BENCHMARK_LOGS_DIR (NFS-accessible, outside the repo tree)
+    # Logs go to BENCHMARK_LOGS_DIR (must be under workspace - no host modification outside)
     export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$GITHUB_WORKSPACE/benchmark_logs}"
+    if [[ -z "$GITHUB_WORKSPACE" || -z "$BENCHMARK_LOGS_DIR" ]] || [[ "$BENCHMARK_LOGS_DIR" != "$GITHUB_WORKSPACE"/* ]]; then
+        echo "ERROR: BENCHMARK_LOGS_DIR must be under GITHUB_WORKSPACE. Got BENCHMARK_LOGS_DIR=$BENCHMARK_LOGS_DIR" >&2
+        exit 1
+    fi
     mkdir -p "$BENCHMARK_LOGS_DIR"
-    sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true
+    safe_sudo_rm "$BENCHMARK_LOGS_DIR/logs" "$GITHUB_WORKSPACE"
 
     SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi355x_${FRAMEWORK}.sh"
     if [[ "$FRAMEWORK" == "sglang-disagg" ]]; then
@@ -101,14 +116,16 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
     # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement
     # Find the latest log directory that contains the data
 
-    cat > collect_latest_results.py <<'PY'
+    COLLECT_SCRIPT=$(mktemp)
+    trap "rm -f '$COLLECT_SCRIPT'" EXIT
+    cat > "$COLLECT_SCRIPT" <<'PY'
 import os, sys
 sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4])
 for path in sorted([f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]:
     print(path)
 PY
 
-    LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1)
+    LOGS_DIR=$(python3 "$COLLECT_SCRIPT" "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1)
     if [ -z "$LOGS_DIR" ]; then
         echo "No logs directory found for ISL=${ISL}, OSL=${OSL}"
         exit 1
@@ -117,15 +134,13 @@ PY
     echo "Found logs directory: $LOGS_DIR"
     ls -la "$LOGS_DIR"
 
-    # Result JSON are contained within the result directory
-    for result_file in $(find $LOGS_DIR -type f); do
-        # result_file should directly be isl_ISL_osl_OSL_concurrency_CONC_req_rate_R_gpus_N_ctx_M_gen_N.json
-        file_name=$(basename $result_file)
-        if [ -f $result_file ]; then
-            # Copy the result file to workspace with a unique name
+    # Result JSON are contained within the result directory (copy only into workspace)
+    for result_file in $(find "$LOGS_DIR" -type f); do
+        file_name=$(basename "$result_file")
+        if [[ -f "$result_file" && -n "$GITHUB_WORKSPACE" && "$GITHUB_WORKSPACE" == /* ]]; then
             WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}"
             echo "Found result file ${result_file}. Copying it to ${WORKSPACE_RESULT_FILE}"
-            cp $result_file $WORKSPACE_RESULT_FILE
+            cp "$result_file" "$WORKSPACE_RESULT_FILE"
         fi
     done
 
@@ -136,10 +151,10 @@ PY
     set -x
     echo "Canceled the slurm job $JOB_ID"
 
-    sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true
+    safe_sudo_rm "$BENCHMARK_LOGS_DIR/logs" "$GITHUB_WORKSPACE"
 
-    # Upload logs as artifact if running in GitHub Actions
-    if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
+    # Upload logs as artifact if running in GitHub Actions (workspace only)
+    if [[ -n "${GITHUB_ACTIONS:-}" && -n "$GITHUB_WORKSPACE" && "$GITHUB_WORKSPACE" == /* ]]; then
         ARTIFACT_DIR="$GITHUB_WORKSPACE/benchmark_artifacts"
         mkdir -p "$ARTIFACT_DIR"
         cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$ARTIFACT_DIR/" 2>/dev/null || true
@@ -148,53 +163,96 @@ PY
 
 else
 
-    export HF_HUB_CACHE_MOUNT="/docker/huggingface/hub"
+    export HF_HUB_CACHE_MOUNT="${HF_HUB_CACHE_MOUNT:-/docker/huggingface/hub}"
     export PORT_OFFSET=${RUNNER_NAME: -1}
     export PORT=$(( 8888 + ${PORT_OFFSET} ))
     FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "atom" ]] && printf '_atom' || printf '')
     SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
+    BENCHMARK_SCRIPT="benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi355x${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh"
+    WORKSPACE="${GITHUB_WORKSPACE:-$(pwd)}"
+    if [[ -z "$WORKSPACE" || "$WORKSPACE" != /* ]]; then
+        echo "ERROR: WORKSPACE must be an absolute path. Got: $WORKSPACE" >&2
+        exit 1
+    fi
+    mkdir -p "$HF_HUB_CACHE_MOUNT"
+
+    if command -v salloc &>/dev/null && command -v srun &>/dev/null && command -v squeue &>/dev/null; then
+        # SLURM path: allocate via salloc, run via srun with enroot/squash container
+        PARTITION="compute"
+        SQUASH_FILE="/var/lib/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+        LOCK_FILE="${SQUASH_FILE}.lock"
+
+        set -x
+        salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=128 --time=180 --no-shell --job-name="$RUNNER_NAME"
+        JOB_ID=$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1)
+        if [[ -z "$JOB_ID" ]]; then
+            echo "ERROR: salloc failed or no job found for $RUNNER_NAME. Check partition=$PARTITION and GPU availability." >&2
+            exit 1
+        fi
 
-    PARTITION="compute"
-    SQUASH_FILE="/var/lib/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
-    LOCK_FILE="${SQUASH_FILE}.lock"
-
-    set -x
-    salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=128 --time=180 --no-shell --job-name="$RUNNER_NAME"
-    JOB_ID=$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1)
-
-    # Remove leftover bmk-server from previous run so we can reuse the name (targeted cleanup only)
-    srun --jobid=$JOB_ID bash -c "docker rm -f bmk-server 2>/dev/null || true"
-
-    # Use flock to serialize concurrent imports to the same squash file
-    srun --jobid=$JOB_ID bash -c "
-        exec 9>\"$LOCK_FILE\"
-        flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
-        if [[ \"$FRAMEWORK\" == \"atom\" ]]; then
-            rm -f \"$SQUASH_FILE\"
+        srun --jobid=$JOB_ID bash -c "docker rm -f bmk-server 2>/dev/null || true"
+
+        # Note: This block runs on the compute node and modifies /var/lib/squash (enroot cache).
+        # Only squash files under /var/lib/squash/ are touched - no user data or workspace.
+        srun --jobid=$JOB_ID bash -c "
+            exec 9>\"$LOCK_FILE\"
+            flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
+            if [[ \"$SQUASH_FILE\" != /var/lib/squash/* ]]; then exit 1; fi
+            if [[ \"$FRAMEWORK\" == \"atom\" ]]; then
+                rm -f \"$SQUASH_FILE\"
+            fi
+            if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
+                echo 'Squash file already exists and is valid, skipping import'
+            else
+                rm -f \"$SQUASH_FILE\"
+                enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
+            fi
+        "
+
+        export VLLM_CACHE_ROOT="/it-share/gharunners/.cache/vllm"
+
+        srun --jobid=$JOB_ID \
+            --container-image=$SQUASH_FILE \
+            --container-mounts=$WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+            --container-mount-home \
+            --container-writable \
+            --container-workdir=/workspace/ \
+            --no-container-entrypoint --export=ALL \
+            bash "$BENCHMARK_SCRIPT"
+
+        scancel $JOB_ID
+
+        # Remove gpucore temp files only within workspace (no host modification outside)
+        if ls "$WORKSPACE"/gpucore.* 1> /dev/null 2>&1; then
+            echo "gpucore files exist. not good"
+            for f in "$WORKSPACE"/gpucore.*; do
+                [[ -e "$f" ]] && safe_sudo_rm "$f" "$WORKSPACE"
+            done
         fi
-        if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
-            echo 'Squash file already exists and is valid, skipping import'
-        else
-            rm -f \"$SQUASH_FILE\"
-            enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
+    else
+        # Non-SLURM path: run directly with Docker (no salloc/srun)
+        if ! command -v docker &>/dev/null; then
+            echo "ERROR: Neither SLURM nor Docker found. Install SLURM (for cluster) or Docker (for standalone)." >&2
+            exit 1
         fi
-    "
-
-    export VLLM_CACHE_ROOT="/it-share/gharunners/.cache/vllm"
-
-    srun --jobid=$JOB_ID \
-        --container-image=$SQUASH_FILE \
-        --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
-        --container-mount-home \
-        --container-writable \
-        --container-workdir=/workspace/ \
-        --no-container-entrypoint --export=ALL \
-        bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi355x${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
-
-    scancel $JOB_ID
-
-    if ls gpucore.* 1> /dev/null 2>&1; then
-        echo "gpucore files exist. not good"
-        rm -f gpucore.*
+        echo "SLURM not available; using Docker directly."
+
+        server_name="bmk-server"
+        docker rm -f "$server_name" 2>/dev/null || true
+
+        set -x
+        docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \
+            --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \
+            --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
+            -v "$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE" \
+            -v "$WORKSPACE:/workspace/" -w /workspace/ \
+            -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e EP_SIZE -e DP_ATTENTION -e CONC \
+            -e MAX_MODEL_LEN -e PORT=$PORT -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \
+            -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL -e RUNNER_TYPE \
+            -e PROFILE -e SGLANG_TORCH_PROFILER_DIR -e VLLM_TORCH_PROFILER_DIR -e VLLM_RPC_TIMEOUT \
+            -e SPEC_DECODING -e DISAGG \
+            --entrypoint=/bin/bash \
+            "$IMAGE" \
+            "$BENCHMARK_SCRIPT"
     fi
 fi

From 54a1177b90b68bdf6fbfea2851c585f4fa7f832c Mon Sep 17 00:00:00 2001
From: "Chen, Todd" <zhenchen@amd.com>
Date: Wed, 18 Mar 2026 12:10:43 -0500
Subject: [PATCH 02/11] Add aiter and sglang configuration options to workflows

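- Introduced new inputs for the aiter and sglang repository URLs and refs: aiter-remote, aiter-ref, sglang-remote and sglang-ref in benchmark-tmpl.yml; aiter-repo, aiter-ref, sglang-repo and sglang-ref in e2e-tests.yml.
- benchmark-tmpl.yml exports the new inputs as the AITER_REMOTE, AITER_REF, SGLANG_REMOTE and SGLANG_REF environment variables; e2e-tests.yml passes its inputs on as aiter-remote, aiter-ref, sglang-remote and sglang-ref.
- Added benchmarks/single_node/patch_sgl_components.sh, which reinstalls aiter and/or switches the sglang checkout when AITER_REF / SGLANG_REF are set, and modified launch_mi355x-amds.sh to run it inside the container before the single-node benchmark script.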
---
 .github/workflows/benchmark-tmpl.yml          | 20 ++++++++
 .github/workflows/e2e-tests.yml               | 38 ++++++++++++++-
 .../single_node/patch_sgl_components.sh       | 47 +++++++++++++++++++
 runners/launch_mi355x-amds.sh                 |  9 +++-
 4 files changed, 111 insertions(+), 3 deletions(-)
 create mode 100644 benchmarks/single_node/patch_sgl_components.sh

diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index c1c74ffe3..e55de11d0 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -62,6 +62,22 @@ on:
         description: "Git ref (branch/sha) to checkout"
         required: false
         type: string
+      aiter-remote:
+        description: "Repo URL to reinstall aiter from"
+        required: false
+        type: string
+      aiter-ref:
+        description: "Branch or commit for aiter"
+        required: false
+        type: string
+      sglang-remote:
+        description: "Repo URL to reinstall sglang from"
+        required: false
+        type: string
+      sglang-ref:
+        description: "Branch or commit for sglang"
+        required: false
+        type: string
 
 env:
   RANDOM_RANGE_RATIO: 0.8
@@ -83,6 +99,10 @@ env:
   SPEC_DECODING: ${{ inputs.spec-decoding }}
   DISAGG: ${{ inputs.disagg }}
   RUN_EVAL: ${{ inputs.run-eval }}
+  AITER_REMOTE: ${{ inputs.aiter-remote }}
+  AITER_REF: ${{ inputs.aiter-ref }}
+  SGLANG_REMOTE: ${{ inputs.sglang-remote }}
+  SGLANG_REF: ${{ inputs.sglang-ref }}
 
 permissions:
   contents: read
diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml
index 0f26c7c41..dee5ab153 100644
--- a/.github/workflows/e2e-tests.yml
+++ b/.github/workflows/e2e-tests.yml
@@ -16,6 +16,22 @@ on:
                 description: "Ref (branch/sha) to checkout for generating configs"
                 required: false
                 type: string
+            aiter-repo:
+                description: "Repo URL to reinstall aiter from (e.g. https://github.com/user/aiter.git)"
+                required: false
+                type: string
+            aiter-ref:
+                description: "Branch or commit for aiter"
+                required: false
+                type: string
+            sglang-repo:
+                description: "Repo URL to reinstall sglang from (e.g. https://github.com/user/sglang)"
+                required: false
+                type: string
+            sglang-ref:
+                description: "Branch or commit for sglang"
+                required: false
+                type: string
     workflow_call:
         inputs:
             generate-cli-command:
@@ -30,6 +46,22 @@ on:
                 description: "Ref (branch/sha) to checkout for generating configs"
                 required: false
                 type: string
+            aiter-repo:
+                description: "Repo URL to reinstall aiter from"
+                required: false
+                type: string
+            aiter-ref:
+                description: "Branch or commit for aiter"
+                required: false
+                type: string
+            sglang-repo:
+                description: "Repo URL to reinstall sglang from"
+                required: false
+                type: string
+            sglang-ref:
+                description: "Branch or commit for sglang"
+                required: false
+                type: string
 
 jobs:
     get-jobs:
@@ -124,7 +156,11 @@ jobs:
             spec-decoding: ${{ matrix.config.spec-decoding }}
             disagg: ${{ matrix.config.disagg }}
             run-eval: ${{ matrix.config.run-eval }}
-            ref: ${{ inputs.ref }}
+            ref: ${{ inputs.ref || github.event.inputs.ref }}
+            aiter-remote: ${{ inputs.aiter-repo || github.event.inputs['aiter-repo'] }}
+            aiter-ref: ${{ inputs.aiter-ref || github.event.inputs['aiter-ref'] }}
+            sglang-remote: ${{ inputs.sglang-repo || github.event.inputs['sglang-repo'] }}
+            sglang-ref: ${{ inputs.sglang-ref || github.event.inputs['sglang-ref'] }}
 
     collect-results:
         needs: [test-sweep-multi-node, test-sweep-single-node]
diff --git a/benchmarks/single_node/patch_sgl_components.sh b/benchmarks/single_node/patch_sgl_components.sh
new file mode 100644
index 000000000..8b7961d47
--- /dev/null
+++ b/benchmarks/single_node/patch_sgl_components.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+# Reinstall aiter and sglang from custom remote/ref.
+# Expects AITER_REMOTE, AITER_REF, SGLANG_REMOTE, SGLANG_REF from env.
+# If no refs are provided, skip entirely and use default packages in container.
+
+set -e
+
+if [[ -z "$AITER_REF" && -z "$SGLANG_REF" ]]; then
+    echo "No patch refs provided; using default packages in container."
+    exit 0
+fi
+
+work_dir="/sgl-workspace"
+aiter_remote="${AITER_REMOTE:-https://github.com/zhentaocc/aiter.git}"
+aiter_ref="${AITER_REF:-mi35_qwen35_image}"
+sglang_remote="${SGLANG_REMOTE:-https://github.com/sgl-project/sglang}"
+sglang_ref="${SGLANG_REF:-mi35_qwen35_image}"
+
+if [[ ! -d "$work_dir" ]]; then
+    echo "$work_dir not found; assuming image ships correct versions."
+    exit 0
+fi
+
+if [[ -n "$AITER_REF" ]]; then
+    pip uninstall amd-aiter -y
+    cd "$work_dir"
+    rm -rf aiter
+    git clone --recursive "$aiter_remote" aiter
+    cd aiter
+    git fetch origin "$aiter_ref" 2>/dev/null || git fetch origin
+    git checkout "$aiter_ref" 2>/dev/null || git reset --hard "origin/$aiter_ref" 2>/dev/null || git reset --hard "$aiter_ref"
+    rm -rf aiter/jit/*.so 2>/dev/null || true
+    PREBUILD_KERNELS=0 python setup.py develop
+    echo "aiter ($aiter_ref) installed from $aiter_remote"
+else
+    echo "AITER_REF not set; using default aiter in container"
+fi
+
+if [[ -n "$SGLANG_REF" ]]; then  # a ref was requested: fail loudly rather than silently benchmark the default sglang
+    cd "$work_dir/sglang" && [[ -d .git ]] || { echo "ERROR: SGLANG_REF=$SGLANG_REF set but $work_dir/sglang is not a git checkout" >&2; exit 1; }
+    git remote set-url origin "$sglang_remote" 2>/dev/null || git remote add origin "$sglang_remote"
+    git fetch origin "$sglang_ref" 2>/dev/null || git fetch origin
+    git checkout "$sglang_ref" 2>/dev/null || git reset --hard "origin/$sglang_ref" 2>/dev/null || git reset --hard "$sglang_ref"
+    echo "sglang ($sglang_ref) from $sglang_remote"
+else
+    echo "SGLANG_REF not set; using default sglang in container"
+fi
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
index 692d7d0a0..73eb009dc 100644
--- a/runners/launch_mi355x-amds.sh
+++ b/runners/launch_mi355x-amds.sh
@@ -211,6 +211,8 @@ else
 
         export VLLM_CACHE_ROOT="/it-share/gharunners/.cache/vllm"
 
+        # Reinstall aiter/sglang in /sgl-workspace when AITER_REF or SGLANG_REF are set, then run benchmark
+        RUN_CMD="bash benchmarks/single_node/patch_sgl_components.sh && exec bash $BENCHMARK_SCRIPT"
         srun --jobid=$JOB_ID \
             --container-image=$SQUASH_FILE \
             --container-mounts=$WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
@@ -218,7 +220,7 @@ else
             --container-writable \
             --container-workdir=/workspace/ \
             --no-container-entrypoint --export=ALL \
-            bash "$BENCHMARK_SCRIPT"
+            bash -c "$RUN_CMD"
 
         scancel $JOB_ID
 
@@ -241,6 +243,8 @@ else
         docker rm -f "$server_name" 2>/dev/null || true
 
         set -x
+        # Reinstall aiter/sglang in /sgl-workspace when AITER_REF or SGLANG_REF are set, then run benchmark
+        RUN_CMD="bash benchmarks/single_node/patch_sgl_components.sh && exec bash $BENCHMARK_SCRIPT"
         docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \
             --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \
             --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
@@ -251,8 +255,9 @@ else
             -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL -e RUNNER_TYPE \
             -e PROFILE -e SGLANG_TORCH_PROFILER_DIR -e VLLM_TORCH_PROFILER_DIR -e VLLM_RPC_TIMEOUT \
             -e SPEC_DECODING -e DISAGG \
+            -e AITER_REMOTE -e AITER_REF -e SGLANG_REMOTE -e SGLANG_REF \
             --entrypoint=/bin/bash \
             "$IMAGE" \
-            "$BENCHMARK_SCRIPT"
+            -c "$RUN_CMD"
     fi
 fi

From 609236c26b479b714ebfaacebff0221af8a63abd Mon Sep 17 00:00:00 2001
From: "Chen, Todd" <zhenchen@amd.com>
Date: Wed, 18 Mar 2026 12:53:02 -0500
Subject: [PATCH 03/11] Refactor aiter installation process in
 patch_sgl_components.sh

- Reuse the existing checkout at $work_dir/aiter instead of deleting it and re-cloning from scratch.
- Point the checkout's origin at the requested remote (set the URL, or add origin if it doesn't exist) before fetching the ref.
- Remove stale aiter/jit/build and aiter/jit/dist directories before rebuilding.
---
 benchmarks/single_node/patch_sgl_components.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/benchmarks/single_node/patch_sgl_components.sh b/benchmarks/single_node/patch_sgl_components.sh
index 8b7961d47..a8800693d 100644
--- a/benchmarks/single_node/patch_sgl_components.sh
+++ b/benchmarks/single_node/patch_sgl_components.sh
@@ -23,13 +23,13 @@ fi
 
 if [[ -n "$AITER_REF" ]]; then
     pip uninstall amd-aiter -y
-    cd "$work_dir"
-    rm -rf aiter
-    git clone --recursive "$aiter_remote" aiter
-    cd aiter
+    cd "$work_dir/aiter"
+    git remote set-url origin "$aiter_remote" 2>/dev/null || git remote add origin "$aiter_remote"
     git fetch origin "$aiter_ref" 2>/dev/null || git fetch origin
     git checkout "$aiter_ref" 2>/dev/null || git reset --hard "origin/$aiter_ref" 2>/dev/null || git reset --hard "$aiter_ref"
     rm -rf aiter/jit/*.so 2>/dev/null || true
+    rm -rf aiter/jit/build 2>/dev/null || true
+    rm -rf aiter/jit/dist 2>/dev/null || true
     PREBUILD_KERNELS=0 python setup.py develop
     echo "aiter ($aiter_ref) installed from $aiter_remote"
 else

From 16b7634d20e53e37925e3ef0a54bd9778ec2a7ef Mon Sep 17 00:00:00 2001
From: "Chen, Todd" <zhenchen@amd.com>
Date: Wed, 18 Mar 2026 21:28:24 -0500
Subject: [PATCH 04/11] Add git submodule initialization for composable_kernel
 in patch_sgl_components.sh

- Added a command to update and initialize the 3rdparty/composable_kernel submodule during the aiter installation process.
- This change enhances the setup flow by ensuring all necessary components are properly initialized before installation.
---
 benchmarks/single_node/patch_sgl_components.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/single_node/patch_sgl_components.sh b/benchmarks/single_node/patch_sgl_components.sh
index a8800693d..fcd6b6e61 100644
--- a/benchmarks/single_node/patch_sgl_components.sh
+++ b/benchmarks/single_node/patch_sgl_components.sh
@@ -30,6 +30,7 @@ if [[ -n "$AITER_REF" ]]; then
     rm -rf aiter/jit/*.so 2>/dev/null || true
     rm -rf aiter/jit/build 2>/dev/null || true
     rm -rf aiter/jit/dist 2>/dev/null || true
+    git submodule update --init --force 3rdparty/composable_kernel 2>/dev/null || true
     PREBUILD_KERNELS=0 python setup.py develop
     echo "aiter ($aiter_ref) installed from $aiter_remote"
 else

From da6a41074cc3ca4133f5bfdf0cd192e2405e151f Mon Sep 17 00:00:00 2001
From: "Chen, Todd" <zhenchen@amd.com>
Date: Wed, 18 Mar 2026 21:29:47 -0500
Subject: [PATCH 05/11] Update qwen3.5 benchmark script to enable fused QK norm
 ROPE and set environment variable

- Added environment variable SGLANG_FUSED_QK_NORM_ROPE_CACHE_PTS_QUANT_SHUFFLE to optimize performance.
- Included the --enable-fused-qk-norm-rope flag in the sglang launch command.
- Launch the server from /sgl-workspace/sglang and change back to the previous directory once it has been started in the background.
---
 benchmarks/single_node/qwen3.5_bf16_mi355x.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh
index f77390707..87df79324 100755
--- a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh
+++ b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh
@@ -23,6 +23,9 @@ PORT=${PORT:-8888}
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
+export SGLANG_FUSED_QK_NORM_ROPE_CACHE_PTS_QUANT_SHUFFLE=1
+
+cd /sgl-workspace/sglang
 python3 -m sglang.launch_server \
     --attention-backend triton \
     --model-path $MODEL \
@@ -30,10 +33,11 @@ python3 -m sglang.launch_server \
     --port $PORT \
     --tensor-parallel-size $TP \
     --trust-remote-code \
+    --enable-fused-qk-norm-rope \
     --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
-
+cd -
 # Wait for server to be ready
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 

From d0f348d5b84d38a12e1aff7c3756a095f57e1e02 Mon Sep 17 00:00:00 2001
From: "Chen, Todd" <zhenchen@amd.com>
Date: Wed, 18 Mar 2026 21:29:47 -0500
Subject: [PATCH 06/11] Update qwen3.5 image version in amd-master.yaml

- Bumped the image for the qwen3.5 mi355x benchmarks (bf16 and fp8 configurations) from v0.5.8.post1 to v0.5.9 (rocm/sgl-dev:v0.5.9-rocm720-mi35x-20260227). The mi325x configuration is not changed by this patch.
---
 .github/configs/amd-master.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 5551860f2..895396b55 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -140,7 +140,7 @@ dsr1-fp8-mi355x-sglang:
     - { tp: 8, conc-start: 4, conc-end: 64 }
 
 qwen3.5-bf16-mi355x-sglang:
-  image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260215
+  image: rocm/sgl-dev:v0.5.9-rocm720-mi35x-20260227
   model: Qwen/Qwen3.5-397B-A17B
   model-prefix: qwen3.5
   runner: mi355x
@@ -228,7 +228,7 @@ qwen3.5-fp8-mi325x-sglang:
     - { tp: 8, conc-start: 4, conc-end: 64 }
 
 qwen3.5-fp8-mi355x-sglang:
-  image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218
+  image: rocm/sgl-dev:v0.5.9-rocm720-mi35x-20260227
   model: Qwen/Qwen3.5-397B-A17B-FP8
   model-prefix: qwen3.5
   runner: mi355x

From 9554b487a98e0696a5c89ebdb22ec7a77dd6652f Mon Sep 17 00:00:00 2001
From: "Chen, Todd" <zhenchen@amd.com>
Date: Wed, 18 Mar 2026 21:31:12 -0500
Subject: [PATCH 07/11] Enable fused QK norm ROPE in qwen3.5 benchmark script

- Set the environment variable SGLANG_FUSED_QK_NORM_ROPE_CACHE_PTS_QUANT_SHUFFLE to optimize performance.
- Added the --enable-fused-qk-norm-rope flag to the sglang launch command for enhanced functionality.
---
 benchmarks/single_node/qwen3.5_fp8_mi355x.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh
index f77390707..1ff52e1cc 100644
--- a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh
@@ -20,6 +20,8 @@ hf download "$MODEL"
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
 
+export SGLANG_FUSED_QK_NORM_ROPE_CACHE_PTS_QUANT_SHUFFLE=1
+
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -30,6 +32,7 @@ python3 -m sglang.launch_server \
     --port $PORT \
     --tensor-parallel-size $TP \
     --trust-remote-code \
+    --enable-fused-qk-norm-rope \
     --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!

From 6313809dae9d4927a55ee7318ae6a676ca315d63 Mon Sep 17 00:00:00 2001
From: "Chen, Todd" <zhenchen@amd.com>
Date: Fri, 20 Mar 2026 02:19:53 -0500
Subject: [PATCH 08/11] Add interactive benchmark script and result
 summarization tool for Qwen3.5

- Introduced `qwen3.5_bf16_mi355x_interactive.sh` for running interactive benchmarks with customizable options for tensor parallel size, sequence lengths, and concurrency.
- Added `summarize_interactive_results.py` to process benchmark results and output them in a markdown table format, enhancing result analysis and reporting capabilities.
---
 .../qwen3.5_bf16_mi355x_interactive.sh        | 174 ++++++++++
 utils/summarize_interactive_results.py        | 318 ++++++++++++++++++
 2 files changed, 492 insertions(+)
 create mode 100755 benchmarks/single_node/qwen3.5_bf16_mi355x_interactive.sh
 create mode 100644 utils/summarize_interactive_results.py

diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x_interactive.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x_interactive.sh
new file mode 100755
index 000000000..f0036d1db
--- /dev/null
+++ b/benchmarks/single_node/qwen3.5_bf16_mi355x_interactive.sh
@@ -0,0 +1,174 @@
+#!/usr/bin/env bash
+#
+# Interactive benchmark script for Qwen3.5-397B-A17B on MI355X.
+# Runs inside Docker with CLI args for TP, sequence lengths, and concurrency.
+#
+# Usage:
+#   ./qwen3.5_bf16_mi355x_interactive.sh [OPTIONS]
+#
+# Options:
+#   -tp N              Tensor parallel size (default: 8)
+#   -sl ISL,OSL ...    Space-separated isl,osl pairs (default: 1024,1024 8192,1024 1024,8192)
+#   -conc N N ...      Space-separated concurrency values (default: 8)
+#   -result-dir DIR    Output directory (default: /workspace/)
+#
+# Examples:
+#   ./qwen3.5_bf16_mi355x_interactive.sh
+#   ./qwen3.5_bf16_mi355x_interactive.sh -tp 8 -sl 1024,1024 8192,1024 -conc 8 16 32
+#   ./qwen3.5_bf16_mi355x_interactive.sh -result-dir /workspace/results
+
+set -e
+
+# Defaults
+TP=8
+SL_LIST=("1024,1024" "8192,1024" "1024,8192")
+CONC_LIST=(8)
+RESULT_DIR="/workspace/"
+
+usage() {  # Print help to stdout and exit; $1 = exit status (default 1, i.e. a usage error)
+    echo "Usage: $0 [OPTIONS]"
+    echo ""
+    echo "Options:"
+    echo "  -tp N              Tensor parallel size (default: 8)"
+    echo "  -sl ISL,OSL ...    Space-separated isl,osl pairs (default: 1024,1024 8192,1024 1024,8192)"
+    echo "  -conc N N ...      Space-separated concurrency values (default: 8)"
+    echo "  -result-dir DIR   Output directory (default: /workspace/)"
+    echo ""
+    echo "Examples:"
+    echo "  $0"
+    echo "  $0 -tp 8 -sl 1024,1024 8192,1024 -conc 8 16 32"
+    echo "  $0 -result-dir /workspace/results"
+    exit "${1:-1}"
+}
+
+# Parse CLI args. -sl and -conc consume every following word up to the next word starting with "-".
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        -tp)  # guard first: a bare `shift 2` with one argument left fails and `set -e` exits silently
+            [[ $# -ge 2 ]] || { echo "Error: -tp requires a value" >&2; usage; }
+            TP="$2"; shift 2
+            ;;
+        -sl)
+            shift
+            SL_LIST=()
+            while [[ $# -gt 0 && ! "$1" =~ ^- ]]; do
+                SL_LIST+=("$1")
+                shift
+            done
+            [[ ${#SL_LIST[@]} -eq 0 ]] && SL_LIST=("1024,1024" "8192,1024" "1024,8192")  # no values given: keep defaults
+            ;;
+        -conc)
+            shift
+            CONC_LIST=()
+            while [[ $# -gt 0 && ! "$1" =~ ^- ]]; do
+                CONC_LIST+=("$1")
+                shift
+            done
+            [[ ${#CONC_LIST[@]} -eq 0 ]] && CONC_LIST=(8)  # no values given: keep default
+            ;;
+        -result-dir)  # same guard as -tp
+            [[ $# -ge 2 ]] || { echo "Error: -result-dir requires a value" >&2; usage; }
+            RESULT_DIR="$2"; shift 2
+            ;;
+        -h|--help)
+            usage 0  # explicitly requested help is not an error
+            ;;
+        *)
+            echo "Unknown option: $1" >&2
+            usage
+            ;;
+    esac
+done
+
+# Validate TP: positive decimal integer, no leading zeros (0 is rejected, matching the error text)
+if ! [[ "$TP" =~ ^[1-9][0-9]*$ ]]; then
+    echo "Error: -tp must be a positive integer, got: $TP" >&2
+    exit 1
+fi
+
+# Validate each sl pair (isl,osl)
+for pair in "${SL_LIST[@]}"; do
+    if ! [[ "$pair" =~ ^[0-9]+,[0-9]+$ ]]; then
+        echo "Error: -sl pair must be isl,osl (e.g. 1024,1024), got: $pair" >&2
+        exit 1
+    fi
+done
+
+# Validate each conc (no leading zeros: the later $((CONC * 10)) would treat "08" as invalid octal)
+for c in "${CONC_LIST[@]}"; do
+    if ! [[ "$c" =~ ^[1-9][0-9]*$ ]]; then
+        echo "Error: -conc values must be positive integers, got: $c" >&2
+        exit 1
+    fi
+done
+
+# Ensure result dir exists and ends with /
+[[ "${RESULT_DIR}" != */ ]] && RESULT_DIR="${RESULT_DIR}/"
+mkdir -p "$RESULT_DIR"
+
+# Set optional env vars
+export MODEL="${MODEL:-Qwen/Qwen3.5-397B-A17B}"
+export RANDOM_RANGE_RATIO="${RANDOM_RANGE_RATIO:-0.1}"
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars MODEL RANDOM_RANGE_RATIO
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+echo "Config: TP=$TP, SL=${SL_LIST[*]}, CONC=${CONC_LIST[*]}, RESULT_DIR=$RESULT_DIR"
+
+hf download "$MODEL"
+
+SERVER_LOG=/workspace/server.log
+PORT=${PORT:-8888}
+
+# Start GPU monitoring
+start_gpu_monitor
+
+python3 -m sglang.launch_server \
+    --attention-backend triton \
+    --model-path $MODEL \
+    --host=0.0.0.0 \
+    --port $PORT \
+    --tensor-parallel-size $TP \
+    --trust-remote-code \
+    --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 &
+
+SERVER_PID=$!
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+# Loop over (isl,osl) and conc
+for sl_pair in "${SL_LIST[@]}"; do
+    IFS=',' read -r ISL OSL <<< "$sl_pair"
+    for CONC in "${CONC_LIST[@]}"; do
+        RESULT_FILENAME="result_TP${TP}_CONC${CONC}_ISL${ISL}_OSL${OSL}.json"
+        echo "Running: ISL=$ISL OSL=$OSL CONC=$CONC -> $RESULT_FILENAME"
+
+        run_benchmark_serving \
+            --model "$MODEL" \
+            --port "$PORT" \
+            --backend vllm \
+            --input-len "$ISL" \
+            --output-len "$OSL" \
+            --random-range-ratio "$RANDOM_RANGE_RATIO" \
+            --num-prompts "$((CONC * 10))" \
+            --max-concurrency "$CONC" \
+            --result-filename "$RESULT_FILENAME" \
+            --result-dir "$RESULT_DIR"
+
+        if [ "${RUN_EVAL}" = "true" ]; then
+            run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+            append_lm_eval_summary
+        fi
+    done
+done
+
+stop_gpu_monitor
+echo "Done. Results in $RESULT_DIR"
+echo ""
+echo "To summarize results into a markdown table, run (from repo root):"
+echo "  python3 utils/summarize_interactive_results.py $RESULT_DIR -o summary.md"
diff --git a/utils/summarize_interactive_results.py b/utils/summarize_interactive_results.py
new file mode 100644
index 000000000..bccd8bbe6
--- /dev/null
+++ b/utils/summarize_interactive_results.py
@@ -0,0 +1,318 @@
+#!/usr/bin/env python3
+"""
+Process raw benchmark result JSONs from the interactive script and summarize into a markdown table.
+
+Follows the same processing flow as benchmark-tmpl.yml and e2e-tests.yml:
+1. Process each result_*.json (raw benchmark_serving output) into agg format
+2. Collect all combinations
+3. Output markdown table via tabulate
+
+Usage:
+    python utils/summarize_interactive_results.py <result_dir> [options]
+
+Examples:
+    python utils/summarize_interactive_results.py /workspace/
+    python utils/summarize_interactive_results.py ./results -o summary.md
+    python utils/summarize_interactive_results.py ./results --hw mi355x --model-prefix qwen3.5
+"""
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+from typing import Any, Optional
+
+try:
+    from tabulate import tabulate
+except ImportError:
+    tabulate = None
+
+
+# Filename pattern (case-insensitive): result_TP{TP}_CONC{CONC}_ISL{ISL}_OSL{OSL}.json; groups 1-4 are TP, CONC, ISL, OSL
+RESULT_PATTERN = re.compile(
+    r"result_TP(\d+)_CONC(\d+)_ISL(\d+)_OSL(\d+)\.json", re.IGNORECASE
+)
+
+
+def parse_result_filename(path: Path) -> Optional[tuple[int, int, int, int]]:
+    """Parse a result_TP*_CONC*_ISL*_OSL*.json file name into (tp, conc, isl, osl); None when it does not match."""
+    hit = RESULT_PATTERN.match(path.name)
+    if hit is None:
+        return None
+    return int(hit[1]), int(hit[2]), int(hit[3]), int(hit[4])
+
+
+def process_raw_result(
+    raw: dict[str, Any],
+    tp: int,
+    conc: int,
+    isl: int,
+    osl: int,
+    *,
+    hw: str = "mi355x",
+    model_prefix: str = "qwen3.5",
+    framework: str = "sglang",
+    precision: str = "bf16",
+    image: str = "",
+) -> dict[str, Any]:
+    """
+    Convert raw benchmark_serving JSON to agg format (same schema as process_result.py output).
+    Missing, non-numeric, or zero median metrics are reported as 0.0 instead of raising.
+    """
+    model_id = raw.get("model_id", "unknown")
+    total_tput = float(raw.get("total_token_throughput", 0))
+    output_tput = float(raw.get("output_throughput", 0))
+    max_conc = int(raw.get("max_concurrency") or conc)  # null/absent -> CONC from the filename
+
+    data: dict[str, Any] = {
+        "hw": hw,
+        "conc": max_conc,
+        "image": image,
+        "model": model_id,
+        "infmax_model_prefix": model_prefix,
+        "framework": framework,
+        "precision": precision,
+        "spec_decoding": "none",
+        "disagg": False,
+        "isl": isl,
+        "osl": osl,
+        "is_multinode": False,
+        "tp": tp,
+        "ep": 1,
+        "dp_attention": False,
+        "tput_per_gpu": total_tput / tp,
+        "output_tput_per_gpu": output_tput / tp,
+        "input_tput_per_gpu": (total_tput - output_tput) / tp,
+    }
+
+    # Convert *_ms to seconds (process_result logic); strip only the trailing "_ms"
+    for key, value in raw.items():
+        base = key.removesuffix("_ms")
+        if base != key and isinstance(value, (int, float)):
+            data[base] = float(value) / 1000.0
+        if "tpot" in key and isinstance(value, (int, float)) and value > 0:
+            data[base.replace("tpot", "intvty")] = 1000.0 / float(value)
+
+    # Required summary fields: coerce leniently; unusable or zero values become 0.0.
+    def _as_float(value: Any) -> float:
+        try:
+            return float(value)
+        except (TypeError, ValueError):
+            return 0.0
+
+    for name in ("ttft", "tpot", "e2el"):
+        data.setdefault(f"median_{name}", _as_float(raw.get(f"median_{name}_ms")) / 1000.0)
+    tpot_ms = _as_float(raw.get("median_tpot_ms"))
+    data.setdefault("median_intvty", 1000.0 / tpot_ms if tpot_ms > 0 else 0.0)
+
+    return data
+
+
+def load_and_process(
+    result_dir: Path,
+    *,
+    hw: str = "mi355x",
+    model_prefix: str = "qwen3.5",
+    framework: str = "sglang",
+    precision: str = "bf16",
+    image: str = "",
+) -> list[dict[str, Any]]:
+    """Recursively load result_TP*_CONC*_ISL*_OSL*.json under result_dir; bad files are skipped with a warning on stderr."""
+    agg_results: list[dict[str, Any]] = []
+
+    for path in sorted(result_dir.rglob("*.json")):
+        parsed = parse_result_filename(path)
+        if not parsed:
+            continue
+
+        tp, conc, isl, osl = parsed
+        try:
+            with open(path, encoding="utf-8") as f:
+                raw = json.load(f)
+        except (ValueError, OSError) as e:  # ValueError covers JSONDecodeError and UnicodeDecodeError
+            print(f"Warning: Skipping {path}: {e}", file=sys.stderr)
+            continue
+
+        if not isinstance(raw, dict) or "total_token_throughput" not in raw or "model_id" not in raw:
+            print(f"Warning: Skipping {path}: missing required fields", file=sys.stderr)
+            continue
+
+        # One malformed file (non-numeric metric, TP0 in the filename, ...) must
+        # not abort the whole summary: report it and keep going with the rest.
+        try:
+            agg = process_raw_result(
+                raw, tp, conc, isl, osl,
+                hw=hw, model_prefix=model_prefix, framework=framework,
+                precision=precision, image=image,
+            )
+        except (TypeError, ValueError, ZeroDivisionError) as e:
+            print(f"Warning: Skipping {path}: {e}", file=sys.stderr)
+            continue
+        agg_results.append(agg)
+
+    return agg_results
+
+
+def to_markdown_table(results: list[dict[str, Any]]) -> str:
+    """Sort results in place and render them as a markdown table (same columns as summarize.py single-node)."""
+    if not results:
+        return "No results to display.\n"
+
+    results.sort(
+        key=lambda r: (
+            r["infmax_model_prefix"],
+            r["hw"],
+            r["framework"],
+            r["precision"],
+            r["isl"],
+            r["osl"],
+            r["tp"],
+            r["ep"],
+            r["conc"],
+        )
+    )
+
+    headers = [
+        "Model",
+        "Served Model",
+        "Hardware",
+        "Framework",
+        "Precision",
+        "ISL",
+        "OSL",
+        "TP",
+        "EP",
+        "DP Attention",
+        "Conc",
+        "TTFT (ms)",
+        "TPOT (ms)",
+        "Interactivity (tok/s/user)",
+        "E2EL (s)",
+        "TPUT per GPU",
+        "Output TPUT per GPU",
+        "Input TPUT per GPU",
+    ]
+
+    rows = [
+        [
+            r["infmax_model_prefix"],
+            r["model"],
+            r["hw"].upper(),
+            r["framework"].upper(),
+            r["precision"].upper(),
+            r["isl"],
+            r["osl"],
+            r["tp"],
+            r["ep"],
+            r["dp_attention"],
+            r["conc"],
+            f"{r['median_ttft'] * 1000:.4f}",
+            f"{r['median_tpot'] * 1000:.4f}",
+            f"{r['median_intvty']:.4f}",
+            f"{r['median_e2el']:.4f}",
+            f"{r['tput_per_gpu']:.4f}",
+            f"{r['output_tput_per_gpu']:.4f}",
+            f"{r['input_tput_per_gpu']:.4f}",
+        ]
+        for r in results
+    ]
+
+    if tabulate:  # disable_numparse: keep the ".4f" strings verbatim instead of letting tabulate re-render them
+        return tabulate(rows, headers=headers, tablefmt="github", disable_numparse=True)
+    # Fallback: simple markdown table
+    sep = "|" + "|".join(["---"] * len(headers)) + "|"
+    header_row = "|" + "|".join(headers) + "|"
+    data_rows = "\n".join("|" + "|".join(str(c) for c in row) + "|" for row in rows)
+    return f"{header_row}\n{sep}\n{data_rows}"
+
+
+def main() -> None:  # CLI entry point: parse args, summarize the result JSONs, emit markdown (plus optional agg JSON)
+    parser = argparse.ArgumentParser(
+        description="Process interactive benchmark results and output markdown table"
+    )
+    parser.add_argument(
+        "result_dir",
+        type=Path,
+        help="Directory containing result_TP*_CONC*_ISL*_OSL*.json files",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        type=Path,
+        default=None,
+        help="Write markdown to file (default: stdout)",
+    )
+    parser.add_argument(
+        "--hw",
+        type=str,
+        default="mi355x",
+        help="Hardware label (default: mi355x)",
+    )
+    parser.add_argument(
+        "--model-prefix",
+        type=str,
+        default="qwen3.5",
+        help="Model prefix (default: qwen3.5)",
+    )
+    parser.add_argument(
+        "--framework",
+        type=str,
+        default="sglang",
+        help="Framework (default: sglang)",
+    )
+    parser.add_argument(
+        "--precision",
+        type=str,
+        default="bf16",
+        help="Precision (default: bf16)",
+    )
+    parser.add_argument(
+        "--image",
+        type=str,
+        default="",
+        help="Docker image (optional)",
+    )
+    parser.add_argument(
+        "--agg-json",
+        type=Path,
+        default=None,
+        help="Also write aggregated JSON to file (for collect_results compatibility)",
+    )
+
+    args = parser.parse_args()
+
+    if not args.result_dir.is_dir():  # also rejects a path that exists but is a regular file
+        print(f"Error: {args.result_dir} is not a directory", file=sys.stderr)
+        sys.exit(1)
+
+    results = load_and_process(
+        args.result_dir,
+        hw=args.hw,
+        model_prefix=args.model_prefix,
+        framework=args.framework,
+        precision=args.precision,
+        image=args.image,
+    )
+
+    if not results:  # no file name matched, or every matching file was skipped with a warning
+        print("No result files found matching result_TP*_CONC*_ISL*_OSL*.json", file=sys.stderr)
+        sys.exit(1)
+
+    md = to_markdown_table(results)
+    full_output = f"## Interactive Benchmark Results\n\n{md}\n"
+
+    if args.output:
+        args.output.write_text(full_output)
+        print(f"Wrote summary to {args.output}", file=sys.stderr)  # status on stderr, not stdout
+    else:
+        print(full_output)
+
+    if args.agg_json:  # dumped after to_markdown_table(), which sorts `results` in place
+        with open(args.agg_json, "w") as f:
+            json.dump(results, f, indent=2)
+        print(f"Wrote aggregated JSON to {args.agg_json}", file=sys.stderr)
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()

From eb9298a51887c947f74068a66f2a8828036d09f0 Mon Sep 17 00:00:00 2001
From: "Chen, Todd" <zhenchen@amd.com>
Date: Fri, 20 Mar 2026 03:35:05 -0500
Subject: [PATCH 09/11] Update qwen3.5 benchmark script for improved
 configuration and logging

- Changed RESULT_DIR and SERVER_LOG paths to a user-specific directory for better organization.
- Added new parameters for memory fraction, chunked prefill size, max prefill tokens, and context length to enhance server launch configuration.
- Introduced dynamic SCHEDULER_RECV_INTERVAL based on concurrency level to optimize performance during benchmarking.
---
 .../qwen3.5_bf16_mi355x_interactive.sh        | 32 ++++++++++++++++---
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x_interactive.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x_interactive.sh
index f0036d1db..eb1e6cac1 100755
--- a/benchmarks/single_node/qwen3.5_bf16_mi355x_interactive.sh
+++ b/benchmarks/single_node/qwen3.5_bf16_mi355x_interactive.sh
@@ -23,7 +23,7 @@ set -e
 TP=8
 SL_LIST=("1024,1024" "8192,1024" "1024,8192")
 CONC_LIST=(8)
-RESULT_DIR="/workspace/"
+RESULT_DIR="/home/zhenchen/projects/InferenceX/"
 
 usage() {
     echo "Usage: $0 [OPTIONS]"
@@ -37,7 +37,7 @@ usage() {
     echo "Examples:"
     echo "  $0"
     echo "  $0 -tp 8 -sl 1024,1024 8192,1024 -conc 8 16 32"
-    echo "  $0 -result-dir /workspace/results"
+    echo "  $0 -result-dir /home/zhenchen/projects/InferenceX/results"
     exit 1
 }
 
@@ -122,20 +122,36 @@ echo "Config: TP=$TP, SL=${SL_LIST[*]}, CONC=${CONC_LIST[*]}, RESULT_DIR=$RESULT
 
 hf download "$MODEL"
 
-SERVER_LOG=/workspace/server.log
+SERVER_LOG=/home/zhenchen/projects/InferenceX/server.log
 PORT=${PORT:-8888}
+MEM_FRAC_STATIC=0.82
+CHUNKED_PREFILL_SIZE=32768 MAX_PREFILL_TOKENS=32768
+MAX_RUNNING_REQUESTS=128
+CUDA_GRAPH_MAX_BATCH_SIZE=$(printf '%s\n' "${CONC_LIST[@]}" | sort -n | tail -n 1)  # one server serves every CONC: size for the largest
+SCHEDULER_RECV_INTERVAL=$(( CUDA_GRAPH_MAX_BATCH_SIZE >= 16 ? 30 : 10 ))  # launch flag, so it must be set before the server starts
+CONTEXT_LENGTH=$(for sl in "${SL_LIST[@]}"; do echo $(( ${sl%,*} + ${sl#*,} + 20 )); done | sort -n | tail -n 1)  # longest ISL+OSL pair, +20 margin
 
 # Start GPU monitoring
 start_gpu_monitor
 
-python3 -m sglang.launch_server \
+set -x
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \
     --attention-backend triton \
     --model-path $MODEL \
     --host=0.0.0.0 \
     --port $PORT \
     --tensor-parallel-size $TP \
     --trust-remote-code \
-    --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 &
+    --mem-fraction-static $MEM_FRAC_STATIC \
+    --chunked-prefill-size $CHUNKED_PREFILL_SIZE \
+    --max-prefill-tokens $MAX_PREFILL_TOKENS \
+    --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE \
+    --max-running-requests $MAX_RUNNING_REQUESTS \
+    --enable-aiter-allreduce-fusion \
+    --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
+    --tokenizer-worker-num 6 \
+    --stream-interval 30 \
+    --context-length $CONTEXT_LENGTH > "$SERVER_LOG" 2>&1 &
 
 SERVER_PID=$!
 
@@ -148,6 +164,12 @@ for sl_pair in "${SL_LIST[@]}"; do
         RESULT_FILENAME="result_TP${TP}_CONC${CONC}_ISL${ISL}_OSL${OSL}.json"
         echo "Running: ISL=$ISL OSL=$OSL CONC=$CONC -> $RESULT_FILENAME"
 
+        # NOTE: --scheduler-recv-interval, --cuda-graph-max-bs and
+        # --context-length are server launch flags. They are derived from
+        # CONC_LIST / SL_LIST before the server starts, because assigning
+        # them here, after launch, would have no effect on the server that
+        # is already running.
+
         run_benchmark_serving \
             --model "$MODEL" \
             --port "$PORT" \

From 7a864e746cf05671c587863d3dfc39d1e399fa7e Mon Sep 17 00:00:00 2001
From: "Chen, Todd" <zhenchen@amd.com>
Date: Fri, 20 Mar 2026 11:25:51 -0500
Subject: [PATCH 10/11] Enhance qwen3.5 benchmark script with additional
 configuration options

- Introduced new parameters for memory fraction, chunked prefill size, max prefill tokens, and context length to improve server launch settings.
- Implemented dynamic SCHEDULER_RECV_INTERVAL based on concurrency level to optimize request handling.
- Updated the launch command to include new flags for enhanced functionality and performance during benchmarking.
---
 benchmarks/single_node/qwen3.5_bf16_mi355x.sh | 28 ++++++++++++++--
 benchmarks/single_node/qwen3.5_fp8_mi355x.sh  | 32 ++++++++++++++++---
 2 files changed, 53 insertions(+), 7 deletions(-)

diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh
index 87df79324..daa9b1f37 100755
--- a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh
+++ b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh
@@ -19,6 +19,19 @@ hf download "$MODEL"
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+MEM_FRAC_STATIC=0.82
+CHUNKED_PREFILL_SIZE=32768
+MAX_PREFILL_TOKENS=32768
+CUDA_GRAPH_MAX_BATCH_SIZE=$CONC
+MAX_RUNNING_REQUESTS=128
+CONTEXT_LENGTH=$((ISL + OSL + 20))
+
+# Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls.
+if [[ $CONC -ge 16 ]]; then
+  SCHEDULER_RECV_INTERVAL=30
+else
+  SCHEDULER_RECV_INTERVAL=10
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
@@ -26,15 +39,24 @@ start_gpu_monitor
 export SGLANG_FUSED_QK_NORM_ROPE_CACHE_PTS_QUANT_SHUFFLE=1
 
 cd /sgl-workspace/sglang
-python3 -m sglang.launch_server \
+set -x
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \
     --attention-backend triton \
     --model-path $MODEL \
     --host=0.0.0.0 \
     --port $PORT \
     --tensor-parallel-size $TP \
     --trust-remote-code \
-    --enable-fused-qk-norm-rope \
-    --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 &
+    --mem-fraction-static $MEM_FRAC_STATIC \
+    --chunked-prefill-size $CHUNKED_PREFILL_SIZE \
+    --max-prefill-tokens $MAX_PREFILL_TOKENS \
+    --cuda-graph-max-batch-size $CUDA_GRAPH_MAX_BATCH_SIZE \
+    --max-running-requests $MAX_RUNNING_REQUESTS \
+    --enable-aiter-allreduce-fusion \
+    --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
+    --tokenizer-worker-num 6 \
+    --stream-interval 30 \
+    --context-length $CONTEXT_LENGTH > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 cd -
diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh
index 1ff52e1cc..b6258a67c 100644
--- a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh
@@ -19,21 +19,45 @@ hf download "$MODEL"
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
+MEM_FRAC_STATIC=0.82
+CHUNKED_PREFILL_SIZE=32768
+MAX_PREFILL_TOKENS=32768
+CUDA_GRAPH_MAX_BATCH_SIZE=$CONC
+MAX_RUNNING_REQUESTS=128
+CONTEXT_LENGTH=$((ISL + OSL + 20))
 
-export SGLANG_FUSED_QK_NORM_ROPE_CACHE_PTS_QUANT_SHUFFLE=1
+# Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls.
+if [[ $CONC -ge 16 ]]; then
+  SCHEDULER_RECV_INTERVAL=30
+else
+  SCHEDULER_RECV_INTERVAL=10
+fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
-python3 -m sglang.launch_server \
+export SGLANG_FUSED_QK_NORM_ROPE_CACHE_PTS_QUANT_SHUFFLE=1
+
+cd /sgl-workspace/sglang
+set -x
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \
     --attention-backend triton \
     --model-path $MODEL \
     --host=0.0.0.0 \
     --port $PORT \
     --tensor-parallel-size $TP \
     --trust-remote-code \
-    --enable-fused-qk-norm-rope \
-    --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 &
+    --mem-fraction-static $MEM_FRAC_STATIC \
+    --chunked-prefill-size $CHUNKED_PREFILL_SIZE \
+    --max-prefill-tokens $MAX_PREFILL_TOKENS \
+    --cuda-graph-max-batch-size $CUDA_GRAPH_MAX_BATCH_SIZE \
+    --max-running-requests $MAX_RUNNING_REQUESTS \
+    --enable-aiter-allreduce-fusion \
+    --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
+    --tokenizer-worker-num 6 \
+    --stream-interval 30 \
+    --kv-cache-dtype fp8_e4m3 \
+    --context-length $CONTEXT_LENGTH > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 

From 6407c08533544a05b8e74fc1c986c15c2f335e7a Mon Sep 17 00:00:00 2001
From: "Chen, Todd" <zhenchen@amd.com>
Date: Fri, 20 Mar 2026 11:30:19 -0500
Subject: [PATCH 11/11] Update qwen3.5 benchmark scripts to use shortened flag
 for CUDA graph batch size

- Changed the flag from --cuda-graph-max-batch-size to --cuda-graph-max-bs for consistency and brevity in both bf16 and fp8 benchmark scripts.
---
 benchmarks/single_node/qwen3.5_bf16_mi355x.sh | 2 +-
 benchmarks/single_node/qwen3.5_fp8_mi355x.sh  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh
index daa9b1f37..d98818839 100755
--- a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh
+++ b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh
@@ -50,7 +50,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \
     --mem-fraction-static $MEM_FRAC_STATIC \
     --chunked-prefill-size $CHUNKED_PREFILL_SIZE \
     --max-prefill-tokens $MAX_PREFILL_TOKENS \
-    --cuda-graph-max-batch-size $CUDA_GRAPH_MAX_BATCH_SIZE \
+    --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE \
     --max-running-requests $MAX_RUNNING_REQUESTS \
     --enable-aiter-allreduce-fusion \
     --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh
index b6258a67c..44c38b35d 100644
--- a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh
@@ -50,7 +50,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \
     --mem-fraction-static $MEM_FRAC_STATIC \
     --chunked-prefill-size $CHUNKED_PREFILL_SIZE \
     --max-prefill-tokens $MAX_PREFILL_TOKENS \
-    --cuda-graph-max-batch-size $CUDA_GRAPH_MAX_BATCH_SIZE \
+    --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE \
     --max-running-requests $MAX_RUNNING_REQUESTS \
     --enable-aiter-allreduce-fusion \
     --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \