diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 5551860f2..895396b55 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -140,7 +140,7 @@ dsr1-fp8-mi355x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-bf16-mi355x-sglang: - image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260215 + image: rocm/sgl-dev:v0.5.9-rocm720-mi35x-20260227 model: Qwen/Qwen3.5-397B-A17B model-prefix: qwen3.5 runner: mi355x @@ -228,7 +228,7 @@ qwen3.5-fp8-mi325x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-fp8-mi355x-sglang: - image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218 + image: rocm/sgl-dev:v0.5.9-rocm720-mi35x-20260227 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi355x diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index c1c74ffe3..e55de11d0 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -62,6 +62,22 @@ on: description: "Git ref (branch/sha) to checkout" required: false type: string + aiter-remote: + description: "Repo URL to reinstall aiter from" + required: false + type: string + aiter-ref: + description: "Branch or commit for aiter" + required: false + type: string + sglang-remote: + description: "Repo URL to reinstall sglang from" + required: false + type: string + sglang-ref: + description: "Branch or commit for sglang" + required: false + type: string env: RANDOM_RANGE_RATIO: 0.8 @@ -83,6 +99,10 @@ env: SPEC_DECODING: ${{ inputs.spec-decoding }} DISAGG: ${{ inputs.disagg }} RUN_EVAL: ${{ inputs.run-eval }} + AITER_REMOTE: ${{ inputs.aiter-remote }} + AITER_REF: ${{ inputs.aiter-ref }} + SGLANG_REMOTE: ${{ inputs.sglang-remote }} + SGLANG_REF: ${{ inputs.sglang-ref }} permissions: contents: read diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 0f26c7c41..dee5ab153 100644 --- a/.github/workflows/e2e-tests.yml +++ 
b/.github/workflows/e2e-tests.yml @@ -16,6 +16,22 @@ on: description: "Ref (branch/sha) to checkout for generating configs" required: false type: string + aiter-repo: + description: "Repo URL to reinstall aiter from (e.g. https://github.com/user/aiter.git)" + required: false + type: string + aiter-ref: + description: "Branch or commit for aiter" + required: false + type: string + sglang-repo: + description: "Repo URL to reinstall sglang from (e.g. https://github.com/user/sglang)" + required: false + type: string + sglang-ref: + description: "Branch or commit for sglang" + required: false + type: string workflow_call: inputs: generate-cli-command: @@ -30,6 +46,22 @@ on: description: "Ref (branch/sha) to checkout for generating configs" required: false type: string + aiter-repo: + description: "Repo URL to reinstall aiter from" + required: false + type: string + aiter-ref: + description: "Branch or commit for aiter" + required: false + type: string + sglang-repo: + description: "Repo URL to reinstall sglang from" + required: false + type: string + sglang-ref: + description: "Branch or commit for sglang" + required: false + type: string jobs: get-jobs: @@ -124,7 +156,11 @@ jobs: spec-decoding: ${{ matrix.config.spec-decoding }} disagg: ${{ matrix.config.disagg }} run-eval: ${{ matrix.config.run-eval }} - ref: ${{ inputs.ref }} + ref: ${{ inputs.ref || github.event.inputs.ref }} + aiter-remote: ${{ inputs.aiter-repo || github.event.inputs['aiter-repo'] }} + aiter-ref: ${{ inputs.aiter-ref || github.event.inputs['aiter-ref'] }} + sglang-remote: ${{ inputs.sglang-repo || github.event.inputs['sglang-repo'] }} + sglang-ref: ${{ inputs.sglang-ref || github.event.inputs['sglang-ref'] }} collect-results: needs: [test-sweep-multi-node, test-sweep-single-node] diff --git a/benchmarks/single_node/patch_sgl_components.sh b/benchmarks/single_node/patch_sgl_components.sh new file mode 100644 index 000000000..fcd6b6e61 --- /dev/null +++ 
b/benchmarks/single_node/patch_sgl_components.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +# Reinstall aiter and sglang from custom remote/ref. +# Expects AITER_REMOTE, AITER_REF, SGLANG_REMOTE, SGLANG_REF from env. +# If no refs are provided, skip entirely and use default packages in container. + +set -e + +if [[ -z "$AITER_REF" && -z "$SGLANG_REF" ]]; then + echo "No patch refs provided; using default packages in container." + exit 0 +fi + +work_dir="/sgl-workspace" +aiter_remote="${AITER_REMOTE:-https://github.com/zhentaocc/aiter.git}" +aiter_ref="${AITER_REF:-mi35_qwen35_image}" +sglang_remote="${SGLANG_REMOTE:-https://github.com/sgl-project/sglang}" +sglang_ref="${SGLANG_REF:-mi35_qwen35_image}" + +if [[ ! -d "$work_dir" ]]; then + echo "$work_dir not found; assuming image ships correct versions." + exit 0 +fi + +if [[ -n "$AITER_REF" ]]; then + pip uninstall amd-aiter -y + cd "$work_dir/aiter" + git remote set-url origin "$aiter_remote" 2>/dev/null || git remote add origin "$aiter_remote" + git fetch origin "$aiter_ref" 2>/dev/null || git fetch origin + git checkout "$aiter_ref" 2>/dev/null || git reset --hard "origin/$aiter_ref" 2>/dev/null || git reset --hard "$aiter_ref" + rm -rf aiter/jit/*.so 2>/dev/null || true + rm -rf aiter/jit/build 2>/dev/null || true + rm -rf aiter/jit/dist 2>/dev/null || true + git submodule update --init --force 3rdparty/composable_kernel 2>/dev/null || true + PREBUILD_KERNELS=0 python setup.py develop + echo "aiter ($aiter_ref) installed from $aiter_remote" +else + echo "AITER_REF not set; using default aiter in container" +fi + +if [[ -n "$SGLANG_REF" ]] && [[ -d "$work_dir/sglang/.git" ]]; then + cd "$work_dir/sglang" + git remote set-url origin "$sglang_remote" 2>/dev/null || git remote add origin "$sglang_remote" + git fetch origin "$sglang_ref" 2>/dev/null || git fetch origin + git checkout "$sglang_ref" 2>/dev/null || git reset --hard "origin/$sglang_ref" 2>/dev/null || git reset --hard "$sglang_ref" + echo "sglang 
($sglang_ref) from $sglang_remote" +elif [[ -z "$SGLANG_REF" ]]; then + echo "SGLANG_REF not set; using default sglang in container" +fi diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh index f77390707..d98818839 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh @@ -19,21 +19,47 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +MEM_FRAC_STATIC=0.82 +CHUNKED_PREFILL_SIZE=32768 +MAX_PREFILL_TOKENS=32768 +CUDA_GRAPH_MAX_BATCH_SIZE=$CONC +MAX_RUNNING_REQUESTS=128 +CONTEXT_LENGTH=$((ISL + OSL + 20)) + +# Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls. +if [[ $CONC -ge 16 ]]; then + SCHEDULER_RECV_INTERVAL=30 +else + SCHEDULER_RECV_INTERVAL=10 +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor -python3 -m sglang.launch_server \ +export SGLANG_FUSED_QK_NORM_ROPE_CACHE_PTS_QUANT_SHUFFLE=1 + +cd /sgl-workspace/sglang +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ --attention-backend triton \ --model-path $MODEL \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 & + --mem-fraction-static $MEM_FRAC_STATIC \ + --chunked-prefill-size $CHUNKED_PREFILL_SIZE \ + --max-prefill-tokens $MAX_PREFILL_TOKENS \ + --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE \ + --max-running-requests $MAX_RUNNING_REQUESTS \ + --enable-aiter-allreduce-fusion \ + --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ + --tokenizer-worker-num 6 \ + --stream-interval 30 \ + --context-length $CONTEXT_LENGTH > $SERVER_LOG 2>&1 & SERVER_PID=$! 
- +cd - # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x_interactive.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x_interactive.sh new file mode 100755 index 000000000..eb1e6cac1 --- /dev/null +++ b/benchmarks/single_node/qwen3.5_bf16_mi355x_interactive.sh @@ -0,0 +1,196 @@ +#!/usr/bin/env bash +# +# Interactive benchmark script for Qwen3.5-397B-A17B on MI355X. +# Runs inside Docker with CLI args for TP, sequence lengths, and concurrency. +# +# Usage: +# ./qwen3.5_bf16_mi355x_interactive.sh [OPTIONS] +# +# Options: +# -tp N Tensor parallel size (default: 8) +# -sl ISL,OSL ... Space-separated isl,osl pairs (default: 1024,1024 8192,1024 1024,8192) +# -conc N N ... Space-separated concurrency values (default: 8) +# -result-dir DIR Output directory (default: /workspace/) +# +# Examples: +# ./qwen3.5_bf16_mi355x_interactive.sh +# ./qwen3.5_bf16_mi355x_interactive.sh -tp 8 -sl 1024,1024 8192,1024 -conc 8 16 32 +# ./qwen3.5_bf16_mi355x_interactive.sh -result-dir /workspace/results + +set -e + +# Defaults +TP=8 +SL_LIST=("1024,1024" "8192,1024" "1024,8192") +CONC_LIST=(8) +RESULT_DIR="/workspace/" + +usage() { + echo "Usage: $0 [OPTIONS]" + echo "" + echo "Options:" + echo " -tp N Tensor parallel size (default: 8)" + echo " -sl ISL,OSL ... Space-separated isl,osl pairs (default: 1024,1024 8192,1024 1024,8192)" + echo " -conc N N ... Space-separated concurrency values (default: 8)" + echo " -result-dir DIR Output directory (default: /workspace/)" + echo "" + echo "Examples:" + echo " $0" + echo " $0 -tp 8 -sl 1024,1024 8192,1024 -conc 8 16 32" + echo " $0 -result-dir /workspace/results" + exit 1 +} + +# Parse CLI args +while [[ $# -gt 0 ]]; do + case $1 in + -tp) + TP="$2" + shift 2 + ;; + -sl) + shift + SL_LIST=() + while [[ $# -gt 0 && ! 
"$1" =~ ^- ]]; do + SL_LIST+=("$1") + shift + done + [[ ${#SL_LIST[@]} -eq 0 ]] && SL_LIST=("1024,1024" "8192,1024" "1024,8192") + ;; + -conc) + shift + CONC_LIST=() + while [[ $# -gt 0 && ! "$1" =~ ^- ]]; do + CONC_LIST+=("$1") + shift + done + [[ ${#CONC_LIST[@]} -eq 0 ]] && CONC_LIST=(8) + ;; + -result-dir) + RESULT_DIR="$2" + shift 2 + ;; + -h|--help) + usage + ;; + *) + echo "Unknown option: $1" + usage + ;; + esac +done + +# Validate TP +if ! [[ "$TP" =~ ^[0-9]+$ ]]; then + echo "Error: -tp must be a positive integer, got: $TP" + exit 1 +fi + +# Validate each sl pair (isl,osl) +for pair in "${SL_LIST[@]}"; do + if ! [[ "$pair" =~ ^[0-9]+,[0-9]+$ ]]; then + echo "Error: -sl pair must be isl,osl (e.g. 1024,1024), got: $pair" + exit 1 + fi +done + +# Validate each conc +for c in "${CONC_LIST[@]}"; do + if ! [[ "$c" =~ ^[0-9]+$ ]]; then + echo "Error: -conc values must be positive integers, got: $c" + exit 1 + fi +done + +# Ensure result dir exists and ends with / +[[ "${RESULT_DIR}" != */ ]] && RESULT_DIR="${RESULT_DIR}/" +mkdir -p "$RESULT_DIR" + +# Set optional env vars +export MODEL="${MODEL:-Qwen/Qwen3.5-397B-A17B}" +export RANDOM_RANGE_RATIO="${RANDOM_RANGE_RATIO:-0.1}" + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars MODEL RANDOM_RANGE_RATIO + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +echo "Config: TP=$TP, SL=${SL_LIST[*]}, CONC=${CONC_LIST[*]}, RESULT_DIR=$RESULT_DIR" + +hf download "$MODEL" + +# Keep the server log next to the results (no hard-coded developer path) +SERVER_LOG="${RESULT_DIR}server.log" +PORT=${PORT:-8888} +MEM_FRAC_STATIC=0.82 +CHUNKED_PREFILL_SIZE=32768 +MAX_PREFILL_TOKENS=32768 +MAX_RUNNING_REQUESTS=128 + +# CONC/ISL/OSL are loop variables set only later; size the single shared +# server for the largest values that will be requested in the sweep. +MAX_CONC=0 +for c in "${CONC_LIST[@]}"; do + if [[ $c -gt $MAX_CONC ]]; then MAX_CONC=$c; fi +done +CUDA_GRAPH_MAX_BATCH_SIZE=$MAX_CONC +MAX_SEQ=0 +for pair in "${SL_LIST[@]}"; do + IFS=',' read -r _isl _osl <<< "$pair" + if [[ $((_isl + _osl)) -gt $MAX_SEQ ]]; then MAX_SEQ=$((_isl + _osl)); fi +done +CONTEXT_LENGTH=$((MAX_SEQ + 20)) + +# Match the non-interactive scripts: relax scheduler recv polling at high concurrency +if [[ $MAX_CONC -ge 16 ]]; then + SCHEDULER_RECV_INTERVAL=30 +else + SCHEDULER_RECV_INTERVAL=10 +fi + +# Start GPU monitoring +start_gpu_monitor + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ + --attention-backend triton \ + --model-path $MODEL \ + --host=0.0.0.0 \ + --port $PORT \ + --tensor-parallel-size $TP \ + 
--trust-remote-code \ + --mem-fraction-static $MEM_FRAC_STATIC \ + --chunked-prefill-size $CHUNKED_PREFILL_SIZE \ + --max-prefill-tokens $MAX_PREFILL_TOKENS \ + --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE \ + --max-running-requests $MAX_RUNNING_REQUESTS \ + --enable-aiter-allreduce-fusion \ + --scheduler-recv-interval ${SCHEDULER_RECV_INTERVAL:-10} \ + --tokenizer-worker-num 6 \ + --stream-interval 30 \ + --context-length $CONTEXT_LENGTH > $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# Loop over (isl,osl) and conc +for sl_pair in "${SL_LIST[@]}"; do + IFS=',' read -r ISL OSL <<< "$sl_pair" + for CONC in "${CONC_LIST[@]}"; do + RESULT_FILENAME="result_TP${TP}_CONC${CONC}_ISL${ISL}_OSL${OSL}.json" + echo "Running: ISL=$ISL OSL=$OSL CONC=$CONC -> $RESULT_FILENAME" + + run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir "$RESULT_DIR" + + if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary + fi + done +done + +stop_gpu_monitor +echo "Done. 
Results in $RESULT_DIR" +echo "" +echo "To summarize results into a markdown table, run (from repo root):" +echo " python3 utils/summarize_interactive_results.py $RESULT_DIR -o summary.md" diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh index f77390707..44c38b35d 100644 --- a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh @@ -19,18 +19,45 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +MEM_FRAC_STATIC=0.82 +CHUNKED_PREFILL_SIZE=32768 +MAX_PREFILL_TOKENS=32768 +CUDA_GRAPH_MAX_BATCH_SIZE=$CONC +MAX_RUNNING_REQUESTS=128 +CONTEXT_LENGTH=$((ISL + OSL + 20)) + +# Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls. +if [[ $CONC -ge 16 ]]; then + SCHEDULER_RECV_INTERVAL=30 +else + SCHEDULER_RECV_INTERVAL=10 +fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor -python3 -m sglang.launch_server \ +export SGLANG_FUSED_QK_NORM_ROPE_CACHE_PTS_QUANT_SHUFFLE=1 + +cd /sgl-workspace/sglang +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ --attention-backend triton \ --model-path $MODEL \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ - --mem-fraction-static 0.8 > $SERVER_LOG 2>&1 & + --mem-fraction-static $MEM_FRAC_STATIC \ + --chunked-prefill-size $CHUNKED_PREFILL_SIZE \ + --max-prefill-tokens $MAX_PREFILL_TOKENS \ + --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE \ + --max-running-requests $MAX_RUNNING_REQUESTS \ + --enable-aiter-allreduce-fusion \ + --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ + --tokenizer-worker-num 6 \ + --stream-interval 30 \ + --kv-cache-dtype fp8_e4m3 \ + --context-length $CONTEXT_LENGTH > $SERVER_LOG 2>&1 & SERVER_PID=$! 
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index f2471466b..73eb009dc 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -1,5 +1,16 @@ #!/usr/bin/env bash +# Sudo rm only for paths under workspace; guards against path injection / escaping +safe_sudo_rm() { + local target="$1" + local workspace="${2:-$GITHUB_WORKSPACE}" + if [[ -z "$workspace" || -z "$target" ]]; then return 0; fi + if [[ "$workspace" != /* ]]; then return 0; fi + if [[ "$target" != "$workspace"/* ]]; then return 0; fi + if [[ "$target" == *".."* ]]; then return 0; fi + sudo rm -rf "$target" 2>/dev/null || true +} + scancel_sync() { local jobid=$1 local timeout=${2:-600} @@ -46,10 +57,14 @@ if [[ "$IS_MULTINODE" == "true" ]]; then export ISL="$ISL" export OSL="$OSL" - # Logs go to BENCHMARK_LOGS_DIR (NFS-accessible, outside the repo tree) + # Logs go to BENCHMARK_LOGS_DIR (must be under workspace - no host modification outside) export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$GITHUB_WORKSPACE/benchmark_logs}" + if [[ -z "$GITHUB_WORKSPACE" || -z "$BENCHMARK_LOGS_DIR" ]] || [[ "$BENCHMARK_LOGS_DIR" != "$GITHUB_WORKSPACE"/* ]]; then + echo "ERROR: BENCHMARK_LOGS_DIR must be under GITHUB_WORKSPACE. 
Got BENCHMARK_LOGS_DIR=$BENCHMARK_LOGS_DIR" >&2 + exit 1 + fi mkdir -p "$BENCHMARK_LOGS_DIR" - sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true + safe_sudo_rm "$BENCHMARK_LOGS_DIR/logs" "$GITHUB_WORKSPACE" SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi355x_${FRAMEWORK}.sh" if [[ "$FRAMEWORK" == "sglang-disagg" ]]; then @@ -101,14 +116,16 @@ if [[ "$IS_MULTINODE" == "true" ]]; then # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement # Find the latest log directory that contains the data - cat > collect_latest_results.py <<'PY' + COLLECT_SCRIPT=$(mktemp) + trap "rm -f '$COLLECT_SCRIPT'" EXIT + cat > "$COLLECT_SCRIPT" <<'PY' import os, sys sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) for path in sorted([f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: print(path) PY - LOGS_DIR=$(python3 collect_latest_results.py "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1) + LOGS_DIR=$(python3 "$COLLECT_SCRIPT" "$BENCHMARK_LOGS_DIR" "$ISL" "$OSL" 1) if [ -z "$LOGS_DIR" ]; then echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" exit 1 @@ -117,15 +134,13 @@ PY echo "Found logs directory: $LOGS_DIR" ls -la "$LOGS_DIR" - # Result JSON are contained within the result directory - for result_file in $(find $LOGS_DIR -type f); do - # result_file should directly be isl_ISL_osl_OSL_concurrency_CONC_req_rate_R_gpus_N_ctx_M_gen_N.json - file_name=$(basename $result_file) - if [ -f $result_file ]; then - # Copy the result file to workspace with a unique name + # Result JSON are contained within the result directory (copy only into workspace) + for result_file in $(find "$LOGS_DIR" -type f); do + file_name=$(basename "$result_file") + if [[ -f "$result_file" && -n "$GITHUB_WORKSPACE" && "$GITHUB_WORKSPACE" == /* ]]; then 
WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}" echo "Found result file ${result_file}. Copying it to ${WORKSPACE_RESULT_FILE}" - cp $result_file $WORKSPACE_RESULT_FILE + cp "$result_file" "$WORKSPACE_RESULT_FILE" fi done @@ -136,10 +151,10 @@ PY set -x echo "Canceled the slurm job $JOB_ID" - sudo rm -rf "$BENCHMARK_LOGS_DIR/logs" 2>/dev/null || true + safe_sudo_rm "$BENCHMARK_LOGS_DIR/logs" "$GITHUB_WORKSPACE" - # Upload logs as artifact if running in GitHub Actions - if [[ -n "${GITHUB_ACTIONS:-}" ]]; then + # Upload logs as artifact if running in GitHub Actions (workspace only) + if [[ -n "${GITHUB_ACTIONS:-}" && -n "$GITHUB_WORKSPACE" && "$GITHUB_WORKSPACE" == /* ]]; then ARTIFACT_DIR="$GITHUB_WORKSPACE/benchmark_artifacts" mkdir -p "$ARTIFACT_DIR" cp -r "$BENCHMARK_LOGS_DIR"/slurm_job-${JOB_ID}.{out,err} "$ARTIFACT_DIR/" 2>/dev/null || true @@ -148,53 +163,101 @@ PY else - export HF_HUB_CACHE_MOUNT="/docker/huggingface/hub" + export HF_HUB_CACHE_MOUNT="${HF_HUB_CACHE_MOUNT:-/docker/huggingface/hub}" export PORT_OFFSET=${RUNNER_NAME: -1} export PORT=$(( 8888 + ${PORT_OFFSET} )) FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "atom" ]] && printf '_atom' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') + BENCHMARK_SCRIPT="benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi355x${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh" + WORKSPACE="${GITHUB_WORKSPACE:-$(pwd)}" + if [[ -z "$WORKSPACE" || "$WORKSPACE" != /* ]]; then + echo "ERROR: WORKSPACE must be an absolute path. 
Got: $WORKSPACE" >&2 + exit 1 + fi + mkdir -p "$HF_HUB_CACHE_MOUNT" + + if command -v salloc &>/dev/null && command -v srun &>/dev/null && command -v squeue &>/dev/null; then + # SLURM path: allocate via salloc, run via srun with enroot/squash container + PARTITION="compute" + SQUASH_FILE="/var/lib/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + LOCK_FILE="${SQUASH_FILE}.lock" + + set -x + salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=128 --time=180 --no-shell --job-name="$RUNNER_NAME" + JOB_ID=$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1) + if [[ -z "$JOB_ID" ]]; then + echo "ERROR: salloc failed or no job found for $RUNNER_NAME. Check partition=$PARTITION and GPU availability." >&2 + exit 1 + fi - PARTITION="compute" - SQUASH_FILE="/var/lib/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" - LOCK_FILE="${SQUASH_FILE}.lock" - - set -x - salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=128 --time=180 --no-shell --job-name="$RUNNER_NAME" - JOB_ID=$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1) - - # Remove leftover bmk-server from previous run so we can reuse the name (targeted cleanup only) - srun --jobid=$JOB_ID bash -c "docker rm -f bmk-server 2>/dev/null || true" - - # Use flock to serialize concurrent imports to the same squash file - srun --jobid=$JOB_ID bash -c " - exec 9>\"$LOCK_FILE\" - flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } - if [[ \"$FRAMEWORK\" == \"atom\" ]]; then - rm -f \"$SQUASH_FILE\" + srun --jobid=$JOB_ID bash -c "docker rm -f bmk-server 2>/dev/null || true" + + # Note: This block runs on the compute node and modifies /var/lib/squash (enroot cache). + # Only squash files under /var/lib/squash/ are touched - no user data or workspace. 
+ srun --jobid=$JOB_ID bash -c " + exec 9>\"$LOCK_FILE\" + flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } + if [[ \"$SQUASH_FILE\" != /var/lib/squash/* ]]; then exit 1; fi + if [[ \"$FRAMEWORK\" == \"atom\" ]]; then + rm -f \"$SQUASH_FILE\" + fi + if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then + echo 'Squash file already exists and is valid, skipping import' + else + rm -f \"$SQUASH_FILE\" + enroot import -o \"$SQUASH_FILE\" docker://$IMAGE + fi + " + + export VLLM_CACHE_ROOT="/it-share/gharunners/.cache/vllm" + + # Reinstall aiter/sglang in /sgl-workspace when AITER_REF or SGLANG_REF are set, then run benchmark + RUN_CMD="bash benchmarks/single_node/patch_sgl_components.sh && exec bash $BENCHMARK_SCRIPT" + srun --jobid=$JOB_ID \ + --container-image=$SQUASH_FILE \ + --container-mounts=$WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ + --container-mount-home \ + --container-writable \ + --container-workdir=/workspace/ \ + --no-container-entrypoint --export=ALL \ + bash -c "$RUN_CMD" + + scancel $JOB_ID + + # Remove gpucore temp files only within workspace (no host modification outside) + if ls "$WORKSPACE"/gpucore.* 1> /dev/null 2>&1; then + echo "gpucore files exist. not good" + for f in "$WORKSPACE"/gpucore.*; do + [[ -e "$f" ]] && safe_sudo_rm "$f" "$WORKSPACE" + done fi - if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then - echo 'Squash file already exists and is valid, skipping import' - else - rm -f \"$SQUASH_FILE\" - enroot import -o \"$SQUASH_FILE\" docker://$IMAGE + else + # Non-SLURM path: run directly with Docker (no salloc/srun) + if ! command -v docker &>/dev/null; then + echo "ERROR: Neither SLURM nor Docker found. Install SLURM (for cluster) or Docker (for standalone)." 
>&2 + exit 1 fi - " - - export VLLM_CACHE_ROOT="/it-share/gharunners/.cache/vllm" - - srun --jobid=$JOB_ID \ - --container-image=$SQUASH_FILE \ - --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ - --container-mount-home \ - --container-writable \ - --container-workdir=/workspace/ \ - --no-container-entrypoint --export=ALL \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi355x${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh - - scancel $JOB_ID - - if ls gpucore.* 1> /dev/null 2>&1; then - echo "gpucore files exist. not good" - rm -f gpucore.* + echo "SLURM not available; using Docker directly." + + server_name="bmk-server" + docker rm -f "$server_name" 2>/dev/null || true + + set -x + # Reinstall aiter/sglang in /sgl-workspace when AITER_REF or SGLANG_REF are set, then run benchmark + RUN_CMD="bash benchmarks/single_node/patch_sgl_components.sh && exec bash $BENCHMARK_SCRIPT" + docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ + --privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ + --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ + -v "$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE" \ + -v "$WORKSPACE:/workspace/" -w /workspace/ \ + -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e EP_SIZE -e DP_ATTENTION -e CONC \ + -e MAX_MODEL_LEN -e PORT=$PORT -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ + -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL -e RUNNER_TYPE \ + -e PROFILE -e SGLANG_TORCH_PROFILER_DIR -e VLLM_TORCH_PROFILER_DIR -e VLLM_RPC_TIMEOUT \ + -e SPEC_DECODING -e DISAGG \ + -e AITER_REMOTE -e AITER_REF -e SGLANG_REMOTE -e SGLANG_REF \ + --entrypoint=/bin/bash \ + "$IMAGE" \ + -c "$RUN_CMD" fi fi diff --git a/utils/summarize_interactive_results.py b/utils/summarize_interactive_results.py new file mode 100644 index 000000000..bccd8bbe6 --- /dev/null +++ b/utils/summarize_interactive_results.py @@ -0,0 +1,318 @@ +#!/usr/bin/env python3 +""" +Process raw 
benchmark result JSONs from the interactive script and summarize into a markdown table. + +Follows the same processing flow as benchmark-tmpl.yml and e2e-tests.yml: +1. Process each result_*.json (raw benchmark_serving output) into agg format +2. Collect all combinations +3. Output markdown table via tabulate + +Usage: + python utils/summarize_interactive_results.py [options] + +Examples: + python utils/summarize_interactive_results.py /workspace/ + python utils/summarize_interactive_results.py ./results -o summary.md + python utils/summarize_interactive_results.py ./results --hw mi355x --model-prefix qwen3.5 +""" +import argparse +import json +import re +import sys +from pathlib import Path +from typing import Any, Optional + +try: + from tabulate import tabulate +except ImportError: + tabulate = None + + +# Filename pattern: result_TP{TP}_CONC{CONC}_ISL{ISL}_OSL{OSL}.json +RESULT_PATTERN = re.compile( + r"result_TP(\d+)_CONC(\d+)_ISL(\d+)_OSL(\d+)\.json", re.IGNORECASE +) + + +def parse_result_filename(path: Path) -> Optional[tuple[int, int, int, int]]: + """Extract TP, CONC, ISL, OSL from filename. Returns (tp, conc, isl, osl) or None.""" + m = RESULT_PATTERN.match(path.name) + if m: + return int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(4)) + return None + + +def process_raw_result( + raw: dict[str, Any], + tp: int, + conc: int, + isl: int, + osl: int, + *, + hw: str = "mi355x", + model_prefix: str = "qwen3.5", + framework: str = "sglang", + precision: str = "bf16", + image: str = "", +) -> dict[str, Any]: + """ + Convert raw benchmark_serving JSON to agg format (same schema as process_result.py output). 
+ """ + model_id = raw.get("model_id", "unknown") + total_tput = float(raw.get("total_token_throughput", 0)) + output_tput = float(raw.get("output_throughput", 0)) + max_conc = int(raw.get("max_concurrency", conc)) + + data: dict[str, Any] = { + "hw": hw, + "conc": max_conc, + "image": image, + "model": model_id, + "infmax_model_prefix": model_prefix, + "framework": framework, + "precision": precision, + "spec_decoding": "none", + "disagg": False, + "isl": isl, + "osl": osl, + "is_multinode": False, + "tp": tp, + "ep": 1, + "dp_attention": False, + "tput_per_gpu": total_tput / tp, + "output_tput_per_gpu": output_tput / tp, + "input_tput_per_gpu": (total_tput - output_tput) / tp, + } + + # Convert *_ms to seconds (process_result logic) + for key, value in raw.items(): + if key.endswith("_ms") and isinstance(value, (int, float)): + data[key.replace("_ms", "")] = float(value) / 1000.0 + if "tpot" in key and isinstance(value, (int, float)) and value > 0: + data[key.replace("_ms", "").replace("tpot", "intvty")] = 1000.0 / float(value) + + # Ensure required fields for summarize (fallbacks if raw uses different keys) + if "median_ttft" not in data and "median_ttft_ms" in raw: + data["median_ttft"] = float(raw["median_ttft_ms"]) / 1000.0 + if "median_tpot" not in data and "median_tpot_ms" in raw: + data["median_tpot"] = float(raw["median_tpot_ms"]) / 1000.0 + if "median_e2el" not in data and "median_e2el_ms" in raw: + data["median_e2el"] = float(raw["median_e2el_ms"]) / 1000.0 + if "median_intvty" not in data and "median_tpot_ms" in raw: + data["median_intvty"] = 1000.0 / float(raw["median_tpot_ms"]) + data.setdefault("median_ttft", 0.0) + data.setdefault("median_tpot", 0.0) + data.setdefault("median_e2el", 0.0) + data.setdefault("median_intvty", 0.0) + + return data + + +def load_and_process( + result_dir: Path, + *, + hw: str = "mi355x", + model_prefix: str = "qwen3.5", + framework: str = "sglang", + precision: str = "bf16", + image: str = "", +) -> list[dict[str, Any]]: 
+ """Load all result_*.json files, process, and return agg records.""" + agg_results: list[dict[str, Any]] = [] + + for path in sorted(result_dir.rglob("*.json")): + parsed = parse_result_filename(path) + if not parsed: + continue + + tp, conc, isl, osl = parsed + try: + with open(path) as f: + raw = json.load(f) + except (json.JSONDecodeError, OSError) as e: + print(f"Warning: Skipping {path}: {e}", file=sys.stderr) + continue + + if "total_token_throughput" not in raw or "model_id" not in raw: + print(f"Warning: Skipping {path}: missing required fields", file=sys.stderr) + continue + + agg = process_raw_result( + raw, + tp, + conc, + isl, + osl, + hw=hw, + model_prefix=model_prefix, + framework=framework, + precision=precision, + image=image, + ) + agg_results.append(agg) + + return agg_results + + +def to_markdown_table(results: list[dict[str, Any]]) -> str: + """Format results as markdown table (same columns as summarize.py single-node).""" + if not results: + return "No results to display.\n" + + results.sort( + key=lambda r: ( + r["infmax_model_prefix"], + r["hw"], + r["framework"], + r["precision"], + r["isl"], + r["osl"], + r["tp"], + r["ep"], + r["conc"], + ) + ) + + headers = [ + "Model", + "Served Model", + "Hardware", + "Framework", + "Precision", + "ISL", + "OSL", + "TP", + "EP", + "DP Attention", + "Conc", + "TTFT (ms)", + "TPOT (ms)", + "Interactivity (tok/s/user)", + "E2EL (s)", + "TPUT per GPU", + "Output TPUT per GPU", + "Input TPUT per GPU", + ] + + rows = [ + [ + r["infmax_model_prefix"], + r["model"], + r["hw"].upper(), + r["framework"].upper(), + r["precision"].upper(), + r["isl"], + r["osl"], + r["tp"], + r["ep"], + r["dp_attention"], + r["conc"], + f"{r['median_ttft'] * 1000:.4f}", + f"{r['median_tpot'] * 1000:.4f}", + f"{r['median_intvty']:.4f}", + f"{r['median_e2el']:.4f}", + f"{r['tput_per_gpu']:.4f}", + f"{r['output_tput_per_gpu']:.4f}", + f"{r['input_tput_per_gpu']:.4f}", + ] + for r in results + ] + + if tabulate: + return 
tabulate(rows, headers=headers, tablefmt="github") + # Fallback: simple markdown table + sep = "|" + "|".join(["---"] * len(headers)) + "|" + header_row = "|" + "|".join(headers) + "|" + data_rows = "\n".join("|" + "|".join(str(c) for c in row) + "|" for row in rows) + return f"{header_row}\n{sep}\n{data_rows}" + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Process interactive benchmark results and output markdown table" + ) + parser.add_argument( + "result_dir", + type=Path, + help="Directory containing result_TP*_CONC*_ISL*_OSL*.json files", + ) + parser.add_argument( + "-o", + "--output", + type=Path, + default=None, + help="Write markdown to file (default: stdout)", + ) + parser.add_argument( + "--hw", + type=str, + default="mi355x", + help="Hardware label (default: mi355x)", + ) + parser.add_argument( + "--model-prefix", + type=str, + default="qwen3.5", + help="Model prefix (default: qwen3.5)", + ) + parser.add_argument( + "--framework", + type=str, + default="sglang", + help="Framework (default: sglang)", + ) + parser.add_argument( + "--precision", + type=str, + default="bf16", + help="Precision (default: bf16)", + ) + parser.add_argument( + "--image", + type=str, + default="", + help="Docker image (optional)", + ) + parser.add_argument( + "--agg-json", + type=Path, + default=None, + help="Also write aggregated JSON to file (for collect_results compatibility)", + ) + + args = parser.parse_args() + + if not args.result_dir.is_dir(): + print(f"Error: {args.result_dir} is not a directory", file=sys.stderr) + sys.exit(1) + + results = load_and_process( + args.result_dir, + hw=args.hw, + model_prefix=args.model_prefix, + framework=args.framework, + precision=args.precision, + image=args.image, + ) + + if not results: + print("No result files found matching result_TP*_CONC*_ISL*_OSL*.json", file=sys.stderr) + sys.exit(1) + + md = to_markdown_table(results) + full_output = f"## Interactive Benchmark Results\n\n{md}\n" + + if args.output: 
+ args.output.write_text(full_output) + print(f"Wrote summary to {args.output}", file=sys.stderr) + else: + print(full_output) + + if args.agg_json: + with open(args.agg_json, "w") as f: + json.dump(results, f, indent=2) + print(f"Wrote aggregated JSON to {args.agg_json}", file=sys.stderr) + + +if __name__ == "__main__": + main()