From 8738e721732510d72981a1b2e45a59bc143ef1fe Mon Sep 17 00:00:00 2001
From: root <zhimding@amd.com>
Date: Thu, 21 May 2026 08:51:22 +0000
Subject: [PATCH 1/7] CI: add MoE perf regression check (bench_moe)

Catch MoE kernel performance regressions per-PR by piggybacking on the
existing test_moe_2stage.py run in aiter-test:

- test_moe_2stage.py drops a moe_bench.csv (CSV-mode rows only, perf-only)
- standard job uploads the csv alongside latest_test.log
- new bench_moe job (ubuntu, no GPU) downloads the linux-aiter-mi35x-1
  shard csv, compares vs the last main baseline (artifact moe-bench-<SHA>,
  90d retention), reports to STEP_SUMMARY (warn-only for now)
- main push / workflow_dispatch publishes the next baseline

Thresholds default to WARN > 1.10 and REGRESS > 1.15 (ratio = current_us /
baseline_us, so higher is slower); --fail-on-regress stays off until the
noise floor has been characterized over 2-4 weeks.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/scripts/check_moe_regression.sh |  36 +++
 .github/workflows/aiter-test.yaml       | 149 +++++++++-
 op_tests/test_moe_2stage.py             |  12 +
 scripts/compare_benchmark.py            | 343 ++++++++++++++++++++++++
 4 files changed, 539 insertions(+), 1 deletion(-)
 create mode 100755 .github/scripts/check_moe_regression.sh
 create mode 100644 scripts/compare_benchmark.py
diff --git a/.github/scripts/check_moe_regression.sh b/.github/scripts/check_moe_regression.sh
new file mode 100755
index 0000000000..6e2694e41a
--- /dev/null
+++ b/.github/scripts/check_moe_regression.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: MIT
+# Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# Wrap scripts/compare_benchmark.py for the bench_moe CI job.
+# Prints a comparison table to stdout (captured by the job step into
+# $GITHUB_STEP_SUMMARY). Exits 0 unless --fail-on-regress is passed and
+# at least one REGRESS row is found.
+#
+# Usage: check_moe_regression.sh <baseline_csv> <current_csv> [extra args...]
+set -euo pipefail
+
+BASE=${1:?baseline csv path required}
+CURR=${2:?current csv path required}
+shift 2
+
+BASE_LABEL="baseline"
+CURR_LABEL="current"
+if [[ -n "${BASE_SHA:-}" ]]; then
+    BASE_LABEL="main(${BASE_SHA:0:7})"
+fi
+if [[ -n "${CURR_SHA:-}" ]]; then
+    CURR_LABEL="PR(${CURR_SHA:0:7})"
+elif [[ -n "${GITHUB_SHA:-}" ]]; then
+    CURR_LABEL="${GITHUB_SHA:0:7}"
+fi
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+
+python3 "${REPO_ROOT}/scripts/compare_benchmark.py" \
+    "$BASE" "$CURR" \
+    --baseline-label "$BASE_LABEL" \
+    --current-label "$CURR_LABEL" \
+    --warn 1.10 \
+    --fail 1.15 \
+    "$@"
diff --git a/.github/workflows/aiter-test.yaml b/.github/workflows/aiter-test.yaml
index b8a2f40fa6..d34eb7bebc 100644
--- a/.github/workflows/aiter-test.yaml
+++ b/.github/workflows/aiter-test.yaml
@@ -507,7 +507,9 @@ jobs:
         if: success()
         with:
           name: standard-test-log-${{ matrix.runner }}-shard-${{ matrix.shard_idx }}
-          path: latest_test.log
+          path: |
+            latest_test.log
+            moe_bench.csv
           retention-days: 7
 
       - name: Cleanup container
@@ -756,3 +758,148 @@ jobs:
         if: always()
         run: |
           ./.github/scripts/clean_up_rocm.sh
+
+  bench_moe:
+    # MoE perf regression check.
+    # - PR: pull last main baseline CSV, compare current shard's CSV, warn-only
+    # - push to main / workflow_dispatch: publish current CSV as next baseline
+    # Only consumes csv from linux-aiter-mi35x-1 runner to avoid cross-arch noise.
+    name: MoE Bench Regression
+    if: >-
+      !github.event.pull_request.draft &&
+      github.event.action != 'labeled' &&
+      github.event_name != 'schedule'
+    runs-on: ubuntu-latest
+    needs: [standard]
+    timeout-minutes: 15
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Download standard test logs (mi35x only)
+        uses: actions/download-artifact@v4
+        with:
+          pattern: standard-test-log-linux-aiter-mi35x-1-shard-*
+          path: /tmp/logs/
+
+      - name: Locate current moe_bench.csv
+        id: current
+        run: |
+          shopt -s nullglob
+          csv_files=(/tmp/logs/*/moe_bench.csv)
+          if [[ ${#csv_files[@]} -eq 0 ]]; then
+              echo "::warning::No moe_bench.csv found in any mi35x shard; skipping bench_moe"
+              echo "found=false" >> "$GITHUB_OUTPUT"
+              exit 0
+          fi
+          if [[ ${#csv_files[@]} -gt 1 ]]; then
+              echo "::warning::Multiple moe_bench.csv found, using first: ${csv_files[0]}"
+              printf '  %s\n' "${csv_files[@]}"
+          fi
+          cp "${csv_files[0]}" /tmp/current.csv
+          echo "Current CSV: ${csv_files[0]} ($(wc -l < /tmp/current.csv) lines)"
+          echo "found=true" >> "$GITHUB_OUTPUT"
+
+      # ── PR path: compare vs baseline ──
+      - name: Fetch baseline from PR base.sha
+        if: steps.current.outputs.found == 'true' && github.event_name == 'pull_request'
+        id: baseline_pinned
+        continue-on-error: true
+        uses: dawidd6/action-download-artifact@v3
+        with:
+          workflow: aiter-test.yaml
+          commit: ${{ github.event.pull_request.base.sha }}
+          name: moe-bench-${{ github.event.pull_request.base.sha }}
+          path: /tmp/baseline_pinned/
+          if_no_artifact_found: warn
+
+      - name: Fallback — fetch baseline from latest main
+        if: >-
+          steps.current.outputs.found == 'true' &&
+          github.event_name == 'pull_request' &&
+          steps.baseline_pinned.outputs.found_artifact != 'true'
+        id: baseline_main  # pinned step "succeeds" even when nothing was found (warn), so test its output
+        continue-on-error: true
+        uses: dawidd6/action-download-artifact@v3
+        with:
+          workflow: aiter-test.yaml
+          branch: main
+          name_is_regexp: true
+          name: ^moe-bench-[a-f0-9]+$
+          path: /tmp/baseline_main/
+          if_no_artifact_found: warn
+
+      - name: Compare
+        if: steps.current.outputs.found == 'true' && github.event_name == 'pull_request'
+        env:
+          BASE_SHA: ${{ github.event.pull_request.base.sha }}
+          CURR_SHA: ${{ github.event.pull_request.head.sha }}
+        run: |
+          set -e
+          baseline_csv=""
+          if [[ -f /tmp/baseline_pinned/moe_bench.csv ]]; then
+              baseline_csv=/tmp/baseline_pinned/moe_bench.csv
+              echo "Using baseline pinned to PR.base.sha=${BASE_SHA:0:7}"
+          else
+              # fallback: pick first match under /tmp/baseline_main/*
+              shopt -s nullglob
+              candidates=(/tmp/baseline_main/*/moe_bench.csv /tmp/baseline_main/moe_bench.csv)
+              for c in "${candidates[@]}"; do
+                  if [[ -f "$c" ]]; then
+                      baseline_csv="$c"
+                      echo "Using fallback baseline from latest main: $c"
+                      break
+                  fi
+              done
+          fi
+          if [[ -z "$baseline_csv" ]]; then
+              echo "::warning::No MoE baseline found (neither pinned PR.base.sha nor latest main); skipping compare."
+              {
+                  echo "## MoE Bench"
+                  echo
+                  echo "_No baseline available — first run on this branch or main hasn't published baseline yet._"
+              } >> "$GITHUB_STEP_SUMMARY"
+              exit 0
+          fi
+          echo "## MoE Bench (vs baseline)" >> "$GITHUB_STEP_SUMMARY"
+          echo '```' >> "$GITHUB_STEP_SUMMARY"
+          bash .github/scripts/check_moe_regression.sh \
+              "$baseline_csv" /tmp/current.csv \
+              | tee -a "$GITHUB_STEP_SUMMARY"
+          echo '```' >> "$GITHUB_STEP_SUMMARY"
+
+      # ── main push / workflow_dispatch path: publish baseline ──
+      - name: Stage baseline payload
+        if: >-
+          steps.current.outputs.found == 'true' &&
+          (github.event_name == 'push' && github.ref == 'refs/heads/main'
+           || github.event_name == 'workflow_dispatch')
+        run: |
+          mkdir -p /tmp/publish
+          cp /tmp/current.csv /tmp/publish/moe_bench.csv
+          python3 -c "
+          import json, os, datetime
+          meta = {
+              'commit': os.environ['GITHUB_SHA'],
+              'ref': os.environ['GITHUB_REF'],
+              'event': os.environ['GITHUB_EVENT_NAME'],
+              'runner_pool': 'linux-aiter-mi35x-1',
+              'gpu_arch_list': os.environ.get('GPU_ARCH_LIST', ''),
+              'ran_at': datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),  # timezone-aware; utcnow() is deprecated since Python 3.12
+          }
+          with open('/tmp/publish/metadata.json', 'w') as f:
+              json.dump(meta, f, indent=2)
+          print(json.dumps(meta, indent=2))
+          "
+          ls -la /tmp/publish/
+          wc -l /tmp/publish/moe_bench.csv
+
+      - name: Publish baseline artifact
+        if: >-
+          steps.current.outputs.found == 'true' &&
+          (github.event_name == 'push' && github.ref == 'refs/heads/main'
+           || github.event_name == 'workflow_dispatch')
+        uses: actions/upload-artifact@v4
+        with:
+          name: moe-bench-${{ github.sha }}
+          path: /tmp/publish/
+          retention-days: 90
diff --git a/op_tests/test_moe_2stage.py b/op_tests/test_moe_2stage.py
index 16c3f55ea7..ab8821c6c0 100644
--- a/op_tests/test_moe_2stage.py
+++ b/op_tests/test_moe_2stage.py
@@ -863,3 +863,15 @@ def _kw(
 df = pd.DataFrame(df)
 df_md = df.to_markdown(index=False)
 aiter.logger.info("moe_2stage summary (markdown):\n%s", df_md)
+
+_csv_out = os.environ.get("AITER_MOE_BENCH_CSV", "moe_bench.csv")
+if _csv_out and len(df) > 0:
+    if "model" in df.columns:
+        csv_df = df[df["model"] != "legacy"].copy()
+    else:
+        csv_df = df.copy()
+    csv_df = csv_df.drop(columns=["logits_diff"], errors="ignore")
+    csv_df.to_csv(_csv_out, index=False)
+    aiter.logger.info(
+        "moe_2stage: wrote %d csv-mode rows to %s", len(csv_df), _csv_out
+    )
diff --git a/scripts/compare_benchmark.py b/scripts/compare_benchmark.py
new file mode 100644
index 0000000000..9331250f4e
--- /dev/null
+++ b/scripts/compare_benchmark.py
@@ -0,0 +1,343 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+# Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+"""Compare two MoE benchmark CSVs (wide-table, metric=us, lower is better).
+
+Used by .github/workflows/aiter-test.yaml bench_moe job to flag MoE
+performance regressions between a PR and main.
+
+CSV schema:
+    - Columns derived from tuned_fmoe.csv shape/params (dtype, token,
+      model_dim, inter_dim, E, topk, actType, ...) form the JOIN KEY.
+    - `us` is the metric (microseconds, lower = faster).
+    - `kernelName1`, `kernelName2` are NOT part of the key (kernel choice
+      may differ between baseline and current); they are shown on a
+      follow-up line when they differ.
+    - Any other column is treated as a key column.
+
+Status legend (all rows are printed, prefixed with status tag):
+    [REGRESS]  ratio = current_us / baseline_us > FAIL threshold
+    [WARN]     ratio > WARN threshold (and <= FAIL)
+    [OK]       ratio <= WARN (including faster-than-baseline)
+    [NEW]      shape present in current only (no baseline)
+    [REMOVED]  shape present in baseline only (current missing)
+    [SKIPPED]  missing or invalid us value on either side
+
+Rows are sorted worst-first: REGRESS, WARN, OK, NEW, REMOVED, SKIPPED.
+
+Exit code:
+    0      comparison completed (regressions are reported but not fatal).
+    1      REGRESS found with --fail-on-regress, or bad input CSV/arguments.
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import sys
+from pathlib import Path
+from typing import Dict, Tuple
+
+METRIC = "us"
+KERNEL_COLS = ("kernelName1", "kernelName2")
+NON_KEY = {METRIC, *KERNEL_COLS}
+
+# Cols kept in the join key (for correct shape matching) but hidden from
+# the printed table (low signal — usually constant across runs).
+HIDE_DISPLAY_COLS = (
+    "preshuffle", "strict_accuracy", "check_aot_cache", "swiglu_limit",
+    # Source cols folded into derived `hip` column below
+    "hidden_pad", "intermediate_pad",
+)
+
+# Derived display columns. Each entry: derived_name -> (source_col_a, source_col_b)
+# Value rendered as "(a, b)" tuple. Source cols stay in the join key; they're
+# just hidden from display (covered by HIDE_DISPLAY_COLS above) and a synthetic
+# tuple-valued col is inserted in their place.
+DERIVED_TUPLE_COLS = {
+    "hip": ("hidden_pad", "intermediate_pad"),  # hidden_pad / intermediate_pad
+}
+
+# Display-only abbreviations. Applied at print time; underlying join key
+# still uses full strings, so matching across files is unaffected.
+# NOTE: `torch.float8_e4m3fnuz` is AMD's default fp8, mapped to `fp8` so it
+# stays consistent with `torch.fp8` alias. The OCP / e5m2 variants keep
+# the suffix so they remain distinguishable.
+_VALUE_ABBREV = {
+    "torch.bfloat16": "bf16",
+    "torch.float16": "fp16",
+    "torch.float32": "fp32",
+    "torch.float8_e4m3fnuz": "fp8",        # AMD default
+    "torch.float8_e4m3fn": "fp8e4m3fn",    # OCP
+    "torch.float8_e5m2": "fp8e5m2",
+    "torch.float4_e2m1fn_x2": "fp4",       # x2 = packed (2 elems per byte)
+    "torch.fp8": "fp8",
+    "torch.fp4x2": "fp4",
+    "torch.int8": "i8",
+    "torch.int4": "i4",
+    "torch.i4x2": "i4",
+    # Booleans
+    "True": "T",
+    "False": "F",
+}
+# Enum class prefixes to strip ("ActivationType.Silu" -> "Silu")
+_STRIP_PREFIXES = ("ActivationType.", "QuantType.", "GateMode.")
+
+
+def _abbreviate(val: str) -> str:
+    """Shorten verbose enum/dtype/bool values for table display."""
+    if val in _VALUE_ABBREV:
+        return _VALUE_ABBREV[val]
+    for prefix in _STRIP_PREFIXES:
+        if val.startswith(prefix):
+            return val[len(prefix):]
+    return val
+
+
+def _natural_key(val: str):
+    """Cast numeric strings to numbers for natural sort (token=2 < 16 < 128)."""
+    try:
+        return (0, int(val))
+    except ValueError:
+        try:
+            return (0, float(val))
+        except ValueError:
+            return (1, val)
+
+
+Row = Dict[str, str]
+Key = Tuple[Tuple[str, str], ...]
+
+
+def _read_csv(path: Path) -> Tuple[Dict[Key, Row], Tuple[str, ...]]:
+    """Return ({key: row}, key_col_order); values are stripped and never None."""
+    rows: Dict[Key, Row] = {}
+    if not path.exists():
+        raise SystemExit(f"input csv not found: {path}")
+    with path.open(newline="") as f:
+        reader = csv.DictReader(f)
+        if reader.fieldnames is None or METRIC not in reader.fieldnames:
+            raise SystemExit(
+                f"{path} missing required column `{METRIC}`; "
+                f"got columns: {reader.fieldnames}"
+            )
+        key_cols = tuple(c for c in reader.fieldnames if c not in NON_KEY)
+        for raw in reader:
+            # Strip whitespace to avoid silent join misses. Short rows are padded
+            # with None by DictReader (-> ""); surplus cells (key None) are dropped.
+            raw = {k: (v or "").strip() for k, v in raw.items() if k is not None}
+            key = tuple(sorted((c, raw.get(c, "")) for c in key_cols))
+            rows[key] = raw
+    return rows, key_cols
+
+
+def _parse_us(raw: Row) -> float | None:
+    """Return the `us` metric as a finite float; None if missing/invalid."""
+    try:
+        us = float(raw.get(METRIC) or "")
+    except ValueError:  # "", "-", "skip" and any other non-numeric marker
+        return None
+    # float() also parses "nan"/"inf" in any spelling; reject non-finite values.
+    return us if float("-inf") < us < float("inf") else None
+
+
+def _fmt_key_compact(
+    key: Key, key_cols_order: Tuple[str, ...], constants: Dict[str, str]
+) -> str:
+    """Format key showing only cols whose value is NOT in `constants`."""
+    d = dict(key)
+    parts = []
+    for c in key_cols_order:
+        if c in d and c not in constants:
+            parts.append(f"{c}={d[c]}")
+    return " ".join(parts) if parts else "(common)"
+
+
+def _find_constants(
+    keys: list[Key], key_cols_order: Tuple[str, ...]
+) -> Dict[str, str]:
+    """Return cols whose value is identical across all `keys`."""
+    if not keys:
+        return {}
+    first = dict(keys[0])
+    constants = {}
+    for c in key_cols_order:
+        if c not in first:
+            continue
+        v = first[c]
+        if all(dict(k).get(c) == v for k in keys):
+            constants[c] = v
+    return constants
+
+
+def _kernel_diff(base_row: Row, cur_row: Row) -> list[str]:
+    """Return list of `kernelNameX: <base> -> <cur>` for cols that differ."""
+    diffs = []
+    for c in KERNEL_COLS:
+        b, k = base_row.get(c, ""), cur_row.get(c, "")
+        if b != k:
+            diffs.append(f"{c}: {b}  ->  {k}")
+    return diffs
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument("baseline_csv", type=Path)
+    parser.add_argument("current_csv", type=Path)
+    parser.add_argument("--baseline-label", default="baseline")
+    parser.add_argument("--current-label", default="current")
+    parser.add_argument(
+        "--warn",
+        type=float,
+        default=1.10,
+        help="warn threshold: ratio > this is `warn` (default 1.10 = 10%% slower)",
+    )
+    parser.add_argument(
+        "--fail",
+        type=float,
+        default=1.15,
+        help="regress threshold: ratio > this is `REGRESS` (default 1.15 = 15%% slower)",
+    )
+    parser.add_argument(
+        "--fail-on-regress",
+        action="store_true",
+        help="exit 1 if any REGRESS row found (default: report only, exit 0)",
+    )
+    args = parser.parse_args()
+
+    if args.warn >= args.fail:
+        raise SystemExit(f"--warn ({args.warn}) must be < --fail ({args.fail})")
+
+    baseline, _ = _read_csv(args.baseline_csv)
+    current, key_cols = _read_csv(args.current_csv)
+
+    print(f"=== MoE bench: {args.current_label} vs {args.baseline_label} ===")
+    print(f"  baseline: {args.baseline_csv}  ({len(baseline)} rows)")
+    print(f"  current:  {args.current_csv}  ({len(current)} rows)")
+    print(f"  thresholds: warn>{args.warn:.2f}, fail>{args.fail:.2f}")
+    print()
+
+    common = sorted(baseline.keys() & current.keys())
+    only_curr = sorted(current.keys() - baseline.keys())
+    only_base = sorted(baseline.keys() - current.keys())
+
+    # Classify every row. Each entry: (sort_rank, status_tag, key, base, cur, ratio)
+    # sort_rank: 0=REGRESS, 1=WARN, 2=OK, 3=NEW, 4=REMOVED, 5=SKIPPED
+    entries: list[tuple[int, str, Key, float | None, float | None, float | None]] = []
+    n_regress = n_warn = n_ok = n_skip = 0
+    for key in common:
+        b_us = _parse_us(baseline[key])
+        c_us = _parse_us(current[key])
+        if b_us is None or c_us is None or b_us <= 0:
+            n_skip += 1
+            entries.append((5, "SKIPPED", key, b_us, c_us, None))
+            continue
+        ratio = c_us / b_us
+        if ratio > args.fail:
+            rank, tag = 0, "REGRESS"
+            n_regress += 1
+        elif ratio > args.warn:
+            rank, tag = 1, "WARN"
+            n_warn += 1
+        else:
+            rank, tag = 2, "OK"
+            n_ok += 1
+        entries.append((rank, tag, key, b_us, c_us, ratio))
+    for key in only_curr:
+        c_us = _parse_us(current[key])
+        entries.append((3, "NEW", key, None, c_us, None))
+    for key in only_base:
+        b_us = _parse_us(baseline[key])
+        entries.append((4, "REMOVED", key, b_us, None, None))
+
+    # Sort worst-first, then by key (natural sort: token=2 < 16 < 128)
+    def _entry_sort_key(e):
+        rank, _tag, key, *_ = e
+        return (rank, [_natural_key(v) for _, v in key])
+
+    entries.sort(key=_entry_sort_key)
+
+    # ── Build proper tabular output ──
+    # Columns: status, *display_cols, cur(us), base(us), ratio
+    # display_cols = key_cols minus HIDE_DISPLAY_COLS, with each derived
+    # tuple col inserted at the position of its first source col.
+    # Hidden source cols still contribute to the join key.
+    # Kernel diffs (not in table) go on indented ↳ sub-lines below each row.
+    METRIC_HDRS = ("cur(us)", "base(us)", "ratio")
+
+    # Build display_cols: walk key_cols, drop hidden, splice derived in place
+    _derived_sources = {src for sources in DERIVED_TUPLE_COLS.values() for src in sources}
+    _derived_first_src = {sources[0]: name for name, sources in DERIVED_TUPLE_COLS.items()}
+    display_cols: list[str] = []
+    for c in key_cols:
+        if c in _derived_first_src:
+            display_cols.append(_derived_first_src[c])
+        if c not in HIDE_DISPLAY_COLS:
+            display_cols.append(c)
+
+    def _cell_value(c: str, d: Dict[str, str]) -> str:
+        if c in DERIVED_TUPLE_COLS:
+            srcs = DERIVED_TUPLE_COLS[c]
+            return "(" + ", ".join(d.get(s, "") for s in srcs) + ")"
+        return _abbreviate(d.get(c, ""))
+
+    def _row_cells(rank, tag, key, b_us, c_us, ratio):
+        d = dict(key)
+        cells = [f"[{tag}]"]
+        for c in display_cols:
+            cells.append(_cell_value(c, d))
+        cells.append(f"{c_us:.2f}" if c_us is not None else "-")
+        cells.append(f"{b_us:.2f}" if b_us is not None else "-")
+        cells.append(f"{ratio:.3f}" if ratio is not None else "-")
+        return cells
+
+    header = ["status", *display_cols, *METRIC_HDRS]
+    body = [_row_cells(*e) for e in entries]
+
+    # Column widths = max(header, max value)
+    widths = [
+        max(len(header[i]), *(len(r[i]) for r in body)) if body else len(header[i])
+        for i in range(len(header))
+    ]
+
+    # Right-justify the 3 metric cols (numbers), left-justify the rest.
+    def _fmt_row(cells):
+        out = []
+        n = len(cells)
+        for i, c in enumerate(cells):
+            justify = str.rjust if i >= n - 3 else str.ljust
+            out.append(justify(c, widths[i]))
+        return "  ".join(out)
+
+    print(_fmt_row(header))
+    print("  ".join("-" * w for w in widths))
+    for cells, e in zip(body, entries):
+        print(_fmt_row(cells))
+        # Kernel-diff sub-lines (indented to align under shape columns)
+        tag = e[1]
+        key = e[2]
+        if tag in ("REGRESS", "WARN", "OK"):
+            for d in _kernel_diff(baseline[key], current[key]):
+                # Indent past the status column for visual hierarchy
+                print(" " * (widths[0] + 2) + "↳  " + d)
+
+    print()
+    print("Summary:")
+    print(f"  compared: {len(common)}")
+    print(f"  REGRESS:  {n_regress}")
+    print(f"  WARN:     {n_warn}")
+    print(f"  OK:       {n_ok}")
+    print(f"  NEW:      {len(only_curr)}")
+    print(f"  REMOVED:  {len(only_base)}")
+    print(f"  SKIPPED:  {n_skip}  (missing/invalid us value)")
+
+    if args.fail_on_regress and n_regress > 0:
+        print(f"\nFAIL: {n_regress} regression(s) above threshold.", file=sys.stderr)
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From c6ea826cdeaee2734ec6964166065162a7608546 Mon Sep 17 00:00:00 2001
From: root <zhimding@amd.com>
Date: Thu, 21 May 2026 08:58:01 +0000
Subject: [PATCH 2/7] fix: apply black format to bench_moe files

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 op_tests/test_moe_2stage.py  |  4 +---
 scripts/compare_benchmark.py | 28 +++++++++++++++++-----------
 2 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/op_tests/test_moe_2stage.py b/op_tests/test_moe_2stage.py
index ab8821c6c0..e9dfdded65 100644
--- a/op_tests/test_moe_2stage.py
+++ b/op_tests/test_moe_2stage.py
@@ -872,6 +872,4 @@ def _kw(
         csv_df = df.copy()
     csv_df = csv_df.drop(columns=["logits_diff"], errors="ignore")
     csv_df.to_csv(_csv_out, index=False)
-    aiter.logger.info(
-        "moe_2stage: wrote %d csv-mode rows to %s", len(csv_df), _csv_out
-    )
+    aiter.logger.info("moe_2stage: wrote %d csv-mode rows to %s", len(csv_df), _csv_out)
diff --git a/scripts/compare_benchmark.py b/scripts/compare_benchmark.py
index 9331250f4e..6a03faf1fb 100644
--- a/scripts/compare_benchmark.py
+++ b/scripts/compare_benchmark.py
@@ -45,9 +45,13 @@
 # Cols kept in the join key (for correct shape matching) but hidden from
 # the printed table (low signal — usually constant across runs).
 HIDE_DISPLAY_COLS = (
-    "preshuffle", "strict_accuracy", "check_aot_cache", "swiglu_limit",
+    "preshuffle",
+    "strict_accuracy",
+    "check_aot_cache",
+    "swiglu_limit",
     # Source cols folded into derived `hip` column below
-    "hidden_pad", "intermediate_pad",
+    "hidden_pad",
+    "intermediate_pad",
 )
 
 # Derived display columns. Each entry: derived_name -> (source_col_a, source_col_b)
@@ -67,10 +71,10 @@
     "torch.bfloat16": "bf16",
     "torch.float16": "fp16",
     "torch.float32": "fp32",
-    "torch.float8_e4m3fnuz": "fp8",        # AMD default
-    "torch.float8_e4m3fn": "fp8e4m3fn",    # OCP
+    "torch.float8_e4m3fnuz": "fp8",  # AMD default
+    "torch.float8_e4m3fn": "fp8e4m3fn",  # OCP
     "torch.float8_e5m2": "fp8e5m2",
-    "torch.float4_e2m1fn_x2": "fp4",       # x2 = packed (2 elems per byte)
+    "torch.float4_e2m1fn_x2": "fp4",  # x2 = packed (2 elems per byte)
     "torch.fp8": "fp8",
     "torch.fp4x2": "fp4",
     "torch.int8": "i8",
@@ -90,7 +94,7 @@ def _abbreviate(val: str) -> str:
         return _VALUE_ABBREV[val]
     for prefix in _STRIP_PREFIXES:
         if val.startswith(prefix):
-            return val[len(prefix):]
+            return val[len(prefix) :]
     return val
 
 
@@ -153,9 +157,7 @@ def _fmt_key_compact(
     return " ".join(parts) if parts else "(common)"
 
 
-def _find_constants(
-    keys: list[Key], key_cols_order: Tuple[str, ...]
-) -> Dict[str, str]:
+def _find_constants(keys: list[Key], key_cols_order: Tuple[str, ...]) -> Dict[str, str]:
     """Return cols whose value is identical across all `keys`."""
     if not keys:
         return {}
@@ -268,8 +270,12 @@ def _entry_sort_key(e):
     METRIC_HDRS = ("cur(us)", "base(us)", "ratio")
 
     # Build display_cols: walk key_cols, drop hidden, splice derived in place
-    _derived_sources = {src for sources in DERIVED_TUPLE_COLS.values() for src in sources}
-    _derived_first_src = {sources[0]: name for name, sources in DERIVED_TUPLE_COLS.items()}
+    _derived_sources = {
+        src for sources in DERIVED_TUPLE_COLS.values() for src in sources
+    }
+    _derived_first_src = {
+        sources[0]: name for name, sources in DERIVED_TUPLE_COLS.items()
+    }
     display_cols: list[str] = []
     for c in key_cols:
         if c in _derived_first_src:

From 14eee5fe23c0a283b9afb350e724188ddf97e349 Mon Sep 17 00:00:00 2001
From: coderfeli <felix.li@amd.com>
Date: Sat, 23 May 2026 11:47:35 +0000
Subject: [PATCH 3/7] Rename MoE bench CI summary.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 ...ession.sh => check_tuned_op_regression.sh} |  4 +-
 .github/workflows/aiter-test.yaml             | 40 +++++++++----------
 op_tests/test_moe_2stage.py                   |  2 +-
 scripts/compare_benchmark.py                  | 21 +++++-----
 4 files changed, 33 insertions(+), 34 deletions(-)
 rename .github/scripts/{check_moe_regression.sh => check_tuned_op_regression.sh} (86%)

diff --git a/.github/scripts/check_moe_regression.sh b/.github/scripts/check_tuned_op_regression.sh
similarity index 86%
rename from .github/scripts/check_moe_regression.sh
rename to .github/scripts/check_tuned_op_regression.sh
index 6e2694e41a..2a48507c7d 100755
--- a/.github/scripts/check_moe_regression.sh
+++ b/.github/scripts/check_tuned_op_regression.sh
@@ -2,12 +2,12 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
 #
-# Wrap scripts/compare_benchmark.py for the bench_moe CI job.
+# Wrap scripts/compare_benchmark.py for the tuned_op_bench CI job.
 # Prints a comparison table to stdout (captured by the job step into
 # $GITHUB_STEP_SUMMARY). Exits 0 unless --fail-on-regress is passed and
 # at least one REGRESS row is found.
 #
-# Usage: check_moe_regression.sh <baseline_csv> <current_csv> [extra args...]
+# Usage: check_tuned_op_regression.sh <baseline_csv> <current_csv> [extra args...]
 set -euo pipefail
 
 BASE=${1:?baseline csv path required}
diff --git a/.github/workflows/aiter-test.yaml b/.github/workflows/aiter-test.yaml
index d34eb7bebc..1c4a36cf35 100644
--- a/.github/workflows/aiter-test.yaml
+++ b/.github/workflows/aiter-test.yaml
@@ -509,7 +509,7 @@ jobs:
           name: standard-test-log-${{ matrix.runner }}-shard-${{ matrix.shard_idx }}
           path: |
             latest_test.log
-            moe_bench.csv
+            tuned_op_bench.csv
           retention-days: 7
 
       - name: Cleanup container
@@ -759,12 +759,12 @@ jobs:
         run: |
           ./.github/scripts/clean_up_rocm.sh
 
-  bench_moe:
-    # MoE perf regression check.
+  tuned_op_bench:
+    # Tuned operator perf regression check.
     # - PR: pull last main baseline CSV, compare current shard's CSV, warn-only
     # - push to main / workflow_dispatch: publish current CSV as next baseline
     # Only consumes csv from linux-aiter-mi35x-1 runner to avoid cross-arch noise.
-    name: MoE Bench Regression
+    name: Tuned Op Bench
     if: >-
       !github.event.pull_request.draft &&
       github.event.action != 'labeled' &&
@@ -781,18 +781,18 @@ jobs:
           pattern: standard-test-log-linux-aiter-mi35x-1-shard-*
           path: /tmp/logs/
 
-      - name: Locate current moe_bench.csv
+      - name: Locate current tuned_op_bench.csv
         id: current
         run: |
           shopt -s nullglob
-          csv_files=(/tmp/logs/*/moe_bench.csv)
+          csv_files=(/tmp/logs/*/tuned_op_bench.csv)
           if [[ ${#csv_files[@]} -eq 0 ]]; then
-              echo "::warning::No moe_bench.csv found in any mi35x shard; skipping bench_moe"
+              echo "::warning::No tuned op benchmark CSV found in any mi35x shard; skipping tuned_op_bench"
               echo "found=false" >> "$GITHUB_OUTPUT"
               exit 0
           fi
           if [[ ${#csv_files[@]} -gt 1 ]]; then
-              echo "::warning::Multiple moe_bench.csv found, using first: ${csv_files[0]}"
+              echo "::warning::Multiple tuned op benchmark CSVs found, using first: ${csv_files[0]}"
               printf '  %s\n' "${csv_files[@]}"
           fi
           cp "${csv_files[0]}" /tmp/current.csv
@@ -808,7 +808,7 @@ jobs:
         with:
           workflow: aiter-test.yaml
           commit: ${{ github.event.pull_request.base.sha }}
-          name: moe-bench-${{ github.event.pull_request.base.sha }}
+          name: tuned-op-bench-${{ github.event.pull_request.base.sha }}
           path: /tmp/baseline_pinned/
           if_no_artifact_found: warn
 
@@ -824,7 +824,7 @@ jobs:
           workflow: aiter-test.yaml
           branch: main
           name_is_regexp: true
-          name: ^moe-bench-[a-f0-9]+$
+          name: ^tuned-op-bench-[a-f0-9]+$
           path: /tmp/baseline_main/
           if_no_artifact_found: warn
 
@@ -836,13 +836,13 @@ jobs:
         run: |
           set -e
           baseline_csv=""
-          if [[ -f /tmp/baseline_pinned/moe_bench.csv ]]; then
-              baseline_csv=/tmp/baseline_pinned/moe_bench.csv
+          if [[ -f /tmp/baseline_pinned/tuned_op_bench.csv ]]; then
+              baseline_csv=/tmp/baseline_pinned/tuned_op_bench.csv
               echo "Using baseline pinned to PR.base.sha=${BASE_SHA:0:7}"
           else
               # fallback: pick first match under /tmp/baseline_main/*
               shopt -s nullglob
-              candidates=(/tmp/baseline_main/*/moe_bench.csv /tmp/baseline_main/moe_bench.csv)
+              candidates=(/tmp/baseline_main/*/tuned_op_bench.csv /tmp/baseline_main/tuned_op_bench.csv)
               for c in "${candidates[@]}"; do
                   if [[ -f "$c" ]]; then
                       baseline_csv="$c"
@@ -852,17 +852,17 @@ jobs:
               done
           fi
           if [[ -z "$baseline_csv" ]]; then
-              echo "::warning::No MoE baseline found (neither pinned PR.base.sha nor latest main); skipping compare."
+              echo "::warning::No tuned op benchmark baseline found (neither pinned PR.base.sha nor latest main); skipping compare."
               {
-                  echo "## MoE Bench"
+                  echo "## Tuned Op Bench"
                   echo
                   echo "_No baseline available — first run on this branch or main hasn't published baseline yet._"
               } >> "$GITHUB_STEP_SUMMARY"
               exit 0
           fi
-          echo "## MoE Bench (vs baseline)" >> "$GITHUB_STEP_SUMMARY"
+          echo "## Tuned Op Bench (vs baseline)" >> "$GITHUB_STEP_SUMMARY"
           echo '```' >> "$GITHUB_STEP_SUMMARY"
-          bash .github/scripts/check_moe_regression.sh \
+          bash .github/scripts/check_tuned_op_regression.sh \
               "$baseline_csv" /tmp/current.csv \
               | tee -a "$GITHUB_STEP_SUMMARY"
           echo '```' >> "$GITHUB_STEP_SUMMARY"
@@ -875,7 +875,7 @@ jobs:
            || github.event_name == 'workflow_dispatch')
         run: |
           mkdir -p /tmp/publish
-          cp /tmp/current.csv /tmp/publish/moe_bench.csv
+          cp /tmp/current.csv /tmp/publish/tuned_op_bench.csv
           python3 -c "
           import json, os, datetime
           meta = {
@@ -891,7 +891,7 @@ jobs:
           print(json.dumps(meta, indent=2))
           "
           ls -la /tmp/publish/
-          wc -l /tmp/publish/moe_bench.csv
+          wc -l /tmp/publish/tuned_op_bench.csv
 
       - name: Publish baseline artifact
         if: >-
@@ -900,6 +900,6 @@ jobs:
            || github.event_name == 'workflow_dispatch')
         uses: actions/upload-artifact@v4
         with:
-          name: moe-bench-${{ github.sha }}
+          name: tuned-op-bench-${{ github.sha }}
           path: /tmp/publish/
           retention-days: 90
diff --git a/op_tests/test_moe_2stage.py b/op_tests/test_moe_2stage.py
index e9dfdded65..d88d6785d7 100644
--- a/op_tests/test_moe_2stage.py
+++ b/op_tests/test_moe_2stage.py
@@ -864,7 +864,7 @@ def _kw(
 df_md = df.to_markdown(index=False)
 aiter.logger.info("moe_2stage summary (markdown):\n%s", df_md)
 
-_csv_out = os.environ.get("AITER_MOE_BENCH_CSV", "moe_bench.csv")
+_csv_out = os.environ.get("AITER_TUNED_OP_BENCH_CSV", "tuned_op_bench.csv")
 if _csv_out and len(df) > 0:
     if "model" in df.columns:
         csv_df = df[df["model"] != "legacy"].copy()
diff --git a/scripts/compare_benchmark.py b/scripts/compare_benchmark.py
index 6a03faf1fb..fd982b652b 100644
--- a/scripts/compare_benchmark.py
+++ b/scripts/compare_benchmark.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
-"""Compare two MoE benchmark CSVs (wide-table, metric=us, lower is better).
+"""Compare two tuned operator benchmark CSVs (wide-table, metric=us, lower is better).
 
-Used by .github/workflows/aiter-test.yaml bench_moe job to flag MoE
+Used by .github/workflows/aiter-test.yaml tuned_op_bench job to flag tuned operator
 performance regressions between a PR and main.
 
 CSV schema:
@@ -215,7 +215,7 @@ def main() -> int:
     baseline, _ = _read_csv(args.baseline_csv)
     current, key_cols = _read_csv(args.current_csv)
 
-    print(f"=== MoE bench: {args.current_label} vs {args.baseline_label} ===")
+    print(f"=== Tuned op bench: {args.current_label} vs {args.baseline_label} ===")
     print(f"  baseline: {args.baseline_csv}  ({len(baseline)} rows)")
     print(f"  current:  {args.current_csv}  ({len(current)} rows)")
     print(f"  thresholds: warn>{args.warn:.2f}, fail>{args.fail:.2f}")
@@ -262,12 +262,12 @@ def _entry_sort_key(e):
     entries.sort(key=_entry_sort_key)
 
     # ── Build proper tabular output ──
-    # Columns: status, *display_cols, cur(us), base(us), ratio
+    # Columns: status, ratio, cur(us), base(us), *display_cols
     # display_cols = key_cols minus HIDE_DISPLAY_COLS, with each derived
     # tuple col inserted at the position of its first source col.
     # Hidden source cols still contribute to the join key.
     # Kernel diffs (not in table) go on indented ↳ sub-lines below each row.
-    METRIC_HDRS = ("cur(us)", "base(us)", "ratio")
+    METRIC_HDRS = ("ratio", "cur(us)", "base(us)")
 
     # Build display_cols: walk key_cols, drop hidden, splice derived in place
     _derived_sources = {
@@ -292,14 +292,14 @@ def _cell_value(c: str, d: Dict[str, str]) -> str:
     def _row_cells(rank, tag, key, b_us, c_us, ratio):
         d = dict(key)
         cells = [f"[{tag}]"]
-        for c in display_cols:
-            cells.append(_cell_value(c, d))
+        cells.append(f"{ratio:.3f}" if ratio is not None else "-")
         cells.append(f"{c_us:.2f}" if c_us is not None else "-")
         cells.append(f"{b_us:.2f}" if b_us is not None else "-")
-        cells.append(f"{ratio:.3f}" if ratio is not None else "-")
+        for c in display_cols:
+            cells.append(_cell_value(c, d))
         return cells
 
-    header = ["status", *display_cols, *METRIC_HDRS]
+    header = ["status", *METRIC_HDRS, *display_cols]
     body = [_row_cells(*e) for e in entries]
 
     # Column widths = max(header, max value)
@@ -311,9 +311,9 @@ def _row_cells(rank, tag, key, b_us, c_us, ratio):
     # Right-justify the 3 metric cols (numbers), left-justify the rest.
     def _fmt_row(cells):
         out = []
-        n = len(cells)
         for i, c in enumerate(cells):
-            justify = str.rjust if i >= n - 3 else str.ljust
+            # Metric cols sit right after the leading status col (index 0).
+            justify = str.rjust if 1 <= i <= len(METRIC_HDRS) else str.ljust
             out.append(justify(c, widths[i]))
         return "  ".join(out)
 

From 47ee273c150776f4bbfb15e99b5abf62fa9c86c8 Mon Sep 17 00:00:00 2001
From: coderfeli <felix.li@amd.com>
Date: Sun, 24 May 2026 14:42:37 +0000
Subject: [PATCH 4/7] Keep tuned op bench data on failures

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .github/workflows/aiter-test.yaml |  6 +++++-
 op_tests/test_moe_2stage.py       | 33 +++++++++++++++++++++----------
 2 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/aiter-test.yaml b/.github/workflows/aiter-test.yaml
index 1c4a36cf35..03d5941640 100644
--- a/.github/workflows/aiter-test.yaml
+++ b/.github/workflows/aiter-test.yaml
@@ -504,12 +504,13 @@ jobs:
 
       - name: Upload test logs
         uses: actions/upload-artifact@v4
-        if: success()
+        if: always()
         with:
           name: standard-test-log-${{ matrix.runner }}-shard-${{ matrix.shard_idx }}
           path: |
             latest_test.log
             tuned_op_bench.csv
+          if-no-files-found: warn
           retention-days: 7
 
       - name: Cleanup container
@@ -766,6 +767,8 @@ jobs:
     # Only consumes csv from linux-aiter-mi35x-1 runner to avoid cross-arch noise.
     name: Tuned Op Bench
     if: >-
+      always() &&
+      !cancelled() &&
       !github.event.pull_request.draft &&
       github.event.action != 'labeled' &&
       github.event_name != 'schedule'
@@ -777,6 +780,7 @@ jobs:
 
       - name: Download standard test logs (mi35x only)
         uses: actions/download-artifact@v4
+        continue-on-error: true
         with:
           pattern: standard-test-log-linux-aiter-mi35x-1-shard-*
           path: /tmp/logs/
diff --git a/op_tests/test_moe_2stage.py b/op_tests/test_moe_2stage.py
index d88d6785d7..fcf40f594b 100644
--- a/op_tests/test_moe_2stage.py
+++ b/op_tests/test_moe_2stage.py
@@ -818,6 +818,28 @@ def _kw(
     _case_iters.append(_iter_legacy_cases())
 case_iter = itertools.chain(*_case_iters)
 
+_csv_out = os.environ.get("AITER_TUNED_OP_BENCH_CSV", "tuned_op_bench.csv")
+if _csv_out and os.path.exists(_csv_out):
+    os.remove(_csv_out)
+
+
+def _write_bench_csv(rows):
+    if not _csv_out or len(rows) == 0:
+        return
+    csv_df = pd.DataFrame(rows)
+    if "model" in csv_df.columns:
+        csv_df = csv_df[csv_df["model"] != "legacy"].copy()
+    else:
+        csv_df = csv_df.copy()
+    if len(csv_df) == 0:
+        return
+    csv_df = csv_df.drop(columns=["logits_diff"], errors="ignore")
+    csv_df.to_csv(_csv_out, index=False)
+    aiter.logger.info(
+        "moe_2stage: wrote %d csv-mode rows to %s", len(csv_df), _csv_out
+    )
+
+
 df = []
 seen = 0
 for kwargs, extras in case_iter:
@@ -853,6 +875,7 @@ def _kw(
         continue
     ret.update(extras)
     df.append(ret)
+    _write_bench_csv(df)
 
 aiter.logger.info(
     "moe_2stage: scanned %d cases, recorded %d results (skipped %d)",
@@ -863,13 +886,3 @@ def _kw(
 df = pd.DataFrame(df)
 df_md = df.to_markdown(index=False)
 aiter.logger.info("moe_2stage summary (markdown):\n%s", df_md)
-
-_csv_out = os.environ.get("AITER_TUNED_OP_BENCH_CSV", "tuned_op_bench.csv")
-if _csv_out and len(df) > 0:
-    if "model" in df.columns:
-        csv_df = df[df["model"] != "legacy"].copy()
-    else:
-        csv_df = df.copy()
-    csv_df = csv_df.drop(columns=["logits_diff"], errors="ignore")
-    csv_df.to_csv(_csv_out, index=False)
-    aiter.logger.info("moe_2stage: wrote %d csv-mode rows to %s", len(csv_df), _csv_out)

From 6f899a4659e2257660f43537ffc95378af8d0a2e Mon Sep 17 00:00:00 2001
From: coderfeli <felix.li@amd.com>
Date: Sun, 24 May 2026 14:55:17 +0000
Subject: [PATCH 5/7] Apply Black formatting to MoE bench writer

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 op_tests/test_moe_2stage.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/op_tests/test_moe_2stage.py b/op_tests/test_moe_2stage.py
index fcf40f594b..af994a314e 100644
--- a/op_tests/test_moe_2stage.py
+++ b/op_tests/test_moe_2stage.py
@@ -835,9 +835,7 @@ def _write_bench_csv(rows):
         return
     csv_df = csv_df.drop(columns=["logits_diff"], errors="ignore")
     csv_df.to_csv(_csv_out, index=False)
-    aiter.logger.info(
-        "moe_2stage: wrote %d csv-mode rows to %s", len(csv_df), _csv_out
-    )
+    aiter.logger.info("moe_2stage: wrote %d csv-mode rows to %s", len(csv_df), _csv_out)
 
 
 df = []

From 2e8902d57ae1a442bab2033caed47a7b6563d2ca Mon Sep 17 00:00:00 2001
From: coderfeli <felix.li@amd.com>
Date: Mon, 25 May 2026 02:23:25 +0000
Subject: [PATCH 6/7] Improve tuned op benchmark coverage

Make the benchmark comparison resilient to missing pinned baselines and schema drift, and include FlyDSL GEMM CSV cases in the shared tuned-op artifact.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .github/workflows/aiter-test.yaml |  42 +++++++++---
 op_tests/test_gemm_a8w8.py        |  43 ++++++++----
 op_tests/test_moe_2stage.py       |  25 +++----
 op_tests/tuned_op_bench_utils.py  | 105 ++++++++++++++++++++++++++++++
 scripts/compare_benchmark.py      |  51 +++++++++++----
 5 files changed, 222 insertions(+), 44 deletions(-)
 create mode 100644 op_tests/tuned_op_bench_utils.py

diff --git a/.github/workflows/aiter-test.yaml b/.github/workflows/aiter-test.yaml
index 03d5941640..5629535cd3 100644
--- a/.github/workflows/aiter-test.yaml
+++ b/.github/workflows/aiter-test.yaml
@@ -795,12 +795,37 @@ jobs:
               echo "found=false" >> "$GITHUB_OUTPUT"
               exit 0
           fi
-          if [[ ${#csv_files[@]} -gt 1 ]]; then
-              echo "::warning::Multiple tuned op benchmark CSVs found, using first: ${csv_files[0]}"
-              printf '  %s\n' "${csv_files[@]}"
-          fi
-          cp "${csv_files[0]}" /tmp/current.csv
-          echo "Current CSV: ${csv_files[0]} ($(wc -l < /tmp/current.csv) lines)"
+          echo "Merging tuned op benchmark CSVs:"
+          printf '  %s\n' "${csv_files[@]}"
+          python3 - "${csv_files[@]}" <<'PY'
+          import csv
+          import sys
+
+          rows = []
+          fieldnames = []
+          seen = set()
+          for path in sys.argv[1:]:
+              with open(path, newline="") as f:
+                  reader = csv.DictReader(f)
+                  if not reader.fieldnames:
+                      continue
+                  for name in reader.fieldnames:
+                      if name not in seen:
+                          fieldnames.append(name)
+                          seen.add(name)
+                  rows.extend(reader)
+
+          if "us" not in seen:
+              raise SystemExit("merged tuned op benchmark CSV is missing required `us` column")
+
+          with open("/tmp/current.csv", "w", newline="") as f:
+              writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
+              writer.writeheader()
+              for row in rows:
+                  writer.writerow({name: row.get(name, "") for name in fieldnames})
+          print(f"Merged {len(rows)} row(s) into /tmp/current.csv")
+          PY
+          echo "Current merged CSV: /tmp/current.csv ($(wc -l < /tmp/current.csv) lines)"
           echo "found=true" >> "$GITHUB_OUTPUT"
 
       # ── PR path: compare vs baseline ──
@@ -819,8 +844,7 @@ jobs:
       - name: Fallback — fetch baseline from latest main
         if: >-
           steps.current.outputs.found == 'true' &&
-          github.event_name == 'pull_request' &&
-          steps.baseline_pinned.outcome != 'success'
+          github.event_name == 'pull_request'
         id: baseline_main
         continue-on-error: true
         uses: dawidd6/action-download-artifact@v3
@@ -838,7 +862,7 @@ jobs:
           BASE_SHA: ${{ github.event.pull_request.base.sha }}
           CURR_SHA: ${{ github.event.pull_request.head.sha }}
         run: |
-          set -e
+          set -euo pipefail
           baseline_csv=""
           if [[ -f /tmp/baseline_pinned/tuned_op_bench.csv ]]; then
               baseline_csv=/tmp/baseline_pinned/tuned_op_bench.csv
diff --git a/op_tests/test_gemm_a8w8.py b/op_tests/test_gemm_a8w8.py
index f89e2c6701..55c58adfe7 100755
--- a/op_tests/test_gemm_a8w8.py
+++ b/op_tests/test_gemm_a8w8.py
@@ -12,6 +12,7 @@
 from aiter.test_common import checkAllclose, perftest, benchmark
 from aiter import hipb_mm, hipb_create_extension
 from aiter.jit.utils.chip_info import get_gfx_runtime as get_gfx, get_cu_num
+from op_tests.tuned_op_bench_utils import append_tuned_op_bench_rows
 import pandas as pd
 import argparse
 from functools import lru_cache
@@ -409,7 +410,7 @@ def test_skinny_gemm_a8w8_pertoken_quant():
 
 
 def _iter_flydsl_csv_cases():
-    """Yield test_gemm kwargs for every flydsl row in the merged bpreshuffle tuned CSV."""
+    """Yield (test_gemm kwargs, bench metadata) for flydsl tuned CSV rows."""
     gfx, cu = get_gfx(), get_cu_num()
     merged_csv = AITER_CONFIGS.AITER_CONFIG_GEMM_A8W8_BPRESHUFFLE_FILE
     df = pd.read_csv(merged_csv)
@@ -423,14 +424,21 @@ def _iter_flydsl_csv_cases():
     )
     for _, row in rows.iterrows():
         q_dtype = dtypes.fp8 if "float8" in str(row["q_dtype_w"]) else dtypes.i8
-        yield dict(
-            dtype=dtypes.bf16,
-            m=int(row["M"]),
-            n=int(row["N"]),
-            k=int(row["K"]),
-            quantDtype=q_dtype,
-            pad_a=128,
-            skip_ck=True,
+        yield (
+            dict(
+                dtype=dtypes.bf16,
+                m=int(row["M"]),
+                n=int(row["N"]),
+                k=int(row["K"]),
+                quantDtype=q_dtype,
+                pad_a=128,
+                skip_ck=True,
+            ),
+            {
+                "source": "flydsl_csv",
+                "libtype": str(row.get("libtype", "")),
+                "kernelName1": str(row.get("kernelName", "")),
+            },
         )
 
 
@@ -558,8 +566,21 @@ def _iter_flydsl_csv_cases():
 args = parser.parse_args()
 
 if not args.no_flydsl_csv:
-    for kwargs in _iter_flydsl_csv_cases():
-        test_gemm(**kwargs)
+    bench_csv = os.environ.get("AITER_TUNED_OP_BENCH_CSV", "tuned_op_bench.csv")
+    for kwargs, extras in _iter_flydsl_csv_cases():
+        ret = test_gemm(**kwargs)
+        ret.update(extras)
+        written = append_tuned_op_bench_rows(
+            bench_csv,
+            [ret],
+            op_name="gemm_a8w8",
+        )
+        if written:
+            aiter.logger.info(
+                "gemm_a8w8: appended %d tuned op bench row(s) to %s",
+                written,
+                bench_csv,
+            )
 
 if not args.no_legacy:
     if args.csv is not None:
diff --git a/op_tests/test_moe_2stage.py b/op_tests/test_moe_2stage.py
index af994a314e..e56e32e48d 100644
--- a/op_tests/test_moe_2stage.py
+++ b/op_tests/test_moe_2stage.py
@@ -30,6 +30,7 @@
 from aiter.aot.flydsl.common import fail_on_aot_cache_miss
 from aiter.ops.flydsl.moe_common import GateMode
 import aiter.ops.flydsl.moe_kernels as _aiter_mk
+from op_tests.tuned_op_bench_utils import append_tuned_op_bench_rows
 
 
 from aiter.ops.shuffle import (
@@ -819,23 +820,25 @@ def _kw(
 case_iter = itertools.chain(*_case_iters)
 
 _csv_out = os.environ.get("AITER_TUNED_OP_BENCH_CSV", "tuned_op_bench.csv")
-if _csv_out and os.path.exists(_csv_out):
-    os.remove(_csv_out)
 
 
 def _write_bench_csv(rows):
     if not _csv_out or len(rows) == 0:
         return
-    csv_df = pd.DataFrame(rows)
-    if "model" in csv_df.columns:
-        csv_df = csv_df[csv_df["model"] != "legacy"].copy()
-    else:
-        csv_df = csv_df.copy()
-    if len(csv_df) == 0:
+    row = rows[-1]
+    if row.get("model") == "legacy":
         return
-    csv_df = csv_df.drop(columns=["logits_diff"], errors="ignore")
-    csv_df.to_csv(_csv_out, index=False)
-    aiter.logger.info("moe_2stage: wrote %d csv-mode rows to %s", len(csv_df), _csv_out)
+    written = append_tuned_op_bench_rows(
+        _csv_out,
+        [row],
+        op_name="moe_2stage",
+        metric_cols=("us",),
+        default_impl="fused_moe",
+    )
+    if written:
+        aiter.logger.info(
+            "moe_2stage: appended %d tuned op bench row(s) to %s", written, _csv_out
+        )
 
 
 df = []
diff --git a/op_tests/tuned_op_bench_utils.py b/op_tests/tuned_op_bench_utils.py
new file mode 100644
index 0000000000..8ee5404941
--- /dev/null
+++ b/op_tests/tuned_op_bench_utils.py
@@ -0,0 +1,130 @@
+# SPDX-License-Identifier: MIT
+# Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+"""Helpers for writing the shared tuned-op benchmark CSV."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Iterable, Mapping
+
+import pandas as pd
+
+# Columns named "<impl> us" hold one latency per implementation.
+_DERIVED_METRIC_SUFFIXES = (" us",)
+# Columns with these suffixes / names are never copied to the output.
+_DROP_SUFFIXES = (" err", " TFLOPS", " TB/s")
+_DROP_COLS = {"logits_diff"}
+# str.endswith accepts a tuple, so one call covers every excluded suffix.
+_NON_BASE_SUFFIXES = _DROP_SUFFIXES + _DERIVED_METRIC_SUFFIXES
+
+
+def _is_missing(value) -> bool:
+    """Return True if *value* is None or pandas reports it as NA."""
+    if value is None:
+        return True
+    try:
+        return bool(pd.isna(value))
+    except (TypeError, ValueError):
+        # bool() of a non-scalar isna result is ambiguous; count it as present.
+        return False
+
+
+def _cell(value):
+    """Render *value* as a CSV cell string; missing values become ""."""
+    if _is_missing(value):
+        return ""
+    return str(value)
+
+
+def _metric_columns(row: Mapping[str, object], metric_cols: Iterable[str] | None):
+    """Return the metric columns of *row* to emit.
+
+    Explicit *metric_cols* are filtered to those present in *row*. Otherwise a
+    plain ``us`` column wins, falling back to every ``"<impl> us"`` column.
+    """
+    if metric_cols is not None:
+        return [col for col in metric_cols if col in row]
+    if "us" in row:
+        return ["us"]
+    return [col for col in row if col.endswith(_DERIVED_METRIC_SUFFIXES)]
+
+
+def _impl_from_metric(metric_col: str, default_impl: str) -> str:
+    """Map a metric column name to the value of the ``impl`` column."""
+    if metric_col == "us":
+        return default_impl
+    for suffix in _DERIVED_METRIC_SUFFIXES:
+        if metric_col.endswith(suffix):
+            return metric_col[: -len(suffix)]
+    return metric_col
+
+
+def _base_columns(row: Mapping[str, object], metric_cols: set[str]):
+    """Return the non-metric, non-dropped columns of *row* as cell strings."""
+    return {
+        col: _cell(value)
+        for col, value in row.items()
+        if col not in metric_cols
+        and col not in _DROP_COLS
+        and not col.endswith(_NON_BASE_SUFFIXES)
+    }
+
+
+def append_tuned_op_bench_rows(
+    csv_path: str | Path,
+    rows: Iterable[Mapping[str, object]],
+    *,
+    op_name: str,
+    metric_cols: Iterable[str] | None = None,
+    default_impl: str = "",
+) -> int:
+    """Append benchmark rows to the shared tuned-op CI CSV.
+
+    Input rows are usually wide benchmark dictionaries. This writes a stable
+    long-table schema with one `us` metric per row so different operator tests
+    can share the same artifact.
+
+    Args:
+        csv_path: Output CSV path. A falsy value (e.g. ``""``) disables writing.
+        rows: Benchmark result mappings, one per measured case.
+        op_name: Stored in the ``op`` column of every output row.
+        metric_cols: Metric columns to emit; ``None`` auto-detects them.
+        default_impl: ``impl`` value for a plain ``us`` metric (omitted if empty).
+
+    Returns:
+        The number of rows appended; 0 when nothing was written.
+    """
+    if not csv_path:
+        return 0
+    if metric_cols is not None:
+        # Materialize once so a one-shot iterator survives past the first row.
+        metric_cols = tuple(metric_cols)
+
+    output_rows = []
+    for row in rows:
+        row_metric_cols = _metric_columns(row, metric_cols)
+        base = _base_columns(row, set(row_metric_cols))
+        base["op"] = op_name
+        for metric_col in row_metric_cols:
+            value = row.get(metric_col)
+            if _is_missing(value):
+                continue
+            out = dict(base)
+            impl = _impl_from_metric(metric_col, default_impl)
+            if impl:
+                out["impl"] = impl
+            out["us"] = _cell(value)
+            output_rows.append(out)
+
+    if not output_rows:
+        return 0
+
+    csv_path = Path(csv_path)
+    # Fill gaps before casting: astype(str) first would turn NaN into "nan".
+    new_df = pd.DataFrame(output_rows).fillna("").astype(str)
+    if csv_path.exists() and csv_path.stat().st_size > 0:
+        # Rewrite the whole file so the header is the union of old and new columns.
+        old_df = pd.read_csv(csv_path, dtype=str).fillna("")
+        new_df = pd.concat([old_df, new_df], ignore_index=True)
+    new_df.to_csv(csv_path, index=False)
+    return len(output_rows)
diff --git a/scripts/compare_benchmark.py b/scripts/compare_benchmark.py
index fd982b652b..b51cf25db8 100644
--- a/scripts/compare_benchmark.py
+++ b/scripts/compare_benchmark.py
@@ -113,9 +113,17 @@ def _natural_key(val: str):
 Key = Tuple[Tuple[str, str], ...]
 
 
-def _read_csv(path: Path) -> Tuple[Dict[Key, Row], Tuple[str, ...]]:
-    """Return ({key: row}, key_col_order). Whitespace stripped from values."""
-    rows: Dict[Key, Row] = {}
+def _normalize_row(raw: Row) -> Row:
+    """Strip whitespace and coerce missing CSV fields to empty strings."""
+    return {
+        k: ("" if v is None else v.strip() if isinstance(v, str) else str(v))
+        for k, v in raw.items()
+        if k is not None
+    }
+
+
+def _read_csv_rows(path: Path) -> Tuple[list[Row], Tuple[str, ...]]:
+    """Return (rows, fieldnames). Whitespace stripped from values."""
     if not path.exists():
         raise SystemExit(f"input csv not found: {path}")
     with path.open(newline="") as f:
@@ -125,14 +133,28 @@ def _read_csv(path: Path) -> Tuple[Dict[Key, Row], Tuple[str, ...]]:
                 f"{path} missing required column `{METRIC}`; "
                 f"got columns: {reader.fieldnames}"
             )
-        key_cols = tuple(c for c in reader.fieldnames if c not in NON_KEY)
-        for raw in reader:
-            # Strip whitespace from every value to avoid silent join misses
-            # caused by trailing/leading spaces.
-            raw = {k: (v.strip() if isinstance(v, str) else v) for k, v in raw.items()}
-            key = tuple(sorted((c, raw.get(c, "")) for c in key_cols))
-            rows[key] = raw
-    return rows, key_cols
+        rows = [_normalize_row(raw) for raw in reader]
+    return rows, tuple(reader.fieldnames)
+
+
+def _key_cols(base_cols: Tuple[str, ...], cur_cols: Tuple[str, ...]) -> Tuple[str, ...]:
+    """Stable key column order across baseline/current schema drift."""
+    cols = []
+    seen = set()
+    for col in (*base_cols, *cur_cols):
+        if col in NON_KEY or col in seen:
+            continue
+        cols.append(col)
+        seen.add(col)
+    return tuple(cols)
+
+
+def _index_rows(rows: list[Row], key_cols: Tuple[str, ...]) -> Dict[Key, Row]:
+    indexed: Dict[Key, Row] = {}
+    for row in rows:
+        key = tuple(sorted((c, row.get(c, "")) for c in key_cols))
+        indexed[key] = row
+    return indexed
 
 
 def _parse_us(raw: Row) -> float | None:
@@ -212,8 +234,11 @@ def main() -> int:
     if args.warn >= args.fail:
         raise SystemExit(f"--warn ({args.warn}) must be < --fail ({args.fail})")
 
-    baseline, _ = _read_csv(args.baseline_csv)
-    current, key_cols = _read_csv(args.current_csv)
+    baseline_rows, baseline_cols = _read_csv_rows(args.baseline_csv)
+    current_rows, current_cols = _read_csv_rows(args.current_csv)
+    key_cols = _key_cols(baseline_cols, current_cols)
+    baseline = _index_rows(baseline_rows, key_cols)
+    current = _index_rows(current_rows, key_cols)
 
     print(f"=== Tuned op bench: {args.current_label} vs {args.baseline_label} ===")
     print(f"  baseline: {args.baseline_csv}  ({len(baseline)} rows)")

From 166181d063ec1bff6175ae3cfcbdbbae6c2d3542 Mon Sep 17 00:00:00 2001
From: coderfeli <felix.li@amd.com>
Date: Mon, 25 May 2026 06:45:54 +0000
Subject: [PATCH 7/7] Address tuned op bench review comments

Move the comparison helper under CI scripts and make benchmark helper imports work both when op tests run as scripts and as package modules.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .github/scripts/check_tuned_op_regression.sh      | 4 ++--
 {scripts => .github/scripts}/compare_benchmark.py | 3 ---
 op_tests/test_gemm_a8w8.py                        | 8 +++++++-
 op_tests/test_moe_2stage.py                       | 8 +++++++-
 4 files changed, 16 insertions(+), 7 deletions(-)
 rename {scripts => .github/scripts}/compare_benchmark.py (99%)

diff --git a/.github/scripts/check_tuned_op_regression.sh b/.github/scripts/check_tuned_op_regression.sh
index 2a48507c7d..a5f694ca15 100755
--- a/.github/scripts/check_tuned_op_regression.sh
+++ b/.github/scripts/check_tuned_op_regression.sh
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
 #
-# Wrap scripts/compare_benchmark.py for the tuned_op_bench CI job.
+# Wrap .github/scripts/compare_benchmark.py for the tuned_op_bench CI job.
 # Prints a comparison table to stdout (captured by the job step into
 # $GITHUB_STEP_SUMMARY). Exits 0 unless --fail-on-regress is passed and
 # at least one REGRESS row is found.
@@ -27,7 +27,7 @@ fi
 
 REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
 
-python3 "${REPO_ROOT}/scripts/compare_benchmark.py" \
+python3 "${REPO_ROOT}/.github/scripts/compare_benchmark.py" \
     "$BASE" "$CURR" \
     --baseline-label "$BASE_LABEL" \
     --current-label "$CURR_LABEL" \
diff --git a/scripts/compare_benchmark.py b/.github/scripts/compare_benchmark.py
similarity index 99%
rename from scripts/compare_benchmark.py
rename to .github/scripts/compare_benchmark.py
index b51cf25db8..8cf4229549 100644
--- a/scripts/compare_benchmark.py
+++ b/.github/scripts/compare_benchmark.py
@@ -295,9 +295,6 @@ def _entry_sort_key(e):
     METRIC_HDRS = ("ratio", "cur(us)", "base(us)")
 
     # Build display_cols: walk key_cols, drop hidden, splice derived in place
-    _derived_sources = {
-        src for sources in DERIVED_TUPLE_COLS.values() for src in sources
-    }
     _derived_first_src = {
         sources[0]: name for name, sources in DERIVED_TUPLE_COLS.items()
     }
diff --git a/op_tests/test_gemm_a8w8.py b/op_tests/test_gemm_a8w8.py
index 55c58adfe7..2fb13b8db9 100755
--- a/op_tests/test_gemm_a8w8.py
+++ b/op_tests/test_gemm_a8w8.py
@@ -12,7 +12,13 @@
 from aiter.test_common import checkAllclose, perftest, benchmark
 from aiter import hipb_mm, hipb_create_extension
 from aiter.jit.utils.chip_info import get_gfx_runtime as get_gfx, get_cu_num
-from op_tests.tuned_op_bench_utils import append_tuned_op_bench_rows
+
+try:
+    from tuned_op_bench_utils import append_tuned_op_bench_rows
+except ModuleNotFoundError as e:
+    if e.name != "tuned_op_bench_utils":
+        raise
+    from op_tests.tuned_op_bench_utils import append_tuned_op_bench_rows
 import pandas as pd
 import argparse
 from functools import lru_cache
diff --git a/op_tests/test_moe_2stage.py b/op_tests/test_moe_2stage.py
index e56e32e48d..b249391791 100644
--- a/op_tests/test_moe_2stage.py
+++ b/op_tests/test_moe_2stage.py
@@ -30,7 +30,13 @@
 from aiter.aot.flydsl.common import fail_on_aot_cache_miss
 from aiter.ops.flydsl.moe_common import GateMode
 import aiter.ops.flydsl.moe_kernels as _aiter_mk
-from op_tests.tuned_op_bench_utils import append_tuned_op_bench_rows
+
+try:
+    from tuned_op_bench_utils import append_tuned_op_bench_rows
+except ModuleNotFoundError as e:
+    if e.name != "tuned_op_bench_utils":
+        raise
+    from op_tests.tuned_op_bench_utils import append_tuned_op_bench_rows
 
 
 from aiter.ops.shuffle import (