From 4535fc3fd15d8223adb3ffc0eb02adc017322d66 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 19 Feb 2026 14:59:43 -0500 Subject: [PATCH 01/15] Add test sharding, proactive clean, and retry logic for self-hosted CI - Shard Frontier GPU tests into 2 parts for faster parallel execution - Add proactive ./mfc.sh clean in Phoenix test scripts to prevent cross-compiler contamination from stale build artifacts - Add --requeue to Phoenix SLURM jobs for preemption recovery - Add lint-gate job that must pass before self-hosted tests run - Add retry logic for GitHub runner tests (retry <=5 failures) - Add Frontier AMD test support with dedicated submit/test scripts - Restructure self-hosted matrix with explicit cluster names Co-Authored-By: Claude Opus 4.6 --- .github/workflows/frontier/submit.sh | 8 +-- .github/workflows/frontier/test.sh | 7 ++- .github/workflows/frontier_amd/submit.sh | 8 +-- .github/workflows/frontier_amd/test.sh | 7 ++- .github/workflows/phoenix/submit.sh | 1 + .github/workflows/phoenix/test.sh | 4 ++ .github/workflows/test.yml | 67 ++++++++++++++++++++---- 7 files changed, 85 insertions(+), 17 deletions(-) diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh index d5b416c65a..4c3e0e3e27 100644 --- a/.github/workflows/frontier/submit.sh +++ b/.github/workflows/frontier/submit.sh @@ -34,12 +34,13 @@ output_file="$job_slug.out" submit_output=$(sbatch < Date: Thu, 19 Feb 2026 16:58:06 -0500 Subject: [PATCH 02/15] Add --shard and failed_uuids.txt support to test toolchain The CI test scripts use --shard for splitting Frontier GPU tests across multiple jobs, and failed_uuids.txt for retry logic. These toolchain changes were missing from the cherry-pick. Co-Authored-By: Claude Opus 4.6 --- toolchain/mfc/cli/commands.py | 6 ++++++ toolchain/mfc/test/test.py | 17 +++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/toolchain/mfc/cli/commands.py b/toolchain/mfc/cli/commands.py index 8ad8c4bd07..018e3cef83 100644 --- a/toolchain/mfc/cli/commands.py +++ b/toolchain/mfc/cli/commands.py @@ -452,6 +452,12 @@ default=False, dest="dry_run", ), + Argument( + name="shard", + help="Run only a subset of tests (e.g., '1/2' for first half, '2/2' for second half).", + type=str, + default=None, + ), ], mutually_exclusive=[ MutuallyExclusiveGroup(arguments=[ diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 31a3771cb9..54e00186dd 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -99,6 +99,14 @@ def __filter(cases_) -> typing.List[TestCase]: skipped_cases += example_cases cases = [case for case in cases if case not in example_cases] + if ARG("shard") is not None: + parts = ARG("shard").split("/") + if len(parts) != 2 or not all(p.isdigit() for p in parts) or int(parts[1]) < 1 or not 1 <= int(parts[0]) <= int(parts[1]): + raise MFCException(f"Invalid --shard '{ARG('shard')}': expected 'i/n' with 1 <= i <= n (e.g., '1/2').") + shard_idx, shard_count = int(parts[0]), int(parts[1]) + skipped_cases += [c for i, c in enumerate(cases) if i % shard_count != shard_idx - 1] + cases = [c for i, c in enumerate(cases) if i % shard_count == shard_idx - 1] + if ARG("percent") == 100: return cases, skipped_cases @@ -206,6 +214,15 @@ def test(): # Build the summary report _print_test_summary(nPASS, nFAIL, nSKIP, minutes, seconds, failed_tests, skipped_cases) + # Write failed UUIDs to file for CI retry logic + failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt") + if failed_tests: + with open(failed_uuids_path, "w") as f: + for test_info in failed_tests: + f.write(test_info['uuid'] + "\n") + elif os.path.exists(failed_uuids_path): + os.remove(failed_uuids_path) + exit(nFAIL) From 9de84d63c18f81151fb4a3abad3a341c38c49b9a Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 20 Feb 2026 14:43:00 -0500 Subject: [PATCH 03/15] Fix stale failed_uuids.txt on abort, guard empty retry, quote nproc - Clean up failed_uuids.txt on early abort path so CI doesn't retry stale UUIDs from a previous run - Guard retry condition with NUM_FAILED > 0 to prevent full-suite rerun when the file exists but is empty - Quote $(nproc) to silence shellcheck SC2046 warnings Co-Authored-By: Claude Opus 4.6 --- .github/workflows/test.yml | 10 +++++----- toolchain/mfc/test/test.py | 5 +++++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2d88007613..32a0410e75 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -28,7 +28,7 @@ jobs: - name: Check Formatting run: | - ./mfc.sh format -j $(nproc) + ./mfc.sh format -j "$(nproc)" git diff --exit-code || (echo "::error::Code is not formatted. Run './mfc.sh format' locally." && exit 1) - name: Spell Check @@ -138,7 +138,7 @@ jobs: - name: Build run: | - /bin/bash mfc.sh test -v --dry-run -j $(nproc) --${{ matrix.debug }} --${{ matrix.mpi }} --${{ matrix.precision }} $TEST_ALL + /bin/bash mfc.sh test -v --dry-run -j "$(nproc)" --${{ matrix.debug }} --${{ matrix.mpi }} --${{ matrix.precision }} $TEST_ALL env: TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }} @@ -146,17 +146,17 @@ jobs: run: | rm -f tests/failed_uuids.txt TEST_EXIT=0 - /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || TEST_EXIT=$? + /bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" $TEST_ALL $TEST_PCT || TEST_EXIT=$? # Retry only if a small number of tests failed (sporadic failures) if [ -f tests/failed_uuids.txt ]; then NUM_FAILED=$(wc -l < tests/failed_uuids.txt) - if [ "$NUM_FAILED" -le 5 ]; then + if [ "$NUM_FAILED" -gt 0 ] && [ "$NUM_FAILED" -le 5 ]; then FAILED=$(cat tests/failed_uuids.txt | tr '\n' ' ') echo "" echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ===" echo "" - /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) --only $FAILED $TEST_ALL || exit $? + /bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" --only $FAILED $TEST_ALL || exit $? else echo "Too many failures ($NUM_FAILED) to retry — likely a real issue." exit 1 diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 54e00186dd..681f59f6ae 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -190,6 +190,11 @@ def test(): # Check if we aborted due to high failure rate if abort_tests.is_set(): + # Clean up stale failed_uuids.txt so CI doesn't retry wrong tests + failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt") + if os.path.exists(failed_uuids_path): + os.remove(failed_uuids_path) + total_completed = nFAIL + nPASS cons.print() cons.unindent() From 92deb41f63d26ef68ead305d63b2f51146da0aae Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 20 Feb 2026 15:36:28 -0500 Subject: [PATCH 04/15] Remove proactive clean from Phoenix test script The build system should handle compiler changes correctly. Proactive clean forces full rebuilds of FFTW/LAPACK from scratch every run, which is slow and exposes builds to transient filesystem failures (CMake TryCompile errors on Phoenix scratch). Co-Authored-By: Claude Opus 4.6 --- .github/workflows/phoenix/test.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh index e6912f70b6..74c31c9fba 100644 --- a/.github/workflows/phoenix/test.sh +++ b/.github/workflows/phoenix/test.sh @@ -1,9 +1,5 @@ #!/bin/bash -# Clean stale build artifacts from previous CI runs to prevent -# cross-compiler contamination (e.g. gfortran LAPACK linked by NVHPC) -./mfc.sh clean - build_opts="" if [ "$job_device" = "gpu" ]; then build_opts="--gpu" From 877127b68131b7bb2ac6f756f35ac20acd04338b Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 20 Feb 2026 22:20:18 -0500 Subject: [PATCH 05/15] Skip benchmark workflow for bot review events Bot reviews (AI code reviewers) were triggering the benchmark workflow, and the concurrency group was cancelling the real benchmark run from the pull_request event. Gate the workflow early by skipping when the review author is a Bot account type. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/bench.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 56735da9c1..b45fc45e40 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -13,6 +13,9 @@ concurrency: jobs: file-changes: name: Detect File Changes + if: > + github.event_name != 'pull_request_review' || + github.event.review.user.type != 'Bot' runs-on: 'ubuntu-latest' outputs: checkall: ${{ steps.changes.outputs.checkall }} From eaab95ae87012ab7be3834b3e0724841a33d6e1c Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sat, 21 Feb 2026 00:16:32 -0500 Subject: [PATCH 06/15] Fix CI edge cases: guard os.remove, skip bare -- flag, use -s for empty file check - Wrap os.remove() in try/except OSError on abort path so permission errors don't mask the real MFCException - Only pass --precision flag when matrix.precision is non-empty to avoid invalid bare -- argument - Use -s instead of -f for failed_uuids.txt to skip retry when file exists but is empty Co-Authored-By: Claude Opus 4.6 --- .github/workflows/test.yml | 4 ++-- toolchain/mfc/test/test.py | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 32a0410e75..e90bd1d300 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -138,7 +138,7 @@ jobs: - name: Build run: | - /bin/bash mfc.sh test -v --dry-run -j "$(nproc)" --${{ matrix.debug }} --${{ matrix.mpi }} --${{ matrix.precision }} $TEST_ALL + /bin/bash mfc.sh test -v --dry-run -j "$(nproc)" --${{ matrix.debug }} --${{ matrix.mpi }} ${{ matrix.precision != '' && format('--{0}', matrix.precision) || '' }} $TEST_ALL env: TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }} @@ -149,7 +149,7 @@ jobs: /bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" $TEST_ALL $TEST_PCT || TEST_EXIT=$? # Retry only if a small number of tests failed (sporadic failures) - if [ -f tests/failed_uuids.txt ]; then + if [ -s tests/failed_uuids.txt ]; then NUM_FAILED=$(wc -l < tests/failed_uuids.txt) if [ "$NUM_FAILED" -gt 0 ] && [ "$NUM_FAILED" -le 5 ]; then FAILED=$(cat tests/failed_uuids.txt | tr '\n' ' ') diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 681f59f6ae..26be08fb8a 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -192,8 +192,11 @@ def test(): if abort_tests.is_set(): # Clean up stale failed_uuids.txt so CI doesn't retry wrong tests failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt") - if os.path.exists(failed_uuids_path): - os.remove(failed_uuids_path) + try: + if os.path.exists(failed_uuids_path): + os.remove(failed_uuids_path) + except OSError: + pass total_completed = nFAIL + nPASS cons.print() From aa26048e4233123d9b72023f5961912a1538e422 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 23 Feb 2026 09:28:33 -0500 Subject: [PATCH 07/15] Fix --only filter silently matching zero tests with multiple UUIDs The subset check required ALL passed UUIDs to match a single test case's trace, which is impossible since each case has one UUID. With 2+ failed tests, the CI retry selected 0 tests and exited 0, silently masking real failures. Changed to intersection so each case is kept if ANY of the passed UUIDs matches. Co-Authored-By: Claude Opus 4.6 --- toolchain/mfc/test/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 26be08fb8a..9fb0bd8eaf 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -71,7 +71,7 @@ def __filter(cases_) -> typing.List[TestCase]: checkCase = case.trace.split(" -> ") checkCase.append(case.get_uuid()) - if not set(ARG("only")).issubset(set(checkCase)): + if not set(ARG("only")).intersection(set(checkCase)): cases.remove(case) skipped_cases.append(case) From b5c095fb2b7c7db4d788b0bfafea2a4a65f93681 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Wed, 25 Feb 2026 21:23:43 -0500 Subject: [PATCH 08/15] Remove redundant NUM_FAILED > 0 guard in test retry logic The -s check already guarantees the file is non-empty, so NUM_FAILED > 0 is always true in that branch. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e90bd1d300..aa3c29d6ff 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -151,7 +151,7 @@ jobs: # Retry only if a small number of tests failed (sporadic failures) if [ -s tests/failed_uuids.txt ]; then NUM_FAILED=$(wc -l < tests/failed_uuids.txt) - if [ "$NUM_FAILED" -gt 0 ] && [ "$NUM_FAILED" -le 5 ]; then + if [ "$NUM_FAILED" -le 5 ]; then FAILED=$(cat tests/failed_uuids.txt | tr '\n' ' ') echo "" echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ===" From 6b43b9b9e6ce2e5df9dfbf252d05dba8a668b1be Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Wed, 25 Feb 2026 21:40:15 -0500 Subject: [PATCH 09/15] Address review findings: shard slug collision, script consolidation, zero-match guard - Include shard in SLURM job_slug to prevent output file collisions between parallel shards (e.g., test-gpu-acc-1-of-2.out) - Consolidate frontier/ and frontier_amd/ submit.sh and test.sh into identical scripts that derive compiler flag and config from directory - Add $shard_opts to CPU test branch for future-proofing - Add zero-match guard for --only filter to fail loudly instead of silently exiting 0 when no tests match - Hoist failed_uuids_path to single definition at top of test() - Compute log slug dynamically in test.yml for shard-aware filenames - Remove unnecessary shard: '' from non-sharded matrix entries - Replace useless cat|tr pipeline with tr < file Co-Authored-By: Claude Opus 4.6 --- .github/workflows/frontier/submit.sh | 22 +++++++++++++++++----- .github/workflows/frontier/test.sh | 8 ++++++-- .github/workflows/frontier_amd/submit.sh | 22 +++++++++++++++++----- .github/workflows/frontier_amd/test.sh | 8 ++++++-- .github/workflows/test.yml | 24 +++++++++++++++--------- toolchain/mfc/test/test.py | 11 ++++++++--- 6 files changed, 69 insertions(+), 26 deletions(-) diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh index 4c3e0e3e27..ef0289696c 100644 --- a/.github/workflows/frontier/submit.sh +++ b/.github/workflows/frontier/submit.sh @@ -5,8 +5,17 @@ set -e # Ignore SIGHUP to survive login node session drops trap '' HUP +# Determine compiler flag from directory name +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cluster_name="$(basename "$SCRIPT_DIR")" +case "$cluster_name" in + frontier) compiler_flag="f" ;; + frontier_amd) compiler_flag="famd" ;; + *) echo "ERROR: Unknown cluster '$cluster_name'"; exit 1 ;; +esac + usage() { - echo "Usage: $0 [script.sh] [cpu|gpu]" + echo "Usage: $0 [script.sh] [cpu|gpu] [none|acc|omp] [shard]" } if [ ! -z "$1" ]; then @@ -27,8 +36,11 @@ else exit 1 fi - -job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3" +shard_suffix="" +if [ -n "$4" ]; then + shard_suffix="-$(echo "$4" | sed 's|/|-of-|')" +fi +job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3${shard_suffix}" output_file="$job_slug.out" submit_output=$(sbatch <> "$GITHUB_OUTPUT" + - name: Print Logs if: always() - run: cat test-${{ matrix.device }}-${{ matrix.interface }}.out + run: cat ${{ steps.log.outputs.slug }}.out - name: Archive Logs uses: actions/upload-artifact@v4 if: matrix.cluster != 'phoenix' with: - name: logs-${{ strategy.job-index }}-${{ matrix.device }}-${{ matrix.interface }} - path: test-${{ matrix.device }}-${{ matrix.interface }}.out + name: logs-${{ strategy.job-index }}-${{ steps.log.outputs.slug }} + path: ${{ steps.log.outputs.slug }}.out diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 9fb0bd8eaf..73dd8f3414 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -42,7 +42,7 @@ class TestTimeoutError(MFCException): pass -# pylint: disable=too-many-branches, trailing-whitespace +# pylint: disable=too-many-branches, too-many-statements, trailing-whitespace def __filter(cases_) -> typing.List[TestCase]: cases = cases_[:] selected_cases = [] @@ -75,6 +75,12 @@ def __filter(cases_) -> typing.List[TestCase]: cases.remove(case) skipped_cases.append(case) + if not cases: + raise MFCException( + f"--only filter matched zero test cases. " + f"Specified: {ARG('only')}. Check that UUIDs/names are valid." + ) + for case in cases[:]: if case.ppn > 1 and not ARG("mpi"): cases.remove(case) @@ -123,6 +129,7 @@ def test(): global errors, failed_tests, test_start_time test_start_time = time.time() # Start timing + failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt") cases = list_cases() # Delete UUIDs that are not in the list of cases from tests/ @@ -191,7 +198,6 @@ def test(): # Check if we aborted due to high failure rate if abort_tests.is_set(): # Clean up stale failed_uuids.txt so CI doesn't retry wrong tests - failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt") try: if os.path.exists(failed_uuids_path): os.remove(failed_uuids_path) @@ -223,7 +229,6 @@ def test(): _print_test_summary(nPASS, nFAIL, nSKIP, minutes, seconds, failed_tests, skipped_cases) # Write failed UUIDs to file for CI retry logic - failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt") if failed_tests: with open(failed_uuids_path, "w") as f: for test_info in failed_tests: From adac6887452b7e8013ceea93ed13f939c326c73f Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Wed, 25 Feb 2026 22:24:49 -0500 Subject: [PATCH 10/15] Use AND logic for labels and OR logic for UUIDs in --only filter The --only filter now detects whether each term is a UUID (8-char hex) or a trace label and applies appropriate matching: - Labels: AND logic (--only 2D Bubbles matches tests with both) - UUIDs: OR logic (--only UUID1 UUID2 matches tests with either) - Mixed: keep case if all labels match OR any UUID matches This preserves the documented behavior for label filtering while correctly supporting the CI retry path that passes multiple UUIDs. Co-Authored-By: Claude Opus 4.6 --- toolchain/mfc/test/test.py | 43 +++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 73dd8f3414..def4305bdd 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -42,6 +42,40 @@ class TestTimeoutError(MFCException): pass +def _filter_only(cases, skipped_cases): + """Filter cases by --only terms using AND for labels, OR for UUIDs. + + Labels (non-UUID terms): case must match ALL labels (AND logic). + UUIDs (8-char hex terms): case must match ANY UUID (OR logic). + Mixed: keep case if all labels match OR any UUID matches. + """ + def is_uuid(term): + return len(term) == 8 and all(c in '0123456789abcdefABCDEF' for c in term) + + uuids = [t for t in ARG("only") if is_uuid(t)] + labels = [t for t in ARG("only") if not is_uuid(t)] + + for case in cases[:]: + check = set(case.trace.split(" -> ")) + check.add(case.get_uuid()) + + label_ok = all(l in check for l in labels) if labels else True + uuid_ok = any(u in check for u in uuids) if uuids else True + + if labels and uuids: + keep = label_ok or uuid_ok + elif labels: + keep = label_ok + else: + keep = uuid_ok + + if not keep: + cases.remove(case) + skipped_cases.append(case) + + return cases, skipped_cases + + # pylint: disable=too-many-branches, too-many-statements, trailing-whitespace def __filter(cases_) -> typing.List[TestCase]: cases = cases_[:] @@ -66,14 +100,7 @@ def __filter(cases_) -> typing.List[TestCase]: raise MFCException("Testing: Your specified range [--from,--to] is incorrect. Please ensure both IDs exist and are in the correct order.") if len(ARG("only")) > 0: - for case in cases[:]: - case: TestCase - - checkCase = case.trace.split(" -> ") - checkCase.append(case.get_uuid()) - if not set(ARG("only")).intersection(set(checkCase)): - cases.remove(case) - skipped_cases.append(case) + cases, skipped_cases = _filter_only(cases, skipped_cases) if not cases: raise MFCException( From 06c0641029342b84fea968b4dc86cc7b3b00a54e Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Wed, 25 Feb 2026 22:47:33 -0500 Subject: [PATCH 11/15] Consolidate CI submit scripts: merge submit-bench.sh into submit.sh submit.sh now auto-detects job type (bench vs test) from the submitted script's basename, selecting the appropriate SBATCH account, time limit, and partition. This eliminates three submit-bench.sh files and makes frontier/ and frontier_amd/ scripts byte-identical via directory-name detection for compiler flags and cluster-specific options. Co-Authored-By: Claude Opus 4.6 --- .github/scripts/submit_and_monitor_bench.sh | 51 ++++----------- .github/workflows/frontier/bench.sh | 4 +- .github/workflows/frontier/build.sh | 15 ++++- .github/workflows/frontier/submit-bench.sh | 54 --------------- .github/workflows/frontier/submit.sh | 28 ++++++-- .github/workflows/frontier_amd/bench.sh | 4 +- .github/workflows/frontier_amd/build.sh | 15 ++++- .../workflows/frontier_amd/submit-bench.sh | 54 --------------- .github/workflows/frontier_amd/submit.sh | 28 ++++++-- .github/workflows/phoenix/submit-bench.sh | 65 ------------------- .github/workflows/phoenix/submit.sh | 21 +++++- 11 files changed, 109 insertions(+), 230 deletions(-) delete mode 100644 .github/workflows/frontier/submit-bench.sh delete mode 100644 .github/workflows/frontier_amd/submit-bench.sh delete mode 100644 .github/workflows/phoenix/submit-bench.sh diff --git a/.github/scripts/submit_and_monitor_bench.sh b/.github/scripts/submit_and_monitor_bench.sh index 80790752d7..c081c8692a 100755 --- a/.github/scripts/submit_and_monitor_bench.sh +++ b/.github/scripts/submit_and_monitor_bench.sh @@ -14,50 +14,27 @@ device="$2" interface="$3" cluster="$4" -# Get the directory where this script lives -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - echo "[$dir] Submitting benchmark for $device-$interface on $cluster..." cd "$dir" -# Submit job -submit_output=$(bash .github/workflows/$cluster/submit-bench.sh \ - .github/workflows/$cluster/bench.sh "$device" "$interface" 2>&1) - -job_id=$(echo "$submit_output" | sed -n 's/.*Submitted batch job \([0-9][0-9]*\).*/\1/p') -job_slug="bench-$device-$interface" -output_file="${job_slug}.out" - -if [ -z "$job_id" ]; then - echo "[$dir] ERROR: Failed to submit job" - echo "$submit_output" - exit 1 -fi - -echo "[$dir] Job ID: $job_id, monitoring output file: $output_file" - -# Use the monitoring script from PR (where this script lives) -monitor_exit=0 -bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$? -if [ "$monitor_exit" -ne 0 ]; then - echo "[$dir] WARNING: SLURM job exited with code $monitor_exit" -else - echo "[$dir] Monitoring complete for job $job_id" -fi +# Submit and monitor job (submit.sh auto-detects bench mode from script name) +bash .github/workflows/$cluster/submit.sh \ + .github/workflows/$cluster/bench.sh "$device" "$interface" # Verify the YAML output file was created +job_slug="bench-$device-$interface" yaml_file="${job_slug}.yaml" if [ ! -f "$yaml_file" ]; then - echo "[$dir] ERROR: Expected output file not found: $yaml_file" - echo "[$dir] Directory contents:" - ls -la *.yaml 2>/dev/null || echo " No YAML files found" - echo "" - echo "[$dir] Last 100 lines of job output ($output_file):" - echo "----------------------------------------" - tail -n 100 "$output_file" 2>/dev/null || echo " Could not read output file" - echo "----------------------------------------" - exit 1 + echo "[$dir] ERROR: Expected output file not found: $yaml_file" + echo "[$dir] Directory contents:" + ls -la *.yaml 2>/dev/null || echo " No YAML files found" + echo "" + output_file="${job_slug}.out" + echo "[$dir] Last 100 lines of job output ($output_file):" + echo "----------------------------------------" + tail -n 100 "$output_file" 2>/dev/null || echo " Could not read output file" + echo "----------------------------------------" + exit 1 fi echo "[$dir] Verified output file exists: $yaml_file ($(stat -f%z "$yaml_file" 2>/dev/null || stat -c%s "$yaml_file" 2>/dev/null) bytes)" - diff --git a/.github/workflows/frontier/bench.sh b/.github/workflows/frontier/bench.sh index 35b4c5950e..a79f1a2fc5 100644 --- a/.github/workflows/frontier/bench.sh +++ b/.github/workflows/frontier/bench.sh @@ -16,7 +16,7 @@ if [ "$job_device" = "gpu" ]; then fi if [ "$job_device" = "gpu" ]; then - ./mfc.sh bench --mem 12 -j $n_ranks -o "$job_slug.yaml" -- -c frontier $device_opts -n $n_ranks + ./mfc.sh bench --mem 12 -j $n_ranks -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks else - ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c frontier $device_opts -n $n_ranks + ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks fi diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index ca09c2a116..84036641c6 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -3,6 +3,15 @@ # Ignore SIGHUP to survive login node session drops trap '' HUP +# Determine compiler flag from directory name +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cluster_name="$(basename "$SCRIPT_DIR")" +case "$cluster_name" in + frontier) compiler_flag="f" ;; + frontier_amd) compiler_flag="famd" ;; + *) echo "ERROR: Unknown cluster '$cluster_name'"; exit 1 ;; +esac + job_device=$1 job_interface=$2 run_bench=$3 @@ -16,11 +25,11 @@ if [ "$job_device" = "gpu" ]; then fi fi -. ./mfc.sh load -c f -m g +. ./mfc.sh load -c $compiler_flag -m g # Only set up build cache for test suite, not benchmarks if [ "$run_bench" != "bench" ]; then - source .github/scripts/setup-build-cache.sh frontier "$job_device" "$job_interface" + source .github/scripts/setup-build-cache.sh "$cluster_name" "$job_device" "$job_interface" fi max_attempts=3 @@ -37,7 +46,7 @@ while [ $attempt -le $max_attempts ]; do fi done else - if ./mfc.sh test -v -a --dry-run --rdma-mpi -j 8 $build_opts; then + if ./mfc.sh test -v -a --dry-run $([ "$cluster_name" = "frontier" ] && echo "--rdma-mpi") -j 8 $build_opts; then build_cmd_ok=true else build_cmd_ok=false diff --git a/.github/workflows/frontier/submit-bench.sh b/.github/workflows/frontier/submit-bench.sh deleted file mode 100644 index 81b9b274e6..0000000000 --- a/.github/workflows/frontier/submit-bench.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -set -e - -usage() { - echo "Usage: $0 [script.sh] [cpu|gpu]" -} - -if [ ! -z "$1" ]; then - sbatch_script_contents=`cat $1` -else - usage - exit 1 -fi - -if [ "$2" = "cpu" ]; then - sbatch_device_opts="\ -#SBATCH -n 32 # Number of cores required" -elif [ "$2" = "gpu" ]; then - sbatch_device_opts="\ -#SBATCH -n 8 # Number of cores required" -else - usage; exit 1 -fi - - -job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3" - -sbatch < Date: Thu, 26 Feb 2026 09:40:38 -0500 Subject: [PATCH 12/15] Use normal QOS instead of hackathon for Frontier test jobs Co-Authored-By: Claude Opus 4.6 --- .github/workflows/frontier/submit.sh | 2 +- .github/workflows/frontier_amd/submit.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh index 13018c595d..16d4f0d73c 100644 --- a/.github/workflows/frontier/submit.sh +++ b/.github/workflows/frontier/submit.sh @@ -53,7 +53,7 @@ else sbatch_account="#SBATCH -A CFD154" sbatch_time="#SBATCH -t 01:59:00" sbatch_partition="#SBATCH -p batch" - sbatch_extra="#SBATCH --qos=hackathon" + sbatch_extra="#SBATCH --qos=normal" fi shard_suffix="" diff --git a/.github/workflows/frontier_amd/submit.sh b/.github/workflows/frontier_amd/submit.sh index 13018c595d..16d4f0d73c 100644 --- a/.github/workflows/frontier_amd/submit.sh +++ b/.github/workflows/frontier_amd/submit.sh @@ -53,7 +53,7 @@ else sbatch_account="#SBATCH -A CFD154" sbatch_time="#SBATCH -t 01:59:00" sbatch_partition="#SBATCH -p batch" - sbatch_extra="#SBATCH --qos=hackathon" + sbatch_extra="#SBATCH --qos=normal" fi shard_suffix="" From d5612834faa19df101d72fe29bd1408bbe7d8d06 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 26 Feb 2026 16:59:51 -0500 Subject: [PATCH 13/15] Add zero-test guard after shard filtering and pin retry action to SHA - Raise MFCException when --shard produces zero cases (prevents silent green CI with nothing executed) - Pin nick-fields/retry to commit SHA for security on self-hosted runners with cluster credentials Co-Authored-By: Claude Opus 4.6 --- .github/workflows/test.yml | 2 +- toolchain/mfc/test/test.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3b0cd4e465..b6aee7e204 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -254,7 +254,7 @@ jobs: - name: Build if: matrix.cluster != 'phoenix' - uses: nick-fields/retry@v3 + uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3 with: max_attempts: 3 retry_wait_seconds: 60 diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index def4305bdd..9a97018300 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -140,6 +140,12 @@ def __filter(cases_) -> typing.List[TestCase]: skipped_cases += [c for i, c in enumerate(cases) if i % shard_count != shard_idx - 1] cases = [c for i, c in enumerate(cases) if i % shard_count == shard_idx - 1] + if not cases: + raise MFCException( + f"--shard {ARG('shard')} matched zero test cases. " + f"Total cases before sharding may be less than shard count." + ) + if ARG("percent") == 100: return cases, skipped_cases From 46dcd73d350cb7f227dd390a637ee90d2bde67b9 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 26 Feb 2026 18:20:50 -0500 Subject: [PATCH 14/15] Trigger CI From a2431bf29381f903327b1f4e1da0d598bed6d517 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 26 Feb 2026 19:15:40 -0500 Subject: [PATCH 15/15] Rename ambiguous single-letter variable `l` to `label` in _filter_only Co-Authored-By: Claude Opus 4.6 --- toolchain/mfc/test/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 9a97018300..049af9e560 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -59,7 +59,7 @@ def is_uuid(term): check = set(case.trace.split(" -> ")) check.add(case.get_uuid()) - label_ok = all(l in check for l in labels) if labels else True + label_ok = all(label in check for label in labels) if labels else True uuid_ok = any(u in check for u in uuids) if uuids else True if labels and uuids: