diff --git a/.github/scripts/submit_and_monitor_bench.sh b/.github/scripts/submit_and_monitor_bench.sh index 80790752d7..c081c8692a 100755 --- a/.github/scripts/submit_and_monitor_bench.sh +++ b/.github/scripts/submit_and_monitor_bench.sh @@ -14,50 +14,27 @@ device="$2" interface="$3" cluster="$4" -# Get the directory where this script lives -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - echo "[$dir] Submitting benchmark for $device-$interface on $cluster..." cd "$dir" -# Submit job -submit_output=$(bash .github/workflows/$cluster/submit-bench.sh \ - .github/workflows/$cluster/bench.sh "$device" "$interface" 2>&1) - -job_id=$(echo "$submit_output" | sed -n 's/.*Submitted batch job \([0-9][0-9]*\).*/\1/p') -job_slug="bench-$device-$interface" -output_file="${job_slug}.out" - -if [ -z "$job_id" ]; then - echo "[$dir] ERROR: Failed to submit job" - echo "$submit_output" - exit 1 -fi - -echo "[$dir] Job ID: $job_id, monitoring output file: $output_file" - -# Use the monitoring script from PR (where this script lives) -monitor_exit=0 -bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$? -if [ "$monitor_exit" -ne 0 ]; then - echo "[$dir] WARNING: SLURM job exited with code $monitor_exit" -else - echo "[$dir] Monitoring complete for job $job_id" -fi +# Submit and monitor job (submit.sh auto-detects bench mode from script name) +bash .github/workflows/$cluster/submit.sh \ + .github/workflows/$cluster/bench.sh "$device" "$interface" # Verify the YAML output file was created +job_slug="bench-$device-$interface" yaml_file="${job_slug}.yaml" if [ ! -f "$yaml_file" ]; then - echo "[$dir] ERROR: Expected output file not found: $yaml_file" - echo "[$dir] Directory contents:" - ls -la *.yaml 2>/dev/null || echo " No YAML files found" - echo "" - echo "[$dir] Last 100 lines of job output ($output_file):" - echo "----------------------------------------" - tail -n 100 "$output_file" 2>/dev/null || echo " Could not read output file" - echo "----------------------------------------" - exit 1 + echo "[$dir] ERROR: Expected output file not found: $yaml_file" + echo "[$dir] Directory contents:" + ls -la *.yaml 2>/dev/null || echo " No YAML files found" + echo "" + output_file="${job_slug}.out" + echo "[$dir] Last 100 lines of job output ($output_file):" + echo "----------------------------------------" + tail -n 100 "$output_file" 2>/dev/null || echo " Could not read output file" + echo "----------------------------------------" + exit 1 fi echo "[$dir] Verified output file exists: $yaml_file ($(stat -f%z "$yaml_file" 2>/dev/null || stat -c%s "$yaml_file" 2>/dev/null) bytes)" - diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 56735da9c1..b45fc45e40 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -13,6 +13,9 @@ concurrency: jobs: file-changes: name: Detect File Changes + if: > + github.event_name != 'pull_request_review' || + github.event.review.user.type != 'Bot' runs-on: 'ubuntu-latest' outputs: checkall: ${{ steps.changes.outputs.checkall }} diff --git a/.github/workflows/frontier/bench.sh b/.github/workflows/frontier/bench.sh index 35b4c5950e..a79f1a2fc5 100644 --- a/.github/workflows/frontier/bench.sh +++ b/.github/workflows/frontier/bench.sh @@ -16,7 +16,7 @@ if [ "$job_device" = "gpu" ]; then fi if [ "$job_device" = "gpu" ]; then - ./mfc.sh bench --mem 12 -j $n_ranks -o "$job_slug.yaml" -- -c frontier $device_opts -n $n_ranks + ./mfc.sh bench --mem 12 -j $n_ranks -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks else - ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c frontier $device_opts -n $n_ranks + ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks fi diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index ca09c2a116..84036641c6 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -3,6 +3,15 @@ # Ignore SIGHUP to survive login node session drops trap '' HUP +# Determine compiler flag from directory name +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cluster_name="$(basename "$SCRIPT_DIR")" +case "$cluster_name" in + frontier) compiler_flag="f" ;; + frontier_amd) compiler_flag="famd" ;; + *) echo "ERROR: Unknown cluster '$cluster_name'"; exit 1 ;; +esac + job_device=$1 job_interface=$2 run_bench=$3 @@ -16,11 +25,11 @@ if [ "$job_device" = "gpu" ]; then fi fi -. ./mfc.sh load -c f -m g +. ./mfc.sh load -c $compiler_flag -m g # Only set up build cache for test suite, not benchmarks if [ "$run_bench" != "bench" ]; then - source .github/scripts/setup-build-cache.sh frontier "$job_device" "$job_interface" + source .github/scripts/setup-build-cache.sh "$cluster_name" "$job_device" "$job_interface" fi max_attempts=3 @@ -37,7 +46,7 @@ while [ $attempt -le $max_attempts ]; do fi done else - if ./mfc.sh test -v -a --dry-run --rdma-mpi -j 8 $build_opts; then + if ./mfc.sh test -v -a --dry-run $([ "$cluster_name" = "frontier" ] && echo "--rdma-mpi") -j 8 $build_opts; then build_cmd_ok=true else build_cmd_ok=false diff --git a/.github/workflows/frontier/submit-bench.sh b/.github/workflows/frontier/submit-bench.sh deleted file mode 100644 index 81b9b274e6..0000000000 --- a/.github/workflows/frontier/submit-bench.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -set -e - -usage() { - echo "Usage: $0 [script.sh] [cpu|gpu]" -} - -if [ ! -z "$1" ]; then - sbatch_script_contents=`cat $1` -else - usage - exit 1 -fi - -if [ "$2" = "cpu" ]; then - sbatch_device_opts="\ -#SBATCH -n 32 # Number of cores required" -elif [ "$2" = "gpu" ]; then - sbatch_device_opts="\ -#SBATCH -n 8 # Number of cores required" -else - usage; exit 1 -fi - - -job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3" - -sbatch <> "$GITHUB_OUTPUT" - name: Print Logs if: always() - run: cat test-${{ matrix.device }}-${{ matrix.interface }}.out + run: cat ${{ steps.log.outputs.slug }}.out - name: Archive Logs uses: actions/upload-artifact@v4 if: matrix.cluster != 'phoenix' with: - name: logs-${{ strategy.job-index }}-${{ matrix.device }}-${{ matrix.interface }} - path: test-${{ matrix.device }}-${{ matrix.interface }}.out + name: logs-${{ strategy.job-index }}-${{ steps.log.outputs.slug }} + path: ${{ steps.log.outputs.slug }}.out diff --git a/toolchain/mfc/cli/commands.py b/toolchain/mfc/cli/commands.py index 8ad8c4bd07..018e3cef83 100644 --- a/toolchain/mfc/cli/commands.py +++ b/toolchain/mfc/cli/commands.py @@ -452,6 +452,12 @@ default=False, dest="dry_run", ), + Argument( + name="shard", + help="Run only a subset of tests (e.g., '1/2' for first half, '2/2' for second half).", + type=str, + default=None, + ), ], mutually_exclusive=[ MutuallyExclusiveGroup(arguments=[ diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 31a3771cb9..049af9e560 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -42,7 +42,41 @@ class TestTimeoutError(MFCException): pass -# pylint: disable=too-many-branches, trailing-whitespace +def _filter_only(cases, skipped_cases): + """Filter cases by --only terms using AND for labels, OR for UUIDs. + + Labels (non-UUID terms): case must match ALL labels (AND logic). + UUIDs (8-char hex terms): case must match ANY UUID (OR logic). + Mixed: keep case if all labels match OR any UUID matches. + """ + def is_uuid(term): + return len(term) == 8 and all(c in '0123456789abcdefABCDEF' for c in term) + + uuids = [t for t in ARG("only") if is_uuid(t)] + labels = [t for t in ARG("only") if not is_uuid(t)] + + for case in cases[:]: + check = set(case.trace.split(" -> ")) + check.add(case.get_uuid()) + + label_ok = all(label in check for label in labels) if labels else True + uuid_ok = any(u in check for u in uuids) if uuids else True + + if labels and uuids: + keep = label_ok or uuid_ok + elif labels: + keep = label_ok + else: + keep = uuid_ok + + if not keep: + cases.remove(case) + skipped_cases.append(case) + + return cases, skipped_cases + + +# pylint: disable=too-many-branches, too-many-statements, trailing-whitespace def __filter(cases_) -> typing.List[TestCase]: cases = cases_[:] selected_cases = [] @@ -66,14 +100,13 @@ def __filter(cases_) -> typing.List[TestCase]: raise MFCException("Testing: Your specified range [--from,--to] is incorrect. Please ensure both IDs exist and are in the correct order.") if len(ARG("only")) > 0: - for case in cases[:]: - case: TestCase + cases, skipped_cases = _filter_only(cases, skipped_cases) - checkCase = case.trace.split(" -> ") - checkCase.append(case.get_uuid()) - if not set(ARG("only")).issubset(set(checkCase)): - cases.remove(case) - skipped_cases.append(case) + if not cases: + raise MFCException( + f"--only filter matched zero test cases. " + f"Specified: {ARG('only')}. Check that UUIDs/names are valid." + ) for case in cases[:]: if case.ppn > 1 and not ARG("mpi"): @@ -99,6 +132,20 @@ def __filter(cases_) -> typing.List[TestCase]: skipped_cases += example_cases cases = [case for case in cases if case not in example_cases] + if ARG("shard") is not None: + parts = ARG("shard").split("/") + if len(parts) != 2 or not all(p.isdigit() for p in parts) or int(parts[1]) < 1 or not 1 <= int(parts[0]) <= int(parts[1]): + raise MFCException(f"Invalid --shard '{ARG('shard')}': expected 'i/n' with 1 <= i <= n (e.g., '1/2').") + shard_idx, shard_count = int(parts[0]), int(parts[1]) + skipped_cases += [c for i, c in enumerate(cases) if i % shard_count != shard_idx - 1] + cases = [c for i, c in enumerate(cases) if i % shard_count == shard_idx - 1] + + if not cases: + raise MFCException( + f"--shard {ARG('shard')} matched zero test cases. " + f"Total cases before sharding may be less than shard count." + ) + if ARG("percent") == 100: return cases, skipped_cases @@ -115,6 +162,7 @@ def test(): global errors, failed_tests, test_start_time test_start_time = time.time() # Start timing + failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt") cases = list_cases() # Delete UUIDs that are not in the list of cases from tests/ @@ -182,6 +230,13 @@ def test(): # Check if we aborted due to high failure rate if abort_tests.is_set(): + # Clean up stale failed_uuids.txt so CI doesn't retry wrong tests + try: + if os.path.exists(failed_uuids_path): + os.remove(failed_uuids_path) + except OSError: + pass + total_completed = nFAIL + nPASS cons.print() cons.unindent() @@ -206,6 +261,14 @@ def test(): # Build the summary report _print_test_summary(nPASS, nFAIL, nSKIP, minutes, seconds, failed_tests, skipped_cases) + # Write failed UUIDs to file for CI retry logic + if failed_tests: + with open(failed_uuids_path, "w") as f: + for test_info in failed_tests: + f.write(test_info['uuid'] + "\n") + elif os.path.exists(failed_uuids_path): + os.remove(failed_uuids_path) + exit(nFAIL)