Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 14 additions & 37 deletions .github/scripts/submit_and_monitor_bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,50 +14,27 @@ device="$2"
interface="$3"
cluster="$4"

# Get the directory where this script lives
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo "[$dir] Submitting benchmark for $device-$interface on $cluster..."
cd "$dir"

# Submit job
submit_output=$(bash .github/workflows/$cluster/submit-bench.sh \
.github/workflows/$cluster/bench.sh "$device" "$interface" 2>&1)

job_id=$(echo "$submit_output" | sed -n 's/.*Submitted batch job \([0-9][0-9]*\).*/\1/p')
job_slug="bench-$device-$interface"
output_file="${job_slug}.out"

if [ -z "$job_id" ]; then
echo "[$dir] ERROR: Failed to submit job"
echo "$submit_output"
exit 1
fi

echo "[$dir] Job ID: $job_id, monitoring output file: $output_file"

# Use the monitoring script from PR (where this script lives)
monitor_exit=0
bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$?
if [ "$monitor_exit" -ne 0 ]; then
echo "[$dir] WARNING: SLURM job exited with code $monitor_exit"
else
echo "[$dir] Monitoring complete for job $job_id"
fi
# Submit and monitor job (submit.sh auto-detects bench mode from script name).
# Both paths are quoted so the cluster name is passed through verbatim and can
# never be word-split or glob-expanded by the shell.
bash ".github/workflows/${cluster}/submit.sh" \
  ".github/workflows/${cluster}/bench.sh" "$device" "$interface"

# Verify the YAML output file was created
job_slug="bench-$device-$interface"
yaml_file="${job_slug}.yaml"
if [ ! -f "$yaml_file" ]; then
echo "[$dir] ERROR: Expected output file not found: $yaml_file"
echo "[$dir] Directory contents:"
ls -la *.yaml 2>/dev/null || echo " No YAML files found"
echo ""
echo "[$dir] Last 100 lines of job output ($output_file):"
echo "----------------------------------------"
tail -n 100 "$output_file" 2>/dev/null || echo " Could not read output file"
echo "----------------------------------------"
exit 1
echo "[$dir] ERROR: Expected output file not found: $yaml_file"
echo "[$dir] Directory contents:"
ls -la *.yaml 2>/dev/null || echo " No YAML files found"
echo ""
output_file="${job_slug}.out"
echo "[$dir] Last 100 lines of job output ($output_file):"
echo "----------------------------------------"
tail -n 100 "$output_file" 2>/dev/null || echo " Could not read output file"
echo "----------------------------------------"
exit 1
fi

echo "[$dir] Verified output file exists: $yaml_file ($(stat -f%z "$yaml_file" 2>/dev/null || stat -c%s "$yaml_file" 2>/dev/null) bytes)"

3 changes: 3 additions & 0 deletions .github/workflows/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ concurrency:
jobs:
file-changes:
name: Detect File Changes
if: >
github.event_name != 'pull_request_review' ||
github.event.review.user.type != 'Bot'
runs-on: 'ubuntu-latest'
outputs:
checkall: ${{ steps.changes.outputs.checkall }}
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/frontier/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ if [ "$job_device" = "gpu" ]; then
fi

if [ "$job_device" = "gpu" ]; then
./mfc.sh bench --mem 12 -j $n_ranks -o "$job_slug.yaml" -- -c frontier $device_opts -n $n_ranks
./mfc.sh bench --mem 12 -j $n_ranks -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks
else
./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c frontier $device_opts -n $n_ranks
./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks
fi
15 changes: 12 additions & 3 deletions .github/workflows/frontier/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,15 @@
# Ignore SIGHUP to survive login node session drops
trap '' HUP

# Determine compiler flag from directory name: the directory that holds this
# script names the cluster, and the cluster selects the value later passed to
# `./mfc.sh load -c`.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cluster_name="$(basename "$SCRIPT_DIR")"
case "$cluster_name" in
  frontier) compiler_flag="f" ;;
  frontier_amd) compiler_flag="famd" ;;
  *)
    # Diagnostics go to stderr so they are not mixed into captured stdout.
    echo "ERROR: Unknown cluster '$cluster_name'" >&2
    exit 1
    ;;
esac

job_device=$1
job_interface=$2
run_bench=$3
Expand All @@ -16,11 +25,11 @@ if [ "$job_device" = "gpu" ]; then
fi
fi

. ./mfc.sh load -c f -m g
. ./mfc.sh load -c $compiler_flag -m g

# Only set up build cache for test suite, not benchmarks
if [ "$run_bench" != "bench" ]; then
source .github/scripts/setup-build-cache.sh frontier "$job_device" "$job_interface"
source .github/scripts/setup-build-cache.sh "$cluster_name" "$job_device" "$job_interface"
fi

max_attempts=3
Expand All @@ -37,7 +46,7 @@ while [ $attempt -le $max_attempts ]; do
fi
done
else
if ./mfc.sh test -v -a --dry-run --rdma-mpi -j 8 $build_opts; then
if ./mfc.sh test -v -a --dry-run $([ "$cluster_name" = "frontier" ] && echo "--rdma-mpi") -j 8 $build_opts; then
build_cmd_ok=true
else
build_cmd_ok=false
Expand Down
54 changes: 0 additions & 54 deletions .github/workflows/frontier/submit-bench.sh

This file was deleted.

48 changes: 41 additions & 7 deletions .github/workflows/frontier/submit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,17 @@ set -e
# Ignore SIGHUP to survive login node session drops
trap '' HUP

# Determine compiler flag from directory name: the directory that holds this
# script names the cluster, and the cluster selects the value later passed to
# `./mfc.sh load -c` inside the generated batch script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cluster_name="$(basename "$SCRIPT_DIR")"
case "$cluster_name" in
  frontier) compiler_flag="f" ;;
  frontier_amd) compiler_flag="famd" ;;
  *)
    # Diagnostics go to stderr so they are not mixed into captured stdout.
    echo "ERROR: Unknown cluster '$cluster_name'" >&2
    exit 1
    ;;
esac

usage() {
echo "Usage: $0 [script.sh] [cpu|gpu]"
echo "Usage: $0 [script.sh] [cpu|gpu] [none|acc|omp] [shard]"
}

if [ ! -z "$1" ]; then
Expand All @@ -16,6 +25,13 @@ else
exit 1
fi

# Classify the job by the submitted script's file name (directory and ".sh"
# suffix stripped): names beginning with "bench" are benchmark jobs, anything
# else is treated as a test job.
script_basename="$(basename "$1" .sh)"
if [[ "$script_basename" == bench* ]]; then
  job_type="bench"
else
  job_type="test"
fi

if [ "$2" = "cpu" ]; then
sbatch_device_opts="\
#SBATCH -n 32 # Number of cores required"
Expand All @@ -27,19 +43,36 @@ else
exit 1
fi

# Pick the #SBATCH directive lines that depend on the job type. Benchmark jobs
# are charged to ENG160 with a 05:59:00 limit on the "extended" partition;
# every other job type uses CFD154, 01:59:00, the "batch" partition and an
# explicit "normal" QOS line.
case "$job_type" in
  bench)
    sbatch_account="#SBATCH -A ENG160"
    sbatch_time="#SBATCH -t 05:59:00"
    sbatch_partition="#SBATCH -p extended"
    sbatch_extra=""
    ;;
  *)
    sbatch_account="#SBATCH -A CFD154"
    sbatch_time="#SBATCH -t 01:59:00"
    sbatch_partition="#SBATCH -p batch"
    sbatch_extra="#SBATCH --qos=normal"
    ;;
esac

job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3"
# The optional fourth argument is the shard; when present it becomes a job-slug
# suffix with its first "/" rewritten as "-of-". Without it the suffix is empty.
shard_suffix=""
[ -z "$4" ] || shard_suffix="-$(echo "$4" | sed 's|/|-of-|')"
job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3${shard_suffix}"
output_file="$job_slug.out"

submit_output=$(sbatch <<EOT
#!/bin/bash
#SBATCH -J MFC-$job_slug # Job name
#SBATCH -A ENG160 # charge account
$sbatch_account
#SBATCH -N 1 # Number of nodes required
$sbatch_device_opts
#SBATCH -t 05:59:00 # Duration of the job (Ex: 15 mins)
$sbatch_time
#SBATCH -o$output_file # Combined output and error messages file
#SBATCH -p extended # Extended partition for shorter queues
$sbatch_partition
$sbatch_extra

set -e
set -x
Expand All @@ -50,8 +83,10 @@ echo "Running in $(pwd):"
job_slug="$job_slug"
job_device="$2"
job_interface="$3"
job_shard="$4"
job_cluster="$cluster_name"

. ./mfc.sh load -c f -m $([ "$2" = "gpu" ] && echo "g" || echo "c")
. ./mfc.sh load -c $compiler_flag -m $([ "$2" = "gpu" ] && echo "g" || echo "c")

$sbatch_script_contents

Expand All @@ -68,5 +103,4 @@ fi
echo "Submitted batch job $job_id"

# Use resilient monitoring instead of sbatch -W
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file"
13 changes: 11 additions & 2 deletions .github/workflows/frontier/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,17 @@ if [ "$job_device" = "gpu" ]; then
fi
fi

# Pass a --shard option on to the test command only when job_shard is
# non-empty; otherwise shard_opts stays empty and adds nothing.
shard_opts="${job_shard:+--shard $job_shard}"

if [ "$job_device" = "gpu" ]; then
./mfc.sh test -v -a --rdma-mpi --max-attempts 3 -j $ngpus $device_opts -- -c frontier
rdma_opts=""
if [ "$job_cluster" = "frontier" ]; then
rdma_opts="--rdma-mpi"
fi
./mfc.sh test -v -a $rdma_opts --max-attempts 3 -j $ngpus $device_opts $shard_opts -- -c $job_cluster
else
./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu -- -c frontier
./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu $shard_opts -- -c $job_cluster
fi
4 changes: 2 additions & 2 deletions .github/workflows/frontier_amd/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ if [ "$job_device" = "gpu" ]; then
fi

if [ "$job_device" = "gpu" ]; then
./mfc.sh bench --mem 12 -j $n_ranks -o "$job_slug.yaml" -- -c frontier_amd $device_opts -n $n_ranks
./mfc.sh bench --mem 12 -j $n_ranks -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks
else
./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c frontier_amd $device_opts -n $n_ranks
./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks
fi
15 changes: 12 additions & 3 deletions .github/workflows/frontier_amd/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,15 @@
# Ignore SIGHUP to survive login node session drops
trap '' HUP

# Determine compiler flag from directory name: the directory that holds this
# script names the cluster, and the cluster selects the value later passed to
# `./mfc.sh load -c`.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cluster_name="$(basename "$SCRIPT_DIR")"
case "$cluster_name" in
  frontier) compiler_flag="f" ;;
  frontier_amd) compiler_flag="famd" ;;
  *)
    # Diagnostics go to stderr so they are not mixed into captured stdout.
    echo "ERROR: Unknown cluster '$cluster_name'" >&2
    exit 1
    ;;
esac

job_device=$1
job_interface=$2
run_bench=$3
Expand All @@ -16,11 +25,11 @@ if [ "$job_device" = "gpu" ]; then
fi
fi

. ./mfc.sh load -c famd -m g
. ./mfc.sh load -c $compiler_flag -m g

# Only set up build cache for test suite, not benchmarks
if [ "$run_bench" != "bench" ]; then
source .github/scripts/setup-build-cache.sh frontier_amd "$job_device" "$job_interface"
source .github/scripts/setup-build-cache.sh "$cluster_name" "$job_device" "$job_interface"
fi

max_attempts=3
Expand All @@ -37,7 +46,7 @@ while [ $attempt -le $max_attempts ]; do
fi
done
else
if ./mfc.sh test -v -a --dry-run -j 8 $build_opts; then
if ./mfc.sh test -v -a --dry-run $([ "$cluster_name" = "frontier" ] && echo "--rdma-mpi") -j 8 $build_opts; then
build_cmd_ok=true
else
build_cmd_ok=false
Expand Down
54 changes: 0 additions & 54 deletions .github/workflows/frontier_amd/submit-bench.sh

This file was deleted.

Loading
Loading