From 8fcabdea0a026666a77557b4fe685a9c9e6c6c46 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 7 Feb 2026 14:22:55 +0000
Subject: [PATCH 1/9] Initial plan
From 0455e1b8ab0a23d99762681f3dfe9ccea1a98238 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 7 Feb 2026 14:25:19 +0000
Subject: [PATCH 2/9] Add nightly Triton testing workflow
Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com>
---
.../workflows/iris-nightly-triton-test.yml | 195 ++++++++++++++++++
1 file changed, 195 insertions(+)
create mode 100644 .github/workflows/iris-nightly-triton-test.yml
diff --git a/.github/workflows/iris-nightly-triton-test.yml b/.github/workflows/iris-nightly-triton-test.yml
new file mode 100644
index 00000000..2f533bb5
--- /dev/null
+++ b/.github/workflows/iris-nightly-triton-test.yml
@@ -0,0 +1,195 @@
+name: Iris Nightly Triton Test
+
+on:
+ schedule:
+ # Run nightly at 2 AM UTC
+ - cron: '0 2 * * *'
+ workflow_dispatch: # Allow manual triggering
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ build-container-with-latest-triton:
+ runs-on: [self-hosted, mi3xx]
+ timeout-minutes: 120 # Building with latest Triton may take longer
+
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+
+ - name: Setup Apptainer (if not available)
+ run: |
+ if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then
+ echo "Neither Apptainer nor Docker found, installing Apptainer..."
+ apt-get update && apt-get install -y software-properties-common
+ add-apt-repository -y ppa:apptainer/ppa
+ apt-get update && apt-get install -y apptainer
+ else
+ echo "Container runtime already available"
+ fi
+
+ - name: Modify Apptainer def file to use latest Triton
+ run: |
+ echo "Modifying apptainer/iris.def to use latest Triton from main branch"
+ # Replace the specific commit checkout with main branch
+ sed -i 's/git checkout bcbcabdd0cff6539c7168299075992b2a23ff38e/git checkout main/' apptainer/iris.def
+ echo "Modified iris.def:"
+ grep -A2 -B2 "git checkout" apptainer/iris.def
+
+ - name: Build Iris container with latest Triton
+ run: |
+ set -e
+
+ # Check /dev/shm size
+ shm_size_gb=$(df -k /dev/shm | tail -1 | awk '{print int($2/1024/1024)}')
+ if [ "${shm_size_gb:-0}" -lt 64 ]; then
+ echo "❌ ERROR: /dev/shm is too small (${shm_size_gb}GB < 64GB)"
+ echo "Fix: mount -o remount,size=64G /dev/shm"
+ exit 1
+ fi
+ echo "✅ /dev/shm size OK (${shm_size_gb}GB)"
+
+ # Build with Apptainer, forcing rebuild
+ DEF_FILE=apptainer/iris.def
+ IMAGE_PATH="${HOME}/iris-apptainer-nightly/iris-dev-nightly.sif"
+
+ mkdir -p "${HOME}/iris-apptainer-nightly"
+
+ echo "Building Apptainer image with latest Triton..."
+ apptainer build --force "$IMAGE_PATH" "$DEF_FILE"
+
+ echo "Built image: $IMAGE_PATH"
+
+ test-nightly-triton:
+ name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks)
+ needs: build-container-with-latest-triton
+ runs-on: [self-hosted, mi3xx]
+ timeout-minutes: 60
+ strategy:
+ fail-fast: false
+ matrix:
+ include:
+ # Test each subdirectory with different rank counts
+ - test_dir: examples
+ num_ranks: 1
+ gpu_devices: "0,1"
+ - test_dir: examples
+ num_ranks: 2
+ gpu_devices: "2,3"
+ - test_dir: examples
+ num_ranks: 4
+ gpu_devices: "4,5,6,7"
+ - test_dir: examples
+ num_ranks: 8
+ gpu_devices: "0,1,2,3,4,5,6,7"
+ - test_dir: unittests
+ num_ranks: 1
+ gpu_devices: "0,1"
+ - test_dir: unittests
+ num_ranks: 2
+ gpu_devices: "2,3"
+ - test_dir: unittests
+ num_ranks: 4
+ gpu_devices: "4,5,6,7"
+ - test_dir: unittests
+ num_ranks: 8
+ gpu_devices: "0,1,2,3,4,5,6,7"
+ - test_dir: ccl
+ num_ranks: 1
+ gpu_devices: "0,1"
+ - test_dir: ccl
+ num_ranks: 2
+ gpu_devices: "2,3"
+ - test_dir: ccl
+ num_ranks: 4
+ gpu_devices: "4,5,6,7"
+ - test_dir: ccl
+ num_ranks: 8
+ gpu_devices: "0,1,2,3,4,5,6,7"
+ - test_dir: x
+ num_ranks: 1
+ gpu_devices: "0,1"
+ - test_dir: x
+ num_ranks: 2
+ gpu_devices: "2,3"
+ - test_dir: x
+ num_ranks: 4
+ gpu_devices: "4,5,6,7"
+ - test_dir: x
+ num_ranks: 8
+ gpu_devices: "0,1,2,3,4,5,6,7"
+ - test_dir: ops
+ num_ranks: 1
+ gpu_devices: "0,1"
+ - test_dir: ops
+ num_ranks: 2
+ gpu_devices: "2,3"
+ - test_dir: ops
+ num_ranks: 4
+ gpu_devices: "4,5,6,7"
+ - test_dir: ops
+ num_ranks: 8
+ gpu_devices: "0,1,2,3,4,5,6,7"
+
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+
+ - name: Cleanup lingering ports before tests
+ run: |
+ bash .github/scripts/cleanup_ports.sh
+
+ - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks
+ run: |
+ set -e
+ echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton)"
+
+ # Use nightly image path
+ NIGHTLY_IMAGE="${HOME}/iris-apptainer-nightly/iris-dev-nightly.sif"
+
+ # Build GPU argument
+ GPU_ARG=""
+ if [ -n "${{ matrix.gpu_devices }}" ]; then
+ GPU_ARG="--gpus ${{ matrix.gpu_devices }}"
+ fi
+
+ # Run tests in container using the nightly image
+ .github/scripts/container_exec.sh $GPU_ARG --image "$NIGHTLY_IMAGE" "
+ set -e
+
+ # Install tritonBLAS if not already installed
+ echo \"Checking for tritonBLAS...\"
+ if ! python -c 'import tritonblas' 2>/dev/null; then
+ echo \"Installing tritonBLAS...\"
+ TRITONBLAS_DIR=\"./tritonblas_install\"
+ if [ ! -d \"\$TRITONBLAS_DIR\" ]; then
+ git clone https://github.com/ROCm/tritonBLAS.git \"\$TRITONBLAS_DIR\"
+ cd \"\$TRITONBLAS_DIR\"
+ git checkout 47768c93acb7f89511d797964b84544c30ab81ad
+ else
+ cd \"\$TRITONBLAS_DIR\"
+ git fetch
+ git checkout 47768c93acb7f89511d797964b84544c30ab81ad
+ fi
+ pip install -e .
+ cd ..
+ echo \"tritonBLAS installed successfully\"
+ else
+ echo \"tritonBLAS already installed\"
+ fi
+
+ echo \"Installing iris in editable mode\"
+ pip install -e .
+
+ # Run tests in the specified directory
+ for test_file in tests/${{ matrix.test_dir }}/test_*.py; do
+ if [ -f \"\$test_file\" ]; then
+ echo \"Testing: \$test_file with ${{ matrix.num_ranks }} ranks (nightly Triton)\"
+ python tests/run_tests_distributed.py --num_ranks ${{ matrix.num_ranks }} \"\$test_file\" -v --tb=short --durations=10
+ fi
+ done
+ "
+ echo "::endgroup::"
+ echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton) passed!"
From 5a800539e953c9720a526da1d1a2b65a5148d16a Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 7 Feb 2026 14:25:56 +0000
Subject: [PATCH 3/9] Add nightly Triton test badge to README
Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com>
---
README.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/README.md b/README.md
index 9fa2d55b..6f9a921c 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,7 @@ Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+
From 006afdb706a289c1b1ee0bf8ee9f71bc37444650 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 7 Feb 2026 14:27:07 +0000
Subject: [PATCH 4/9] Improve nightly workflow robustness per code review
Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com>
---
.github/workflows/iris-nightly-triton-test.yml | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/.github/workflows/iris-nightly-triton-test.yml b/.github/workflows/iris-nightly-triton-test.yml
index 2f533bb5..7317759e 100644
--- a/.github/workflows/iris-nightly-triton-test.yml
+++ b/.github/workflows/iris-nightly-triton-test.yml
@@ -33,8 +33,8 @@ jobs:
- name: Modify Apptainer def file to use latest Triton
run: |
echo "Modifying apptainer/iris.def to use latest Triton from main branch"
- # Replace the specific commit checkout with main branch
- sed -i 's/git checkout bcbcabdd0cff6539c7168299075992b2a23ff38e/git checkout main/' apptainer/iris.def
+ # Replace any 40-character hex commit checkout with main branch
+ sed -i 's/git checkout [0-9a-f]\{40\}/git checkout main/' apptainer/iris.def
echo "Modified iris.def:"
grep -A2 -B2 "git checkout" apptainer/iris.def
@@ -159,6 +159,9 @@ jobs:
.github/scripts/container_exec.sh $GPU_ARG --image "$NIGHTLY_IMAGE" "
set -e
+ # tritonBLAS version to use
+ TRITONBLAS_COMMIT=\"47768c93acb7f89511d797964b84544c30ab81ad\"
+
# Install tritonBLAS if not already installed
echo \"Checking for tritonBLAS...\"
if ! python -c 'import tritonblas' 2>/dev/null; then
@@ -167,11 +170,11 @@ jobs:
if [ ! -d \"\$TRITONBLAS_DIR\" ]; then
git clone https://github.com/ROCm/tritonBLAS.git \"\$TRITONBLAS_DIR\"
cd \"\$TRITONBLAS_DIR\"
- git checkout 47768c93acb7f89511d797964b84544c30ab81ad
+ git checkout \"\$TRITONBLAS_COMMIT\"
else
cd \"\$TRITONBLAS_DIR\"
git fetch
- git checkout 47768c93acb7f89511d797964b84544c30ab81ad
+ git checkout \"\$TRITONBLAS_COMMIT\"
fi
pip install -e .
cd ..
From 4125f82faa421bef02c740631de289ca9aa2c588 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 7 Feb 2026 14:28:38 +0000
Subject: [PATCH 5/9] Add explicit permissions to nightly workflow for security
Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com>
---
.github/workflows/iris-nightly-triton-test.yml | 3 +++
1 file changed, 3 insertions(+)
diff --git a/.github/workflows/iris-nightly-triton-test.yml b/.github/workflows/iris-nightly-triton-test.yml
index 7317759e..c9d1d698 100644
--- a/.github/workflows/iris-nightly-triton-test.yml
+++ b/.github/workflows/iris-nightly-triton-test.yml
@@ -10,6 +10,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
+permissions:
+ contents: read
+
jobs:
build-container-with-latest-triton:
runs-on: [self-hosted, mi3xx]
From 587d5bcfa8c1042a75dcfbe8c4feb0b3a688da0c Mon Sep 17 00:00:00 2001
From: Muhammad Awad <112003944+mawad-amd@users.noreply.github.com>
Date: Sun, 22 Mar 2026 20:04:14 -0700
Subject: [PATCH 6/9] Rewrite nightly workflow to reuse existing CI container
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Instead of rebuilding the container with Triton from main (slow,
fragile), reuse the existing cached CI container and reinstall
Triton from main at test time via pip. This means:
- Zero changes to build infrastructure
- Each test job is independent (parallel-safe, own container overlay)
- Uses container_build.sh for cache hits, container_exec.sh for execution
- Same test matrix as iris-tests.yml (5 test dirs × 4 rank counts: 1, 2, 4, 8)
- Editable install only (no git/pip variants needed for nightly)
Co-Authored-By: Claude Opus 4.6
---
.../workflows/iris-nightly-triton-test.yml | 80 ++++++-------------
1 file changed, 26 insertions(+), 54 deletions(-)
diff --git a/.github/workflows/iris-nightly-triton-test.yml b/.github/workflows/iris-nightly-triton-test.yml
index c9d1d698..14e0c6c8 100644
--- a/.github/workflows/iris-nightly-triton-test.yml
+++ b/.github/workflows/iris-nightly-triton-test.yml
@@ -7,16 +7,15 @@ on:
workflow_dispatch: # Allow manual triggering
concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
+ group: ${{ github.workflow }}
cancel-in-progress: true
permissions:
contents: read
jobs:
- build-container-with-latest-triton:
+ build-container-image:
runs-on: [self-hosted, mi3xx]
- timeout-minutes: 120 # Building with latest Triton may take longer
steps:
- name: Checkout repository
@@ -33,48 +32,19 @@ jobs:
echo "Container runtime already available"
fi
- - name: Modify Apptainer def file to use latest Triton
+ - name: Build Iris container
run: |
- echo "Modifying apptainer/iris.def to use latest Triton from main branch"
- # Replace any 40-character hex commit checkout with main branch
- sed -i 's/git checkout [0-9a-f]\{40\}/git checkout main/' apptainer/iris.def
- echo "Modified iris.def:"
- grep -A2 -B2 "git checkout" apptainer/iris.def
+ bash .github/scripts/container_build.sh
- - name: Build Iris container with latest Triton
- run: |
- set -e
-
- # Check /dev/shm size
- shm_size_gb=$(df -k /dev/shm | tail -1 | awk '{print int($2/1024/1024)}')
- if [ "${shm_size_gb:-0}" -lt 64 ]; then
- echo "❌ ERROR: /dev/shm is too small (${shm_size_gb}GB < 64GB)"
- echo "Fix: mount -o remount,size=64G /dev/shm"
- exit 1
- fi
- echo "✅ /dev/shm size OK (${shm_size_gb}GB)"
-
- # Build with Apptainer, forcing rebuild
- DEF_FILE=apptainer/iris.def
- IMAGE_PATH="${HOME}/iris-apptainer-nightly/iris-dev-nightly.sif"
-
- mkdir -p "${HOME}/iris-apptainer-nightly"
-
- echo "Building Apptainer image with latest Triton..."
- apptainer build --force "$IMAGE_PATH" "$DEF_FILE"
-
- echo "Built image: $IMAGE_PATH"
-
- test-nightly-triton:
- name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks)
- needs: build-container-with-latest-triton
+ test-nightly:
+ name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, nightly Triton)
+ needs: build-container-image
runs-on: [self-hosted, mi3xx]
timeout-minutes: 60
strategy:
fail-fast: false
matrix:
include:
- # Test each subdirectory with different rank counts
- test_dir: examples
num_ranks: 1
gpu_devices: "0,1"
@@ -144,27 +114,27 @@ jobs:
run: |
bash .github/scripts/cleanup_ports.sh
- - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks
+ - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton)
run: |
set -e
echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton)"
-
- # Use nightly image path
- NIGHTLY_IMAGE="${HOME}/iris-apptainer-nightly/iris-dev-nightly.sif"
-
+
# Build GPU argument
GPU_ARG=""
if [ -n "${{ matrix.gpu_devices }}" ]; then
GPU_ARG="--gpus ${{ matrix.gpu_devices }}"
fi
-
- # Run tests in container using the nightly image
- .github/scripts/container_exec.sh $GPU_ARG --image "$NIGHTLY_IMAGE" "
+
+ # Run tests in container, reinstalling Triton from main first
+ bash .github/scripts/container_exec.sh $GPU_ARG "
set -e
-
- # tritonBLAS version to use
- TRITONBLAS_COMMIT=\"47768c93acb7f89511d797964b84544c30ab81ad\"
-
+
+ # Reinstall Triton from main branch
+ echo \"Reinstalling Triton from main branch...\"
+ pip install --force-reinstall --no-deps \
+ git+https://github.com/triton-lang/triton@main
+ echo \"Triton version: \$(pip show triton 2>/dev/null | grep Version || echo unknown)\"
+
# Install tritonBLAS if not already installed
echo \"Checking for tritonBLAS...\"
if ! python -c 'import tritonblas' 2>/dev/null; then
@@ -173,11 +143,11 @@ jobs:
if [ ! -d \"\$TRITONBLAS_DIR\" ]; then
git clone https://github.com/ROCm/tritonBLAS.git \"\$TRITONBLAS_DIR\"
cd \"\$TRITONBLAS_DIR\"
- git checkout \"\$TRITONBLAS_COMMIT\"
+ git checkout 47768c93acb7f89511d797964b84544c30ab81ad
else
cd \"\$TRITONBLAS_DIR\"
git fetch
- git checkout \"\$TRITONBLAS_COMMIT\"
+ git checkout 47768c93acb7f89511d797964b84544c30ab81ad
fi
pip install -e .
cd ..
@@ -185,15 +155,17 @@ jobs:
else
echo \"tritonBLAS already installed\"
fi
-
+
+ # Install iris in editable mode
echo \"Installing iris in editable mode\"
pip install -e .
-
+
# Run tests in the specified directory
for test_file in tests/${{ matrix.test_dir }}/test_*.py; do
if [ -f \"\$test_file\" ]; then
echo \"Testing: \$test_file with ${{ matrix.num_ranks }} ranks (nightly Triton)\"
- python tests/run_tests_distributed.py --num_ranks ${{ matrix.num_ranks }} \"\$test_file\" -v --tb=short --durations=10
+ python tests/run_tests_distributed.py \
+ --num_ranks ${{ matrix.num_ranks }} \"\$test_file\" -v --tb=short --durations=10
fi
done
"
From 02457b853abbe1d59f95fff25515135e05ac63a0 Mon Sep 17 00:00:00 2001
From: Muhammad Awad <112003944+mawad-amd@users.noreply.github.com>
Date: Sun, 22 Mar 2026 20:11:26 -0700
Subject: [PATCH 7/9] Sync nightly workflow with current CI patterns
Update to match main's CI infrastructure:
- Runner: linux-mi325-8gpu-ossci-rad (not self-hosted/mi3xx)
- GPU allocation: flock-based acquire_gpus.sh/release_gpus.sh
- Remove hardcoded gpu_devices from matrix
- Remove separate build-container-image job (each test job now runs container_build.sh itself)
- Remove manual tritonBLAS install (now baked into container)
- Launch tests/run_tests_distributed.py via torchrun --rdzv-endpoint=localhost:0 (instead of passing --num_ranks to the script)
- Match timeout-minutes: 180
Co-Authored-By: Claude Opus 4.6
---
.../workflows/iris-nightly-triton-test.yml | 103 +++++-------------
1 file changed, 30 insertions(+), 73 deletions(-)
diff --git a/.github/workflows/iris-nightly-triton-test.yml b/.github/workflows/iris-nightly-triton-test.yml
index 14e0c6c8..10e4d673 100644
--- a/.github/workflows/iris-nightly-triton-test.yml
+++ b/.github/workflows/iris-nightly-triton-test.yml
@@ -14,115 +14,87 @@ permissions:
contents: read
jobs:
- build-container-image:
- runs-on: [self-hosted, mi3xx]
-
- steps:
- - name: Checkout repository
- uses: actions/checkout@v4
-
- - name: Setup Apptainer (if not available)
- run: |
- if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then
- echo "Neither Apptainer nor Docker found, installing Apptainer..."
- apt-get update && apt-get install -y software-properties-common
- add-apt-repository -y ppa:apptainer/ppa
- apt-get update && apt-get install -y apptainer
- else
- echo "Container runtime already available"
- fi
-
- - name: Build Iris container
- run: |
- bash .github/scripts/container_build.sh
-
test-nightly:
name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, nightly Triton)
- needs: build-container-image
- runs-on: [self-hosted, mi3xx]
- timeout-minutes: 60
+ runs-on: [linux-mi325-8gpu-ossci-rad]
+ timeout-minutes: 180
strategy:
fail-fast: false
matrix:
include:
- test_dir: examples
num_ranks: 1
- gpu_devices: "0,1"
- test_dir: examples
num_ranks: 2
- gpu_devices: "2,3"
- test_dir: examples
num_ranks: 4
- gpu_devices: "4,5,6,7"
- test_dir: examples
num_ranks: 8
- gpu_devices: "0,1,2,3,4,5,6,7"
- test_dir: unittests
num_ranks: 1
- gpu_devices: "0,1"
- test_dir: unittests
num_ranks: 2
- gpu_devices: "2,3"
- test_dir: unittests
num_ranks: 4
- gpu_devices: "4,5,6,7"
- test_dir: unittests
num_ranks: 8
- gpu_devices: "0,1,2,3,4,5,6,7"
- test_dir: ccl
num_ranks: 1
- gpu_devices: "0,1"
- test_dir: ccl
num_ranks: 2
- gpu_devices: "2,3"
- test_dir: ccl
num_ranks: 4
- gpu_devices: "4,5,6,7"
- test_dir: ccl
num_ranks: 8
- gpu_devices: "0,1,2,3,4,5,6,7"
- test_dir: x
num_ranks: 1
- gpu_devices: "0,1"
- test_dir: x
num_ranks: 2
- gpu_devices: "2,3"
- test_dir: x
num_ranks: 4
- gpu_devices: "4,5,6,7"
- test_dir: x
num_ranks: 8
- gpu_devices: "0,1,2,3,4,5,6,7"
- test_dir: ops
num_ranks: 1
- gpu_devices: "0,1"
- test_dir: ops
num_ranks: 2
- gpu_devices: "2,3"
- test_dir: ops
num_ranks: 4
- gpu_devices: "4,5,6,7"
- test_dir: ops
num_ranks: 8
- gpu_devices: "0,1,2,3,4,5,6,7"
steps:
- name: Checkout repository
uses: actions/checkout@v4
- - name: Cleanup lingering ports before tests
+ - name: Setup Apptainer (if not available)
run: |
- bash .github/scripts/cleanup_ports.sh
+ if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then
+ echo "Neither Apptainer nor Docker found, installing Apptainer..."
+ apt-get update && apt-get install -y software-properties-common
+ add-apt-repository -y ppa:apptainer/ppa
+ apt-get update && apt-get install -y apptainer
+ else
+ echo "Container runtime already available"
+ fi
+
+ - name: Build Iris container
+ run: |
+ bash .github/scripts/container_build.sh
+
+ - name: Acquire GPUs
+ run: |
+ bash .github/scripts/acquire_gpus.sh "${{ matrix.num_ranks }}"
- name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton)
run: |
set -e
echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton)"
- # Build GPU argument
+ # Build GPU argument (GPU_DEVICES set by acquire_gpus.sh)
GPU_ARG=""
- if [ -n "${{ matrix.gpu_devices }}" ]; then
- GPU_ARG="--gpus ${{ matrix.gpu_devices }}"
+ if [ -n "$GPU_DEVICES" ]; then
+ GPU_ARG="--gpus $GPU_DEVICES"
fi
# Run tests in container, reinstalling Triton from main first
@@ -135,27 +107,6 @@ jobs:
git+https://github.com/triton-lang/triton@main
echo \"Triton version: \$(pip show triton 2>/dev/null | grep Version || echo unknown)\"
- # Install tritonBLAS if not already installed
- echo \"Checking for tritonBLAS...\"
- if ! python -c 'import tritonblas' 2>/dev/null; then
- echo \"Installing tritonBLAS...\"
- TRITONBLAS_DIR=\"./tritonblas_install\"
- if [ ! -d \"\$TRITONBLAS_DIR\" ]; then
- git clone https://github.com/ROCm/tritonBLAS.git \"\$TRITONBLAS_DIR\"
- cd \"\$TRITONBLAS_DIR\"
- git checkout 47768c93acb7f89511d797964b84544c30ab81ad
- else
- cd \"\$TRITONBLAS_DIR\"
- git fetch
- git checkout 47768c93acb7f89511d797964b84544c30ab81ad
- fi
- pip install -e .
- cd ..
- echo \"tritonBLAS installed successfully\"
- else
- echo \"tritonBLAS already installed\"
- fi
-
# Install iris in editable mode
echo \"Installing iris in editable mode\"
pip install -e .
@@ -164,10 +115,16 @@ jobs:
for test_file in tests/${{ matrix.test_dir }}/test_*.py; do
if [ -f \"\$test_file\" ]; then
echo \"Testing: \$test_file with ${{ matrix.num_ranks }} ranks (nightly Triton)\"
- python tests/run_tests_distributed.py \
- --num_ranks ${{ matrix.num_ranks }} \"\$test_file\" -v --tb=short --durations=10
+ torchrun --rdzv-backend=c10d --rdzv-endpoint=localhost:0 \
+ --nnodes=1 --nproc_per_node=${{ matrix.num_ranks }} \
+ tests/run_tests_distributed.py \"\$test_file\" -v --tb=short --durations=10
fi
done
"
echo "::endgroup::"
echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton) passed!"
+
+ - name: Release GPUs
+ if: always()
+ run: |
+ bash .github/scripts/release_gpus.sh
From 6dcb6976a8ffc7fcf29ae5cd284e6043e53ce55e Mon Sep 17 00:00:00 2001
From: Muhammad Awad <112003944+mawad-amd@users.noreply.github.com>
Date: Sun, 22 Mar 2026 20:14:05 -0700
Subject: [PATCH 8/9] Fix nightly cron to midnight Pacific (7 AM UTC)
Co-Authored-By: Claude Opus 4.6
---
.github/workflows/iris-nightly-triton-test.yml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/iris-nightly-triton-test.yml b/.github/workflows/iris-nightly-triton-test.yml
index 10e4d673..65997258 100644
--- a/.github/workflows/iris-nightly-triton-test.yml
+++ b/.github/workflows/iris-nightly-triton-test.yml
@@ -2,8 +2,8 @@ name: Iris Nightly Triton Test
on:
schedule:
- # Run nightly at 2 AM UTC
- - cron: '0 2 * * *'
+ # Run nightly at 7 AM UTC (midnight Pacific during PDT; 11 PM Pacific during PST)
+ - cron: '0 7 * * *'
workflow_dispatch: # Allow manual triggering
concurrency:
From 1c639169e521021481c15f3d65f2ac3753367b19 Mon Sep 17 00:00:00 2001
From: Muhammad Awad <112003944+mawad-amd@users.noreply.github.com>
Date: Sun, 22 Mar 2026 20:32:25 -0700
Subject: [PATCH 9/9] Remove nightly badge from README [skip ci]
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Only the new workflow file remains — no existing files modified.
Co-Authored-By: Claude Opus 4.6
---
README.md | 1 -
1 file changed, 1 deletion(-)
diff --git a/README.md b/README.md
index 6f9a921c..9fa2d55b 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,6 @@ Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
-