ROCm · mawad-amd · Mar 23, 2026 · Feb 7, 2026 · Feb 7, 2026 · Feb 7, 2026
@@ -0,0 +1,130 @@
+# Nightly CI workflow: runs on a daily schedule and on manual dispatch.
+name: Iris Nightly Triton Test
+
+on:
+  # Manual runs from the Actions tab.
+  workflow_dispatch:
+  schedule:
+    # 07:00 UTC every day, i.e. midnight Pacific Daylight Time
+    # (23:00 the previous day while Pacific Standard Time is in effect).
+    - cron: "0 7 * * *"
+
+# Read-only access to repository contents is all the token is granted.
+permissions:
+  contents: read
+
+# All runs of this workflow share one concurrency group, so a newly started
+# run cancels the run that is still in progress.
+concurrency:
+  group: ${{ github.workflow }}
+  cancel-in-progress: true
+
+jobs:
+  # One job per (test directory, rank count) pair: 5 directories x 4 rank
+  # counts = 20 jobs. fail-fast is disabled so a failing combination does not
+  # cancel the remaining ones.
+  test-nightly:
+    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, nightly Triton)
+    runs-on: [linux-mi325-8gpu-ossci-rad]
+    timeout-minutes: 180
+    strategy:
+      fail-fast: false
+      matrix:
+        # The two axes are expanded as a cross product, which yields the same
+        # 20 combinations as listing every pair by hand.
+        # test_dir is the sub-directory of tests/ whose test_*.py files are run.
+        test_dir: [examples, unittests, ccl, x, ops]
+        # num_ranks is the number of processes torchrun starts per test file.
+        num_ranks: [1, 2, 4, 8]
+
+    steps:
+      # Fetch the repository so that the helper scripts under .github/scripts
+      # and the tests/ tree are present in the workspace for the later steps.
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      # Make sure a container runtime exists before the container is built.
+      # Finding either Apptainer or Docker on PATH is enough; only when both
+      # are missing is Apptainer installed from its Ubuntu PPA.
+      - name: Setup Apptainer (if not available)
+        run: |
+          if command -v apptainer &> /dev/null || command -v docker &> /dev/null; then
+            echo "Container runtime already available"
+            exit 0
+          fi
+
+          echo "Neither Apptainer nor Docker found, installing Apptainer..."
+
+          # apt needs root privileges: call it directly when the step already
+          # runs as root, go through sudo otherwise, and stop with a clear
+          # message when neither is possible.
+          if [ "$(id -u)" -eq 0 ]; then
+            SUDO=""
+          elif command -v sudo &> /dev/null; then
+            SUDO="sudo"
+          else
+            echo "Error: not running as root and sudo is not available; Apptainer must be preinstalled on this runner."
+            exit 1
+          fi
+
+          # $SUDO is left unquoted on purpose: when empty it expands to nothing.
+          $SUDO apt-get update && $SUDO apt-get install -y software-properties-common
+          $SUDO add-apt-repository -y ppa:apptainer/ppa
+          $SUDO apt-get update && $SUDO apt-get install -y apptainer
+
+      # Runs the repository's container_build.sh helper (going by the step
+      # name, it builds the container used by the test step - confirm in the
+      # script).
+      - name: Build Iris container
+        run: bash .github/scripts/container_build.sh
+
+      # Runs acquire_gpus.sh with this job's rank count as its only argument.
+      - name: Acquire GPUs
+        run: bash .github/scripts/acquire_gpus.sh "${{ matrix.num_ranks }}"
+
+      # Runs every tests/<test_dir>/test_*.py file under torchrun with
+      # <num_ranks> processes. Before the tests, Triton is force-reinstalled
+      # from its main branch and this repository is installed in editable mode.
+      # The whole inner script is handed to container_exec.sh as one quoted
+      # argument. If the glob matches no file, the loop runs nothing and the
+      # step still reports success.
+      - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton)
+        run: |
+          # Abort the step as soon as any command fails.
+          set -e
+          # ::group:: / ::endgroup:: fold the enclosed output in the job log.
+          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton)"
+
+          # Build GPU argument (GPU_DEVICES set by acquire_gpus.sh).
+          # GPU_ARG stays empty when GPU_DEVICES is unset or empty.
+          GPU_ARG=""
+          if [ -n "$GPU_DEVICES" ]; then
+              GPU_ARG="--gpus $GPU_DEVICES"
+          fi
+
+          # Run tests in container, reinstalling Triton from main first.
+          # $GPU_ARG is unquoted, so it splits into separate words and expands
+          # to nothing when empty. Inside the quoted script, \" and \$ are
+          # escaped so that this outer shell passes them through literally;
+          # ${{ ... }} expressions are substituted by GitHub Actions before any
+          # shell runs.
+          bash .github/scripts/container_exec.sh $GPU_ARG "
+              set -e
+
+              # Reinstall Triton from main branch
+              echo \"Reinstalling Triton from main branch...\"
+              pip install --force-reinstall --no-deps \
+                  git+https://github.com/triton-lang/triton@main
+              echo \"Triton version: \$(pip show triton 2>/dev/null | grep Version || echo unknown)\"
+
+              # Install iris in editable mode
+              echo \"Installing iris in editable mode\"
+              pip install -e .
+
+              # Run tests in the specified directory
+              for test_file in tests/${{ matrix.test_dir }}/test_*.py; do
+                  if [ -f \"\$test_file\" ]; then
+                      echo \"Testing: \$test_file with ${{ matrix.num_ranks }} ranks (nightly Triton)\"
+                      torchrun --rdzv-backend=c10d --rdzv-endpoint=localhost:0 \
+                          --nnodes=1 --nproc_per_node=${{ matrix.num_ranks }} \
+                          tests/run_tests_distributed.py \"\$test_file\" -v --tb=short --durations=10
+                  fi
+              done
+          "
+          echo "::endgroup::"
+          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton) passed!"
+
+      # always() keeps this step running even when an earlier step failed or
+      # the job was cancelled, so release_gpus.sh is still invoked.
+      - name: Release GPUs
+        if: ${{ always() }}
+        run: bash .github/scripts/release_gpus.sh