From 8fcabdea0a026666a77557b4fe685a9c9e6c6c46 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 7 Feb 2026 14:22:55 +0000 Subject: [PATCH 1/9] Initial plan From 0455e1b8ab0a23d99762681f3dfe9ccea1a98238 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 7 Feb 2026 14:25:19 +0000 Subject: [PATCH 2/9] Add nightly Triton testing workflow Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- .../workflows/iris-nightly-triton-test.yml | 195 ++++++++++++++++++ 1 file changed, 195 insertions(+) create mode 100644 .github/workflows/iris-nightly-triton-test.yml diff --git a/.github/workflows/iris-nightly-triton-test.yml b/.github/workflows/iris-nightly-triton-test.yml new file mode 100644 index 00000000..2f533bb5 --- /dev/null +++ b/.github/workflows/iris-nightly-triton-test.yml @@ -0,0 +1,195 @@ +name: Iris Nightly Triton Test + +on: + schedule: + # Run nightly at 2 AM UTC + - cron: '0 2 * * *' + workflow_dispatch: # Allow manual triggering + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build-container-with-latest-triton: + runs-on: [self-hosted, mi3xx] + timeout-minutes: 120 # Building with latest Triton may take longer + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Setup Apptainer (if not available) + run: | + if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then + echo "Neither Apptainer nor Docker found, installing Apptainer..." + apt-get update && apt-get install -y software-properties-common + add-apt-repository -y ppa:apptainer/ppa + apt-get update && apt-get install -y apptainer + else + echo "Container runtime already available" + fi + + - name: Modify Apptainer def file to use latest Triton + run: | + echo "Modifying apptainer/iris.def to use latest Triton from main branch" + # Replace the specific commit checkout with main branch + sed -i 's/git checkout bcbcabdd0cff6539c7168299075992b2a23ff38e/git checkout main/' apptainer/iris.def + echo "Modified iris.def:" + grep -A2 -B2 "git checkout" apptainer/iris.def + + - name: Build Iris container with latest Triton + run: | + set -e + + # Check /dev/shm size + shm_size_gb=$(df -k /dev/shm | tail -1 | awk '{print int($2/1024/1024)}') + if [ "${shm_size_gb:-0}" -lt 64 ]; then + echo "❌ ERROR: /dev/shm is too small (${shm_size_gb}GB < 64GB)" + echo "Fix: mount -o remount,size=64G /dev/shm" + exit 1 + fi + echo "✅ /dev/shm size OK (${shm_size_gb}GB)" + + # Build with Apptainer, forcing rebuild + DEF_FILE=apptainer/iris.def + IMAGE_PATH="${HOME}/iris-apptainer-nightly/iris-dev-nightly.sif" + + mkdir -p "${HOME}/iris-apptainer-nightly" + + echo "Building Apptainer image with latest Triton..." + apptainer build --force "$IMAGE_PATH" "$DEF_FILE" + + echo "Built image: $IMAGE_PATH" + + test-nightly-triton: + name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks) + needs: build-container-with-latest-triton + runs-on: [self-hosted, mi3xx] + timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + include: + # Test each subdirectory with different rank counts + - test_dir: examples + num_ranks: 1 + gpu_devices: "0,1" + - test_dir: examples + num_ranks: 2 + gpu_devices: "2,3" + - test_dir: examples + num_ranks: 4 + gpu_devices: "4,5,6,7" + - test_dir: examples + num_ranks: 8 + gpu_devices: "0,1,2,3,4,5,6,7" + - test_dir: unittests + num_ranks: 1 + gpu_devices: "0,1" + - test_dir: unittests + num_ranks: 2 + gpu_devices: "2,3" + - test_dir: unittests + num_ranks: 4 + gpu_devices: "4,5,6,7" + - test_dir: unittests + num_ranks: 8 + gpu_devices: "0,1,2,3,4,5,6,7" + - test_dir: ccl + num_ranks: 1 + gpu_devices: "0,1" + - test_dir: ccl + num_ranks: 2 + gpu_devices: "2,3" + - test_dir: ccl + num_ranks: 4 + gpu_devices: "4,5,6,7" + - test_dir: ccl + num_ranks: 8 + gpu_devices: "0,1,2,3,4,5,6,7" + - test_dir: x + num_ranks: 1 + gpu_devices: "0,1" + - test_dir: x + num_ranks: 2 + gpu_devices: "2,3" + - test_dir: x + num_ranks: 4 + gpu_devices: "4,5,6,7" + - test_dir: x + num_ranks: 8 + gpu_devices: "0,1,2,3,4,5,6,7" + - test_dir: ops + num_ranks: 1 + gpu_devices: "0,1" + - test_dir: ops + num_ranks: 2 + gpu_devices: "2,3" + - test_dir: ops + num_ranks: 4 + gpu_devices: "4,5,6,7" + - test_dir: ops + num_ranks: 8 + gpu_devices: "0,1,2,3,4,5,6,7" + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Cleanup lingering ports before tests + run: | + bash .github/scripts/cleanup_ports.sh + + - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks + run: | + set -e + echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton)" + + # Use nightly image path + NIGHTLY_IMAGE="${HOME}/iris-apptainer-nightly/iris-dev-nightly.sif" + + # Build GPU argument + GPU_ARG="" + if [ -n "${{ matrix.gpu_devices }}" ]; then + GPU_ARG="--gpus ${{ matrix.gpu_devices }}" + fi + + # Run tests in container using the nightly image + .github/scripts/container_exec.sh $GPU_ARG --image "$NIGHTLY_IMAGE" " + set -e + + # Install tritonBLAS if not already installed + echo \"Checking for tritonBLAS...\" + if ! python -c 'import tritonblas' 2>/dev/null; then + echo \"Installing tritonBLAS...\" + TRITONBLAS_DIR=\"./tritonblas_install\" + if [ ! -d \"\$TRITONBLAS_DIR\" ]; then + git clone https://github.com/ROCm/tritonBLAS.git \"\$TRITONBLAS_DIR\" + cd \"\$TRITONBLAS_DIR\" + git checkout 47768c93acb7f89511d797964b84544c30ab81ad + else + cd \"\$TRITONBLAS_DIR\" + git fetch + git checkout 47768c93acb7f89511d797964b84544c30ab81ad + fi + pip install -e . + cd .. + echo \"tritonBLAS installed successfully\" + else + echo \"tritonBLAS already installed\" + fi + + echo \"Installing iris in editable mode\" + pip install -e . + + # Run tests in the specified directory + for test_file in tests/${{ matrix.test_dir }}/test_*.py; do + if [ -f \"\$test_file\" ]; then + echo \"Testing: \$test_file with ${{ matrix.num_ranks }} ranks (nightly Triton)\" + python tests/run_tests_distributed.py --num_ranks ${{ matrix.num_ranks }} \"\$test_file\" -v --tb=short --durations=10 + fi + done + " + echo "::endgroup::" + echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton) passed!" From 5a800539e953c9720a526da1d1a2b65a5148d16a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 7 Feb 2026 14:25:56 +0000 Subject: [PATCH 3/9] Add nightly Triton test badge to README Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 9fa2d55b..6f9a921c 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. License: MIT Ruff Iris Tests + Nightly Triton Tests DOI DOI

From 006afdb706a289c1b1ee0bf8ee9f71bc37444650 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 7 Feb 2026 14:27:07 +0000 Subject: [PATCH 4/9] Improve nightly workflow robustness per code review Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- .github/workflows/iris-nightly-triton-test.yml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/iris-nightly-triton-test.yml b/.github/workflows/iris-nightly-triton-test.yml index 2f533bb5..7317759e 100644 --- a/.github/workflows/iris-nightly-triton-test.yml +++ b/.github/workflows/iris-nightly-triton-test.yml @@ -33,8 +33,8 @@ jobs: - name: Modify Apptainer def file to use latest Triton run: | echo "Modifying apptainer/iris.def to use latest Triton from main branch" - # Replace the specific commit checkout with main branch - sed -i 's/git checkout bcbcabdd0cff6539c7168299075992b2a23ff38e/git checkout main/' apptainer/iris.def + # Replace any 40-character hex commit checkout with main branch + sed -i 's/git checkout [0-9a-f]\{40\}/git checkout main/' apptainer/iris.def echo "Modified iris.def:" grep -A2 -B2 "git checkout" apptainer/iris.def @@ -159,6 +159,9 @@ jobs: .github/scripts/container_exec.sh $GPU_ARG --image "$NIGHTLY_IMAGE" " set -e + # tritonBLAS version to use + TRITONBLAS_COMMIT=\"47768c93acb7f89511d797964b84544c30ab81ad\" + # Install tritonBLAS if not already installed echo \"Checking for tritonBLAS...\" if ! python -c 'import tritonblas' 2>/dev/null; then @@ -167,11 +170,11 @@ jobs: if [ ! -d \"\$TRITONBLAS_DIR\" ]; then git clone https://github.com/ROCm/tritonBLAS.git \"\$TRITONBLAS_DIR\" cd \"\$TRITONBLAS_DIR\" - git checkout 47768c93acb7f89511d797964b84544c30ab81ad + git checkout \"\$TRITONBLAS_COMMIT\" else cd \"\$TRITONBLAS_DIR\" git fetch - git checkout 47768c93acb7f89511d797964b84544c30ab81ad + git checkout \"\$TRITONBLAS_COMMIT\" fi pip install -e . cd .. From 4125f82faa421bef02c740631de289ca9aa2c588 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 7 Feb 2026 14:28:38 +0000 Subject: [PATCH 5/9] Add explicit permissions to nightly workflow for security Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- .github/workflows/iris-nightly-triton-test.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/iris-nightly-triton-test.yml b/.github/workflows/iris-nightly-triton-test.yml index 7317759e..c9d1d698 100644 --- a/.github/workflows/iris-nightly-triton-test.yml +++ b/.github/workflows/iris-nightly-triton-test.yml @@ -10,6 +10,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true +permissions: + contents: read + jobs: build-container-with-latest-triton: runs-on: [self-hosted, mi3xx] From 587d5bcfa8c1042a75dcfbe8c4feb0b3a688da0c Mon Sep 17 00:00:00 2001 From: Muhammad Awad <112003944+mawad-amd@users.noreply.github.com> Date: Sun, 22 Mar 2026 20:04:14 -0700 Subject: [PATCH 6/9] Rewrite nightly workflow to reuse existing CI container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of rebuilding the container with Triton from main (slow, fragile), reuse the existing cached CI container and reinstall Triton from main at test time via pip. This means: - Zero changes to build infrastructure - Each test job is independent (parallel-safe, own container overlay) - Uses container_build.sh for cache hits, container_exec.sh for execution - Same test matrix as iris-tests.yml (5 dirs × 4 ranks) - Editable install only (no git/pip variants needed for nightly) Co-Authored-By: Claude Opus 4.6 --- .../workflows/iris-nightly-triton-test.yml | 80 ++++++------------- 1 file changed, 26 insertions(+), 54 deletions(-) diff --git a/.github/workflows/iris-nightly-triton-test.yml b/.github/workflows/iris-nightly-triton-test.yml index c9d1d698..14e0c6c8 100644 --- a/.github/workflows/iris-nightly-triton-test.yml +++ b/.github/workflows/iris-nightly-triton-test.yml @@ -7,16 +7,15 @@ on: workflow_dispatch: # Allow manual triggering concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }} cancel-in-progress: true permissions: contents: read jobs: - build-container-with-latest-triton: + build-container-image: runs-on: [self-hosted, mi3xx] - timeout-minutes: 120 # Building with latest Triton may take longer steps: - name: Checkout repository @@ -33,48 +32,19 @@ jobs: echo "Container runtime already available" fi - - name: Modify Apptainer def file to use latest Triton + - name: Build Iris container run: | - echo "Modifying apptainer/iris.def to use latest Triton from main branch" - # Replace any 40-character hex commit checkout with main branch - sed -i 's/git checkout [0-9a-f]\{40\}/git checkout main/' apptainer/iris.def - echo "Modified iris.def:" - grep -A2 -B2 "git checkout" apptainer/iris.def + bash .github/scripts/container_build.sh - - name: Build Iris container with latest Triton - run: | - set -e - - # Check /dev/shm size - shm_size_gb=$(df -k /dev/shm | tail -1 | awk '{print int($2/1024/1024)}') - if [ "${shm_size_gb:-0}" -lt 64 ]; then - echo "❌ ERROR: /dev/shm is too small (${shm_size_gb}GB < 64GB)" - echo "Fix: mount -o remount,size=64G /dev/shm" - exit 1 - fi - echo "✅ /dev/shm size OK (${shm_size_gb}GB)" - - # Build with Apptainer, forcing rebuild - DEF_FILE=apptainer/iris.def - IMAGE_PATH="${HOME}/iris-apptainer-nightly/iris-dev-nightly.sif" - - mkdir -p "${HOME}/iris-apptainer-nightly" - - echo "Building Apptainer image with latest Triton..." - apptainer build --force "$IMAGE_PATH" "$DEF_FILE" - - echo "Built image: $IMAGE_PATH" - - test-nightly-triton: - name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks) - needs: build-container-with-latest-triton + test-nightly: + name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, nightly Triton) + needs: build-container-image runs-on: [self-hosted, mi3xx] timeout-minutes: 60 strategy: fail-fast: false matrix: include: - # Test each subdirectory with different rank counts - test_dir: examples num_ranks: 1 gpu_devices: "0,1" @@ -144,27 +114,27 @@ jobs: run: | bash .github/scripts/cleanup_ports.sh - - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks + - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton) run: | set -e echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton)" - - # Use nightly image path - NIGHTLY_IMAGE="${HOME}/iris-apptainer-nightly/iris-dev-nightly.sif" - + # Build GPU argument GPU_ARG="" if [ -n "${{ matrix.gpu_devices }}" ]; then GPU_ARG="--gpus ${{ matrix.gpu_devices }}" fi - - # Run tests in container using the nightly image - .github/scripts/container_exec.sh $GPU_ARG --image "$NIGHTLY_IMAGE" " + + # Run tests in container, reinstalling Triton from main first + bash .github/scripts/container_exec.sh $GPU_ARG " set -e - - # tritonBLAS version to use - TRITONBLAS_COMMIT=\"47768c93acb7f89511d797964b84544c30ab81ad\" - + + # Reinstall Triton from main branch + echo \"Reinstalling Triton from main branch...\" + pip install --force-reinstall --no-deps \ + git+https://github.com/triton-lang/triton@main + echo \"Triton version: \$(pip show triton 2>/dev/null | grep Version || echo unknown)\" + # Install tritonBLAS if not already installed echo \"Checking for tritonBLAS...\" if ! python -c 'import tritonblas' 2>/dev/null; then @@ -173,11 +143,11 @@ jobs: if [ ! -d \"\$TRITONBLAS_DIR\" ]; then git clone https://github.com/ROCm/tritonBLAS.git \"\$TRITONBLAS_DIR\" cd \"\$TRITONBLAS_DIR\" - git checkout \"\$TRITONBLAS_COMMIT\" + git checkout 47768c93acb7f89511d797964b84544c30ab81ad else cd \"\$TRITONBLAS_DIR\" git fetch - git checkout \"\$TRITONBLAS_COMMIT\" + git checkout 47768c93acb7f89511d797964b84544c30ab81ad fi pip install -e . cd .. @@ -185,15 +155,17 @@ jobs: else echo \"tritonBLAS already installed\" fi - + + # Install iris in editable mode echo \"Installing iris in editable mode\" pip install -e . - + # Run tests in the specified directory for test_file in tests/${{ matrix.test_dir }}/test_*.py; do if [ -f \"\$test_file\" ]; then echo \"Testing: \$test_file with ${{ matrix.num_ranks }} ranks (nightly Triton)\" - python tests/run_tests_distributed.py --num_ranks ${{ matrix.num_ranks }} \"\$test_file\" -v --tb=short --durations=10 + python tests/run_tests_distributed.py \ + --num_ranks ${{ matrix.num_ranks }} \"\$test_file\" -v --tb=short --durations=10 fi done " From 02457b853abbe1d59f95fff25515135e05ac63a0 Mon Sep 17 00:00:00 2001 From: Muhammad Awad <112003944+mawad-amd@users.noreply.github.com> Date: Sun, 22 Mar 2026 20:11:26 -0700 Subject: [PATCH 7/9] Sync nightly workflow with current CI patterns Update to match main's CI infrastructure: - Runner: linux-mi325-8gpu-ossci-rad (not self-hosted/mi3xx) - GPU allocation: flock-based acquire_gpus.sh/release_gpus.sh - Remove hardcoded gpu_devices from matrix - Remove separate build-container-image job (each job builds its own) - Remove manual tritonBLAS install (now baked into container) - Use torchrun --rdzv-endpoint=localhost:0 (not run_tests_distributed --num_ranks) - Match timeout-minutes: 180 Co-Authored-By: Claude Opus 4.6 --- .../workflows/iris-nightly-triton-test.yml | 103 +++++------------- 1 file changed, 30 insertions(+), 73 deletions(-) diff --git a/.github/workflows/iris-nightly-triton-test.yml b/.github/workflows/iris-nightly-triton-test.yml index 14e0c6c8..10e4d673 100644 --- a/.github/workflows/iris-nightly-triton-test.yml +++ b/.github/workflows/iris-nightly-triton-test.yml @@ -14,115 +14,87 @@ permissions: contents: read jobs: - build-container-image: - runs-on: [self-hosted, mi3xx] - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Setup Apptainer (if not available) - run: | - if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then - echo "Neither Apptainer nor Docker found, installing Apptainer..." - apt-get update && apt-get install -y software-properties-common - add-apt-repository -y ppa:apptainer/ppa - apt-get update && apt-get install -y apptainer - else - echo "Container runtime already available" - fi - - - name: Build Iris container - run: | - bash .github/scripts/container_build.sh - test-nightly: name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, nightly Triton) - needs: build-container-image - runs-on: [self-hosted, mi3xx] - timeout-minutes: 60 + runs-on: [linux-mi325-8gpu-ossci-rad] + timeout-minutes: 180 strategy: fail-fast: false matrix: include: - test_dir: examples num_ranks: 1 - gpu_devices: "0,1" - test_dir: examples num_ranks: 2 - gpu_devices: "2,3" - test_dir: examples num_ranks: 4 - gpu_devices: "4,5,6,7" - test_dir: examples num_ranks: 8 - gpu_devices: "0,1,2,3,4,5,6,7" - test_dir: unittests num_ranks: 1 - gpu_devices: "0,1" - test_dir: unittests num_ranks: 2 - gpu_devices: "2,3" - test_dir: unittests num_ranks: 4 - gpu_devices: "4,5,6,7" - test_dir: unittests num_ranks: 8 - gpu_devices: "0,1,2,3,4,5,6,7" - test_dir: ccl num_ranks: 1 - gpu_devices: "0,1" - test_dir: ccl num_ranks: 2 - gpu_devices: "2,3" - test_dir: ccl num_ranks: 4 - gpu_devices: "4,5,6,7" - test_dir: ccl num_ranks: 8 - gpu_devices: "0,1,2,3,4,5,6,7" - test_dir: x num_ranks: 1 - gpu_devices: "0,1" - test_dir: x num_ranks: 2 - gpu_devices: "2,3" - test_dir: x num_ranks: 4 - gpu_devices: "4,5,6,7" - test_dir: x num_ranks: 8 - gpu_devices: "0,1,2,3,4,5,6,7" - test_dir: ops num_ranks: 1 - gpu_devices: "0,1" - test_dir: ops num_ranks: 2 - gpu_devices: "2,3" - test_dir: ops num_ranks: 4 - gpu_devices: "4,5,6,7" - test_dir: ops num_ranks: 8 - gpu_devices: "0,1,2,3,4,5,6,7" steps: - name: Checkout repository uses: actions/checkout@v4 - - name: Cleanup lingering ports before tests + - name: Setup Apptainer (if not available) run: | - bash .github/scripts/cleanup_ports.sh + if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then + echo "Neither Apptainer nor Docker found, installing Apptainer..." + apt-get update && apt-get install -y software-properties-common + add-apt-repository -y ppa:apptainer/ppa + apt-get update && apt-get install -y apptainer + else + echo "Container runtime already available" + fi + + - name: Build Iris container + run: | + bash .github/scripts/container_build.sh + + - name: Acquire GPUs + run: | + bash .github/scripts/acquire_gpus.sh "${{ matrix.num_ranks }}" - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton) run: | set -e echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton)" - # Build GPU argument + # Build GPU argument (GPU_DEVICES set by acquire_gpus.sh) GPU_ARG="" - if [ -n "${{ matrix.gpu_devices }}" ]; then - GPU_ARG="--gpus ${{ matrix.gpu_devices }}" + if [ -n "$GPU_DEVICES" ]; then + GPU_ARG="--gpus $GPU_DEVICES" fi # Run tests in container, reinstalling Triton from main first @@ -135,27 +107,6 @@ jobs: git+https://github.com/triton-lang/triton@main echo \"Triton version: \$(pip show triton 2>/dev/null | grep Version || echo unknown)\" - # Install tritonBLAS if not already installed - echo \"Checking for tritonBLAS...\" - if ! python -c 'import tritonblas' 2>/dev/null; then - echo \"Installing tritonBLAS...\" - TRITONBLAS_DIR=\"./tritonblas_install\" - if [ ! -d \"\$TRITONBLAS_DIR\" ]; then - git clone https://github.com/ROCm/tritonBLAS.git \"\$TRITONBLAS_DIR\" - cd \"\$TRITONBLAS_DIR\" - git checkout 47768c93acb7f89511d797964b84544c30ab81ad - else - cd \"\$TRITONBLAS_DIR\" - git fetch - git checkout 47768c93acb7f89511d797964b84544c30ab81ad - fi - pip install -e . - cd .. - echo \"tritonBLAS installed successfully\" - else - echo \"tritonBLAS already installed\" - fi - # Install iris in editable mode echo \"Installing iris in editable mode\" pip install -e . @@ -164,10 +115,16 @@ jobs: for test_file in tests/${{ matrix.test_dir }}/test_*.py; do if [ -f \"\$test_file\" ]; then echo \"Testing: \$test_file with ${{ matrix.num_ranks }} ranks (nightly Triton)\" - python tests/run_tests_distributed.py \ - --num_ranks ${{ matrix.num_ranks }} \"\$test_file\" -v --tb=short --durations=10 + torchrun --rdzv-backend=c10d --rdzv-endpoint=localhost:0 \ + --nnodes=1 --nproc_per_node=${{ matrix.num_ranks }} \ + tests/run_tests_distributed.py \"\$test_file\" -v --tb=short --durations=10 fi done " echo "::endgroup::" echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton) passed!" + + - name: Release GPUs + if: always() + run: | + bash .github/scripts/release_gpus.sh From 6dcb6976a8ffc7fcf29ae5cd284e6043e53ce55e Mon Sep 17 00:00:00 2001 From: Muhammad Awad <112003944+mawad-amd@users.noreply.github.com> Date: Sun, 22 Mar 2026 20:14:05 -0700 Subject: [PATCH 8/9] Fix nightly cron to midnight Pacific (7 AM UTC) Co-Authored-By: Claude Opus 4.6 --- .github/workflows/iris-nightly-triton-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/iris-nightly-triton-test.yml b/.github/workflows/iris-nightly-triton-test.yml index 10e4d673..65997258 100644 --- a/.github/workflows/iris-nightly-triton-test.yml +++ b/.github/workflows/iris-nightly-triton-test.yml @@ -2,8 +2,8 @@ name: Iris Nightly Triton Test on: schedule: - # Run nightly at 2 AM UTC - - cron: '0 2 * * *' + # Run nightly at midnight Pacific (7 AM UTC / PDT) + - cron: '0 7 * * *' workflow_dispatch: # Allow manual triggering concurrency: From 1c639169e521021481c15f3d65f2ac3753367b19 Mon Sep 17 00:00:00 2001 From: Muhammad Awad <112003944+mawad-amd@users.noreply.github.com> Date: Sun, 22 Mar 2026 20:32:25 -0700 Subject: [PATCH 9/9] Remove nightly badge from README [skip ci] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only the new workflow file remains — no existing files modified. Co-Authored-By: Claude Opus 4.6 --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 6f9a921c..9fa2d55b 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,6 @@ Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. License: MIT Ruff Iris Tests - Nightly Triton Tests DOI DOI