# .github/workflows/iris-nightly-triton-test.yml
#
# Nightly compatibility check: runs each Iris test directory at several rank
# counts against Triton installed from the main branch of triton-lang/triton.
name: Iris Nightly Triton Test

on:
  schedule:
    # GitHub evaluates cron schedules in UTC. 07:00 UTC is midnight Pacific
    # Daylight Time (23:00 the previous day during Pacific Standard Time).
    - cron: '0 7 * * *'
  workflow_dispatch:  # Allow manual triggering

# One concurrency group for the whole workflow: starting a new run (scheduled
# or manual) cancels the run that is still in progress.
concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: true

permissions:
  contents: read

jobs:
  test-nightly:
    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, nightly Triton)
    runs-on: [linux-mi325-8gpu-ossci-rad]
    timeout-minutes: 180
    strategy:
      # Let every combination finish so one failure does not hide the others.
      fail-fast: false
      # Cross product: each test directory runs at each rank count (20 jobs).
      matrix:
        test_dir: [examples, unittests, ccl, x, ops]
        num_ranks: [1, 2, 4, 8]

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # Only installs Apptainer when neither container runtime is on PATH.
      - name: Setup Apptainer (if not available)
        run: |
          if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then
            echo "Neither Apptainer nor Docker found, installing Apptainer..."
            apt-get update && apt-get install -y software-properties-common
            add-apt-repository -y ppa:apptainer/ppa
            apt-get update && apt-get install -y apptainer
          else
            echo "Container runtime already available"
          fi

      - name: Build Iris container
        run: |
          bash .github/scripts/container_build.sh

      - name: Acquire GPUs
        run: |
          bash .github/scripts/acquire_gpus.sh "${{ matrix.num_ranks }}"

      - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton)
        run: |
          set -e
          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton)"

          # Build GPU argument (GPU_DEVICES set by acquire_gpus.sh)
          GPU_ARG=""
          if [ -n "$GPU_DEVICES" ]; then
            GPU_ARG="--gpus $GPU_DEVICES"
          fi

          # The container script is a single double-quoted argument. GitHub
          # expressions are substituted before this shell runs; backslash-escaped
          # dollar signs and quotes are left for the shell inside the container.
          # GPU_ARG is intentionally unquoted so it splits into flag and value.
          bash .github/scripts/container_exec.sh $GPU_ARG "
            set -e

            # Install iris first so that resolving its dependencies cannot
            # replace the Triton build installed afterwards
            echo \"Installing iris in editable mode\"
            pip install -e .

            # Reinstall Triton from main branch and log the version that the
            # tests will actually import
            echo \"Reinstalling Triton from main branch...\"
            pip install --force-reinstall --no-deps \
              git+https://github.com/triton-lang/triton@main
            echo \"Triton version: \$(pip show triton 2>/dev/null | grep Version || echo unknown)\"

            # Run every test file directly under the selected directory
            tests_run=0
            for test_file in tests/${{ matrix.test_dir }}/test_*.py; do
              # Skips the literal pattern when the glob matches nothing
              if [ -f \"\$test_file\" ]; then
                echo \"Testing: \$test_file with ${{ matrix.num_ranks }} ranks (nightly Triton)\"
                torchrun --rdzv-backend=c10d --rdzv-endpoint=localhost:0 \
                  --nnodes=1 --nproc_per_node=${{ matrix.num_ranks }} \
                  tests/run_tests_distributed.py \"\$test_file\" -v --tb=short --durations=10
                tests_run=\$((tests_run + 1))
              fi
            done

            # Surface a run that executed nothing instead of reporting it as
            # an ordinary success
            if [ \"\$tests_run\" -eq 0 ]; then
              echo \"::warning::No test files matched tests/${{ matrix.test_dir }}/test_*.py\"
            fi
          "
          echo "::endgroup::"
          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton) passed!"

      # always() keeps this step running after a failed or cancelled test step,
      # so release_gpus.sh still gets its chance to run.
      - name: Release GPUs
        if: always()
        run: |
          bash .github/scripts/release_gpus.sh