# Iris Nightly Triton Test
#
# Runs the Iris test suites inside the pre-built nightly `iris-dev` container
# across a matrix of test directories and rank counts (1, 2, 4, 8).
#
# NOTE(review): this file was recovered from a web view that carried GitHub's
# "hidden or bidirectional Unicode text" banner - open the original in an
# editor that reveals hidden characters and confirm none remain.
name: Iris Nightly Triton Test

on:
  schedule:
    # GitHub cron is evaluated in UTC: 07:00 UTC is midnight PDT
    # (23:00 the previous day during PST).
    - cron: '0 7 * * *'
  workflow_dispatch: # Allow manual triggering

# Only one run of this workflow at a time; a newer run cancels the one in flight.
concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: true

permissions:
  contents: read

jobs:
  test-nightly:
    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, nightly Triton)
    runs-on: [linux-mi325-8gpu-ossci-rad]
    timeout-minutes: 180
    strategy:
      # Let every matrix combination finish even if one of them fails.
      fail-fast: false
      matrix:
        include:
          - test_dir: examples
            num_ranks: 1
          - test_dir: examples
            num_ranks: 2
          - test_dir: examples
            num_ranks: 4
          - test_dir: examples
            num_ranks: 8
          - test_dir: unittests
            num_ranks: 1
          - test_dir: unittests
            num_ranks: 2
          - test_dir: unittests
            num_ranks: 4
          - test_dir: unittests
            num_ranks: 8
          - test_dir: ccl
            num_ranks: 1
          - test_dir: ccl
            num_ranks: 2
          - test_dir: ccl
            num_ranks: 4
          - test_dir: ccl
            num_ranks: 8
          - test_dir: context
            num_ranks: 1
          - test_dir: context
            num_ranks: 2
          - test_dir: context
            num_ranks: 4
          - test_dir: context
            num_ranks: 8
          - test_dir: ops
            num_ranks: 1
          - test_dir: ops
            num_ranks: 2
          - test_dir: ops
            num_ranks: 4
          - test_dir: ops
            num_ranks: 8
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Apptainer (if not available)
        run: |
          # The pull step below always invokes `apptainer` and produces a SIF
          # image, so check for Apptainer specifically; having only Docker on
          # the runner is not sufficient for this workflow.
          if ! command -v apptainer &> /dev/null; then
            echo "Apptainer not found, installing Apptainer..."
            apt-get update && apt-get install -y software-properties-common
            add-apt-repository -y ppa:apptainer/ppa
            apt-get update && apt-get install -y apptainer
          else
            echo "Apptainer already available"
          fi

      - name: Pull nightly iris-dev image
        run: |
          # Use pre-built image from Nightly Docker Build (has Triton from main)
          NIGHTLY_DIR="${HOME}/iris-apptainer-images/nightly"
          NIGHTLY_SIF="${NIGHTLY_DIR}/iris-dev-nightly.sif"
          LOCK="${NIGHTLY_DIR}/.pull.lock"
          mkdir -p "$NIGHTLY_DIR"
          # Use flock so only the first job pulls; others wait and reuse.
          # File descriptor 200 is opened on the lock file for the subshell and
          # the exclusive lock is released when the subshell exits.
          (
            flock -x 200
            # Pull if SIF is missing or older than 3 hours (10800 seconds).
            # A failing stat falls back to mtime 0, which forces a pull.
            if [ ! -f "$NIGHTLY_SIF" ] || \
               [ "$(( $(date +%s) - $(stat -c %Y "$NIGHTLY_SIF" 2>/dev/null || echo 0) ))" -gt 10800 ]; then
              echo "[INFO] Pulling muhaawad/iris-dev:latest as SIF..."
              apptainer pull --force "$NIGHTLY_SIF" docker://muhaawad/iris-dev:latest
            else
              echo "[INFO] Using cached nightly SIF (< 3h old)"
            fi
          ) 200>"$LOCK"
          # Export the image path so later steps can read $NIGHTLY_SIF.
          echo "NIGHTLY_SIF=$NIGHTLY_SIF" >> "$GITHUB_ENV"

      - name: Acquire GPUs
        run: |
          bash .github/scripts/acquire_gpus.sh "${{ matrix.num_ranks }}"

      - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton)
        run: |
          set -e
          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton)"
          # Build GPU argument (GPU_DEVICES set by acquire_gpus.sh)
          GPU_ARG=""
          if [ -n "$GPU_DEVICES" ]; then
            GPU_ARG="--gpus $GPU_DEVICES"
          fi
          # Run tests in pre-built nightly container (Triton from main already installed).
          # $GPU_ARG is left unquoted on purpose so it splits into flag and value.
          # The quoted script is expanded by this outer shell first: escaped \$ and \"
          # are passed through for the shell inside the container to evaluate.
          bash .github/scripts/container_exec.sh --image "$NIGHTLY_SIF" $GPU_ARG "
            set -e
            echo \"Triton version: \$(pip show triton 2>/dev/null | grep Version || echo unknown)\"
            # Install iris in editable mode
            echo \"Installing iris in editable mode\"
            pip install -e .
            # Run tests in the specified directory
            for test_file in tests/${{ matrix.test_dir }}/test_*.py; do
              if [ -f \"\$test_file\" ]; then
                echo \"Testing: \$test_file with ${{ matrix.num_ranks }} ranks (nightly Triton)\"
                torchrun --rdzv-backend=c10d --rdzv-endpoint=localhost:0 \
                  --nnodes=1 --nproc_per_node=${{ matrix.num_ranks }} \
                  tests/run_tests_distributed.py \"\$test_file\" -v --tb=short --durations=10
              fi
            done
          "
          echo "::endgroup::"
          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton) passed!"

      - name: Release GPUs
        # Run even when earlier steps fail or the run is cancelled.
        if: always()
        run: |
          bash .github/scripts/release_gpus.sh