# Iris Nightly Triton Test #61
name: Iris Nightly Triton Test

on:
  schedule:
    # Run nightly at 07:00 UTC, which is midnight Pacific Daylight Time
    # (23:00 the previous day while Pacific Standard Time is in effect).
    - cron: '0 7 * * *'
  workflow_dispatch:  # Allow manual triggering

# All runs of this workflow share one concurrency group, so starting a new
# run cancels any run of this workflow that is still in progress.
concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: true

# Restrict the workflow token to read-only access to repository contents.
permissions:
  contents: read
jobs:
  # Runs each test directory at each rank count inside the pre-built nightly
  # iris-dev container image.
  test-nightly:
    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, nightly Triton)
    runs-on: [linux-mi325-8gpu-ossci-rad]
    timeout-minutes: 180
    strategy:
      # Let every combination run to completion so that one failing
      # combination does not cancel the others.
      fail-fast: false
      # Cross product: every test directory at every rank count (5 x 4 jobs).
      matrix:
        test_dir: [examples, unittests, ccl, context, ops]
        num_ranks: [1, 2, 4, 8]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Apptainer (if not available)
        run: |
          # The pull step below invokes `apptainer` unconditionally, so having
          # only Docker installed is not enough: install Apptainer whenever it
          # is missing.
          if ! command -v apptainer &> /dev/null; then
            echo "Apptainer not found, installing Apptainer..."
            apt-get update && apt-get install -y software-properties-common
            add-apt-repository -y ppa:apptainer/ppa
            apt-get update && apt-get install -y apptainer
          else
            echo "Apptainer already available"
          fi

      - name: Pull nightly iris-dev image
        run: |
          # Use pre-built image from Nightly Docker Build (has Triton from main)
          NIGHTLY_DIR="${HOME}/iris-apptainer-images/nightly"
          NIGHTLY_SIF="${NIGHTLY_DIR}/iris-dev-nightly.sif"
          LOCK="${NIGHTLY_DIR}/.pull.lock"
          MAX_AGE_SECONDS=10800  # 3 hours
          mkdir -p "$NIGHTLY_DIR"
          # Use flock so only the first job pulls; others wait and reuse.
          # NOTE(review): this assumes concurrent jobs share the same $HOME;
          # confirm against the runner setup.
          (
            flock -x 200
            # Pull if SIF is missing or older than 3 hours
            if [ ! -f "$NIGHTLY_SIF" ] || \
               [ "$(( $(date +%s) - $(stat -c %Y "$NIGHTLY_SIF" 2>/dev/null || echo 0) ))" -gt "$MAX_AGE_SECONDS" ]; then
              echo "[INFO] Pulling muhaawad/iris-dev:latest as SIF..."
              # Pull to a temporary path and rename into place, so a failed or
              # partial pull never leaves a fresh-looking broken SIF behind and
              # the cached file is never overwritten in place while in use.
              apptainer pull --force "${NIGHTLY_SIF}.partial" docker://muhaawad/iris-dev:latest
              mv -f "${NIGHTLY_SIF}.partial" "$NIGHTLY_SIF"
            else
              echo "[INFO] Using cached nightly SIF (< 3h old)"
            fi
          ) 200>"$LOCK"
          # Export the image path to the following steps of this job.
          echo "NIGHTLY_SIF=$NIGHTLY_SIF" >> "$GITHUB_ENV"

      - name: Acquire GPUs
        run: |
          bash .github/scripts/acquire_gpus.sh "${{ matrix.num_ranks }}"

      - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton)
        run: |
          set -e
          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton)"
          # Build GPU argument (GPU_DEVICES set by acquire_gpus.sh)
          GPU_ARG=""
          if [ -n "$GPU_DEVICES" ]; then
            GPU_ARG="--gpus $GPU_DEVICES"
          fi
          # Run tests in pre-built nightly container (Triton from main already installed).
          # GPU_ARG is deliberately left unquoted so that it word-splits into
          # separate arguments, or disappears entirely when it is empty.
          # The script is passed as a single double-quoted string: escaped
          # dollar signs and quotes are handed to container_exec.sh literally
          # (presumably to be evaluated inside the container), while the matrix
          # placeholders are substituted by GitHub before this step runs.
          bash .github/scripts/container_exec.sh --image "$NIGHTLY_SIF" $GPU_ARG "
            set -e
            echo \"Triton version: \$(pip show triton 2>/dev/null | grep Version || echo unknown)\"
            # Install iris in editable mode
            echo \"Installing iris in editable mode\"
            pip install -e .
            # Run tests in the specified directory
            ran=0
            for test_file in tests/${{ matrix.test_dir }}/test_*.py; do
              if [ -f \"\$test_file\" ]; then
                ran=\$((ran + 1))
                echo \"Testing: \$test_file with ${{ matrix.num_ranks }} ranks (nightly Triton)\"
                torchrun --rdzv-backend=c10d --rdzv-endpoint=localhost:0 \
                  --nnodes=1 --nproc_per_node=${{ matrix.num_ranks }} \
                  tests/run_tests_distributed.py \"\$test_file\" -v --tb=short --durations=10
              fi
            done
            # Surface an empty test directory instead of passing silently.
            if [ \"\$ran\" -eq 0 ]; then
              echo \"::warning::No test files matched tests/${{ matrix.test_dir }}/test_*.py\"
            fi
          "
          echo "::endgroup::"
          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (nightly Triton) passed!"

      - name: Release GPUs
        # Run the release script even when an earlier step failed.
        if: always()
        run: |
          bash .github/scripts/release_gpus.sh