Skip to content
Merged
117 changes: 0 additions & 117 deletions .github/workflows/ci-gpu-rdna.yaml

This file was deleted.

74 changes: 32 additions & 42 deletions .github/workflows/ci-gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -103,59 +103,47 @@ jobs:


test:
name: "${{ matrix.os }} :: ${{ matrix.version }} :: Unit Tests and Type Checking"
name: "${{ contains( matrix.os, 'self-hosted') && matrix.os[0] || matrix.os }} :: ${{ matrix.version }} :: Unit Tests and Type Checking"
strategy:
fail-fast: false
matrix:
version: [3.11]
os: [ubuntu-22.04, linux-mi325-1gpu-ossci-iree-org, linux-mi35x-1gpu-ossci-iree-org] # nodai-amdgpu-mi250-x86-64
os: [ubuntu-22.04, linux-mi325-1gpu-ossci-iree-org, linux-mi35x-1gpu-ossci-iree-org, [rdna4, self-hosted, Linux, X64, shark49]] # nodai-amdgpu-mi250-x86-64
runs-on: ${{matrix.os}}
timeout-minutes: 60
needs: build_llvm_linux
if: github.event_name != 'pull_request' || github.event.pull_request.draft == false
env:
VENV_DIR: ${{ github.workspace }}/.wave-venv
IS_CDNA3: ${{ contains(matrix.os, 'mi325') }}
IS_CDNA4: ${{ contains(matrix.os, 'mi35x') }}
IS_RDNA4: ${{ contains(matrix.os, 'rdna4') }}
HAS_GPU: ${{ contains(matrix.os, 'rdna4') || contains(matrix.os, 'mi325') || contains(matrix.os, 'mi35x') }}

steps:
- name: Set environment variables
run: |
if [[ "${{ contains(matrix.os, 'mi325') }}" == 'true' ]]; then
echo "IS_MI325=true" >> $GITHUB_ENV
else
echo "IS_MI325=false" >> $GITHUB_ENV
fi

if [[ "${{ contains(matrix.os, 'mi35x') }}" == 'true' ]]; then
echo "IS_MI35X=true" >> $GITHUB_ENV
else
echo "IS_MI35X=false" >> $GITHUB_ENV
fi

if [[ "${{ contains(matrix.os, 'mi325') }}" == 'true' || "${{ contains(matrix.os, 'mi35x') }}" == 'true' || "${{ contains(matrix.os, 'mi250') }}" == 'true' ]]; then
echo "HAS_GPU=true" >> $GITHUB_ENV
echo "HAS_NO_GPU=false" >> $GITHUB_ENV
else
echo "HAS_GPU=false" >> $GITHUB_ENV
echo "HAS_NO_GPU=true" >> $GITHUB_ENV
fi

- name: Checkout repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
with:
fetch-depth: 0

- name: Print env
run: |
echo "IS_CDNA3=$IS_CDNA3"
echo "IS_CDNA4=$IS_CDNA4"
echo "IS_RDNA4=$IS_RDNA4"
echo "HAS_GPU=$HAS_GPU"

- name: Setup Cache Vars
if: ${{ env.IS_MI325 == 'true' || env.IS_MI35X == 'true' }}
if: ${{ env.IS_CDNA3 == 'true' || env.IS_CDNA4 == 'true' }}
run: |
echo "LLVM_SHA=$(cat $GITHUB_WORKSPACE/water/$LLVM_SHA_FILE)" >> $GITHUB_ENV
echo "WAVE_TEST_WATER=1" >> $GITHUB_ENV
echo "WAVE_BUILD_WATER=1" >> $GITHUB_ENV
echo "WAVE_LLVM_DIR=${GITHUB_WORKSPACE}/llvm-mlir/_mlir_install" >> $GITHUB_ENV

- name: Cache LLVM-MLIR
id: cache-llvm-mlir
uses: actions/cache@9255dc7a253b0ccc959486e2bca901246202afeb # v5.0.1
if: ${{ env.IS_MI325 == 'true' || env.IS_MI35X == 'true' }}
if: ${{ env.IS_CDNA3 == 'true' || env.IS_CDNA4 == 'true' }}
with:
path: llvm-mlir/_mlir_install/**
key: ${{ runner.os }}-build-llvm-${{ env.LLVM_CACHE_NUMBER }}-${{ env.LLVM_SHA }}
Expand All @@ -179,12 +167,13 @@ jobs:
toolchain: stable

- name: "Install dwarfdump"
if: ${{ env.IS_RDNA4 == 'false' }}
run: |
sudo apt-get update
sudo apt-get install -y dwarfdump

- name: Install pip deps
if: ${{ env.HAS_NO_GPU == 'true' }}
if: ${{ env.HAS_GPU == 'false' }}
run: |
python -m pip install --no-compile --upgrade pip
# Note: We install in three steps in order to satisfy requirements
Expand All @@ -194,8 +183,8 @@ jobs:
pip install --no-cache-dir -r requirements-iree-pinned.txt --upgrade
pip install -r requirements.txt -e .

- name: Install pip deps (mi35x)
if: ${{ env.IS_MI35X == 'true' }}
- name: Install pip deps (CDNA4)
if: ${{ env.IS_CDNA4 == 'true' }}
run: |
# Install TheRock
python -m pip install --upgrade pip
Expand All @@ -211,8 +200,8 @@ jobs:
pip install --no-cache-dir -r requirements-iree-pinned.txt --upgrade
pip install -r requirements.txt -e .

- name: Install pip deps (mi250/mi325)
if: ${{ env.HAS_GPU == 'true' && env.IS_MI35X == 'false' }}
- name: Install pip deps (CDNA3/RDNA4)
if: ${{ env.HAS_GPU == 'true' && env.IS_CDNA4 == 'false' }}
run: |
python -m pip install --upgrade pip
pip install -r pytorch-rocm-requirements.txt
Expand All @@ -224,7 +213,7 @@ jobs:
pytest -n 4 --capture=tee-sys -vv ./tests/unittests/

- name: Test TKW runtime related stack on amdgpu
if: ${{ env.HAS_GPU == 'true' && !cancelled() }}
if: ${{ env.HAS_GPU == 'true' }}
run: |
python -c "import torch; print(torch.cuda.get_device_properties().gcnArchName if torch.cuda.is_available() else 'cpu')"
export WAVE_CACHE_DIR=$PWD/.wave
Expand All @@ -233,26 +222,27 @@ jobs:
WAVE_CACHE_ON=1 pytest --timeout=300 --capture=tee-sys -vv --run-e2e --durations=100 ./tests/kernel/runtime

- name: Run e2e tests on AMD GPU
if: ${{ env.HAS_GPU == 'true' && (github.event_name == 'pull_request') && !cancelled() }}
if: ${{ env.HAS_GPU == 'true' && (github.event_name == 'pull_request') }}
run: |
WAVE_CACHE_ON=0 pytest -n 4 --timeout=300 --capture=tee-sys -vv --run-e2e --durations=100 ./tests/kernel/

- name: Run expensive e2e tests on AMD GPU
if: ${{ env.HAS_GPU == 'true' && (github.event_name != 'pull_request') && !cancelled() }}
if: ${{ env.HAS_GPU == 'true' && (github.event_name != 'pull_request') }}
run: |
WAVE_CACHE_ON=0 pytest -n 4 --timeout=600 --capture=tee-sys -vv --run-e2e --run-expensive-tests --durations=100 ./tests/kernel/

- name: Run LIT tests
if: ${{ !cancelled() }}
env:
WAVE_TEST_WATER: ${{ env.IS_CDNA3 == 'true' && '1' || '0' }}
WAVE_TEST_DWARFDUMP: ${{ env.IS_RDNA4 == 'false' && '1' || '0' }}
run: |
if [[ "${{ contains(matrix.os, 'mi35x') }}" == 'true' ]]; then
# TODO: mlir_converter tests segfault on mi35x
export WAVE_TEST_WATER=0
fi
WAVE_TEST_DWARFDUMP=1 lit lit_tests/ -v
# TODO: mlir_converter tests segfault on mi35x
# TODO: can't sudo to install dwarfdump on rdna4
echo "WAVE_TEST_WATER=$WAVE_TEST_WATER"
echo "WAVE_TEST_DWARFDUMP=$WAVE_TEST_DWARFDUMP"
lit lit_tests/ -v

- name: MyPy Type Checking
if: ${{ !cancelled() }}
run: |
mypy

Expand Down