diff --git a/.github/workflows/ci-gpu-rdna.yaml b/.github/workflows/ci-gpu-rdna.yaml deleted file mode 100644 index c1abc7def..000000000 --- a/.github/workflows/ci-gpu-rdna.yaml +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright 2024 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -name: "Wave GPU CI (rdna4)" - -on: - # manual - workflow_dispatch: {} - pull_request: - types: [opened, synchronize, ready_for_review, converted_to_draft] - push: - branches: - - main - -concurrency: - # A PR number if a pull request and otherwise the commit hash. This cancels - # queued and in-progress runs for the same PR (presubmit) or commit - # (postsubmit). The workflow name is prepended to avoid conflicts between - # different workflows. - group: ${{ github.workflow }}-${{ github.event.number || github.sha }} - cancel-in-progress: true - -jobs: - test: - name: "${{ matrix.os }} :: ${{ matrix.version }} :: Unit Tests and Type Checking" - strategy: - fail-fast: false - matrix: - version: ["3.11"] - os: [Shark49] - runs-on: [self-hosted, Linux, X64, rdna4, shark49] - timeout-minutes: 60 - if: github.event_name != 'pull_request' || github.event.pull_request.draft == false - - container: - image: 'ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:405945a40deaff9db90b9839c0f41d4cba4a383c1a7459b28627047bf6302a26' - options: >- - --ipc host - --group-add 44 - --group-add 992 - --device /dev/kfd - --device /dev/dri - -v "/opt/rocm":"/opt/rocm":ro - -e "ROCM_PATH=/opt/rocm" - -v "/opt/amdgpu":"/opt/amdgpu":ro - -e "LD_LIBRARY_PATH=/opt/rocm/lib:/opt/rocm/lib64:/opt/amdgpu/lib/x86_64-linux-gnu" - -e "PATH=/opt/rocm/bin:/opt/rocm/hip/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" - --security-opt seccomp=unconfined - - defaults: - run: - shell: bash - - env: - VENV_DIR: ${{ github.workspace }}/.wave-venv - - steps: - - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - - - name: "Setting up Python" - id: setup_python - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 - with: - python-version: ${{matrix.version}} - - name: Create Python venv - run: | - python3 -m venv ${VENV_DIR} - source ${VENV_DIR}/bin/activate - echo VIRTUAL_ENV=$VIRTUAL_ENV >> "$GITHUB_ENV" - echo "$VENV_DIR/bin" >> "$GITHUB_PATH" - - - name: "Setting up Rust" - uses: actions-rust-lang/setup-rust-toolchain@1780873c7b576612439a134613cc4cc74ce5538c # v1.15.2 - with: - toolchain: stable - - - name: Install pip deps - run: | - # Install User libraries - sudo apt-get update - sudo apt install -y libnuma1 numactl gfortran build-essential binutils dwarfdump - - # Install torch+rocm6.4 - python -m pip install --upgrade pip - pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/rocm6.4 - - # Install wave deps - pip install --no-cache-dir -r requirements-iree-pinned.txt --upgrade - pip install -r requirements.txt -e . - - - name: Run unit tests - run: | - pytest -n 4 --capture=tee-sys -vv ./tests/unittests/ - - - name: Test TKW runtime related stack on amdgpu - run: | - export WAVE_CACHE_DIR=$PWD/.wave - rm -rf ./.wave - nproc - WAVE_CACHE_ON=1 pytest --timeout=300 --capture=tee-sys -vv --run-e2e --durations=100 ./tests/kernel/runtime - - - name: Run e2e tests on AMD GPU - if: ${{ github.event_name == 'pull_request' }} - run: | - WAVE_CACHE_ON=0 pytest -n 1 --timeout=300 --capture=tee-sys -vv --run-e2e --durations=100 ./tests/kernel/ - - - name: Run expensive e2e tests on AMD GPU - if: ${{ (github.event_name != 'pull_request') && !cancelled() }} - run: | - WAVE_CACHE_ON=0 pytest -n 1 --timeout=600 --capture=tee-sys -vv --run-e2e --run-expensive-tests --durations=100 ./tests/kernel/ - - - name: Run LIT tests - run: | - WAVE_TEST_DWARFDUMP=1 lit lit_tests/ -v diff --git a/.github/workflows/ci-gpu.yaml b/.github/workflows/ci-gpu.yaml index 826845307..198724b4c 100644 --- a/.github/workflows/ci-gpu.yaml +++ b/.github/workflows/ci-gpu.yaml @@ -103,59 +103,47 @@ jobs: test: - name: "${{ matrix.os }} :: ${{ matrix.version }} :: Unit Tests and Type Checking" + name: "${{ contains( matrix.os, 'self-hosted') && matrix.os[0] || matrix.os }} :: ${{ matrix.version }} :: Unit Tests and Type Checking" strategy: fail-fast: false matrix: version: [3.11] - os: [ubuntu-22.04, linux-mi325-1gpu-ossci-iree-org, linux-mi35x-1gpu-ossci-iree-org] # nodai-amdgpu-mi250-x86-64 + os: [ubuntu-22.04, linux-mi325-1gpu-ossci-iree-org, linux-mi35x-1gpu-ossci-iree-org, [rdna4, self-hosted, Linux, X64, shark49]] # nodai-amdgpu-mi250-x86-64 runs-on: ${{matrix.os}} timeout-minutes: 60 needs: build_llvm_linux if: github.event_name != 'pull_request' || github.event.pull_request.draft == false env: VENV_DIR: ${{ github.workspace }}/.wave-venv + IS_CDNA3: ${{ contains(matrix.os, 'mi325') }} + IS_CDNA4: ${{ contains(matrix.os, 'mi35x') }} + IS_RDNA4: ${{ contains(matrix.os, 'rdna4') }} + HAS_GPU: ${{ contains(matrix.os, 'rdna4') || contains(matrix.os, 'mi325') || contains(matrix.os, 'mi35x') }} steps: - - name: Set environment variables - run: | - if [[ "${{ contains(matrix.os, 'mi325') }}" == 'true' ]]; then - echo "IS_MI325=true" >> $GITHUB_ENV - else - echo "IS_MI325=false" >> $GITHUB_ENV - fi - - if [[ "${{ contains(matrix.os, 'mi35x') }}" == 'true' ]]; then - echo "IS_MI35X=true" >> $GITHUB_ENV - else - echo "IS_MI35X=false" >> $GITHUB_ENV - fi - - if [[ "${{ contains(matrix.os, 'mi325') }}" == 'true' || "${{ contains(matrix.os, 'mi35x') }}" == 'true' || "${{ contains(matrix.os, 'mi250') }}" == 'true' ]]; then - echo "HAS_GPU=true" >> $GITHUB_ENV - echo "HAS_NO_GPU=false" >> $GITHUB_ENV - else - echo "HAS_GPU=false" >> $GITHUB_ENV - echo "HAS_NO_GPU=true" >> $GITHUB_ENV - fi - - name: Checkout repo uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 with: fetch-depth: 0 + - name: Print env + run: | + echo "IS_CDNA3=$IS_CDNA3" + echo "IS_CDNA4=$IS_CDNA4" + echo "IS_RDNA4=$IS_RDNA4" + echo "HAS_GPU=$HAS_GPU" + - name: Setup Cache Vars - if: ${{ env.IS_MI325 == 'true' || env.IS_MI35X == 'true' }} + if: ${{ env.IS_CDNA3 == 'true' || env.IS_CDNA4 == 'true' }} run: | echo "LLVM_SHA=$(cat $GITHUB_WORKSPACE/water/$LLVM_SHA_FILE)" >> $GITHUB_ENV - echo "WAVE_TEST_WATER=1" >> $GITHUB_ENV echo "WAVE_BUILD_WATER=1" >> $GITHUB_ENV echo "WAVE_LLVM_DIR=${GITHUB_WORKSPACE}/llvm-mlir/_mlir_install" >> $GITHUB_ENV - name: Cache LLVM-MLIR id: cache-llvm-mlir uses: actions/cache@9255dc7a253b0ccc959486e2bca901246202afeb # v5.0.1 - if: ${{ env.IS_MI325 == 'true' || env.IS_MI35X == 'true' }} + if: ${{ env.IS_CDNA3 == 'true' || env.IS_CDNA4 == 'true' }} with: path: llvm-mlir/_mlir_install/** key: ${{ runner.os }}-build-llvm-${{ env.LLVM_CACHE_NUMBER }}-${{ env.LLVM_SHA }} @@ -179,12 +167,13 @@ jobs: toolchain: stable - name: "Install dwarfdump" + if: ${{ env.IS_RDNA4 == 'false' }} run: | sudo apt-get update sudo apt-get install -y dwarfdump - name: Install pip deps - if: ${{ env.HAS_NO_GPU == 'true' }} + if: ${{ env.HAS_GPU == 'false' }} run: | python -m pip install --no-compile --upgrade pip # Note: We install in three steps in order to satisfy requirements @@ -194,8 +183,8 @@ jobs: pip install --no-cache-dir -r requirements-iree-pinned.txt --upgrade pip install -r requirements.txt -e . - - name: Install pip deps (mi35x) - if: ${{ env.IS_MI35X == 'true' }} + - name: Install pip deps (CDNA4) + if: ${{ env.IS_CDNA4 == 'true' }} run: | # Install TheRock python -m pip install --upgrade pip @@ -211,8 +200,8 @@ jobs: pip install --no-cache-dir -r requirements-iree-pinned.txt --upgrade pip install -r requirements.txt -e . - - name: Install pip deps (mi250/mi325) - if: ${{ env.HAS_GPU == 'true' && env.IS_MI35X == 'false' }} + - name: Install pip deps (CDNA3/RDNA4) + if: ${{ env.HAS_GPU == 'true' && env.IS_CDNA4 == 'false' }} run: | python -m pip install --upgrade pip pip install -r pytorch-rocm-requirements.txt @@ -224,7 +213,7 @@ jobs: pytest -n 4 --capture=tee-sys -vv ./tests/unittests/ - name: Test TKW runtime related stack on amdgpu - if: ${{ env.HAS_GPU == 'true' && !cancelled() }} + if: ${{ env.HAS_GPU == 'true' }} run: | python -c "import torch; print(torch.cuda.get_device_properties().gcnArchName if torch.cuda.is_available() else 'cpu')" export WAVE_CACHE_DIR=$PWD/.wave @@ -233,26 +222,27 @@ jobs: WAVE_CACHE_ON=1 pytest --timeout=300 --capture=tee-sys -vv --run-e2e --durations=100 ./tests/kernel/runtime - name: Run e2e tests on AMD GPU - if: ${{ env.HAS_GPU == 'true' && (github.event_name == 'pull_request') && !cancelled() }} + if: ${{ env.HAS_GPU == 'true' && (github.event_name == 'pull_request') }} run: | WAVE_CACHE_ON=0 pytest -n 4 --timeout=300 --capture=tee-sys -vv --run-e2e --durations=100 ./tests/kernel/ - name: Run expensive e2e tests on AMD GPU - if: ${{ env.HAS_GPU == 'true' && (github.event_name != 'pull_request') && !cancelled() }} + if: ${{ env.HAS_GPU == 'true' && (github.event_name != 'pull_request') }} run: | WAVE_CACHE_ON=0 pytest -n 4 --timeout=600 --capture=tee-sys -vv --run-e2e --run-expensive-tests --durations=100 ./tests/kernel/ - name: Run LIT tests - if: ${{ !cancelled() }} + env: + WAVE_TEST_WATER: ${{ env.IS_CDNA3 == 'true' && '1' || '0' }} + WAVE_TEST_DWARFDUMP: ${{ env.IS_RDNA4 == 'false' && '1' || '0' }} run: | - if [[ "${{ contains(matrix.os, 'mi35x') }}" == 'true' ]]; then - # TODO: mlir_converter tests segfault on mi35x - export WAVE_TEST_WATER=0 - fi - WAVE_TEST_DWARFDUMP=1 lit lit_tests/ -v + # TODO: mlir_converter tests segfault on mi35x + # TODO: can't sudo to install dwarfdump on rdna4 + echo "WAVE_TEST_WATER=$WAVE_TEST_WATER" + echo "WAVE_TEST_DWARFDUMP=$WAVE_TEST_DWARFDUMP" + lit lit_tests/ -v - name: MyPy Type Checking - if: ${{ !cancelled() }} run: | mypy