diff --git a/.github/workflows/ci-gpu-rdna.yaml b/.github/workflows/ci-gpu-rdna.yaml
deleted file mode 100644
index c1abc7def..000000000
--- a/.github/workflows/ci-gpu-rdna.yaml
+++ /dev/null
@@ -1,117 +0,0 @@
-# Copyright 2024 The IREE Authors
-#
-# Licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-name: "Wave GPU CI (rdna4)"
-
-on:
-  # manual
-  workflow_dispatch: {}
-  pull_request:
-    types: [opened, synchronize, ready_for_review, converted_to_draft]
-  push:
-    branches:
-      - main
-
-concurrency:
-  # A PR number if a pull request and otherwise the commit hash. This cancels
-  # queued and in-progress runs for the same PR (presubmit) or commit
-  # (postsubmit). The workflow name is prepended to avoid conflicts between
-  # different workflows.
-  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
-  cancel-in-progress: true
-
-jobs:
-  test:
-    name: "${{ matrix.os }} :: ${{ matrix.version }} :: Unit Tests and Type Checking"
-    strategy:
-      fail-fast: false
-      matrix:
-        version: ["3.11"]
-        os: [Shark49]
-    runs-on: [self-hosted, Linux, X64, rdna4, shark49]
-    timeout-minutes: 60
-    if: github.event_name != 'pull_request' || github.event.pull_request.draft == false
-
-    container:
-      image: 'ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:405945a40deaff9db90b9839c0f41d4cba4a383c1a7459b28627047bf6302a26'
-      options: >-
-        --ipc host
-        --group-add 44
-        --group-add 992
-        --device /dev/kfd
-        --device /dev/dri
-        -v "/opt/rocm":"/opt/rocm":ro
-        -e "ROCM_PATH=/opt/rocm"
-        -v "/opt/amdgpu":"/opt/amdgpu":ro
-        -e "LD_LIBRARY_PATH=/opt/rocm/lib:/opt/rocm/lib64:/opt/amdgpu/lib/x86_64-linux-gnu"
-        -e "PATH=/opt/rocm/bin:/opt/rocm/hip/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
-        --security-opt seccomp=unconfined
-
-    defaults:
-      run:
-        shell: bash
-
-    env:
-      VENV_DIR: ${{ github.workspace }}/.wave-venv
-
-    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
-
-      - name: "Setting up Python"
-        id: setup_python
-        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
-        with:
-          python-version: ${{matrix.version}}
-      - name: Create Python venv
-        run: |
-           python3 -m venv ${VENV_DIR}
-           source ${VENV_DIR}/bin/activate
-           echo VIRTUAL_ENV=$VIRTUAL_ENV >> "$GITHUB_ENV"
-           echo "$VENV_DIR/bin" >> "$GITHUB_PATH"
-
-      - name: "Setting up Rust"
-        uses: actions-rust-lang/setup-rust-toolchain@1780873c7b576612439a134613cc4cc74ce5538c # v1.15.2
-        with:
-          toolchain: stable
-
-      - name: Install pip deps
-        run: |
-          # Install User libraries
-          sudo apt-get update
-          sudo apt install -y libnuma1 numactl gfortran build-essential binutils dwarfdump
-
-          # Install torch+rocm6.4
-          python -m pip install --upgrade pip
-          pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/rocm6.4
-
-          # Install wave deps
-          pip install --no-cache-dir -r requirements-iree-pinned.txt --upgrade
-          pip install -r requirements.txt -e .
-
-      - name: Run unit tests
-        run: |
-          pytest -n 4 --capture=tee-sys -vv ./tests/unittests/
-
-      - name: Test TKW runtime related stack on amdgpu
-        run: |
-          export WAVE_CACHE_DIR=$PWD/.wave
-          rm -rf ./.wave
-          nproc
-          WAVE_CACHE_ON=1 pytest --timeout=300 --capture=tee-sys -vv --run-e2e --durations=100 ./tests/kernel/runtime
-
-      - name: Run e2e tests on AMD GPU
-        if: ${{ github.event_name == 'pull_request' }}
-        run: |
-          WAVE_CACHE_ON=0 pytest -n 1 --timeout=300 --capture=tee-sys -vv --run-e2e --durations=100 ./tests/kernel/
-
-      - name: Run expensive e2e tests on AMD GPU
-        if: ${{ (github.event_name != 'pull_request') && !cancelled() }}
-        run: |
-          WAVE_CACHE_ON=0 pytest -n 1 --timeout=600 --capture=tee-sys -vv --run-e2e --run-expensive-tests --durations=100 ./tests/kernel/
-
-      - name: Run LIT tests
-        run: |
-          WAVE_TEST_DWARFDUMP=1 lit lit_tests/ -v
diff --git a/.github/workflows/ci-gpu.yaml b/.github/workflows/ci-gpu.yaml
index 826845307..198724b4c 100644
--- a/.github/workflows/ci-gpu.yaml
+++ b/.github/workflows/ci-gpu.yaml
@@ -103,59 +103,47 @@ jobs:
 
 
   test:
-    name: "${{ matrix.os }} :: ${{ matrix.version }} :: Unit Tests and Type Checking"
+    name: "${{ contains( matrix.os, 'self-hosted') && matrix.os[0] || matrix.os }} :: ${{ matrix.version }} :: Unit Tests and Type Checking"
     strategy:
       fail-fast: false
       matrix:
         version: [3.11]
-        os: [ubuntu-22.04, linux-mi325-1gpu-ossci-iree-org, linux-mi35x-1gpu-ossci-iree-org] # nodai-amdgpu-mi250-x86-64
+        os: [ubuntu-22.04, linux-mi325-1gpu-ossci-iree-org, linux-mi35x-1gpu-ossci-iree-org, [rdna4, self-hosted, Linux, X64, shark49]] # nodai-amdgpu-mi250-x86-64
     runs-on: ${{matrix.os}}
     timeout-minutes: 60
     needs: build_llvm_linux
     if: github.event_name != 'pull_request' || github.event.pull_request.draft == false
     env:
       VENV_DIR: ${{ github.workspace }}/.wave-venv
+      IS_CDNA3: ${{ contains(matrix.os, 'mi325') }}  # 'true' on the mi325 runner label
+      IS_CDNA4: ${{ contains(matrix.os, 'mi35x') }}  # 'true' on the mi35x runner label
+      IS_RDNA4: ${{ contains(matrix.os, 'rdna4') }}  # 'true' for the self-hosted rdna4 label list
+      HAS_GPU: ${{ contains(matrix.os, 'mi325') || contains(matrix.os, 'mi35x') || contains(matrix.os, 'rdna4') }}  # any GPU runner
 
     steps:
-      - name: Set environment variables
-        run: |
-          if [[ "${{ contains(matrix.os, 'mi325') }}" == 'true' ]]; then
-            echo "IS_MI325=true" >> $GITHUB_ENV
-          else
-            echo "IS_MI325=false" >> $GITHUB_ENV
-          fi
-
-          if [[ "${{ contains(matrix.os, 'mi35x') }}" == 'true' ]]; then
-            echo "IS_MI35X=true" >> $GITHUB_ENV
-          else
-            echo "IS_MI35X=false" >> $GITHUB_ENV
-          fi
-
-          if [[ "${{ contains(matrix.os, 'mi325') }}" == 'true' || "${{ contains(matrix.os, 'mi35x') }}" == 'true' || "${{ contains(matrix.os, 'mi250') }}" == 'true' ]]; then
-            echo "HAS_GPU=true" >> $GITHUB_ENV
-            echo "HAS_NO_GPU=false" >> $GITHUB_ENV
-          else
-            echo "HAS_GPU=false" >> $GITHUB_ENV
-            echo "HAS_NO_GPU=true" >> $GITHUB_ENV
-          fi
-
       - name: Checkout repo
         uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
         with:
           fetch-depth: 0
 
+      - name: Print env
+        run: |
+          echo "IS_CDNA3=$IS_CDNA3"
+          echo "IS_CDNA4=$IS_CDNA4"
+          echo "IS_RDNA4=$IS_RDNA4"
+          echo "HAS_GPU=$HAS_GPU"
+
       - name: Setup Cache Vars
-        if: ${{ env.IS_MI325 == 'true' || env.IS_MI35X == 'true' }}
+        if: ${{ env.IS_CDNA3 == 'true' || env.IS_CDNA4 == 'true' }}
         run: |
           echo "LLVM_SHA=$(cat $GITHUB_WORKSPACE/water/$LLVM_SHA_FILE)" >> $GITHUB_ENV
-          echo "WAVE_TEST_WATER=1" >> $GITHUB_ENV
           echo "WAVE_BUILD_WATER=1" >> $GITHUB_ENV
           echo "WAVE_LLVM_DIR=${GITHUB_WORKSPACE}/llvm-mlir/_mlir_install" >> $GITHUB_ENV
 
       - name: Cache LLVM-MLIR
         id: cache-llvm-mlir
         uses: actions/cache@9255dc7a253b0ccc959486e2bca901246202afeb # v5.0.1
-        if: ${{ env.IS_MI325 == 'true' || env.IS_MI35X == 'true' }}
+        if: ${{ env.IS_CDNA3 == 'true' || env.IS_CDNA4 == 'true' }}
         with:
           path: llvm-mlir/_mlir_install/**
           key: ${{ runner.os }}-build-llvm-${{ env.LLVM_CACHE_NUMBER }}-${{ env.LLVM_SHA }}
@@ -179,12 +167,13 @@ jobs:
           toolchain: stable
 
       - name: "Install dwarfdump"
+        if: ${{ env.IS_RDNA4 == 'false' }}  # skipped on rdna4: can't sudo to install packages there
         run: |
           sudo apt-get update
           sudo apt-get install -y dwarfdump
 
       - name: Install pip deps
-        if: ${{ env.HAS_NO_GPU == 'true' }}
+        if: ${{ env.HAS_GPU == 'false' }}
         run: |
           python -m pip install --no-compile --upgrade pip
           # Note: We install in three steps in order to satisfy requirements
@@ -194,8 +183,8 @@ jobs:
           pip install --no-cache-dir -r requirements-iree-pinned.txt --upgrade
           pip install -r requirements.txt -e .
 
-      - name: Install pip deps (mi35x)
-        if: ${{ env.IS_MI35X == 'true' }}
+      - name: Install pip deps (CDNA4)
+        if: ${{ env.IS_CDNA4 == 'true' }}
         run: |
           # Install TheRock
           python -m pip install --upgrade pip
@@ -211,8 +200,8 @@ jobs:
           pip install --no-cache-dir -r requirements-iree-pinned.txt --upgrade
           pip install -r requirements.txt -e .
 
-      - name: Install pip deps (mi250/mi325)
-        if: ${{ env.HAS_GPU == 'true' && env.IS_MI35X == 'false' }}
+      - name: Install pip deps (CDNA3/RDNA4)
+        if: ${{ env.HAS_GPU == 'true' && env.IS_CDNA4 == 'false' }}
         run: |
           python -m pip install --upgrade pip
           pip install -r pytorch-rocm-requirements.txt
@@ -224,7 +213,7 @@ jobs:
           pytest -n 4 --capture=tee-sys -vv ./tests/unittests/
 
       - name: Test TKW runtime related stack on amdgpu
-        if: ${{ env.HAS_GPU == 'true' && !cancelled() }}
+        if: ${{ env.HAS_GPU == 'true' }}
         run: |
           python -c "import torch; print(torch.cuda.get_device_properties().gcnArchName if torch.cuda.is_available() else 'cpu')"
           export WAVE_CACHE_DIR=$PWD/.wave
@@ -233,26 +222,27 @@ jobs:
           WAVE_CACHE_ON=1 pytest --timeout=300 --capture=tee-sys -vv --run-e2e --durations=100 ./tests/kernel/runtime
 
       - name: Run e2e tests on AMD GPU
-        if: ${{ env.HAS_GPU == 'true' && (github.event_name == 'pull_request') && !cancelled() }}
+        if: ${{ env.HAS_GPU == 'true' && (github.event_name == 'pull_request') }}
         run: |
           WAVE_CACHE_ON=0 pytest -n 4 --timeout=300 --capture=tee-sys -vv --run-e2e --durations=100 ./tests/kernel/
 
       - name: Run expensive e2e tests on AMD GPU
-        if: ${{ env.HAS_GPU == 'true' && (github.event_name != 'pull_request') && !cancelled() }}
+        if: ${{ env.HAS_GPU == 'true' && (github.event_name != 'pull_request') }}
         run: |
           WAVE_CACHE_ON=0 pytest -n 4 --timeout=600 --capture=tee-sys -vv --run-e2e --run-expensive-tests --durations=100 ./tests/kernel/
 
       - name: Run LIT tests
-        if: ${{ !cancelled() }}
+        env:
+          # Water LIT tests run on CDNA3 only. TODO: mlir_converter tests segfault on mi35x
+          WAVE_TEST_WATER: ${{ env.IS_CDNA3 == 'true' && '1' || '0' }}
+          # TODO: can't sudo to install dwarfdump on rdna4
+          WAVE_TEST_DWARFDUMP: ${{ env.IS_RDNA4 == 'false' && '1' || '0' }}
         run: |
-          if [[ "${{ contains(matrix.os, 'mi35x') }}" == 'true' ]]; then
-            # TODO: mlir_converter tests segfault on mi35x
-            export WAVE_TEST_WATER=0
-          fi
-          WAVE_TEST_DWARFDUMP=1 lit lit_tests/ -v
+          echo "WAVE_TEST_WATER=$WAVE_TEST_WATER"
+          echo "WAVE_TEST_DWARFDUMP=$WAVE_TEST_DWARFDUMP"
+          lit lit_tests/ -v
 
       - name: MyPy Type Checking
-        if: ${{ !cancelled() }}
         run: |
           mypy