Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CI] Simplify GPU reset handling #17646

Merged
merged 5 commits into from
Mar 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 0 additions & 12 deletions .github/workflows/sycl-linux-precommit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ jobs:
runner: '["Linux", "gen12"]'
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
target_devices: level_zero:gpu;opencl:gpu;opencl:cpu
reset_intel_gpu: true
extra_lit_opts: --param gpu-intel-gen12=True
- name: NVIDIA/CUDA
runner: '["Linux", "cuda"]'
Expand All @@ -73,46 +72,39 @@ jobs:
runner: '["Linux", "amdgpu"]'
image_options: -u 1001 --device=/dev/dri --device=/dev/kfd
target_devices: hip:gpu
reset_intel_gpu: false
extra_lit_opts: -j 1
- name: Intel Arc A-Series Graphics
runner: '["Linux", "arc"]'
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
target_devices: level_zero:gpu;opencl:gpu;level_zero_v2:gpu
reset_intel_gpu: true
extra_lit_opts: --param matrix-xmx8=True
- name: E2E tests with dev igc on Intel Arc A-Series Graphics
runner: '["Linux", "arc"]'
image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:devigc
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
target_devices: level_zero:gpu;opencl:gpu
reset_intel_gpu: true
extra_lit_opts: --param matrix-xmx8=True
use_igc_dev: true
- name: E2E tests on Intel Ponte Vecchio GPU
runner: '["Linux", "pvc"]'
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
target_devices: level_zero:gpu;opencl:gpu
reset_intel_gpu: true
extra_lit_opts: -j 50
- name: Dev IGC on Intel Ponte Vecchio GPU
runner: '["Linux", "pvc"]'
image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:devigc
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
target_devices: level_zero:gpu;opencl:gpu
reset_intel_gpu: true
use_igc_dev: true
extra_lit_opts: -j 50
- name: Intel Battlemage Graphics
runner: '["Linux", "bmg"]'
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
target_devices: level_zero:gpu
reset_intel_gpu: true
- name: SPIR-V Backend / Intel Battlemage Graphics
runner: '["Linux", "bmg"]'
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
target_devices: level_zero:gpu;opencl:gpu;opencl:cpu
reset_intel_gpu: true
extra_lit_opts: --param spirv-backend=True
e2e_binaries_artifact: sycl_e2e_bin_default_spirv_backend
uses: ./.github/workflows/sycl-linux-run-tests.yml
Expand All @@ -123,7 +115,6 @@ jobs:
image_options: ${{ matrix.image_options }}
target_devices: ${{ matrix.target_devices }}
extra_lit_opts: ${{ matrix.extra_lit_opts }}
reset_intel_gpu: ${{ matrix.reset_intel_gpu }}
repo_ref: ${{ github.sha }}
sycl_toolchain_artifact: sycl_linux_default
sycl_toolchain_archive: ${{ needs.build.outputs.artifact_archive_name }}
Expand Down Expand Up @@ -159,11 +150,9 @@ jobs:
- name: Intel GEN12 Graphics system
runner: '["Linux", "gen12"]'
image_extra_opts: --device=/dev/dri
reset_intel_gpu: true
- name: Intel Arc A-Series Graphics system
runner: '["Linux", "arc"]'
image_extra_opts: --device=/dev/dri
reset_intel_gpu: true
- name: AMD system
runner: '["Linux", "amdgpu"]'
image_extra_opts: --device=/dev/dri --device=/dev/kfd
Expand All @@ -176,7 +165,6 @@ jobs:
runner: ${{ matrix. runner }}
image_options: -u 1001 --privileged --cap-add SYS_ADMIN ${{ matrix.image_extra_opts }}
target_devices: all
reset_intel_gpu: ${{ matrix.reset_intel_gpu }}

env: '{"LIT_FILTER":"PerformanceTests/"}'
extra_lit_opts: -a -j 1 --param enable-perf-tests=True
Expand Down
31 changes: 2 additions & 29 deletions .github/workflows/sycl-linux-run-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,6 @@ on:
type: string
default: 1

reset_intel_gpu:
type: string
required: False
install_igc_driver:
type: string
required: False
Expand Down Expand Up @@ -171,14 +168,6 @@ on:
Extra options to be added to LIT_OPTS.
default: ''

reset_intel_gpu:
description: |
Reset Intel GPUs
type: choice
options:
- false
- true

e2e_testing_mode:
type: choice
options:
Expand All @@ -199,31 +188,15 @@ jobs:
options: ${{ inputs.image_options }}
env: ${{ fromJSON(inputs.env) }}
steps:
- name: Reset Intel GPU
if: inputs.reset_intel_gpu == 'true'
shell: bash
run: |
if [[ '${{ inputs.runner }}' == '["Linux", "bmg"]' ]]; then
sudo bash -c 'echo 0000:05:00.0 > /sys/bus/pci/drivers/xe/unbind'
sudo bash -c 'echo 1 > /sys/bus/pci/devices/0000:05:00.0/reset'
sudo bash -c 'echo 0000:05:00.0 > /sys/bus/pci/drivers/xe/bind'
else
sudo mount -t debugfs none /sys/kernel/debug
base_dir="/sys/kernel/debug/dri"

for dir in "$base_dir"/*; do
if [ -f "$dir/i915_wedged" ]; then
sudo bash -c 'echo 1 > $0/i915_wedged' $dir
fi
done
fi
- uses: actions/checkout@v4
with:
ref: ${{ inputs.devops_ref || inputs.repo_ref }}
sparse-checkout: |
devops
- name: Register cleanup after job is finished
uses: ./devops/actions/cleanup
- name: Reset Intel GPU
uses: ./devops/actions/reset_gpu
- name: Install drivers
if: inputs.install_igc_driver == 'true' || inputs.install_dev_igc_driver == 'true'
env:
Expand Down
10 changes: 0 additions & 10 deletions .github/workflows/sycl-nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -70,31 +70,26 @@ jobs:
runner: '["Linux", "gen12"]'
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
target_devices: level_zero:gpu
reset_intel_gpu: true

- name: Intel L0 Ponte Vecchio GPU
runner: '["Linux", "pvc"]'
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
target_devices: level_zero:gpu
reset_intel_gpu: true

- name: Intel L0 Battlemage GPU
runner: '["Linux", "bmg"]'
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
target_devices: level_zero:gpu
reset_intel_gpu: true

- name: Intel L0 Arc A-Series GPU
runner: '["Linux", "arc"]'
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
target_devices: level_zero:gpu
reset_intel_gpu: true

- name: Intel OCL Gen12 GPU
runner: '["Linux", "gen12"]'
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
target_devices: opencl:gpu
reset_intel_gpu: true

- name: OCL CPU (AMD)
runner: '["Linux", "amdcpu"]'
Expand All @@ -115,7 +110,6 @@ jobs:
runner: '["Linux", "pvc"]'
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
target_devices: level_zero:gpu
reset_intel_gpu: true
extra_lit_opts: --param test-preview-mode=True

uses: ./.github/workflows/sycl-linux-run-tests.yml
Expand All @@ -126,7 +120,6 @@ jobs:
target_devices: ${{ matrix.target_devices }}
tests_selector: e2e
extra_lit_opts: "--param 'cxx_flags=-D_GLIBCXX_USE_CXX11_ABI=0' ${{ matrix.extra_lit_opts }}"
reset_intel_gpu: ${{ matrix.reset_intel_gpu }}
repo_ref: ${{ github.sha }}
sycl_toolchain_artifact: sycl_linux_default
sycl_toolchain_archive: ${{ needs.ubuntu2204_build.outputs.artifact_archive_name }}
Expand All @@ -140,7 +133,6 @@ jobs:
name: Intel PVC L0 oneAPI
runner: '["Linux", "pvc"]'
target_devices: level_zero:gpu
reset_intel_gpu: true
extra_lit_opts: -j 50
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
repo_ref: ${{ github.sha }}
Expand Down Expand Up @@ -302,7 +294,6 @@ jobs:
runner: '["PVC_PERF"]'
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
target_devices: level_zero:gpu
reset_intel_gpu: true
uses: ./.github/workflows/sycl-linux-run-tests.yml
secrets: inherit
with:
Expand All @@ -311,7 +302,6 @@ jobs:
image_options: ${{ matrix.image_options }}
target_devices: ${{ matrix.target_devices }}
tests_selector: compute-benchmarks
reset_intel_gpu: ${{ matrix.reset_intel_gpu }}
repo_ref: ${{ github.sha }}
sycl_toolchain_artifact: sycl_linux_default
sycl_toolchain_archive: ${{ needs.ubuntu2204_build.outputs.artifact_archive_name }}
Expand Down
5 changes: 0 additions & 5 deletions .github/workflows/sycl-post-commit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,9 @@ jobs:
- name: Intel GEN12 Graphics with Level Zero
runner: '["Linux", "gen12"]'
target_devices: level_zero:gpu
reset_intel_gpu: true
- name: Intel Arc A-Series Graphics with Level Zero
runner: '["Linux", "arc"]'
extra_lit_opts: --param matrix-xmx8=True
reset_intel_gpu: true
# Performance tests below. Specifics:
# - only run performance tests (use LIT_FILTER env)
# - ask llvm-lit to show all the output, even for PASS (-a)
Expand All @@ -69,20 +67,17 @@ jobs:
env: '{"LIT_FILTER":"PerformanceTests/"}'
extra_lit_opts: -a -j 1 --param enable-perf-tests=True
target_devices: all
reset_intel_gpu: true
- name: Perf tests on Intel Arc A-Series Graphics system
runner: '["Linux", "arc"]'
env: '{"LIT_FILTER":"PerformanceTests/"}'
extra_lit_opts: -a -j 1 --param enable-perf-tests=True
target_devices: all
reset_intel_gpu: true
uses: ./.github/workflows/sycl-linux-run-tests.yml
with:
name: ${{ matrix.name }}
runner: ${{ matrix. runner }}
image_options: ${{ matrix.image_options || '-u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN' }}
target_devices: ${{ matrix.target_devices || 'level_zero:gpu' }}
reset_intel_gpu: ${{ matrix.reset_intel_gpu }}

extra_lit_opts: ${{ matrix.extra_lit_opts }}
env: ${{ matrix.env || '{}' }}
Expand Down
3 changes: 0 additions & 3 deletions .github/workflows/sycl-rel-nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,15 +63,13 @@ jobs:
runner: '["Linux", "gen12"]'
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
target_devices: level_zero:gpu
reset_intel_gpu: true
tests_selector: e2e
extra_lit_opts: --param gpu-intel-gen12=True

- name: Intel OCL GPU
runner: '["Linux", "gen12"]'
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
target_devices: opencl:gpu
reset_intel_gpu: true
tests_selector: e2e
extra_lit_opts: --param gpu-intel-gen12=True

Expand All @@ -88,7 +86,6 @@ jobs:
target_devices: ${{ matrix.target_devices }}
tests_selector: ${{ matrix.tests_selector }}
extra_lit_opts: ${{ matrix.extra_lit_opts }}
reset_intel_gpu: ${{ matrix.reset_intel_gpu }}
repo_ref: sycl-rel-6_1_0
devops_ref: sycl
sycl_toolchain_artifact: sycl_linux_default
Expand Down
32 changes: 32 additions & 0 deletions devops/actions/reset_gpu/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Composite action that attempts to reset every Intel GPU visible on the
# runner before tests start. It takes no inputs and is safe to call on
# machines without Intel GPUs: each phase is skipped when the relevant
# sysfs/debugfs entries are absent.
#
# Two phases, one per kernel driver:
#   1. xe   - unbind the device from the driver, write 1 to the PCI `reset`
#             attribute, then bind the device back to the driver.
#   2. i915 - write 1 to the per-device `i915_wedged` debugfs node.
name: Reset Intel GPU

runs:
  using: "composite"
  steps:
    - name: Reset Intel GPU
      shell: bash
      run: |
        # First reset all xe devices.
        driver_path="/sys/bus/pci/drivers/xe"
        # Entries named like a PCI address (dddd:bb:dd.f) are the devices
        # currently bound to the driver; everything else in the directory
        # (bind, unbind, new_id, ...) is skipped.
        pci_addr_re='^[0-9a-f]{4}:[0-9a-f]{2}:[0-9a-f]{2}\.[0-9]$'

        if [ -d "$driver_path" ]; then
          for entry in "$driver_path"/*; do
            device=$(basename "$entry")
            [[ $device =~ $pci_addr_re ]] || continue

            sysfs_path="/sys/bus/pci/devices/$device"
            # The redirections must happen as root, so they run inside
            # `sudo bash -c`. Arguments after the script become $0, $1, ...
            # of the child shell; the script itself is single-quoted so the
            # outer shell does not expand anything in it.
            sudo bash -c 'echo "$0" > "$1/unbind"' "$device" "$driver_path"
            sudo bash -c 'echo 1 > "$0/reset"' "$sysfs_path"
            sudo bash -c 'echo "$0" > "$1/bind"' "$device" "$driver_path"
            echo "Reset $device"
          done
        fi

        # Then reset all i915 devices. We don't do a PCI FLR because
        # it seems to fail on some older GPUs.
        # Mounting is best-effort: debugfs may already be mounted.
        sudo mount -t debugfs none /sys/kernel/debug || true
        # Listing needs root; tolerate a missing dri directory.
        dirs=$(sudo bash -c 'ls -d /sys/kernel/debug/dri/*') || true
        for dir in $dirs; do
          if sudo test -e "${dir}/i915_wedged"; then
            # The directory is passed as $0 of the child shell. Referring to
            # ${dir} inside the single-quoted script would expand to nothing
            # there and redirect to /i915_wedged in the filesystem root.
            sudo bash -c 'echo 1 > "$0/i915_wedged"' "$dir"
            echo "Reset $(basename "$dir")"
          fi
        done
Loading