Skip to content

Commit ad8a855

Browse files
committed
[CI] Simplify GPU reset handling
Signed-off-by: Sarnie, Nick <[email protected]>
1 parent 8fec832 commit ad8a855

File tree

6 files changed

+34
-59
lines changed

6 files changed

+34
-59
lines changed

.github/workflows/sycl-linux-precommit.yml

-12
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,6 @@ jobs:
6363
runner: '["Linux", "gen12"]'
6464
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
6565
target_devices: level_zero:gpu;opencl:gpu;opencl:cpu
66-
reset_intel_gpu: true
6766
extra_lit_opts: --param gpu-intel-gen12=True
6867
- name: NVIDIA/CUDA
6968
runner: '["Linux", "cuda"]'
@@ -73,46 +72,39 @@ jobs:
7372
runner: '["Linux", "amdgpu"]'
7473
image_options: -u 1001 --device=/dev/dri --device=/dev/kfd
7574
target_devices: hip:gpu
76-
reset_intel_gpu: false
7775
extra_lit_opts: -j 1
7876
- name: Intel Arc A-Series Graphics
7977
runner: '["Linux", "arc"]'
8078
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
8179
target_devices: level_zero:gpu;opencl:gpu;level_zero_v2:gpu
82-
reset_intel_gpu: true
8380
extra_lit_opts: --param matrix-xmx8=True
8481
- name: E2E tests with dev igc on Intel Arc A-Series Graphics
8582
runner: '["Linux", "arc"]'
8683
image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:devigc
8784
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
8885
target_devices: level_zero:gpu;opencl:gpu
89-
reset_intel_gpu: true
9086
extra_lit_opts: --param matrix-xmx8=True
9187
use_igc_dev: true
9288
- name: E2E tests on Intel Ponte Vecchio GPU
9389
runner: '["Linux", "pvc"]'
9490
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
9591
target_devices: level_zero:gpu;opencl:gpu
96-
reset_intel_gpu: true
9792
extra_lit_opts: -j 50
9893
- name: Dev IGC on Intel Ponte Vecchio GPU
9994
runner: '["Linux", "pvc"]'
10095
image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:devigc
10196
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
10297
target_devices: level_zero:gpu;opencl:gpu
103-
reset_intel_gpu: true
10498
use_igc_dev: true
10599
extra_lit_opts: -j 50
106100
- name: Intel Battlemage Graphics
107101
runner: '["Linux", "bmg"]'
108102
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
109103
target_devices: level_zero:gpu
110-
reset_intel_gpu: true
111104
- name: SPIR-V Backend / Intel Battlemage Graphics
112105
runner: '["Linux", "bmg"]'
113106
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
114107
target_devices: level_zero:gpu;opencl:gpu;opencl:cpu
115-
reset_intel_gpu: true
116108
extra_lit_opts: --param spirv-backend=True
117109
e2e_binaries_artifact: sycl_e2e_bin_default_spirv_backend
118110
uses: ./.github/workflows/sycl-linux-run-tests.yml
@@ -123,7 +115,6 @@ jobs:
123115
image_options: ${{ matrix.image_options }}
124116
target_devices: ${{ matrix.target_devices }}
125117
extra_lit_opts: ${{ matrix.extra_lit_opts }}
126-
reset_intel_gpu: ${{ matrix.reset_intel_gpu }}
127118
repo_ref: ${{ github.sha }}
128119
sycl_toolchain_artifact: sycl_linux_default
129120
sycl_toolchain_archive: ${{ needs.build.outputs.artifact_archive_name }}
@@ -159,11 +150,9 @@ jobs:
159150
- name: Intel GEN12 Graphics system
160151
runner: '["Linux", "gen12"]'
161152
image_extra_opts: --device=/dev/dri
162-
reset_intel_gpu: true
163153
- name: Intel Arc A-Series Graphics system
164154
runner: '["Linux", "arc"]'
165155
image_extra_opts: --device=/dev/dri
166-
reset_intel_gpu: true
167156
- name: AMD system
168157
runner: '["Linux", "amdgpu"]'
169158
image_extra_opts: --device=/dev/dri --device=/dev/kfd
@@ -176,7 +165,6 @@ jobs:
176165
runner: ${{ matrix. runner }}
177166
image_options: -u 1001 --privileged --cap-add SYS_ADMIN ${{ matrix.image_extra_opts }}
178167
target_devices: all
179-
reset_intel_gpu: ${{ matrix.reset_intel_gpu }}
180168

181169
env: '{"LIT_FILTER":"PerformanceTests/"}'
182170
extra_lit_opts: -a -j 1 --param enable-perf-tests=True

.github/workflows/sycl-linux-run-tests.yml

+2-29
Original file line numberDiff line numberDiff line change
@@ -82,9 +82,6 @@ on:
8282
type: string
8383
default: 1
8484

85-
reset_intel_gpu:
86-
type: string
87-
required: False
8885
install_igc_driver:
8986
type: string
9087
required: False
@@ -171,14 +168,6 @@ on:
171168
Extra options to be added to LIT_OPTS.
172169
default: ''
173170

174-
reset_intel_gpu:
175-
description: |
176-
Reset Intel GPUs
177-
type: choice
178-
options:
179-
- false
180-
- true
181-
182171
e2e_testing_mode:
183172
type: choice
184173
options:
@@ -199,31 +188,15 @@ jobs:
199188
options: ${{ inputs.image_options }}
200189
env: ${{ fromJSON(inputs.env) }}
201190
steps:
202-
- name: Reset Intel GPU
203-
if: inputs.reset_intel_gpu == 'true'
204-
shell: bash
205-
run: |
206-
if [[ '${{ inputs.runner }}' == '["Linux", "bmg"]' ]]; then
207-
sudo bash -c 'echo 0000:05:00.0 > /sys/bus/pci/drivers/xe/unbind'
208-
sudo bash -c 'echo 1 > /sys/bus/pci/devices/0000:05:00.0/reset'
209-
sudo bash -c 'echo 0000:05:00.0 > /sys/bus/pci/drivers/xe/bind'
210-
else
211-
sudo mount -t debugfs none /sys/kernel/debug
212-
base_dir="/sys/kernel/debug/dri"
213-
214-
for dir in "$base_dir"/*; do
215-
if [ -f "$dir/i915_wedged" ]; then
216-
sudo bash -c 'echo 1 > $0/i915_wedged' $dir
217-
fi
218-
done
219-
fi
220191
- uses: actions/checkout@v4
221192
with:
222193
ref: ${{ inputs.devops_ref || inputs.repo_ref }}
223194
sparse-checkout: |
224195
devops
225196
- name: Register cleanup after job is finished
226197
uses: ./devops/actions/cleanup
198+
- name: Reset Intel GPU
199+
uses: ./devops/actions/reset_gpu
227200
- name: Install drivers
228201
if: inputs.install_igc_driver == 'true' || inputs.install_dev_igc_driver == 'true'
229202
env:

.github/workflows/sycl-nightly.yml

-10
Original file line numberDiff line numberDiff line change
@@ -70,31 +70,26 @@ jobs:
7070
runner: '["Linux", "gen12"]'
7171
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
7272
target_devices: level_zero:gpu
73-
reset_intel_gpu: true
7473

7574
- name: Intel L0 Ponte Vecchio GPU
7675
runner: '["Linux", "pvc"]'
7776
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
7877
target_devices: level_zero:gpu
79-
reset_intel_gpu: true
8078

8179
- name: Intel L0 Battlemage GPU
8280
runner: '["Linux", "bmg"]'
8381
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
8482
target_devices: level_zero:gpu
85-
reset_intel_gpu: true
8683

8784
- name: Intel L0 Arc A-Series GPU
8885
runner: '["Linux", "arc"]'
8986
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
9087
target_devices: level_zero:gpu
91-
reset_intel_gpu: true
9288

9389
- name: Intel OCL Gen12 GPU
9490
runner: '["Linux", "gen12"]'
9591
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
9692
target_devices: opencl:gpu
97-
reset_intel_gpu: true
9893

9994
- name: OCL CPU (AMD)
10095
runner: '["Linux", "amdcpu"]'
@@ -115,7 +110,6 @@ jobs:
115110
runner: '["Linux", "pvc"]'
116111
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
117112
target_devices: level_zero:gpu
118-
reset_intel_gpu: true
119113
extra_lit_opts: --param test-preview-mode=True
120114

121115
uses: ./.github/workflows/sycl-linux-run-tests.yml
@@ -126,7 +120,6 @@ jobs:
126120
target_devices: ${{ matrix.target_devices }}
127121
tests_selector: e2e
128122
extra_lit_opts: "--param 'cxx_flags=-D_GLIBCXX_USE_CXX11_ABI=0' ${{ matrix.extra_lit_opts }}"
129-
reset_intel_gpu: ${{ matrix.reset_intel_gpu }}
130123
repo_ref: ${{ github.sha }}
131124
sycl_toolchain_artifact: sycl_linux_default
132125
sycl_toolchain_archive: ${{ needs.ubuntu2204_build.outputs.artifact_archive_name }}
@@ -140,7 +133,6 @@ jobs:
140133
name: Intel PVC L0 oneAPI
141134
runner: '["Linux", "pvc"]'
142135
target_devices: level_zero:gpu
143-
reset_intel_gpu: true
144136
extra_lit_opts: -j 50
145137
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
146138
repo_ref: ${{ github.sha }}
@@ -302,7 +294,6 @@ jobs:
302294
runner: '["PVC_PERF"]'
303295
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
304296
target_devices: level_zero:gpu
305-
reset_intel_gpu: true
306297
uses: ./.github/workflows/sycl-linux-run-tests.yml
307298
secrets: inherit
308299
with:
@@ -311,7 +302,6 @@ jobs:
311302
image_options: ${{ matrix.image_options }}
312303
target_devices: ${{ matrix.target_devices }}
313304
tests_selector: compute-benchmarks
314-
reset_intel_gpu: ${{ matrix.reset_intel_gpu }}
315305
repo_ref: ${{ github.sha }}
316306
sycl_toolchain_artifact: sycl_linux_default
317307
sycl_toolchain_archive: ${{ needs.ubuntu2204_build.outputs.artifact_archive_name }}

.github/workflows/sycl-post-commit.yml

-5
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,9 @@ jobs:
5353
- name: Intel GEN12 Graphics with Level Zero
5454
runner: '["Linux", "gen12"]'
5555
target_devices: level_zero:gpu
56-
reset_intel_gpu: true
5756
- name: Intel Arc A-Series Graphics with Level Zero
5857
runner: '["Linux", "arc"]'
5958
extra_lit_opts: --param matrix-xmx8=True
60-
reset_intel_gpu: true
6159
# Performance tests below. Specifics:
6260
# - only run performance tests (use LIT_FILTER env)
6361
# - ask llvm-lit to show all the output, even for PASS (-a)
@@ -69,20 +67,17 @@ jobs:
6967
env: '{"LIT_FILTER":"PerformanceTests/"}'
7068
extra_lit_opts: -a -j 1 --param enable-perf-tests=True
7169
target_devices: all
72-
reset_intel_gpu: true
7370
- name: Perf tests on Intel Arc A-Series Graphics system
7471
runner: '["Linux", "arc"]'
7572
env: '{"LIT_FILTER":"PerformanceTests/"}'
7673
extra_lit_opts: -a -j 1 --param enable-perf-tests=True
7774
target_devices: all
78-
reset_intel_gpu: true
7975
uses: ./.github/workflows/sycl-linux-run-tests.yml
8076
with:
8177
name: ${{ matrix.name }}
8278
runner: ${{ matrix. runner }}
8379
image_options: ${{ matrix.image_options || '-u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN' }}
8480
target_devices: ${{ matrix.target_devices || 'level_zero:gpu' }}
85-
reset_intel_gpu: ${{ matrix.reset_intel_gpu }}
8681

8782
extra_lit_opts: ${{ matrix.extra_lit_opts }}
8883
env: ${{ matrix.env || '{}' }}

.github/workflows/sycl-rel-nightly.yml

-3
Original file line numberDiff line numberDiff line change
@@ -63,15 +63,13 @@ jobs:
6363
runner: '["Linux", "gen12"]'
6464
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
6565
target_devices: level_zero:gpu
66-
reset_intel_gpu: true
6766
tests_selector: e2e
6867
extra_lit_opts: --param gpu-intel-gen12=True
6968

7069
- name: Intel OCL GPU
7170
runner: '["Linux", "gen12"]'
7271
image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
7372
target_devices: opencl:gpu
74-
reset_intel_gpu: true
7573
tests_selector: e2e
7674
extra_lit_opts: --param gpu-intel-gen12=True
7775

@@ -88,7 +86,6 @@ jobs:
8886
target_devices: ${{ matrix.target_devices }}
8987
tests_selector: ${{ matrix.tests_selector }}
9088
extra_lit_opts: ${{ matrix.extra_lit_opts }}
91-
reset_intel_gpu: ${{ matrix.reset_intel_gpu }}
9289
repo_ref: sycl-rel-6_1_0
9390
devops_ref: sycl
9491
sycl_toolchain_artifact: sycl_linux_default

devops/actions/reset_gpu/action.yml

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
name: Reset Intel GPU
description: >-
  Reset every Intel GPU found on the runner: devices bound to the xe driver
  through a PCI reset, and i915 devices through the debugfs i915_wedged file.

runs:
  using: "composite"
  steps:
    - name: Reset Intel GPU
      shell: bash
      run: |
        # Set to 1 when any xe device fails to reset, so the step can still
        # fail once every device has been handed back to its driver.
        reset_failed=0

        # First reset all xe devices.
        driver_path="/sys/bus/pci/drivers/xe"
        # Shape of a PCI address (domain:bus:device.function); used to pick
        # the device entries out of the other entries in the driver directory.
        pci_addr_re='^[0-9a-f]{4}:[0-9a-f]{2}:[0-9a-f]{2}\.[0-9]$'

        if [ -d "$driver_path" ]; then
          # Walk the driver directory with a glob rather than parsing `ls`
          # output; anything that is not a PCI address is skipped.
          for entry in "$driver_path"/*; do
            device="${entry##*/}"
            [[ "$device" =~ $pci_addr_re ]] || continue

            sysfs_path="/sys/bus/pci/devices/$device"
            sudo bash -c 'echo "$0" > "$1/unbind"' "$device" "$driver_path"

            # The step must not abort between unbind and bind: a device left
            # unbound no longer shows up under $driver_path, so later runs
            # of this action would never rebind it. Record the failure and
            # always attempt the bind.
            reset_ok=true
            sudo bash -c 'echo 1 > "$0/reset"' "$sysfs_path" || reset_ok=false

            sudo bash -c 'echo "$0" > "$1/bind"' "$device" "$driver_path"

            if [ "$reset_ok" = true ]; then
              echo "Reset $device"
            else
              echo "Failed to reset $device" >&2
              reset_failed=1
            fi
          done
        fi

        # Then reset all i915 devices. We don't do a PCI FLR because
        # it seems to fail on some older GPUs.
        # The mount is best-effort: it fails harmlessly when debugfs is
        # already mounted.
        sudo mount -t debugfs none /sys/kernel/debug || true
        base_dir="/sys/kernel/debug/dri"
        for dir in "$base_dir"/*; do
          # Only DRM nodes exposing i915_wedged are handled here; an
          # unmatched glob stays literal and is filtered out by this test.
          if [ -f "$dir/i915_wedged" ]; then
            sudo bash -c 'echo 1 > "$0/i915_wedged"' "$dir"
            echo "Reset $(basename "$dir")"
          fi
        done

        # Surface xe reset failures only after all devices were rebound.
        if [ "$reset_failed" -ne 0 ]; then
          exit 1
        fi

0 commit comments

Comments
 (0)