From ad8a8556ad715e246922551d846d55f88dfcb7ca Mon Sep 17 00:00:00 2001 From: "Sarnie, Nick" Date: Tue, 25 Mar 2025 13:20:24 -0700 Subject: [PATCH 1/5] [CI] Simplify GPU reset handling Signed-off-by: Sarnie, Nick --- .github/workflows/sycl-linux-precommit.yml | 12 -------- .github/workflows/sycl-linux-run-tests.yml | 31 ++------------------- .github/workflows/sycl-nightly.yml | 10 ------- .github/workflows/sycl-post-commit.yml | 5 ---- .github/workflows/sycl-rel-nightly.yml | 3 -- devops/actions/reset_gpu/action.yml | 32 ++++++++++++++++++++++ 6 files changed, 34 insertions(+), 59 deletions(-) create mode 100644 devops/actions/reset_gpu/action.yml diff --git a/.github/workflows/sycl-linux-precommit.yml b/.github/workflows/sycl-linux-precommit.yml index 48e4befe65b0e..b1ea2fdb03eb6 100644 --- a/.github/workflows/sycl-linux-precommit.yml +++ b/.github/workflows/sycl-linux-precommit.yml @@ -63,7 +63,6 @@ jobs: runner: '["Linux", "gen12"]' image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu;opencl:gpu;opencl:cpu - reset_intel_gpu: true extra_lit_opts: --param gpu-intel-gen12=True - name: NVIDIA/CUDA runner: '["Linux", "cuda"]' @@ -73,46 +72,39 @@ jobs: runner: '["Linux", "amdgpu"]' image_options: -u 1001 --device=/dev/dri --device=/dev/kfd target_devices: hip:gpu - reset_intel_gpu: false extra_lit_opts: -j 1 - name: Intel Arc A-Series Graphics runner: '["Linux", "arc"]' image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu;opencl:gpu;level_zero_v2:gpu - reset_intel_gpu: true extra_lit_opts: --param matrix-xmx8=True - name: E2E tests with dev igc on Intel Arc A-Series Graphics runner: '["Linux", "arc"]' image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:devigc image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: 
level_zero:gpu;opencl:gpu - reset_intel_gpu: true extra_lit_opts: --param matrix-xmx8=True use_igc_dev: true - name: E2E tests on Intel Ponte Vecchio GPU runner: '["Linux", "pvc"]' image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu;opencl:gpu - reset_intel_gpu: true extra_lit_opts: -j 50 - name: Dev IGC on Intel Ponte Vecchio GPU runner: '["Linux", "pvc"]' image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:devigc image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu;opencl:gpu - reset_intel_gpu: true use_igc_dev: true extra_lit_opts: -j 50 - name: Intel Battlemage Graphics runner: '["Linux", "bmg"]' image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu - reset_intel_gpu: true - name: SPIR-V Backend / Intel Battlemage Graphics runner: '["Linux", "bmg"]' image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu;opencl:gpu;opencl:cpu - reset_intel_gpu: true extra_lit_opts: --param spirv-backend=True e2e_binaries_artifact: sycl_e2e_bin_default_spirv_backend uses: ./.github/workflows/sycl-linux-run-tests.yml @@ -123,7 +115,6 @@ jobs: image_options: ${{ matrix.image_options }} target_devices: ${{ matrix.target_devices }} extra_lit_opts: ${{ matrix.extra_lit_opts }} - reset_intel_gpu: ${{ matrix.reset_intel_gpu }} repo_ref: ${{ github.sha }} sycl_toolchain_artifact: sycl_linux_default sycl_toolchain_archive: ${{ needs.build.outputs.artifact_archive_name }} @@ -159,11 +150,9 @@ jobs: - name: Intel GEN12 Graphics system runner: '["Linux", "gen12"]' image_extra_opts: --device=/dev/dri - reset_intel_gpu: true - name: Intel Arc A-Series Graphics system runner: '["Linux", "arc"]' image_extra_opts: --device=/dev/dri - reset_intel_gpu: 
true - name: AMD system runner: '["Linux", "amdgpu"]' image_extra_opts: --device=/dev/dri --device=/dev/kfd @@ -176,7 +165,6 @@ jobs: runner: ${{ matrix. runner }} image_options: -u 1001 --privileged --cap-add SYS_ADMIN ${{ matrix.image_extra_opts }} target_devices: all - reset_intel_gpu: ${{ matrix.reset_intel_gpu }} env: '{"LIT_FILTER":"PerformanceTests/"}' extra_lit_opts: -a -j 1 --param enable-perf-tests=True diff --git a/.github/workflows/sycl-linux-run-tests.yml b/.github/workflows/sycl-linux-run-tests.yml index 2f3c02bf334ed..2420450630d5f 100644 --- a/.github/workflows/sycl-linux-run-tests.yml +++ b/.github/workflows/sycl-linux-run-tests.yml @@ -82,9 +82,6 @@ on: type: string default: 1 - reset_intel_gpu: - type: string - required: False install_igc_driver: type: string required: False @@ -171,14 +168,6 @@ on: Extra options to be added to LIT_OPTS. default: '' - reset_intel_gpu: - description: | - Reset Intel GPUs - type: choice - options: - - false - - true - e2e_testing_mode: type: choice options: @@ -199,24 +188,6 @@ jobs: options: ${{ inputs.image_options }} env: ${{ fromJSON(inputs.env) }} steps: - - name: Reset Intel GPU - if: inputs.reset_intel_gpu == 'true' - shell: bash - run: | - if [[ '${{ inputs.runner }}' == '["Linux", "bmg"]' ]]; then - sudo bash -c 'echo 0000:05:00.0 > /sys/bus/pci/drivers/xe/unbind' - sudo bash -c 'echo 1 > /sys/bus/pci/devices/0000:05:00.0/reset' - sudo bash -c 'echo 0000:05:00.0 > /sys/bus/pci/drivers/xe/bind' - else - sudo mount -t debugfs none /sys/kernel/debug - base_dir="/sys/kernel/debug/dri" - - for dir in "$base_dir"/*; do - if [ -f "$dir/i915_wedged" ]; then - sudo bash -c 'echo 1 > $0/i915_wedged' $dir - fi - done - fi - uses: actions/checkout@v4 with: ref: ${{ inputs.devops_ref || inputs.repo_ref }} @@ -224,6 +195,8 @@ jobs: devops - name: Register cleanup after job is finished uses: ./devops/actions/cleanup + - name: Reset Intel GPU + uses: ./devops/actions/reset_gpu - name: Install drivers if: 
inputs.install_igc_driver == 'true' || inputs.install_dev_igc_driver == 'true' env: diff --git a/.github/workflows/sycl-nightly.yml b/.github/workflows/sycl-nightly.yml index 331911824ef3c..d419671c8be6b 100644 --- a/.github/workflows/sycl-nightly.yml +++ b/.github/workflows/sycl-nightly.yml @@ -70,31 +70,26 @@ jobs: runner: '["Linux", "gen12"]' image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu - reset_intel_gpu: true - name: Intel L0 Ponte Vecchio GPU runner: '["Linux", "pvc"]' image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu - reset_intel_gpu: true - name: Intel L0 Battlemage GPU runner: '["Linux", "bmg"]' image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu - reset_intel_gpu: true - name: Intel L0 Arc A-Series GPU runner: '["Linux", "arc"]' image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu - reset_intel_gpu: true - name: Intel OCL Gen12 GPU runner: '["Linux", "gen12"]' image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: opencl:gpu - reset_intel_gpu: true - name: OCL CPU (AMD) runner: '["Linux", "amdcpu"]' @@ -115,7 +110,6 @@ jobs: runner: '["Linux", "pvc"]' image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu - reset_intel_gpu: true extra_lit_opts: --param test-preview-mode=True uses: ./.github/workflows/sycl-linux-run-tests.yml @@ -126,7 +120,6 @@ jobs: target_devices: ${{ matrix.target_devices }} tests_selector: e2e extra_lit_opts: "--param 'cxx_flags=-D_GLIBCXX_USE_CXX11_ABI=0' ${{ matrix.extra_lit_opts }}" - reset_intel_gpu: ${{ 
matrix.reset_intel_gpu }} repo_ref: ${{ github.sha }} sycl_toolchain_artifact: sycl_linux_default sycl_toolchain_archive: ${{ needs.ubuntu2204_build.outputs.artifact_archive_name }} @@ -140,7 +133,6 @@ jobs: name: Intel PVC L0 oneAPI runner: '["Linux", "pvc"]' target_devices: level_zero:gpu - reset_intel_gpu: true extra_lit_opts: -j 50 image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN repo_ref: ${{ github.sha }} @@ -302,7 +294,6 @@ jobs: runner: '["PVC_PERF"]' image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu - reset_intel_gpu: true uses: ./.github/workflows/sycl-linux-run-tests.yml secrets: inherit with: @@ -311,7 +302,6 @@ jobs: image_options: ${{ matrix.image_options }} target_devices: ${{ matrix.target_devices }} tests_selector: compute-benchmarks - reset_intel_gpu: ${{ matrix.reset_intel_gpu }} repo_ref: ${{ github.sha }} sycl_toolchain_artifact: sycl_linux_default sycl_toolchain_archive: ${{ needs.ubuntu2204_build.outputs.artifact_archive_name }} diff --git a/.github/workflows/sycl-post-commit.yml b/.github/workflows/sycl-post-commit.yml index a2b388a0ca487..95ae6f52ac547 100644 --- a/.github/workflows/sycl-post-commit.yml +++ b/.github/workflows/sycl-post-commit.yml @@ -53,11 +53,9 @@ jobs: - name: Intel GEN12 Graphics with Level Zero runner: '["Linux", "gen12"]' target_devices: level_zero:gpu - reset_intel_gpu: true - name: Intel Arc A-Series Graphics with Level Zero runner: '["Linux", "arc"]' extra_lit_opts: --param matrix-xmx8=True - reset_intel_gpu: true # Performance tests below. 
Specifics: # - only run performance tests (use LIT_FILTER env) # - ask llvm-lit to show all the output, even for PASS (-a) @@ -69,20 +67,17 @@ jobs: env: '{"LIT_FILTER":"PerformanceTests/"}' extra_lit_opts: -a -j 1 --param enable-perf-tests=True target_devices: all - reset_intel_gpu: true - name: Perf tests on Intel Arc A-Series Graphics system runner: '["Linux", "arc"]' env: '{"LIT_FILTER":"PerformanceTests/"}' extra_lit_opts: -a -j 1 --param enable-perf-tests=True target_devices: all - reset_intel_gpu: true uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: ${{ matrix.name }} runner: ${{ matrix. runner }} image_options: ${{ matrix.image_options || '-u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN' }} target_devices: ${{ matrix.target_devices || 'level_zero:gpu' }} - reset_intel_gpu: ${{ matrix.reset_intel_gpu }} extra_lit_opts: ${{ matrix.extra_lit_opts }} env: ${{ matrix.env || '{}' }} diff --git a/.github/workflows/sycl-rel-nightly.yml b/.github/workflows/sycl-rel-nightly.yml index b0487f6d435bc..94f931264ec65 100644 --- a/.github/workflows/sycl-rel-nightly.yml +++ b/.github/workflows/sycl-rel-nightly.yml @@ -63,7 +63,6 @@ jobs: runner: '["Linux", "gen12"]' image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu - reset_intel_gpu: true tests_selector: e2e extra_lit_opts: --param gpu-intel-gen12=True @@ -71,7 +70,6 @@ jobs: runner: '["Linux", "gen12"]' image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: opencl:gpu - reset_intel_gpu: true tests_selector: e2e extra_lit_opts: --param gpu-intel-gen12=True @@ -88,7 +86,6 @@ jobs: target_devices: ${{ matrix.target_devices }} tests_selector: ${{ matrix.tests_selector }} extra_lit_opts: ${{ matrix.extra_lit_opts }} - reset_intel_gpu: ${{ matrix.reset_intel_gpu }} repo_ref: sycl-rel-6_1_0 devops_ref: sycl 
sycl_toolchain_artifact: sycl_linux_default diff --git a/devops/actions/reset_gpu/action.yml b/devops/actions/reset_gpu/action.yml new file mode 100644 index 0000000000000..142a7d9cbac2b --- /dev/null +++ b/devops/actions/reset_gpu/action.yml @@ -0,0 +1,32 @@ +name: Reset Intel GPU + +runs: + using: "composite" + steps: + - name: Reset Intel GPU + shell: bash + run: | + # First reset all xe devices. + driver_path="/sys/bus/pci/drivers/xe" + + if [ -d "$driver_path" ]; then + # Extract PCI paths of devices bound to xe + for device in $(ls "$driver_path" | grep -E '^[0-9a-f]{4}:[0-9a-f]{2}:[0-9a-f]{2}\.[0-9]$'); do + sysfs_path="/sys/bus/pci/devices/$device" + sudo bash -c 'echo $0 > $1/unbind' $device $driver_path + sudo bash -c 'echo 1 > $0/reset' $sysfs_path + sudo bash -c 'echo $0 > $1/bind' $device $driver_path + echo "Reset $device" + done + fi + + # Then reset all i915 devices. We don't do a PCI FLR because + # it seems to fail on some older GPUs. + sudo mount -t debugfs none /sys/kernel/debug || true + base_dir="/sys/kernel/debug/dri" + for dir in "$base_dir"/*; do + if [ -f "$dir/i915_wedged" ]; then + sudo bash -c 'echo 1 > $0/i915_wedged' $dir + echo "Reset $(basename $dir)" + fi + done \ No newline at end of file From bc4253b4700198c1a784f4cf87a3146daf2ab37b Mon Sep 17 00:00:00 2001 From: "Sarnie, Nick" Date: Thu, 27 Mar 2025 14:00:11 -0700 Subject: [PATCH 2/5] fix Signed-off-by: Sarnie, Nick --- devops/actions/reset_gpu/action.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/devops/actions/reset_gpu/action.yml b/devops/actions/reset_gpu/action.yml index 142a7d9cbac2b..d17b1c8442d80 100644 --- a/devops/actions/reset_gpu/action.yml +++ b/devops/actions/reset_gpu/action.yml @@ -23,10 +23,10 @@ runs: # Then reset all i915 devices. We don't do a PCI FLR because # it seems to fail on some older GPUs. 
sudo mount -t debugfs none /sys/kernel/debug || true - base_dir="/sys/kernel/debug/dri" - for dir in "$base_dir"/*; do - if [ -f "$dir/i915_wedged" ]; then - sudo bash -c 'echo 1 > $0/i915_wedged' $dir + dirs=$(sudo ls -d /sys/kernel/debug/dri/*/) + for dir in $dirs; do + if sudo test -e "${dir}/i915_wedged"; then + sudo bash -c 'echo 1 > $0/i915_wedged' $dir echo "Reset $(basename $dir)" fi done \ No newline at end of file From c8ca1d284d6b17cf8a04fe149ff37dc71b1657cb Mon Sep 17 00:00:00 2001 From: "Sarnie, Nick" Date: Thu, 27 Mar 2025 17:24:39 -0700 Subject: [PATCH 3/5] fix Signed-off-by: Sarnie, Nick --- devops/actions/reset_gpu/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devops/actions/reset_gpu/action.yml b/devops/actions/reset_gpu/action.yml index d17b1c8442d80..c22bfed50eca3 100644 --- a/devops/actions/reset_gpu/action.yml +++ b/devops/actions/reset_gpu/action.yml @@ -23,7 +23,7 @@ runs: # Then reset all i915 devices. We don't do a PCI FLR because # it seems to fail on some older GPUs. sudo mount -t debugfs none /sys/kernel/debug || true - dirs=$(sudo ls -d /sys/kernel/debug/dri/*/) + dirs=$(sudo bash -c 'ls -d /sys/kernel/debug/dri/*') for dir in $dirs; do if sudo test -e "${dir}/i915_wedged"; then sudo bash -c 'echo 1 > $0/i915_wedged' $dir From f3e369422a2361435316fcc5c4cf41ceeb0bb088 Mon Sep 17 00:00:00 2001 From: "Sarnie, Nick" Date: Thu, 27 Mar 2025 19:30:29 -0700 Subject: [PATCH 4/5] fix again Signed-off-by: Sarnie, Nick --- devops/actions/reset_gpu/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devops/actions/reset_gpu/action.yml b/devops/actions/reset_gpu/action.yml index c22bfed50eca3..c06f9c11ba855 100644 --- a/devops/actions/reset_gpu/action.yml +++ b/devops/actions/reset_gpu/action.yml @@ -23,7 +23,7 @@ runs: # Then reset all i915 devices. We don't do a PCI FLR because # it seems to fail on some older GPUs.
sudo mount -t debugfs none /sys/kernel/debug || true - dirs=$(sudo bash -c 'ls -d /sys/kernel/debug/dri/*') + dirs=$(sudo bash -c 'ls -d /sys/kernel/debug/dri/*') || true for dir in $dirs; do if sudo test -e "${dir}/i915_wedged"; then sudo bash -c 'echo 1 > $0/i915_wedged' $dir From 5a99b337e2ce35c29eabd7e149568cbc4fb66671 Mon Sep 17 00:00:00 2001 From: "Sarnie, Nick" Date: Fri, 28 Mar 2025 07:22:33 -0700 Subject: [PATCH 5/5] newline Signed-off-by: Sarnie, Nick --- devops/actions/reset_gpu/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devops/actions/reset_gpu/action.yml b/devops/actions/reset_gpu/action.yml index c06f9c11ba855..7fca091828385 100644 --- a/devops/actions/reset_gpu/action.yml +++ b/devops/actions/reset_gpu/action.yml @@ -29,4 +29,4 @@ runs: sudo bash -c 'echo 1 > $0/i915_wedged' $dir echo "Reset $(basename $dir)" fi - done \ No newline at end of file + done