Skip to content

Commit 4eeefd7

Browse files
authored
Add shader key validation step in WebGPU CI pipeline (#24243)
### Description This PR adds a shader key validation step to the WebGPU CI pipeline. The shader key validation works in this way: - first, run onnxruntime_test_all with verbose logging, dumping the logs into a file - then, parse the file and find the WebGPU EP program logs. The logs contain the following information: - the shader cache key - the corresponding shader code The script aggregates this information and makes sure that, for each cache key, the corresponding shader code is consistent. To make the validation work, this PR also modifies a few things: - set the locale of `std::wclog` to ".UTF-8" to support Unicode characters. Otherwise the logger will fail and no longer output future logs. A fix is submitted in PR #24237, but there is a concern that it may potentially break some users. Setting it inside onnxruntime_test_all is pretty safe. - re-enable the WebGPU device auto collect which was introduced in #24115. Now we have a better way to detect cache key inconsistency. ### Next Step The newly added test is marked as `continue-on-error: true`, which means that even if it fails, it does not block the CI pipeline. We should fix those failures one by one, and eventually the test should pass. Then we can remove the `continue-on-error: true` flag.
1 parent ba2999c commit 4eeefd7

File tree

5 files changed

+86
-35
lines changed

5 files changed

+86
-35
lines changed

.github/actions/webgpu-validate-shader-key/action.yml

+3-2
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,15 @@ runs:
1414
using: "composite"
1515
steps:
1616
- name: Validate shader keys (chromium log)
17-
if: ${{ inputs.is_chromium_log }}
17+
# GitHub Actions treats all inputs as strings even if it's specified as a boolean.
18+
if: ${{ inputs.is_chromium_log == 'true' }}
1819
shell: cmd
1920
run: |
2021
node parse-chromium-debug-log.js < "${{ inputs.log_file_path }}" | node validate-shader-key.js
2122
working-directory: ${{ github.action_path }}
2223

2324
- name: Validate shader keys (native log)
24-
if: ${{ !inputs.is_chromium_log }}
25+
if: ${{ inputs.is_chromium_log != 'true' }}  # inputs are strings; a leading `!` would make this condition always true
2526
shell: cmd
2627
run: |
2728
node validate-shader-key.js < "${{ inputs.log_file_path }}"

.github/actions/webgpu-validate-shader-key/validate-shader-key.js

+34-6
Original file line numberDiff line numberDiff line change
@@ -11,21 +11,30 @@ const readline = require("readline");
1111

1212
const shaderMap = new Map();
1313

14+
const regexStartingProgram =
15+
/onnxruntime::webgpu::WebGpuContext::Run.+Starting program \"(?<key>.+)\"/;
1416
const regexShaderStart =
15-
/^===\ WebGPU\ Shader\ code\ \[.+?Key=\"(?<key>.+)\"]\ Start\ ===$/;
17+
/^===\ WebGPU\ Shader\ code\ \[.+?(Key=\"(?<key>.+)\")?]\ Start\ ===$/;
1618
const regexShaderEnd =
17-
/^===\ WebGPU\ Shader\ code\ \[.+?Key=\"(?<key>.+)\"]\ End\ ===$/;
19+
/^===\ WebGPU\ Shader\ code\ \[.+?(Key=\"(?<key>.+)\")?]\ End\ ===$/;
1820

1921
async function processVerboseLog() {
2022
const rl = readline.createInterface({
2123
input: process.stdin,
2224
crlfDelay: Infinity,
2325
});
2426

27+
let lastProgramKey = null;
2528
let currentShaderKey = null;
2629
let currentShaderCode = null;
2730

2831
for await (const line of rl) {
32+
const startingProgram = regexStartingProgram.exec(line);
33+
if (startingProgram) {
34+
lastProgramKey = startingProgram.groups.key;
35+
continue;
36+
}
37+
2938
const resultStart = regexShaderStart.exec(line);
3039
if (resultStart) {
3140
if (currentShaderKey) {
@@ -34,7 +43,18 @@ async function processVerboseLog() {
3443
);
3544
}
3645

37-
currentShaderKey = resultStart.groups.key;
46+
const key = resultStart.groups.key ?? lastProgramKey;
47+
if (!key) {
48+
throw new Error(
49+
'No shader key is found in the log. Please use debug build or enable verbose logging in session options in release build.'
50+
);
51+
}
52+
if (lastProgramKey && key !== lastProgramKey) {
53+
throw new Error(
54+
`Found incorrect shader key from log. Expected "${lastProgramKey}", but got "${key}".`
55+
);
56+
}
57+
currentShaderKey = key;
3858
currentShaderCode = "";
3959
continue;
4060
}
@@ -45,9 +65,17 @@ async function processVerboseLog() {
4565
throw new Error(
4666
`Found unexpected shader end for key "${resultEnd.groups.key}".`
4767
);
48-
} else if (currentShaderKey !== resultEnd.groups.key) {
68+
}
69+
70+
const key = resultEnd.groups.key ?? lastProgramKey;
71+
if (!key) {
72+
throw new Error(
73+
'No shader key is found in the log. Please use debug build or enable verbose logging in session options in release build.'
74+
);
75+
}
76+
if (lastProgramKey && key !== lastProgramKey) {
4977
throw new Error(
50-
`Found inconsistent shader key. Expected "${currentShaderKey}", but got "${resultEnd.groups.key}".`
78+
`Found incorrect shader key from log. Expected "${lastProgramKey}", but got "${key}".`
5179
);
5280
}
5381

@@ -87,7 +115,7 @@ ${currentShaderCode}
87115
}
88116

89117
console.log(
90-
`All shader code is consistent. Total ${shaderMap.size} shader code found.`
118+
`All shader code is consistent. Total ${shaderMap.size} shader keys found.`
91119
);
92120
}
93121

.github/workflows/windows_webgpu.yml

+40-23
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,10 @@ jobs:
2424
OnnxRuntimeBuildDirectory: ${{ github.workspace }}
2525
DOTNET_SKIP_FIRST_TIME_EXPERIENCE: true
2626
setVcvars: true
27-
ALLOW_RELEASED_ONNX_OPSET_ONLY: '0'
27+
ALLOW_RELEASED_ONNX_OPSET_ONLY: "0"
2828
DocUpdateNeeded: false
29-
NVIDIA_TF32_OVERRIDE: '0'
30-
ONNXRUNTIME_TEST_GPU_DEVICE_ID: '0'
29+
NVIDIA_TF32_OVERRIDE: "0"
30+
ONNXRUNTIME_TEST_GPU_DEVICE_ID: "0"
3131
steps:
3232
- name: Checkout
3333
uses: actions/checkout@v4
@@ -38,7 +38,7 @@ jobs:
3838
- name: Setup Python 3.12
3939
uses: actions/setup-python@v5
4040
with:
41-
python-version: '3.12'
41+
python-version: "3.12"
4242
architecture: x64
4343

4444
- name: Locate vcvarsall and Setup Env
@@ -54,13 +54,13 @@ jobs:
5454
- name: Setup Node.js
5555
uses: actions/setup-node@v4
5656
with:
57-
node-version: '20.x'
57+
node-version: "20.x"
5858

5959
- name: Setup Java
6060
uses: actions/setup-java@v4
6161
with:
62-
distribution: 'temurin'
63-
java-version: '17'
62+
distribution: "temurin"
63+
java-version: "17"
6464
architecture: x64
6565

6666
- name: API Documentation Check and generate
@@ -78,12 +78,12 @@ jobs:
7878
env:
7979
PROCESSOR_ARCHITECTURE: x64
8080
with:
81-
dotnet-version: '8.x'
81+
dotnet-version: "8.x"
8282

8383
- name: Use Nuget 6.x
8484
uses: nuget/setup-nuget@v2
8585
with:
86-
nuget-version: '6.x'
86+
nuget-version: "6.x"
8787

8888
- name: NuGet restore
8989
run: |
@@ -113,13 +113,30 @@ jobs:
113113
}
114114
Remove-Item "${{ github.workspace }}\RelWithDebInfo" -Include "*.obj" -Recurse
115115
116+
- name: Run tests (onnxruntime_test_all) with verbose logging
117+
shell: pwsh
118+
run: |
119+
$env:ORT_UNIT_TEST_MAIN_LOG_LEVEL = "0"
120+
.\onnxruntime_test_all.exe 2>.\onnxruntime_test_all_stderr.log
121+
working-directory: ${{ github.workspace }}\RelWithDebInfo\RelWithDebInfo
122+
123+
- name: Check log file
124+
shell: cmd
125+
run: |
126+
dir ${{ github.workspace }}\RelWithDebInfo\RelWithDebInfo\onnxruntime_test_all_stderr.log
127+
128+
- name: Validate shader keys
129+
continue-on-error: true
130+
uses: ./.github/actions/webgpu-validate-shader-key
131+
with:
132+
log_file_path: ${{ github.workspace }}\RelWithDebInfo\RelWithDebInfo\onnxruntime_test_all_stderr.log
133+
116134
- name: Validate C# native delegates
117135
run: python tools\ValidateNativeDelegateAttributes.py
118136
shell: cmd
119137
working-directory: ${{ github.workspace }}\csharp
120138
continue-on-error: true
121139

122-
123140
webgpu_external_dawn_build_x64_RelWithDebInfo:
124141
runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Win2022-GPU-A10"]
125142
timeout-minutes: 300
@@ -133,7 +150,7 @@ jobs:
133150
- name: Setup Python 3.12
134151
uses: actions/setup-python@v5
135152
with:
136-
python-version: '3.12'
153+
python-version: "3.12"
137154
architecture: x64
138155

139156
- name: Locate vcvarsall and Setup Env
@@ -177,12 +194,12 @@ jobs:
177194
runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Win2022-GPU-A10"]
178195
timeout-minutes: 300
179196
env:
180-
OrtPackageId: Microsoft.ML.OnnxRuntime
181-
OnnxRuntimeBuildDirectory: ${{ github.workspace }}
182-
DOTNET_SKIP_FIRST_TIME_EXPERIENCE: true
183-
ALLOW_RELEASED_ONNX_OPSET_ONLY: '0'
184-
DocUpdateNeeded: false
185-
ONNXRUNTIME_TEST_GPU_DEVICE_ID: '0'
197+
OrtPackageId: Microsoft.ML.OnnxRuntime
198+
OnnxRuntimeBuildDirectory: ${{ github.workspace }}
199+
DOTNET_SKIP_FIRST_TIME_EXPERIENCE: true
200+
ALLOW_RELEASED_ONNX_OPSET_ONLY: "0"
201+
DocUpdateNeeded: false
202+
ONNXRUNTIME_TEST_GPU_DEVICE_ID: "0"
186203
steps:
187204
- name: Checkout
188205
uses: actions/checkout@v4
@@ -193,7 +210,7 @@ jobs:
193210
- name: Setup Python 3.12
194211
uses: actions/setup-python@v5
195212
with:
196-
python-version: '3.12'
213+
python-version: "3.12"
197214
architecture: x64
198215

199216
- name: Locate vcvarsall and Setup Env
@@ -209,13 +226,13 @@ jobs:
209226
- name: Setup Node.js
210227
uses: actions/setup-node@v4
211228
with:
212-
node-version: '20.x'
229+
node-version: "20.x"
213230

214231
- name: Setup Java
215232
uses: actions/setup-java@v4
216233
with:
217-
distribution: 'temurin'
218-
java-version: '17'
234+
distribution: "temurin"
235+
java-version: "17"
219236
architecture: x64
220237

221238
- name: API Documentation Check and generate
@@ -233,12 +250,12 @@ jobs:
233250
env:
234251
PROCESSOR_ARCHITECTURE: x64
235252
with:
236-
dotnet-version: '8.x'
253+
dotnet-version: "8.x"
237254

238255
- name: Use Nuget 6.x
239256
uses: nuget/setup-nuget@v2
240257
with:
241-
nuget-version: '6.x'
258+
nuget-version: "6.x"
242259

243260
- name: NuGet restore
244261
run: |

onnxruntime/test/unittest_main/test_main.cc

+9
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55
#include <cstdlib>
66
#include <optional>
77
#include <string>
8+
#ifdef _WIN32
9+
#include <iostream>
10+
#include <locale>
11+
#endif
812

913
#ifndef USE_ONNXRUNTIME_DLL
1014
#ifdef __GNUC__
@@ -29,6 +33,11 @@ std::unique_ptr<Ort::Env> ort_env;
2933

3034
// ortenv_setup() and ortenv_teardown() are used by onnxruntime/test/xctest/xcgtest.mm so can't be file local
3135
extern "C" void ortenv_setup() {
36+
#ifdef _WIN32
37+
// Set the locale to UTF-8 to ensure proper handling of wide characters on Windows
38+
std::wclog.imbue(std::locale(".UTF-8", std::locale::ctype));
39+
#endif
40+
3241
OrtThreadingOptions tpo;
3342

3443
// allow verbose logging to be enabled by setting this environment variable to a numeric log level

onnxruntime/test/util/default_providers.cc

-4
Original file line numberDiff line numberDiff line change
@@ -303,10 +303,6 @@ std::unique_ptr<IExecutionProvider> DefaultWebGpuExecutionProvider() {
303303
ORT_ENFORCE(config_options.AddConfigEntry(webgpu::options::kStorageBufferCacheMode,
304304
webgpu::options::kBufferCacheMode_Disabled)
305305
.IsOK());
306-
// Disable device auto collect
307-
ORT_ENFORCE(config_options.AddConfigEntry(webgpu::options::kPreserveDevice,
308-
webgpu::options::kPreserveDevice_ON)
309-
.IsOK());
310306
return WebGpuProviderFactoryCreator::Create(config_options)->CreateProvider();
311307
#else
312308
return nullptr;

0 commit comments

Comments
 (0)