Skip to content

Commit ffceed9

Browse files
prathikr, mszhanyi, edgchen1, wangyems, Your Name
authored
ORT 1.19.2 Release: Cherry Pick Round 1 (#21861)
Approved cherry picks for ORT 1.19.2 release.

---------

Co-authored-by: Yi Zhang <[email protected]>
Co-authored-by: Edward Chen <[email protected]>
Co-authored-by: Ye Wang <[email protected]>
Co-authored-by: Your Name <[email protected]>
Co-authored-by: Tianlei Wu <[email protected]>
Co-authored-by: aciddelgado <[email protected]>
Co-authored-by: mindest <[email protected]>
Co-authored-by: Changming Sun <[email protected]>
1 parent d651463 commit ffceed9

File tree

79 files changed

+1788
-847
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

79 files changed

+1788
-847
lines changed

VERSION_NUMBER

+1-1
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
1.19.1
1+
1.19.2

cmake/patches/abseil/absl_windows.patch

+13
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,19 @@ index 2d85ac74..4875d668 100644
7474
# The decorated name was longer than the compiler limit
7575
"/wd4503",
7676
# forcing value to bool 'true' or 'false' (performance warning)
77+
diff --git a/absl/debugging/symbolize.cc b/absl/debugging/symbolize.cc
78+
index 638d3954..6b817075 100644
79+
--- a/absl/debugging/symbolize.cc
80+
+++ b/absl/debugging/symbolize.cc
81+
@@ -14,7 +14,7 @@
82+
83+
#include "absl/debugging/symbolize.h"
84+
85+
-#ifdef _WIN32
86+
+#if defined(_WIN32) && !defined(NDEBUG)
87+
#include <winapifamily.h>
88+
#if !(WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)) || \
89+
WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
7790
diff --git a/absl/debugging/symbolize_win32.inc b/absl/debugging/symbolize_win32.inc
7891
index 53a099a1..34d210d6 100644
7992
--- a/absl/debugging/symbolize_win32.inc
+55-4
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,64 @@
1+
diff --git a/examples/41_fused_multi_head_attention/kernel_forward.h b/examples/41_fused_multi_head_attention/kernel_forward.h
2+
index 4c80f549..34327633 100644
3+
--- a/examples/41_fused_multi_head_attention/kernel_forward.h
4+
+++ b/examples/41_fused_multi_head_attention/kernel_forward.h
5+
@@ -221,6 +221,8 @@ struct AttentionKernel {
6+
int32_t num_batches = 0;
7+
int32_t num_heads = 0;
8+
9+
+ bool use_smooth_softmax = false;
10+
+
11+
// dropout
12+
bool use_dropout = false;
13+
unsigned long long dropout_batch_head_rng_offset = 0;
14+
@@ -897,7 +899,8 @@ struct AttentionKernel {
15+
p.num_keys - iter_key_start,
16+
iter_key_start == 0,
17+
iteratorC_tile_offset,
18+
- kSupportsBias ? 1.0f : p.scale);
19+
+ kSupportsBias ? 1.0f : p.scale,
20+
+ p.use_smooth_softmax);
21+
22+
// Output results to shared-memory
23+
int warp_idx_mn_0 = my_warp_id %
24+
@@ -1166,7 +1169,8 @@ struct AttentionKernel {
25+
int max_col,
26+
bool is_first,
27+
typename WarpIteratorC::TensorCoord const& tile_offset,
28+
- float scaling) {
29+
+ float scaling,
30+
+ bool use_smooth_softmax) {
31+
/* Iterates on the accumulator and corresponding position on result matrix
32+
33+
(1) Update `mi[r]` to the max value of the row `r`
34+
@@ -1257,7 +1261,7 @@ struct AttentionKernel {
35+
accum_t mi_row, total_row;
36+
LambdaIterator::iterateRows(
37+
lane_offset,
38+
- [&](int accum_m) { mi_row = mi[accum_m]; },
39+
+ [&](int accum_m) { mi_row = mi[accum_m];},
40+
[&](int accum_m, int accum_n, int idx) {
41+
frag[idx] =
42+
(accum_n < max_col) ? exp2f(frag[idx] - mi_row) : accum_t(0.0);
43+
@@ -1294,7 +1298,7 @@ struct AttentionKernel {
44+
for (int i = 0; i < MM0::MmaCore::WarpCount::kN; ++i) {
45+
total_row += addition_storage[id + kQueriesPerBlock * i];
46+
}
47+
- s_prime[id] = total_row;
48+
+ s_prime[id] = (use_smooth_softmax && (max_col <= kKeysPerBlock)) ? total_row + exp2f(-mi[id]) : total_row;
49+
}
50+
}
51+
152
diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h
253
index 964d2ff3..b366bc14 100644
354
--- a/include/cutlass/functional.h
455
+++ b/include/cutlass/functional.h
556
@@ -39,6 +39,7 @@
657
#include "cutlass/numeric_types.h"
7-
58+
859
#include <cuda_runtime.h>
960
+#include <cuda_fp16.h>
10-
61+
1162
#if defined(CUTLASS_ARCH_WMMA_ENABLED)
1263
#include <mma.h>
1364
@@ -230,8 +231,12 @@ struct inverse_square_root<half_t> {
@@ -19,7 +70,7 @@ index 964d2ff3..b366bc14 100644
1970
return reinterpret_cast<half_t const &>(result);
2071
+#else
2172
+ return half_t::convert((rsqrtf(half_t::convert(lhs))));
22-
+#endif
73+
+#endif
2374
#else
2475
return half_t(1.f / std::sqrt(half_t::convert(lhs)));
25-
#endif
76+
#endif

docs/ContribOperators.md

+12-4
Original file line number | Diff line number | Diff line change
@@ -2482,6 +2482,8 @@ This version of the operator has been available since version 1 of the 'com.micr
24822482
<dd>Rotate using interleaved pattern. Default value is 0 (False).</dd>
24832483
<dt><tt>scale</tt> : float</dt>
24842484
<dd>Custom scale will be used if specified. Default value is 1/sqrt(head_size)</dd>
2485+
<dt><tt>smooth_softmax</tt> : int</dt>
2486+
<dd>Use a smooth factor in softmax.</dd>
24852487
</dl>
24862488

24872489
#### Inputs (7 - 9)
@@ -3022,6 +3024,8 @@ This version of the operator has been available since version 1 of the 'com.micr
30223024
<dd>Number of top experts to select from expert pool</dd>
30233025
<dt><tt>normalize_routing_weights</tt> : int</dt>
30243026
<dd>Whether to normalize routing weights</dd>
3027+
<dt><tt>use_sparse_mixer</tt> : int</dt>
3028+
<dd>Whether to use sparse mixer</dd>
30253029
</dl>
30263030

30273031
#### Inputs (5 - 8)
@@ -4337,7 +4341,7 @@ This version of the operator has been available since version 1 of the 'com.micr
43374341

43384342
### <a name="com.microsoft.QMoE"></a><a name="com.microsoft.qmoe">**com.microsoft.QMoE**</a>
43394343

4340-
Int4 MoE
4344+
Quantized MoE
43414345

43424346
#### Version
43434347

@@ -4348,10 +4352,14 @@ This version of the operator has been available since version 1 of the 'com.micr
43484352
<dl>
43494353
<dt><tt>activation_type</tt> : string</dt>
43504354
<dd>Activation function to use. Choose from relu, gelu, silu and identity. Default is relu</dd>
4355+
<dt><tt>expert_weight_bits</tt> : int</dt>
4356+
<dd>Number of bits used in quantized weights. Default is 4 bits</dd>
43514357
<dt><tt>k</tt> : int</dt>
43524358
<dd>Number of top experts to select from expert pool</dd>
43534359
<dt><tt>normalize_routing_weights</tt> : int</dt>
43544360
<dd>Whether to normalize routing weights</dd>
4361+
<dt><tt>use_sparse_mixer</tt> : int</dt>
4362+
<dd>Whether to use sparse mixer</dd>
43554363
</dl>
43564364

43574365
#### Inputs (7 - 11)
@@ -4362,19 +4370,19 @@ This version of the operator has been available since version 1 of the 'com.micr
43624370
<dt><tt>router_probs</tt> : T</dt>
43634371
<dd>2D input tensor with shape (num_rows, num_experts)</dd>
43644372
<dt><tt>fc1_experts_weights</tt> : T1</dt>
4365-
<dd>3D input tensor with shape (num_experts, hidden_size, inter_size / 2)</dd>
4373+
<dd>3D input tensor with shape (num_experts, hidden_size, inter_size) or (num_experts, hidden_size, inter_size / 2)</dd>
43664374
<dt><tt>fc1_scales</tt> : T</dt>
43674375
<dd>2D input tensor with shape (num_experts, inter_size)</dd>
43684376
<dt><tt>fc1_experts_bias</tt> (optional) : T</dt>
43694377
<dd>2D optional input tensor with shape (num_experts, inter_size)</dd>
43704378
<dt><tt>fc2_experts_weights</tt> : T1</dt>
4371-
<dd>3D input tensor with shape (num_experts, inter_size, hidden_size / 2)</dd>
4379+
<dd>3D input tensor with shape (num_experts, inter_size, hidden_size) or (num_experts, inter_size, hidden_size / 2)</dd>
43724380
<dt><tt>fc2_scales</tt> : T</dt>
43734381
<dd>2D input tensor with shape (num_experts, hidden_size)</dd>
43744382
<dt><tt>fc2_experts_bias</tt> (optional) : T</dt>
43754383
<dd>2D optional input tensor with shape (num_experts, hidden_size)</dd>
43764384
<dt><tt>fc3_experts_weights</tt> (optional) : T1</dt>
4377-
<dd>3D optional input tensor with shape (num_experts, hidden_size, inter_size / 2)</dd>
4385+
<dd>3D optional input tensor with shape (num_experts, hidden_size, inter_size) or (num_experts, hidden_size, inter_size / 2)</dd>
43784386
<dt><tt>fc3_scales</tt> (optional) : T</dt>
43794387
<dd>2D optional input tensor with shape (num_experts, inter_size)</dd>
43804388
<dt><tt>fc3_experts_bias</tt> (optional) : T</dt>

docs/python/README.rst

+5
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,11 @@ For more information on ONNX Runtime, please see `aka.ms/onnxruntime <https://ak
88
Changes
99
-------
1010

11+
1.19.2
12+
^^^^^^
13+
14+
Release Notes : https://github.com/Microsoft/onnxruntime/releases/tag/v1.19.2
15+
1116
1.19.1
1217
^^^^^^
1318

include/onnxruntime/core/graph/graph_nodes.h

+2-1
Original file line number | Diff line number | Diff line change
@@ -117,13 +117,14 @@ class ValidNodes {
117117
return (current_ != other.current_);
118118
}
119119

120-
void operator++() {
120+
NodeIterator<TIterator>& operator++() {
121121
if (current_ < end_) {
122122
while (++current_ != end_) {
123123
if (*current_ != nullptr && (!apply_filter_ || (*filter_func_)((*current_)->Index()) == false))
124124
break;
125125
}
126126
}
127+
return *this;
127128
}
128129

129130
NodeIterator<TIterator> operator++(int) {

js/common/lib/version.ts

+1-1
Original file line number | Diff line number | Diff line change
@@ -4,4 +4,4 @@
44
// This file is generated by /js/scripts/update-version.ts
55
// Do not modify file content manually.
66

7-
export const version = '1.19.1';
7+
export const version = '1.19.2';

js/common/package-lock.json

+2-2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

js/common/package.json

+1-1
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
"license": "MIT",
33
"type": "module",
44
"name": "onnxruntime-common",
5-
"version": "1.19.1",
5+
"version": "1.19.2",
66
"repository": {
77
"url": "https://github.com/Microsoft/onnxruntime.git",
88
"type": "git"

js/node/lib/version.ts

+1-1
Original file line number | Diff line number | Diff line change
@@ -4,4 +4,4 @@
44
// This file is generated by /js/scripts/update-version.ts
55
// Do not modify file content manually.
66

7-
export const version = '1.19.1';
7+
export const version = '1.19.2';

js/node/package-lock.json

+3-3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

js/node/package.json

+1-1
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@
1313
3
1414
]
1515
},
16-
"version": "1.19.1",
16+
"version": "1.19.2",
1717
"dependencies": {
1818
"onnxruntime-common": "file:../common",
1919
"tar": "^7.0.1"

js/react_native/lib/version.ts

+1-1
Original file line number | Diff line number | Diff line change
@@ -4,4 +4,4 @@
44
// This file is generated by /js/scripts/update-version.ts
55
// Do not modify file content manually.
66

7-
export const version = '1.19.1';
7+
export const version = '1.19.2';

js/react_native/package.json

+1-1
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@
3636
"registry": "https://registry.npmjs.org/"
3737
},
3838
"source": "lib/index",
39-
"version": "1.19.1",
39+
"version": "1.19.2",
4040
"main": "dist/commonjs/index",
4141
"homepage": "https://github.com/microsoft/onnxruntime/blob/main/js/react_native/README.md",
4242
"files": [

js/react_native/yarn.lock

+1-1
Original file line number | Diff line number | Diff line change
@@ -5254,7 +5254,7 @@ onetime@^5.1.0, onetime@^5.1.2:
52545254
mimic-fn "^2.1.0"
52555255

52565256
"onnxruntime-common@file:../common":
5257-
version "1.19.1"
5257+
version "1.19.2"
52585258

52595259
open@^6.2.0:
52605260
version "6.4.0"

js/web/lib/version.ts

+1-1
Original file line number | Diff line number | Diff line change
@@ -4,4 +4,4 @@
44
// This file is generated by /js/scripts/update-version.ts
55
// Do not modify file content manually.
66

7-
export const version = '1.19.1';
7+
export const version = '1.19.2';

js/web/package-lock.json

+3-3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

js/web/package.json

+1-1
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
"type": "git"
88
},
99
"author": "fs-eire",
10-
"version": "1.19.1",
10+
"version": "1.19.2",
1111
"jsdelivr": "dist/ort.min.js",
1212
"dependencies": {
1313
"flatbuffers": "^1.12.0",

onnxruntime/__init__.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
For more information on ONNX Runtime, please see `aka.ms/onnxruntime <https://aka.ms/onnxruntime/>`_
88
or the `Github project <https://github.com/microsoft/onnxruntime/>`_.
99
"""
10-
__version__ = "1.19.1"
10+
__version__ = "1.19.2"
1111
__author__ = "Microsoft"
1212

1313
# we need to do device version validation (for example to check Cuda version for an onnxruntime-training package).

onnxruntime/contrib_ops/cpu/bert/attention_common.h

+2
Original file line number | Diff line number | Diff line change
@@ -99,6 +99,7 @@ struct GroupQueryAttentionParameters {
9999
int sequence_length; // sequence length of input query, key, value
100100
int seqlen_past_kv_cache; // sequence length of past kv tensor
101101
int seqlen_present_kv_cache; // sequence length of present kv tensor
102+
int total_sequence_length; // maximum total sequence length (past_sequence_length + sequence_length) among keys
102103
int hidden_size;
103104
int num_heads;
104105
int head_size;
@@ -113,6 +114,7 @@ struct GroupQueryAttentionParameters {
113114
bool is_prompt; // determines if seqlens_k is past or kv sequence length tensor
114115
bool do_rotary;
115116
bool rotary_interleaved;
117+
bool use_smooth_softmax;
116118
float scale;
117119
AttentionQkvFormat qkv_format;
118120
AttentionQkvFormat past_kv_format;

0 commit comments

Comments
 (0)