@@ -232,8 +232,8 @@ struct __parallel_scan_submitter;
 template <typename _CustomName, typename... _PropagateScanName>
 struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name<_PropagateScanName...>>
 {
-    template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _InitType,
-              typename _LocalScan, typename _GroupScan, typename _GlobalScan>
+    template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _InitType, typename _LocalScan,
+              typename _GroupScan, typename _GlobalScan>
     auto
     operator()(const _ExecutionPolicy& __exec, _Range1&& __rng1, _Range2&& __rng2, _InitType __init,
                _LocalScan __local_scan, _GroupScan __group_scan, _GlobalScan __global_scan) const
@@ -594,8 +594,8 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend
                     ::std::integral_constant<::std::uint16_t, __wg_size>,
                     ::std::integral_constant<::std::uint16_t, __num_elems_per_item>, _BinaryOperation,
                     /* _IsFullGroup= */ std::true_type, _Inclusive, _CustomName>>>()(
-                __exec, std::forward<_InRng>(__in_rng),
-                std::forward<_OutRng>(__out_rng), __n, __init, __binary_op, __unary_op);
+                __exec, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, __init, __binary_op,
+                __unary_op);
         else
             __event = __parallel_transform_scan_static_single_group_submitter<
                 _Inclusive::value, __num_elems_per_item, __wg_size,
@@ -604,8 +604,8 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend
                     ::std::integral_constant<::std::uint16_t, __wg_size>,
                     ::std::integral_constant<::std::uint16_t, __num_elems_per_item>, _BinaryOperation,
                     /* _IsFullGroup= */ ::std::false_type, _Inclusive, _CustomName>>>()(
-                __exec, std::forward<_InRng>(__in_rng),
-                std::forward<_OutRng>(__out_rng), __n, __init, __binary_op, __unary_op);
+                __exec, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, __init, __binary_op,
+                __unary_op);
         return __future(__event, __dummy_result_and_scratch);
     };
     if (__n <= 16)
@@ -638,8 +638,8 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend

         auto __event =
             __parallel_transform_scan_dynamic_single_group_submitter<_Inclusive::value, _DynamicGroupScanKernel>()(
-                __exec, std::forward<_InRng>(__in_rng),
-                std::forward<_OutRng>(__out_rng), __n, __init, __binary_op, __unary_op, __max_wg_size);
+                __exec, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, __init, __binary_op,
+                __unary_op, __max_wg_size);
         return __future(__event, __dummy_result_and_scratch);
     }
 }
@@ -656,9 +656,9 @@ __parallel_transform_scan_base(oneapi::dpl::__internal::__device_backend_tag, co
     using _PropagateKernel =
         oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__scan_propagate_kernel<_CustomName>>;

-    return __parallel_scan_submitter<_CustomName, _PropagateKernel>()(
-        __exec, std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng),
-        __init, __local_scan, __group_scan, __global_scan);
+    return __parallel_scan_submitter<_CustomName, _PropagateKernel>()(__exec, std::forward<_Range1>(__in_rng),
+                                                                      std::forward<_Range2>(__out_rng), __init,
+                                                                      __local_scan, __group_scan, __global_scan);
 }

 template <typename _Type>
@@ -1075,9 +1075,9 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen
         std::size_t __single_group_upper_limit = __use_reduce_then_scan ? 2048 : 16384;
         if (__group_scan_fits_in_slm<_Type>(__exec.queue(), __n, __n_uniform, __single_group_upper_limit))
         {
-            return __parallel_transform_scan_single_group(
-                __backend_tag, __exec, std::forward<_Range1>(__in_rng),
-                std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{});
+            return __parallel_transform_scan_single_group(__backend_tag, __exec, std::forward<_Range1>(__in_rng),
+                                                          std::forward<_Range2>(__out_rng), __n, __unary_op, __init,
+                                                          __binary_op, _Inclusive{});
         }
     }
     if (__use_reduce_then_scan)
@@ -1088,10 +1088,10 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen

         _GenInput __gen_transform{__unary_op};

-        return __parallel_transform_reduce_then_scan(
-            __backend_tag, __exec, std::forward<_Range1>(__in_rng),
-            std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{},
-            _WriteOp{}, __init, _Inclusive{}, /* _IsUniquePattern=*/ std::false_type{});
+        return __parallel_transform_reduce_then_scan(__backend_tag, __exec, std::forward<_Range1>(__in_rng),
+                                                     std::forward<_Range2>(__out_rng), __gen_transform, __binary_op,
+                                                     __gen_transform, _ScanInputTransform{}, _WriteOp{}, __init,
+                                                     _Inclusive{}, /* _IsUniquePattern=*/ std::false_type{});
     }
 }

@@ -1148,8 +1148,8 @@ struct __invoke_single_group_copy_if
             using _FullKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<_FullKernel>;
             return __par_backend_hetero::__parallel_copy_if_static_single_group_submitter<
                 _SizeType, __num_elems_per_item, __wg_size, true, _FullKernelName>()(
-                __exec, std::forward<_InRng>(__in_rng),
-                std::forward<_OutRng>(__out_rng), __n, _InitType{}, _ReduceOp{}, __pred, __assign);
+                __exec, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, _InitType{}, _ReduceOp{},
+                __pred, __assign);
         }
         else
         {
@@ -1161,29 +1161,29 @@ struct __invoke_single_group_copy_if
                 oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<_NonFullKernel>;
             return __par_backend_hetero::__parallel_copy_if_static_single_group_submitter<
                 _SizeType, __num_elems_per_item, __wg_size, false, _NonFullKernelName>()(
-                __exec, std::forward<_InRng>(__in_rng),
-                std::forward<_OutRng>(__out_rng), __n, _InitType{}, _ReduceOp{}, __pred, __assign);
+                __exec, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, _InitType{}, _ReduceOp{},
+                __pred, __assign);
         }
     }
 };

 template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Size, typename _GenMask,
           typename _WriteOp, typename _IsUniquePattern>
 auto
-__parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
-                                 _InRng&& __in_rng, _OutRng&& __out_rng, _Size, _GenMask __generate_mask,
-                                 _WriteOp __write_op, _IsUniquePattern __is_unique_pattern)
+__parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
+                                 const _ExecutionPolicy& __exec, _InRng&& __in_rng, _OutRng&& __out_rng, _Size,
+                                 _GenMask __generate_mask, _WriteOp __write_op, _IsUniquePattern __is_unique_pattern)
 {
     using _GenReduceInput = oneapi::dpl::__par_backend_hetero::__gen_count_mask<_GenMask>;
     using _ReduceOp = std::plus<_Size>;
     using _GenScanInput = oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask<_GenMask>;
     using _ScanInputTransform = oneapi::dpl::__par_backend_hetero::__get_zeroth_element;

-    return __parallel_transform_reduce_then_scan(
-        __backend_tag, __exec, std::forward<_InRng>(__in_rng),
-        std::forward<_OutRng>(__out_rng), _GenReduceInput{__generate_mask}, _ReduceOp{}, _GenScanInput{__generate_mask},
-        _ScanInputTransform{}, __write_op, oneapi::dpl::unseq_backend::__no_init_value<_Size>{},
-        /* _Inclusive=*/ std::true_type{}, __is_unique_pattern);
+    return __parallel_transform_reduce_then_scan(__backend_tag, __exec, std::forward<_InRng>(__in_rng),
+                                                 std::forward<_OutRng>(__out_rng), _GenReduceInput{__generate_mask},
+                                                 _ReduceOp{}, _GenScanInput{__generate_mask}, _ScanInputTransform{},
+                                                 __write_op, oneapi::dpl::unseq_backend::__no_init_value<_Size>{},
+                                                 /* _Inclusive=*/ std::true_type{}, __is_unique_pattern);
 }

 template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Size, typename _CreateMaskOp,
@@ -1379,9 +1379,9 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
 template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare,
           typename _IsOpDifference>
 auto
-__parallel_set_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec,
-                                _Range1&& __rng1, _Range2&& __rng2, _Range3&& __result, _Compare __comp,
-                                _IsOpDifference)
+__parallel_set_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
+                                const _ExecutionPolicy& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __result,
+                                _Compare __comp, _IsOpDifference)
 {
     // fill in reduce then scan impl
     using _GenMaskReduce = oneapi::dpl::__par_backend_hetero::__gen_set_mask<_IsOpDifference, _Compare>;
@@ -1469,9 +1469,9 @@ __parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _
 {
     if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec))
     {
-        return __parallel_set_reduce_then_scan(__backend_tag, __exec,
-                                               std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
-                                               std::forward<_Range3>(__result), __comp, __is_op_difference);
+        return __parallel_set_reduce_then_scan(__backend_tag, __exec, std::forward<_Range1>(__rng1),
+                                               std::forward<_Range2>(__rng2), std::forward<_Range3>(__result), __comp,
+                                               __is_op_difference);
     }
     else
     {
@@ -1907,8 +1907,8 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, const _Executi

         // Single WG implementation
         __result = __parallel_find_or_impl_one_wg<__or_tag_check, __find_or_one_wg_kernel_name>()(
-            oneapi::dpl::__internal::__device_backend_tag{}, __exec, __brick_tag,
-            __rng_n, __wgroup_size, __init_value, __pred, std::forward<_Ranges>(__rngs)...);
+            oneapi::dpl::__internal::__device_backend_tag{}, __exec, __brick_tag, __rng_n, __wgroup_size, __init_value,
+            __pred, std::forward<_Ranges>(__rngs)...);
     }
     else
     {
@@ -1920,8 +1920,8 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, const _Executi

         // Multiple WG implementation
         __result = __parallel_find_or_impl_multiple_wgs<__or_tag_check, __find_or_kernel_name>()(
-            oneapi::dpl::__internal::__device_backend_tag{}, __exec, __brick_tag,
-            __rng_n, __n_groups, __wgroup_size, __init_value, __pred, std::forward<_Ranges>(__rngs)...);
+            oneapi::dpl::__internal::__device_backend_tag{}, __exec, __brick_tag, __rng_n, __n_groups, __wgroup_size,
+            __init_value, __pred, std::forward<_Ranges>(__rngs)...);
     }

     if constexpr (__or_tag_check)
@@ -2174,8 +2174,8 @@ class __sort_global_kernel;

 template <typename _ExecutionPolicy, typename _Range, typename _Merge, typename _Compare>
 auto
-__parallel_partial_sort_impl(oneapi::dpl::__internal::__device_backend_tag, const _ExecutionPolicy& __exec, _Range&& __rng,
-                             _Merge __merge, _Compare __comp)
+__parallel_partial_sort_impl(oneapi::dpl::__internal::__device_backend_tag, const _ExecutionPolicy& __exec,
+                             _Range&& __rng, _Merge __merge, _Compare __comp)
 {
     using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>;
     using _GlobalSortKernel =
@@ -2184,8 +2184,7 @@ __parallel_partial_sort_impl(oneapi::dpl::__internal::__device_backend_tag, cons
         oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__sort_copy_back_kernel<_CustomName>>;

     return __parallel_partial_sort_submitter<_GlobalSortKernel, _CopyBackKernel>()(
-        oneapi::dpl::__internal::__device_backend_tag{}, __exec,
-        ::std::forward<_Range>(__rng), __merge, __comp);
+        oneapi::dpl::__internal::__device_backend_tag{}, __exec, ::std::forward<_Range>(__rng), __merge, __comp);
 }

 // ------------------------------------------------------------------------
@@ -2413,9 +2412,9 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _Exe
     if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec))
     {
         auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan(
-            oneapi::dpl::__internal::__device_backend_tag{}, __exec,
-            std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys),
-            std::forward<_Range4>(__out_values), __binary_pred, __binary_op);
+            oneapi::dpl::__internal::__device_backend_tag{}, __exec, std::forward<_Range1>(__keys),
+            std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values),
+            __binary_pred, __binary_op);
         // Because our init type ends up being tuple<std::size_t, ValType>, return the first component which is the write index. Add 1 to return the
         // past-the-end iterator pair of segmented reduction.
         return std::get<0>(__res.get()) + 1;