@@ -282,15 +282,15 @@ struct __parallel_reduce_then_scan_reduce_submitter
282
282
{
283
283
// Step 1 - SubGroupReduce is expected to perform sub-group reductions to global memory
284
284
// input buffer
285
- template <typename _ExecutionPolicy, typename _InRng, typename _TmpStorageAcc>
285
+ template <typename _InRng, typename _TmpStorageAcc>
286
286
sycl::event
287
- operator ()(_ExecutionPolicy&& __exec , const sycl::nd_range<1 > __nd_range, _InRng&& __in_rng,
287
+ operator ()(sycl::queue __q , const sycl::nd_range<1 > __nd_range, _InRng&& __in_rng,
288
288
_TmpStorageAcc& __scratch_container, const sycl::event& __prior_event,
289
289
const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item,
290
290
const std::size_t __block_num, const sycl::kernel& __reduce_kernel) const
291
291
{
292
292
using _InitValueType = typename _InitType::__value_type;
293
- return __exec. queue () .submit ([&, this ](sycl::handler& __cgh) {
293
+ return __q .submit ([&, this ](sycl::handler& __cgh) {
294
294
__dpl_sycl::__local_accessor<_InitValueType> __sub_group_partials (__num_sub_groups_local, __cgh);
295
295
__cgh.depends_on (__prior_event);
296
296
oneapi::dpl::__ranges::__require_access (__cgh, __in_rng);
@@ -432,17 +432,17 @@ struct __parallel_reduce_then_scan_scan_submitter
432
432
__tmp_ptr[__num_sub_groups_global + 1 - (__block_num % 2 )] = __block_carry_out;
433
433
}
434
434
435
- template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _TmpStorageAcc>
435
+ template <typename _InRng, typename _OutRng, typename _TmpStorageAcc>
436
436
sycl::event
437
- operator ()(_ExecutionPolicy&& __exec , const sycl::nd_range<1 > __nd_range, _InRng&& __in_rng, _OutRng&& __out_rng,
437
+ operator ()(sycl::queue __q , const sycl::nd_range<1 > __nd_range, _InRng&& __in_rng, _OutRng&& __out_rng,
438
438
_TmpStorageAcc& __scratch_container, const sycl::event& __prior_event,
439
439
const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item,
440
440
const std::size_t __block_num, const sycl::kernel& __scan_kernel) const
441
441
{
442
442
std::uint32_t __inputs_in_block = std::min (__n - __block_num * __max_block_size, std::size_t {__max_block_size});
443
443
std::uint32_t __active_groups = oneapi::dpl::__internal::__dpl_ceiling_div (
444
444
__inputs_in_block, __inputs_per_sub_group * __num_sub_groups_local);
445
- return __exec. queue () .submit ([&, this ](sycl::handler& __cgh) {
445
+ return __q .submit ([&, this ](sycl::handler& __cgh) {
446
446
// We need __num_sub_groups_local + 1 temporary SLM locations to store intermediate results:
447
447
// __num_sub_groups_local for each sub-group partial from the reduce kernel +
448
448
// 1 element for the accumulated block-local carry-in from previous groups in the block
@@ -845,6 +845,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_
845
845
__write_op,
846
846
__init};
847
847
sycl::event __event;
848
+ sycl::queue __q = __exec.queue ();
848
849
// Data is processed in 2-kernel blocks to allow contiguous input segment to persist in LLC between the first and second kernel for accelerators
849
850
// with sufficiently large L2 / L3 caches.
850
851
for (std::size_t __b = 0 ; __b < __num_blocks; ++__b)
@@ -857,10 +858,10 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_
857
858
auto __local_range = sycl::range<1 >(__work_group_size);
858
859
auto __kernel_nd_range = sycl::nd_range<1 >(__global_range, __local_range);
859
860
// 1. Reduce step - Reduce assigned input per sub-group, compute and apply intra-wg carries, and write to global memory.
860
- __event = __reduce_submitter (__exec , __kernel_nd_range, __in_rng, __result_and_scratch, __event,
861
+ __event = __reduce_submitter (__q , __kernel_nd_range, __in_rng, __result_and_scratch, __event,
861
862
__inputs_per_sub_group, __inputs_per_item, __b, __reduce_kernel);
862
863
// 2. Scan step - Compute intra-wg carries, determine sub-group carry-ins, and perform full input block scan.
863
- __event = __scan_submitter (__exec , __kernel_nd_range, __in_rng, __out_rng, __result_and_scratch, __event,
864
+ __event = __scan_submitter (__q , __kernel_nd_range, __in_rng, __out_rng, __result_and_scratch, __event,
864
865
__inputs_per_sub_group, __inputs_per_item, __b, __scan_kernel);
865
866
__inputs_remaining -= std::min (__inputs_remaining, __block_size);
866
867
// We only need to resize these parameters prior to the last block as it is the only non-full case.
0 commit comments