Skip to content

Commit 560480c

Browse files
committed
Add fix for duplicate kernel names due to policy cv-ref
Signed-off-by: Matthew Michel <[email protected]>
1 parent 4766dae commit 560480c

File tree

1 file changed

+9
-8
lines changed

1 file changed

+9
-8
lines changed

include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h

+9-8
Original file line numberDiff line numberDiff line change
@@ -282,15 +282,15 @@ struct __parallel_reduce_then_scan_reduce_submitter
282282
{
283283
// Step 1 - SubGroupReduce is expected to perform sub-group reductions to global memory
284284
// input buffer
285-
template <typename _ExecutionPolicy, typename _InRng, typename _TmpStorageAcc>
285+
template <typename _InRng, typename _TmpStorageAcc>
286286
sycl::event
287-
operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng,
287+
operator()(sycl::queue __q, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng,
288288
_TmpStorageAcc& __scratch_container, const sycl::event& __prior_event,
289289
const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item,
290290
const std::size_t __block_num, const sycl::kernel& __reduce_kernel) const
291291
{
292292
using _InitValueType = typename _InitType::__value_type;
293-
return __exec.queue().submit([&, this](sycl::handler& __cgh) {
293+
return __q.submit([&, this](sycl::handler& __cgh) {
294294
__dpl_sycl::__local_accessor<_InitValueType> __sub_group_partials(__num_sub_groups_local, __cgh);
295295
__cgh.depends_on(__prior_event);
296296
oneapi::dpl::__ranges::__require_access(__cgh, __in_rng);
@@ -432,17 +432,17 @@ struct __parallel_reduce_then_scan_scan_submitter
432432
__tmp_ptr[__num_sub_groups_global + 1 - (__block_num % 2)] = __block_carry_out;
433433
}
434434

435-
template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _TmpStorageAcc>
435+
template <typename _InRng, typename _OutRng, typename _TmpStorageAcc>
436436
sycl::event
437-
operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _OutRng&& __out_rng,
437+
operator()(sycl::queue __q, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _OutRng&& __out_rng,
438438
_TmpStorageAcc& __scratch_container, const sycl::event& __prior_event,
439439
const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item,
440440
const std::size_t __block_num, const sycl::kernel& __scan_kernel) const
441441
{
442442
std::uint32_t __inputs_in_block = std::min(__n - __block_num * __max_block_size, std::size_t{__max_block_size});
443443
std::uint32_t __active_groups = oneapi::dpl::__internal::__dpl_ceiling_div(
444444
__inputs_in_block, __inputs_per_sub_group * __num_sub_groups_local);
445-
return __exec.queue().submit([&, this](sycl::handler& __cgh) {
445+
return __q.submit([&, this](sycl::handler& __cgh) {
446446
// We need __num_sub_groups_local + 1 temporary SLM locations to store intermediate results:
447447
// __num_sub_groups_local for each sub-group partial from the reduce kernel +
448448
// 1 element for the accumulated block-local carry-in from previous groups in the block
@@ -845,6 +845,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_
845845
__write_op,
846846
__init};
847847
sycl::event __event;
848+
sycl::queue __q = __exec.queue();
848849
// Data is processed in 2-kernel blocks to allow contiguous input segment to persist in LLC between the first and second kernel for accelerators
849850
// with sufficiently large L2 / L3 caches.
850851
for (std::size_t __b = 0; __b < __num_blocks; ++__b)
@@ -857,10 +858,10 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_
857858
auto __local_range = sycl::range<1>(__work_group_size);
858859
auto __kernel_nd_range = sycl::nd_range<1>(__global_range, __local_range);
859860
// 1. Reduce step - Reduce assigned input per sub-group, compute and apply intra-wg carries, and write to global memory.
860-
__event = __reduce_submitter(__exec, __kernel_nd_range, __in_rng, __result_and_scratch, __event,
861+
__event = __reduce_submitter(__q, __kernel_nd_range, __in_rng, __result_and_scratch, __event,
861862
__inputs_per_sub_group, __inputs_per_item, __b, __reduce_kernel);
862863
// 2. Scan step - Compute intra-wg carries, determine sub-group carry-ins, and perform full input block scan.
863-
__event = __scan_submitter(__exec, __kernel_nd_range, __in_rng, __out_rng, __result_and_scratch, __event,
864+
__event = __scan_submitter(__q, __kernel_nd_range, __in_rng, __out_rng, __result_and_scratch, __event,
864865
__inputs_per_sub_group, __inputs_per_item, __b, __scan_kernel);
865866
__inputs_remaining -= std::min(__inputs_remaining, __block_size);
866867
// We only need to resize these parameters prior to the last block as it is the only non-full case.

0 commit comments

Comments
 (0)