Skip to content

Commit f58ae41

Browse files
adamfideltimmiesmith
authored andcommitted
Adjust local memory requirement for single-group scan, sort and reduce (#833)
* Adjust local memory requirement for single-group scan, sort and reduce * Remove unused variable * auto -> ::std::size_t * Change floating point multiply to integer divide * Fix underflow in SLM size check in radix sort * Guard single-work-group radix sort with _ONEDPL_USE_SINGLE_GROUP_RADIX_SORT macro set to 0
1 parent 6e196bc commit f58ae41

6 files changed

+13
-6
lines changed

include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h

-1
Original file line numberDiff line numberDiff line change
@@ -517,7 +517,6 @@ struct __parallel_transform_scan_static_single_group_submitter<_Inclusive, _Elem
517517
// This kernel is only launched for sizes less than 2^16
518518
const ::std::uint16_t __item_id = __self_item.get_local_linear_id();
519519
const ::std::uint16_t __subgroup_id = __subgroup.get_group_id();
520-
const ::std::uint16_t __id_in_subgroup = __subgroup.get_local_id();
521520
const ::std::uint16_t __subgroup_size = __subgroup.get_local_linear_range();
522521

523522
#if _ONEDPL_SYCL_SUB_GROUP_LOAD_STORE_PRESENT

include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -694,7 +694,7 @@ __parallel_radix_sort(_ExecutionPolicy&& __exec, _Range&& __in_rng)
694694

695695
const auto __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec);
696696

697-
#if __SYCL_COMPILER_VERSION >= 20230101 //for Intel(R) oneAPI C++ Compiler Classic 2023 and later
697+
#if _ONEDPL_USE_SINGLE_GROUP_RADIX_SORT
698698
//TODO: 1.to reduce number of the kernels; 2.to define work group size in runtime, depending on number of elements
699699
constexpr auto __wg_size = 64;
700700

include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h

+4-2
Original file line numberDiff line numberDiff line change
@@ -128,13 +128,15 @@ struct __subgroup_radix_sort
128128
{
129129
assert(__n <= 1 << (sizeof(uint16_t) * 8)); //the kernel is designed for data size <= 64K
130130

131-
const auto __max_slm_size = __q.get_device().template get_info<sycl::info::device::local_mem_size>();
131+
// Pessimistically only use half of the memory to take into account memory used by compiled kernel
132+
const ::std::size_t __max_slm_size =
133+
__q.get_device().template get_info<sycl::info::device::local_mem_size>() / 2;
132134

133135
const auto __n_uniform = 1 << (::std::uint32_t(log2(__n - 1)) + 1);
134136
const auto __req_slm_size_val = sizeof(_T) * __n_uniform;
135137
const auto __req_slm_size_counters = __counter_buf_sz * sizeof(uint32_t);
136138

137-
return __req_slm_size_val <= __max_slm_size - __req_slm_size_counters; //counters should be placed in SLM
139+
return __req_slm_size_val + __req_slm_size_counters <= __max_slm_size; //counters should be placed in SLM
138140
}
139141

140142
template <typename _KernelName>

include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -314,8 +314,9 @@ __parallel_transform_reduce(_ExecutionPolicy&& __exec, _ReduceOp __reduce_op, _T
314314
// TODO: find a way to generalize getting of reliable work-group size
315315
::std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec);
316316
// change __work_group_size according to local memory limit
317+
// Pessimistically double the memory requirement to take into account memory used by compiled kernel
317318
__work_group_size = oneapi::dpl::__internal::__max_local_allocation_size(::std::forward<_ExecutionPolicy>(__exec),
318-
sizeof(_Tp), __work_group_size);
319+
sizeof(_Tp) * 2, __work_group_size);
319320
if (__n <= 65536 && __work_group_size >= 512)
320321
{
321322
if (__n <= 128)

include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h

+3
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,9 @@
6262
# define _ONEDPL_SYCL_REQD_SUB_GROUP_SIZE(SIZE) intel::reqd_sub_group_size(SIZE)
6363
#endif
6464

65+
// TODO: re-enable when sort test passes for all devices of interest
66+
#define _ONEDPL_USE_SINGLE_GROUP_RADIX_SORT 0
67+
6568
namespace __dpl_sycl
6669
{
6770

include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h

+3-1
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,9 @@ __pattern_transform_scan_base_impl(_ExecutionPolicy&& __exec, _Iterator1 __first
120120
if ((__n_uniform & (__n_uniform - 1)) != 0)
121121
__n_uniform = __par_backend_hetero::__dpl_bit_floor(__n) << 1;
122122

123-
const auto __max_slm_size = __exec.queue().get_device().template get_info<sycl::info::device::local_mem_size>();
123+
// Pessimistically only use half of the memory to take into account memory used by compiled kernel
124+
const ::std::size_t __max_slm_size =
125+
__exec.queue().get_device().template get_info<sycl::info::device::local_mem_size>() / 2;
124126
const auto __req_slm_size = sizeof(_Type) * __n_uniform;
125127

126128
constexpr int __single_group_upper_limit = 16384;

0 commit comments

Comments
 (0)