Adjust local memory requirement for single-group scan, sort and reduce (#833)

adamfidel · timmiesmith · commit f58ae41a61a8 · 2023-03-06T16:57:52.000-06:00
* Adjust local memory requirement for single-group scan, sort and reduce

* Remove unused variable

* auto -&gt; ::std::size_t

* Change floating point multiply to integer divide

* Fix underflow in SLM size check in radix sort

* Guard single-work-group radix sort with _ONEDPL_USE_SINGLE_GROUP_RADIX_SORT macro set to 0
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -517,7 +517,6 @@ struct __parallel_transform_scan_static_single_group_submitter<_Inclusive, _Elem
                     // This kernel is only launched for sizes less than 2^16
                     const ::std::uint16_t __item_id = __self_item.get_local_linear_id();
                     const ::std::uint16_t __subgroup_id = __subgroup.get_group_id();
-                    const ::std::uint16_t __id_in_subgroup = __subgroup.get_local_id();
                     const ::std::uint16_t __subgroup_size = __subgroup.get_local_linear_range();
 
 #if _ONEDPL_SYCL_SUB_GROUP_LOAD_STORE_PRESENT
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h
@@ -694,7 +694,7 @@ __parallel_radix_sort(_ExecutionPolicy&& __exec, _Range&& __in_rng)
 
     const auto __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec);
 
-#if __SYCL_COMPILER_VERSION >= 20230101 //for Intel(R) oneAPI C++ Compiler Classic 2023 and later
+#if _ONEDPL_USE_SINGLE_GROUP_RADIX_SORT
     //TODO: 1.to reduce number of the kernels; 2.to define work group size in runtime, depending on number of elements
     constexpr auto __wg_size = 64;
 
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h
@@ -128,13 +128,15 @@ struct __subgroup_radix_sort
     {
         assert(__n <= 1 << (sizeof(uint16_t) * 8)); //the kernel is designed for data size <= 64K
 
-        const auto __max_slm_size = __q.get_device().template get_info<sycl::info::device::local_mem_size>();
+        // Pessimistically only use half of the memory to take into account memory used by compiled kernel
+        const ::std::size_t __max_slm_size =
+            __q.get_device().template get_info<sycl::info::device::local_mem_size>() / 2;
 
         const auto __n_uniform = 1 << (::std::uint32_t(log2(__n - 1)) + 1);
         const auto __req_slm_size_val = sizeof(_T) * __n_uniform;
         const auto __req_slm_size_counters = __counter_buf_sz * sizeof(uint32_t);
 
-        return __req_slm_size_val <= __max_slm_size - __req_slm_size_counters; //counters should be placed in SLM
+        return __req_slm_size_val + __req_slm_size_counters <= __max_slm_size; //counters should be placed in SLM
     }
 
     template <typename _KernelName>
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h
@@ -314,8 +314,9 @@ __parallel_transform_reduce(_ExecutionPolicy&& __exec, _ReduceOp __reduce_op, _T
     // TODO: find a way to generalize getting of reliable work-group size
     ::std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec);
     // change __work_group_size according to local memory limit
+    // Pessimistically double the memory requirement to take into account memory used by compiled kernel
     __work_group_size = oneapi::dpl::__internal::__max_local_allocation_size(::std::forward<_ExecutionPolicy>(__exec),
-                                                                             sizeof(_Tp), __work_group_size);
+                                                                             sizeof(_Tp) * 2, __work_group_size);
     if (__n <= 65536 && __work_group_size >= 512)
     {
         if (__n <= 128)
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h
@@ -62,6 +62,9 @@
 #    define _ONEDPL_SYCL_REQD_SUB_GROUP_SIZE(SIZE) intel::reqd_sub_group_size(SIZE)
 #endif
 
+// TODO: re-enable when sort test passes for all devices of interest
+#define _ONEDPL_USE_SINGLE_GROUP_RADIX_SORT 0
+
 namespace __dpl_sycl
 {
 
diff --git a/include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h
@@ -120,7 +120,9 @@ __pattern_transform_scan_base_impl(_ExecutionPolicy&& __exec, _Iterator1 __first
     if ((__n_uniform & (__n_uniform - 1)) != 0)
         __n_uniform = __par_backend_hetero::__dpl_bit_floor(__n) << 1;
 
-    const auto __max_slm_size = __exec.queue().get_device().template get_info<sycl::info::device::local_mem_size>();
+    // Pessimistically only use half of the memory to take into account memory used by compiled kernel
+    const ::std::size_t __max_slm_size =
+        __exec.queue().get_device().template get_info<sycl::info::device::local_mem_size>() / 2;
     const auto __req_slm_size = sizeof(_Type) * __n_uniform;
 
     constexpr int __single_group_upper_limit = 16384;

Original file line number	Diff line number	Diff line change
`@@ -128,13 +128,15 @@ struct __subgroup_radix_sort`
`128`	`128`	`{`
`129`	`129`	`assert(__n <= 1 << (sizeof(uint16_t) * 8)); //the kernel is designed for data size <= 64K`
`130`	`130`
`131`		`- const auto __max_slm_size = __q.get_device().template get_info<sycl::info::device::local_mem_size>();`
	`131`	`+ // Pessimistically only use half of the memory to take into account memory used by compiled kernel`
	`132`	`+ const ::std::size_t __max_slm_size =`
	`133`	`+ __q.get_device().template get_info<sycl::info::device::local_mem_size>() / 2;`
`132`	`134`
`133`	`135`	`const auto __n_uniform = 1 << (::std::uint32_t(log2(__n - 1)) + 1);`
`134`	`136`	`const auto __req_slm_size_val = sizeof(_T) * __n_uniform;`
`135`	`137`	`const auto __req_slm_size_counters = __counter_buf_sz * sizeof(uint32_t);`
`136`	`138`
`137`		`- return __req_slm_size_val <= __max_slm_size - __req_slm_size_counters; //counters should be placed in SLM`
	`139`	`+ return __req_slm_size_val + __req_slm_size_counters <= __max_slm_size; //counters should be placed in SLM`
`138`	`140`	`}`
`139`	`141`
`140`	`142`	`template <typename _KernelName>`
Original file line number	Diff line number	Diff line change
`@@ -62,6 +62,9 @@`
`62`	`62`	`# define _ONEDPL_SYCL_REQD_SUB_GROUP_SIZE(SIZE) intel::reqd_sub_group_size(SIZE)`
`63`	`63`	`#endif`
`64`	`64`
	`65`	`+// TODO: re-enable when sort test passes for all devices of interest`
	`66`	`+#define _ONEDPL_USE_SINGLE_GROUP_RADIX_SORT 0`
	`67`	`+`
`65`	`68`	`namespace __dpl_sycl`
`66`	`69`	`{`
`67`	`70`