@@ -845,26 +845,26 @@ struct __lazy_load_transform_op
     }
 };
 
-template <std::uint16_t __vec_size>
+template <std::uint8_t __vec_size>
 struct __vector_load
 {
-    static_assert(__vec_size <= 4);
+    static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported");
     std::size_t __n;
     template <typename _IdxType, typename _LoadOp, typename... _Acc>
     void
     operator()(std::true_type, _IdxType __start_idx, _LoadOp __load_op, _Acc... __acc) const
     {
         _ONEDPL_PRAGMA_UNROLL
-        for (std::uint16_t __i = 0; __i < __vec_size; ++__i)
+        for (std::uint8_t __i = 0; __i < __vec_size; ++__i)
             __load_op(__start_idx + __i, __i, __acc...);
     }
 
     template <typename _IdxType, typename _LoadOp, typename... _Acc>
     void
     operator()(std::false_type, _IdxType __start_idx, _LoadOp __load_op, _Acc... __acc) const
     {
-        std::uint16_t __elements = std::min(__vec_size, decltype(__vec_size)(__n - __start_idx));
-        for (std::uint16_t __i = 0; __i < __elements; ++__i)
+        std::uint8_t __elements = std::min(std::size_t{__vec_size}, std::size_t{__n - __start_idx});
+        for (std::uint8_t __i = 0; __i < __elements; ++__i)
             __load_op(__start_idx + __i, __i, __acc...);
     }
 };
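
Side note on the remainder computation above: besides shrinking the index type, the patch stops narrowing __n - __start_idx into the index type before std::min and instead compares in std::size_t. With an 8-bit index type the old decltype(__vec_size)(...) cast would wrap at 256 rather than 65536, so doing the comparison in std::size_t keeps the clamp correct regardless of how many elements remain. A small standalone sketch of that narrowing hazard (names and values below are hypothetical, not part of the patch):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>

int main()
{
    constexpr std::uint8_t vec_size = 4;
    const std::size_t n = 1024;        // hypothetical total number of elements
    const std::size_t start_idx = 768; // hypothetical start index of the last block

    // Old pattern: the remainder is narrowed to the index type before std::min.
    // Here (n - start_idx) == 256 wraps to 0 as std::uint8_t, so the clamp would
    // report zero elements left even though 256 remain.
    const std::uint8_t wrapped = static_cast<std::uint8_t>(n - start_idx);
    std::cout << +std::min(vec_size, wrapped) << '\n'; // prints 0

    // New pattern: compare in std::size_t and only then store the (always small) result.
    const std::uint8_t elements = std::min(std::size_t{vec_size}, std::size_t{n - start_idx});
    std::cout << +elements << '\n'; // prints 4
}
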
@@ -891,20 +891,19 @@ struct __lazy_store_transform_op
     }
 };
 
-template <std::uint16_t __vec_size>
+template <std::uint8_t __vec_size>
 struct __vector_walk
 {
-    static_assert(__vec_size <= 4);
+    static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported");
     std::size_t __n;
 
     template <typename _IdxType, typename _WalkFunction, typename... _Rngs>
     void
     operator()(std::true_type, _IdxType __idx, _WalkFunction __f, _Rngs&&... __rngs) const
     {
         _ONEDPL_PRAGMA_UNROLL
-        for (std::uint16_t __i = 0; __i < __vec_size; ++__i)
+        for (std::uint8_t __i = 0; __i < __vec_size; ++__i)
         {
-
             __f(__rngs[__idx + __i]...);
         }
     }
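
For readers unfamiliar with the tag dispatch used by __vector_walk: the std::true_type overload is taken when a full vector of __vec_size elements is guaranteed, so the loop has a compile-time trip count and can be unrolled without a bounds check, while the std::false_type overload clamps against __n. A simplified, self-contained sketch of the same pattern (every name below is illustrative only, not oneDPL API):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <type_traits>
#include <vector>

// Simplified stand-in for __vector_walk; the single-range signature is a deliberate simplification.
template <std::uint8_t vec_size>
struct vector_walk_sketch
{
    std::size_t n;

    // Full block: the caller guarantees vec_size valid elements, so no bounds check is
    // needed and the trip count is a compile-time constant the compiler can unroll.
    template <typename F, typename Rng>
    void
    operator()(std::true_type, std::size_t idx, F f, Rng& rng) const
    {
        for (std::uint8_t i = 0; i < vec_size; ++i)
            f(rng[idx + i]);
    }

    // Partial block at the end of the range: clamp the element count against n first.
    template <typename F, typename Rng>
    void
    operator()(std::false_type, std::size_t idx, F f, Rng& rng) const
    {
        std::uint8_t elements = std::min(std::size_t{vec_size}, std::size_t{n - idx});
        for (std::uint8_t i = 0; i < elements; ++i)
            f(rng[idx + i]);
    }
};

int main()
{
    std::vector<int> data(10, 1);
    vector_walk_sketch<4> walk{data.size()};
    auto negate = [](int& x) { x = -x; };

    // Indices 0..3 and 4..7 are full blocks; indices 8..9 form the partial tail.
    walk(std::true_type{}, 0, negate, data);
    walk(std::true_type{}, 4, negate, data);
    walk(std::false_type{}, 8, negate, data);

    for (int x : data)
        std::cout << x << ' '; // prints -1 ten times
    std::cout << '\n';
}
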
@@ -914,61 +913,63 @@ struct __vector_walk
     void
     operator()(std::false_type, _IdxType __idx, _WalkFunction __f, _Rngs&&... __rngs) const
     {
-        std::uint16_t __elements = std::min(__vec_size, decltype(__vec_size)(__n - __idx));
-        for (std::uint16_t __i = 0; __i < __elements; ++__i)
+        std::uint8_t __elements = std::min(std::size_t{__vec_size}, std::size_t{__n - __idx});
+        for (std::uint8_t __i = 0; __i < __elements; ++__i)
         {
             __f(__rngs[__idx + __i]...);
         }
     }
 };
 
-template <std::uint16_t __vec_size>
+template <std::uint8_t __vec_size>
 struct __vector_store
 {
+    static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported");
     std::size_t __n;
-    static_assert(__vec_size <= 4);
-    template <typename _IdxType, typename _StoreOp, typename... _Acc>
+
+    template <typename _IdxType, typename _StoreOp, typename... _Rngs>
     void
-    operator()(std::true_type, _IdxType __start_idx, _StoreOp __store_op, _Acc... __acc) const
+    operator()(std::true_type, _IdxType __start_idx, _StoreOp __store_op, _Rngs... __rngs) const
     {
         _ONEDPL_PRAGMA_UNROLL
-        for (std::uint16_t __i = 0; __i < __vec_size; ++__i)
-            __store_op(__i, __start_idx + __i, __acc...);
+        for (std::uint8_t __i = 0; __i < __vec_size; ++__i)
+            __store_op(__i, __start_idx + __i, __rngs...);
     }
-    template <typename _IdxType, typename _StoreOp, typename... _Acc>
+    template <typename _IdxType, typename _StoreOp, typename... _Rngs>
     void
-    operator()(std::false_type, _IdxType __start_idx, _StoreOp __store_op, _Acc... __acc) const
+    operator()(std::false_type, _IdxType __start_idx, _StoreOp __store_op, _Rngs... __rngs) const
     {
-        std::uint16_t __elements = std::min(__vec_size, decltype(__vec_size)(__n - __start_idx));
-        for (std::uint16_t __i = 0; __i < __elements; ++__i)
-            __store_op(__i, __start_idx + __i, __acc...);
+        std::uint8_t __elements = std::min(std::size_t{__vec_size}, std::size_t{__n - __start_idx});
+        for (std::uint8_t __i = 0; __i < __elements; ++__i)
+            __store_op(__i, __start_idx + __i, __rngs...);
     }
 };
 
-template <std::uint16_t __vec_size>
+template <std::uint8_t __vec_size>
 struct __vector_reverse
 {
+    static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported");
     template <typename _IsFull, typename _Idx, typename _Array>
     void
     operator()(_IsFull __is_full, const _Idx __elements_to_process, _Array __array) const
     {
         if constexpr (__is_full)
         {
             _ONEDPL_PRAGMA_UNROLL
-            for (std::uint16_t __i = 0; __i != __vec_size / 2; ++__i)
+            for (std::uint8_t __i = 0; __i < __vec_size / 2; ++__i)
                 std::swap(__array[__i].__v, __array[__vec_size - __i - 1].__v);
         }
         else
         {
-            for (std::uint16_t __i = 0; __i != __elements_to_process / 2; ++__i)
+            for (std::uint8_t __i = 0; __i < __elements_to_process / 2; ++__i)
                 std::swap(__array[__i].__v, __array[__elements_to_process - __i - 1].__v);
         }
     }
 };
 
 // Processes a loop with a given stride. Intended to be used with sub-group / work-group strides for good memory access patterns
 // (potentially with vectorization)
-template <std::uint16_t __num_strides>
+template <std::uint8_t __num_strides>
 struct __strided_loop
 {
     std::size_t __n;
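
__vector_reverse above swaps mirrored element pairs, so reversing __vec_size elements takes only __vec_size / 2 iterations; the partial path applies the same idea to the first __elements_to_process entries, and the switch from != to < in the loop condition is simply the more defensive form of the bound check. A tiny standalone illustration of the half-swap idea (the real code swaps wrapped elements through their .__v member; names here are illustrative):

#include <array>
#include <cstdint>
#include <iostream>
#include <utility>

int main()
{
    constexpr std::uint8_t vec_size = 4;
    std::array<int, vec_size> reg{1, 2, 3, 4}; // stands in for the per-work-item element array

    // Same idea as the full path of __vector_reverse: swap mirrored pairs,
    // so reversing vec_size elements needs only vec_size / 2 iterations.
    for (std::uint8_t i = 0; i < vec_size / 2; ++i)
        std::swap(reg[i], reg[vec_size - i - 1]);

    for (int x : reg)
        std::cout << x << ' '; // prints 4 3 2 1
    std::cout << '\n';
}
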
@@ -978,7 +979,7 @@ struct __strided_loop
                _Ranges&&... __rngs) const
     {
         _ONEDPL_PRAGMA_UNROLL
-        for (std::uint16_t __i = 0; __i < __num_strides; ++__i)
+        for (std::uint8_t __i = 0; __i < __num_strides; ++__i)
         {
             __loop_body_op(std::true_type{}, __idx, __rngs...);
             __idx += __stride;
@@ -992,7 +993,7 @@ struct __strided_loop
         // Constrain the number of iterations as much as possible and then pass the knowledge that we are not a full loop to the body operation
         const std::uint8_t __adjusted_iters_per_work_item =
             oneapi::dpl::__internal::__dpl_ceiling_div(__n - __idx, __stride);
-        for (std::uint16_t __i = 0; __i < __adjusted_iters_per_work_item; ++__i)
+        for (std::uint8_t __i = 0; __i < __adjusted_iters_per_work_item; ++__i)
         {
             __loop_body_op(std::false_type{}, __idx, __rngs...);
             __idx += __stride;
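
The non-full path of __strided_loop bounds the loop with __dpl_ceiling_div(__n - __idx, __stride), i.e. the number of strided steps needed to cover the remaining elements, with the last step possibly partial. A small sketch of that computation (the ceiling_div helper below is a presumed equivalent of oneapi::dpl::__internal::__dpl_ceiling_div, written out here only for illustration):

#include <cstddef>
#include <cstdint>
#include <iostream>

// Presumed equivalent of __dpl_ceiling_div: round-up integer division.
constexpr std::size_t ceiling_div(std::size_t num, std::size_t den)
{
    return (num + den - 1) / den;
}

int main()
{
    const std::size_t n = 1000;    // hypothetical element count
    const std::size_t idx = 960;   // hypothetical work-item starting index
    const std::size_t stride = 16; // hypothetical sub-group / work-group stride

    // 40 remaining elements at stride 16 -> 3 strided iterations (the last one partial),
    // matching how __adjusted_iters_per_work_item bounds the non-full loop.
    const std::uint8_t iters = ceiling_div(n - idx, stride);
    std::cout << +iters << '\n'; // prints 3
}
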