Skip to content

Commit 63031ab

Browse files
committed
General code cleanup
Signed-off-by: Matthew Michel <[email protected]>
1 parent eb90206 commit 63031ab

File tree

6 files changed

+137
-134
lines changed

6 files changed

+137
-134
lines changed

include/oneapi/dpl/internal/async_impl/async_impl_hetero.h

+4 -3
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,8 @@ __pattern_walk1_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For
4646

4747
auto __future_obj = oneapi::dpl::__par_backend_hetero::__parallel_for(
4848
_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
49-
unseq_backend::walk1_vector_or_scalar<_ExecutionPolicy, _Function, decltype(__view)>{{}, __f, std::size_t(__n)},
49+
unseq_backend::walk1_vector_or_scalar<_ExecutionPolicy, _Function, decltype(__view)>{
50+
{}, __f, static_cast<std::size_t>(__n)},
5051
__n, __view);
5152
return __future_obj;
5253
}
@@ -74,7 +75,7 @@ __pattern_walk2_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For
7475
auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for(
7576
_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
7677
unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__view1), decltype(__view2)>{
77-
{}, __f, std::size_t(__n)},
78+
{}, __f, static_cast<std::size_t>(__n)},
7879
__n, __view1, __view2);
7980

8081
return __future.__make_future(__first2 + __n);
@@ -106,7 +107,7 @@ __pattern_walk3_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For
106107
auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for(
107108
_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
108109
unseq_backend::walk3_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__view1), decltype(__view2),
109-
decltype(__view3)>{{}, __f, size_t(__n)},
110+
decltype(__view3)>{{}, __f, static_cast<size_t>(__n)},
110111
__n, __view1, __view2, __view3);
111112

112113
return __future.__make_future(__first3 + __n);

include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h

+10 -10
Original file line number | Diff line number | Diff line change
@@ -111,7 +111,7 @@ __pattern_walk2(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ForwardIt
111111
auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for(
112112
_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
113113
unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__view1), decltype(__view2)>{
114-
{}, __f, size_t(__n)},
114+
{}, __f, static_cast<std::size_t>(__n)},
115115
__n, __view1, __view2);
116116

117117
// Call no wait, wait or deferrable wait depending on _WaitMode
@@ -157,7 +157,7 @@ __pattern_swap(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Forw
157157
auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for(
158158
_BackendTag{}, std::forward<_ExecutionPolicy>(__exec),
159159
unseq_backend::__brick_swap<_ExecutionPolicy, _Function, decltype(__view1), decltype(__view2)>{
160-
{}, __f, size_t(__n)},
160+
{}, __f, static_cast<std::size_t>(__n)},
161161
__n, __view1, __view2);
162162
__future.wait(__par_backend_hetero::__deferrable_mode{});
163163
return __first2 + __n;
@@ -192,9 +192,9 @@ __pattern_walk3(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ForwardIt
192192
auto __view3 = __buf3.all_view();
193193

194194
oneapi::dpl::__par_backend_hetero::__parallel_for(
195-
_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
195+
_BackendTag{}, std::forward<_ExecutionPolicy>(__exec),
196196
unseq_backend::walk3_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__view1), decltype(__view2),
197-
decltype(__view3)>{{}, __f, size_t(__n)},
197+
decltype(__view3)>{{}, __f, static_cast<std::size_t>(__n)},
198198
__n, __view1, __view2, __view3)
199199
.__deferrable_wait();
200200

@@ -1597,8 +1597,8 @@ __pattern_reverse(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterato
15971597
auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator>();
15981598
auto __buf = __keep(__first, __last);
15991599
oneapi::dpl::__par_backend_hetero::__parallel_for(
1600-
_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
1601-
unseq_backend::__reverse_functor<typename ::std::iterator_traits<_Iterator>::difference_type,
1600+
_BackendTag{}, std::forward<_ExecutionPolicy>(__exec),
1601+
unseq_backend::__reverse_functor<typename std::iterator_traits<_Iterator>::difference_type,
16021602
decltype(__buf.all_view())>{{}, __n},
16031603
__n / 2, __buf.all_view())
16041604
.__deferrable_wait();
@@ -1626,8 +1626,8 @@ __pattern_reverse_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Bi
16261626
auto __view1 = __buf1.all_view();
16271627
auto __view2 = __buf2.all_view();
16281628
oneapi::dpl::__par_backend_hetero::__parallel_for(
1629-
_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
1630-
unseq_backend::__reverse_copy<typename ::std::iterator_traits<_BidirectionalIterator>::difference_type,
1629+
_BackendTag{}, std::forward<_ExecutionPolicy>(__exec),
1630+
unseq_backend::__reverse_copy<typename std::iterator_traits<_BidirectionalIterator>::difference_type,
16311631
decltype(__view1), decltype(__view2)>{{}, __n},
16321632
__n, __view1, __view2)
16331633
.__deferrable_wait();
@@ -1670,7 +1670,7 @@ __pattern_rotate(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator
16701670
const auto __shift = __new_first - __first;
16711671
oneapi::dpl::__par_backend_hetero::__parallel_for(
16721672
_BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__rotate_wrapper>(__exec),
1673-
unseq_backend::__rotate_copy<typename ::std::iterator_traits<_Iterator>::difference_type, decltype(__view),
1673+
unseq_backend::__rotate_copy<typename std::iterator_traits<_Iterator>::difference_type, decltype(__view),
16741674
decltype(__temp_rng_w)>{{}, __n, __shift},
16751675
__n, __view, __temp_rng_w);
16761676

@@ -1683,7 +1683,7 @@ __pattern_rotate(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator
16831683
auto __brick = unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__temp_rng_rw),
16841684
decltype(__buf.all_view())>{
16851685
{}, _Function{}, static_cast<std::size_t>(__n)};
1686-
oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __brick,
1686+
oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __brick,
16871687
__n, __temp_rng_rw, __buf.all_view())
16881688
.__deferrable_wait();
16891689

include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h

-2
Original file line number | Diff line number | Diff line change
@@ -28,12 +28,10 @@
2828
#include <cmath>
2929
#include <limits>
3030
#include <cstdint>
31-
#include <tuple>
3231

3332
#include "../../iterator_impl.h"
3433
#include "../../execution_impl.h"
3534
#include "../../utils_ranges.h"
36-
#include "../../utils.h"
3735

3836
#include "sycl_defs.h"
3937
#include "parallel_backend_sycl_utils.h"

include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h

+5 -5
Original file line number | Diff line number | Diff line change
@@ -60,7 +60,7 @@ struct __parallel_for_small_submitter<__internal::__optional_kernel_name<_Name..
6060
{
6161
assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0);
6262
_PRINT_INFO_IN_DEBUG_MODE(__exec);
63-
auto __event = __exec.queue().submit([&__rngs..., &__brick, __count](sycl::handler& __cgh) {
63+
auto __event = __exec.queue().submit([__rngs..., __brick, __count](sycl::handler& __cgh) {
6464
//get an access to data under SYCL buffer:
6565
oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
6666

@@ -142,20 +142,20 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name..
142142
{
143143
assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0);
144144
_PRINT_INFO_IN_DEBUG_MODE(__exec);
145-
auto __event = __exec.queue().submit([&__rngs..., &__brick, &__exec, __count](sycl::handler& __cgh) {
145+
auto __event = __exec.queue().submit([__rngs..., __brick, __exec, __count](sycl::handler& __cgh) {
146146
//get an access to data under SYCL buffer:
147147
oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
148148
constexpr static std::uint16_t __iters_per_work_item = _Fp::__preferred_iters_per_item;
149149
const std::size_t __work_group_size =
150150
oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size);
151151
const std::size_t __num_groups =
152-
oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * decltype(__brick)::__preferred_vector_size * __iters_per_work_item));
152+
oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * _Fp::__preferred_vector_size * __iters_per_work_item));
153153
const std::size_t __num_items = __num_groups * __work_group_size;
154154
__cgh.parallel_for<_Name...>(
155155
sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)),
156156
[=](sycl::nd_item</*dim=*/1> __item) {
157157
auto [__idx, __stride, __is_full] =
158-
__stride_recommender(__item, __count, __iters_per_work_item, decltype(__brick)::__preferred_vector_size, __work_group_size);
158+
__stride_recommender(__item, __count, __iters_per_work_item, _Fp::__preferred_vector_size, __work_group_size);
159159
__strided_loop<__iters_per_work_item> __execute_loop{static_cast<std::size_t>(__count)};
160160
if (__is_full)
161161
{
@@ -190,7 +190,7 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&&
190190
// Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a
191191
// single kernel that worsen performance for small cases. If the number of iterations of the large submitter is 1,
192192
// then only compile the basic kernel as the two versions are effectively the same.
193-
if constexpr (_Fp::__preferred_iters_per_item > 1)
193+
if constexpr (_Fp::__preferred_iters_per_item > 1 || _Fp::__preferred_vector_size > 1)
194194
{
195195
if (__count >= __large_submitter::__estimate_best_start_size(__exec, __brick))
196196
{

include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h

+29 -28
Original file line number | Diff line number | Diff line change
@@ -845,26 +845,26 @@ struct __lazy_load_transform_op
845845
}
846846
};
847847

848-
template <std::uint16_t __vec_size>
848+
template <std::uint8_t __vec_size>
849849
struct __vector_load
850850
{
851-
static_assert(__vec_size <= 4);
851+
static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported");
852852
std::size_t __n;
853853
template <typename _IdxType, typename _LoadOp, typename... _Acc>
854854
void
855855
operator()(std::true_type, _IdxType __start_idx, _LoadOp __load_op, _Acc... __acc) const
856856
{
857857
_ONEDPL_PRAGMA_UNROLL
858-
for (std::uint16_t __i = 0; __i < __vec_size; ++__i)
858+
for (std::uint8_t __i = 0; __i < __vec_size; ++__i)
859859
__load_op(__start_idx + __i, __i, __acc...);
860860
}
861861

862862
template <typename _IdxType, typename _LoadOp, typename... _Acc>
863863
void
864864
operator()(std::false_type, _IdxType __start_idx, _LoadOp __load_op, _Acc... __acc) const
865865
{
866-
std::uint16_t __elements = std::min(__vec_size, decltype(__vec_size)(__n - __start_idx));
867-
for (std::uint16_t __i = 0; __i < __elements; ++__i)
866+
std::uint8_t __elements = std::min(std::size_t{__vec_size}, std::size_t{__n - __start_idx});
867+
for (std::uint8_t __i = 0; __i < __elements; ++__i)
868868
__load_op(__start_idx + __i, __i, __acc...);
869869
}
870870
};
@@ -891,20 +891,19 @@ struct __lazy_store_transform_op
891891
}
892892
};
893893

894-
template <std::uint16_t __vec_size>
894+
template <std::uint8_t __vec_size>
895895
struct __vector_walk
896896
{
897-
static_assert(__vec_size <= 4);
897+
static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported");
898898
std::size_t __n;
899899

900900
template <typename _IdxType, typename _WalkFunction, typename... _Rngs>
901901
void
902902
operator()(std::true_type, _IdxType __idx, _WalkFunction __f, _Rngs&&... __rngs) const
903903
{
904904
_ONEDPL_PRAGMA_UNROLL
905-
for (std::uint16_t __i = 0; __i < __vec_size; ++__i)
905+
for (std::uint8_t __i = 0; __i < __vec_size; ++__i)
906906
{
907-
908907
__f(__rngs[__idx + __i]...);
909908
}
910909
}
@@ -914,61 +913,63 @@ struct __vector_walk
914913
void
915914
operator()(std::false_type, _IdxType __idx, _WalkFunction __f, _Rngs&&... __rngs) const
916915
{
917-
std::uint16_t __elements = std::min(__vec_size, decltype(__vec_size)(__n - __idx));
918-
for (std::uint16_t __i = 0; __i < __elements; ++__i)
916+
std::uint8_t __elements = std::min(std::size_t{__vec_size}, std::size_t{__n - __idx});
917+
for (std::uint8_t __i = 0; __i < __elements; ++__i)
919918
{
920919
__f(__rngs[__idx + __i]...);
921920
}
922921
}
923922
};
924923

925-
template <std::uint16_t __vec_size>
924+
template <std::uint8_t __vec_size>
926925
struct __vector_store
927926
{
927+
static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported");
928928
std::size_t __n;
929-
static_assert(__vec_size <= 4);
930-
template <typename _IdxType, typename _StoreOp, typename... _Acc>
929+
930+
template <typename _IdxType, typename _StoreOp, typename... _Rngs>
931931
void
932-
operator()(std::true_type, _IdxType __start_idx, _StoreOp __store_op, _Acc... __acc) const
932+
operator()(std::true_type, _IdxType __start_idx, _StoreOp __store_op, _Rngs... __rngs) const
933933
{
934934
_ONEDPL_PRAGMA_UNROLL
935-
for (std::uint16_t __i = 0; __i < __vec_size; ++__i)
936-
__store_op(__i, __start_idx + __i, __acc...);
935+
for (std::uint8_t __i = 0; __i < __vec_size; ++__i)
936+
__store_op(__i, __start_idx + __i, __rngs...);
937937
}
938-
template <typename _IdxType, typename _StoreOp, typename... _Acc>
938+
template <typename _IdxType, typename _StoreOp, typename... _Rngs>
939939
void
940-
operator()(std::false_type, _IdxType __start_idx, _StoreOp __store_op, _Acc... __acc) const
940+
operator()(std::false_type, _IdxType __start_idx, _StoreOp __store_op, _Rngs... __rngs) const
941941
{
942-
std::uint16_t __elements = std::min(__vec_size, decltype(__vec_size)(__n - __start_idx));
943-
for (std::uint16_t __i = 0; __i < __elements; ++__i)
944-
__store_op(__i, __start_idx + __i, __acc...);
942+
std::uint8_t __elements = std::min(std::size_t{__vec_size}, std::size_t{__n - __start_idx});
943+
for (std::uint8_t __i = 0; __i < __elements; ++__i)
944+
__store_op(__i, __start_idx + __i, __rngs...);
945945
}
946946
};
947947

948-
template <std::uint16_t __vec_size>
948+
template <std::uint8_t __vec_size>
949949
struct __vector_reverse
950950
{
951+
static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported");
951952
template <typename _IsFull, typename _Idx, typename _Array>
952953
void
953954
operator()(_IsFull __is_full, const _Idx __elements_to_process, _Array __array) const
954955
{
955956
if constexpr (__is_full)
956957
{
957958
_ONEDPL_PRAGMA_UNROLL
958-
for (std::uint16_t __i = 0; __i != __vec_size / 2; ++__i)
959+
for (std::uint8_t __i = 0; __i < __vec_size / 2; ++__i)
959960
std::swap(__array[__i].__v, __array[__vec_size - __i - 1].__v);
960961
}
961962
else
962963
{
963-
for (std::uint16_t __i = 0; __i != __elements_to_process / 2; ++__i)
964+
for (std::uint8_t __i = 0; __i < __elements_to_process / 2; ++__i)
964965
std::swap(__array[__i].__v, __array[__elements_to_process - __i - 1].__v);
965966
}
966967
}
967968
};
968969

969970
// Processes a loop with a given stride. Intended to be used with sub-group / work-group strides for good memory access patterns
970971
// (potentially with vectorization)
971-
template <std::uint16_t __num_strides>
972+
template <std::uint8_t __num_strides>
972973
struct __strided_loop
973974
{
974975
std::size_t __n;
@@ -978,7 +979,7 @@ struct __strided_loop
978979
_Ranges&&... __rngs) const
979980
{
980981
_ONEDPL_PRAGMA_UNROLL
981-
for (std::uint16_t __i = 0; __i < __num_strides; ++__i)
982+
for (std::uint8_t __i = 0; __i < __num_strides; ++__i)
982983
{
983984
__loop_body_op(std::true_type{}, __idx, __rngs...);
984985
__idx += __stride;
@@ -992,7 +993,7 @@ struct __strided_loop
992993
// Constrain the number of iterations as much as possible and then pass the knowledge that we are not a full loop to the body operation
993994
const std::uint8_t __adjusted_iters_per_work_item =
994995
oneapi::dpl::__internal::__dpl_ceiling_div(__n - __idx, __stride);
995-
for (std::uint16_t __i = 0; __i < __adjusted_iters_per_work_item; ++__i)
996+
for (std::uint8_t __i = 0; __i < __adjusted_iters_per_work_item; ++__i)
996997
{
997998
__loop_body_op(std::false_type{}, __idx, __rngs...);
998999
__idx += __stride;

0 commit comments

Comments
 (0)