Skip to content

Commit f9a8f25

Browse files
committed
Revert "Work around crashes and incorrect results in scan-based algorithms when compiling with -O0 (#1997)"
This reverts commit 0140666.
1 parent a045eac commit f9a8f25

File tree

4 files changed

+104
-182
lines changed

4 files changed

+104
-182
lines changed

include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h

+76-117
Original file line numberDiff line numberDiff line change
@@ -1057,12 +1057,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen
10571057
// work-group implementation requires a fundamental type which must also be trivially copyable.
10581058
if constexpr (std::is_trivially_copyable_v<_Type>)
10591059
{
1060-
bool __use_reduce_then_scan =
1061-
#if _ONEDPL_COMPILE_KERNEL
1062-
oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec);
1063-
#else
1064-
false;
1065-
#endif
1060+
bool __use_reduce_then_scan = oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec);
10661061

10671062
// TODO: Consider re-implementing single group scan to support types without known identities. This could also
10681063
// allow us to use single wg scan for the last block of reduce-then-scan if it is sufficiently small.
@@ -1081,27 +1076,19 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen
10811076
std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{});
10821077
}
10831078
}
1084-
#if _ONEDPL_COMPILE_KERNEL
10851079
if (__use_reduce_then_scan)
10861080
{
10871081
using _GenInput = oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation>;
10881082
using _ScanInputTransform = oneapi::dpl::__internal::__no_op;
10891083
using _WriteOp = oneapi::dpl::__par_backend_hetero::__simple_write_to_id;
10901084

10911085
_GenInput __gen_transform{__unary_op};
1092-
try
1093-
{
1094-
return __parallel_transform_reduce_then_scan(__backend_tag, __exec, __in_rng, __out_rng,
1095-
__gen_transform, __binary_op, __gen_transform,
1096-
_ScanInputTransform{}, _WriteOp{}, __init, _Inclusive{},
1097-
/*_IsUniquePattern=*/std::false_type{});
1098-
}
1099-
catch (const sycl::exception& __e)
1100-
{
1101-
__bypass_sycl_kernel_not_supported(__e);
1102-
}
1086+
1087+
return __parallel_transform_reduce_then_scan(
1088+
__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
1089+
std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{},
1090+
_WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{});
11031091
}
1104-
#endif
11051092
}
11061093

11071094
//else use multi pass scan implementation
@@ -1176,7 +1163,6 @@ struct __invoke_single_group_copy_if
11761163
}
11771164
};
11781165

1179-
#if _ONEDPL_COMPILE_KERNEL
11801166
template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Size, typename _GenMask,
11811167
typename _WriteOp, typename _IsUniquePattern>
11821168
auto
@@ -1195,7 +1181,6 @@ __parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag _
11951181
_ScanInputTransform{}, __write_op, oneapi::dpl::unseq_backend::__no_init_value<_Size>{},
11961182
/*_Inclusive=*/std::true_type{}, __is_unique_pattern);
11971183
}
1198-
#endif
11991184

12001185
template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Size, typename _CreateMaskOp,
12011186
typename _CopyByMaskOp>
@@ -1248,36 +1233,32 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t
12481233
// can simply copy the input range to the output.
12491234
assert(__n > 1);
12501235

1251-
#if _ONEDPL_COMPILE_KERNEL
12521236
if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
12531237
{
12541238
using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>;
12551239
using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<1, _Assign>;
1256-
try
1257-
{
1258-
return __parallel_reduce_then_scan_copy(__backend_tag, __exec, __rng, __result, __n, _GenMask{__pred},
1259-
_WriteOp{_Assign{}},
1260-
/*_IsUniquePattern=*/std::true_type{});
1261-
}
1262-
catch (const sycl::exception& __e)
1263-
{
1264-
__bypass_sycl_kernel_not_supported(__e);
1265-
}
1240+
1241+
return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
1242+
std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n,
1243+
_GenMask{__pred}, _WriteOp{_Assign{}},
1244+
/*_IsUniquePattern=*/std::true_type{});
1245+
}
1246+
else
1247+
{
1248+
1249+
using _ReduceOp = std::plus<decltype(__n)>;
1250+
using _CreateOp =
1251+
oneapi::dpl::__internal::__create_mask_unique_copy<oneapi::dpl::__internal::__not_pred<_BinaryPredicate>,
1252+
decltype(__n)>;
1253+
using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>;
1254+
1255+
return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
1256+
std::forward<_Range2>(__result), __n,
1257+
_CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}},
1258+
_CopyOp{_ReduceOp{}, _Assign{}});
12661259
}
1267-
#endif
1268-
using _ReduceOp = std::plus<decltype(__n)>;
1269-
using _CreateOp =
1270-
oneapi::dpl::__internal::__create_mask_unique_copy<oneapi::dpl::__internal::__not_pred<_BinaryPredicate>,
1271-
decltype(__n)>;
1272-
using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>;
1273-
1274-
return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
1275-
std::forward<_Range2>(__result), __n,
1276-
_CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}},
1277-
_CopyOp{_ReduceOp{}, _Assign{}});
12781260
}
12791261

1280-
#if _ONEDPL_COMPILE_KERNEL
12811262
template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Range4,
12821263
typename _BinaryPredicate, typename _BinaryOperator>
12831264
auto
@@ -1302,45 +1283,39 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_
13021283
assert(__n > 1);
13031284
return __parallel_transform_reduce_then_scan(
13041285
__backend_tag, std::forward<_ExecutionPolicy>(__exec),
1305-
oneapi::dpl::__ranges::zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)),
1306-
oneapi::dpl::__ranges::zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)),
1286+
oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)),
1287+
oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)),
13071288
_GenReduceInput{__binary_pred}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred, __n},
13081289
_ScanInputTransform{}, _WriteOp{__binary_pred, __n},
13091290
oneapi::dpl::unseq_backend::__no_init_value<oneapi::dpl::__internal::tuple<std::size_t, _ValueType>>{},
13101291
/*Inclusive*/ std::true_type{}, /*_IsUniquePattern=*/std::false_type{});
13111292
}
1312-
#endif
13131293

13141294
template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _UnaryPredicate>
13151295
auto
13161296
__parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
13171297
_Range1&& __rng, _Range2&& __result, _UnaryPredicate __pred)
13181298
{
13191299
oneapi::dpl::__internal::__difference_t<_Range1> __n = __rng.size();
1320-
#if _ONEDPL_COMPILE_KERNEL
13211300
if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
13221301
{
13231302
using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>;
13241303
using _WriteOp =
13251304
oneapi::dpl::__par_backend_hetero::__write_to_id_if_else<oneapi::dpl::__internal::__pstl_assign>;
1326-
try
1327-
{
1328-
return __parallel_reduce_then_scan_copy(__backend_tag, __exec, __rng, __result, __n, _GenMask{__pred},
1329-
_WriteOp{},
1330-
/*_IsUniquePattern=*/std::false_type{});
1331-
}
1332-
catch (const sycl::exception& __e)
1333-
{
1334-
__bypass_sycl_kernel_not_supported(__e);
1335-
}
1305+
1306+
return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
1307+
std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n,
1308+
_GenMask{__pred}, _WriteOp{}, /*_IsUniquePattern=*/std::false_type{});
13361309
}
1337-
#endif
1338-
using _ReduceOp = std::plus<decltype(__n)>;
1339-
using _CreateOp = unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)>;
1340-
using _CopyOp = unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type>;
1310+
else
1311+
{
1312+
using _ReduceOp = std::plus<decltype(__n)>;
1313+
using _CreateOp = unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)>;
1314+
using _CopyOp = unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type>;
13411315

1342-
return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
1343-
std::forward<_Range2>(__result), __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}});
1316+
return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
1317+
std::forward<_Range2>(__result), __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}});
1318+
}
13441319
}
13451320

13461321
template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Size, typename _Pred,
@@ -1371,37 +1346,32 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
13711346
using _SizeBreakpoints = std::integer_sequence<std::uint16_t, 16, 32, 64, 128, 256, 512, 1024, 2048>;
13721347

13731348
return __par_backend_hetero::__static_monotonic_dispatcher<_SizeBreakpoints>::__dispatch(
1374-
_SingleGroupInvoker{}, __n, __exec, __n, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng),
1375-
__pred, __assign);
1349+
_SingleGroupInvoker{}, __n, std::forward<_ExecutionPolicy>(__exec), __n, std::forward<_InRng>(__in_rng),
1350+
std::forward<_OutRng>(__out_rng), __pred, __assign);
13761351
}
1377-
#if _ONEDPL_COMPILE_KERNEL
13781352
else if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
13791353
{
13801354
using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_Pred>;
13811355
using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<0, _Assign>;
1382-
try
1383-
{
1384-
return __parallel_reduce_then_scan_copy(__backend_tag, __exec, __in_rng, __out_rng, __n, _GenMask{__pred},
1385-
_WriteOp{__assign},
1386-
/*_IsUniquePattern=*/std::false_type{});
1387-
}
1388-
catch (const sycl::exception& __e)
1389-
{
1390-
__bypass_sycl_kernel_not_supported(__e);
1391-
}
1392-
}
1393-
#endif
1394-
using _ReduceOp = std::plus<_Size>;
1395-
using _CreateOp = unseq_backend::__create_mask<_Pred, _Size>;
1396-
using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign,
1397-
/*inclusive*/ std::true_type, 1>;
13981356

1399-
return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng),
1400-
std::forward<_OutRng>(__out_rng), __n, _CreateOp{__pred},
1401-
_CopyOp{_ReduceOp{}, __assign});
1357+
return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
1358+
std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n,
1359+
_GenMask{__pred}, _WriteOp{__assign},
1360+
/*_IsUniquePattern=*/std::false_type{});
1361+
}
1362+
else
1363+
{
1364+
using _ReduceOp = std::plus<_Size>;
1365+
using _CreateOp = unseq_backend::__create_mask<_Pred, _Size>;
1366+
using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign,
1367+
/*inclusive*/ std::true_type, 1>;
1368+
1369+
return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
1370+
std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n,
1371+
_CreateOp{__pred}, _CopyOp{_ReduceOp{}, __assign});
1372+
}
14021373
}
14031374

1404-
#if _ONEDPL_COMPILE_KERNEL
14051375
template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare,
14061376
typename _IsOpDifference>
14071377
auto
@@ -1427,7 +1397,7 @@ __parallel_set_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __
14271397

14281398
return __parallel_transform_reduce_then_scan(
14291399
__backend_tag, std::forward<_ExecutionPolicy>(__exec),
1430-
oneapi::dpl::__ranges::zip_view(
1400+
oneapi::dpl::__ranges::make_zip_view(
14311401
std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
14321402
oneapi::dpl::__ranges::all_view<std::int32_t, __par_backend_hetero::access_mode::read_write>(
14331403
__mask_buf.get_buffer())),
@@ -1436,7 +1406,6 @@ __parallel_set_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __
14361406
_ScanInputTransform{}, _WriteOp{}, oneapi::dpl::unseq_backend::__no_init_value<_Size>{},
14371407
/*_Inclusive=*/std::true_type{}, /*__is_unique_pattern=*/std::false_type{});
14381408
}
1439-
#endif
14401409

14411410
template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare,
14421411
typename _IsOpDifference>
@@ -1495,23 +1464,18 @@ __parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _
14951464
_Range1&& __rng1, _Range2&& __rng2, _Range3&& __result, _Compare __comp,
14961465
_IsOpDifference __is_op_difference)
14971466
{
1498-
#if _ONEDPL_COMPILE_KERNEL
14991467
if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
15001468
{
1501-
try
1502-
{
1503-
return __parallel_set_reduce_then_scan(__backend_tag, __exec, __rng1, __rng2, __result, __comp,
1504-
__is_op_difference);
1505-
}
1506-
catch (const sycl::exception& __e)
1507-
{
1508-
__bypass_sycl_kernel_not_supported(__e);
1509-
}
1469+
return __parallel_set_reduce_then_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
1470+
std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
1471+
std::forward<_Range3>(__result), __comp, __is_op_difference);
1472+
}
1473+
else
1474+
{
1475+
return __parallel_set_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1),
1476+
std::forward<_Range2>(__rng2), std::forward<_Range3>(__result), __comp,
1477+
__is_op_difference);
15101478
}
1511-
#endif
1512-
return __parallel_set_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1),
1513-
std::forward<_Range2>(__rng2), std::forward<_Range3>(__result), __comp,
1514-
__is_op_difference);
15151479
}
15161480

15171481
//------------------------------------------------------------------------
@@ -2443,24 +2407,19 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _Exe
24432407

24442408
using __val_type = oneapi::dpl::__internal::__value_t<_Range2>;
24452409
// Prior to icpx 2025.0, the reduce-then-scan path performs poorly and should be avoided.
2446-
#if (!defined(__INTEL_LLVM_COMPILER) || __INTEL_LLVM_COMPILER >= 20250000) && _ONEDPL_COMPILE_KERNEL
2410+
#if !defined(__INTEL_LLVM_COMPILER) || __INTEL_LLVM_COMPILER >= 20250000
24472411
if constexpr (std::is_trivially_copyable_v<__val_type>)
24482412
{
24492413
if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
24502414
{
2451-
try
2452-
{
2453-
auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan(
2454-
oneapi::dpl::__internal::__device_backend_tag{}, __exec, __keys, __values, __out_keys, __out_values,
2455-
__binary_pred, __binary_op);
2456-
// Because our init type ends up being tuple<std::size_t, ValType>, return the first component which is the write index. Add 1 to return the
2457-
// past-the-end iterator pair of segmented reduction.
2458-
return std::get<0>(__res.get()) + 1;
2459-
}
2460-
catch (const sycl::exception& __e)
2461-
{
2462-
__bypass_sycl_kernel_not_supported(__e);
2463-
}
2415+
auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan(
2416+
oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec),
2417+
std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys),
2418+
std::forward<_Range4>(__out_values), __binary_pred, __binary_op);
2419+
__res.wait();
2420+
// Because our init type ends up being tuple<std::size_t, ValType>, return the first component which is the write index. Add 1 to return the
2421+
// past-the-end iterator pair of segmented reduction.
2422+
return std::get<0>(__res.get()) + 1;
24642423
}
24652424
}
24662425
#endif

0 commit comments

Comments (0)