@@ -1057,12 +1057,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen
// work-group implementation requires a fundamental type which must also be trivially copyable.
if constexpr (std::is_trivially_copyable_v<_Type>)
{
-    bool __use_reduce_then_scan =
-#if _ONEDPL_COMPILE_KERNEL
-        oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec);
-#else
-        false;
-#endif
+    bool __use_reduce_then_scan = oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec);

    // TODO: Consider re-implementing single group scan to support types without known identities. This could also
    // allow us to use single wg scan for the last block of reduce-then-scan if it is sufficiently small.
@@ -1081,27 +1076,19 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen
            std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{});
    }
}
-#if _ONEDPL_COMPILE_KERNEL
if (__use_reduce_then_scan)
{
    using _GenInput = oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation>;
    using _ScanInputTransform = oneapi::dpl::__internal::__no_op;
    using _WriteOp = oneapi::dpl::__par_backend_hetero::__simple_write_to_id;

    _GenInput __gen_transform{__unary_op};
-    try
-    {
-        return __parallel_transform_reduce_then_scan(__backend_tag, __exec, __in_rng, __out_rng,
-                                                     __gen_transform, __binary_op, __gen_transform,
-                                                     _ScanInputTransform{}, _WriteOp{}, __init, _Inclusive{},
-                                                     /*_IsUniquePattern=*/std::false_type{});
-    }
-    catch (const sycl::exception& __e)
-    {
-        __bypass_sycl_kernel_not_supported(__e);
-    }
+
+    return __parallel_transform_reduce_then_scan(
+        __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
+        std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{},
+        _WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{});
}
-#endif
}

// else use multi pass scan implementation
@@ -1176,7 +1163,6 @@ struct __invoke_single_group_copy_if
    }
};

-#if _ONEDPL_COMPILE_KERNEL
template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Size, typename _GenMask,
          typename _WriteOp, typename _IsUniquePattern>
auto
@@ -1195,7 +1181,6 @@ __parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag _
        _ScanInputTransform{}, __write_op, oneapi::dpl::unseq_backend::__no_init_value<_Size>{},
        /*_Inclusive=*/std::true_type{}, __is_unique_pattern);
}
-#endif

template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Size, typename _CreateMaskOp,
          typename _CopyByMaskOp>
@@ -1248,36 +1233,32 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t
    // can simply copy the input range to the output.
    assert(__n > 1);

-#if _ONEDPL_COMPILE_KERNEL
-    if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
+    if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec))
    {
        using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>;
        using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<1, _Assign>;
-        try
-        {
-            return __parallel_reduce_then_scan_copy(__backend_tag, __exec, __rng, __result, __n, _GenMask{__pred},
-                                                    _WriteOp{_Assign{}},
-                                                    /*_IsUniquePattern=*/std::true_type{});
-        }
-        catch (const sycl::exception& __e)
-        {
-            __bypass_sycl_kernel_not_supported(__e);
-        }
+
+        return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
+                                                std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n,
+                                                _GenMask{__pred}, _WriteOp{_Assign{}},
+                                                /*_IsUniquePattern=*/std::true_type{});
+    }
+    else
+    {
+
+        using _ReduceOp = std::plus<decltype(__n)>;
+        using _CreateOp =
+            oneapi::dpl::__internal::__create_mask_unique_copy<oneapi::dpl::__internal::__not_pred<_BinaryPredicate>,
+                                                               decltype(__n)>;
+        using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>;
+
+        return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
+                                    std::forward<_Range2>(__result), __n,
+                                    _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}},
+                                    _CopyOp{_ReduceOp{}, _Assign{}});
    }
-#endif
-    using _ReduceOp = std::plus<decltype(__n)>;
-    using _CreateOp =
-        oneapi::dpl::__internal::__create_mask_unique_copy<oneapi::dpl::__internal::__not_pred<_BinaryPredicate>,
-                                                           decltype(__n)>;
-    using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>;
-
-    return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
-                                std::forward<_Range2>(__result), __n,
-                                _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}},
-                                _CopyOp{_ReduceOp{}, _Assign{}});
}

-#if _ONEDPL_COMPILE_KERNEL
template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Range4,
          typename _BinaryPredicate, typename _BinaryOperator>
auto
@@ -1302,45 +1283,39 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_
    assert(__n > 1);
    return __parallel_transform_reduce_then_scan(
        __backend_tag, std::forward<_ExecutionPolicy>(__exec),
-        oneapi::dpl::__ranges::zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)),
-        oneapi::dpl::__ranges::zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)),
+        oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)),
+        oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)),
        _GenReduceInput{__binary_pred}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred, __n},
        _ScanInputTransform{}, _WriteOp{__binary_pred, __n},
        oneapi::dpl::unseq_backend::__no_init_value<oneapi::dpl::__internal::tuple<std::size_t, _ValueType>>{},
        /*Inclusive*/ std::true_type{}, /*_IsUniquePattern=*/std::false_type{});
}
-#endif

template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _UnaryPredicate>
auto
__parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
                          _Range1&& __rng, _Range2&& __result, _UnaryPredicate __pred)
{
    oneapi::dpl::__internal::__difference_t<_Range1> __n = __rng.size();
-#if _ONEDPL_COMPILE_KERNEL
-    if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
+    if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec))
    {
        using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>;
        using _WriteOp =
            oneapi::dpl::__par_backend_hetero::__write_to_id_if_else<oneapi::dpl::__internal::__pstl_assign>;
-        try
-        {
-            return __parallel_reduce_then_scan_copy(__backend_tag, __exec, __rng, __result, __n, _GenMask{__pred},
-                                                    _WriteOp{},
-                                                    /*_IsUniquePattern=*/std::false_type{});
-        }
-        catch (const sycl::exception& __e)
-        {
-            __bypass_sycl_kernel_not_supported(__e);
-        }
+
+        return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
+                                                std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n,
+                                                _GenMask{__pred}, _WriteOp{}, /*_IsUniquePattern=*/std::false_type{});
    }
-#endif
-    using _ReduceOp = std::plus<decltype(__n)>;
-    using _CreateOp = unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)>;
-    using _CopyOp = unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type>;
+    else
+    {
+        using _ReduceOp = std::plus<decltype(__n)>;
+        using _CreateOp = unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)>;
+        using _CopyOp = unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type>;

-    return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
-                                std::forward<_Range2>(__result), __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}});
+        return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
+                                    std::forward<_Range2>(__result), __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}});
+    }
}

template <typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Size, typename _Pred,
@@ -1371,37 +1346,32 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
        using _SizeBreakpoints = std::integer_sequence<std::uint16_t, 16, 32, 64, 128, 256, 512, 1024, 2048>;

        return __par_backend_hetero::__static_monotonic_dispatcher<_SizeBreakpoints>::__dispatch(
-            _SingleGroupInvoker{}, __n, __exec, __n, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng),
-            __pred, __assign);
+            _SingleGroupInvoker{}, __n, std::forward<_ExecutionPolicy>(__exec), __n, std::forward<_InRng>(__in_rng),
+            std::forward<_OutRng>(__out_rng), __pred, __assign);
    }
-#if _ONEDPL_COMPILE_KERNEL
-    else if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
+    else if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec))
    {
        using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_Pred>;
        using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<0, _Assign>;
-        try
-        {
-            return __parallel_reduce_then_scan_copy(__backend_tag, __exec, __in_rng, __out_rng, __n, _GenMask{__pred},
-                                                    _WriteOp{__assign},
-                                                    /*_IsUniquePattern=*/std::false_type{});
-        }
-        catch (const sycl::exception& __e)
-        {
-            __bypass_sycl_kernel_not_supported(__e);
-        }
-    }
-#endif
-    using _ReduceOp = std::plus<_Size>;
-    using _CreateOp = unseq_backend::__create_mask<_Pred, _Size>;
-    using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign,
-                                                  /*inclusive*/ std::true_type, 1>;

-    return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng),
-                                std::forward<_OutRng>(__out_rng), __n, _CreateOp{__pred},
-                                _CopyOp{_ReduceOp{}, __assign});
+        return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
+                                                std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n,
+                                                _GenMask{__pred}, _WriteOp{__assign},
+                                                /*_IsUniquePattern=*/std::false_type{});
+    }
+    else
+    {
+        using _ReduceOp = std::plus<_Size>;
+        using _CreateOp = unseq_backend::__create_mask<_Pred, _Size>;
+        using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign,
+                                                      /*inclusive*/ std::true_type, 1>;
+
+        return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
+                                    std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n,
+                                    _CreateOp{__pred}, _CopyOp{_ReduceOp{}, __assign});
+    }
}

-#if _ONEDPL_COMPILE_KERNEL
template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare,
          typename _IsOpDifference>
auto
@@ -1427,7 +1397,7 @@ __parallel_set_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __

    return __parallel_transform_reduce_then_scan(
        __backend_tag, std::forward<_ExecutionPolicy>(__exec),
-        oneapi::dpl::__ranges::zip_view(
+        oneapi::dpl::__ranges::make_zip_view(
            std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
            oneapi::dpl::__ranges::all_view<std::int32_t, __par_backend_hetero::access_mode::read_write>(
                __mask_buf.get_buffer())),
@@ -1436,7 +1406,6 @@ __parallel_set_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __
        _ScanInputTransform{}, _WriteOp{}, oneapi::dpl::unseq_backend::__no_init_value<_Size>{},
        /*_Inclusive=*/std::true_type{}, /*__is_unique_pattern=*/std::false_type{});
}
-#endif

template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare,
          typename _IsOpDifference>
@@ -1495,23 +1464,18 @@ __parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _
                  _Range1&& __rng1, _Range2&& __rng2, _Range3&& __result, _Compare __comp,
                  _IsOpDifference __is_op_difference)
{
-#if _ONEDPL_COMPILE_KERNEL
-    if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
+    if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec))
    {
-        try
-        {
-            return __parallel_set_reduce_then_scan(__backend_tag, __exec, __rng1, __rng2, __result, __comp,
-                                                   __is_op_difference);
-        }
-        catch (const sycl::exception& __e)
-        {
-            __bypass_sycl_kernel_not_supported(__e);
-        }
+        return __parallel_set_reduce_then_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
+                                               std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
+                                               std::forward<_Range3>(__result), __comp, __is_op_difference);
+    }
+    else
+    {
+        return __parallel_set_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1),
+                                   std::forward<_Range2>(__rng2), std::forward<_Range3>(__result), __comp,
+                                   __is_op_difference);
    }
-#endif
-    return __parallel_set_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1),
-                               std::forward<_Range2>(__rng2), std::forward<_Range3>(__result), __comp,
-                               __is_op_difference);
}

// ------------------------------------------------------------------------
@@ -2443,24 +2407,18 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _Exe

    using __val_type = oneapi::dpl::__internal::__value_t<_Range2>;
    // Prior to icpx 2025.0, the reduce-then-scan path performs poorly and should be avoided.
-#if (!defined(__INTEL_LLVM_COMPILER) || __INTEL_LLVM_COMPILER >= 20250000) && _ONEDPL_COMPILE_KERNEL
+#if !defined(__INTEL_LLVM_COMPILER) || __INTEL_LLVM_COMPILER >= 20250000
    if constexpr (std::is_trivially_copyable_v<__val_type>)
    {
-        if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
+        if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec))
        {
-            try
-            {
-                auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan(
-                    oneapi::dpl::__internal::__device_backend_tag{}, __exec, __keys, __values, __out_keys, __out_values,
-                    __binary_pred, __binary_op);
-                // Because our init type ends up being tuple<std::size_t, ValType>, return the first component which is the write index. Add 1 to return the
-                // past-the-end iterator pair of segmented reduction.
-                return std::get<0>(__res.get()) + 1;
-            }
-            catch (const sycl::exception& __e)
-            {
-                __bypass_sycl_kernel_not_supported(__e);
-            }
+            auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan(
+                oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec),
+                std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys),
+                std::forward<_Range4>(__out_values), __binary_pred, __binary_op);
+            // Because our init type ends up being tuple<std::size_t, ValType>, return the first component which is the write index. Add 1 to return the
+            // past-the-end iterator pair of segmented reduction.
+            return std::get<0>(__res.get()) + 1;
        }
    }
#endif
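
The recurring change in every hunk above is the same: the per-algorithm #if _ONEDPL_COMPILE_KERNEL guard and the try/catch fallback around the reduce-then-scan kernel submission are replaced by a single runtime device query, __is_gpu_with_reduce_then_scan_sg_sz(__exec), which selects between the reduce-then-scan path and the multi-pass scan fallback. The helper's definition is not part of this diff; the standalone SYCL sketch below only illustrates the kind of check the old name __is_gpu_with_sg_32 suggests (a GPU whose supported sub-group sizes include 32). The function name is_gpu_with_sub_group_32 and the assumed sub-group-size requirement are illustrative, not the library's implementation.

// Minimal, self-contained sketch (assumption: the dispatch condition amounts to
// "device is a GPU and advertises sub-group size 32").
#include <sycl/sycl.hpp>
#include <algorithm>
#include <cstdio>
#include <vector>

bool
is_gpu_with_sub_group_32(const sycl::queue& __q)
{
    const sycl::device __d = __q.get_device();
    if (!__d.is_gpu())
        return false;
    // Query the sub-group sizes the device supports and look for 32.
    const std::vector<std::size_t> __sizes = __d.get_info<sycl::info::device::sub_group_sizes>();
    return std::find(__sizes.begin(), __sizes.end(), std::size_t{32}) != __sizes.end();
}

int
main()
{
    sycl::queue __q; // default-selected device
    std::printf("reduce-then-scan eligible: %s\n", is_gpu_with_sub_group_32(__q) ? "yes" : "no");
}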