Rely on new IGC behavior to workaround issues with -O0 compilation in reduce-then-scan #2088

Draft. Wants to merge 2 commits into base: main. Showing changes from 1 commit.

include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h (12 changes: 6 additions & 6 deletions)

@@ -1057,7 +1057,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen
     // work-group implementation requires a fundamental type which must also be trivially copyable.
     if constexpr (std::is_trivially_copyable_v<_Type>)
     {
-        bool __use_reduce_then_scan = oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec);
+        bool __use_reduce_then_scan = oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec);

         // TODO: Consider re-implementing single group scan to support types without known identities. This could also
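
Editorial aside on the gate above: the work-group implementation requires a trivially copyable type because such types may be duplicated byte-for-byte (e.g. via memcpy). A standalone illustration of the trait with hypothetical types, separate from the library code:

#include <cstring>
#include <type_traits>

struct Plain { int key; double value; };            // no custom copy semantics: trivially copyable
struct Owning { Owning(const Owning&); int* ptr; }; // user-provided copy constructor: not trivially copyable

static_assert(std::is_trivially_copyable_v<Plain>);
static_assert(!std::is_trivially_copyable_v<Owning>);

int main()
{
    Plain a{1, 2.0}, b{};
    std::memcpy(&b, &a, sizeof a); // well-defined precisely because Plain is trivially copyable
    return b.key == 1 ? 0 : 1;
}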
// allow us to use single wg scan for the last block of reduce-then-scan if it is sufficiently small.
Expand Down Expand Up @@ -1233,7 +1233,7 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t
// can simply copy the input range to the output.
assert(__n > 1);

if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec))
if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
{
using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>;
using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<1, _Assign>;

@@ -1297,7 +1297,7 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backen
                           _Range1&& __rng, _Range2&& __result, _UnaryPredicate __pred)
 {
     oneapi::dpl::__internal::__difference_t<_Range1> __n = __rng.size();
-    if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec))
+    if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
     {
         using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>;
         using _WriteOp =

@@ -1349,7 +1349,7 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
             _SingleGroupInvoker{}, __n, std::forward<_ExecutionPolicy>(__exec), __n, std::forward<_InRng>(__in_rng),
             std::forward<_OutRng>(__out_rng), __pred, __assign);
     }
-    else if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec))
+    else if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
     {
         using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_Pred>;
         using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<0, _Assign>;

@@ -1463,7 +1463,7 @@ __parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _
                   _Range1&& __rng1, _Range2&& __rng2, _Range3&& __result, _Compare __comp,
                   _IsOpDifference __is_op_difference)
 {
-    if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec))
+    if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
     {
         return __parallel_set_reduce_then_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
                                                std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),

@@ -2407,7 +2407,7 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _Exe
 #if !defined(__INTEL_LLVM_COMPILER) || __INTEL_LLVM_COMPILER >= 20250000
     if constexpr (std::is_trivially_copyable_v<__val_type>)
     {
-        if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec))
+        if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
         {
             auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan(
                 oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec),
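
Side note: the last hunk above is additionally wrapped in a compiler-version guard, since __INTEL_LLVM_COMPILER encodes the icpx release numerically (20250000 appears to correspond to the 2025.0 release). A minimal sketch of the same guard pattern, with a made-up macro name:

#include <cstdio>

// Take the new path on non-Intel compilers, or on Intel LLVM-based compilers
// that are at least the release encoded as 20250000.
#if !defined(__INTEL_LLVM_COMPILER) || __INTEL_LLVM_COMPILER >= 20250000
#    define HAS_NEW_REDUCE_PATH 1
#else
#    define HAS_NEW_REDUCE_PATH 0
#endif

int main()
{
    std::printf("new reduce-by-segment path enabled: %d\n", HAS_NEW_REDUCE_PATH);
}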

include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h

@@ -726,27 +726,14 @@ struct __parallel_reduce_then_scan_scan_submitter<
     _InitType __init;
 };

-// With optimization enabled, reduce-then-scan requires a sub-group size of 32. Without optimization, we must compile
-// to a sub-group size of 16 to workaround a hardware bug on certain Intel integrated graphics architectures.
-constexpr inline std::uint8_t
-__get_reduce_then_scan_sg_sz()
-{
-#if _ONEDPL_DETECT_COMPILER_OPTIMIZATIONS_ENABLED
-    return 32;
-#else
-    return 16;
-#endif
-}
-
 // Enable reduce-then-scan if the device uses the required sub-group size and is ran on a device
 // with fast coordinated subgroup operations. We do not want to run this scan on CPU targets, as they are not
 // performant with this algorithm.
 template <typename _ExecutionPolicy>
 bool
-__is_gpu_with_reduce_then_scan_sg_sz(const _ExecutionPolicy& __exec)
+__is_gpu_with_sg_32(const _ExecutionPolicy& __exec)
 {
-    const bool __dev_supports_sg_sz =
-        oneapi::dpl::__internal::__supports_sub_group_size(__exec, __get_reduce_then_scan_sg_sz());
+    const bool __dev_supports_sg_sz = oneapi::dpl::__internal::__supports_sub_group_size(__exec, 32);
     return (__exec.queue().get_device().is_gpu() && __dev_supports_sg_sz);
 }
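
For readers outside the codebase: the helper above boils down to asking the SYCL device whether it supports sub-groups of size 32 before dispatching a kernel compiled for that fixed size. A self-contained sketch using standard SYCL 2020 queries (the free function below is a made-up stand-in for the oneDPL-internal __supports_sub_group_size):

#include <sycl/sycl.hpp>
#include <algorithm>
#include <cstddef>
#include <vector>

// Made-up helper: does the device advertise support for the given sub-group size?
bool supports_sub_group_size(const sycl::device& dev, std::size_t sg_size)
{
    const std::vector<std::size_t> sizes = dev.get_info<sycl::info::device::sub_group_sizes>();
    return std::find(sizes.begin(), sizes.end(), sg_size) != sizes.end();
}

int main()
{
    sycl::queue q;
    if (q.get_device().is_gpu() && supports_sub_group_size(q.get_device(), 32))
    {
        // Only now is it safe to launch a kernel that requires sub-groups of exactly 32 work-items.
        q.parallel_for(sycl::nd_range<1>{sycl::range<1>{64}, sycl::range<1>{64}},
                       [=](sycl::nd_item<1>) [[sycl::reqd_sub_group_size(32)]] {})
            .wait();
    }
}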

@@ -777,7 +764,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_
         __reduce_then_scan_scan_kernel<_CustomName>>;
     using _ValueType = typename _InitType::__value_type;

-    constexpr std::uint8_t __sub_group_size = __get_reduce_then_scan_sg_sz();
+    constexpr std::uint8_t __sub_group_size = 32;
     constexpr std::uint8_t __block_size_scale = std::max(std::size_t{1}, sizeof(double) / sizeof(_ValueType));
     // Empirically determined maximum. May be less for non-full blocks.
     constexpr std::uint16_t __max_inputs_per_item = 64 * __block_size_scale;
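
To make the scaling in this hunk concrete: __block_size_scale grows as the value type shrinks, so each work-item covers roughly the same number of bytes regardless of element size. A compile-time sketch reproducing just the two constants from the hunk above:

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Mirrors the arithmetic above: scale the per-item input count so that
// smaller value types process proportionally more elements per work-item.
template <typename ValueType>
constexpr std::uint16_t max_inputs_per_item()
{
    constexpr std::uint8_t scale = std::max(std::size_t{1}, sizeof(double) / sizeof(ValueType));
    return 64 * scale;
}

static_assert(max_inputs_per_item<double>() == 64);         // 8-byte type: scale = 1
static_assert(max_inputs_per_item<float>() == 128);         // 4-byte type: scale = 2
static_assert(max_inputs_per_item<std::uint16_t>() == 256); // 2-byte type: scale = 4

int main() { return 0; }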

include/oneapi/dpl/pstl/onedpl_config.h (9 changes: 0 additions & 9 deletions)

@@ -157,15 +157,6 @@
 #define _ONEDPL_CPP17_EXECUTION_POLICIES_PRESENT \
     (_ONEDPL___cplusplus >= 201703L && (_MSC_VER >= 1912 || (_GLIBCXX_RELEASE >= 9 && __GLIBCXX__ >= 20190503)))

-// In the SYCL backend reduce-then-scan path, we need to be able to differentiate between when a compiler enables
-// optimizations and when it does not. With GCC and clang-based compilers, we can detect this with the __OPTIMIZE__
-// flag.
-#if _ONEDPL_GCC_VERSION > 0 || defined(_ONEDPL_CLANG_VERSION)
-#    define _ONEDPL_DETECT_COMPILER_OPTIMIZATIONS_ENABLED __OPTIMIZE__
-#else
-#    define _ONEDPL_DETECT_COMPILER_OPTIMIZATIONS_ENABLED 0
-#endif
-
 #define _ONEDPL_EARLYEXIT_PRESENT (__INTEL_COMPILER >= 1800)
 #if (defined(_PSTL_PRAGMA_SIMD_EARLYEXIT) && _PSTL_EARLYEXIT_PRESENT)
 #    define _ONEDPL_PRAGMA_SIMD_EARLYEXIT _PSTL_PRAGMA_SIMD_EARLYEXIT
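
Background for the deletion above: __OPTIMIZE__ is predefined by GCC and Clang whenever optimization is enabled (any -O level above -O0), which is all the removed macro relied on. A minimal standalone illustration of the same detection, with our own macro name rather than oneDPL's:

#include <cstdio>

// GCC and Clang predefine __OPTIMIZE__ at -O1 and above (including -Os and -Og), but not at -O0.
#if defined(__OPTIMIZE__)
#    define OPTIMIZATIONS_ENABLED 1
#else
#    define OPTIMIZATIONS_ENABLED 0
#endif

int main()
{
    // Prints 1 when compiled with e.g. -O2 and 0 when compiled with -O0.
    std::printf("optimizations enabled: %d\n", OPTIMIZATIONS_ENABLED);
}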