Skip to content

Commit 57b16c2

Browse files
wmaxey and miscco authored
[CUB] Replace several direct uses of __clz (#6099)
* Replace `__clz` in `warp_scan_shfl.cuh`.
* Replace `__clz` in `block_radix_rank.cuh`
* Replace `__clz` in `warp_reduce_shfl.cuh`
* Replace `__clz` in `warp_reduce_smem.cuh`
* Replace thrust's `clz` with `cuda::std::countl`
* Fully qualify with `::cuda`
* Fixup types or copy paste mistakes
* Address review comments, `countr_zero` instead of `countl(brev())`
* Use __bit_log2 for warp ballot index.
* Use `__bit_log2` for block leader in ComputeRanksItem
* Ensure that we static cast in `__clz` to int in case we deal with ARM
* Rename variable to not conflict with builtin
* Use `__bit_log2`
* Fix incorrect transformation
* Drop internal `clz` function in favor of `countl_zero`
* Drop unneeded include
* Fix return type of `__ballot_sync` to unsigned
* fix typo
* Be super safe about unsigned integers
* Fix argument type in radix_rank

---------

Co-authored-by: Michael Schellenberger Costa <[email protected]>
1 parent 289bdcb commit 57b16c2

File tree

8 files changed

+35
-42
lines changed

8 files changed

+35
-42
lines changed

cub/cub/block/block_radix_rank.cuh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050

5151
#include <cuda/__ptx/instructions/get_sreg.h>
5252
#include <cuda/std/__algorithm/max.h>
53+
#include <cuda/std/__bit/integral.h>
5354
#include <cuda/std/__functional/operations.h>
5455
#include <cuda/std/__type_traits/conditional.h>
5556
#include <cuda/std/__type_traits/is_same.h>
@@ -1072,7 +1073,7 @@ struct BlockRadixRankMatchEarlyCounts
10721073
atomicOr(p_match_mask, lane_mask);
10731074
__syncwarp(WARP_MASK);
10741075
int bin_mask = *p_match_mask;
1075-
int leader = (WARP_THREADS - 1) - __clz(bin_mask);
1076+
int leader = ::cuda::std::__bit_log2(static_cast<unsigned>(bin_mask));
10761077
int warp_offset = 0;
10771078
int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
10781079
if (lane == leader)
@@ -1102,7 +1103,7 @@ struct BlockRadixRankMatchEarlyCounts
11021103
::cuda::std::uint32_t bin = Digit(keys[u]);
11031104
int bin_mask =
11041105
detail::warp_in_block_matcher_t<RADIX_BITS, PARTIAL_WARP_THREADS, BLOCK_WARPS - 1>::match_any(bin, warp);
1105-
int leader = (WARP_THREADS - 1) - __clz(bin_mask);
1106+
int leader = ::cuda::std::__bit_log2(static_cast<unsigned>(bin_mask));
11061107
int warp_offset = 0;
11071108
int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
11081109
if (lane == leader)

cub/cub/warp/specializations/warp_reduce_shfl.cuh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
#include <cuda/__functional/maximum.h>
5252
#include <cuda/__functional/minimum.h>
5353
#include <cuda/__ptx/instructions/get_sreg.h>
54+
#include <cuda/std/__bit/countr.h>
5455
#include <cuda/std/__functional/operations.h>
5556
#include <cuda/std/__type_traits/enable_if.h>
5657
#include <cuda/std/__type_traits/integral_constant.h>
@@ -701,7 +702,7 @@ struct WarpReduceShfl
701702
_CCCL_DEVICE _CCCL_FORCEINLINE T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op)
702703
{
703704
// Get the start flags for each thread in the warp.
704-
int warp_flags = __ballot_sync(member_mask, flag);
705+
unsigned warp_flags = __ballot_sync(member_mask, flag);
705706

706707
// Convert to tail-segmented
707708
if (HEAD_SEGMENTED)
@@ -722,7 +723,7 @@ struct WarpReduceShfl
722723
warp_flags |= 1u << (LOGICAL_WARP_THREADS - 1);
723724

724725
// Find the next set flag
725-
int last_lane = __clz(__brev(warp_flags));
726+
int last_lane = ::cuda::std::countr_zero(warp_flags);
726727

727728
T output = input;
728729
// Template-iterate reduction steps

cub/cub/warp/specializations/warp_reduce_smem.cuh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
#include <cub/util_type.cuh>
5050

5151
#include <cuda/__ptx/instructions/get_sreg.h>
52+
#include <cuda/std/__bit/countr.h>
5253
#include <cuda/std/__type_traits/integral_constant.h>
5354

5455
CUB_NAMESPACE_BEGIN
@@ -215,7 +216,7 @@ struct WarpReduceSmem
215216
SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, ::cuda::std::true_type /*has_ballot*/)
216217
{
217218
// Get the start flags for each thread in the warp.
218-
int warp_flags = __ballot_sync(member_mask, flag);
219+
unsigned warp_flags = __ballot_sync(member_mask, flag);
219220

220221
if (!HEAD_SEGMENTED)
221222
{
@@ -232,7 +233,7 @@ struct WarpReduceSmem
232233
}
233234

234235
// Find next flag
235-
int next_flag = __clz(__brev(warp_flags));
236+
int next_flag = ::cuda::std::countr_zero(warp_flags);
236237

237238
// Clip the next segment at the warp boundary if necessary
238239
if (LOGICAL_WARP_THREADS != 32)

cub/cub/warp/specializations/warp_scan_shfl.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@
5050

5151
#include <cuda/__ptx/instructions/get_sreg.h>
5252
#include <cuda/std/__algorithm/clamp.h>
53-
#include <cuda/std/__algorithm/max.h>
5453
#include <cuda/std/__bit/has_single_bit.h>
54+
#include <cuda/std/__bit/integral.h>
5555
#include <cuda/std/__functional/operations.h>
5656
#include <cuda/std/__type_traits/integral_constant.h>
5757
#include <cuda/std/__type_traits/is_integral.h>
@@ -630,7 +630,7 @@ struct WarpScanShfl
630630
ballot = ballot & ::cuda::ptx::get_sreg_lanemask_le();
631631

632632
// Find index of first set bit
633-
int segment_first_lane = ::cuda::std::max(0, 31 - __clz(ballot));
633+
int segment_first_lane = ::cuda::std::__bit_log2(ballot);
634634

635635
// Iterate scan steps
636636
_CCCL_PRAGMA_UNROLL_FULL()

libcudacxx/include/cuda/std/__bit/countl.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,14 @@ template <typename _Tp>
100100
template <typename _Tp>
101101
[[nodiscard]] _CCCL_HIDE_FROM_ABI _CCCL_DEVICE int __cccl_countl_zero_impl_device(_Tp __v) noexcept
102102
{
103-
return (sizeof(_Tp) == sizeof(uint32_t)) ? ::__clz(static_cast<int>(__v)) : ::__clzll(static_cast<long long>(__v));
103+
if constexpr (sizeof(_Tp) == sizeof(uint32_t))
104+
{
105+
return static_cast<int>(::__clz(static_cast<int>(__v)));
106+
}
107+
else
108+
{
109+
return static_cast<int>(::__clzll(static_cast<long long>(__v)));
110+
}
104111
}
105112
#endif // _CCCL_CUDA_COMPILATION()
106113

libcudacxx/include/cuda/std/__bit/countr.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,11 +114,11 @@ template <typename _Tp>
114114
{
115115
if constexpr (sizeof(_Tp) == sizeof(uint32_t))
116116
{
117-
return ::__clz(static_cast<int>(::__brev(__v)));
117+
return static_cast<int>(::__clz(static_cast<int>(::__brev(__v))));
118118
}
119119
else
120120
{
121-
return ::__clzll(static_cast<long long>(::__brevll(__v)));
121+
return static_cast<int>(::__clzll(static_cast<long long>(::__brevll(__v))));
122122
}
123123
}
124124
#endif // _CCCL_CUDA_COMPILATION()

libcudacxx/include/cuda/std/__bit/reference.h

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -275,10 +275,10 @@ _CCCL_API constexpr __bit_iterator<_Cp, false> __copy_aligned(
275275
// do first word
276276
if (__first.__ctz_ != 0)
277277
{
278-
unsigned __clz = __bits_per_word - __first.__ctz_;
279-
difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz), __n);
278+
unsigned __clz_f = __bits_per_word - __first.__ctz_;
279+
difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
280280
__n -= __dn;
281-
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
281+
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
282282
__storage_type __b = *__first.__seg_ & __m;
283283
*__result.__seg_ &= ~__m;
284284
*__result.__seg_ |= __b;
@@ -420,8 +420,8 @@ _CCCL_API constexpr __bit_iterator<_Cp, false> __copy_backward_aligned(
420420
{
421421
difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__last.__ctz_), __n);
422422
__n -= __dn;
423-
unsigned __clz = __bits_per_word - __last.__ctz_;
424-
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz);
423+
unsigned __clz_f = __bits_per_word - __last.__ctz_;
424+
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_f);
425425
__storage_type __b = *__last.__seg_ & __m;
426426
*__result.__seg_ &= ~__m;
427427
*__result.__seg_ |= __b;
@@ -635,10 +635,10 @@ _CCCL_API inline __bit_iterator<_Cr, false> __swap_ranges_aligned(
635635
// do first word
636636
if (__first.__ctz_ != 0)
637637
{
638-
unsigned __clz = __bits_per_word - __first.__ctz_;
639-
difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz), __n);
638+
unsigned __clz_f = __bits_per_word - __first.__ctz_;
639+
difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
640640
__n -= __dn;
641-
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
641+
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
642642
__storage_type __b1 = *__first.__seg_ & __m;
643643
*__first.__seg_ &= ~__m;
644644
__storage_type __b2 = *__result.__seg_ & __m;
@@ -988,10 +988,10 @@ _CCCL_API constexpr bool __equal_aligned(
988988
// do first word
989989
if (__first1.__ctz_ != 0)
990990
{
991-
unsigned __clz = __bits_per_word - __first1.__ctz_;
992-
difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz), __n);
991+
unsigned __clz_f = __bits_per_word - __first1.__ctz_;
992+
difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
993993
__n -= __dn;
994-
__storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
994+
__storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
995995
if ((*__first2.__seg_ & __m) != (*__first1.__seg_ & __m))
996996
{
997997
return false;

thrust/thrust/detail/integer_math.h

Lines changed: 3 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727
#endif // no system header
2828
#include <thrust/detail/type_deduction.h>
2929

30+
#include <cuda/std/__bit/countl.h>
31+
#include <cuda/std/__type_traits/make_unsigned.h>
3032
#include <cuda/std/limits>
3133
#include <cuda/std/type_traits>
3234

@@ -36,25 +38,6 @@ THRUST_NAMESPACE_BEGIN
3638
namespace detail
3739
{
3840

39-
template <typename Integer>
40-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE Integer clz(Integer x)
41-
{
42-
Integer result;
43-
44-
NV_IF_TARGET(NV_IS_DEVICE,
45-
(result = ::__clz(x);),
46-
(int num_bits = 8 * sizeof(Integer); int num_bits_minus_one = num_bits - 1; result = num_bits;
47-
for (int i = num_bits_minus_one; i >= 0; --i) {
48-
if ((Integer(1) << i) & x)
49-
{
50-
result = num_bits_minus_one - i;
51-
break;
52-
}
53-
}));
54-
55-
return result;
56-
}
57-
5841
template <typename Integer>
5942
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool is_power_of_2(Integer x)
6043
{
@@ -85,7 +68,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE Integer log2(Integer x)
8568
Integer num_bits = 8 * sizeof(Integer);
8669
Integer num_bits_minus_one = num_bits - 1;
8770

88-
return num_bits_minus_one - clz(x);
71+
return num_bits_minus_one - ::cuda::std::countl_zero(::cuda::std::__to_unsigned_like(x));
8972
}
9073

9174
template <typename Integer>

0 commit comments

Comments
 (0)