-
Notifications
You must be signed in to change notification settings - Fork 283
[libcu++] Implement cuda::ffs
#6192
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks a lot, this is already looking really great 🎉
I have two minor comments, but this is missing tests. Please have a look what we do for e.g countr_zero
in libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countr_zero.pass.cpp
If you have any questions, feel free to ask
02c7d56
to
731da14
Compare
Addressed all feedback:
|
libcudacxx/include/cuda/__bit/ffs.h
Outdated
#if _CCCL_COMPILER(GCC) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC) | ||
# define _CCCL_BUILTIN_FFS(...) __builtin_ffs(__VA_ARGS__) | ||
# define _CCCL_BUILTIN_FFSLL(...) __builtin_ffsll(__VA_ARGS__) | ||
#endif // _CCCL_COMPILER(GCC) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC) | ||
|
||
#include <cuda/std/__cccl/prologue.h> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#if _CCCL_COMPILER(GCC) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC) | |
# define _CCCL_BUILTIN_FFS(...) __builtin_ffs(__VA_ARGS__) | |
# define _CCCL_BUILTIN_FFSLL(...) __builtin_ffsll(__VA_ARGS__) | |
#endif // _CCCL_COMPILER(GCC) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC) | |
#include <cuda/std/__cccl/prologue.h> | |
#include <cuda/std/__cccl/prologue.h> | |
#if _CCCL_HAS_BUILTIN(__builtin_ffs) || _CCCL_COMPILER(GCC) | |
# define _CCCL_BUILTIN_FFS(...) __builtin_ffs(__VA_ARGS__) | |
# define _CCCL_BUILTIN_FFSLL(...) __builtin_ffsll(__VA_ARGS__) | |
#endif // _CCCL_HAS_BUILTIN(__builtin_ffs) || _CCCL_COMPILER(GCC) |
Let's put the check inside the prologue/epilogue scope. Also, it should be enough to use the combination of _CCCL_HAS_BUILTIN(__builtin_ffs)
and _CCCL_COMPILER(GCC)
:)
libcudacxx/include/cuda/__bit/ffs.h
Outdated
using _Signed = ::cuda::std::make_signed_t<_Tp>; | ||
if constexpr (sizeof(_Tp) <= sizeof(int)) | ||
{ | ||
return _CCCL_BUILTIN_FFS(static_cast<_Signed>(__v)); | ||
} | ||
else | ||
{ | ||
return _CCCL_BUILTIN_FFSLL(static_cast<_Signed>(__v)); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
using _Signed = ::cuda::std::make_signed_t<_Tp>; | |
if constexpr (sizeof(_Tp) <= sizeof(int)) | |
{ | |
return _CCCL_BUILTIN_FFS(static_cast<_Signed>(__v)); | |
} | |
else | |
{ | |
return _CCCL_BUILTIN_FFSLL(static_cast<_Signed>(__v)); | |
} | |
if constexpr (sizeof(_Tp) <= sizeof(int)) | |
{ | |
return _CCCL_BUILTIN_FFS(static_cast<int>(__v)); | |
} | |
else | |
{ | |
return _CCCL_BUILTIN_FFSLL(static_cast<long long>(__v)); | |
} |
we can simplify it a bit further
libcudacxx/include/cuda/__bit/ffs.h
Outdated
if constexpr (sizeof(_Tp) <= sizeof(int)) | ||
{ | ||
return ::cuda::__ffs_impl(static_cast<uint32_t>(__v)); | ||
} | ||
else | ||
{ | ||
return ::cuda::__ffs_impl(static_cast<uint64_t>(__v)); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is unnecessary I believe, we can just move contents of __ffs_impl
here directly
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We also support 128-bit integers if available, so we should handle them, too. You can just split them into 2x 64-bit parts and handle them separately
libcudacxx/include/cuda/__bit/ffs.h
Outdated
template <typename _Tp> | ||
[[nodiscard]] _CCCL_API constexpr int __ffs_impl(_Tp __v) noexcept | ||
{ | ||
static_assert(::cuda::std::is_same_v<_Tp, uint32_t> || ::cuda::std::is_same_v<_Tp, uint64_t>); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This can be removed
test<unsigned char>(); | ||
test<unsigned short>(); | ||
test<unsigned>(); | ||
test<unsigned long>(); | ||
test<unsigned long long>(); | ||
|
||
test<uint8_t>(); | ||
test<uint16_t>(); | ||
test<uint32_t>(); | ||
test<uint64_t>(); | ||
test<size_t>(); | ||
test<uintmax_t>(); | ||
test<uintptr_t>(); | ||
|
||
#if _CCCL_HAS_INT128() | ||
test<__uint128_t>(); | ||
#endif // _CCCL_HAS_INT128() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
test<unsigned char>(); | |
test<unsigned short>(); | |
test<unsigned>(); | |
test<unsigned long>(); | |
test<unsigned long long>(); | |
test<uint8_t>(); | |
test<uint16_t>(); | |
test<uint32_t>(); | |
test<uint64_t>(); | |
test<size_t>(); | |
test<uintmax_t>(); | |
test<uintptr_t>(); | |
#if _CCCL_HAS_INT128() | |
test<__uint128_t>(); | |
#endif // _CCCL_HAS_INT128() | |
test<unsigned char>(); | |
test<unsigned short>(); | |
test<unsigned>(); | |
test<unsigned long>(); | |
test<unsigned long long>(); | |
#if _CCCL_HAS_INT128() | |
test<__uint128_t>(); | |
#endif // _CCCL_HAS_INT128() |
Those are not necessary, they are just aliases to the standard integer types
template <typename T> | ||
__host__ __device__ constexpr bool test() | ||
{ | ||
static_assert(cuda::ffs(T(0)) == 0); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These tests should use just assert(...)
instead of static_assert(...)
. The values will be checked in constexpr
context when we call static_assert(test()
from main
static_assert(cuda::ffs(T(127)) == 1); | ||
static_assert(cuda::ffs(T(128)) == 8); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd like to see tests of greater values than 128
. That's also why these tests wouldn't catch that the function won't work for 128-bit integers.
For 128-bit integer literals, you can include #include "literal.h" and use
_u128user defined literals from
test_integer_literals` namespace
static_assert(cuda::ffs(T(0)) == 0); | ||
static_assert(cuda::ffs(T(1)) == 1); | ||
static_assert(cuda::ffs(T(2)) == 2); | ||
static_assert(cuda::ffs(T(3)) == 1); | ||
static_assert(cuda::ffs(T(4)) == 3); | ||
static_assert(cuda::ffs(T(5)) == 1); | ||
static_assert(cuda::ffs(T(6)) == 2); | ||
static_assert(cuda::ffs(T(7)) == 1); | ||
static_assert(cuda::ffs(T(8)) == 4); | ||
static_assert(cuda::ffs(T(9)) == 1); | ||
static_assert(cuda::ffs(T(127)) == 1); | ||
static_assert(cuda::ffs(T(128)) == 8); | ||
|
||
// Test relationship with countr_zero: ffs(x) == countr_zero(x) + 1 for x != 0 | ||
static_assert(cuda::ffs(T(1)) == cuda::std::countr_zero(T(1)) + 1); | ||
static_assert(cuda::ffs(T(2)) == cuda::std::countr_zero(T(2)) + 1); | ||
static_assert(cuda::ffs(T(4)) == cuda::std::countr_zero(T(4)) + 1); | ||
static_assert(cuda::ffs(T(8)) == cuda::std::countr_zero(T(8)) + 1); | ||
|
||
// Test return type | ||
static_assert(cuda::std::is_same_v<int, decltype(cuda::ffs(T(0)))>); | ||
static_assert(noexcept(cuda::ffs(T(0)))); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Important, this will only test at compile time, we want all those static_assert
to be plain assert
At compile time that would trigger a compile error but do the right thing at runtime
#include "test_macros.h" | ||
|
||
template <typename T> | ||
__host__ __device__ constexpr bool test() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
NitpicK: This function could just be void
assert(test()); | ||
static_assert(test()); | ||
return 0; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Important, we had issues with the compiler being too clever and optimizing the whole tests out.
to alleviate that we do
assert(test()); | |
static_assert(test()); | |
return 0; | |
volatile int val = 0; | |
assert(test(val)); | |
static_assert(test(0)); | |
return 0; |
And then inside test(const int)
we static cast to the respective type.
Its a pain, but it ensures that the optimizer does not constant fold the function out
79ef976
to
69d084e
Compare
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
thanks a lot for your contribution. When you get a chance, please also add the documentation
libcudacxx/include/cuda/__bit/ffs.h
Outdated
if (!::cuda::std::__cccl_default_is_constant_evaluated()) | ||
{ | ||
NV_IF_ELSE_TARGET(NV_IS_HOST, | ||
(return ::cuda::__ffs_impl_host(static_cast<uint32_t>(__v));), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
static_cast
is not needed for __ffs_impl_host
and __ffs_impl_device
} | ||
else | ||
{ | ||
if (!::cuda::std::__cccl_default_is_constant_evaluated()) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
would be possible to merge this branch with the previous one?
libcudacxx/include/cuda/__bit/ffs.h
Outdated
} | ||
return ::cuda::__ffs_impl_constexpr(__lo); | ||
} | ||
const auto __hi = static_cast<uint64_t>(__v >> 64); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
prefer an unsigned type. Signed shift is undefined behavior (before C++20)
libcudacxx/include/cuda/__bit/ffs.h
Outdated
return ::cuda::__ffs_impl_constexpr(__lo); | ||
} | ||
const auto __hi = static_cast<uint64_t>(__v >> 64); | ||
if (__hi != 0) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this condition is not needed. __ffs_impl_host/device
already return 0 for input 0
libcudacxx/include/cuda/__bit/ffs.h
Outdated
if (!::cuda::std::__cccl_default_is_constant_evaluated()) | ||
{ | ||
NV_IF_ELSE_TARGET( | ||
NV_IS_HOST, (return ::cuda::__ffs_impl_host(__lo);), (return ::cuda::__ffs_impl_device(__lo);)); | ||
} | ||
return ::cuda::__ffs_impl_constexpr(__lo); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
consider also merging this pattern in a separate function
#include <cuda/std/__type_traits/is_constant_evaluated.h> | ||
#include <cuda/std/__type_traits/is_unsigned_integer.h> | ||
#include <cuda/std/cstdint> | ||
#include <cuda/std/limits> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
add nvtarget
#endif // _CCCL_HAS_BUILTIN(__builtin_ffs) || _CCCL_COMPILER(GCC) | ||
|
||
template <typename _Tp> | ||
[[nodiscard]] _CCCL_API constexpr int __ffs_impl_constexpr(_Tp __v) noexcept |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
_Tp
must be an unsigned type to avoid UB with signed bitwise ops
# elif _CCCL_COMPILER(MSVC) | ||
unsigned long __where{}; | ||
unsigned char __res{}; | ||
if constexpr (sizeof(_Tp) <= sizeof(uint32_t)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would also add
using ::cuda::std::uint32_t
using ::cuda::std::uint64_t
#endif // no system header | ||
|
||
#include <cuda/std/__type_traits/is_constant_evaluated.h> | ||
#include <cuda/std/__type_traits/is_unsigned_integer.h> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
looks unused
libcudacxx/include/cuda/__bit/ffs.h
Outdated
(return ::cuda::__ffs_impl_host(static_cast<uint64_t>(__v));), | ||
(return ::cuda::__ffs_impl_device(static_cast<uint64_t>(__v));)); | ||
} | ||
return ::cuda::__ffs_impl_constexpr(static_cast<uint64_t>(__v)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
please also add _CCCL_ASSUME(result >= 0 && result <=32 )
cuda::ffs
69d084e
to
73a7a6a
Compare
Skipped:
|
Implement type-safe
cuda::std::ffs
function as replacement for__ffs
intrinsic.x == 0
correctly (returns 0)Fixes #6108