-
Notifications
You must be signed in to change notification settings - Fork 282
[libcu++] Implement cuda::ffs
#6192
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
73a7a6a
a53b94d
d01c676
07e6422
817a29a
e094a42
45877f7
4961902
9d56e87
7655ac1
9eb8017
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,69 @@ | ||||||
.. _libcudacxx-extended-api-bit-ffs: | ||||||
|
||||||
``cuda::ffs`` | ||||||
============= | ||||||
|
||||||
.. code:: cpp | ||||||
template <typename T> | ||||||
[[nodiscard]] constexpr int | ||||||
ffs(T value) noexcept; | ||||||
The function finds the first (least significant) set bit in ``value`` and returns its 1-based index. If ``value`` is 0, the function returns 0. | ||||||
|
||||||
**Parameters** | ||||||
|
||||||
- ``value``: Input value | ||||||
|
||||||
**Return value** | ||||||
|
||||||
- The 1-based index of the first set bit, or 0 if ``value`` is 0 | ||||||
|
||||||
**Constraints** | ||||||
|
||||||
- ``T`` is an unsigned integer type. | ||||||
|
||||||
**Relationship with other functions** | ||||||
|
||||||
- For non-zero values: ``ffs(x) == countr_zero(x) + 1`` | ||||||
|
||||||
**Performance considerations** | ||||||
|
||||||
The function performs the following operations: | ||||||
|
||||||
- Device: | ||||||
|
||||||
- ``uint8_t``, ``uint16_t``, ``uint32_t``: ``FFS`` | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
I would omit the other cases |
||||||
- ``uint64_t``: ``FFSLL`` | ||||||
- ``__uint128_t``: ``FFSLL`` on the low 64 bits, then on the high 64 bits only if the low half is zero | ||||||
|
||||||
- Host: | ||||||
|
||||||
- GCC/Clang: ``__builtin_ffs`` / ``__builtin_ffsll`` | ||||||
- MSVC: ``_BitScanForward`` / ``_BitScanForward64`` | ||||||
- Other: Portable constexpr loop implementation | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
|
||||||
.. note:: | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would suggest removing this note. It doesn't look aligned with other similar functions |
||||||
|
||||||
The function is guaranteed to be ``constexpr`` on all platforms, allowing compile-time evaluation when the input is a constant expression. | ||||||
|
||||||
Example | ||||||
------- | ||||||
|
||||||
.. code:: cpp | ||||||
#include <cuda/bit> | ||||||
#include <cuda/std/cassert> | ||||||
// Each assertion runs on the device and checks one documented property of cuda::ffs.
__global__ void ffs_kernel() {
    assert(cuda::ffs(0u) == 0);            // zero has no set bit, so the result is 0
    assert(cuda::ffs(1u) == 1);            // bit 0 is reported as index 1 (1-based)
    assert(cuda::ffs(0b1100u) == 3);       // the least significant set bit wins
    assert(cuda::ffs(0x80000000u) == 32);  // only the top bit of a 32-bit value is set
}

int main() {
    ffs_kernel<<<1, 1>>>();
    // A bad launch is reported by cudaGetLastError(); a failed device-side assert
    // is reported by the next synchronizing call. Check both so the example does
    // not exit successfully when an assertion fired.
    if (cudaGetLastError() != cudaSuccess || cudaDeviceSynchronize() != cudaSuccess) {
        return 1;
    }
    return 0;
}
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,156 @@ | ||||||||
//===----------------------------------------------------------------------===// | ||||||||
// | ||||||||
// Part of libcu++, the C++ Standard Library for your entire system, | ||||||||
// under the Apache License v2.0 with LLVM Exceptions. | ||||||||
// See https://llvm.org/LICENSE.txt for license information. | ||||||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||||||
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. | ||||||||
// | ||||||||
//===----------------------------------------------------------------------===// | ||||||||
|
||||||||
#ifndef _CUDA___BIT_FFS_H | ||||||||
#define _CUDA___BIT_FFS_H | ||||||||
|
||||||||
#include <cuda/std/detail/__config> | ||||||||
|
||||||||
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) | ||||||||
# pragma GCC system_header | ||||||||
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) | ||||||||
# pragma clang system_header | ||||||||
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) | ||||||||
# pragma system_header | ||||||||
#endif // no system header | ||||||||
|
||||||||
#include <cuda/std/__type_traits/conditional.h> | ||||||||
#include <cuda/std/__type_traits/is_constant_evaluated.h> | ||||||||
#include <cuda/std/__type_traits/is_unsigned_integer.h> | ||||||||
Aminsed marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||
#include <cuda/std/cstdint> | ||||||||
#include <cuda/std/limits> | ||||||||
Aminsed marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||
|
||||||||
#include <nv/target> | ||||||||
|
||||||||
#if _CCCL_COMPILER(MSVC) | ||||||||
# include <intrin.h> | ||||||||
#endif // _CCCL_COMPILER(MSVC) | ||||||||
|
||||||||
#include <cuda/std/__cccl/prologue.h> | ||||||||
|
||||||||
_CCCL_BEGIN_NAMESPACE_CUDA | ||||||||
|
||||||||
// Find-first-set builtins. They are enabled unconditionally for GCC and for any
// other compiler that reports __builtin_ffs through _CCCL_HAS_BUILTIN. When
// neither holds, the macros stay undefined and callers must pick another path.
#if _CCCL_COMPILER(GCC) || _CCCL_HAS_BUILTIN(__builtin_ffs)
#  define _CCCL_BUILTIN_FFSLL(...) __builtin_ffsll(__VA_ARGS__)
#  define _CCCL_BUILTIN_FFS(...)   __builtin_ffs(__VA_ARGS__)
#endif // _CCCL_COMPILER(GCC) || _CCCL_HAS_BUILTIN(__builtin_ffs)
|
||||||||
template <typename _Tp> | ||||||||
[[nodiscard]] _CCCL_HIDE_FROM_ABI constexpr int __ffs_impl_constexpr(_Tp __v) noexcept | ||||||||
{ | ||||||||
static_assert(::cuda::std::__cccl_is_unsigned_integer_v<_Tp>, "_Tp must be unsigned"); | ||||||||
|
||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
if (__v == 0) | ||||||||
{ | ||||||||
return 0; | ||||||||
} | ||||||||
|
||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
int __pos = 1; | ||||||||
while ((__v & 1) == 0) | ||||||||
{ | ||||||||
__v >>= 1; | ||||||||
++__pos; | ||||||||
} | ||||||||
return __pos; | ||||||||
} | ||||||||
|
||||||||
#if !_CCCL_COMPILER(NVRTC)
// Host-side find-first-set.
//
// Uses the compiler builtins when _CCCL_BUILTIN_FFS is defined, the
// _BitScanForward intrinsics on MSVC, and the portable loop otherwise.
// Returns the 1-based index of the least significant set bit, or 0 for zero.
template <typename _Tp>
[[nodiscard]] _CCCL_HOST_API int __ffs_impl_host(_Tp __v) noexcept
{
#  if defined(_CCCL_BUILTIN_FFS)
  // The builtin result is returned as-is: it is already 1-based, with 0 for zero input.
  if constexpr (sizeof(_Tp) > sizeof(int))
  {
    return _CCCL_BUILTIN_FFSLL(static_cast<long long>(__v));
  }
  else
  {
    return _CCCL_BUILTIN_FFS(static_cast<int>(__v));
  }
#  elif _CCCL_COMPILER(MSVC)
  unsigned long __bit_index{};
  unsigned char __found{};
  if constexpr (sizeof(_Tp) > sizeof(::cuda::std::uint32_t))
  {
    __found = ::_BitScanForward64(&__bit_index, static_cast<::cuda::std::uint64_t>(__v));
  }
  else
  {
    __found = ::_BitScanForward(&__bit_index, static_cast<::cuda::std::uint32_t>(__v));
  }
  // The intrinsic reports a 0-based index and returns zero when no bit is set,
  // so convert to the 1-based convention and map "not found" to 0.
  if (!__found)
  {
    return 0;
  }
  return static_cast<int>(__bit_index) + 1;
#  else
  return ::cuda::__ffs_impl_constexpr(__v);
#  endif // _CCCL_COMPILER(MSVC)
}
#endif // !_CCCL_COMPILER(NVRTC)
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
|
||||||||
#if _CCCL_CUDA_COMPILATION()
// Device-side find-first-set.
//
// Forwards to the ::__ffs or ::__ffsll intrinsic, chosen by the width of `_Tp`
// relative to `int`, and returns the intrinsic's result unchanged.
template <typename _Tp>
[[nodiscard]] _CCCL_DEVICE_API int __ffs_impl_device(_Tp __v) noexcept
{
  if constexpr (sizeof(_Tp) > sizeof(int))
  {
    return ::__ffsll(static_cast<long long>(__v));
  }
  else
  {
    return ::__ffs(static_cast<int>(__v));
  }
}
#endif // _CCCL_CUDA_COMPILATION()
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
|
||||||||
// Returns the 1-based index of the least significant set bit of `__v`,
// or 0 if `__v` is zero. `_Tp` must be an unsigned integer type.
//
// During constant evaluation the portable loop is used; at run time the call
// is dispatched to the host or device implementation.
_CCCL_TEMPLATE(typename _Tp)
_CCCL_REQUIRES(::cuda::std::__cccl_is_unsigned_integer_v<_Tp>)
[[nodiscard]] _CCCL_API constexpr int ffs(_Tp __v) noexcept
{
#if _CCCL_HAS_INT128()
  if constexpr (sizeof(_Tp) == sizeof(__uint128_t))
  {
    // Split the value into 64-bit halves; the low half wins whenever it has a set bit.
    const auto __low_half = static_cast<::cuda::std::uint64_t>(__v);
    if (const auto __low_pos = ::cuda::ffs(__low_half))
    {
      return __low_pos;
    }

    // Low half is zero: a hit in the high half is offset by the 64 bits below it.
    const auto __high_half = static_cast<::cuda::std::uint64_t>(static_cast<__uint128_t>(__v) >> 64);
    const auto __high_pos  = ::cuda::ffs(__high_half);
    return __high_pos ? __high_pos + 64 : 0;
  }
  else
#endif // _CCCL_HAS_INT128()
  {
    // 64-bit inputs keep their width; every narrower type is widened to 32 bits.
    using _Up = ::cuda::std::
      conditional_t<sizeof(_Tp) == sizeof(::cuda::std::uint64_t), ::cuda::std::uint64_t, ::cuda::std::uint32_t>;
    const auto __widened = static_cast<_Up>(__v);

    int __pos{};
    if (::cuda::std::__cccl_default_is_constant_evaluated())
    {
      __pos = ::cuda::__ffs_impl_constexpr(__widened);
    }
    else
    {
      NV_IF_ELSE_TARGET(
        NV_IS_HOST, (__pos = ::cuda::__ffs_impl_host(__widened);), (__pos = ::cuda::__ffs_impl_device(__widened);));
    }
    // The result can never exceed the bit width of the original type.
    _CCCL_ASSUME(__pos >= 0 && __pos <= ::cuda::std::numeric_limits<_Tp>::digits);
    return __pos;
  }
}
|
||||||||
_CCCL_END_NAMESPACE_CUDA | ||||||||
|
||||||||
#include <cuda/std/__cccl/epilogue.h> | ||||||||
|
||||||||
#endif // _CUDA___BIT_FFS_H |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,7 @@ | |
#endif // no system header | ||
|
||
#if _CCCL_CUDA_COMPILATION() | ||
# include <cuda/__bit/ffs.h> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The right header is cuda/bit only. Remove this line |
||
# include <cuda/__ptx/instructions/bfind.h> | ||
# include <cuda/__ptx/instructions/shl.h> | ||
# include <cuda/__ptx/instructions/shr.h> | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add "Defined in the <cuda/bit> header."