Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/libcudacxx/extended_api/bit.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Bit
bit/bit_reverse
bit/bitfield_insert
bit/bitfield_extract
bit/ffs

.. list-table::
:widths: 25 45 30 30
Expand Down Expand Up @@ -40,3 +41,8 @@ Bit
- Extract a bitfield
- CCCL 3.0.0
- CUDA 13.0

* - :ref:`ffs <libcudacxx-extended-api-bit-ffs>`
- Find first set bit
- CCCL 3.2.0
- CUDA 13.2
69 changes: 69 additions & 0 deletions docs/libcudacxx/extended_api/bit/ffs.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
.. _libcudacxx-extended-api-bit-ffs:

``cuda::ffs``
=============

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please add "Defined in the <cuda/bit> header.

.. code:: cpp
template <typename T>
[[nodiscard]] constexpr int
ffs(T value) noexcept;
The function finds the first (least significant) set bit in ``value`` and returns its 1-based index. If ``value`` is 0, returns 0.

**Parameters**

- ``value``: Input value

**Return value**

- The 1-based index of the first set bit, or 0 if ``value`` is 0

**Constraints**

- ``T`` is an unsigned integer type.

**Relationship with other functions**

- For non-zero values: ``ffs(x) == countr_zero(x) + 1``

**Performance considerations**

The function performs the following operations:

- Device:

- ``uint8_t``, ``uint16_t``, ``uint32_t``: ``FFS``
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
- ``uint8_t``, ``uint16_t``, ``uint32_t``: ``FFS``
- ``uint8_t``, ``uint16_t``, ``uint32_t``: ``BREV``, ``FLO``, ``IADD3``

I would omit the other cases

- ``uint64_t``: ``FFSLL``
- ``uint128_t``: ``FFSLL`` x2 with conditional logic

- Host:

- GCC/Clang: ``__builtin_ffs`` / ``__builtin_ffsll``
- MSVC: ``_BitScanForward`` / ``_BitScanForward64``
- Other: Portable constexpr loop implementation
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
- Other: Portable constexpr loop implementation


.. note::
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would suggest removing this note. It doesn't look aligned with other similar functions


The function is guaranteed to be ``constexpr`` on all platforms, allowing compile-time evaluation when the input is a constant expression.

Example
-------

.. code:: cpp
#include <cuda/bit>
#include <cuda/std/cassert>
__global__ void ffs_kernel() {
assert(cuda::ffs(0u) == 0);
assert(cuda::ffs(1u) == 1);
assert(cuda::ffs(0b1100u) == 3);
assert(cuda::ffs(0x80000000u) == 32);
}
int main() {
ffs_kernel<<<1, 1>>>();
cudaDeviceSynchronize();
return 0;
}
2 changes: 1 addition & 1 deletion libcudacxx/include/cuda/__barrier/barrier_block_scope.h
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
unsigned int __active = __activeA & __activeB;
int __inc = ::__popc(__active) * __update;

int __leader = ::__ffs(__active) - 1;
int __leader = ::cuda::ffs(__active) - 1;
// All threads in mask synchronize here, establishing cummulativity to the __leader:
::__syncwarp(__mask);
if (__leader == static_cast<int>(::cuda::ptx::get_sreg_laneid())) {
Expand Down
156 changes: 156 additions & 0 deletions libcudacxx/include/cuda/__bit/ffs.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA___BIT_FFS_H
#define _CUDA___BIT_FFS_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/std/__type_traits/conditional.h>
#include <cuda/std/__type_traits/is_constant_evaluated.h>
#include <cuda/std/__type_traits/is_unsigned_integer.h>
#include <cuda/std/cstdint>
#include <cuda/std/limits>

#include <nv/target>

#if _CCCL_COMPILER(MSVC)
# include <intrin.h>
#endif // _CCCL_COMPILER(MSVC)

#include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA

#if _CCCL_HAS_BUILTIN(__builtin_ffs) || _CCCL_COMPILER(GCC)
# define _CCCL_BUILTIN_FFS(...) __builtin_ffs(__VA_ARGS__)
# define _CCCL_BUILTIN_FFSLL(...) __builtin_ffsll(__VA_ARGS__)
#endif // _CCCL_HAS_BUILTIN(__builtin_ffs) || _CCCL_COMPILER(GCC)

template <typename _Tp>
[[nodiscard]] _CCCL_HIDE_FROM_ABI constexpr int __ffs_impl_constexpr(_Tp __v) noexcept
{
static_assert(::cuda::std::__cccl_is_unsigned_integer_v<_Tp>, "_Tp must be unsigned");

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change

if (__v == 0)
{
return 0;
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change

int __pos = 1;
while ((__v & 1) == 0)
{
__v >>= 1;
++__pos;
}
return __pos;
}

#if !_CCCL_COMPILER(NVRTC)
template <typename _Tp>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
template <typename _Tp>
template <typename _Tp>

[[nodiscard]] _CCCL_HOST_API int __ffs_impl_host(_Tp __v) noexcept
{
# if defined(_CCCL_BUILTIN_FFS)
if constexpr (sizeof(_Tp) <= sizeof(int))
{
return _CCCL_BUILTIN_FFS(static_cast<int>(__v));
}
else
{
return _CCCL_BUILTIN_FFSLL(static_cast<long long>(__v));
}
# elif _CCCL_COMPILER(MSVC)
unsigned long __where{};
unsigned char __res{};
if constexpr (sizeof(_Tp) <= sizeof(::cuda::std::uint32_t))
{
__res = ::_BitScanForward(&__where, static_cast<::cuda::std::uint32_t>(__v));
}
else
{
__res = ::_BitScanForward64(&__where, static_cast<::cuda::std::uint64_t>(__v));
}
return __res ? (static_cast<int>(__where) + 1) : 0;
# else
return ::cuda::__ffs_impl_constexpr(__v);
# endif // _CCCL_COMPILER(MSVC)
}
#endif // !_CCCL_COMPILER(NVRTC)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
#endif // !_CCCL_COMPILER(NVRTC)
#endif // !_CCCL_COMPILER(NVRTC)


#if _CCCL_CUDA_COMPILATION()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
#if _CCCL_CUDA_COMPILATION()
#if _CCCL_CUDA_COMPILATION()

template <typename _Tp>
[[nodiscard]] _CCCL_DEVICE_API int __ffs_impl_device(_Tp __v) noexcept
{
if constexpr (sizeof(_Tp) <= sizeof(int))
{
return ::__ffs(static_cast<int>(__v));
}
else
{
return ::__ffsll(static_cast<long long>(__v));
}
}
#endif // _CCCL_CUDA_COMPILATION()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
#endif // _CCCL_CUDA_COMPILATION()
#endif // _CCCL_CUDA_COMPILATION()


_CCCL_TEMPLATE(typename _Tp)
_CCCL_REQUIRES(::cuda::std::__cccl_is_unsigned_integer_v<_Tp>)
[[nodiscard]] _CCCL_API constexpr int ffs(_Tp __v) noexcept
{
#if _CCCL_HAS_INT128()
if constexpr (sizeof(_Tp) == sizeof(__uint128_t))
{
const auto __lo = static_cast<::cuda::std::uint64_t>(__v);
const auto __hi = static_cast<::cuda::std::uint64_t>(static_cast<__uint128_t>(__v) >> 64);

if (const auto __result = ::cuda::ffs(__lo))
{
return __result;
}
if (const auto __result = ::cuda::ffs(__hi))
{
return __result + 64;
}
return 0;
}
else
#endif // _CCCL_HAS_INT128()
{
using _Up = ::cuda::std::
conditional_t<sizeof(_Tp) == sizeof(::cuda::std::uint64_t), ::cuda::std::uint64_t, ::cuda::std::uint32_t>;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

conditional_t is expensive. Is it really needed?

const auto __vu = static_cast<_Up>(__v);

int __result{};
if (!::cuda::std::__cccl_default_is_constant_evaluated())
{
NV_IF_ELSE_TARGET(
NV_IS_HOST, (__result = ::cuda::__ffs_impl_host(__vu);), (__result = ::cuda::__ffs_impl_device(__vu);));
}
else
{
__result = ::cuda::__ffs_impl_constexpr(__vu);
}
_CCCL_ASSUME(__result >= 0 && __result <= ::cuda::std::numeric_limits<_Tp>::digits);
return __result;
}
}

_CCCL_END_NAMESPACE_CUDA

#include <cuda/std/__cccl/epilogue.h>

#endif // _CUDA___BIT_FFS_H
1 change: 1 addition & 0 deletions libcudacxx/include/cuda/bit
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <cuda/__bit/bit_reverse.h>
#include <cuda/__bit/bitfield.h>
#include <cuda/__bit/bitmask.h>
#include <cuda/__bit/ffs.h>
#include <cuda/std/bit>

#endif // _CUDA_BIT
5 changes: 3 additions & 2 deletions libcudacxx/include/cuda/pipeline
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include <cuda/__ptx/instructions/get_sreg.h>
#include <cuda/atomic>
#include <cuda/barrier>
#include <cuda/bit>
#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__chrono/duration.h>
#include <cuda/std/__chrono/time_point.h>
Expand Down Expand Up @@ -110,7 +111,7 @@ public:
NV_IS_DEVICE,
const uint32_t __match_mask =
::__match_any_sync(::__activemask(), reinterpret_cast<uintptr_t>(__shared_state_get_refcount()));
const uint32_t __elected_id = ::__ffs(__match_mask) - 1;
const uint32_t __elected_id = ::cuda::ffs(__match_mask) - 1;
__elected = (::cuda::ptx::get_sreg_laneid() == __elected_id);
__sub_count = ::__popc(__match_mask);
, __elected = true;
Expand Down Expand Up @@ -299,7 +300,7 @@ make_pipeline(const _Group& __group, pipeline_shared_state<_Scope, _Stages_count
NV_IS_DEVICE,
const uint32_t __match_mask =
::__match_any_sync(::__activemask(), reinterpret_cast<uintptr_t>(&__shared_state->__refcount));
const uint32_t __elected_id = ::__ffs(__match_mask) - 1;
const uint32_t __elected_id = ::cuda::ffs(__match_mask) - 1;
__elected = (::cuda::ptx::get_sreg_laneid() == __elected_id);
__add_count = ::__popc(__match_mask);
, __elected = true;
Expand Down
1 change: 1 addition & 0 deletions libcudacxx/include/cuda/std/__bit/integral.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#endif // no system header

#if _CCCL_CUDA_COMPILATION()
# include <cuda/__bit/ffs.h>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The right header is cuda/bit only. Remove this line

# include <cuda/__ptx/instructions/bfind.h>
# include <cuda/__ptx/instructions/shl.h>
# include <cuda/__ptx/instructions/shr.h>
Expand Down
Loading
Loading