NVIDIA · Aminsed · Oct 13, 2025 · Oct 21, 2025 · Oct 21, 2025 · Oct 21, 2025
@@ -11,6 +11,7 @@ Bit
    bit/bit_reverse
    bit/bitfield_insert
    bit/bitfield_extract
+   bit/ffs
 
 .. list-table::
    :widths: 25 45 30 30
@@ -40,3 +41,8 @@ Bit
      - Extract a bitfield
      - CCCL 3.0.0
      - CUDA 13.0
+
+   * - :ref:`ffs <libcudacxx-extended-api-bit-ffs>`
+     - Find first set bit
+     - CCCL 3.2.0
+     - CUDA 13.2
@@ -0,0 +1,69 @@
+.. _libcudacxx-extended-api-bit-ffs:
+
+``cuda::ffs``
+=============
+
+.. code:: cpp
+
+   template <typename T>
+   [[nodiscard]] constexpr int
+   ffs(T value) noexcept;
+
+The function finds the first (least significant) set bit in ``value`` and returns its 1-based index. If ``value`` is 0, returns 0.
+
+**Parameters**
+
+- ``value``: Input value
+
+**Return value**
+
+- The 1-based index of the first set bit, or 0 if ``value`` is 0
+
+**Constraints**
+
+- ``T`` is an unsigned integer type.
+
+**Relationship with other functions**
+
+- For non-zero values: ``ffs(x) == countr_zero(x) + 1``
+
+**Performance considerations**
+
+The function performs the following operations:
+
+- Device:
+
+  - ``uint8_t``, ``uint16_t``, ``uint32_t``: ``FFS``
-  - ``uint8_t``, ``uint16_t``, ``uint32_t``: ``FFS``
+  - ``uint8_t``, ``uint16_t``, ``uint32_t``: ``BREV``, ``FLO``, ``IADD3``
-  - ``uint8_t``, ``uint16_t``, ``uint32_t``: ``FFS``
+  - ``uint8_t``, ``uint16_t``, ``uint32_t``: ``BREV``, ``FLO``, ``IADD3``
+  - ``uint64_t``: ``FFSLL``
+  - ``uint128_t``: ``FFSLL`` x2 with conditional logic
+
+- Host:
+
+  - GCC/Clang: ``__builtin_ffs`` / ``__builtin_ffsll``
+  - MSVC: ``_BitScanForward`` / ``_BitScanForward64``
+  - Other: Portable constexpr loop implementation
-  - Other: Portable constexpr loop implementation
-  - Other: Portable constexpr loop implementation
+
+.. note::
+
+    The function is guaranteed to be ``constexpr`` on all platforms, allowing compile-time evaluation when the input is a constant expression.
+
+Example
+-------
+
+.. code:: cpp
+
+    #include <cuda/bit>
+    #include <cuda/std/cassert>
+
+    __global__ void ffs_kernel() {
+        assert(cuda::ffs(0u) == 0);
+        assert(cuda::ffs(1u) == 1);
+        assert(cuda::ffs(0b1100u) == 3);
+        assert(cuda::ffs(0x80000000u) == 32);
+    }
+
+    int main() {
+        ffs_kernel<<<1, 1>>>();
+        cudaDeviceSynchronize();
+        return 0;
+    }
@@ -168,7 +168,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
         unsigned int __active  = __activeA & __activeB;
         int __inc              = ::__popc(__active) * __update;
 
-        int __leader = ::__ffs(__active) - 1;
+        int __leader = ::cuda::ffs(__active) - 1;
         // All threads in mask synchronize here, establishing cummulativity to the __leader:
         ::__syncwarp(__mask);
         if (__leader == static_cast<int>(::cuda::ptx::get_sreg_laneid())) {

@@ -0,0 +1,156 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___BIT_FFS_H
+#define _CUDA___BIT_FFS_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__type_traits/conditional.h>
+#include <cuda/std/__type_traits/is_constant_evaluated.h>
+#include <cuda/std/__type_traits/is_unsigned_integer.h>
+#include <cuda/std/cstdint>
+#include <cuda/std/limits>
+
+#include <nv/target>
+
+#if _CCCL_COMPILER(MSVC)
+#  include <intrin.h>
+#endif // _CCCL_COMPILER(MSVC)
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA
+
+#if _CCCL_HAS_BUILTIN(__builtin_ffs) || _CCCL_COMPILER(GCC)
+#  define _CCCL_BUILTIN_FFS(...)   __builtin_ffs(__VA_ARGS__)
+#  define _CCCL_BUILTIN_FFSLL(...) __builtin_ffsll(__VA_ARGS__)
+#endif // _CCCL_HAS_BUILTIN(__builtin_ffs) || _CCCL_COMPILER(GCC)
+
+template <typename _Tp>
+[[nodiscard]] _CCCL_HIDE_FROM_ABI constexpr int __ffs_impl_constexpr(_Tp __v) noexcept
+{
+  static_assert(::cuda::std::__cccl_is_unsigned_integer_v<_Tp>, "_Tp must be unsigned");
+
-
-
+  if (__v == 0)
+  {
+    return 0;
+  }
+
-
-
+  int __pos = 1;
+  while ((__v & 1) == 0)
+  {
+    __v >>= 1;
+    ++__pos;
+  }
+  return __pos;
+}
+
+#if !_CCCL_COMPILER(NVRTC)
+template <typename _Tp>
-template <typename _Tp>
+
+template <typename _Tp>
-template <typename _Tp>
+
+template <typename _Tp>
+[[nodiscard]] _CCCL_HOST_API int __ffs_impl_host(_Tp __v) noexcept
+{
+#  if defined(_CCCL_BUILTIN_FFS)
+  if constexpr (sizeof(_Tp) <= sizeof(int))
+  {
+    return _CCCL_BUILTIN_FFS(static_cast<int>(__v));
+  }
+  else
+  {
+    return _CCCL_BUILTIN_FFSLL(static_cast<long long>(__v));
+  }
+#  elif _CCCL_COMPILER(MSVC)
+  unsigned long __where{};
+  unsigned char __res{};
+  if constexpr (sizeof(_Tp) <= sizeof(::cuda::std::uint32_t))
+  {
+    __res = ::_BitScanForward(&__where, static_cast<::cuda::std::uint32_t>(__v));
+  }
+  else
+  {
+    __res = ::_BitScanForward64(&__where, static_cast<::cuda::std::uint64_t>(__v));
+  }
+  return __res ? (static_cast<int>(__where) + 1) : 0;
+#  else
+  return ::cuda::__ffs_impl_constexpr(__v);
+#  endif // _CCCL_COMPILER(MSVC)
+}
+#endif // !_CCCL_COMPILER(NVRTC)
-#endif // !_CCCL_COMPILER(NVRTC)
+
+#endif // !_CCCL_COMPILER(NVRTC)
-#endif // !_CCCL_COMPILER(NVRTC)
+
+#endif // !_CCCL_COMPILER(NVRTC)
+
+#if _CCCL_CUDA_COMPILATION()
-#if _CCCL_CUDA_COMPILATION()
+#if _CCCL_CUDA_COMPILATION()
+
-#if _CCCL_CUDA_COMPILATION()
+#if _CCCL_CUDA_COMPILATION()
+
+template <typename _Tp>
+[[nodiscard]] _CCCL_DEVICE_API int __ffs_impl_device(_Tp __v) noexcept
+{
+  if constexpr (sizeof(_Tp) <= sizeof(int))
+  {
+    return ::__ffs(static_cast<int>(__v));
+  }
+  else
+  {
+    return ::__ffsll(static_cast<long long>(__v));
+  }
+}
+#endif // _CCCL_CUDA_COMPILATION()
-#endif // _CCCL_CUDA_COMPILATION()
+
+#endif // _CCCL_CUDA_COMPILATION()
-#endif // _CCCL_CUDA_COMPILATION()
+
+#endif // _CCCL_CUDA_COMPILATION()
+
+_CCCL_TEMPLATE(typename _Tp)
+_CCCL_REQUIRES(::cuda::std::__cccl_is_unsigned_integer_v<_Tp>)
+[[nodiscard]] _CCCL_API constexpr int ffs(_Tp __v) noexcept
+{
+#if _CCCL_HAS_INT128()
+  if constexpr (sizeof(_Tp) == sizeof(__uint128_t))
+  {
+    const auto __lo = static_cast<::cuda::std::uint64_t>(__v);
+    const auto __hi = static_cast<::cuda::std::uint64_t>(static_cast<__uint128_t>(__v) >> 64);
+
+    if (const auto __result = ::cuda::ffs(__lo))
+    {
+      return __result;
+    }
+    if (const auto __result = ::cuda::ffs(__hi))
+    {
+      return __result + 64;
+    }
+    return 0;
+  }
+  else
+#endif // _CCCL_HAS_INT128()
+  {
+    using _Up = ::cuda::std::
+      conditional_t<sizeof(_Tp) == sizeof(::cuda::std::uint64_t), ::cuda::std::uint64_t, ::cuda::std::uint32_t>;
+    const auto __vu = static_cast<_Up>(__v);
+
+    int __result{};
+    if (!::cuda::std::__cccl_default_is_constant_evaluated())
+    {
+      NV_IF_ELSE_TARGET(
+        NV_IS_HOST, (__result = ::cuda::__ffs_impl_host(__vu);), (__result = ::cuda::__ffs_impl_device(__vu);));
+    }
+    else
+    {
+      __result = ::cuda::__ffs_impl_constexpr(__vu);
+    }
+    _CCCL_ASSUME(__result >= 0 && __result <= ::cuda::std::numeric_limits<_Tp>::digits);
+    return __result;
+  }
+}
+
+_CCCL_END_NAMESPACE_CUDA
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA___BIT_FFS_H
@@ -24,6 +24,7 @@
 #include <cuda/__bit/bit_reverse.h>
 #include <cuda/__bit/bitfield.h>
 #include <cuda/__bit/bitmask.h>
+#include <cuda/__bit/ffs.h>
 #include <cuda/std/bit>
 
 #endif // _CUDA_BIT
@@ -28,6 +28,7 @@
 #include <cuda/__ptx/instructions/get_sreg.h>
 #include <cuda/atomic>
 #include <cuda/barrier>
+#include <cuda/bit>
 #include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__chrono/duration.h>
 #include <cuda/std/__chrono/time_point.h>
@@ -110,7 +111,7 @@ public:
       NV_IS_DEVICE,
       const uint32_t __match_mask =
         ::__match_any_sync(::__activemask(), reinterpret_cast<uintptr_t>(__shared_state_get_refcount()));
-      const uint32_t __elected_id = ::__ffs(__match_mask) - 1;
+      const uint32_t __elected_id = ::cuda::ffs(__match_mask) - 1;
       __elected                   = (::cuda::ptx::get_sreg_laneid() == __elected_id);
       __sub_count                 = ::__popc(__match_mask);
       , __elected = true;
@@ -299,7 +300,7 @@ make_pipeline(const _Group& __group, pipeline_shared_state<_Scope, _Stages_count
       NV_IS_DEVICE,
       const uint32_t __match_mask =
         ::__match_any_sync(::__activemask(), reinterpret_cast<uintptr_t>(&__shared_state->__refcount));
-      const uint32_t __elected_id = ::__ffs(__match_mask) - 1;
+      const uint32_t __elected_id = ::cuda::ffs(__match_mask) - 1;
       __elected                   = (::cuda::ptx::get_sreg_laneid() == __elected_id);
       __add_count                 = ::__popc(__match_mask);
       , __elected = true;

@@ -22,6 +22,7 @@
 #endif // no system header
 
 #if _CCCL_CUDA_COMPILATION()
+#  include <cuda/__bit/ffs.h>
 #  include <cuda/__ptx/instructions/bfind.h>
 #  include <cuda/__ptx/instructions/shl.h>
 #  include <cuda/__ptx/instructions/shr.h>