[libcu++] Implement `cuda::ffs` #6192

Aminsed · 2025-10-13T03:21:33Z

Implement type-safe cuda::std::ffs function as replacement for __ffs intrinsic.

Returns 1-based index of first set bit (0 for no bits set)
Works on all platforms and integer types
Handles x == 0 correctly (returns 0)

Fixes #6108

copy-pr-bot · 2025-10-13T03:21:38Z

This pull request requires additional validation before any workflows can run on NVIDIA's runners.

Pull request vetters can view their responsibilities here.

Contributors can view more details about this message here.

miscco

Thanks a lot, this is already looking really great 🎉

I have two minor comments, but this is missing tests. Please have a look at what we do for e.g. countr_zero in libcudacxx/test/libcudacxx/std/numerics/bit/bitops.count/countr_zero.pass.cpp

If you have any questions, feel free to ask

libcudacxx/include/cuda/std/__bit/ffs.h

libcudacxx/include/cuda/__barrier/barrier_block_scope.h

libcudacxx/include/cuda/std/__bit/ffs.h

libcudacxx/include/cuda/std/__cccl/builtin.h

Aminsed · 2025-10-13T14:00:13Z

Addressed all feedback:

Moved to cuda/__bit/ffs.h
Changed to cuda:: namespace
Updated header guards
Using _CCCL_DEVICE_API
Fixed signed cast with make_signed_t
Simplified implementation
Builtin definitions moved to ffs.h
Added comprehensive tests

davebayer · 2025-10-13T15:00:54Z

libcudacxx/include/cuda/__bit/ffs.h

+#if _CCCL_COMPILER(GCC) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC)
+#  define _CCCL_BUILTIN_FFS(...)   __builtin_ffs(__VA_ARGS__)
+#  define _CCCL_BUILTIN_FFSLL(...) __builtin_ffsll(__VA_ARGS__)
+#endif // _CCCL_COMPILER(GCC) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC)
+
+#include <cuda/std/__cccl/prologue.h>


Suggested change

#if _CCCL_COMPILER(GCC) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC)

# define _CCCL_BUILTIN_FFS(...) __builtin_ffs(__VA_ARGS__)

# define _CCCL_BUILTIN_FFSLL(...) __builtin_ffsll(__VA_ARGS__)

#endif // _CCCL_COMPILER(GCC) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC)

#include <cuda/std/__cccl/prologue.h>

#include <cuda/std/__cccl/prologue.h>

#if _CCCL_HAS_BUILTIN(__builtin_ffs) || _CCCL_COMPILER(GCC)

# define _CCCL_BUILTIN_FFS(...) __builtin_ffs(__VA_ARGS__)

# define _CCCL_BUILTIN_FFSLL(...) __builtin_ffsll(__VA_ARGS__)

#endif // _CCCL_HAS_BUILTIN(__builtin_ffs) || _CCCL_COMPILER(GCC)

Let's put the check inside the prologue/epilogue scope. Also, it should be enough to use the combination of _CCCL_HAS_BUILTIN(__builtin_ffs) and _CCCL_COMPILER(GCC) :)

davebayer · 2025-10-13T15:02:54Z

libcudacxx/include/cuda/__bit/ffs.h

+  using _Signed = ::cuda::std::make_signed_t<_Tp>;
+  if constexpr (sizeof(_Tp) <= sizeof(int))
+  {
+    return _CCCL_BUILTIN_FFS(static_cast<_Signed>(__v));
+  }
+  else
+  {
+    return _CCCL_BUILTIN_FFSLL(static_cast<_Signed>(__v));
+  }


Suggested change

using _Signed = ::cuda::std::make_signed_t<_Tp>;

if constexpr (sizeof(_Tp) <= sizeof(int))

{

return _CCCL_BUILTIN_FFS(static_cast<_Signed>(__v));

}

else

{

return _CCCL_BUILTIN_FFSLL(static_cast<_Signed>(__v));

}

if constexpr (sizeof(_Tp) <= sizeof(int))

{

return _CCCL_BUILTIN_FFS(static_cast<int>(__v));

}

else

{

return _CCCL_BUILTIN_FFSLL(static_cast<long long>(__v));

}

we can simplify it a bit further

davebayer · 2025-10-13T15:07:58Z

libcudacxx/include/cuda/__bit/ffs.h

+  if constexpr (sizeof(_Tp) <= sizeof(int))
+  {
+    return ::cuda::__ffs_impl(static_cast<uint32_t>(__v));
+  }
+  else
+  {
+    return ::cuda::__ffs_impl(static_cast<uint64_t>(__v));
+  }


This is unnecessary I believe, we can just move contents of __ffs_impl here directly

We also support 128-bit integers if available, so we should handle them, too. You can just split them into 2x 64-bit parts and handle them separately

davebayer · 2025-10-13T15:08:11Z

libcudacxx/include/cuda/__bit/ffs.h

+template <typename _Tp>
+[[nodiscard]] _CCCL_API constexpr int __ffs_impl(_Tp __v) noexcept
+{
+  static_assert(::cuda::std::is_same_v<_Tp, uint32_t> || ::cuda::std::is_same_v<_Tp, uint64_t>);


This can be removed

davebayer · 2025-10-13T15:10:28Z

libcudacxx/test/libcudacxx/cuda/bit/ffs.pass.cpp

+  test<unsigned char>();
+  test<unsigned short>();
+  test<unsigned>();
+  test<unsigned long>();
+  test<unsigned long long>();
+
+  test<uint8_t>();
+  test<uint16_t>();
+  test<uint32_t>();
+  test<uint64_t>();
+  test<size_t>();
+  test<uintmax_t>();
+  test<uintptr_t>();
+
+#if _CCCL_HAS_INT128()
+  test<__uint128_t>();
+#endif // _CCCL_HAS_INT128()


Suggested change

test<unsigned char>();

test<unsigned short>();

test<unsigned>();

test<unsigned long>();

test<unsigned long long>();

test<uint8_t>();

test<uint16_t>();

test<uint32_t>();

test<uint64_t>();

test<size_t>();

test<uintmax_t>();

test<uintptr_t>();

#if _CCCL_HAS_INT128()

test<__uint128_t>();

#endif // _CCCL_HAS_INT128()

test<unsigned char>();

test<unsigned short>();

test<unsigned>();

test<unsigned long>();

test<unsigned long long>();

#if _CCCL_HAS_INT128()

test<__uint128_t>();

#endif // _CCCL_HAS_INT128()

Those are not necessary, they are just aliases to the standard integer types

davebayer · 2025-10-13T15:11:55Z

libcudacxx/test/libcudacxx/cuda/bit/ffs.pass.cpp

+template <typename T>
+__host__ __device__ constexpr bool test()
+{
+  static_assert(cuda::ffs(T(0)) == 0);


These tests should use just assert(...) instead of static_assert(...). The values will be checked in constexpr context when we call static_assert(test()) from main

davebayer · 2025-10-13T15:14:26Z

libcudacxx/test/libcudacxx/cuda/bit/ffs.pass.cpp

+  static_assert(cuda::ffs(T(127)) == 1);
+  static_assert(cuda::ffs(T(128)) == 8);


I'd like to see tests of greater values than 128. That's also why these tests wouldn't catch that the function won't work for 128-bit integers.

For 128-bit integer literals, you can `#include "literal.h"` and use the `_u128` user-defined literals from the `test_integer_literals` namespace

miscco · 2025-10-13T15:17:19Z

libcudacxx/test/libcudacxx/cuda/bit/ffs.pass.cpp

+  static_assert(cuda::ffs(T(0)) == 0);
+  static_assert(cuda::ffs(T(1)) == 1);
+  static_assert(cuda::ffs(T(2)) == 2);
+  static_assert(cuda::ffs(T(3)) == 1);
+  static_assert(cuda::ffs(T(4)) == 3);
+  static_assert(cuda::ffs(T(5)) == 1);
+  static_assert(cuda::ffs(T(6)) == 2);
+  static_assert(cuda::ffs(T(7)) == 1);
+  static_assert(cuda::ffs(T(8)) == 4);
+  static_assert(cuda::ffs(T(9)) == 1);
+  static_assert(cuda::ffs(T(127)) == 1);
+  static_assert(cuda::ffs(T(128)) == 8);
+
+  // Test relationship with countr_zero: ffs(x) == countr_zero(x) + 1 for x != 0
+  static_assert(cuda::ffs(T(1)) == cuda::std::countr_zero(T(1)) + 1);
+  static_assert(cuda::ffs(T(2)) == cuda::std::countr_zero(T(2)) + 1);
+  static_assert(cuda::ffs(T(4)) == cuda::std::countr_zero(T(4)) + 1);
+  static_assert(cuda::ffs(T(8)) == cuda::std::countr_zero(T(8)) + 1);
+
+  // Test return type
+  static_assert(cuda::std::is_same_v<int, decltype(cuda::ffs(T(0)))>);
+  static_assert(noexcept(cuda::ffs(T(0))));


Important: static_assert will only test at compile time, so we want all of those static_asserts to be plain asserts

At compile time a failing plain assert would still trigger a compile error, and it does the right thing at runtime

miscco · 2025-10-13T15:17:41Z

libcudacxx/test/libcudacxx/cuda/bit/ffs.pass.cpp

+#include "test_macros.h"
+
+template <typename T>
+__host__ __device__ constexpr bool test()


Nitpick: This function could just be void

miscco · 2025-10-13T15:20:19Z

libcudacxx/test/libcudacxx/cuda/bit/ffs.pass.cpp

+  assert(test());
+  static_assert(test());
+  return 0;


Important, we had issues with the compiler being too clever and optimizing the whole tests out.

To alleviate that, we do the following:

Suggested change

assert(test());

static_assert(test());

return 0;

volatile int val = 0;

assert(test(val));

static_assert(test(0));

return 0;

And then inside test(const int) we static cast to the respective type.

It's a pain, but it ensures that the optimizer does not constant-fold the function out

Aminsed · 2025-10-13T15:53:39Z

Builtins use _CCCL_HAS_BUILTIN inside prologue
Simplified to direct int/long long casts
Flattened ffs() implementation
Added 128-bit integer support with hi/lo split
Tests use assert in constexpr context
Added large value tests with _u128 literals

fbusato

thanks a lot for your contribution. When you get a chance, please also add the documentation

fbusato · 2025-10-13T16:24:43Z

libcudacxx/include/cuda/__bit/ffs.h

+      if (!::cuda::std::__cccl_default_is_constant_evaluated())
+      {
+        NV_IF_ELSE_TARGET(NV_IS_HOST,
+                          (return ::cuda::__ffs_impl_host(static_cast<uint32_t>(__v));),


static_cast is not needed for __ffs_impl_host and __ffs_impl_device

fbusato · 2025-10-13T16:25:08Z

libcudacxx/include/cuda/__bit/ffs.h

+    }
+    else
+    {
+      if (!::cuda::std::__cccl_default_is_constant_evaluated())


Would it be possible to merge this branch with the previous one?

fbusato · 2025-10-13T16:26:54Z

libcudacxx/include/cuda/__bit/ffs.h

+      }
+      return ::cuda::__ffs_impl_constexpr(__lo);
+    }
+    const auto __hi = static_cast<uint64_t>(__v >> 64);


prefer an unsigned type. Signed shift is undefined behavior (before C++20)

fbusato · 2025-10-13T16:28:35Z

libcudacxx/include/cuda/__bit/ffs.h

+      return ::cuda::__ffs_impl_constexpr(__lo);
+    }
+    const auto __hi = static_cast<uint64_t>(__v >> 64);
+    if (__hi != 0)


this condition is not needed. __ffs_impl_host/device already return 0 for input 0

fbusato · 2025-10-13T16:30:02Z

libcudacxx/include/cuda/__bit/ffs.h

+      if (!::cuda::std::__cccl_default_is_constant_evaluated())
+      {
+        NV_IF_ELSE_TARGET(
+          NV_IS_HOST, (return ::cuda::__ffs_impl_host(__lo);), (return ::cuda::__ffs_impl_device(__lo);));
+      }
+      return ::cuda::__ffs_impl_constexpr(__lo);


consider also merging this pattern in a separate function

fbusato · 2025-10-13T16:33:34Z

libcudacxx/include/cuda/__bit/ffs.h

+#include <cuda/std/__type_traits/is_constant_evaluated.h>
+#include <cuda/std/__type_traits/is_unsigned_integer.h>
+#include <cuda/std/cstdint>
+#include <cuda/std/limits>


add nvtarget

fbusato · 2025-10-13T16:34:10Z

libcudacxx/include/cuda/__bit/ffs.h

+#endif // _CCCL_HAS_BUILTIN(__builtin_ffs) || _CCCL_COMPILER(GCC)
+
+template <typename _Tp>
+[[nodiscard]] _CCCL_API constexpr int __ffs_impl_constexpr(_Tp __v) noexcept


_Tp must be an unsigned type to avoid UB with signed bitwise ops

fbusato · 2025-10-13T16:34:56Z

libcudacxx/include/cuda/__bit/ffs.h

+#  elif _CCCL_COMPILER(MSVC)
+  unsigned long __where{};
+  unsigned char __res{};
+  if constexpr (sizeof(_Tp) <= sizeof(uint32_t))


I would also add

using ::cuda::std::uint32_t; using ::cuda::std::uint64_t;

fbusato · 2025-10-13T16:35:25Z

libcudacxx/include/cuda/__bit/ffs.h

+#endif // no system header
+
+#include <cuda/std/__type_traits/is_constant_evaluated.h>
+#include <cuda/std/__type_traits/is_unsigned_integer.h>


looks unused

fbusato · 2025-10-13T16:36:27Z

libcudacxx/include/cuda/__bit/ffs.h

+                          (return ::cuda::__ffs_impl_host(static_cast<uint64_t>(__v));),
+                          (return ::cuda::__ffs_impl_device(static_cast<uint64_t>(__v));));
+      }
+      return ::cuda::__ffs_impl_constexpr(static_cast<uint64_t>(__v));


please also add _CCCL_ASSUME(result >= 0 && result <=32 )

Fixes NVIDIA#6108

Aminsed · 2025-10-13T19:12:27Z

@fbusato

Removed unnecessary casts
Fixed unsigned shift (avoid UB)
Added static_assert and _CCCL_ASSUME
Simplified MSVC ternary

Skipped:

Using declarations (bit_reverse.h doesn't use them)
Helper extraction (conflicts with @davebayer's flatten request)

Aminsed requested a review from a team as a code owner October 13, 2025 03:21

github-project-automation bot added this to CCCL Oct 13, 2025

Aminsed requested a review from griwes October 13, 2025 03:21

github-project-automation bot moved this to Todo in CCCL Oct 13, 2025

cccl-authenticator-app bot moved this from Todo to In Review in CCCL Oct 13, 2025

miscco requested changes Oct 13, 2025

View reviewed changes

libcudacxx/include/cuda/std/__bit/ffs.h Outdated Show resolved Hide resolved

libcudacxx/include/cuda/std/__bit/ffs.h Outdated Show resolved Hide resolved

github-project-automation bot moved this from In Review to In Progress in CCCL Oct 13, 2025

davebayer requested changes Oct 13, 2025

View reviewed changes

Aminsed force-pushed the feature/implement_bit_ffs branch from 02c7d56 to 731da14 Compare October 13, 2025 13:57

Aminsed requested review from davebayer and miscco October 13, 2025 14:35

davebayer requested changes Oct 13, 2025

View reviewed changes

miscco reviewed Oct 13, 2025

View reviewed changes

Aminsed force-pushed the feature/implement_bit_ffs branch from 79ef976 to 69d084e Compare October 13, 2025 15:49

Aminsed requested review from davebayer and miscco October 13, 2025 15:59

fbusato requested changes Oct 13, 2025

View reviewed changes

fbusato changed the title ~~[libcu++] Implement cuda::std::ffs~~ [libcu++] Implement cuda::ffs Oct 13, 2025

[libcu++] Implement cuda::ffs

73a7a6a

Fixes NVIDIA#6108

Aminsed force-pushed the feature/implement_bit_ffs branch from 69d084e to 73a7a6a Compare October 13, 2025 17:06

Aminsed requested a review from fbusato October 13, 2025 19:13

		static_assert(cuda::ffs(T(127)) == 1);
		static_assert(cuda::ffs(T(128)) == 8);

[libcu++] Implement cuda::ffs #6192

Are you sure you want to change the base?

[libcu++] Implement cuda::ffs #6192

Uh oh!

Conversation

Aminsed commented Oct 13, 2025

Uh oh!

copy-pr-bot bot commented Oct 13, 2025

Uh oh!

miscco left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Aminsed commented Oct 13, 2025

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Aminsed commented Oct 13, 2025

Uh oh!

fbusato left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Aminsed commented Oct 13, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants

[libcu++] Implement `cuda::ffs` #6192

[libcu++] Implement `cuda::ffs` #6192