Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions cub/cub/block/block_radix_rank.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
#include <cuda/__ptx/instructions/get_sreg.h>
#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__bit/integral.h>
#include <cuda/std/__bit/popcount.h>
#include <cuda/std/__functional/operations.h>
#include <cuda/std/__type_traits/conditional.h>
#include <cuda/std/__type_traits/is_same.h>
Expand Down Expand Up @@ -742,10 +743,10 @@ public:
__syncwarp(0xFFFFFFFF);

// Number of peers having same digit as me
int32_t digit_count = __popc(peer_mask);
int32_t digit_count = ::cuda::std::popcount(peer_mask);

// Number of lower-ranked peers having same digit seen so far
int32_t peer_digit_prefix = __popc(peer_mask & lane_mask_lt);
int32_t peer_digit_prefix = ::cuda::std::popcount(peer_mask & lane_mask_lt);

if (peer_digit_prefix == 0)
{
Expand Down Expand Up @@ -1075,7 +1076,7 @@ struct BlockRadixRankMatchEarlyCounts
int bin_mask = *p_match_mask;
int leader = ::cuda::std::__bit_log2(static_cast<unsigned>(bin_mask));
int warp_offset = 0;
int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
int popc = ::cuda::std::popcount(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
if (lane == leader)
{
// atomic is a bit faster
Expand Down Expand Up @@ -1105,7 +1106,7 @@ struct BlockRadixRankMatchEarlyCounts
detail::warp_in_block_matcher_t<RADIX_BITS, PARTIAL_WARP_THREADS, BLOCK_WARPS - 1>::match_any(bin, warp);
int leader = ::cuda::std::__bit_log2(static_cast<unsigned>(bin_mask));
int warp_offset = 0;
int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
int popc = ::cuda::std::popcount(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
if (lane == leader)
{
// atomic is a bit faster
Expand Down
3 changes: 2 additions & 1 deletion libcudacxx/include/cuda/__barrier/barrier_block_scope.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <cuda/std/__barrier/barrier.h>
#include <cuda/std/__barrier/empty_completion.h>
#include <cuda/std/__barrier/poll_tester.h>
#include <cuda/std/__bit/popcount.h>
#include <cuda/std/__chrono/duration.h>
#include <cuda/std/__chrono/high_resolution_clock.h>
#include <cuda/std/__chrono/time_point.h>
Expand Down Expand Up @@ -165,7 +166,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
unsigned int __activeA = ::__match_any_sync(__mask, __update);
unsigned int __activeB = ::__match_any_sync(__mask, reinterpret_cast<::cuda::std::uintptr_t>(&__barrier));
unsigned int __active = __activeA & __activeB;
int __inc = ::__popc(__active) * __update;
int __inc = ::cuda::std::popcount(__active) * __update;

int __leader = ::__ffs(__active) - 1;
// All threads in mask synchronize here, establishing cumulativity to the __leader:
Expand Down
5 changes: 3 additions & 2 deletions libcudacxx/include/cuda/pipeline
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include <cuda/atomic>
#include <cuda/barrier>
#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__bit/popcount.h>
#include <cuda/std/__chrono/duration.h>
#include <cuda/std/__chrono/time_point.h>
#include <cuda/std/cstdint>
Expand Down Expand Up @@ -112,7 +113,7 @@ public:
::__match_any_sync(::__activemask(), reinterpret_cast<uintptr_t>(__shared_state_get_refcount()));
const uint32_t __elected_id = ::__ffs(__match_mask) - 1;
__elected = (::cuda::ptx::get_sreg_laneid() == __elected_id);
__sub_count = ::__popc(__match_mask);
__sub_count = ::cuda::std::popcount(__match_mask);
, __elected = true;
__sub_count = 1;)
bool __released = false;
Expand Down Expand Up @@ -301,7 +302,7 @@ make_pipeline(const _Group& __group, pipeline_shared_state<_Scope, _Stages_count
::__match_any_sync(::__activemask(), reinterpret_cast<uintptr_t>(&__shared_state->__refcount));
const uint32_t __elected_id = ::__ffs(__match_mask) - 1;
__elected = (::cuda::ptx::get_sreg_laneid() == __elected_id);
__add_count = ::__popc(__match_mask);
__add_count = ::cuda::std::popcount(__match_mask);
, __elected = true;
__add_count = 1;)
if (__elected)
Expand Down
3 changes: 2 additions & 1 deletion thrust/thrust/system/cuda/detail/set_operations.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@

# include <cuda/std/__algorithm/max.h>
# include <cuda/std/__algorithm/min.h>
# include <cuda/std/__bit/popcount.h>
# include <cuda/std/cstdint>

THRUST_NAMESPACE_BEGIN
Expand Down Expand Up @@ -560,7 +561,7 @@ struct SetOpAgent
Size tile_output_count = 0;
Size thread_output_prefix = 0;
Size tile_output_prefix = 0;
Size thread_output_count = static_cast<Size>(__popc(active_mask));
Size thread_output_count = static_cast<Size>(::cuda::std::popcount(static_cast<unsigned>(active_mask)));

if (tile_idx == 0) // first tile
{
Expand Down