Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions cub/cub/block/block_radix_rank.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
#include <cuda/__ptx/instructions/get_sreg.h>
#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__bit/integral.h>
#include <cuda/std/__bit/popcount.h>
#include <cuda/std/__functional/operations.h>
#include <cuda/std/__type_traits/conditional.h>
#include <cuda/std/__type_traits/is_same.h>
Expand Down Expand Up @@ -742,10 +743,10 @@ public:
__syncwarp(0xFFFFFFFF);

// Number of peers having same digit as me
int32_t digit_count = __popc(peer_mask);
int32_t digit_count = ::cuda::std::popcount(peer_mask);

// Number of lower-ranked peers having same digit seen so far
int32_t peer_digit_prefix = __popc(peer_mask & lane_mask_lt);
int32_t peer_digit_prefix = ::cuda::std::popcount(peer_mask & lane_mask_lt);

if (peer_digit_prefix == 0)
{
Expand Down Expand Up @@ -1075,7 +1076,7 @@ struct BlockRadixRankMatchEarlyCounts
int bin_mask = *p_match_mask;
int leader = ::cuda::std::__bit_log2(static_cast<unsigned>(bin_mask));
int warp_offset = 0;
int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
int popc = ::cuda::std::popcount(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
if (lane == leader)
{
// atomic is a bit faster
Expand Down Expand Up @@ -1105,7 +1106,7 @@ struct BlockRadixRankMatchEarlyCounts
detail::warp_in_block_matcher_t<RADIX_BITS, PARTIAL_WARP_THREADS, BLOCK_WARPS - 1>::match_any(bin, warp);
int leader = ::cuda::std::__bit_log2(static_cast<unsigned>(bin_mask));
int warp_offset = 0;
int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
int popc = ::cuda::std::popcount(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
if (lane == leader)
{
// atomic is a bit faster
Expand Down
3 changes: 2 additions & 1 deletion libcudacxx/include/cuda/__barrier/barrier_block_scope.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <cuda/std/__barrier/barrier.h>
#include <cuda/std/__barrier/empty_completion.h>
#include <cuda/std/__barrier/poll_tester.h>
#include <cuda/std/__bit/popcount.h>
#include <cuda/std/__chrono/duration.h>
#include <cuda/std/__chrono/high_resolution_clock.h>
#include <cuda/std/__chrono/time_point.h>
Expand Down Expand Up @@ -165,7 +166,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
unsigned int __activeA = ::__match_any_sync(__mask, __update);
unsigned int __activeB = ::__match_any_sync(__mask, reinterpret_cast<::cuda::std::uintptr_t>(&__barrier));
unsigned int __active = __activeA & __activeB;
int __inc = ::__popc(__active) * __update;
int __inc = ::cuda::std::popcount(__active) * __update;

int __leader = ::__ffs(__active) - 1;
// All threads in mask synchronize here, establishing cumulativity to the __leader:
Expand Down
5 changes: 3 additions & 2 deletions libcudacxx/include/cuda/pipeline
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include <cuda/atomic>
#include <cuda/barrier>
#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__bit/popcount.h>
#include <cuda/std/__chrono/duration.h>
#include <cuda/std/__chrono/time_point.h>
#include <cuda/std/cstdint>
Expand Down Expand Up @@ -112,7 +113,7 @@ public:
::__match_any_sync(::__activemask(), reinterpret_cast<uintptr_t>(__shared_state_get_refcount()));
const uint32_t __elected_id = ::__ffs(__match_mask) - 1;
__elected = (::cuda::ptx::get_sreg_laneid() == __elected_id);
__sub_count = ::__popc(__match_mask);
__sub_count = ::cuda::std::popcount(__match_mask);
, __elected = true;
__sub_count = 1;)
bool __released = false;
Expand Down Expand Up @@ -301,7 +302,7 @@ make_pipeline(const _Group& __group, pipeline_shared_state<_Scope, _Stages_count
::__match_any_sync(::__activemask(), reinterpret_cast<uintptr_t>(&__shared_state->__refcount));
const uint32_t __elected_id = ::__ffs(__match_mask) - 1;
__elected = (::cuda::ptx::get_sreg_laneid() == __elected_id);
__add_count = ::__popc(__match_mask);
__add_count = ::cuda::std::popcount(__match_mask);
, __elected = true;
__add_count = 1;)
if (__elected)
Expand Down
3 changes: 2 additions & 1 deletion thrust/thrust/system/cuda/detail/set_operations.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@

# include <cuda/std/__algorithm/max.h>
# include <cuda/std/__algorithm/min.h>
# include <cuda/std/__bit/popcount.h>
# include <cuda/std/cstdint>

THRUST_NAMESPACE_BEGIN
Expand Down Expand Up @@ -560,7 +561,7 @@ struct SetOpAgent
Size tile_output_count = 0;
Size thread_output_prefix = 0;
Size tile_output_prefix = 0;
Size thread_output_count = static_cast<Size>(__popc(active_mask));
Size thread_output_count = static_cast<Size>(::cuda::std::popcount(active_mask));

if (tile_idx == 0) // first tile
{
Expand Down
Loading