Skip to content

Commit 289bdcb

Browse files
Remove iterator workarounds for lack of operator+= (#6094)
1 parent 7ed0f91 commit 289bdcb

7 files changed

+13
-233
lines changed

cub/cub/device/dispatch/dispatch_advance_iterators.cuh

Lines changed: 0 additions & 111 deletions
This file was deleted.

cub/cub/device/dispatch/dispatch_radix_sort.cuh

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@
4444
# pragma system_header
4545
#endif // no system header
4646

47-
#include <cub/device/dispatch/dispatch_advance_iterators.cuh>
4847
#include <cub/device/dispatch/kernels/radix_sort.cuh>
4948
#include <cub/device/dispatch/tuning/tuning_radix_sort.cuh>
5049
#include <cub/util_debug.cuh>
@@ -1379,14 +1378,6 @@ struct DispatchSegmentedRadixSort
13791378
// Number of radix sort invocations until all segments have been processed
13801379
const auto num_invocations = ::cuda::ceil_div(num_segments, max_num_segments_per_invocation);
13811380

1382-
// If d_begin_offsets and d_end_offsets do not support operator+ then we can't have more than
1383-
// max_num_segments_per_invocation segments per invocation
1384-
if (num_invocations > 1
1385-
&& !detail::all_iterators_support_add_assign_operator(::cuda::std::int64_t{}, d_begin_offsets, d_end_offsets))
1386-
{
1387-
return cudaErrorInvalidValue;
1388-
}
1389-
13901381
BeginOffsetIteratorT begin_offsets_current_it = d_begin_offsets;
13911382
EndOffsetIteratorT end_offsets_current_it = d_end_offsets;
13921383

@@ -1435,8 +1426,8 @@ struct DispatchSegmentedRadixSort
14351426

14361427
if (invocation_index + 1 < num_invocations)
14371428
{
1438-
detail::advance_iterators_inplace_if_supported(begin_offsets_current_it, num_current_segments);
1439-
detail::advance_iterators_inplace_if_supported(end_offsets_current_it, num_current_segments);
1429+
begin_offsets_current_it += num_current_segments;
1430+
end_offsets_current_it += num_current_segments;
14401431
}
14411432

14421433
// Sync the stream if specified to flush runtime errors

cub/cub/device/dispatch/dispatch_reduce.cuh

Lines changed: 8 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,6 @@
4646

4747
#include <cub/detail/launcher/cuda_runtime.cuh>
4848
#include <cub/detail/type_traits.cuh> // for cub::detail::invoke_result_t
49-
#include <cub/device/dispatch/dispatch_advance_iterators.cuh>
5049
#include <cub/device/dispatch/kernels/reduce.cuh>
5150
#include <cub/device/dispatch/kernels/segmented_reduce.cuh>
5251
#include <cub/device/dispatch/tuning/tuning_reduce.cuh>
@@ -823,17 +822,6 @@ struct DispatchSegmentedReduce
823822
static_cast<::cuda::std::int64_t>(::cuda::std::numeric_limits<::cuda::std::int32_t>::max());
824823
const ::cuda::std::int64_t num_invocations = ::cuda::ceil_div(num_segments, num_segments_per_invocation);
825824

826-
// If we need multiple passes over the segments but the iterators do not support the + operator, we cannot use the
827-
// streaming approach and have to fail, returning cudaErrorInvalidValue. This is because c.parallel passes
828-
// indirect_arg_t as the iterator type, which does not support the + operator.
829-
// TODO (elstehle): Remove this check once https://github.com/NVIDIA/cccl/issues/4148 is resolved.
830-
if (num_invocations > 1
831-
&& !detail::all_iterators_support_add_assign_operator(
832-
::cuda::std::int64_t{}, d_out, d_begin_offsets, d_end_offsets))
833-
{
834-
return cudaErrorInvalidValue;
835-
}
836-
837825
for (::cuda::std::int64_t invocation_index = 0; invocation_index < num_invocations; invocation_index++)
838826
{
839827
const auto current_seg_offset = invocation_index * num_segments_per_invocation;
@@ -865,9 +853,9 @@ struct DispatchSegmentedReduce
865853

866854
if (invocation_index + 1 < num_invocations)
867855
{
868-
detail::advance_iterators_inplace_if_supported(d_out, num_current_segments);
869-
detail::advance_iterators_inplace_if_supported(d_begin_offsets, num_current_segments);
870-
detail::advance_iterators_inplace_if_supported(d_end_offsets, num_current_segments);
856+
d_out += num_current_segments;
857+
d_begin_offsets += num_current_segments;
858+
d_end_offsets += num_current_segments;
871859
}
872860

873861
// Sync the stream if specified to flush runtime errors
@@ -1182,15 +1170,6 @@ struct DispatchFixedSizeSegmentedReduce
11821170

11831171
const ::cuda::std::int64_t num_invocations = ::cuda::ceil_div(num_segments, num_segments_per_invocation);
11841172

1185-
// If we need multiple passes over the segments but the iterators do not support the + operator, we cannot use the
1186-
// streaming approach and have to fail, returning cudaErrorInvalidValue. This is because c.parallel passes
1187-
// indirect_arg_t as the iterator type, which does not support the + operator.
1188-
// TODO (srinivas/elstehle): Remove this check once https://github.com/NVIDIA/cccl/issues/4148 is resolved.
1189-
if (num_invocations > 1 && !detail::all_iterators_support_plus_operator(::cuda::std::int64_t{}, d_in, d_out))
1190-
{
1191-
return cudaErrorInvalidValue;
1192-
}
1193-
11941173
cudaError error = cudaSuccess;
11951174
for (::cuda::std::int64_t invocation_index = 0; invocation_index < num_invocations; invocation_index++)
11961175
{
@@ -1204,13 +1183,16 @@ struct DispatchFixedSizeSegmentedReduce
12041183
launcher_factory(
12051184
static_cast<::cuda::std::int32_t>(num_current_blocks), ActivePolicyT::ReducePolicy::BLOCK_THREADS, 0, stream)
12061185
.doit(fixed_size_segmented_reduce_kernel,
1207-
detail::advance_iterators_if_supported(d_in, current_seg_offset * segment_size),
1208-
detail::advance_iterators_if_supported(d_out, current_seg_offset),
1186+
d_in,
1187+
d_out,
12091188
segment_size,
12101189
static_cast<::cuda::std::int32_t>(num_current_segments),
12111190
reduction_op,
12121191
init);
12131192

1193+
d_in += num_segments_per_invocation * segment_size;
1194+
d_out += num_segments_per_invocation;
1195+
12141196
error = CubDebug(cudaPeekAtLastError());
12151197
if (cudaSuccess != error)
12161198
{

cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -328,11 +328,6 @@ struct DispatchReduceDeterministic
328328
// Alias the allocation for the privatized per-block reductions
329329
deterministic_accum_t* d_block_reductions = (deterministic_accum_t*) allocations[0];
330330

331-
if (num_chunks > 1 && !detail::all_iterators_support_add_assign_operator(::cuda::std::int32_t{}, d_in))
332-
{
333-
return cudaErrorInvalidValue;
334-
}
335-
336331
auto d_chunk_block_reductions = d_block_reductions;
337332
for (int chunk_index = 0; chunk_index < num_chunks; chunk_index++)
338333
{
@@ -372,7 +367,7 @@ struct DispatchReduceDeterministic
372367

373368
if (chunk_index + 1 < num_chunks)
374369
{
375-
detail::advance_iterators_inplace_if_supported(d_in, num_current_items);
370+
d_in += num_current_items;
376371
d_chunk_block_reductions += current_grid_size;
377372
}
378373

cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020

2121
#include <cub/detail/launcher/cuda_runtime.cuh>
2222
#include <cub/detail/type_traits.cuh> // for cub::detail::invoke_result_t
23-
#include <cub/device/dispatch/dispatch_advance_iterators.cuh>
2423
#include <cub/device/dispatch/kernels/reduce.cuh>
2524
#include <cub/device/dispatch/tuning/tuning_reduce.cuh>
2625
#include <cub/grid/grid_even_share.cuh>

cub/cub/device/dispatch/dispatch_segmented_sort.cuh

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@
4040
#include <cub/detail/device_double_buffer.cuh>
4141
#include <cub/detail/temporary_storage.cuh>
4242
#include <cub/device/device_partition.cuh>
43-
#include <cub/device/dispatch/dispatch_advance_iterators.cuh>
4443
#include <cub/device/dispatch/kernels/segmented_sort.cuh>
4544
#include <cub/device/dispatch/tuning/tuning_segmented_sort.cuh>
4645
#include <cub/util_debug.cuh>
@@ -764,8 +763,8 @@ private:
764763
BeginOffsetIteratorT current_begin_offset = d_begin_offsets;
765764
EndOffsetIteratorT current_end_offset = d_end_offsets;
766765

767-
detail::advance_iterators_inplace_if_supported(current_begin_offset, current_seg_offset);
768-
detail::advance_iterators_inplace_if_supported(current_end_offset, current_seg_offset);
766+
current_begin_offset += current_seg_offset;
767+
current_end_offset += current_seg_offset;
769768

770769
auto medium_indices_iterator =
771770
::cuda::std::make_reverse_iterator(large_and_medium_segments_indices.get() + current_num_segments);

cub/test/catch2_test_device_segmented_reduce_large_offsets.cu

Lines changed: 0 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -82,81 +82,6 @@ struct custom_sum_op
8282
}
8383
};
8484

85-
#if TEST_LAUNCH == 0
86-
87-
struct iterator_without_plus_operator
88-
{
89-
using value_type = cuda::std::int64_t;
90-
using difference_type = std::ptrdiff_t;
91-
using pointer = value_type*;
92-
using reference = value_type&;
93-
94-
// Although we provide operator[], we declare this as random-access for demonstration purposes only.
95-
// This iterator still does not implement operator++ or operator+.
96-
using iterator_category = std::random_access_iterator_tag;
97-
98-
// Dereference always returns 0.
99-
__host__ __device__ int operator*() const
100-
{
101-
return 0;
102-
}
103-
104-
// Indexing also always returns 0.
105-
__host__ __device__ int operator[](difference_type /*idx*/) const
106-
{
107-
return 0;
108-
}
109-
110-
// Intentionally no operator++ or operator+ to prevent advancing the iterator.
111-
};
112-
113-
C2H_TEST("Device reduce fails for large number of segments if the iterator cannot be advanced", "[reduce][device]")
114-
{
115-
using offset_t = cuda::std::int64_t;
116-
using segment_index_t = cuda::std::int64_t;
117-
118-
const auto num_segments =
119-
GENERATE_COPY(segment_index_t{4}, static_cast<segment_index_t>(cuda::std::numeric_limits<std::uint32_t>::max()));
120-
auto input_data_it = thrust::make_counting_iterator(offset_t{0});
121-
auto begin_offsets_it = iterator_without_plus_operator{};
122-
auto end_offsets_it = thrust::make_counting_iterator(offset_t{1});
123-
124-
cuda::std::uint8_t* d_temp_storage{};
125-
cuda::std::size_t temp_storage_bytes{};
126-
cudaError_t error = cub::DeviceSegmentedReduce::Min(
127-
d_temp_storage,
128-
temp_storage_bytes,
129-
input_data_it,
130-
thrust::make_discard_iterator(),
131-
num_segments,
132-
begin_offsets_it,
133-
end_offsets_it);
134-
135-
c2h::device_vector<cuda::std::uint8_t> temp_storage(temp_storage_bytes);
136-
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
137-
error = cub::DeviceSegmentedReduce::Min(
138-
d_temp_storage,
139-
temp_storage_bytes,
140-
input_data_it,
141-
thrust::make_discard_iterator(),
142-
num_segments,
143-
begin_offsets_it,
144-
end_offsets_it);
145-
146-
// For small number of segments, the operation should succeed (i.e., we just use a single invocation)
147-
if (num_segments == 4)
148-
{
149-
REQUIRE(error == cudaSuccess);
150-
}
151-
// For large number of segments, the operation should fail (i.e., we use multiple invocations and we cannot advance
152-
// the begin_offsets_it)
153-
else
154-
{
155-
REQUIRE(error == cudaErrorInvalidValue);
156-
}
157-
}
158-
#endif
159-
16085
C2H_TEST("Device reduce works with a very large number of segments", "[reduce][device]")
16186
{
16287
using offset_t = cuda::std::int64_t;

0 commit comments

Comments
 (0)