Remove iterator workarounds for lack of operator+= (#6094)

bernhardmgruber · web-flow · commit 289bdcb1835b · 2025-10-02T16:35:53.000+02:00
diff --git a/cub/cub/device/dispatch/dispatch_advance_iterators.cuh b/cub/cub/device/dispatch/dispatch_advance_iterators.cuh
diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -44,7 +44,6 @@
 #  pragma system_header
 #endif // no system header
 
-#include <cub/device/dispatch/dispatch_advance_iterators.cuh>
 #include <cub/device/dispatch/kernels/radix_sort.cuh>
 #include <cub/device/dispatch/tuning/tuning_radix_sort.cuh>
 #include <cub/util_debug.cuh>
@@ -1379,14 +1378,6 @@ struct DispatchSegmentedRadixSort
     // Number of radix sort invocations until all segments have been processed
     const auto num_invocations = ::cuda::ceil_div(num_segments, max_num_segments_per_invocation);
 
-    // If d_begin_offsets and d_end_offsets do not support operator+ then we can't have more than
-    // max_num_segments_per_invocation segments per invocation
-    if (num_invocations > 1
-        && !detail::all_iterators_support_add_assign_operator(::cuda::std::int64_t{}, d_begin_offsets, d_end_offsets))
-    {
-      return cudaErrorInvalidValue;
-    }
-
     BeginOffsetIteratorT begin_offsets_current_it = d_begin_offsets;
     EndOffsetIteratorT end_offsets_current_it     = d_end_offsets;
 
@@ -1435,8 +1426,8 @@ struct DispatchSegmentedRadixSort
 
       if (invocation_index + 1 < num_invocations)
       {
-        detail::advance_iterators_inplace_if_supported(begin_offsets_current_it, num_current_segments);
-        detail::advance_iterators_inplace_if_supported(end_offsets_current_it, num_current_segments);
+        begin_offsets_current_it += num_current_segments;
+        end_offsets_current_it += num_current_segments;
       }
 
       // Sync the stream if specified to flush runtime errors
diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh
@@ -46,7 +46,6 @@
 
 #include <cub/detail/launcher/cuda_runtime.cuh>
 #include <cub/detail/type_traits.cuh> // for cub::detail::invoke_result_t
-#include <cub/device/dispatch/dispatch_advance_iterators.cuh>
 #include <cub/device/dispatch/kernels/reduce.cuh>
 #include <cub/device/dispatch/kernels/segmented_reduce.cuh>
 #include <cub/device/dispatch/tuning/tuning_reduce.cuh>
@@ -823,17 +822,6 @@ struct DispatchSegmentedReduce
         static_cast<::cuda::std::int64_t>(::cuda::std::numeric_limits<::cuda::std::int32_t>::max());
       const ::cuda::std::int64_t num_invocations = ::cuda::ceil_div(num_segments, num_segments_per_invocation);
 
-      // If we need multiple passes over the segments but the iterators do not support the + operator, we cannot use the
-      // streaming approach and have to fail, returning cudaErrorInvalidValue. This is because c.parallel passes
-      // indirect_arg_t as the iterator type, which does not support the + operator.
-      // TODO (elstehle): Remove this check once https://github.com/NVIDIA/cccl/issues/4148 is resolved.
-      if (num_invocations > 1
-          && !detail::all_iterators_support_add_assign_operator(
-            ::cuda::std::int64_t{}, d_out, d_begin_offsets, d_end_offsets))
-      {
-        return cudaErrorInvalidValue;
-      }
-
       for (::cuda::std::int64_t invocation_index = 0; invocation_index < num_invocations; invocation_index++)
       {
         const auto current_seg_offset = invocation_index * num_segments_per_invocation;
@@ -865,9 +853,9 @@ struct DispatchSegmentedReduce
 
         if (invocation_index + 1 < num_invocations)
         {
-          detail::advance_iterators_inplace_if_supported(d_out, num_current_segments);
-          detail::advance_iterators_inplace_if_supported(d_begin_offsets, num_current_segments);
-          detail::advance_iterators_inplace_if_supported(d_end_offsets, num_current_segments);
+          d_out += num_current_segments;
+          d_begin_offsets += num_current_segments;
+          d_end_offsets += num_current_segments;
         }
 
         // Sync the stream if specified to flush runtime errors
@@ -1182,15 +1170,6 @@ struct DispatchFixedSizeSegmentedReduce
 
     const ::cuda::std::int64_t num_invocations = ::cuda::ceil_div(num_segments, num_segments_per_invocation);
 
-    // If we need multiple passes over the segments but the iterators do not support the + operator, we cannot use the
-    // streaming approach and have to fail, returning cudaErrorInvalidValue. This is because c.parallel passes
-    // indirect_arg_t as the iterator type, which does not support the + operator.
-    // TODO (srinivas/elstehle): Remove this check once https://github.com/NVIDIA/cccl/issues/4148 is resolved.
-    if (num_invocations > 1 && !detail::all_iterators_support_plus_operator(::cuda::std::int64_t{}, d_in, d_out))
-    {
-      return cudaErrorInvalidValue;
-    }
-
     cudaError error = cudaSuccess;
     for (::cuda::std::int64_t invocation_index = 0; invocation_index < num_invocations; invocation_index++)
     {
@@ -1204,13 +1183,16 @@ struct DispatchFixedSizeSegmentedReduce
       launcher_factory(
         static_cast<::cuda::std::int32_t>(num_current_blocks), ActivePolicyT::ReducePolicy::BLOCK_THREADS, 0, stream)
         .doit(fixed_size_segmented_reduce_kernel,
-              detail::advance_iterators_if_supported(d_in, current_seg_offset * segment_size),
-              detail::advance_iterators_if_supported(d_out, current_seg_offset),
+              d_in,
+              d_out,
               segment_size,
               static_cast<::cuda::std::int32_t>(num_current_segments),
               reduction_op,
               init);
 
+      d_in += num_segments_per_invocation * segment_size;
+      d_out += num_segments_per_invocation;
+
       error = CubDebug(cudaPeekAtLastError());
       if (cudaSuccess != error)
       {
diff --git a/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh b/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh
@@ -328,11 +328,6 @@ struct DispatchReduceDeterministic
     // Alias the allocation for the privatized per-block reductions
     deterministic_accum_t* d_block_reductions = (deterministic_accum_t*) allocations[0];
 
-    if (num_chunks > 1 && !detail::all_iterators_support_add_assign_operator(::cuda::std::int32_t{}, d_in))
-    {
-      return cudaErrorInvalidValue;
-    }
-
     auto d_chunk_block_reductions = d_block_reductions;
     for (int chunk_index = 0; chunk_index < num_chunks; chunk_index++)
     {
@@ -372,7 +367,7 @@ struct DispatchReduceDeterministic
 
       if (chunk_index + 1 < num_chunks)
       {
-        detail::advance_iterators_inplace_if_supported(d_in, num_current_items);
+        d_in += num_current_items;
         d_chunk_block_reductions += current_grid_size;
       }
 
diff --git a/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh b/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh
@@ -20,7 +20,6 @@
 
 #include <cub/detail/launcher/cuda_runtime.cuh>
 #include <cub/detail/type_traits.cuh> // for cub::detail::invoke_result_t
-#include <cub/device/dispatch/dispatch_advance_iterators.cuh>
 #include <cub/device/dispatch/kernels/reduce.cuh>
 #include <cub/device/dispatch/tuning/tuning_reduce.cuh>
 #include <cub/grid/grid_even_share.cuh>
diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
@@ -40,7 +40,6 @@
 #include <cub/detail/device_double_buffer.cuh>
 #include <cub/detail/temporary_storage.cuh>
 #include <cub/device/device_partition.cuh>
-#include <cub/device/dispatch/dispatch_advance_iterators.cuh>
 #include <cub/device/dispatch/kernels/segmented_sort.cuh>
 #include <cub/device/dispatch/tuning/tuning_segmented_sort.cuh>
 #include <cub/util_debug.cuh>
@@ -764,8 +763,8 @@ private:
       BeginOffsetIteratorT current_begin_offset = d_begin_offsets;
       EndOffsetIteratorT current_end_offset     = d_end_offsets;
 
-      detail::advance_iterators_inplace_if_supported(current_begin_offset, current_seg_offset);
-      detail::advance_iterators_inplace_if_supported(current_end_offset, current_seg_offset);
+      current_begin_offset += current_seg_offset;
+      current_end_offset += current_seg_offset;
 
       auto medium_indices_iterator =
         ::cuda::std::make_reverse_iterator(large_and_medium_segments_indices.get() + current_num_segments);
diff --git a/cub/test/catch2_test_device_segmented_reduce_large_offsets.cu b/cub/test/catch2_test_device_segmented_reduce_large_offsets.cu
@@ -82,81 +82,6 @@ struct custom_sum_op
   }
 };
 
-#if TEST_LAUNCH == 0
-
-struct iterator_without_plus_operator
-{
-  using value_type      = cuda::std::int64_t;
-  using difference_type = std::ptrdiff_t;
-  using pointer         = value_type*;
-  using reference       = value_type&;
-
-  // Although we provide operator[], we declare this as random-access for demonstration purposes only.
-  // This iterator still does not implement operator++ or operator+.
-  using iterator_category = std::random_access_iterator_tag;
-
-  // Dereference always returns 0.
-  __host__ __device__ int operator*() const
-  {
-    return 0;
-  }
-
-  // Indexing also always returns 0.
-  __host__ __device__ int operator[](difference_type /*idx*/) const
-  {
-    return 0;
-  }
-
-  // Intentionally no operator++ or operator+ to prevent advancing the iterator.
-};
-
-C2H_TEST("Device reduce fails for large number of segments if the iterator cannot be advanced", "[reduce][device]")
-{
-  using offset_t        = cuda::std::int64_t;
-  using segment_index_t = cuda::std::int64_t;
-
-  const auto num_segments =
-    GENERATE_COPY(segment_index_t{4}, static_cast<segment_index_t>(cuda::std::numeric_limits<std::uint32_t>::max()));
-  auto input_data_it    = thrust::make_counting_iterator(offset_t{0});
-  auto begin_offsets_it = iterator_without_plus_operator{};
-  auto end_offsets_it   = thrust::make_counting_iterator(offset_t{1});
-
-  cuda::std::uint8_t* d_temp_storage{};
-  cuda::std::size_t temp_storage_bytes{};
-  cudaError_t error = cub::DeviceSegmentedReduce::Min(
-    d_temp_storage,
-    temp_storage_bytes,
-    input_data_it,
-    thrust::make_discard_iterator(),
-    num_segments,
-    begin_offsets_it,
-    end_offsets_it);
-
-  c2h::device_vector<cuda::std::uint8_t> temp_storage(temp_storage_bytes);
-  d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
-  error          = cub::DeviceSegmentedReduce::Min(
-    d_temp_storage,
-    temp_storage_bytes,
-    input_data_it,
-    thrust::make_discard_iterator(),
-    num_segments,
-    begin_offsets_it,
-    end_offsets_it);
-
-  // For small number of segments, the operation should succeed (i.e., we just use a single invocation)
-  if (num_segments == 4)
-  {
-    REQUIRE(error == cudaSuccess);
-  }
-  // For large number of segments, the operation should fail (i.e., we use multiple invocations and we cannot advance
-  // the begin_offsets_it)
-  else
-  {
-    REQUIRE(error == cudaErrorInvalidValue);
-  }
-}
-#endif
-
 C2H_TEST("Device reduce works with a very large number of segments", "[reduce][device]")
 {
   using offset_t        = cuda::std::int64_t;