Commit 0824836

Merge pull request rapidsai#18837 from rapidsai/branch-25.06
Forward-merge branch-25.06 into branch-25.08
2 parents ef4496f + 66861ae

8 files changed: +644, -145 lines changed

cpp/benchmarks/ndsh/q09.cpp

Lines changed: 1 addition & 0 deletions
@@ -160,6 +160,7 @@ struct q9_data {
     udf,
     cudf::data_type{cudf::type_id::FLOAT64},
     false,
+    std::nullopt,
     stream,
     mr);
 }

cpp/include/cudf/jit/span.cuh

Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/types.hpp>
+
+namespace CUDF_EXPORT cudf {
+
+namespace jit {
+
+/**
+ * @brief C++20 std::span with reduced feature set.
+ *
+ */
+template <typename T>
+struct device_span {
+  using element_type = T;  ///< The type of the elements in the span
+
+ private:
+  element_type* _data = nullptr;
+  size_t _size        = 0;
+
+ public:
+  CUDF_HOST_DEVICE constexpr device_span() {}
+
+  /**
+   * @brief Constructs a span from a pointer and a size.
+   *
+   * @param data Pointer to the first element in the span.
+   * @param size The number of elements in the span.
+   */
+  CUDF_HOST_DEVICE constexpr device_span(element_type* data, size_t size) : _data{data}, _size{size}
+  {
+  }
+
+  /**
+   * @brief Returns a pointer to the beginning of the sequence.
+   *
+   * @return A pointer to the first element of the span
+   */
+  CUDF_HOST_DEVICE [[nodiscard]] constexpr element_type* data() const { return _data; }
+
+  /**
+   * @brief Returns the number of elements in the span.
+   *
+   * @return The number of elements in the span
+   */
+  CUDF_HOST_DEVICE [[nodiscard]] constexpr size_t size() const { return _size; }
+
+  /**
+   * @brief Checks if the span is empty.
+   *
+   * @return True if the span is empty, false otherwise
+   */
+  CUDF_HOST_DEVICE [[nodiscard]] constexpr bool empty() const { return _size == 0; }
+
+  /**
+   * @brief Returns a reference to the idx-th element of the sequence.
+   *
+   * The behavior is undefined if idx is out of range (i.e., if it is greater than or equal to
+   * size()).
+   *
+   * @param idx the index of the element to access
+   * @return A reference to the idx-th element of the sequence, i.e., `data()[idx]`
+   */
+  CUDF_HOST_DEVICE constexpr element_type& operator[](size_t idx) const { return _data[idx]; }
+
+  /**
+   * @brief Returns an iterator to the first element of the span.
+   *
+   * If the span is empty, the returned iterator will be equal to end().
+   *
+   * @return An iterator to the first element of the span
+   */
+  CUDF_HOST_DEVICE [[nodiscard]] constexpr element_type* begin() const { return _data; }
+
+  /**
+   * @brief Returns an iterator to the element following the last element of the span.
+   *
+   * This element acts as a placeholder; attempting to access it results in undefined behavior.
+   *
+   * @return An iterator to the element following the last element of the span
+   */
+  CUDF_HOST_DEVICE [[nodiscard]] constexpr element_type* end() const { return _data + _size; }
+};
+
+}  // namespace jit
+}  // namespace CUDF_EXPORT cudf
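
As a side note, a minimal usage sketch of the span above, assuming a caller that already owns the device memory; the function and variable names here are illustrative, not part of this commit:

  // Illustration only (not commit code): device_span is a non-owning view, so
  // it can be passed by value into device code and iterated like std::span.
  // The caller keeps ownership of the underlying buffer.
  __device__ float sum(cudf::jit::device_span<float const> span)
  {
    float total = 0.0f;
    for (auto const& value : span) {  // begin()/end() return raw pointers
      total += value;
    }
    return total;
  }

  // Construction from a raw device pointer and element count:
  //   cudf::jit::device_span<float const> span{values, n};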

cpp/include/cudf/transform.hpp

Lines changed: 11 additions & 4 deletions
@@ -24,6 +24,7 @@
 #include <memory>
 
 namespace CUDF_EXPORT cudf {
+
 /**
  * @addtogroup transformation_transform
  * @{
@@ -40,16 +41,21 @@ namespace CUDF_EXPORT cudf {
 *
 * Note that for every scalar in `inputs` (columns of size 1), `input[i] == input[0]`
 *
- * The output null mask is the same as the null mask of the input columns, so if input[i] is
- * null then output[i] is also null. The size of the resulting column is the size of the largest
- * column.
- * All input columns must have equivalent null masks.
 *
+ * @throws std::invalid_argument if any of the input columns have different sizes (except scalars of
+ * size 1)
+ * @throws std::invalid_argument if `output_type` or any of the inputs are not fixed-width or string
+ * types
+ * @throws std::invalid_argument if any of the input columns have nulls
+ * @throws std::logic_error if JIT is not supported by the runtime
+ *
+ * The size of the resulting column is the size of the largest column.
 *
 * @param inputs Immutable views of the input columns to transform
 * @param transform_udf The PTX/CUDA string of the transform function to apply
 * @param output_type The output type that is compatible with the output type in the UDF
 * @param is_ptx true: the UDF is treated as PTX code; false: the UDF is treated as CUDA code
+ * @param user_data User-defined device data to pass to the UDF.
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned column's device memory
 * @return The column resulting from applying the transform function to
@@ -60,6 +66,7 @@ std::unique_ptr<column> transform(
   std::string const& transform_udf,
   data_type output_type,
   bool is_ptx,
+  std::optional<void*> user_data    = std::nullopt,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
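
For context, a hedged call-site sketch of the new optional parameter (the UDF body, entry-point name, and device_factors pointer are assumptions for illustration, not code from this commit). When user_data is supplied, the JIT kernels below forward the opaque pointer and the row index ahead of the output argument, i.e. udf(user_data, row, &out, in...):

  // Sketch under assumptions: a device-accessible lookup table is threaded
  // through the new user_data parameter of cudf::transform.
  std::string const udf = R"***(
  __device__ void scale_by_row(void* user_data, cudf::size_type row, float* out, float in)
  {
    auto const factors = static_cast<float const*>(user_data);
    *out               = in * factors[row];
  }
  )***";

  auto result = cudf::transform({input_view},
                                udf,
                                cudf::data_type{cudf::type_id::FLOAT32},
                                /*is_ptx=*/false,
                                std::optional<void*>{device_factors});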

cpp/src/transform/jit/kernel.cu

Lines changed: 80 additions & 26 deletions
@@ -15,6 +15,7 @@
 */
 
 #include <cudf/column/column_device_view_base.cuh>
+#include <cudf/jit/span.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/types.hpp>
 #include <cudf/utilities/traits.hpp>
@@ -37,27 +38,45 @@ namespace transformation {
 namespace jit {
 
 template <typename T, int32_t Index>
-struct accessor {
+struct column_accessor {
   using type                     = T;
   static constexpr int32_t index = Index;
 
-  static __device__ decltype(auto) element(cudf::mutable_column_device_view_core const* views,
+  static __device__ decltype(auto) element(cudf::mutable_column_device_view_core const* outputs,
                                            cudf::size_type row)
   {
-    return views[index].element<T>(row);
+    return outputs[index].element<T>(row);
   }
 
-  static __device__ decltype(auto) element(cudf::column_device_view_core const* views,
+  static __device__ decltype(auto) element(cudf::column_device_view_core const* inputs,
                                            cudf::size_type row)
   {
-    return views[index].element<T>(row);
+    return inputs[index].element<T>(row);
   }
 
-  static __device__ void assign(cudf::mutable_column_device_view_core const* views,
+  static __device__ void assign(cudf::mutable_column_device_view_core const* outputs,
                                 cudf::size_type row,
                                 T value)
   {
-    views[index].assign<T>(row, value);
+    outputs[index].assign<T>(row, value);
+  }
+};
+
+template <typename T, int32_t Index>
+struct span_accessor {
+  using type                     = T;
+  static constexpr int32_t index = Index;
+
+  static __device__ type& element(cudf::jit::device_span<T> const* spans, cudf::size_type row)
+  {
+    return spans[index][row];
+  }
+
+  static __device__ void assign(cudf::jit::device_span<T> const* outputs,
+                                cudf::size_type row,
+                                T value)
+  {
+    outputs[index][row] = value;
   }
 };
 
@@ -66,59 +85,94 @@ struct scalar {
   using type                     = typename Accessor::type;
   static constexpr int32_t index = Accessor::index;
 
-  static __device__ decltype(auto) element(cudf::mutable_column_device_view_core const* views,
+  static __device__ decltype(auto) element(cudf::mutable_column_device_view_core const* outputs,
                                            cudf::size_type row)
   {
-    return Accessor::element(views, 0);
+    return Accessor::element(outputs, 0);
   }
 
-  static __device__ decltype(auto) element(cudf::column_device_view_core const* views,
+  static __device__ decltype(auto) element(cudf::column_device_view_core const* inputs,
                                            cudf::size_type row)
   {
-    return Accessor::element(views, 0);
+    return Accessor::element(inputs, 0);
   }
 
-  static __device__ void assign(cudf::mutable_column_device_view_core const* views,
+  static __device__ void assign(cudf::mutable_column_device_view_core const* outputs,
                                 cudf::size_type row,
                                 type value)
   {
-    return Accessor::assign(views, 0, value);
+    return Accessor::assign(outputs, 0, value);
   }
 };
 
-template <typename Out, typename... In>
-CUDF_KERNEL void kernel(cudf::mutable_column_device_view_core const* output,
-                        cudf::column_device_view_core const* inputs)
+template <bool has_user_data, typename Out, typename... In>
+CUDF_KERNEL void kernel(cudf::mutable_column_device_view_core const* outputs,
+                        cudf::column_device_view_core const* inputs,
+                        void* user_data)
 {
+  // inputs to JITIFY kernels have to be either sized-integral types or pointers. Structs or
+  // references can't be passed directly/correctly as they will be crossing an ABI boundary
+
   // cannot use global_thread_id utility due to a JIT build issue by including
   // the `cudf/detail/utilities/cuda.cuh` header
   auto const block_size          = static_cast<thread_index_type>(blockDim.x);
   thread_index_type const start  = threadIdx.x + blockIdx.x * block_size;
   thread_index_type const stride = block_size * gridDim.x;
-  thread_index_type const size   = output->size();
+  thread_index_type const size   = outputs[0].size();
 
   for (auto i = start; i < size; i += stride) {
-    GENERIC_TRANSFORM_OP(&Out::element(output, i), In::element(inputs, i)...);
+    if constexpr (has_user_data) {
+      GENERIC_TRANSFORM_OP(user_data, i, &Out::element(outputs, i), In::element(inputs, i)...);
+    } else {
+      GENERIC_TRANSFORM_OP(&Out::element(outputs, i), In::element(inputs, i)...);
+    }
   }
 }
 
-template <typename Out, typename... In>
-CUDF_KERNEL void fixed_point_kernel(cudf::mutable_column_device_view_core const* output,
-                                    cudf::column_device_view_core const* inputs)
+template <bool has_user_data, typename Out, typename... In>
+CUDF_KERNEL void fixed_point_kernel(cudf::mutable_column_device_view_core const* outputs,
+                                    cudf::column_device_view_core const* inputs,
+                                    void* user_data)
 {
   // cannot use global_thread_id utility due to a JIT build issue by including
   // the `cudf/detail/utilities/cuda.cuh` header
   auto const block_size          = static_cast<thread_index_type>(blockDim.x);
   thread_index_type const start  = threadIdx.x + blockIdx.x * block_size;
   thread_index_type const stride = block_size * gridDim.x;
-  thread_index_type const size   = output->size();
-
-  numeric::scale_type const output_scale = static_cast<numeric::scale_type>(output->type().scale());
+  thread_index_type const size   = outputs[0].size();
+  auto const output_scale        = static_cast<numeric::scale_type>(outputs[0].type().scale());
 
   for (auto i = start; i < size; i += stride) {
     typename Out::type result{numeric::scaled_integer<typename Out::type::rep>{0, output_scale}};
-    GENERIC_TRANSFORM_OP(&result, In::element(inputs, i)...);
-    Out::assign(output, i, result);
+
+    if constexpr (has_user_data) {
+      GENERIC_TRANSFORM_OP(user_data, i, &result, In::element(inputs, i)...);
+    } else {
+      GENERIC_TRANSFORM_OP(&result, In::element(inputs, i)...);
+    }
+
+    Out::assign(outputs, i, result);
+  }
+}
+
+template <bool has_user_data, typename Out, typename... In>
+CUDF_KERNEL void span_kernel(cudf::jit::device_span<typename Out::type> const* outputs,
+                             cudf::column_device_view_core const* inputs,
+                             void* user_data)
+{
+  // cannot use global_thread_id utility due to a JIT build issue by including
+  // the `cudf/detail/utilities/cuda.cuh` header
+  auto const block_size          = static_cast<thread_index_type>(blockDim.x);
+  thread_index_type const start  = threadIdx.x + blockIdx.x * block_size;
+  thread_index_type const stride = block_size * gridDim.x;
+  thread_index_type const size   = outputs[0].size();
+
+  for (auto i = start; i < size; i += stride) {
+    if constexpr (has_user_data) {
+      GENERIC_TRANSFORM_OP(user_data, i, &Out::element(outputs, i), In::element(inputs, i)...);
+    } else {
+      GENERIC_TRANSFORM_OP(&Out::element(outputs, i), In::element(inputs, i)...);
+    }
   }
 }
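
Finally, a hedged host-side sketch of how a device_span array for span_kernel might be staged; the buffer names and launch plumbing are assumptions, not part of this commit. The span being a small trivially copyable struct is what lets it cross the JITIFY ABI boundary, per the pointer-or-sized-integral rule noted in kernel() above:

  // Assumption-laden sketch (not commit code): spans are built on the host and
  // copied to device memory as plain structs; dev_spans.data() would then be
  // passed as the kernel's `outputs` argument.
  std::vector<cudf::jit::device_span<float>> host_spans;
  host_spans.emplace_back(output_data, static_cast<size_t>(num_rows));

  rmm::device_uvector<cudf::jit::device_span<float>> dev_spans(host_spans.size(), stream);
  CUDF_CUDA_TRY(cudaMemcpyAsync(dev_spans.data(),
                                host_spans.data(),
                                host_spans.size() * sizeof(cudf::jit::device_span<float>),
                                cudaMemcpyHostToDevice,
                                stream.value()));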
