Commit 0824836

Merge pull request rapidsai#18837 from rapidsai/branch-25.06
Forward-merge branch-25.06 into branch-25.08
2 parents ef4496f + 66861ae

8 files changed: +644, -145 lines changed

cpp/benchmarks/ndsh/q09.cpp

Lines changed: 1 addition & 0 deletions
@@ -160,6 +160,7 @@ struct q9_data {
     udf,
     cudf::data_type{cudf::type_id::FLOAT64},
     false,
+    std::nullopt,
     stream,
     mr);
 }

cpp/include/cudf/jit/span.cuh

Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/types.hpp>
+
+namespace CUDF_EXPORT cudf {
+
+namespace jit {
+
+/**
+ * @brief C++20 std::span with reduced feature set.
+ *
+ */
+template <typename T>
+struct device_span {
+  using element_type = T;  ///< The type of the elements in the span
+
+ private:
+  element_type* _data = nullptr;
+  size_t _size        = 0;
+
+ public:
+  CUDF_HOST_DEVICE constexpr device_span() {}
+
+  /**
+   * @brief Constructs a span from a pointer and a size.
+   *
+   * @param data Pointer to the first element in the span.
+   * @param size The number of elements in the span.
+   */
+  CUDF_HOST_DEVICE constexpr device_span(element_type* data, size_t size) : _data{data}, _size{size}
+  {
+  }
+
+  /**
+   * @brief Returns a pointer to the beginning of the sequence.
+   *
+   * @return A pointer to the first element of the span
+   */
+  CUDF_HOST_DEVICE [[nodiscard]] constexpr element_type* data() const { return _data; }
+
+  /**
+   * @brief Returns the number of elements in the span.
+   *
+   * @return The number of elements in the span
+   */
+  CUDF_HOST_DEVICE [[nodiscard]] constexpr size_t size() const { return _size; }
+
+  /**
+   * @brief Checks if the span is empty.
+   *
+   * @return True if the span is empty, false otherwise
+   */
+  CUDF_HOST_DEVICE [[nodiscard]] constexpr bool empty() const { return _size == 0; }
+
+  /**
+   * @brief Returns a reference to the idx-th element of the sequence.
+   *
+   * The behavior is undefined if idx is out of range (i.e., if it is greater than or equal to
+   * size()).
+   *
+   * @param idx the index of the element to access
+   * @return A reference to the idx-th element of the sequence, i.e., `data()[idx]`
+   */
+  CUDF_HOST_DEVICE constexpr element_type& operator[](size_t idx) const { return _data[idx]; }
+
+  /**
+   * @brief Returns an iterator to the first element of the span.
+   *
+   * If the span is empty, the returned iterator will be equal to end().
+   *
+   * @return An iterator to the first element of the span
+   */
+  CUDF_HOST_DEVICE [[nodiscard]] constexpr element_type* begin() const { return _data; }
+
+  /**
+   * @brief Returns an iterator to the element following the last element of the span.
+   *
+   * This element acts as a placeholder; attempting to access it results in undefined behavior.
+   *
+   * @return An iterator to the element following the last element of the span
+   */
+  CUDF_HOST_DEVICE [[nodiscard]] constexpr element_type* end() const { return _data + _size; }
+};
+
+}  // namespace jit
+}  // namespace CUDF_EXPORT cudf
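
As a side note, a minimal usage sketch of the span above, assuming a caller that already owns the device memory; the function and variable names here are illustrative, not part of this commit:

  // Illustration only (not commit code): device_span is a non-owning view, so
  // it can be passed by value into device code and iterated like std::span.
  // The caller keeps ownership of the underlying buffer.
  __device__ float sum(cudf::jit::device_span<float const> span)
  {
    float total = 0.0f;
    for (auto const& value : span) {  // begin()/end() return raw pointers
      total += value;
    }
    return total;
  }

  // Construction from a raw device pointer and element count:
  //   cudf::jit::device_span<float const> span{values, n};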

cpp/include/cudf/transform.hpp

Lines changed: 11 additions & 4 deletions
@@ -24,6 +24,7 @@
 #include <memory>
 
 namespace CUDF_EXPORT cudf {
+
 /**
  * @addtogroup transformation_transform
  * @{
@@ -40,16 +41,21 @@ namespace CUDF_EXPORT cudf {
 *
 * Note that for every scalar in `inputs` (columns of size 1), `input[i] == input[0]`
 *
- * The output null mask is the same as the null mask of the input columns, so if input[i] is
- * null then output[i] is also null. The size of the resulting column is the size of the largest
- * column.
- * All input columns must have equivalent null masks.
 *
+ * @throws std::invalid_argument if any of the input columns have different sizes (except scalars of
+ * size 1)
+ * @throws std::invalid_argument if `output_type` or any of the inputs are not fixed-width or string
+ * types
+ * @throws std::invalid_argument if any of the input columns have nulls
+ * @throws std::logic_error if JIT is not supported by the runtime
+ *
+ * The size of the resulting column is the size of the largest column.
 *
 * @param inputs Immutable views of the input columns to transform
 * @param transform_udf The PTX/CUDA string of the transform function to apply
 * @param output_type The output type that is compatible with the output type in the UDF
 * @param is_ptx true: the UDF is treated as PTX code; false: the UDF is treated as CUDA code
+ * @param user_data User-defined device data to pass to the UDF.
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned column's device memory
 * @return The column resulting from applying the transform function to
@@ -60,6 +66,7 @@ std::unique_ptr<column> transform(
   std::string const& transform_udf,
   data_type output_type,
   bool is_ptx,
+  std::optional<void*> user_data    = std::nullopt,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
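
For context, a hedged call-site sketch of the new optional parameter (the UDF body, entry-point name, and device_factors pointer are assumptions for illustration, not code from this commit). When user_data is supplied, the JIT kernels below forward the opaque pointer and the row index ahead of the output argument, i.e. udf(user_data, row, &out, in...):

  // Sketch under assumptions: a device-accessible lookup table is threaded
  // through the new user_data parameter of cudf::transform.
  std::string const udf = R"***(
  __device__ void scale_by_row(void* user_data, cudf::size_type row, float* out, float in)
  {
    auto const factors = static_cast<float const*>(user_data);
    *out               = in * factors[row];
  }
  )***";

  auto result = cudf::transform({input_view},
                                udf,
                                cudf::data_type{cudf::type_id::FLOAT32},
                                /*is_ptx=*/false,
                                std::optional<void*>{device_factors});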

cpp/src/transform/jit/kernel.cu

Lines changed: 80 additions & 26 deletions
@@ -15,6 +15,7 @@
 */
 
 #include <cudf/column/column_device_view_base.cuh>
+#include <cudf/jit/span.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/types.hpp>
 #include <cudf/utilities/traits.hpp>
@@ -37,27 +38,45 @@ namespace transformation {
 namespace jit {
 
 template <typename T, int32_t Index>
-struct accessor {
+struct column_accessor {
   using type                     = T;
   static constexpr int32_t index = Index;
 
-  static __device__ decltype(auto) element(cudf::mutable_column_device_view_core const* views,
+  static __device__ decltype(auto) element(cudf::mutable_column_device_view_core const* outputs,
                                            cudf::size_type row)
   {
-    return views[index].element<T>(row);
+    return outputs[index].element<T>(row);
   }
 
-  static __device__ decltype(auto) element(cudf::column_device_view_core const* views,
+  static __device__ decltype(auto) element(cudf::column_device_view_core const* inputs,
                                            cudf::size_type row)
   {
-    return views[index].element<T>(row);
+    return inputs[index].element<T>(row);
   }
 
-  static __device__ void assign(cudf::mutable_column_device_view_core const* views,
+  static __device__ void assign(cudf::mutable_column_device_view_core const* outputs,
                                 cudf::size_type row,
                                 T value)
   {
-    views[index].assign<T>(row, value);
+    outputs[index].assign<T>(row, value);
+  }
+};
+
+template <typename T, int32_t Index>
+struct span_accessor {
+  using type                     = T;
+  static constexpr int32_t index = Index;
+
+  static __device__ type& element(cudf::jit::device_span<T> const* spans, cudf::size_type row)
+  {
+    return spans[index][row];
+  }
+
+  static __device__ void assign(cudf::jit::device_span<T> const* outputs,
+                                cudf::size_type row,
+                                T value)
+  {
+    outputs[index][row] = value;
   }
 };
 
@@ -66,59 +85,94 @@ struct scalar {
   using type                     = typename Accessor::type;
   static constexpr int32_t index = Accessor::index;
 
-  static __device__ decltype(auto) element(cudf::mutable_column_device_view_core const* views,
+  static __device__ decltype(auto) element(cudf::mutable_column_device_view_core const* outputs,
                                            cudf::size_type row)
   {
-    return Accessor::element(views, 0);
+    return Accessor::element(outputs, 0);
   }
 
-  static __device__ decltype(auto) element(cudf::column_device_view_core const* views,
+  static __device__ decltype(auto) element(cudf::column_device_view_core const* inputs,
                                            cudf::size_type row)
   {
-    return Accessor::element(views, 0);
+    return Accessor::element(inputs, 0);
   }
 
-  static __device__ void assign(cudf::mutable_column_device_view_core const* views,
+  static __device__ void assign(cudf::mutable_column_device_view_core const* outputs,
                                 cudf::size_type row,
                                 type value)
   {
-    return Accessor::assign(views, 0, value);
+    return Accessor::assign(outputs, 0, value);
   }
 };
 
-template <typename Out, typename... In>
-CUDF_KERNEL void kernel(cudf::mutable_column_device_view_core const* output,
-                        cudf::column_device_view_core const* inputs)
+template <bool has_user_data, typename Out, typename... In>
+CUDF_KERNEL void kernel(cudf::mutable_column_device_view_core const* outputs,
+                        cudf::column_device_view_core const* inputs,
+                        void* user_data)
 {
+  // inputs to JITIFY kernels have to be either sized-integral types or pointers. Structs or
+  // references can't be passed directly/correctly as they will be crossing an ABI boundary
+
   // cannot use global_thread_id utility due to a JIT build issue by including
   // the `cudf/detail/utilities/cuda.cuh` header
   auto const block_size          = static_cast<thread_index_type>(blockDim.x);
   thread_index_type const start  = threadIdx.x + blockIdx.x * block_size;
   thread_index_type const stride = block_size * gridDim.x;
-  thread_index_type const size   = output->size();
+  thread_index_type const size   = outputs[0].size();
 
   for (auto i = start; i < size; i += stride) {
-    GENERIC_TRANSFORM_OP(&Out::element(output, i), In::element(inputs, i)...);
+    if constexpr (has_user_data) {
+      GENERIC_TRANSFORM_OP(user_data, i, &Out::element(outputs, i), In::element(inputs, i)...);
+    } else {
+      GENERIC_TRANSFORM_OP(&Out::element(outputs, i), In::element(inputs, i)...);
+    }
   }
 }
 
-template <typename Out, typename... In>
-CUDF_KERNEL void fixed_point_kernel(cudf::mutable_column_device_view_core const* output,
-                                    cudf::column_device_view_core const* inputs)
+template <bool has_user_data, typename Out, typename... In>
+CUDF_KERNEL void fixed_point_kernel(cudf::mutable_column_device_view_core const* outputs,
+                                    cudf::column_device_view_core const* inputs,
+                                    void* user_data)
 {
   // cannot use global_thread_id utility due to a JIT build issue by including
   // the `cudf/detail/utilities/cuda.cuh` header
   auto const block_size          = static_cast<thread_index_type>(blockDim.x);
   thread_index_type const start  = threadIdx.x + blockIdx.x * block_size;
   thread_index_type const stride = block_size * gridDim.x;
-  thread_index_type const size   = output->size();
-
-  numeric::scale_type const output_scale = static_cast<numeric::scale_type>(output->type().scale());
+  thread_index_type const size   = outputs[0].size();
+  auto const output_scale        = static_cast<numeric::scale_type>(outputs[0].type().scale());
 
   for (auto i = start; i < size; i += stride) {
     typename Out::type result{numeric::scaled_integer<typename Out::type::rep>{0, output_scale}};
-    GENERIC_TRANSFORM_OP(&result, In::element(inputs, i)...);
-    Out::assign(output, i, result);
+
+    if constexpr (has_user_data) {
+      GENERIC_TRANSFORM_OP(user_data, i, &result, In::element(inputs, i)...);
+    } else {
+      GENERIC_TRANSFORM_OP(&result, In::element(inputs, i)...);
+    }
+
+    Out::assign(outputs, i, result);
+  }
+}
+
+template <bool has_user_data, typename Out, typename... In>
+CUDF_KERNEL void span_kernel(cudf::jit::device_span<typename Out::type> const* outputs,
+                             cudf::column_device_view_core const* inputs,
+                             void* user_data)
+{
+  // cannot use global_thread_id utility due to a JIT build issue by including
+  // the `cudf/detail/utilities/cuda.cuh` header
+  auto const block_size          = static_cast<thread_index_type>(blockDim.x);
+  thread_index_type const start  = threadIdx.x + blockIdx.x * block_size;
+  thread_index_type const stride = block_size * gridDim.x;
+  thread_index_type const size   = outputs[0].size();
+
+  for (auto i = start; i < size; i += stride) {
+    if constexpr (has_user_data) {
+      GENERIC_TRANSFORM_OP(user_data, i, &Out::element(outputs, i), In::element(inputs, i)...);
+    } else {
+      GENERIC_TRANSFORM_OP(&Out::element(outputs, i), In::element(inputs, i)...);
+    }
   }
 }
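
Finally, a hedged host-side sketch of how a device_span array for span_kernel might be staged; the buffer names and launch plumbing are assumptions, not part of this commit. The span being a small trivially copyable struct is what lets it cross the JITIFY ABI boundary, per the pointer-or-sized-integral rule noted in kernel() above:

  // Assumption-laden sketch (not commit code): spans are built on the host and
  // copied to device memory as plain structs; dev_spans.data() would then be
  // passed as the kernel's `outputs` argument.
  std::vector<cudf::jit::device_span<float>> host_spans;
  host_spans.emplace_back(output_data, static_cast<size_t>(num_rows));

  rmm::device_uvector<cudf::jit::device_span<float>> dev_spans(host_spans.size(), stream);
  CUDF_CUDA_TRY(cudaMemcpyAsync(dev_spans.data(),
                                host_spans.data(),
                                host_spans.size() * sizeof(cudf::jit::device_span<float>),
                                cudaMemcpyHostToDevice,
                                stream.value()));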
