Skip to content
86 changes: 77 additions & 9 deletions cpp/benchmarks/text/edit_distance.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
* Copyright (c) 2023-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -16,6 +16,8 @@

#include <benchmarks/common/generate_input.hpp>

#include <cudf/aggregation.hpp>
#include <cudf/reduction.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <nvtext/edit_distance.hpp>
Expand All @@ -24,7 +26,7 @@

#include <nvbench/nvbench.cuh>

static void bench_edit_distance(nvbench::state& state)
static void bench_edit_distance_utf8(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
Expand All @@ -34,23 +36,89 @@ static void bench_edit_distance(nvbench::state& state)
cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
auto const strings_table = create_random_table(
{cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile);
cudf::strings_column_view input1(strings_table->view().column(0));
cudf::strings_column_view input2(strings_table->view().column(1));
auto input1 = strings_table->get_column(0);
auto input2 = strings_table->get_column(0);
auto sv1 = cudf::strings_column_view(input1.view());
auto sv2 = cudf::strings_column_view(input2.view());

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));

state.add_global_memory_reads<nvbench::int8_t>(input1.alloc_size() + input2.alloc_size());
// output are integers (one per row)
state.add_global_memory_writes<nvbench::int32_t>(num_rows);

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = nvtext::edit_distance(sv1, sv2); });
}

static void bench_edit_distance_ascii(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));

auto const max_size = static_cast<int64_t>(num_rows) * static_cast<int64_t>(max_width);
auto const offsets_type = max_size >= std::numeric_limits<cudf::size_type>::max()
? cudf::type_id::INT64
: cudf::type_id::INT32;

data_profile profile = data_profile_builder().no_validity().cardinality(0).distribution(
offsets_type, distribution_id::NORMAL, min_width, max_width);
data_profile ascii_profile = data_profile_builder().no_validity().cardinality(0).distribution(
cudf::type_id::INT8, distribution_id::UNIFORM, 32, 126); // nice ASCII range

auto offsets = create_random_column(offsets_type, row_count{num_rows + 1}, profile);
offsets = cudf::scan(offsets->view(),
*cudf::make_sum_aggregation<cudf::scan_aggregation>(),
cudf::scan_type::EXCLUSIVE);
auto ascii_data1 =
create_random_column(cudf::type_id::INT8, row_count{num_rows * max_width}, ascii_profile);
auto ascii_data2 =
create_random_column(cudf::type_id::INT8, row_count{num_rows * max_width}, ascii_profile);

auto input1 = cudf::column_view(cudf::data_type{cudf::type_id::STRING},
num_rows,
ascii_data1->view().data<char>(),
nullptr,
0,
0,
{offsets->view()});
auto input2 = cudf::column_view(cudf::data_type{cudf::type_id::STRING},
num_rows,
ascii_data2->view().data<char>(),
nullptr,
0,
0,
{offsets->view()});

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));

auto sv1 = cudf::strings_column_view(input1);
auto sv2 = cudf::strings_column_view(input2);
auto chars_size =
input1.chars_size(cudf::get_default_stream()) + input2.chars_size(cudf::get_default_stream());
state.add_global_memory_reads<nvbench::int8_t>(chars_size);
sv1.chars_size(cudf::get_default_stream()) + sv2.chars_size(cudf::get_default_stream());
auto offsets_size = offsets->alloc_size();
state.add_global_memory_reads<nvbench::int8_t>(chars_size + 2 * offsets_size);
// output are integers (one per row)
state.add_global_memory_writes<nvbench::int32_t>(num_rows);

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto result = nvtext::edit_distance(input1, input2); });
[&](nvbench::launch& launch) { auto result = nvtext::edit_distance(sv1, sv2); });
}

static void bench_edit_distance(nvbench::state& state)
{
auto const encode = state.get_string("encode");
if (encode == "ascii") {
bench_edit_distance_ascii(state);
} else {
bench_edit_distance_utf8(state);
}
}

NVBENCH_BENCH(bench_edit_distance)
.set_name("edit_distance")
.add_int64_axis("min_width", {0})
.add_int64_axis("max_width", {32, 64, 128, 256})
.add_int64_axis("num_rows", {32768, 262144});
.add_int64_axis("max_width", {32, 64, 128, 256, 512})
.add_int64_axis("num_rows", {262144, 524288, 1048576})
.add_string_axis("encode", {"utf8", "ascii"});
213 changes: 181 additions & 32 deletions cpp/src/text/edit_distance.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/sequence.hpp>
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/grid_1d.cuh>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>
Expand All @@ -32,6 +35,7 @@
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <cooperative_groups.h>
#include <cuda/std/functional>
#include <thrust/execution_policy.h>
#include <thrust/fill.h>
Expand Down Expand Up @@ -88,29 +92,11 @@ __device__ cudf::size_type compute_distance(cudf::string_view const& d_str,
return v0[n];
}

struct edit_distance_levenshtein_algorithm {
cudf::column_device_view d_strings; // computing these
cudf::column_device_view d_targets; // against these;
cudf::size_type* d_buffer; // compute buffer for each string
std::ptrdiff_t const* d_offsets; // locate sub-buffer for each string
cudf::size_type* d_results; // edit distance values

__device__ void operator()(cudf::size_type idx) const
{
auto d_str =
d_strings.is_null(idx) ? cudf::string_view{} : d_strings.element<cudf::string_view>(idx);
auto d_tgt = [&] __device__ { // d_targets is also allowed to have only one valid entry
if (d_targets.size() > 1 && d_targets.is_null(idx)) { return cudf::string_view{}; }
return d_targets.size() == 1 ? d_targets.element<cudf::string_view>(0)
: d_targets.element<cudf::string_view>(idx);
}();
d_results[idx] = compute_distance(d_str, d_tgt, d_buffer + d_offsets[idx]);
}
};

struct calculate_compute_buffer_fn {
cudf::column_device_view d_strings;
cudf::column_device_view d_targets;
int32_t pad;
int32_t count;

__device__ std::ptrdiff_t operator()(cudf::size_type idx) const
{
Expand All @@ -119,16 +105,179 @@ struct calculate_compute_buffer_fn {
auto d_str = d_strings.element<cudf::string_view>(idx);
auto d_tgt = d_targets.size() == 1 ? d_targets.element<cudf::string_view>(0)
: d_targets.element<cudf::string_view>(idx);
// just need 2 integers for each character of the shorter string
return (cuda::std::min(d_str.length(), d_tgt.length()) + 1L) * 2L;
return (cuda::std::min(d_str.length(), d_tgt.length()) + static_cast<int64_t>(pad)) *
static_cast<int64_t>(count);
}
};

} // namespace

/**
* @copydoc nvtext::edit_distance
* @brief Processes the 2 given strings by computing the matrix values
* comparing each character of both strings
*
* The full matrix is computed from top-left to bottom-right with the edit-distance
* as the final value. Only 3 diagonals are needed in memory at any iteration.
*
* - S E C O N D
* - 0 1 2 3 4 5 6
* F 1 = x y z ↘ .
* I 2 x y z ↘ . .
* R 3 y z ↘ . . .
* S 4 z ↘ . . . .
* T 5 → . . . . X
*
* Computing a diagonal only requires the previous 2 and each value
* can be computed independently/parallel a tile at a time.
*
* The diagonal approach inspired by:
* https://ashvardanian.com/posts/stringwars-on-gpus/#dynamic-programming-and-levenshtein-evaluation-order
*/
template <int32_t tile_size = cudf::detail::warp_size, typename Iterator>
__device__ void compute_distance(Iterator input1,
cudf::size_type length1,
Iterator input2,
cudf::size_type length2,
cudf::size_type* d_buffer,
cudf::size_type* d_result)
{
namespace cg = cooperative_groups;
auto const block = cg::this_thread_block();
auto const tile = cg::tiled_partition<tile_size>(block);
auto const lane_idx = tile.thread_rank();

// shortcut if one of the strings is empty
// (null rows are mapped here as well)
if (length1 == 0 || length2 == 0) {
if (lane_idx == 0) { *d_result = length1 == 0 ? length2 : length1; }
return;
}

// setup the 3 working vectors for this string
auto v0 = d_buffer;
auto v1 = v0 + length1 + 2;
auto v2 = v1 + length1 + 2;
if (lane_idx == 0) {
v0[0] = 0; // first diagonal
v1[0] = 1; // second diagonal
v1[1] = 1;
}

// utility for navigating the diagonal of the matrix of characters for the 2 strings
auto next_itr = [](Iterator sitr, cudf::size_type length, Iterator itr, cudf::size_type offset) {
if constexpr (cuda::std::is_pointer_v<Iterator>) {
itr = sitr - offset; // ASCII iterator
} else {
auto const pos = sitr.position() - offset; // minimizes character counting
itr += (pos >= 0) && (pos < length) ? (pos - itr.position()) : 0;
}
return itr;
};

// top-left of the matrix
// includes the diagonal one passed the max(length1,length2) diagonal
for (auto idx = 0; idx < length1; ++idx, ++input1) {
auto const n = idx + 2; // diagonal length
auto const a = n > length1; // extra diagonal adjust

auto jdx = static_cast<cudf::size_type>(lane_idx);
auto it1 = input1;
auto it2 = input2;

auto tile_count = cudf::util::div_rounding_up_safe(n + 1, tile_size);
while (tile_count--) {
auto const offset = (jdx - 1);
// locate the 2 characters to compare along the diagonal
it1 = next_itr(input1, length1, it1, offset);
it2 = next_itr(input2, length2, it2, -offset);
if (jdx == 0) {
if (!a) { v2[0] = n; }
} else if (jdx < n) {
auto const sc = v0[jdx - 1] + (*it1 != *it2);
auto const dc = v1[jdx - 1] + 1;
auto const ic = v1[jdx] + 1;
v2[jdx - a] = cuda::std::min(cuda::std::min(sc, dc), ic);
} else if (jdx == n) {
v2[n - a] = n;
}
tile.sync();
jdx += tile_size;
}
cuda::std::swap(v0, v1);
cuda::std::swap(v1, v2);
}

--input1; // reset
++input2; // iterators

// bottom-right of the matrix
for (auto idx = 1; idx < length2; ++idx, ++input2) {
bool const fl = (length2 - idx) > length1; // fill-last flag
auto const n = (fl ? length1 : (length2 - idx)) + 1;

auto jdx = static_cast<cudf::size_type>(lane_idx);
auto it1 = input1;
auto it2 = input2;

auto tile_count = cudf::util::div_rounding_up_safe(n, tile_size);
while (tile_count--) {
auto const offset = (jdx - 1);
// locate the 2 characters to compare along the diagonal
it1 = next_itr(input1, length1, it1, offset);
it2 = next_itr(input2, length2, it2, -offset);
if (jdx > 0 && jdx < n) {
auto const sc = v0[jdx] + (*it1 != *it2);
auto const dc = v1[jdx - 1] + 1;
auto const ic = v1[jdx] + 1;
v2[jdx - 1] = cuda::std::min(cuda::std::min(sc, dc), ic);
} else if (jdx == n && fl) {
v2[n - 1] = n + idx;
}
tile.sync();
jdx += tile_size;
}
cuda::std::swap(v0, v1);
cuda::std::swap(v1, v2);
}

if (lane_idx == 0) { *d_result = v1[0]; }
}

template <int32_t tile_size = cudf::detail::warp_size>
CUDF_KERNEL void levenshtein_kernel(cudf::column_device_view d_strings,
cudf::column_device_view d_targets,
cudf::size_type* d_work_buffer,
std::ptrdiff_t const* d_offsets,
cudf::size_type* d_results)
{
auto const tid = cudf::detail::grid_1d::global_thread_id();
auto const str_idx = tid / tile_size;
if (str_idx >= d_strings.size()) { return; }
auto d_str1 = d_strings.is_null(str_idx) ? cudf::string_view{}
: d_strings.element<cudf::string_view>(str_idx);
auto d_str2 = [&] { // d_targets is also allowed to have only one valid entry
if (d_targets.size() > 1 && d_targets.is_null(str_idx)) { return cudf::string_view{}; }
return d_targets.size() == 1 ? d_targets.element<cudf::string_view>(0)
: d_targets.element<cudf::string_view>(str_idx);
}();

// compute_distance algorithm is designed such that it expects length1 <= length2
if (d_str1.length() > d_str2.length()) { cuda::std::swap(d_str1, d_str2); }
auto const length1 = d_str1.length();
auto const length2 = d_str2.length();

auto d_buffer = d_work_buffer + d_offsets[str_idx];
auto d_result = d_results + str_idx;

if (length1 == d_str1.size_bytes() && length2 == d_str2.size_bytes()) {
// ASCII path
compute_distance(d_str1.data(), length1, d_str2.data(), length2, d_buffer, d_result);
} else {
// use UTF8 iterator builtin to cudf::string_view
compute_distance(d_str1.begin(), length1, d_str2.begin(), length2, d_buffer, d_result);
}
}

} // namespace

std::unique_ptr<cudf::column> edit_distance(cudf::strings_column_view const& input,
cudf::strings_column_view const& targets,
rmm::cuda_stream_view stream,
Expand All @@ -154,7 +303,7 @@ std::unique_ptr<cudf::column> edit_distance(cudf::strings_column_view const& inp
thrust::counting_iterator<cudf::size_type>(0),
thrust::counting_iterator<cudf::size_type>(input.size()),
offsets.begin(),
calculate_compute_buffer_fn{*d_strings, *d_targets});
calculate_compute_buffer_fn{*d_strings, *d_targets, 2, 3});

// get the total size of the temporary compute buffer
// and convert sizes to offsets in-place
Expand All @@ -167,12 +316,12 @@ std::unique_ptr<cudf::column> edit_distance(cudf::strings_column_view const& inp
output_type, input.size(), rmm::device_buffer{0, stream, mr}, 0, stream, mr);
auto d_results = results->mutable_view().data<cudf::size_type>();

// compute the edit distance into the output column
thrust::for_each_n(rmm::exec_policy_nosync(stream),
thrust::counting_iterator<cudf::size_type>(0),
input.size(),
edit_distance_levenshtein_algorithm{
*d_strings, *d_targets, d_buffer, offsets.data(), d_results});
constexpr auto block_size = 256L;
constexpr auto tile_size = static_cast<cudf::thread_index_type>(cudf::detail::warp_size);
cudf::detail::grid_1d grid{input.size() * tile_size, block_size};
levenshtein_kernel<tile_size><<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
*d_strings, *d_targets, d_buffer, offsets.data(), d_results);

return results;
}

Expand Down
Loading
Loading