rapidsai · davidwendt · Oct 15, 2025 · Oct 15, 2025 · Oct 16, 2025 · Oct 16, 2025
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,8 @@
 
 #include <benchmarks/common/generate_input.hpp>
 
+#include <cudf/aggregation.hpp>
+#include <cudf/reduction.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <nvtext/edit_distance.hpp>
@@ -24,7 +26,7 @@
 
 #include <nvbench/nvbench.cuh>
 
-static void bench_edit_distance(nvbench::state& state)
+static void bench_edit_distance_utf8(nvbench::state& state)
 {
   auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
   auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
@@ -34,23 +36,89 @@ static void bench_edit_distance(nvbench::state& state)
     cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
   auto const strings_table = create_random_table(
     {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile);
-  cudf::strings_column_view input1(strings_table->view().column(0));
-  cudf::strings_column_view input2(strings_table->view().column(1));
+  auto input1 = strings_table->get_column(0);
+  auto input2 = strings_table->get_column(0);
+  auto sv1    = cudf::strings_column_view(input1.view());
+  auto sv2    = cudf::strings_column_view(input2.view());
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+
+  state.add_global_memory_reads<nvbench::int8_t>(input1.alloc_size() + input2.alloc_size());
+  // output are integers (one per row)
+  state.add_global_memory_writes<nvbench::int32_t>(num_rows);
+
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch) { auto result = nvtext::edit_distance(sv1, sv2); });
+}
+
+static void bench_edit_distance_ascii(nvbench::state& state)
+{
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
+
+  auto const max_size     = static_cast<int64_t>(num_rows) * static_cast<int64_t>(max_width);
+  auto const offsets_type = max_size >= std::numeric_limits<cudf::size_type>::max()
+                              ? cudf::type_id::INT64
+                              : cudf::type_id::INT32;
+
+  data_profile profile = data_profile_builder().no_validity().cardinality(0).distribution(
+    offsets_type, distribution_id::NORMAL, min_width, max_width);
+  data_profile ascii_profile = data_profile_builder().no_validity().cardinality(0).distribution(
+    cudf::type_id::INT8, distribution_id::UNIFORM, 32, 126);  // nice ASCII range
+
+  auto offsets = create_random_column(offsets_type, row_count{num_rows + 1}, profile);
+  offsets      = cudf::scan(offsets->view(),
+                       *cudf::make_sum_aggregation<cudf::scan_aggregation>(),
+                       cudf::scan_type::EXCLUSIVE);
+  auto ascii_data1 =
+    create_random_column(cudf::type_id::INT8, row_count{num_rows * max_width}, ascii_profile);
+  auto ascii_data2 =
+    create_random_column(cudf::type_id::INT8, row_count{num_rows * max_width}, ascii_profile);
+
+  auto input1 = cudf::column_view(cudf::data_type{cudf::type_id::STRING},
+                                  num_rows,
+                                  ascii_data1->view().data<char>(),
+                                  nullptr,
+                                  0,
+                                  0,
+                                  {offsets->view()});
+  auto input2 = cudf::column_view(cudf::data_type{cudf::type_id::STRING},
+                                  num_rows,
+                                  ascii_data2->view().data<char>(),
+                                  nullptr,
+                                  0,
+                                  0,
+                                  {offsets->view()});
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
 
+  auto sv1 = cudf::strings_column_view(input1);
+  auto sv2 = cudf::strings_column_view(input2);
   auto chars_size =
-    input1.chars_size(cudf::get_default_stream()) + input2.chars_size(cudf::get_default_stream());
-  state.add_global_memory_reads<nvbench::int8_t>(chars_size);
+    sv1.chars_size(cudf::get_default_stream()) + sv2.chars_size(cudf::get_default_stream());
+  auto offsets_size = offsets->alloc_size();
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size + 2 * offsets_size);
   // output are integers (one per row)
   state.add_global_memory_writes<nvbench::int32_t>(num_rows);
 
   state.exec(nvbench::exec_tag::sync,
-             [&](nvbench::launch& launch) { auto result = nvtext::edit_distance(input1, input2); });
+             [&](nvbench::launch& launch) { auto result = nvtext::edit_distance(sv1, sv2); });
+}
+
+static void bench_edit_distance(nvbench::state& state)
+{
+  auto const encode = state.get_string("encode");
+  if (encode == "ascii") {
+    bench_edit_distance_ascii(state);
+  } else {
+    bench_edit_distance_utf8(state);
+  }
 }
 
 NVBENCH_BENCH(bench_edit_distance)
   .set_name("edit_distance")
   .add_int64_axis("min_width", {0})
-  .add_int64_axis("max_width", {32, 64, 128, 256})
-  .add_int64_axis("num_rows", {32768, 262144});
+  .add_int64_axis("max_width", {32, 64, 128, 256, 512})
+  .add_int64_axis("num_rows", {262144, 524288, 1048576})
+  .add_string_axis("encode", {"utf8", "ascii"});
@@ -21,6 +21,9 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/sequence.hpp>
 #include <cudf/detail/sizes_to_offsets_iterator.cuh>
+#include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/grid_1d.cuh>
+#include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -32,6 +35,7 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <cooperative_groups.h>
 #include <cuda/std/functional>
 #include <thrust/execution_policy.h>
 #include <thrust/fill.h>
@@ -88,29 +92,11 @@ __device__ cudf::size_type compute_distance(cudf::string_view const& d_str,
   return v0[n];
 }
 
-struct edit_distance_levenshtein_algorithm {
-  cudf::column_device_view d_strings;  // computing these
-  cudf::column_device_view d_targets;  // against these;
-  cudf::size_type* d_buffer;           // compute buffer for each string
-  std::ptrdiff_t const* d_offsets;     // locate sub-buffer for each string
-  cudf::size_type* d_results;          // edit distance values
-
-  __device__ void operator()(cudf::size_type idx) const
-  {
-    auto d_str =
-      d_strings.is_null(idx) ? cudf::string_view{} : d_strings.element<cudf::string_view>(idx);
-    auto d_tgt = [&] __device__ {  // d_targets is also allowed to have only one valid entry
-      if (d_targets.size() > 1 && d_targets.is_null(idx)) { return cudf::string_view{}; }
-      return d_targets.size() == 1 ? d_targets.element<cudf::string_view>(0)
-                                   : d_targets.element<cudf::string_view>(idx);
-    }();
-    d_results[idx] = compute_distance(d_str, d_tgt, d_buffer + d_offsets[idx]);
-  }
-};
-
 struct calculate_compute_buffer_fn {
   cudf::column_device_view d_strings;
   cudf::column_device_view d_targets;
+  int32_t pad;
+  int32_t count;
 
   __device__ std::ptrdiff_t operator()(cudf::size_type idx) const
   {
@@ -119,16 +105,179 @@ struct calculate_compute_buffer_fn {
     auto d_str = d_strings.element<cudf::string_view>(idx);
     auto d_tgt = d_targets.size() == 1 ? d_targets.element<cudf::string_view>(0)
                                        : d_targets.element<cudf::string_view>(idx);
-    // just need 2 integers for each character of the shorter string
-    return (cuda::std::min(d_str.length(), d_tgt.length()) + 1L) * 2L;
+    return (cuda::std::min(d_str.length(), d_tgt.length()) + static_cast<int64_t>(pad)) *
+           static_cast<int64_t>(count);
   }
 };
 
-}  // namespace
-
 /**
- * @copydoc nvtext::edit_distance
+ * @brief Processes the 2 given strings by computing the matrix values
+ * comparing each character of both strings
+ *
+ * The full matrix is computed from top-left to bottom-right with the edit-distance
+ * as the final value. Only 3 diagonals are needed in memory at any iteration.
+ *
+ *     -  S  E  C  O  N  D
+ *  -  0  1  2  3  4  5  6
+ *  F  1  =  x  y  z  ↘  .
+ *  I  2  x  y  z  ↘  .  .
+ *  R  3  y  z  ↘  .  .  .
+ *  S  4  z  ↘  .  .  .  .
+ *  T  5  →  .  .  .  .  X
+ *
+ * Computing a diagonal only requires the previous 2 and each value
+ * can be computed independently/parallel a tile at a time.
+ *
+ * The diagonal approach inspired by:
+ * https://ashvardanian.com/posts/stringwars-on-gpus/#dynamic-programming-and-levenshtein-evaluation-order
  */
+template <int32_t tile_size = cudf::detail::warp_size, typename Iterator>
+__device__ void compute_distance(Iterator input1,
+                                 cudf::size_type length1,
+                                 Iterator input2,
+                                 cudf::size_type length2,
+                                 cudf::size_type* d_buffer,
+                                 cudf::size_type* d_result)
+{
+  namespace cg        = cooperative_groups;
+  auto const block    = cg::this_thread_block();
+  auto const tile     = cg::tiled_partition<tile_size>(block);
+  auto const lane_idx = tile.thread_rank();
+
+  // shortcut if one of the strings is empty
+  // (null rows are mapped here as well)
+  if (length1 == 0 || length2 == 0) {
+    if (lane_idx == 0) { *d_result = length1 == 0 ? length2 : length1; }
+    return;
+  }
+
+  // setup the 3 working vectors for this string
+  auto v0 = d_buffer;
+  auto v1 = v0 + length1 + 2;
+  auto v2 = v1 + length1 + 2;
+  if (lane_idx == 0) {
+    v0[0] = 0;  // first diagonal
+    v1[0] = 1;  // second diagonal
+    v1[1] = 1;
+  }
+
+  // utility for navigating the diagonal of the matrix of characters for the 2 strings
+  auto next_itr = [](Iterator sitr, cudf::size_type length, Iterator itr, cudf::size_type offset) {
+    if constexpr (cuda::std::is_pointer_v<Iterator>) {
+      itr = sitr - offset;  // ASCII iterator
+    } else {
+      auto const pos = sitr.position() - offset;  // minimizes character counting
+      itr += (pos >= 0) && (pos < length) ? (pos - itr.position()) : 0;
+    }
+    return itr;
+  };
+
+  // top-left of the matrix
+  // includes the diagonal one passed the max(length1,length2) diagonal
+  for (auto idx = 0; idx < length1; ++idx, ++input1) {
+    auto const n = idx + 2;      // diagonal length
+    auto const a = n > length1;  // extra diagonal adjust
+
+    auto jdx = static_cast<cudf::size_type>(lane_idx);
+    auto it1 = input1;
+    auto it2 = input2;
+
+    auto tile_count = cudf::util::div_rounding_up_safe(n + 1, tile_size);
+    while (tile_count--) {
+      auto const offset = (jdx - 1);
+      // locate the 2 characters to compare along the diagonal
+      it1 = next_itr(input1, length1, it1, offset);
+      it2 = next_itr(input2, length2, it2, -offset);
+      if (jdx == 0) {
+        if (!a) { v2[0] = n; }
+      } else if (jdx < n) {
+        auto const sc = v0[jdx - 1] + (*it1 != *it2);
+        auto const dc = v1[jdx - 1] + 1;
+        auto const ic = v1[jdx] + 1;
+        v2[jdx - a]   = cuda::std::min(cuda::std::min(sc, dc), ic);
+      } else if (jdx == n) {
+        v2[n - a] = n;
+      }
+      tile.sync();
+      jdx += tile_size;
+    }
+    cuda::std::swap(v0, v1);
+    cuda::std::swap(v1, v2);
+  }
+
+  --input1;  // reset
+  ++input2;  // iterators
+
+  // bottom-right of the matrix
+  for (auto idx = 1; idx < length2; ++idx, ++input2) {
+    bool const fl = (length2 - idx) > length1;  // fill-last flag
+    auto const n  = (fl ? length1 : (length2 - idx)) + 1;
+
+    auto jdx = static_cast<cudf::size_type>(lane_idx);
+    auto it1 = input1;
+    auto it2 = input2;
+
+    auto tile_count = cudf::util::div_rounding_up_safe(n, tile_size);
+    while (tile_count--) {
+      auto const offset = (jdx - 1);
+      // locate the 2 characters to compare along the diagonal
+      it1 = next_itr(input1, length1, it1, offset);
+      it2 = next_itr(input2, length2, it2, -offset);
+      if (jdx > 0 && jdx < n) {
+        auto const sc = v0[jdx] + (*it1 != *it2);
+        auto const dc = v1[jdx - 1] + 1;
+        auto const ic = v1[jdx] + 1;
+        v2[jdx - 1]   = cuda::std::min(cuda::std::min(sc, dc), ic);
+      } else if (jdx == n && fl) {
+        v2[n - 1] = n + idx;
+      }
+      tile.sync();
+      jdx += tile_size;
+    }
+    cuda::std::swap(v0, v1);
+    cuda::std::swap(v1, v2);
+  }
+
+  if (lane_idx == 0) { *d_result = v1[0]; }
+}
+
+template <int32_t tile_size = cudf::detail::warp_size>
+CUDF_KERNEL void levenshtein_kernel(cudf::column_device_view d_strings,
+                                    cudf::column_device_view d_targets,
+                                    cudf::size_type* d_work_buffer,
+                                    std::ptrdiff_t const* d_offsets,
+                                    cudf::size_type* d_results)
+{
+  auto const tid     = cudf::detail::grid_1d::global_thread_id();
+  auto const str_idx = tid / tile_size;
+  if (str_idx >= d_strings.size()) { return; }
+  auto d_str1 = d_strings.is_null(str_idx) ? cudf::string_view{}
+                                           : d_strings.element<cudf::string_view>(str_idx);
+  auto d_str2 = [&] {  // d_targets is also allowed to have only one valid entry
+    if (d_targets.size() > 1 && d_targets.is_null(str_idx)) { return cudf::string_view{}; }
+    return d_targets.size() == 1 ? d_targets.element<cudf::string_view>(0)
+                                 : d_targets.element<cudf::string_view>(str_idx);
+  }();
+
+  // compute_distance algorithm is designed such that it expects length1 <= length2
+  if (d_str1.length() > d_str2.length()) { cuda::std::swap(d_str1, d_str2); }
+  auto const length1 = d_str1.length();
+  auto const length2 = d_str2.length();
+
+  auto d_buffer = d_work_buffer + d_offsets[str_idx];
+  auto d_result = d_results + str_idx;
+
+  if (length1 == d_str1.size_bytes() && length2 == d_str2.size_bytes()) {
+    // ASCII path
+    compute_distance(d_str1.data(), length1, d_str2.data(), length2, d_buffer, d_result);
+  } else {
+    // use UTF8 iterator builtin to cudf::string_view
+    compute_distance(d_str1.begin(), length1, d_str2.begin(), length2, d_buffer, d_result);
+  }
+}
+
+}  // namespace
+
 std::unique_ptr<cudf::column> edit_distance(cudf::strings_column_view const& input,
                                             cudf::strings_column_view const& targets,
                                             rmm::cuda_stream_view stream,
@@ -154,7 +303,7 @@ std::unique_ptr<cudf::column> edit_distance(cudf::strings_column_view const& inp
                     thrust::counting_iterator<cudf::size_type>(0),
                     thrust::counting_iterator<cudf::size_type>(input.size()),
                     offsets.begin(),
-                    calculate_compute_buffer_fn{*d_strings, *d_targets});
+                    calculate_compute_buffer_fn{*d_strings, *d_targets, 2, 3});
 
   // get the total size of the temporary compute buffer
   // and convert sizes to offsets in-place
@@ -167,12 +316,12 @@ std::unique_ptr<cudf::column> edit_distance(cudf::strings_column_view const& inp
     output_type, input.size(), rmm::device_buffer{0, stream, mr}, 0, stream, mr);
   auto d_results = results->mutable_view().data<cudf::size_type>();
 
-  // compute the edit distance into the output column
-  thrust::for_each_n(rmm::exec_policy_nosync(stream),
-                     thrust::counting_iterator<cudf::size_type>(0),
-                     input.size(),
-                     edit_distance_levenshtein_algorithm{
-                       *d_strings, *d_targets, d_buffer, offsets.data(), d_results});
+  constexpr auto block_size = 256L;
+  constexpr auto tile_size  = static_cast<cudf::thread_index_type>(cudf::detail::warp_size);
+  cudf::detail::grid_1d grid{input.size() * tile_size, block_size};
+  levenshtein_kernel<tile_size><<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+    *d_strings, *d_targets, d_buffer, offsets.data(), d_results);
+
   return results;
 }