From 1b1c30f5aefb8614dd720e759e5d374608901710 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 15 Oct 2025 11:23:21 -0400 Subject: [PATCH 1/4] Performance improvement for nvtext::edit_distance for long strings --- cpp/benchmarks/text/edit_distance.cpp | 81 ++++++++-- cpp/src/text/edit_distance.cu | 211 +++++++++++++++++++++---- cpp/tests/text/edit_distance_tests.cpp | 49 ++++-- 3 files changed, 288 insertions(+), 53 deletions(-) diff --git a/cpp/benchmarks/text/edit_distance.cpp b/cpp/benchmarks/text/edit_distance.cpp index 0ad1ae30f8c..39a8f8be252 100644 --- a/cpp/benchmarks/text/edit_distance.cpp +++ b/cpp/benchmarks/text/edit_distance.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,8 @@ #include +#include +#include #include #include @@ -24,7 +26,7 @@ #include -static void bench_edit_distance(nvbench::state& state) +static void bench_edit_distance_utf8(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); auto const min_width = static_cast(state.get_int64("min_width")); @@ -34,23 +36,84 @@ static void bench_edit_distance(nvbench::state& state) cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const strings_table = create_random_table( {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile); - cudf::strings_column_view input1(strings_table->view().column(0)); - cudf::strings_column_view input2(strings_table->view().column(1)); + auto input1 = strings_table->get_column(0); + auto input2 = strings_table->get_column(0); + auto sv1 = cudf::strings_column_view(input1.view()); + auto sv2 = cudf::strings_column_view(input2.view()); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.add_global_memory_reads(input1.alloc_size() + input2.alloc_size()); + // output are integers (one per row) + state.add_global_memory_writes(num_rows); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = nvtext::edit_distance(sv1, sv2); }); +} + +static void bench_edit_distance_ascii(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); + + data_profile profile = data_profile_builder().no_validity().cardinality(0).distribution( + cudf::type_id::INT32, distribution_id::NORMAL, min_width, max_width); + data_profile ascii_profile = data_profile_builder().no_validity().cardinality(0).distribution( + cudf::type_id::INT8, distribution_id::UNIFORM, 32, 126); // nice ASCII range + + auto offsets = create_random_column(cudf::type_id::INT32, row_count{num_rows + 1}, profile); + offsets = cudf::scan(offsets->view(), + *cudf::make_sum_aggregation(), + cudf::scan_type::EXCLUSIVE); + auto ascii_data1 = + create_random_column(cudf::type_id::INT8, row_count{num_rows * max_width}, ascii_profile); + auto ascii_data2 = + create_random_column(cudf::type_id::INT8, row_count{num_rows * max_width}, ascii_profile); + + auto input1 = cudf::column_view(cudf::data_type{cudf::type_id::STRING}, + num_rows, + ascii_data1->view().data(), + nullptr, + 0, + 0, + {offsets->view()}); + auto input2 = cudf::column_view(cudf::data_type{cudf::type_id::STRING}, + num_rows, + ascii_data2->view().data(), + nullptr, + 0, + 0, + {offsets->view()}); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + + auto sv1 = cudf::strings_column_view(input1); + auto sv2 = cudf::strings_column_view(input2); auto chars_size = - input1.chars_size(cudf::get_default_stream()) + input2.chars_size(cudf::get_default_stream()); - state.add_global_memory_reads(chars_size); + sv1.chars_size(cudf::get_default_stream()) + sv2.chars_size(cudf::get_default_stream()); + auto offsets_size = offsets->alloc_size(); + state.add_global_memory_reads(chars_size + 2 * offsets_size); // output are integers (one per row) state.add_global_memory_writes(num_rows); state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { auto result = nvtext::edit_distance(input1, input2); }); + [&](nvbench::launch& launch) { auto result = nvtext::edit_distance(sv1, sv2); }); +} + +static void bench_edit_distance(nvbench::state& state) +{ + auto const encode = state.get_string("encode"); + if (encode == "ascii") { + bench_edit_distance_ascii(state); + } else { + bench_edit_distance_utf8(state); + } } NVBENCH_BENCH(bench_edit_distance) .set_name("edit_distance") .add_int64_axis("min_width", {0}) - .add_int64_axis("max_width", {32, 64, 128, 256}) - .add_int64_axis("num_rows", {32768, 262144}); + .add_int64_axis("max_width", {32, 64, 128, 256, 512}) + .add_int64_axis("num_rows", {262144, 524288, 1048576}) + .add_string_axis("encode", {"utf8", "ascii"}); diff --git a/cpp/src/text/edit_distance.cu b/cpp/src/text/edit_distance.cu index 22d4b4d4a46..0779de9f7c2 100644 --- a/cpp/src/text/edit_distance.cu +++ b/cpp/src/text/edit_distance.cu @@ -21,6 +21,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -32,6 +35,7 @@ #include #include +#include #include #include #include @@ -88,29 +92,11 @@ __device__ cudf::size_type compute_distance(cudf::string_view const& d_str, return v0[n]; } -struct edit_distance_levenshtein_algorithm { - cudf::column_device_view d_strings; // computing these - cudf::column_device_view d_targets; // against these; - cudf::size_type* d_buffer; // compute buffer for each string - std::ptrdiff_t const* d_offsets; // locate sub-buffer for each string - cudf::size_type* d_results; // edit distance values - - __device__ void operator()(cudf::size_type idx) const - { - auto d_str = - d_strings.is_null(idx) ? cudf::string_view{} : d_strings.element(idx); - auto d_tgt = [&] __device__ { // d_targets is also allowed to have only one valid entry - if (d_targets.size() > 1 && d_targets.is_null(idx)) { return cudf::string_view{}; } - return d_targets.size() == 1 ? d_targets.element(0) - : d_targets.element(idx); - }(); - d_results[idx] = compute_distance(d_str, d_tgt, d_buffer + d_offsets[idx]); - } -}; - struct calculate_compute_buffer_fn { cudf::column_device_view d_strings; cudf::column_device_view d_targets; + int32_t pad; + int32_t count; __device__ std::ptrdiff_t operator()(cudf::size_type idx) const { @@ -119,16 +105,177 @@ struct calculate_compute_buffer_fn { auto d_str = d_strings.element(idx); auto d_tgt = d_targets.size() == 1 ? d_targets.element(0) : d_targets.element(idx); - // just need 2 integers for each character of the shorter string - return (cuda::std::min(d_str.length(), d_tgt.length()) + 1L) * 2L; + return (cuda::std::min(d_str.length(), d_tgt.length()) + static_cast(pad)) * + static_cast(count); } }; -} // namespace - /** - * @copydoc nvtext::edit_distance + * @brief Processes the 2 given strings by computing the matrix values + * comparing each character of both strings + * + * The full matrix is computed from top-left to bottom-right with the edit-distance + * as the final value. Only 3 diagonals are needed in memory at any iteration. + * + * - S E C O N D + * - 0 1 2 3 4 5 6 + * F 1 = x y z ↘ . + * I 2 x y z ↘ . . + * R 3 y z ↘ . . . + * S 4 z ↘ . . . . + * T 5 → . . . . X + * + * Computing a diagonal only requires the previous 2 and each value + * can be computed independently/parallel a tile at a time. + * + * The diagonal approach inspired by: + * https://ashvardanian.com/posts/stringwars-on-gpus/#dynamic-programming-and-levenshtein-evaluation-order */ +template +__device__ void compute_distance(Iterator ss1, + cudf::size_type length1, + Iterator ss2, + cudf::size_type length2, + cudf::size_type* d_buffer, + cudf::size_type* d_result) +{ + namespace cg = cooperative_groups; + auto const block = cg::this_thread_block(); + auto const tile = cg::tiled_partition(block); + auto const lane_idx = tile.thread_rank(); + + // shortcut if one of the strings is empty + // (null rows are mapped here as well) + if (length1 == 0 || length2 == 0) { + if (lane_idx == 0) { *d_result = length1 == 0 ? length2 : length1; } + return; + } + + // setup the 3 working vectors for this string + auto v0 = d_buffer; + auto v1 = v0 + length1 + 2; + auto v2 = v1 + length1 + 2; + if (lane_idx == 0) { + v0[0] = 0; // first diagonal + v1[0] = 1; // second diagonal + v1[1] = 1; + } + + // for navigating the diagonal of the matrix of characters for the 2 strings + auto next_itr = [](Iterator sitr, cudf::size_type length, Iterator itr, cudf::size_type offset) { + if constexpr (cuda::std::is_pointer_v) { + itr = sitr - offset; // ASCII iterator + } else { + auto const pos = sitr.position() - offset; // minimizes character counting + itr += (pos >= 0) && (pos < length) ? (pos - itr.position()) : 0; + } + return itr; + }; + + // top-left of the matrix + for (auto idx = 0; idx < length1; ++idx, ++ss1) { + auto const n = idx + 2; + auto const a = n > length1; + + auto jdx = static_cast(lane_idx); + auto it1 = ss1; + auto it2 = ss2; + + auto tile_count = cudf::util::div_rounding_up_unsafe(n, tile_size); + while (tile_count--) { + auto const offset = (jdx - 1); + // locate the 2 characters to compare + it1 = next_itr(ss1, length1, it1, offset); + it2 = next_itr(ss2, length2, it2, -offset); + if (jdx == 0) { + if (!a) { v2[0] = n; } + } else if (jdx < n) { + auto const sc = v0[jdx - 1] + (*it1 != *it2); + auto const dc = v1[jdx - 1] + 1; + auto const ic = v1[jdx] + 1; + v2[jdx - a] = cuda::std::min(cuda::std::min(sc, dc), ic); + } else if (jdx == n) { + v2[n - a] = n; + } + tile.sync(); + jdx += tile_size; + } + cuda::std::swap(v0, v1); + cuda::std::swap(v1, v2); + } + + --ss1; // reset + ++ss2; // iterators + + // bottom-right of the matrix + for (auto idx = 1; idx < length2; ++idx, ++ss2) { + bool const fl = (length2 - idx) > length1; + auto const n = (fl ? length1 : (length2 - idx)) + 1; + + auto jdx = static_cast(lane_idx); + auto it1 = ss1; + auto it2 = ss2; + + auto tile_count = cudf::util::div_rounding_up_unsafe(n, tile_size); + while (tile_count--) { + auto const offset = (jdx - 1); + // locate the 2 characters to compare + it1 = next_itr(ss1, length1, it1, offset); + it2 = next_itr(ss2, length2, it2, -offset); + if (jdx > 0 && jdx < n) { + auto const sc = v0[jdx] + (*it1 != *it2); + auto const dc = v1[jdx - 1] + 1; + auto const ic = v1[jdx] + 1; + v2[jdx - 1] = cuda::std::min(cuda::std::min(sc, dc), ic); + } else if (jdx == n && fl) { + v2[n - 1] = n + idx; + } + tile.sync(); + jdx += tile_size; + } + cuda::std::swap(v0, v1); + cuda::std::swap(v1, v2); + } + + if (lane_idx == 0) { *d_result = v1[0]; } +} + +template +CUDF_KERNEL void levenshtein_kernel(cudf::column_device_view d_strings, + cudf::column_device_view d_targets, + cudf::size_type* d_work_buffer, + std::ptrdiff_t const* d_offsets, + cudf::size_type* d_results) +{ + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const str_idx = tid / tile_size; + if (str_idx >= d_strings.size()) { return; } + auto d_str1 = d_strings.is_null(str_idx) ? cudf::string_view{} + : d_strings.element(str_idx); + auto d_str2 = [&] { // d_targets is also allowed to have only one valid entry + if (d_targets.size() > 1 && d_targets.is_null(str_idx)) { return cudf::string_view{}; } + return d_targets.size() == 1 ? d_targets.element(0) + : d_targets.element(str_idx); + }(); + + if (d_str1.length() > d_str2.length()) { cuda::std::swap(d_str1, d_str2); } + auto const length1 = d_str1.length(); + auto const length2 = d_str2.length(); + + auto d_buffer = d_work_buffer + d_offsets[str_idx]; + auto d_result = d_results + str_idx; + + if (length1 == d_str1.size_bytes() && length2 == d_str2.size_bytes()) { + // ASCII path + compute_distance(d_str1.data(), length1, d_str2.data(), length2, d_buffer, d_result); + } else { + // use UTF8 iterator builtin to cudf::string_view + compute_distance(d_str1.begin(), length1, d_str2.begin(), length2, d_buffer, d_result); + } +} + +} // namespace + std::unique_ptr edit_distance(cudf::strings_column_view const& input, cudf::strings_column_view const& targets, rmm::cuda_stream_view stream, @@ -154,7 +301,7 @@ std::unique_ptr edit_distance(cudf::strings_column_view const& inp thrust::counting_iterator(0), thrust::counting_iterator(input.size()), offsets.begin(), - calculate_compute_buffer_fn{*d_strings, *d_targets}); + calculate_compute_buffer_fn{*d_strings, *d_targets, 2, 3}); // get the total size of the temporary compute buffer // and convert sizes to offsets in-place @@ -167,12 +314,12 @@ std::unique_ptr edit_distance(cudf::strings_column_view const& inp output_type, input.size(), rmm::device_buffer{0, stream, mr}, 0, stream, mr); auto d_results = results->mutable_view().data(); - // compute the edit distance into the output column - thrust::for_each_n(rmm::exec_policy_nosync(stream), - thrust::counting_iterator(0), - input.size(), - edit_distance_levenshtein_algorithm{ - *d_strings, *d_targets, d_buffer, offsets.data(), d_results}); + constexpr auto block_size = 256L; + constexpr auto tile_size = static_cast(cudf::detail::warp_size); + cudf::detail::grid_1d grid{input.size() * tile_size, block_size}; + levenshtein_kernel<<>>( + *d_strings, *d_targets, d_buffer, offsets.data(), d_results); + return results; } diff --git a/cpp/tests/text/edit_distance_tests.cpp b/cpp/tests/text/edit_distance_tests.cpp index 8bbf74aff7e..c597e0c4cb5 100644 --- a/cpp/tests/text/edit_distance_tests.cpp +++ b/cpp/tests/text/edit_distance_tests.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -36,37 +37,61 @@ TEST_F(TextEditDistanceTest, EditDistance) h_strings.begin(), h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto sv = cudf::strings_column_view(strings); - std::vector h_targets{"hog", "not", "cake", "house", "fox", nullptr, "puppy", "the"}; - cudf::test::strings_column_wrapper targets( - h_targets.begin(), - h_targets.end(), - thrust::make_transform_iterator(h_targets.begin(), [](auto str) { return str != nullptr; })); { - auto results = - nvtext::edit_distance(cudf::strings_column_view(strings), cudf::strings_column_view(targets)); + std::vector h_targets{ + "hog", "not", "cake", "house", "fox", nullptr, "puppy", "the"}; + cudf::test::strings_column_wrapper targets( + h_targets.begin(), + h_targets.end(), + thrust::make_transform_iterator(h_targets.begin(), [](auto str) { return str != nullptr; })); + auto tv = cudf::strings_column_view(targets); + + auto results = nvtext::edit_distance(sv, tv); cudf::test::fixed_width_column_wrapper expected({1, 3, 2, 1, 3, 0, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { cudf::test::strings_column_wrapper single({"pup"}); - auto results = - nvtext::edit_distance(cudf::strings_column_view(strings), cudf::strings_column_view(single)); + auto tv = cudf::strings_column_view(single); + auto results = nvtext::edit_distance(sv, tv); cudf::test::fixed_width_column_wrapper expected({3, 3, 3, 4, 0, 3, 2, 3}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { cudf::test::strings_column_wrapper single({"pup"}, {1}); + auto tv = cudf::strings_column_view(single); std::vector h_input(516, "cup"); - auto input = cudf::test::strings_column_wrapper(h_input.begin(), h_input.end()); - auto results = - nvtext::edit_distance(cudf::strings_column_view(input), cudf::strings_column_view(single)); + auto input = cudf::test::strings_column_wrapper(h_input.begin(), h_input.end()); + auto sv = cudf::strings_column_view(input); + auto results = nvtext::edit_distance(sv, tv); auto begin = thrust::constant_iterator(1); auto expected = cudf::test::fixed_width_column_wrapper(begin, begin + h_input.size()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } +TEST_F(TextEditDistanceTest, EditDistanceLong) +{ + auto input1 = cudf::test::strings_column_wrapper( + {"the lady brown fox jumps down the wall of the castle with wide windows", + "the lady brown fox jumps down the wall of thé castlé with wide windows", + "thé lady brown fox jumps down the wall of the castle with wide windows", + "the lazy brown dog jumps upon the hill of the castle with long windows", // exact one + "why the lazy brown dog jumps upon the hill of the castle with long windows", + "the lazy brown dog jumps upon the hill of the castle", + "lazy brown dog jumps upon hill"}); + auto input2 = cudf::test::strings_column_wrapper( + {"the lazy brown dog jumps upon the hill of the castle with long windows"}); + auto sv1 = cudf::strings_column_view(input1); + auto sv2 = cudf::strings_column_view(input2); + + auto expected = cudf::test::fixed_width_column_wrapper({12, 14, 13, 0, 4, 18, 40}); + auto results = nvtext::edit_distance(sv1, sv2); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(TextEditDistanceTest, EditDistanceMatrix) { std::vector h_strings{"dog", nullptr, "hog", "frog", "cat", "", "hat", "clog"}; From c7f9da35d406af8020c85f0596d9da8f11e3db9e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 16 Oct 2025 08:37:25 -0400 Subject: [PATCH 2/4] large strings in bench_edit_distance_ascii --- cpp/benchmarks/text/edit_distance.cpp | 9 +++++++-- cpp/src/text/edit_distance.cu | 4 ++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/cpp/benchmarks/text/edit_distance.cpp b/cpp/benchmarks/text/edit_distance.cpp index 39a8f8be252..5b1a83bda81 100644 --- a/cpp/benchmarks/text/edit_distance.cpp +++ b/cpp/benchmarks/text/edit_distance.cpp @@ -57,12 +57,17 @@ static void bench_edit_distance_ascii(nvbench::state& state) auto const min_width = static_cast(state.get_int64("min_width")); auto const max_width = static_cast(state.get_int64("max_width")); + auto const max_size = static_cast(num_rows) * static_cast(max_width); + auto const offsets_type = max_size >= std::numeric_limits::max() + ? cudf::type_id::INT64 + : cudf::type_id::INT32; + data_profile profile = data_profile_builder().no_validity().cardinality(0).distribution( - cudf::type_id::INT32, distribution_id::NORMAL, min_width, max_width); + offsets_type, distribution_id::NORMAL, min_width, max_width); data_profile ascii_profile = data_profile_builder().no_validity().cardinality(0).distribution( cudf::type_id::INT8, distribution_id::UNIFORM, 32, 126); // nice ASCII range - auto offsets = create_random_column(cudf::type_id::INT32, row_count{num_rows + 1}, profile); + auto offsets = create_random_column(offsets_type, row_count{num_rows + 1}, profile); offsets = cudf::scan(offsets->view(), *cudf::make_sum_aggregation(), cudf::scan_type::EXCLUSIVE); diff --git a/cpp/src/text/edit_distance.cu b/cpp/src/text/edit_distance.cu index 0779de9f7c2..0a1bce87282 100644 --- a/cpp/src/text/edit_distance.cu +++ b/cpp/src/text/edit_distance.cu @@ -181,7 +181,7 @@ __device__ void compute_distance(Iterator ss1, auto it1 = ss1; auto it2 = ss2; - auto tile_count = cudf::util::div_rounding_up_unsafe(n, tile_size); + auto tile_count = cudf::util::div_rounding_up_safe(n, tile_size); while (tile_count--) { auto const offset = (jdx - 1); // locate the 2 characters to compare @@ -216,7 +216,7 @@ __device__ void compute_distance(Iterator ss1, auto it1 = ss1; auto it2 = ss2; - auto tile_count = cudf::util::div_rounding_up_unsafe(n, tile_size); + auto tile_count = cudf::util::div_rounding_up_safe(n, tile_size); while (tile_count--) { auto const offset = (jdx - 1); // locate the 2 characters to compare From 889f83ca759419f58a097bd4b8fbd3f85ec82a5d Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 22 Oct 2025 13:16:31 -0400 Subject: [PATCH 3/4] fix left-matrix boundary case --- cpp/src/text/edit_distance.cu | 44 ++++++++++++++------------ cpp/tests/text/edit_distance_tests.cpp | 1 - 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/cpp/src/text/edit_distance.cu b/cpp/src/text/edit_distance.cu index 0a1bce87282..5d3cf87eeb9 100644 --- a/cpp/src/text/edit_distance.cu +++ b/cpp/src/text/edit_distance.cu @@ -132,9 +132,9 @@ struct calculate_compute_buffer_fn { * https://ashvardanian.com/posts/stringwars-on-gpus/#dynamic-programming-and-levenshtein-evaluation-order */ template -__device__ void compute_distance(Iterator ss1, +__device__ void compute_distance(Iterator input1, cudf::size_type length1, - Iterator ss2, + Iterator input2, cudf::size_type length2, cudf::size_type* d_buffer, cudf::size_type* d_result) @@ -161,7 +161,7 @@ __device__ void compute_distance(Iterator ss1, v1[1] = 1; } - // for navigating the diagonal of the matrix of characters for the 2 strings + // utility for navigating the diagonal of the matrix of characters for the 2 strings auto next_itr = [](Iterator sitr, cudf::size_type length, Iterator itr, cudf::size_type offset) { if constexpr (cuda::std::is_pointer_v) { itr = sitr - offset; // ASCII iterator @@ -173,20 +173,21 @@ __device__ void compute_distance(Iterator ss1, }; // top-left of the matrix - for (auto idx = 0; idx < length1; ++idx, ++ss1) { - auto const n = idx + 2; - auto const a = n > length1; + // includes the diagonal one passed the max(length1,length2) diagonal + for (auto idx = 0; idx < length1; ++idx, ++input1) { + auto const n = idx + 2; // diagonal length + auto const a = n > length1; // extra diagonal adjust auto jdx = static_cast(lane_idx); - auto it1 = ss1; - auto it2 = ss2; + auto it1 = input1; + auto it2 = input2; - auto tile_count = cudf::util::div_rounding_up_safe(n, tile_size); + auto tile_count = cudf::util::div_rounding_up_safe(n + 1, tile_size); while (tile_count--) { auto const offset = (jdx - 1); - // locate the 2 characters to compare - it1 = next_itr(ss1, length1, it1, offset); - it2 = next_itr(ss2, length2, it2, -offset); + // locate the 2 characters to compare along the diagonal + it1 = next_itr(input1, length1, it1, offset); + it2 = next_itr(input2, length2, it2, -offset); if (jdx == 0) { if (!a) { v2[0] = n; } } else if (jdx < n) { @@ -204,24 +205,24 @@ __device__ void compute_distance(Iterator ss1, cuda::std::swap(v1, v2); } - --ss1; // reset - ++ss2; // iterators + --input1; // reset + ++input2; // iterators // bottom-right of the matrix - for (auto idx = 1; idx < length2; ++idx, ++ss2) { - bool const fl = (length2 - idx) > length1; + for (auto idx = 1; idx < length2; ++idx, ++input2) { + bool const fl = (length2 - idx) > length1; // fill-last flag auto const n = (fl ? length1 : (length2 - idx)) + 1; auto jdx = static_cast(lane_idx); - auto it1 = ss1; - auto it2 = ss2; + auto it1 = input1; + auto it2 = input2; auto tile_count = cudf::util::div_rounding_up_safe(n, tile_size); while (tile_count--) { auto const offset = (jdx - 1); - // locate the 2 characters to compare - it1 = next_itr(ss1, length1, it1, offset); - it2 = next_itr(ss2, length2, it2, -offset); + // locate the 2 characters to compare along the diagonal + it1 = next_itr(input1, length1, it1, offset); + it2 = next_itr(input2, length2, it2, -offset); if (jdx > 0 && jdx < n) { auto const sc = v0[jdx] + (*it1 != *it2); auto const dc = v1[jdx - 1] + 1; @@ -258,6 +259,7 @@ CUDF_KERNEL void levenshtein_kernel(cudf::column_device_view d_strings, : d_targets.element(str_idx); }(); + // compute_distance algorithm is designed such that it expects length1 <= length2 if (d_str1.length() > d_str2.length()) { cuda::std::swap(d_str1, d_str2); } auto const length1 = d_str1.length(); auto const length2 = d_str2.length(); diff --git a/cpp/tests/text/edit_distance_tests.cpp b/cpp/tests/text/edit_distance_tests.cpp index c597e0c4cb5..63b90784098 100644 --- a/cpp/tests/text/edit_distance_tests.cpp +++ b/cpp/tests/text/edit_distance_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include From d1a96dede71be080b3da3c1b0555bf88b6bc23f7 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 24 Oct 2025 10:43:16 -0400 Subject: [PATCH 4/4] fix ascii generation logic in benchmark code --- cpp/benchmarks/text/edit_distance.cpp | 32 ++++++++++++++++++--------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/cpp/benchmarks/text/edit_distance.cpp b/cpp/benchmarks/text/edit_distance.cpp index 3f5d0870087..9d19c4fe7ed 100644 --- a/cpp/benchmarks/text/edit_distance.cpp +++ b/cpp/benchmarks/text/edit_distance.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -56,14 +57,27 @@ static void bench_edit_distance_ascii(nvbench::state& state) data_profile ascii_profile = data_profile_builder().no_validity().cardinality(0).distribution( cudf::type_id::INT8, distribution_id::UNIFORM, 32, 126); // nice ASCII range - auto offsets = create_random_column(offsets_type, row_count{num_rows + 1}, profile); - offsets = cudf::scan(offsets->view(), + auto offsets = create_random_column(offsets_type, row_count{num_rows + 1}, profile); + offsets = cudf::scan(offsets->view(), *cudf::make_sum_aggregation(), cudf::scan_type::EXCLUSIVE); - auto ascii_data1 = - create_random_column(cudf::type_id::INT8, row_count{num_rows * max_width}, ascii_profile); - auto ascii_data2 = - create_random_column(cudf::type_id::INT8, row_count{num_rows * max_width}, ascii_profile); + auto chars_size = offsets_type == cudf::type_id::INT64 + ? dynamic_cast*>( + cudf::get_element(offsets->view(), num_rows).get()) + ->value() + : static_cast(dynamic_cast*>( + cudf::get_element(offsets->view(), num_rows).get()) + ->value()); + if (chars_size > std::numeric_limits::max()) { + // to be fixed with create_ascii_string_column utility in PR 20354 + state.skip("chars size too large for this benchmark"); + return; + } + + auto ascii_data1 = create_random_column( + cudf::type_id::INT8, row_count{static_cast(chars_size)}, ascii_profile); + auto ascii_data2 = create_random_column( + cudf::type_id::INT8, row_count{static_cast(chars_size)}, ascii_profile); auto input1 = cudf::column_view(cudf::data_type{cudf::type_id::STRING}, num_rows, @@ -82,10 +96,8 @@ static void bench_edit_distance_ascii(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - auto sv1 = cudf::strings_column_view(input1); - auto sv2 = cudf::strings_column_view(input2); - auto chars_size = - sv1.chars_size(cudf::get_default_stream()) + sv2.chars_size(cudf::get_default_stream()); + auto sv1 = cudf::strings_column_view(input1); + auto sv2 = cudf::strings_column_view(input2); auto offsets_size = offsets->alloc_size(); state.add_global_memory_reads(chars_size + 2 * offsets_size); // output are integers (one per row)