From 7ed9c7e8a38bcb2fb1ba54e1f945b1edf9e1ef77 Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov
Date: Tue, 15 Apr 2025 16:29:00 +0200
Subject: [PATCH 1/3] Initial commit

---
 .../extensions/statistics/CMakeLists.txt    |   1 +
 .../extensions/statistics/kth_element1d.cpp | 381 ++++++++++++++++++
 .../extensions/statistics/kth_element1d.hpp |  56 +++
 .../extensions/statistics/partitioning.hpp  | 356 ++++++++++++++++
 .../extensions/statistics/statistics_py.cpp |   2 +
 5 files changed, 796 insertions(+)
 create mode 100644 dpnp/backend/extensions/statistics/kth_element1d.cpp
 create mode 100644 dpnp/backend/extensions/statistics/kth_element1d.hpp
 create mode 100644 dpnp/backend/extensions/statistics/partitioning.hpp

diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt
index 2a5467bff382..1fa481546e53 100644
--- a/dpnp/backend/extensions/statistics/CMakeLists.txt
+++ b/dpnp/backend/extensions/statistics/CMakeLists.txt
@@ -30,6 +30,7 @@ set(_module_src
     ${CMAKE_CURRENT_SOURCE_DIR}/histogram.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/histogramdd.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/histogram_common.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/kth_element1d.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/sliding_dot_product1d.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/sliding_window1d.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/statistics_py.cpp
diff --git a/dpnp/backend/extensions/statistics/kth_element1d.cpp b/dpnp/backend/extensions/statistics/kth_element1d.cpp
new file mode 100644
index 000000000000..9e2a2e235886
--- /dev/null
+++ b/dpnp/backend/extensions/statistics/kth_element1d.cpp
@@ -0,0 +1,381 @@
+//*****************************************************************************
+// Copyright (c) 2024-2025, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************** + +#include +#include +#include +#include + +#include +#include + +// dpctl tensor headers +#include "dpctl4pybind11.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "ext/common.hpp" +#include "kth_element1d.hpp" +#include "partitioning.hpp" + +// #include + +namespace sycl_exp = sycl::ext::oneapi::experimental; +namespace dpctl_td_ns = dpctl::tensor::type_dispatch; +namespace dpctl_utils = dpctl::tensor::alloc_utils; + +using dpctl::tensor::usm_ndarray; + +using namespace statistics::partitioning; +using namespace ext::common; + +namespace +{ + +template +struct pick_pivot_kernel; + +template +struct KthElementF +{ + static sycl::event run_pick_pivot(sycl::queue &queue, + T *in, + T *out, + uint64_t target, + State &state, + uint64_t items_to_sort, + uint64_t limit, + const std::vector &deps) + { + auto e = queue.submit([&](sycl::handler &cgh) { + cgh.depends_on(deps); + constexpr uint64_t group_size = 128; + + auto work_sz = make_ndrange(group_size, group_size, 1); + + size_t temp_memory_size = + sycl_exp::default_sorters::joint_sorter<>::memory_required( + sycl::memory_scope::work_group, limit); + + auto loc_items = + sycl::local_accessor(sycl::range<1>(items_to_sort), cgh); + auto scratch = sycl::local_accessor( + sycl::range<1>(temp_memory_size), cgh); + + cgh.parallel_for>( + work_sz, [=](sycl::nd_item<1> item) { + auto group = item.get_group(); + + if (state.stop[0]) + return; + + auto llid = item.get_local_linear_id(); + auto local_size = item.get_group_range(0); + + uint64_t num_elems = 0; + bool target_found = false; + + T *_in = nullptr; + if (group.leader()) { + state.update_counters(); + auto less_count = state.counters.less_count[0]; + bool left = target < less_count; + state.left[0] = left; + + if (left) { + _in = in; + num_elems = state.iteration_counters.less_count[0]; + if (target + 1 == less_count) { + _in[num_elems] = state.pivot[0]; + state.counters.less_count[0] += 1; + num_elems += 1; + } + } + else { + num_elems = + state.iteration_counters.greater_equal_count[0]; + _in = in + state.n - num_elems; + + if (target + 1 < + less_count + + state.iteration_counters.equal_count[0]) { + state.values[0] = state.pivot[0]; + state.values[1] = state.pivot[0]; + + state.stop[0] = true; + state.target_found[0] = true; + target_found = true; + } + } + + state.reset_iteration_counters(); + } + + target_found = + sycl::group_broadcast(group, target_found, 0); + _in = sycl::group_broadcast(group, _in, 0); + num_elems = sycl::group_broadcast(group, num_elems, 0); + + if (target_found) { + return; + } + + if (num_elems <= limit) { + auto gh = sycl_exp::group_with_scratchpad( + group, sycl::span{&scratch[0], temp_memory_size}); + sycl_exp::joint_sort(gh, &_in[0], &_in[num_elems]); + + if (group.leader()) { + uint64_t offset = state.counters.less_count[0]; + if (state.left[0]) { + offset = + state.counters.less_count[0] - num_elems; + } + + uint64_t idx = target - offset; + state.values[0] = _in[idx]; + state.values[1] = _in[idx + 1]; + + state.stop[0] = true; + state.target_found[0] = true; + } + + return; + } + + uint64_t step = num_elems / items_to_sort; + for (uint32_t i = llid; i < items_to_sort; i += local_size) + { + loc_items[i] = std::numeric_limits::max(); + uint32_t idx = i * step; + if (idx < num_elems) { + loc_items[i] = _in[idx]; + } + } + + sycl::group_barrier(group); + + auto gh = sycl_exp::group_with_scratchpad( + group, sycl::span{&scratch[0], 
temp_memory_size}); + sycl_exp::joint_sort(gh, &loc_items[0], + &loc_items[0] + items_to_sort); + + T new_pivot = loc_items[items_to_sort / 2]; + + if (new_pivot != state.pivot[0]) { + if (group.leader()) { + state.pivot[0] = new_pivot; + state.num_elems[0] = num_elems; + } + return; + } + + auto start = llid + items_to_sort / 2 + 1; + uint32_t index = start; + for (uint32_t i = start; i < items_to_sort; i += local_size) + { + if (loc_items[i] != new_pivot) { + index = i; + break; + } + } + + index = sycl::reduce_over_group(group, index, + sycl::minimum<>()); + if (group.leader()) { + state.pivot[0] = loc_items[index]; + state.num_elems[0] = num_elems; + } + }); + }); + + return e; + } + + static sycl::event run_partition(sycl::queue &exec_q, + T *in, + T *out, + PartitionState &state, + const std::vector &deps) + { + + uint32_t group_size = 128; + auto e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(deps); + + constexpr uint32_t WorkPI = 4; // empirically found number + + auto work_range = make_ndrange(state.n, group_size, WorkPI); + submit_partition_one_pivot(cgh, work_range, in, out, + state); + }); + + return e; + } + + static sycl::event run_kth_element(sycl::queue &exec_q, + const T *in, + T *partitioned, + const size_t k, + State &state, + PartitionState &pstate, + const std::vector &depends) + { + uint32_t items_to_sort = 128; + uint32_t limit = 4 * items_to_sort; + uint32_t iterations = + std::ceil(std::log(double(state.n) / limit) / std::log(2)); + + auto temp_buff = dpctl_utils::smart_malloc(state.n, exec_q, + sycl::usm::alloc::device); + + auto prev = run_pick_pivot(exec_q, const_cast(in), partitioned, k, + state, items_to_sort, limit, depends); + prev = run_partition(exec_q, const_cast(in), partitioned, pstate, + {prev}); + + T *_in = partitioned; + T *_out = temp_buff.get(); + for (uint32_t i = 0; i < iterations - 1; ++i) { + prev = run_pick_pivot(exec_q, _in, _out, k, state, limit, + items_to_sort, {prev}); + prev = run_partition(exec_q, _in, _out, pstate, {prev}); + std::swap(_in, _out); + } + prev = run_pick_pivot(exec_q, _in, _out, k, state, limit, items_to_sort, + {prev}); + + return prev; + } + + static std::tuple + impl(sycl::queue &exec_queue, + const void *v_ain, + void *v_partitioned, + const size_t a_size, + const size_t k, + const std::vector &depends) + { + const T *ain = static_cast(v_ain); + T *partitioned = static_cast(v_partitioned); + + State state(exec_queue, a_size, partitioned); + PartitionState pstate(state); + + auto init_e = state.init(exec_queue, depends); + init_e = pstate.init(exec_queue, {init_e}); + + auto evt = run_kth_element(exec_queue, ain, partitioned, k, state, + pstate, {init_e}); + + bool found = false; + bool left = false; + uint64_t less_count = 0; + uint64_t greater_equal_count = 0; + uint64_t num_elems = 0; + auto copy_evt = exec_queue.copy(state.target_found, &found, 1, evt); + copy_evt = exec_queue.copy(state.left, &left, 1, copy_evt); + copy_evt = exec_queue.copy(state.counters.less_count, &less_count, 1, + copy_evt); + copy_evt = exec_queue.copy(state.counters.greater_equal_count, + &greater_equal_count, 1, copy_evt); + copy_evt = exec_queue.copy(state.num_elems, &num_elems, 1, copy_evt); + + copy_evt.wait(); + + uint64_t buff_offset = 0; + uint64_t elems_offset = less_count; + if (!found) { + if (left) { + elems_offset = less_count - num_elems; + } + else { + buff_offset = a_size - num_elems; + } + } + else { + num_elems = 2; + elems_offset = k; + } + + state.cleanup(exec_queue); + + return {found, buff_offset, 
elems_offset, num_elems}; + } +}; + +using SupportedTypes = + std::tuple; +} // namespace + +KthElement1d::KthElement1d() : dispatch_table("a") +{ + dispatch_table.populate_dispatch_table(); +} + +std::tuple + KthElement1d::call(const dpctl::tensor::usm_ndarray &a, + dpctl::tensor::usm_ndarray &partitioned, + const size_t k, + const std::vector &depends) +{ + // validate(a, partitioned, k); + + const int a_typenum = a.get_typenum(); + auto kth_elem_func = dispatch_table.get(a_typenum); + + auto exec_q = a.get_queue(); + auto result = kth_elem_func(exec_q, a.get_data(), partitioned.get_data(), + a.get_shape(0), k, depends); + + return result; +} + +std::unique_ptr kth; + +void statistics::partitioning::populate_kth_element1d(py::module_ m) +{ + using namespace std::placeholders; + + kth.reset(new KthElement1d()); + + auto kth_func = [kthp = kth.get()]( + const dpctl::tensor::usm_ndarray &a, + dpctl::tensor::usm_ndarray &partitioned, const size_t k, + const std::vector &depends) { + return kthp->call(a, partitioned, k, depends); + }; + + m.def("kth_element", kth_func, "finding k and k+1 elements.", py::arg("a"), + py::arg("partitioned"), py::arg("k"), + py::arg("depends") = py::list()); + + auto kth_dtypes = [kthp = kth.get()]() { + return kthp->dispatch_table.get_all_supported_types(); + }; + + m.def("kth_element_dtypes", kth_dtypes, + "Get the supported data types for kth_element."); +} diff --git a/dpnp/backend/extensions/statistics/kth_element1d.hpp b/dpnp/backend/extensions/statistics/kth_element1d.hpp new file mode 100644 index 000000000000..06219823423b --- /dev/null +++ b/dpnp/backend/extensions/statistics/kth_element1d.hpp @@ -0,0 +1,56 @@ +//***************************************************************************** +// Copyright (c) 2024-2025, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#pragma once + +#include "ext/dispatch_table.hpp" +#include +#include + +namespace statistics::partitioning +{ +struct KthElement1d +{ + using FnT = std::tuple (*)( + sycl::queue &, + const void *, + void *, + const size_t, + const size_t, + const std::vector &); + + ext::common::DispatchTable dispatch_table; + + KthElement1d(); + + std::tuple + call(const dpctl::tensor::usm_ndarray &a, + dpctl::tensor::usm_ndarray &partitioned, + uint64_t k, + const std::vector &depends); +}; + +void populate_kth_element1d(py::module_ m); +} // namespace statistics::partitioning diff --git a/dpnp/backend/extensions/statistics/partitioning.hpp b/dpnp/backend/extensions/statistics/partitioning.hpp new file mode 100644 index 000000000000..b249272a9c74 --- /dev/null +++ b/dpnp/backend/extensions/statistics/partitioning.hpp @@ -0,0 +1,356 @@ +//***************************************************************************** +// Copyright (c) 2024-2025, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#pragma once + +#include "utils/math_utils.hpp" +#include +#include + +#include + +#include "ext/common.hpp" + +using dpctl::tensor::usm_ndarray; + +using ext::common::AtomicOp; +using ext::common::IsNan; +using ext::common::Less; +using ext::common::make_ndrange; + +namespace statistics::partitioning +{ + +struct Counters +{ + uint64_t *less_count; + uint64_t *equal_count; + uint64_t *greater_equal_count; + uint64_t *nan_count; + + Counters(sycl::queue &queue) + { + less_count = sycl::malloc_device(1, queue); + equal_count = sycl::malloc_device(1, queue); + greater_equal_count = sycl::malloc_device(1, queue); + nan_count = sycl::malloc_device(1, queue); + }; + + void cleanup(sycl::queue &queue) + { + sycl::free(less_count, queue); + sycl::free(equal_count, queue); + sycl::free(greater_equal_count, queue); + sycl::free(nan_count, queue); + } +}; + +template +struct State +{ + Counters counters; + Counters iteration_counters; + + bool *stop; + bool *target_found; + bool *left; + + T *pivot; + T *values; + + size_t *num_elems; + + size_t n; + + State(sycl::queue &queue, size_t _n, T *values_buff) + : counters(queue), iteration_counters(queue) + { + stop = sycl::malloc_device(1, queue); + target_found = sycl::malloc_device(1, queue); + left = sycl::malloc_device(1, queue); + + pivot = sycl::malloc_device(1, queue); + values = values_buff; + + num_elems = sycl::malloc_device(1, queue); + + n = _n; + } + + sycl::event init(sycl::queue &queue, const std::vector &deps) + { + sycl::event fill_e = + queue.fill(counters.less_count, 0, 1, deps); + fill_e = queue.fill(counters.equal_count, 0, 1, {fill_e}); + fill_e = + queue.fill(counters.greater_equal_count, n, 1, {fill_e}); + fill_e = queue.fill(counters.nan_count, 0, 1, {fill_e}); + fill_e = queue.fill(num_elems, 0, 1, {fill_e}); + fill_e = queue.fill(stop, false, 1, {fill_e}); + fill_e = queue.fill(target_found, false, 1, {fill_e}); + fill_e = queue.fill(left, false, 1, {fill_e}); + fill_e = queue.fill(pivot, 0, 1, {fill_e}); + + return fill_e; + } + + void update_counters() const + { + if (*left) { + counters.less_count[0] -= iteration_counters.greater_equal_count[0]; + counters.greater_equal_count[0] += + iteration_counters.greater_equal_count[0]; + } + else { + counters.less_count[0] += iteration_counters.less_count[0]; + counters.greater_equal_count[0] -= iteration_counters.less_count[0]; + } + counters.equal_count[0] = iteration_counters.equal_count[0]; + counters.nan_count[0] += iteration_counters.nan_count[0]; + } + + void reset_iteration_counters() const + { + iteration_counters.less_count[0] = 0; + iteration_counters.equal_count[0] = 0; + iteration_counters.greater_equal_count[0] = 0; + iteration_counters.nan_count[0] = 0; + } + + void cleanup(sycl::queue &queue) + { + counters.cleanup(queue); + iteration_counters.cleanup(queue); + + sycl::free(stop, queue); + sycl::free(target_found, queue); + sycl::free(left, queue); + + sycl::free(num_elems, queue); + sycl::free(pivot, queue); + } +}; + +template +struct PartitionState +{ + Counters iteration_counters; + + bool *stop; + bool *left; + + T *pivot; + + size_t n; + size_t *num_elems; + + PartitionState(State &state) + : iteration_counters(state.iteration_counters) + { + stop = state.stop; + left = state.left; + + num_elems = state.num_elems; + pivot = state.pivot; + + n = state.n; + } + + sycl::event init(sycl::queue &queue, const std::vector &deps) + { + sycl::event fill_e = + 
queue.fill(iteration_counters.less_count, n, 1, deps); + fill_e = queue.fill(iteration_counters.equal_count, 0, 1, + {fill_e}); + fill_e = queue.fill(iteration_counters.greater_equal_count, 0, + 1, {fill_e}); + fill_e = + queue.fill(iteration_counters.nan_count, 0, 1, {fill_e}); + + return fill_e; + } +}; + +template +class partition_one_pivot_kernel; + +template +void submit_partition_one_pivot(sycl::handler &cgh, + sycl::nd_range<1> work_sz, + T *in, + T *out, + PartitionState &state) +{ + auto loc_counters = + sycl::local_accessor(sycl::range<1>(4), cgh); + cgh.parallel_for>( + work_sz, [=](sycl::nd_item<1> item) { + if (state.stop[0]) + return; + + auto group = item.get_group(); + uint64_t items_per_group = group.get_local_range(0) * WorkPI; + uint64_t num_elems = state.num_elems[0]; + + if (group.get_group_id(0) * items_per_group >= num_elems) + return; + + T *_in = nullptr; + if (state.left[0]) { + _in = in; + } + else { + _in = in + state.n - num_elems; + } + + auto value = state.pivot[0]; + + auto sbg = item.get_sub_group(); + uint32_t sbg_size = sbg.get_max_local_range()[0]; + + uint64_t i_base = + (item.get_global_linear_id() - sbg.get_local_linear_id()) * + WorkPI; + + if (group.leader()) { + loc_counters[0] = 0; + loc_counters[1] = 0; + loc_counters[2] = 0; + } + + sycl::group_barrier(group); + + uint32_t less_count = 0; + uint32_t equal_count = 0; + uint32_t greater_equal_count = 0; + uint32_t nan_count = 0; + + T values[WorkPI]; + uint32_t actual_count = 0; + uint64_t local_i_base = i_base + sbg.get_local_linear_id(); + + for (uint32_t _i = 0; _i < WorkPI; ++_i) { + auto i = local_i_base + _i * sbg_size; + if (i < num_elems) { + values[_i] = _in[i]; + less_count += Less{}(values[_i], value); + equal_count += values[_i] == value; + nan_count += IsNan::isnan(values[_i]); + actual_count++; + } + } + + greater_equal_count = actual_count - less_count; + + auto sbg_less_equal = + sycl::reduce_over_group(sbg, less_count, sycl::plus<>()); + auto sbg_equal = + sycl::reduce_over_group(sbg, equal_count, sycl::plus<>()); + auto sbg_greater = sycl::reduce_over_group(sbg, greater_equal_count, + sycl::plus<>()); + + uint32_t local_less_offset = 0; + uint32_t local_gr_offset = 0; + if (sbg.leader()) { + sycl::atomic_ref + gr_less_eq(loc_counters[0]); + local_less_offset = gr_less_eq.fetch_add(sbg_less_equal); + + sycl::atomic_ref + gr_eq(loc_counters[1]); + gr_eq += sbg_equal; + + sycl::atomic_ref + gr_greater(loc_counters[2]); + local_gr_offset = gr_greater.fetch_add(sbg_greater); + } + + local_less_offset = + sycl::select_from_group(sbg, local_less_offset, 0); + local_gr_offset = sycl::select_from_group(sbg, local_gr_offset, 0); + + sycl::group_barrier(group); + + if (group.leader()) { + sycl::atomic_ref + glbl_less_eq(state.iteration_counters.less_count[0]); + auto global_less_eq_offset = + glbl_less_eq.fetch_add(loc_counters[0]); + + sycl::atomic_ref + glbl_eq(state.iteration_counters.equal_count[0]); + glbl_eq += loc_counters[1]; + + sycl::atomic_ref + glbl_greater( + state.iteration_counters.greater_equal_count[0]); + auto global_gr_offset = glbl_greater.fetch_add(loc_counters[2]); + + loc_counters[0] = global_less_eq_offset; + loc_counters[2] = global_gr_offset; + } + + sycl::group_barrier(group); + + auto sbg_less_offset = loc_counters[0] + local_less_offset; + auto sbg_gr_offset = + state.n - (loc_counters[2] + local_gr_offset + sbg_greater); + + uint32_t le_item_offset = 0; + uint32_t gr_item_offset = 0; + + for (uint32_t _i = 0; _i < WorkPI; ++_i) { + uint32_t less = values[_i] < 
value; + auto le_pos = + sycl::exclusive_scan_over_group(sbg, less, sycl::plus<>()); + auto ge_pos = sbg.get_local_linear_id() - le_pos; + + auto total_le = + sycl::reduce_over_group(sbg, less, sycl::plus<>()); + auto total_gr = sbg_size - total_le; + + if (_i < actual_count) { + if (less) { + out[sbg_less_offset + le_item_offset + le_pos] = + values[_i]; + } + else { + out[sbg_gr_offset + gr_item_offset + ge_pos] = + values[_i]; + } + le_item_offset += total_le; + gr_item_offset += total_gr; + } + } + }); +} + +} // namespace statistics::partitioning diff --git a/dpnp/backend/extensions/statistics/statistics_py.cpp b/dpnp/backend/extensions/statistics/statistics_py.cpp index 6636d3f7d531..757ec85c6222 100644 --- a/dpnp/backend/extensions/statistics/statistics_py.cpp +++ b/dpnp/backend/extensions/statistics/statistics_py.cpp @@ -32,12 +32,14 @@ #include "bincount.hpp" #include "histogram.hpp" #include "histogramdd.hpp" +#include "kth_element1d.hpp" #include "sliding_dot_product1d.hpp" PYBIND11_MODULE(_statistics_impl, m) { statistics::histogram::populate_bincount(m); statistics::histogram::populate_histogram(m); + statistics::partitioning::populate_kth_element1d(m); statistics::sliding_window1d::populate_sliding_dot_product1d(m); statistics::histogram::populate_histogramdd(m); } From a8a93da85de93f2cba2b79e2e8a89f0de989acfe Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Wed, 23 Apr 2025 16:33:39 +0200 Subject: [PATCH 2/3] Call implementation from python --- .../extensions/statistics/kth_element1d.cpp | 156 ++++++++++++------ .../extensions/statistics/kth_element1d.hpp | 5 +- .../extensions/statistics/partitioning.hpp | 1 + dpnp/dpnp_utils/dpnp_utils_statistics.py | 41 +++++ 4 files changed, 153 insertions(+), 50 deletions(-) diff --git a/dpnp/backend/extensions/statistics/kth_element1d.cpp b/dpnp/backend/extensions/statistics/kth_element1d.cpp index 9e2a2e235886..7a07f568c9f0 100644 --- a/dpnp/backend/extensions/statistics/kth_element1d.cpp +++ b/dpnp/backend/extensions/statistics/kth_element1d.cpp @@ -40,7 +40,8 @@ #include "kth_element1d.hpp" #include "partitioning.hpp" -// #include +#include +#include namespace sycl_exp = sycl::ext::oneapi::experimental; namespace dpctl_td_ns = dpctl::tensor::type_dispatch; @@ -67,6 +68,7 @@ struct KthElementF State &state, uint64_t items_to_sort, uint64_t limit, + bool ret, const std::vector &deps) { auto e = queue.submit([&](sycl::handler &cgh) { @@ -84,6 +86,12 @@ struct KthElementF auto scratch = sycl::local_accessor( sycl::range<1>(temp_memory_size), cgh); + // std::cout << "temp_memory_size: " << temp_memory_size + // << " items_to_sort: " << items_to_sort + // << " limit: " << limit + // << " group_size: " << group_size << "\n"; + + // auto str = sycl::stream(8192, 1024, cgh); cgh.parallel_for>( work_sz, [=](sycl::nd_item<1> item) { auto group = item.get_group(); @@ -129,7 +137,6 @@ struct KthElementF target_found = true; } } - state.reset_iteration_counters(); } @@ -142,10 +149,15 @@ struct KthElementF return; } + // if (group.leader()) { + // str << "num_elems: " << num_elems << "\n"; + // } + if (num_elems <= limit) { auto gh = sycl_exp::group_with_scratchpad( group, sycl::span{&scratch[0], temp_memory_size}); - sycl_exp::joint_sort(gh, &_in[0], &_in[num_elems]); + if (num_elems > 0) + sycl_exp::joint_sort(gh, &_in[0], &_in[num_elems]); if (group.leader()) { uint64_t offset = state.counters.less_count[0]; @@ -154,9 +166,18 @@ struct KthElementF state.counters.less_count[0] - num_elems; } - uint64_t idx = target - offset; - 
state.values[0] = _in[idx]; - state.values[1] = _in[idx + 1]; + int64_t idx = target - offset; + + // if (idx + 1 > (in + state.n - _in) || idx < 0) + // { + // str << "buffer access out of bounds idx = " + // << idx << " size " << (in + state.n - _in) << "\n"; + // } + // else + { + state.values[0] = _in[idx]; + state.values[1] = _in[idx + 1]; + } state.stop[0] = true; state.target_found[0] = true; @@ -165,6 +186,9 @@ struct KthElementF return; } + // if (ret) + // return; + uint64_t step = num_elems / items_to_sort; for (uint32_t i = llid; i < items_to_sort; i += local_size) { @@ -184,30 +208,30 @@ struct KthElementF T new_pivot = loc_items[items_to_sort / 2]; - if (new_pivot != state.pivot[0]) { + // if (new_pivot != state.pivot[0]) { if (group.leader()) { state.pivot[0] = new_pivot; state.num_elems[0] = num_elems; } return; - } - - auto start = llid + items_to_sort / 2 + 1; - uint32_t index = start; - for (uint32_t i = start; i < items_to_sort; i += local_size) - { - if (loc_items[i] != new_pivot) { - index = i; - break; - } - } - - index = sycl::reduce_over_group(group, index, - sycl::minimum<>()); - if (group.leader()) { - state.pivot[0] = loc_items[index]; - state.num_elems[0] = num_elems; - } + // } + + // auto start = llid + items_to_sort / 2 + 1; + // uint32_t index = start; + // for (uint32_t i = start; i < items_to_sort; i += local_size) + // { + // if (loc_items[i] != new_pivot) { + // index = i; + // break; + // } + // } + + // index = sycl::reduce_over_group(group, index, + // sycl::minimum<>()); + // if (group.leader()) { + // state.pivot[0] = loc_items[index]; + // state.num_elems[0] = num_elems; + // } }); }); @@ -225,7 +249,7 @@ struct KthElementF auto e = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(deps); - constexpr uint32_t WorkPI = 4; // empirically found number + constexpr uint32_t WorkPI = 1; // empirically found number auto work_range = make_ndrange(state.n, group_size, WorkPI); submit_partition_one_pivot(cgh, work_range, in, out, @@ -243,34 +267,42 @@ struct KthElementF PartitionState &pstate, const std::vector &depends) { - uint32_t items_to_sort = 128; - uint32_t limit = 4 * items_to_sort; + uint32_t items_to_sort = 127; + uint32_t limit = 4 * (items_to_sort + 1); uint32_t iterations = - std::ceil(std::log(double(state.n) / limit) / std::log(2)); + std::ceil(-std::log(double(state.n) / limit) / std::log(0.536)) + 1; + // Ensure iterations are odd so the final result is always stored in 'partitioned' + iterations += 1 - iterations % 2; auto temp_buff = dpctl_utils::smart_malloc(state.n, exec_q, sycl::usm::alloc::device); + std::cout << "Iteration " << 0 << std::endl; auto prev = run_pick_pivot(exec_q, const_cast(in), partitioned, k, - state, items_to_sort, limit, depends); + state, items_to_sort, limit, false, depends); prev = run_partition(exec_q, const_cast(in), partitioned, pstate, {prev}); + // prev.wait(); T *_in = partitioned; T *_out = temp_buff.get(); for (uint32_t i = 0; i < iterations - 1; ++i) { - prev = run_pick_pivot(exec_q, _in, _out, k, state, limit, - items_to_sort, {prev}); + std::cout << "Iteration " << i + 1 << std::endl; + prev = run_pick_pivot(exec_q, _in, _out, k, state, + items_to_sort, limit, true, {prev}); prev = run_partition(exec_q, _in, _out, pstate, {prev}); std::swap(_in, _out); + // prev.wait(); + // if (i % 5 == 0) + // prev.wait(); } - prev = run_pick_pivot(exec_q, _in, _out, k, state, limit, items_to_sort, - {prev}); + prev = run_pick_pivot(exec_q, _in, _out, k, state, items_to_sort, limit, + true, {prev}); return 
prev; } - static std::tuple + static KthElement1d::RetT impl(sycl::queue &exec_queue, const void *v_ain, void *v_partitioned, @@ -278,12 +310,14 @@ struct KthElementF const size_t k, const std::vector &depends) { + auto start = std::chrono::high_resolution_clock::now(); const T *ain = static_cast(v_ain); T *partitioned = static_cast(v_partitioned); State state(exec_queue, a_size, partitioned); PartitionState pstate(state); + exec_queue.wait(); auto init_e = state.init(exec_queue, depends); init_e = pstate.init(exec_queue, {init_e}); @@ -295,6 +329,7 @@ struct KthElementF uint64_t less_count = 0; uint64_t greater_equal_count = 0; uint64_t num_elems = 0; + uint64_t nan_count = 0; auto copy_evt = exec_queue.copy(state.target_found, &found, 1, evt); copy_evt = exec_queue.copy(state.left, &left, 1, copy_evt); copy_evt = exec_queue.copy(state.counters.less_count, &less_count, 1, @@ -302,27 +337,52 @@ struct KthElementF copy_evt = exec_queue.copy(state.counters.greater_equal_count, &greater_equal_count, 1, copy_evt); copy_evt = exec_queue.copy(state.num_elems, &num_elems, 1, copy_evt); - - copy_evt.wait(); + copy_evt = exec_queue.copy(state.counters.nan_count, &nan_count, 1, copy_evt); uint64_t buff_offset = 0; uint64_t elems_offset = less_count; - if (!found) { - if (left) { - elems_offset = less_count - num_elems; + + try + { + copy_evt.wait(); + + if (!found) { + if (left) { + elems_offset = less_count - num_elems; + } + else { + buff_offset = a_size - num_elems; + } } else { - buff_offset = a_size - num_elems; + num_elems = 2; + elems_offset = k; } + + state.cleanup(exec_queue); + auto end = std::chrono::high_resolution_clock::now(); + + auto duration = + std::chrono::duration_cast(end - start) + .count(); + + std::cout << "KthElement1d took " << duration << " microseconds" + << std::endl; + + std::cout << "Found " << found << " left " << left + << " less_count " << less_count + << " greater_equal_count " << greater_equal_count + << " num_elems " << num_elems + << " nan_count " << nan_count + << std::endl; + /* code */ } - else { - num_elems = 2; - elems_offset = k; + catch (sycl::exception const &e) + { + std::cout << e.what() << std::endl; } - state.cleanup(exec_queue); - - return {found, buff_offset, elems_offset, num_elems}; + return {found, buff_offset, elems_offset, num_elems, nan_count}; } }; @@ -335,7 +395,7 @@ KthElement1d::KthElement1d() : dispatch_table("a") dispatch_table.populate_dispatch_table(); } -std::tuple +KthElement1d::RetT KthElement1d::call(const dpctl::tensor::usm_ndarray &a, dpctl::tensor::usm_ndarray &partitioned, const size_t k, diff --git a/dpnp/backend/extensions/statistics/kth_element1d.hpp b/dpnp/backend/extensions/statistics/kth_element1d.hpp index 06219823423b..0507206e439b 100644 --- a/dpnp/backend/extensions/statistics/kth_element1d.hpp +++ b/dpnp/backend/extensions/statistics/kth_element1d.hpp @@ -33,7 +33,8 @@ namespace statistics::partitioning { struct KthElement1d { - using FnT = std::tuple (*)( + using RetT = std::tuple; + using FnT = RetT (*)( sycl::queue &, const void *, void *, @@ -45,7 +46,7 @@ struct KthElement1d KthElement1d(); - std::tuple + RetT call(const dpctl::tensor::usm_ndarray &a, dpctl::tensor::usm_ndarray &partitioned, uint64_t k, diff --git a/dpnp/backend/extensions/statistics/partitioning.hpp b/dpnp/backend/extensions/statistics/partitioning.hpp index b249272a9c74..748a4a16f01c 100644 --- a/dpnp/backend/extensions/statistics/partitioning.hpp +++ b/dpnp/backend/extensions/statistics/partitioning.hpp @@ -205,6 +205,7 @@ void 
submit_partition_one_pivot(sycl::handler &cgh, { auto loc_counters = sycl::local_accessor(sycl::range<1>(4), cgh); + // sycl::stream str(8192, 1024, cgh); cgh.parallel_for>( work_sz, [=](sycl::nd_item<1> item) { if (state.stop[0]) diff --git a/dpnp/dpnp_utils/dpnp_utils_statistics.py b/dpnp/dpnp_utils/dpnp_utils_statistics.py index 108fda7286fc..47fab796d7d3 100644 --- a/dpnp/dpnp_utils/dpnp_utils_statistics.py +++ b/dpnp/dpnp_utils/dpnp_utils_statistics.py @@ -29,6 +29,9 @@ import dpctl.tensor as dpt from dpctl.tensor._numpy_helper import normalize_axis_tuple from dpctl.utils import ExecutionPlacementError +import dpnp.backend.extensions.statistics._statistics_impl as statistics_ext + +import dpctl.utils as dpu import dpnp from dpnp.dpnp_array import dpnp_array @@ -190,6 +193,41 @@ def dpnp_cov( c = dpnp.dot(x, x_t.conj()) / fact return c.squeeze() +def native_median(a): + + partitioned = dpnp.empty_like(a) + a_usm = dpnp.get_usm_ndarray(a) + partitioned_usm = dpnp.get_usm_ndarray(partitioned) + + _manager = dpu.SequentialOrderManager[a.sycl_queue] + + result = dpnp.empty_like(a, shape = 1) + k = a.shape[0] // 2 + + found, buff_offset, elems_offset, num_elems, nan_count = statistics_ext.kth_element( + a_usm, + partitioned_usm, + k, + depends=_manager.submitted_events, + ) + + if found: + if a.shape[0] % 2 == 0: + # even number of elements + result[0] = (partitioned[0] + partitioned[1]) / 2 + else: + result[0] = partitioned[0] + else: + partitioned[buff_offset:buff_offset + num_elems].sort() + kth_idx = buff_offset + k - elems_offset + if a.shape[0] % 2 == 0: + # even number of elements + result[0] = (partitioned[kth_idx] + partitioned[kth_idx + 1]) / 2 + else: + result[0] = partitioned[kth_idx] + + return result + def dpnp_median( a, @@ -223,6 +261,9 @@ def dpnp_median( ) axis = -1 + if not isinstance(a.dtype, dpnp.complexfloating) and not ignore_nan and a_ndim == 1: + return native_median(a) + if overwrite_input: if isinstance(a, dpt.usm_ndarray): # dpnp.ndarray.sort only works with dpnp_array From a518e0c7bba60643a652a08765cb8d13e8f7790c Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Wed, 23 Apr 2025 17:05:42 +0200 Subject: [PATCH 3/3] Add sync --- dpnp/backend/extensions/statistics/kth_element1d.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dpnp/backend/extensions/statistics/kth_element1d.cpp b/dpnp/backend/extensions/statistics/kth_element1d.cpp index 7a07f568c9f0..4e5817ca97cc 100644 --- a/dpnp/backend/extensions/statistics/kth_element1d.cpp +++ b/dpnp/backend/extensions/statistics/kth_element1d.cpp @@ -296,6 +296,7 @@ struct KthElementF // if (i % 5 == 0) // prev.wait(); } + prev.wait(); prev = run_pick_pivot(exec_q, _in, _out, k, state, items_to_sort, limit, true, {prev}); @@ -359,7 +360,6 @@ struct KthElementF elems_offset = k; } - state.cleanup(exec_queue); auto end = std::chrono::high_resolution_clock::now(); auto duration = @@ -382,6 +382,7 @@ struct KthElementF std::cout << e.what() << std::endl; } + state.cleanup(exec_queue); return {found, buff_offset, elems_offset, num_elems, nan_count}; } };
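
Note (illustration only, not part of the patch series): the kernels above implement an iterative select-k instead of a full sort. run_pick_pivot sorts a strided sample of the surviving partition to choose a pivot and, once that partition fits within `limit` elements, sorts it outright and records the k-th and (k+1)-th values; run_partition then scatters the data to the two sides of the pivot so the next pass only scans the side that still contains the target, and run_kth_element precomputes the number of passes from log(n / limit). The host-side sketch below mirrors that flow under simplified assumptions: plain NumPy, no NaN handling, and kth_element_sketch together with its limit and sample-size constants are illustrative names, not dpnp API.

import numpy as np


def kth_element_sketch(a, k, limit=512):
    """Return (sorted(a)[k], sorted(a)[k + 1]) without fully sorting `a`."""
    chunk = np.array(a)  # the slice of the (virtual) sorted array still in play
    offset = 0           # elements of `a` known to sort before `chunk`
    while chunk.size > limit:
        # Pivot = median of a strided sample, like run_pick_pivot's loc_items sort.
        sample = np.sort(chunk[:: max(1, chunk.size // 128)])
        pivot = sample[sample.size // 2]

        less = chunk[chunk < pivot]
        greater_equal = chunk[chunk >= pivot]
        if greater_equal.size == chunk.size:
            break  # pivot did not split the chunk (duplicate-heavy data); just sort it

        if k + 1 < offset + less.size:
            chunk = less                 # both targets fall left of the pivot
        elif k + 1 == offset + less.size:
            # k is the last "less" element; the (k+1)-th value is the pivot itself,
            # mirroring the `target + 1 == less_count` branch in run_pick_pivot.
            chunk = np.append(less, pivot)
        else:
            offset += less.size
            chunk = greater_equal       # both targets fall right of the pivot

    chunk.sort()  # small remainder: sort it, like the `num_elems <= limit` branch
    idx = k - offset
    return chunk[idx], chunk[idx + 1]


rng = np.random.default_rng(0)
data = rng.normal(size=10_001)
k = data.size // 2
lo, hi = kth_element_sketch(data, k)
ref = np.sort(data)
assert lo == ref[k] and hi == ref[k + 1]

On the Python side, native_median consumes the same kind of result: when the kernel reports the target as found it averages the two returned values for even-sized input, and otherwise it sorts only the small surviving slice of `partitioned` around the reported offsets, which is the intended saving over the full sort path in dpnp_median.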