[BugFix] Fix topn with large limit offset regression (backport #56590)

stdpain · stdpain · commit 0abde0fd663b · 2025-03-24T17:13:44.000+08:00
Signed-off-by: stdpain <drfeng08@gmail.com>

Signed-off-by: stdpain <34912776+stdpain@users.noreply.github.com>
diff --git a/be/src/exec/chunks_sorter.cpp b/be/src/exec/chunks_sorter.cpp
@@ -24,6 +24,7 @@
 #include "runtime/current_thread.h"
 #include "runtime/runtime_state.h"
 #include "util/orlp/pdqsort.h"
+#include "util/runtime_profile.h"
 #include "util/stopwatch.hpp"
 
 namespace starrocks {
@@ -58,6 +59,7 @@ void ChunksSorter::setup_runtime(RuntimeState* state, RuntimeProfile* profile, M
     _sort_timer = ADD_TIMER(profile, "SortingTime");
     _merge_timer = ADD_TIMER(profile, "MergingTime");
     _output_timer = ADD_TIMER(profile, "OutputTime");
+    _sort_cnt = ADD_COUNTER(profile, "SortingCnt", TUnit::UNIT);
     profile->add_info_string("SortKeys", _sort_keys);
     profile->add_info_string("SortType", _is_topn ? "TopN" : "All");
 }
diff --git a/be/src/exec/chunks_sorter.h b/be/src/exec/chunks_sorter.h
@@ -154,6 +154,7 @@ class ChunksSorter {
     RuntimeProfile::Counter* _sort_timer = nullptr;
     RuntimeProfile::Counter* _merge_timer = nullptr;
     RuntimeProfile::Counter* _output_timer = nullptr;
+    RuntimeProfile::Counter* _sort_cnt = nullptr;
 
     size_t _revocable_mem_bytes = 0;
     spill::SpillStrategy _spill_strategy = spill::SpillStrategy::NO_SPILL;
diff --git a/be/src/exec/chunks_sorter_topn.cpp b/be/src/exec/chunks_sorter_topn.cpp
@@ -27,6 +27,7 @@
 #include "runtime/runtime_state.h"
 #include "types/logical_type_infra.h"
 #include "util/orlp/pdqsort.h"
+#include "util/runtime_profile.h"
 #include "util/stopwatch.hpp"
 
 namespace starrocks {
@@ -68,6 +69,8 @@ ChunksSorterTopn::ChunksSorterTopn(RuntimeState* state, const std::vector<ExprCo
           _topn_type(topn_type) {
     DCHECK_GT(_get_number_of_rows_to_sort(), 0) << "output rows can't be empty";
     DCHECK(_topn_type == TTopNType::ROW_NUMBER || _offset == 0);
+    _init_buffered_chunks = tunning_buffered_chunks(_get_number_of_rows_to_sort());
+    _buffered_chunks_capacity = _init_buffered_chunks;
     auto& raw_chunks = _raw_chunks.chunks;
     // avoid too large buffer chunks
     raw_chunks.reserve(std::min<size_t>(max_buffered_chunks, 256));
@@ -88,43 +91,35 @@ Status ChunksSorterTopn::update(RuntimeState* state, const ChunkPtr& chunk) {
     }
     auto& raw_chunks = _raw_chunks.chunks;
     size_t chunk_number = raw_chunks.size();
+    size_t prev_chunk_memusage = 0;
     if (chunk_number <= 0) {
         raw_chunks.push_back(chunk);
         chunk_number++;
     } else if (raw_chunks[chunk_number - 1]->num_rows() + chunk->num_rows() > _state->chunk_size()) {
         raw_chunks.push_back(chunk);
         chunk_number++;
     } else {
+        prev_chunk_memusage = raw_chunks[chunk_number - 1]->memory_usage();
         // Old planner will not remove duplicated sort column.
         // columns in chunk may have same column ptr
         // append_safe will check size of all columns in dest chunk
         // to ensure same column will not apppend repeatedly.
         raw_chunks[chunk_number - 1]->append_safe(*chunk);
     }
+    _raw_chunks.update_mem_usage(raw_chunks[chunk_number - 1]->memory_usage() - prev_chunk_memusage);
     _raw_chunks.size_of_rows += chunk->num_rows();
 
     // Avoid TOPN from using too much memory.
-    bool exceed_mem_limit = _raw_chunks.mem_usage() > _max_buffered_bytes;
+    bool exceed_mem_limit = _raw_chunks.mem_usage > _max_buffered_bytes;
     if (exceed_mem_limit) {
         return _sort_chunks(state);
     }
 
-    // Try to accumulate more chunks.
-    size_t rows_to_sort = _get_number_of_rows_to_sort();
-    if (_merged_runs.num_rows() + _raw_chunks.size_of_rows < rows_to_sort) {
-        return Status::OK();
-    }
-
-    // We have accumulated rows_to_sort rows to build merged runs.
-    if (_merged_runs.num_rows() <= rows_to_sort) {
-        return _sort_chunks(state);
-    }
-
-    // When number of Chunks exceeds _limit or _max_buffered_chunks, run sort and then part of
+    // When number of Chunks exceeds _buffered_chunks_capacity or rows greater than _max_buffered_rows , run sort and then part of
     // cached chunks can be dropped, so it can reduce the memory usage.
     // TopN caches _limit or _max_buffered_chunks primitive chunks,
     // performs sorting once, and discards extra rows
-    if (chunk_number >= _max_buffered_chunks || _raw_chunks.size_of_rows > _max_buffered_rows) {
+    if (chunk_number >= _buffered_chunks_capacity || _raw_chunks.size_of_rows > _max_buffered_rows) {
         return _sort_chunks(state);
     }
 
@@ -228,6 +223,9 @@ size_t ChunksSorterTopn::get_output_rows() const {
 }
 
 Status ChunksSorterTopn::_sort_chunks(RuntimeState* state) {
+    if (_sort_cnt) {
+        COUNTER_UPDATE(_sort_cnt, 1);
+    }
     // Chunks for this batch.
     DataSegments segments;
 
@@ -569,6 +567,7 @@ Status ChunksSorterTopn::_merge_sort_common(MergedRuns* dst, DataSegments& segme
     }
 
     if (_merged_runs.num_chunks() > 1 || _merged_runs.mem_usage() > _max_buffered_bytes) {
+        _adjust_chunks_capacity(true);
         // merge to multi sorted chunks
         RETURN_IF_ERROR(merge_sorted_chunks(_sort_desc, _sort_exprs, _merged_runs, std::move(right_unique_chunk),
                                             rows_to_keep, dst));
@@ -583,24 +582,31 @@ Status ChunksSorterTopn::_merge_sort_common(MergedRuns* dst, DataSegments& segme
         // prepare right chunk
         ChunkPtr right_chunk = std::move(right_unique_chunk);
 
-        Permutation merged_perm;
-        merged_perm.reserve(left_chunk->num_rows() + right_chunk->num_rows());
-
-        RETURN_IF_ERROR(merge_sorted_chunks_two_way(_sort_desc, {left_chunk, left_columns},
-                                                    {right_chunk, right_columns}, &merged_perm));
-        CHECK_GE(merged_perm.size(), rows_to_keep);
-        merged_perm.resize(rows_to_keep);
+        const SortedRun left = {left_chunk, left_columns};
+        const SortedRun right = {right_chunk, right_columns};
+        bool intersected = !left.empty() && !right.empty() && !left.intersect(_sort_desc, right);
+        // adjust chunks capacity
+        _adjust_chunks_capacity(intersected);
 
-        // materialize into the dst runs
-        std::vector<ChunkPtr> chunks{left_chunk, right_chunk};
         ChunkUniquePtr big_chunk;
         if (dst->num_chunks() == 0) {
             big_chunk = segments[permutation_second[0].chunk_index].chunk->clone_empty(rows_to_keep);
         } else {
             big_chunk = std::move(dst->front().chunk);
             dst->pop_front();
         }
+
+        Permutation merged_perm;
+        merged_perm.reserve(left_chunk->num_rows() + right_chunk->num_rows());
+        RETURN_IF_ERROR(merge_sorted_chunks_two_way(_sort_desc, left, right, &merged_perm));
+        CHECK_GE(merged_perm.size(), rows_to_keep);
+        merged_perm.resize(rows_to_keep);
+
+        // materialize into the dst runs
+        std::vector<ChunkPtr> chunks{left_chunk, right_chunk};
+
         materialize_by_permutation(big_chunk.get(), chunks, merged_perm);
+
         RETURN_IF_ERROR(big_chunk->upgrade_if_overflow());
         ASSIGN_OR_RETURN(auto run, MergedRun::build(std::move(big_chunk), *_sort_exprs));
         dst->push_back(std::move(run));
diff --git a/be/src/exec/chunks_sorter_topn.h b/be/src/exec/chunks_sorter_topn.h
@@ -48,7 +48,11 @@ class ChunksSorterTopn : public ChunksSorter {
         if (limit <= 65536) {
             return 64;
         }
-        return std::max<size_t>(256, limit / 4096);
+        return 256;
+    }
+
+    static constexpr size_t max_buffered_chunks(size_t rows_to_sort) { // cap: never below the tuned value
+        return std::max<size_t>(tunning_buffered_chunks(rows_to_sort), rows_to_sort / 4096); // same 4096 divisor as before
     }
 
     /**
@@ -78,7 +82,7 @@ class ChunksSorterTopn : public ChunksSorter {
 
     size_t get_output_rows() const override;
 
-    int64_t mem_usage() const override { return _raw_chunks.mem_usage() + _merged_runs.mem_usage(); }
+    int64_t mem_usage() const override { return _raw_chunks.mem_usage + _merged_runs.mem_usage(); }
 
     void setup_runtime(RuntimeState* state, RuntimeProfile* profile, MemTracker* parent_mem_tracker) override;
 
@@ -154,23 +158,20 @@ class ChunksSorterTopn : public ChunksSorter {
     struct RawChunks {
         std::vector<ChunkPtr> chunks;
         size_t size_of_rows = 0;
+        size_t mem_usage = 0; // cached memory-usage total; changed only by update_mem_usage() and clear()
 
-        int64_t mem_usage() const {
-            int64_t usage = 0;
-            for (auto& chunk : chunks) {
-                usage += chunk->memory_usage();
-            }
-            return usage;
-        }
+        void update_mem_usage(size_t delta) { mem_usage += delta; } // adds delta to the cached total
 
         void clear() {
             chunks.clear();
             size_of_rows = 0;
+            mem_usage = 0; // keep the cached total consistent with the now-empty chunk list
         }
     };
     const size_t _max_buffered_rows;
     const size_t _max_buffered_bytes;
     const size_t _max_buffered_chunks;
+    size_t _init_buffered_chunks;
     RawChunks _raw_chunks;
     bool _init_merged_segment;
     MergedRuns _merged_runs;
@@ -179,6 +180,23 @@ class ChunksSorterTopn : public ChunksSorter {
     const size_t _offset;
     const TTopNType::type _topn_type;
 
+    int _highest_nozero_pos(size_t val) { // 0-based index of the most significant set bit of val
+        if (val == 0) { // handled up front: __builtin_clzll has an undefined result for 0
+            return 0; // note: same value as returned for val == 1
+        }
+        return (sizeof(size_t) * 8) - __builtin_clzll(val) - 1; // NOTE(review): assumes size_t is 64 bits wide
+    }
+
+    void _adjust_chunks_capacity(bool inc) { // grows _buffered_chunks_capacity when inc is true; no-op otherwise
+        if (inc) {
+            int gap = _highest_nozero_pos(_max_buffered_chunks) - _highest_nozero_pos(_init_buffered_chunks);
+            size_t shift = std::max<size_t>(std::max(gap, 0) / 4, 1); // clamp: a negative gap would wrap to a huge shift
+            _buffered_chunks_capacity = _buffered_chunks_capacity << shift;
+            _buffered_chunks_capacity = std::min(_buffered_chunks_capacity, _max_buffered_chunks); // never exceed the cap
+        }
+    }
+
+    size_t _buffered_chunks_capacity;
     std::vector<JoinRuntimeFilter*> _runtime_filter;
 
     RuntimeProfile::Counter* _sort_filter_rows = nullptr;
diff --git a/be/src/exec/pipeline/sort/partition_sort_sink_operator.cpp b/be/src/exec/pipeline/sort/partition_sort_sink_operator.cpp
@@ -121,7 +121,7 @@ OperatorPtr PartitionSortSinkOperatorFactory::create(int32_t dop, int32_t driver
                     runtime_state(), &(_sort_exec_exprs.lhs_ordering_expr_ctxs()), &_is_asc_order, &_is_null_first,
                     _sort_keys, 0, _limit + _offset);
         } else {
-            size_t max_buffered_chunks = ChunksSorterTopn::tunning_buffered_chunks(_limit);
+            size_t max_buffered_chunks = ChunksSorterTopn::max_buffered_chunks(_limit);
             chunks_sorter = std::make_unique<ChunksSorterTopn>(
                     runtime_state(), &(_sort_exec_exprs.lhs_ordering_expr_ctxs()), &_is_asc_order, &_is_null_first,
                     _sort_keys, 0, _limit + _offset, _topn_type, _max_buffered_rows, _max_buffered_bytes,
diff --git a/be/src/exec/topn_node.cpp b/be/src/exec/topn_node.cpp
@@ -225,7 +225,7 @@ Status TopNNode::_consume_chunks(RuntimeState* state, ExecNode* child) {
             _chunks_sorter = std::make_unique<ChunksSorterTopn>(
                     state, &(_sort_exec_exprs.lhs_ordering_expr_ctxs()), &_is_asc_order, &_is_null_first, _sort_keys,
                     _offset, _limit, TTopNType::ROW_NUMBER, ChunksSorterTopn::kDefaultMaxBufferRows,
-                    ChunksSorterTopn::kDefaultMaxBufferBytes, ChunksSorterTopn::tunning_buffered_chunks(_limit));
+                    ChunksSorterTopn::kDefaultMaxBufferBytes, ChunksSorterTopn::max_buffered_chunks(_limit));
         }
 
     } else {

Original file line number	Diff line number	Diff line change
`@@ -225,7 +225,7 @@ Status TopNNode::_consume_chunks(RuntimeState* state, ExecNode* child) {`
`225`	`225`	`_chunks_sorter = std::make_unique<ChunksSorterTopn>(`
`226`	`226`	`state, &(_sort_exec_exprs.lhs_ordering_expr_ctxs()), &_is_asc_order, &_is_null_first, _sort_keys,`
`227`	`227`	`_offset, _limit, TTopNType::ROW_NUMBER, ChunksSorterTopn::kDefaultMaxBufferRows,`
`228`		`- ChunksSorterTopn::kDefaultMaxBufferBytes, ChunksSorterTopn::tunning_buffered_chunks(_limit));`
	`228`	`+ ChunksSorterTopn::kDefaultMaxBufferBytes, ChunksSorterTopn::max_buffered_chunks(_limit));`
`229`	`229`	`}`
`230`	`230`
`231`	`231`	`} else {`