Skip to content

Commit d2bcee1

Browse files
[Enhancement] estimate cardinality of multiple equivalent predicates using multi-column combined stats
Signed-off-by: stephen <[email protected]>
1 parent 1ff26d1 commit d2bcee1

20 files changed

+464
-17
lines changed

fe/fe-core/src/main/java/com/starrocks/qe/SessionVariable.java

+12
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,7 @@ public class SessionVariable implements Serializable, Writable, Cloneable {
358358
public static final String CBO_MAX_REORDER_NODE_USE_GREEDY = "cbo_max_reorder_node_use_greedy";
359359
public static final String CBO_ENABLE_REPLICATED_JOIN = "cbo_enable_replicated_join";
360360
public static final String CBO_USE_CORRELATED_JOIN_ESTIMATE = "cbo_use_correlated_join_estimate";
361+
public static final String CBO_USE_CORRELATED_PREDICATE_ESTIMATE = "cbo_use_correlated_predicate_estimate";
361362
public static final String ALWAYS_COLLECT_LOW_CARD_DICT = "always_collect_low_card_dict";
362363
public static final String ALWAYS_COLLECT_LOW_CARD_DICT_ON_LAKE = "always_collect_low_card_dict_on_lake";
363364
public static final String CBO_ENABLE_LOW_CARDINALITY_OPTIMIZE = "cbo_enable_low_cardinality_optimize";
@@ -1144,6 +1145,9 @@ public static MaterializedViewRewriteMode parse(String str) {
11441145
@VariableMgr.VarAttr(name = CBO_USE_CORRELATED_JOIN_ESTIMATE, flag = VariableMgr.INVISIBLE)
11451146
private boolean useCorrelatedJoinEstimate = true;
11461147

1148+
@VariableMgr.VarAttr(name = CBO_USE_CORRELATED_PREDICATE_ESTIMATE)
1149+
private boolean useCorrelatedPredicateEstimate = true;
1150+
11471151
@VariableMgr.VarAttr(name = CBO_USE_NTH_EXEC_PLAN, flag = VariableMgr.INVISIBLE)
11481152
private int useNthExecPlan = 0;
11491153

@@ -3661,6 +3665,14 @@ public void setUseCorrelatedJoinEstimate(boolean useCorrelatedJoinEstimate) {
36613665
this.useCorrelatedJoinEstimate = useCorrelatedJoinEstimate;
36623666
}
36633667

3668+
public boolean isUseCorrelatedPredicateEstimate() {
3669+
return useCorrelatedPredicateEstimate;
3670+
}
3671+
3672+
public void setUseCorrelatedPredicateEstimate(boolean useCorrelatedPredicateEstimate) {
3673+
this.useCorrelatedPredicateEstimate = useCorrelatedPredicateEstimate;
3674+
}
3675+
36643676
public boolean isAlwaysCollectDict() {
36653677
return alwaysCollectDict;
36663678
}

fe/fe-core/src/main/java/com/starrocks/sql/optimizer/Utils.java

+22
Original file line numberDiff line numberDiff line change
@@ -70,10 +70,12 @@
7070
import java.time.Instant;
7171
import java.time.LocalDateTime;
7272
import java.time.ZoneId;
73+
import java.util.ArrayList;
7374
import java.util.Arrays;
7475
import java.util.BitSet;
7576
import java.util.Collection;
7677
import java.util.Collections;
78+
import java.util.HashMap;
7779
import java.util.HashSet;
7880
import java.util.LinkedList;
7981
import java.util.List;
@@ -983,4 +985,24 @@ public static List<Pair<Table, Column>> resolveColumnRefRecursive(ColumnRefOpera
983985

984986
return null;
985987
}
988+
989+
public static Pair<Map<ColumnRefOperator, ConstantOperator>, List<ScalarOperator>> separateEqualityPredicates(
990+
ScalarOperator predicate) {
991+
List<ScalarOperator> conjunctivePredicates = extractConjuncts(predicate);
992+
Map<ColumnRefOperator, ConstantOperator> columnConstMap = new HashMap<>();
993+
List<ScalarOperator> otherPredicates = new ArrayList<>();
994+
995+
for (ScalarOperator op : conjunctivePredicates) {
996+
if (ScalarOperator.isColumnEqualConstant(op)) {
997+
BinaryPredicateOperator binaryOp = (BinaryPredicateOperator) op;
998+
ColumnRefOperator column = (ColumnRefOperator) binaryOp.getChild(0);
999+
ConstantOperator constant = (ConstantOperator) binaryOp.getChild(1);
1000+
columnConstMap.put(column, constant);
1001+
} else {
1002+
otherPredicates.add(op);
1003+
}
1004+
}
1005+
1006+
return new Pair<>(columnConstMap, otherPredicates);
1007+
}
9861008
}

fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/BinaryPredicateStatisticCalculator.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ private static Statistics estimateColumnEqualToConstant(Optional<ColumnRefOperat
106106
.setNullsFraction(0)
107107
.setMinValue(min)
108108
.setMaxValue(max)
109-
.setDistinctValuesCount(1)
109+
.setDistinctValuesCount(columnStatistic.getDistinctValuesCount())
110110
.build();
111111

112112
double predicateFactor;

fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/PredicateStatisticsCalculator.java

+19
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
import com.google.common.base.Preconditions;
1818
import com.starrocks.analysis.BinaryType;
19+
import com.starrocks.common.Pair;
1920
import com.starrocks.sql.optimizer.Utils;
2021
import com.starrocks.sql.optimizer.operator.OperatorType;
2122
import com.starrocks.sql.optimizer.operator.scalar.BinaryPredicateOperator;
@@ -31,9 +32,12 @@
3132
import org.apache.commons.math3.util.Precision;
3233

3334
import java.util.List;
35+
import java.util.Map;
3436
import java.util.Optional;
3537
import java.util.stream.Collectors;
3638

39+
import static com.starrocks.sql.optimizer.statistics.StatisticsEstimateUtils.computeCompoundStatsWithMultiColumnOptimize;
40+
3741
public class PredicateStatisticsCalculator {
3842
public static Statistics statisticsCalculate(ScalarOperator predicate, Statistics statistics) {
3943
if (predicate == null) {
@@ -330,6 +334,13 @@ public Statistics visitCompoundPredicate(CompoundPredicateOperator predicate, Vo
330334
}
331335

332336
if (predicate.isAnd()) {
337+
Pair<Map<ColumnRefOperator, ConstantOperator>, List<ScalarOperator>> extracted =
338+
Utils.separateEqualityPredicates(predicate);
339+
340+
if (extracted.first.size() > 1) {
341+
return computeCompoundStatsWithMultiColumnOptimize(predicate, statistics);
342+
}
343+
333344
Statistics leftStatistics = predicate.getChild(0).accept(this, null);
334345
Statistics andStatistics =
335346
predicate.getChild(1).accept(new BaseCalculatingVisitor(leftStatistics), null);
@@ -411,6 +422,13 @@ public Statistics visitCompoundPredicate(CompoundPredicateOperator predicate, Vo
411422
}
412423

413424
if (predicate.isAnd()) {
425+
Pair<Map<ColumnRefOperator, ConstantOperator>, List<ScalarOperator>> extracted =
426+
Utils.separateEqualityPredicates(predicate);
427+
428+
if (extracted.first.size() > 1) {
429+
return computeCompoundStatsWithMultiColumnOptimize(predicate, statistics);
430+
}
431+
414432
Statistics leftStatistics = predicate.getChild(0).accept(this, null);
415433
Statistics andStatistics = predicate.getChild(1)
416434
.accept(new LargeOrCalculatingVisitor(leftStatistics), null);
@@ -461,5 +479,6 @@ protected Statistics computeOrPredicateStatistics(Statistics baseStatistics, Sta
461479
});
462480
return builder.build();
463481
}
482+
464483
}
465484
}

fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticsEstimateUtils.java

+212
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,26 @@
1515

1616
package com.starrocks.sql.optimizer.statistics;
1717

18+
import com.starrocks.analysis.BinaryType;
19+
import com.starrocks.common.Pair;
20+
import com.starrocks.qe.ConnectContext;
21+
import com.starrocks.sql.optimizer.Utils;
22+
import com.starrocks.sql.optimizer.operator.scalar.BinaryPredicateOperator;
23+
import com.starrocks.sql.optimizer.operator.scalar.ColumnRefOperator;
24+
import com.starrocks.sql.optimizer.operator.scalar.ConstantOperator;
25+
import com.starrocks.sql.optimizer.operator.scalar.ScalarOperator;
26+
import com.starrocks.statistic.StatisticUtils;
27+
28+
import java.util.ArrayList;
29+
import java.util.HashMap;
30+
import java.util.List;
31+
import java.util.Map;
32+
import java.util.Set;
33+
34+
import static java.lang.Double.NEGATIVE_INFINITY;
35+
import static java.lang.Double.POSITIVE_INFINITY;
36+
import static java.lang.Double.isInfinite;
37+
1838
public class StatisticsEstimateUtils {
1939
public static ColumnStatistic unionColumnStatistic(ColumnStatistic left, double leftRowCount, ColumnStatistic right,
2040
double rightRowCount) {
@@ -64,4 +84,196 @@ public static Statistics adjustStatisticsByRowCount(Statistics statistics, doubl
6484
});
6585
return builder.build();
6686
}
87+
88+
public static double getPredicateSelectivity(ScalarOperator predicate, Statistics statistics) {
89+
Statistics estimatedStatistics = PredicateStatisticsCalculator.statisticsCalculate(predicate, statistics);
90+
91+
// avoid sample statistics filter all data, save one rows least
92+
if (statistics.getOutputRowCount() > 0 && estimatedStatistics.getOutputRowCount() == 0) {
93+
return 1 / statistics.getOutputRowCount();
94+
} else {
95+
return estimatedStatistics.getOutputRowCount() / statistics.getOutputRowCount();
96+
}
97+
}
98+
99+
/**
100+
* Estimates selectivity for conjunctive equality predicates across multiple columns.
101+
*
102+
* This method implements a hybrid approach that:
103+
* 1. Leverages multi-column combined statistics when available to capture column correlations
104+
* 2. Falls back to a weighted combination model with exponential decay for columns without joint statistics
105+
* 3. Applies selectivity bounds to avoid both overestimation and underestimation
106+
*
107+
* Key formulas:
108+
* - Multi-column combined statistics based: S_mc = max(min(1/NDV, min_sel), prod_sel)
109+
* Where:
110+
* - 1/NDV is the selectivity based on multi-columns ndv
111+
* - min_sel is the minimum selectivity among correlated columns
112+
* - prod_sel is the product of individual column selectivities
113+
*
114+
* - Exponential decay for additional columns: S_final = S_base * ∏(S_i^(0.5^i))
115+
* Where:
116+
* - S_base is the initial selectivity (from multi-column stats or most selective column)
117+
* - S_i is the selectivity of the i-th additional column (sorted by ascending selectivity)
118+
* - 0.5^i is the exponential decay weight (0.5, 0.25, 0.125, etc.)
119+
*
120+
* @param equalityPredicates Map of column references to their equality constant values
121+
* @param statistics
122+
* @return Estimated selectivity in range [0,1], or -1 if estimation cannot be performed
123+
*/
124+
private static double estimateConjunctiveEqualitySelectivity(
125+
Map<ColumnRefOperator, ConstantOperator> equalityPredicates,
126+
Statistics statistics) {
127+
// Require at least two columns for multi-column estimation
128+
if (equalityPredicates.size() < 2) {
129+
return -1;
130+
}
131+
132+
// Compute individual selectivity factors for each predicate and sort in ascending order
133+
Map<ColumnRefOperator, Double> columnToSelectivityMap = new HashMap<>();
134+
for (Map.Entry<ColumnRefOperator, ConstantOperator> entry : equalityPredicates.entrySet()) {
135+
ColumnRefOperator columnRef = entry.getKey();
136+
ConstantOperator constantValue = entry.getValue();
137+
BinaryPredicateOperator equalityPredicate = new BinaryPredicateOperator(BinaryType.EQ, columnRef, constantValue);
138+
columnToSelectivityMap.put(columnRef, getPredicateSelectivity(equalityPredicate, statistics));
139+
}
140+
141+
List<Map.Entry<ColumnRefOperator, Double>> selectivityEntriesSorted =
142+
new ArrayList<>(columnToSelectivityMap.entrySet());
143+
144+
// Sort by ascending selectivity (most selective first)
145+
selectivityEntriesSorted.sort(Map.Entry.comparingByValue());
146+
147+
// Retrieve available multi-column combined statistics for the target columns
148+
Set<ColumnRefOperator> targetColumnRefs = equalityPredicates.keySet();
149+
Pair<Set<ColumnRefOperator>, MultiColumnCombinedStats> multiColumnStatsPair =
150+
statistics.getLargestSubsetMCStats(targetColumnRefs);
151+
152+
double estimatedSelectivity;
153+
154+
// Primary estimation path: utilize multi-column statistics when available
155+
if (multiColumnStatsPair != null &&
156+
!multiColumnStatsPair.first.isEmpty() &&
157+
multiColumnStatsPair.second.getNdv() > 0) {
158+
159+
Set<ColumnRefOperator> correlatedColumns = multiColumnStatsPair.first;
160+
double distinctValueCount = Math.max(1.0, multiColumnStatsPair.second.getNdv());
161+
162+
// Formula: S_corr = 1/NDV
163+
// NDV-based selectivity estimation for correlated columns
164+
double correlationBasedSelectivity = 1.0 / distinctValueCount;
165+
166+
double maxNullFraction = correlatedColumns.stream()
167+
.map(statistics::getColumnStatistic)
168+
.mapToDouble(ColumnStatistic::getNullsFraction)
169+
.max()
170+
.orElse(0.0);
171+
correlationBasedSelectivity = correlationBasedSelectivity * (1.0 - maxNullFraction);
172+
173+
// Formula: S_ind = ∏(S_i) for all i in correlatedColumns
174+
// Calculate independence-assumption selectivity product as lower bound
175+
double independentSelectivityProduct = correlatedColumns.stream()
176+
.map(columnToSelectivityMap::get)
177+
.reduce(1.0, (a, b) -> a * b);
178+
179+
// Formula: S_min = min(S_i) for all i in correlatedColumns
180+
// Identify minimum column selectivity as upper bound
181+
double minColumnSelectivity = correlatedColumns.stream()
182+
.map(columnToSelectivityMap::get)
183+
.min(Double::compare)
184+
.orElse(1.0);
185+
186+
// Formula: S_mc = max(min(S_corr, S_min), S_ind)
187+
// Apply selectivity bounds to balance correlation effects
188+
// Because a single column may build a histogram or mcv, the selection will be much larger than using only ndv.
189+
estimatedSelectivity = Math.max(
190+
Math.min(correlationBasedSelectivity, minColumnSelectivity),
191+
independentSelectivityProduct);
192+
193+
// Process remaining columns not covered by multi-column combined statistics
194+
// Formula ordering: S_final = S_mc * ∏(S_i^(0.5^(i+1))) where S_i are sorted by ascending selectivity
195+
List<Double> uncorrelatedSelectivities = selectivityEntriesSorted.stream()
196+
.filter(entry -> !correlatedColumns.contains(entry.getKey()))
197+
.map(Map.Entry::getValue)
198+
.toList();
199+
200+
// Apply exponential decay weights to uncorrelated columns (max 3)
201+
// Multi-column selectivity is used as base, then apply remaining columns in ascending selectivity order
202+
for (int i = 0; i < Math.min(3, uncorrelatedSelectivities.size()); i++) {
203+
double decayFactor = 1;
204+
if (ConnectContext.get().getSessionVariable().isUseCorrelatedPredicateEstimate()) {
205+
decayFactor = Math.pow(0.5, i + 1); // Weights: 0.5, 0.25, 0.125
206+
}
207+
estimatedSelectivity *= Math.pow(uncorrelatedSelectivities.get(i), decayFactor);
208+
}
209+
} else {
210+
// Fallback estimation path: weighted combination of individual selectivities
211+
// Formula: S_base = S_0 (most selective predicate)
212+
// Use most selective predicate as base (first in the sorted list)
213+
estimatedSelectivity = selectivityEntriesSorted.get(0).getValue();
214+
215+
// Formula: S_final = S_base * ∏(S_i^(0.5^i)) for i=1,2,3
216+
// Apply exponential decay weights to additional columns (max 4)
217+
// Columns are already sorted by ascending selectivity, so most selective is first
218+
for (int i = 1; i < Math.min(4, selectivityEntriesSorted.size()); i++) {
219+
double decayFactor = 1;
220+
if (ConnectContext.get().getSessionVariable().isUseCorrelatedPredicateEstimate()) {
221+
decayFactor = Math.pow(0.5, i);
222+
}
223+
estimatedSelectivity *= Math.pow(selectivityEntriesSorted.get(i).getValue(), decayFactor);
224+
}
225+
}
226+
227+
// Clamp final selectivity to valid probability range
228+
return Math.min(1.0, Math.max(0.0, estimatedSelectivity));
229+
}
230+
231+
public static Statistics computeCompoundStatsWithMultiColumnOptimize(ScalarOperator predicate, Statistics inputStats) {
232+
Pair<Map<ColumnRefOperator, ConstantOperator>, List<ScalarOperator>> decomposedPredicates =
233+
Utils.separateEqualityPredicates(predicate);
234+
235+
Map<ColumnRefOperator, ConstantOperator> equalityPredicates = decomposedPredicates.first;
236+
List<ScalarOperator> nonEqualityPredicates = decomposedPredicates.second;
237+
238+
double conjunctiveSelectivity = estimateConjunctiveEqualitySelectivity(equalityPredicates, inputStats);
239+
double filteredRowCount = inputStats.getOutputRowCount() * conjunctiveSelectivity;
240+
241+
Statistics.Builder filteredStatsBuilder = Statistics.buildFrom(inputStats)
242+
.setOutputRowCount(filteredRowCount);
243+
244+
for (Map.Entry<ColumnRefOperator, ConstantOperator> entry : equalityPredicates.entrySet()) {
245+
ColumnRefOperator columnRef = entry.getKey();
246+
ConstantOperator constantOperator = entry.getValue();
247+
ColumnStatistic originalColumnStats = inputStats.getColumnStatistic(columnRef);
248+
249+
double constantValue = StatisticUtils.convertStatisticsToDouble(
250+
constantOperator.getType(), constantOperator.toString()).orElse(NEGATIVE_INFINITY);
251+
ColumnStatistic updatedColumnStats = ColumnStatistic.buildFrom(originalColumnStats)
252+
.setDistinctValuesCount(originalColumnStats.getDistinctValuesCount())
253+
.setNullsFraction(0.0)
254+
.setMinValue(constantValue)
255+
.setMaxValue(isInfinite(constantValue) ? POSITIVE_INFINITY : constantValue)
256+
.build();
257+
258+
filteredStatsBuilder.addColumnStatistic(columnRef, updatedColumnStats);
259+
}
260+
261+
Statistics equalityFilteredStats = filteredStatsBuilder.build();
262+
263+
if (nonEqualityPredicates.isEmpty()) {
264+
return StatisticsEstimateUtils.adjustStatisticsByRowCount(equalityFilteredStats, filteredRowCount);
265+
}
266+
267+
// Apply remaining non-equality predicates sequentially
268+
Statistics combinedFilteredStats = equalityFilteredStats;
269+
270+
for (ScalarOperator nonEqualityPredicate : nonEqualityPredicates) {
271+
combinedFilteredStats = PredicateStatisticsCalculator.statisticsCalculate(
272+
nonEqualityPredicate, combinedFilteredStats);
273+
}
274+
275+
return StatisticsEstimateUtils.adjustStatisticsByRowCount(
276+
combinedFilteredStats,
277+
combinedFilteredStats.getOutputRowCount());
278+
}
67279
}

0 commit comments

Comments
 (0)