|
15 | 15 |
|
16 | 16 | package com.starrocks.sql.optimizer.statistics;
|
17 | 17 |
|
| 18 | +import com.starrocks.analysis.BinaryType; |
| 19 | +import com.starrocks.common.Pair; |
| 20 | +import com.starrocks.qe.ConnectContext; |
| 21 | +import com.starrocks.sql.optimizer.Utils; |
| 22 | +import com.starrocks.sql.optimizer.operator.scalar.BinaryPredicateOperator; |
| 23 | +import com.starrocks.sql.optimizer.operator.scalar.ColumnRefOperator; |
| 24 | +import com.starrocks.sql.optimizer.operator.scalar.ConstantOperator; |
| 25 | +import com.starrocks.sql.optimizer.operator.scalar.ScalarOperator; |
| 26 | +import com.starrocks.statistic.StatisticUtils; |
| 27 | + |
| 28 | +import java.util.ArrayList; |
| 29 | +import java.util.HashMap; |
| 30 | +import java.util.List; |
| 31 | +import java.util.Map; |
| 32 | +import java.util.Set; |
| 33 | + |
| 34 | +import static java.lang.Double.NEGATIVE_INFINITY; |
| 35 | +import static java.lang.Double.POSITIVE_INFINITY; |
| 36 | +import static java.lang.Double.isInfinite; |
| 37 | + |
18 | 38 | public class StatisticsEstimateUtils {
|
19 | 39 | public static ColumnStatistic unionColumnStatistic(ColumnStatistic left, double leftRowCount, ColumnStatistic right,
|
20 | 40 | double rightRowCount) {
|
@@ -64,4 +84,196 @@ public static Statistics adjustStatisticsByRowCount(Statistics statistics, doubl
|
64 | 84 | });
|
65 | 85 | return builder.build();
|
66 | 86 | }
|
| 87 | + |
| 88 | + public static double getPredicateSelectivity(ScalarOperator predicate, Statistics statistics) { |
| 89 | + Statistics estimatedStatistics = PredicateStatisticsCalculator.statisticsCalculate(predicate, statistics); |
| 90 | + |
| 91 | + // avoid sample statistics filter all data, save one rows least |
| 92 | + if (statistics.getOutputRowCount() > 0 && estimatedStatistics.getOutputRowCount() == 0) { |
| 93 | + return 1 / statistics.getOutputRowCount(); |
| 94 | + } else { |
| 95 | + return estimatedStatistics.getOutputRowCount() / statistics.getOutputRowCount(); |
| 96 | + } |
| 97 | + } |
| 98 | + |
| 99 | + /** |
| 100 | + * Estimates selectivity for conjunctive equality predicates across multiple columns. |
| 101 | + * |
| 102 | + * This method implements a hybrid approach that: |
| 103 | + * 1. Leverages multi-column combined statistics when available to capture column correlations |
| 104 | + * 2. Falls back to a weighted combination model with exponential decay for columns without joint statistics |
| 105 | + * 3. Applies selectivity bounds to avoid both overestimation and underestimation |
| 106 | + * |
| 107 | + * Key formulas: |
| 108 | + * - Multi-column combined statistics based: S_mc = max(min(1/NDV, min_sel), prod_sel) |
| 109 | + * Where: |
| 110 | + * - 1/NDV is the selectivity based on multi-columns ndv |
| 111 | + * - min_sel is the minimum selectivity among correlated columns |
| 112 | + * - prod_sel is the product of individual column selectivities |
| 113 | + * |
| 114 | + * - Exponential decay for additional columns: S_final = S_base * ∏(S_i^(0.5^i)) |
| 115 | + * Where: |
| 116 | + * - S_base is the initial selectivity (from multi-column stats or most selective column) |
| 117 | + * - S_i is the selectivity of the i-th additional column (sorted by ascending selectivity) |
| 118 | + * - 0.5^i is the exponential decay weight (0.5, 0.25, 0.125, etc.) |
| 119 | + * |
| 120 | + * @param equalityPredicates Map of column references to their equality constant values |
| 121 | + * @param statistics |
| 122 | + * @return Estimated selectivity in range [0,1], or -1 if estimation cannot be performed |
| 123 | + */ |
| 124 | + private static double estimateConjunctiveEqualitySelectivity( |
| 125 | + Map<ColumnRefOperator, ConstantOperator> equalityPredicates, |
| 126 | + Statistics statistics) { |
| 127 | + // Require at least two columns for multi-column estimation |
| 128 | + if (equalityPredicates.size() < 2) { |
| 129 | + return -1; |
| 130 | + } |
| 131 | + |
| 132 | + // Compute individual selectivity factors for each predicate and sort in ascending order |
| 133 | + Map<ColumnRefOperator, Double> columnToSelectivityMap = new HashMap<>(); |
| 134 | + for (Map.Entry<ColumnRefOperator, ConstantOperator> entry : equalityPredicates.entrySet()) { |
| 135 | + ColumnRefOperator columnRef = entry.getKey(); |
| 136 | + ConstantOperator constantValue = entry.getValue(); |
| 137 | + BinaryPredicateOperator equalityPredicate = new BinaryPredicateOperator(BinaryType.EQ, columnRef, constantValue); |
| 138 | + columnToSelectivityMap.put(columnRef, getPredicateSelectivity(equalityPredicate, statistics)); |
| 139 | + } |
| 140 | + |
| 141 | + List<Map.Entry<ColumnRefOperator, Double>> selectivityEntriesSorted = |
| 142 | + new ArrayList<>(columnToSelectivityMap.entrySet()); |
| 143 | + |
| 144 | + // Sort by ascending selectivity (most selective first) |
| 145 | + selectivityEntriesSorted.sort(Map.Entry.comparingByValue()); |
| 146 | + |
| 147 | + // Retrieve available multi-column combined statistics for the target columns |
| 148 | + Set<ColumnRefOperator> targetColumnRefs = equalityPredicates.keySet(); |
| 149 | + Pair<Set<ColumnRefOperator>, MultiColumnCombinedStats> multiColumnStatsPair = |
| 150 | + statistics.getLargestSubsetMCStats(targetColumnRefs); |
| 151 | + |
| 152 | + double estimatedSelectivity; |
| 153 | + |
| 154 | + // Primary estimation path: utilize multi-column statistics when available |
| 155 | + if (multiColumnStatsPair != null && |
| 156 | + !multiColumnStatsPair.first.isEmpty() && |
| 157 | + multiColumnStatsPair.second.getNdv() > 0) { |
| 158 | + |
| 159 | + Set<ColumnRefOperator> correlatedColumns = multiColumnStatsPair.first; |
| 160 | + double distinctValueCount = Math.max(1.0, multiColumnStatsPair.second.getNdv()); |
| 161 | + |
| 162 | + // Formula: S_corr = 1/NDV |
| 163 | + // NDV-based selectivity estimation for correlated columns |
| 164 | + double correlationBasedSelectivity = 1.0 / distinctValueCount; |
| 165 | + |
| 166 | + double maxNullFraction = correlatedColumns.stream() |
| 167 | + .map(statistics::getColumnStatistic) |
| 168 | + .mapToDouble(ColumnStatistic::getNullsFraction) |
| 169 | + .max() |
| 170 | + .orElse(0.0); |
| 171 | + correlationBasedSelectivity = correlationBasedSelectivity * (1.0 - maxNullFraction); |
| 172 | + |
| 173 | + // Formula: S_ind = ∏(S_i) for all i in correlatedColumns |
| 174 | + // Calculate independence-assumption selectivity product as lower bound |
| 175 | + double independentSelectivityProduct = correlatedColumns.stream() |
| 176 | + .map(columnToSelectivityMap::get) |
| 177 | + .reduce(1.0, (a, b) -> a * b); |
| 178 | + |
| 179 | + // Formula: S_min = min(S_i) for all i in correlatedColumns |
| 180 | + // Identify minimum column selectivity as upper bound |
| 181 | + double minColumnSelectivity = correlatedColumns.stream() |
| 182 | + .map(columnToSelectivityMap::get) |
| 183 | + .min(Double::compare) |
| 184 | + .orElse(1.0); |
| 185 | + |
| 186 | + // Formula: S_mc = max(min(S_corr, S_min), S_ind) |
| 187 | + // Apply selectivity bounds to balance correlation effects |
| 188 | + // Because a single column may build a histogram or mcv, the selection will be much larger than using only ndv. |
| 189 | + estimatedSelectivity = Math.max( |
| 190 | + Math.min(correlationBasedSelectivity, minColumnSelectivity), |
| 191 | + independentSelectivityProduct); |
| 192 | + |
| 193 | + // Process remaining columns not covered by multi-column combined statistics |
| 194 | + // Formula ordering: S_final = S_mc * ∏(S_i^(0.5^(i+1))) where S_i are sorted by ascending selectivity |
| 195 | + List<Double> uncorrelatedSelectivities = selectivityEntriesSorted.stream() |
| 196 | + .filter(entry -> !correlatedColumns.contains(entry.getKey())) |
| 197 | + .map(Map.Entry::getValue) |
| 198 | + .toList(); |
| 199 | + |
| 200 | + // Apply exponential decay weights to uncorrelated columns (max 3) |
| 201 | + // Multi-column selectivity is used as base, then apply remaining columns in ascending selectivity order |
| 202 | + for (int i = 0; i < Math.min(3, uncorrelatedSelectivities.size()); i++) { |
| 203 | + double decayFactor = 1; |
| 204 | + if (ConnectContext.get().getSessionVariable().isUseCorrelatedPredicateEstimate()) { |
| 205 | + decayFactor = Math.pow(0.5, i + 1); // Weights: 0.5, 0.25, 0.125 |
| 206 | + } |
| 207 | + estimatedSelectivity *= Math.pow(uncorrelatedSelectivities.get(i), decayFactor); |
| 208 | + } |
| 209 | + } else { |
| 210 | + // Fallback estimation path: weighted combination of individual selectivities |
| 211 | + // Formula: S_base = S_0 (most selective predicate) |
| 212 | + // Use most selective predicate as base (first in the sorted list) |
| 213 | + estimatedSelectivity = selectivityEntriesSorted.get(0).getValue(); |
| 214 | + |
| 215 | + // Formula: S_final = S_base * ∏(S_i^(0.5^i)) for i=1,2,3 |
| 216 | + // Apply exponential decay weights to additional columns (max 4) |
| 217 | + // Columns are already sorted by ascending selectivity, so most selective is first |
| 218 | + for (int i = 1; i < Math.min(4, selectivityEntriesSorted.size()); i++) { |
| 219 | + double decayFactor = 1; |
| 220 | + if (ConnectContext.get().getSessionVariable().isUseCorrelatedPredicateEstimate()) { |
| 221 | + decayFactor = Math.pow(0.5, i); |
| 222 | + } |
| 223 | + estimatedSelectivity *= Math.pow(selectivityEntriesSorted.get(i).getValue(), decayFactor); |
| 224 | + } |
| 225 | + } |
| 226 | + |
| 227 | + // Clamp final selectivity to valid probability range |
| 228 | + return Math.min(1.0, Math.max(0.0, estimatedSelectivity)); |
| 229 | + } |
| 230 | + |
| 231 | + public static Statistics computeCompoundStatsWithMultiColumnOptimize(ScalarOperator predicate, Statistics inputStats) { |
| 232 | + Pair<Map<ColumnRefOperator, ConstantOperator>, List<ScalarOperator>> decomposedPredicates = |
| 233 | + Utils.separateEqualityPredicates(predicate); |
| 234 | + |
| 235 | + Map<ColumnRefOperator, ConstantOperator> equalityPredicates = decomposedPredicates.first; |
| 236 | + List<ScalarOperator> nonEqualityPredicates = decomposedPredicates.second; |
| 237 | + |
| 238 | + double conjunctiveSelectivity = estimateConjunctiveEqualitySelectivity(equalityPredicates, inputStats); |
| 239 | + double filteredRowCount = inputStats.getOutputRowCount() * conjunctiveSelectivity; |
| 240 | + |
| 241 | + Statistics.Builder filteredStatsBuilder = Statistics.buildFrom(inputStats) |
| 242 | + .setOutputRowCount(filteredRowCount); |
| 243 | + |
| 244 | + for (Map.Entry<ColumnRefOperator, ConstantOperator> entry : equalityPredicates.entrySet()) { |
| 245 | + ColumnRefOperator columnRef = entry.getKey(); |
| 246 | + ConstantOperator constantOperator = entry.getValue(); |
| 247 | + ColumnStatistic originalColumnStats = inputStats.getColumnStatistic(columnRef); |
| 248 | + |
| 249 | + double constantValue = StatisticUtils.convertStatisticsToDouble( |
| 250 | + constantOperator.getType(), constantOperator.toString()).orElse(NEGATIVE_INFINITY); |
| 251 | + ColumnStatistic updatedColumnStats = ColumnStatistic.buildFrom(originalColumnStats) |
| 252 | + .setDistinctValuesCount(originalColumnStats.getDistinctValuesCount()) |
| 253 | + .setNullsFraction(0.0) |
| 254 | + .setMinValue(constantValue) |
| 255 | + .setMaxValue(isInfinite(constantValue) ? POSITIVE_INFINITY : constantValue) |
| 256 | + .build(); |
| 257 | + |
| 258 | + filteredStatsBuilder.addColumnStatistic(columnRef, updatedColumnStats); |
| 259 | + } |
| 260 | + |
| 261 | + Statistics equalityFilteredStats = filteredStatsBuilder.build(); |
| 262 | + |
| 263 | + if (nonEqualityPredicates.isEmpty()) { |
| 264 | + return StatisticsEstimateUtils.adjustStatisticsByRowCount(equalityFilteredStats, filteredRowCount); |
| 265 | + } |
| 266 | + |
| 267 | + // Apply remaining non-equality predicates sequentially |
| 268 | + Statistics combinedFilteredStats = equalityFilteredStats; |
| 269 | + |
| 270 | + for (ScalarOperator nonEqualityPredicate : nonEqualityPredicates) { |
| 271 | + combinedFilteredStats = PredicateStatisticsCalculator.statisticsCalculate( |
| 272 | + nonEqualityPredicate, combinedFilteredStats); |
| 273 | + } |
| 274 | + |
| 275 | + return StatisticsEstimateUtils.adjustStatisticsByRowCount( |
| 276 | + combinedFilteredStats, |
| 277 | + combinedFilteredStats.getOutputRowCount()); |
| 278 | + } |
67 | 279 | }
|
0 commit comments