Skip to content

Commit ab1ce59

Browse files
committed
Merge branch 'cassandra-5.0' into trunk
* cassandra-5.0: Optimize initial skipping logic for SAI queries on large partitions
2 parents f91655d + 7a8335c commit ab1ce59

File tree

3 files changed

+375
-2
lines changed

3 files changed

+375
-2
lines changed

CHANGES.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,7 @@
190190
* Add the ability to disable bulk loading of SSTables (CASSANDRA-18781)
191191
* Clean up obsolete functions and simplify cql_version handling in cqlsh (CASSANDRA-18787)
192192
Merged from 5.0:
193+
* Optimize initial skipping logic for SAI queries on large partitions (CASSANDRA-20191)
193194
* zero copy streaming allocates direct memory that isn't used, but does help to fragment the memory space (CASSANDRA-20577)
194195
* CQLSSTableWriter supports setting the format (BTI or Big) (CASSANDRA-20609)
195196
* Don't allocate in ThreadLocalReadAheadBuffer#close() (CASSANDRA-20551)

src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
package org.apache.cassandra.index.sai.plan;
2020

21+
import java.nio.ByteBuffer;
2122
import java.util.ArrayList;
2223
import java.util.Collections;
2324
import java.util.HashSet;
@@ -32,13 +33,19 @@
3233

3334
import io.netty.util.concurrent.FastThreadLocal;
3435
import org.apache.cassandra.db.Clustering;
36+
import org.apache.cassandra.db.ClusteringBound;
37+
import org.apache.cassandra.db.ClusteringComparator;
3538
import org.apache.cassandra.db.ColumnFamilyStore;
3639
import org.apache.cassandra.db.DataRange;
3740
import org.apache.cassandra.db.DecoratedKey;
3841
import org.apache.cassandra.db.PartitionPosition;
3942
import org.apache.cassandra.db.ReadCommand;
4043
import org.apache.cassandra.db.ReadExecutionController;
4144
import org.apache.cassandra.db.RegularAndStaticColumns;
45+
import org.apache.cassandra.db.Slices;
46+
import org.apache.cassandra.db.filter.ClusteringIndexFilter;
47+
import org.apache.cassandra.db.filter.ClusteringIndexNamesFilter;
48+
import org.apache.cassandra.db.filter.ClusteringIndexSliceFilter;
4249
import org.apache.cassandra.db.filter.RowFilter;
4350
import org.apache.cassandra.db.partitions.PartitionIterator;
4451
import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
@@ -138,6 +145,7 @@ private class ResultRetriever extends AbstractIterator<UnfilteredRowIterator> im
138145
private final PrimaryKey firstPrimaryKey;
139146
private final PrimaryKey lastPrimaryKey;
140147
private final Iterator<DataRange> keyRanges;
148+
private final DataRange firstDataRange;
141149
private AbstractBounds<PartitionPosition> currentKeyRange;
142150

143151
private final KeyRangeIterator resultKeyIterator;
@@ -152,7 +160,8 @@ private class ResultRetriever extends AbstractIterator<UnfilteredRowIterator> im
152160
private ResultRetriever(ReadExecutionController executionController, boolean topK)
153161
{
154162
this.keyRanges = queryController.dataRanges().iterator();
155-
this.currentKeyRange = keyRanges.next().keyRange();
163+
this.firstDataRange = keyRanges.next();
164+
this.currentKeyRange = firstDataRange.keyRange();
156165
this.resultKeyIterator = Operation.buildIterator(queryController);
157166
this.filterTree = Operation.buildFilter(queryController, queryController.usesStrictFiltering());
158167
this.executionController = executionController;
@@ -175,7 +184,52 @@ public UnfilteredRowIterator computeNext()
175184
// We can't put this code in the constructor because it may throw and the caller
176185
// may not be prepared for that.
177186
if (lastKey == null)
178-
resultKeyIterator.skipTo(firstPrimaryKey);
187+
{
188+
PrimaryKey skipTarget = firstPrimaryKey;
189+
ClusteringComparator comparator = command.metadata().comparator;
190+
191+
// If there are no clusterings, the first data range selects an entire partition, or we have static
192+
// expressions, don't bother trying to skip forward within the partition.
193+
if (comparator.size() > 0 && !firstDataRange.selectsAllPartition() && !command.rowFilter().hasStaticExpression())
194+
{
195+
// Only attempt to skip if the first data range covers a single partition.
196+
if (currentKeyRange.left.equals(currentKeyRange.right) && currentKeyRange.left instanceof DecoratedKey)
197+
{
198+
DecoratedKey decoratedKey = (DecoratedKey) currentKeyRange.left;
199+
ClusteringIndexFilter filter = firstDataRange.clusteringIndexFilter(decoratedKey);
200+
201+
if (filter instanceof ClusteringIndexSliceFilter)
202+
{
203+
Slices slices = ((ClusteringIndexSliceFilter) filter).requestedSlices();
204+
205+
if (!slices.isEmpty())
206+
{
207+
ClusteringBound<?> startBound = slices.get(0).start();
208+
209+
if (!startBound.isEmpty())
210+
{
211+
ByteBuffer[] rawValues = startBound.getBufferArray();
212+
213+
if (rawValues.length == comparator.size())
214+
skipTarget = keyFactory.create(decoratedKey, Clustering.make(rawValues));
215+
}
216+
}
217+
}
218+
else if (filter instanceof ClusteringIndexNamesFilter)
219+
{
220+
ClusteringIndexNamesFilter namesFilter = (ClusteringIndexNamesFilter) filter;
221+
222+
if (!namesFilter.requestedRows().isEmpty())
223+
{
224+
Clustering<?> skipClustering = namesFilter.requestedRows().iterator().next();
225+
skipTarget = keyFactory.create(decoratedKey, skipClustering);
226+
}
227+
}
228+
}
229+
}
230+
231+
resultKeyIterator.skipTo(skipTarget);
232+
}
179233

180234
// Theoretically we wouldn't need this if the caller of computeNext always ran the
181235
// returned iterators to completion. Unfortunately, we have no control over the caller's behavior here.

0 commit comments

Comments
 (0)