Skip to content

Commit 1349d27

Browse files
committed
Improve performance of ORDER BY / DISTINCT aggregates
ORDER BY / DISTINCT aggreagtes have, since implemented in Postgres, been executed by always performing a sort in nodeAgg.c to sort the tuples in the current group into the correct order before calling the transition function on the sorted tuples. This was not great as often there might be an index that could have provided pre-sorted input and allowed the transition functions to be called as the rows come in, rather than having to store them in a tuplestore in order to sort them once all the tuples for the group have arrived. Here we change the planner so it requests a path with a sort order which supports the most amount of ORDER BY / DISTINCT aggregate functions and add new code to the executor to allow it to support the processing of ORDER BY / DISTINCT aggregates where the tuples are already sorted in the correct order. Since there can be many ORDER BY / DISTINCT aggregates in any given query level, it's very possible that we can't find an order that suits all of these aggregates. The sort order that the planner chooses is simply the one that suits the most aggregate functions. We take the most strictly sorted variation of each order and see how many aggregate functions can use that, then we try again with the order of the remaining aggregates to see if another order would suit more aggregate functions. For example: SELECT agg(a ORDER BY a),agg2(a ORDER BY a,b) ... would request the sort order to be {a, b} because {a} is a subset of the sort order of {a,b}, but; SELECT agg(a ORDER BY a),agg2(a ORDER BY c) ... would just pick a plan ordered by {a} (we give precedence to aggregates which are earlier in the targetlist). SELECT agg(a ORDER BY a),agg2(a ORDER BY b),agg3(a ORDER BY b) ... would choose to order by {b} since two aggregates suit that vs just one that requires input ordered by {a}. Author: David Rowley Reviewed-by: Ronan Dunklau, James Coleman, Ranier Vilela, Richard Guo, Tom Lane Discussion: https://postgr.es/m/CAApHDvpHzfo92%3DR4W0%2BxVua3BUYCKMckWAmo-2t_KiXN-wYH%3Dw%40mail.gmail.com
1 parent a69959f commit 1349d27

File tree

24 files changed

+849
-138
lines changed

24 files changed

+849
-138
lines changed

Diff for: contrib/postgres_fdw/expected/postgres_fdw.out

+20-12
Original file line numberDiff line numberDiff line change
@@ -3295,15 +3295,18 @@ create operator class my_op_class for type int using btree family my_op_family a
32953295
-- extension yet.
32963296
explain (verbose, costs off)
32973297
select array_agg(c1 order by c1 using operator(public.<^)) from ft2 where c2 = 6 and c1 < 100 group by c2;
3298-
QUERY PLAN
3299-
--------------------------------------------------------------------------------------------
3298+
QUERY PLAN
3299+
--------------------------------------------------------------------------------------------------
33003300
GroupAggregate
33013301
Output: array_agg(c1 ORDER BY c1 USING <^ NULLS LAST), c2
33023302
Group Key: ft2.c2
3303-
-> Foreign Scan on public.ft2
3304-
Output: c1, c2
3305-
Remote SQL: SELECT "C 1", c2 FROM "S 1"."T 1" WHERE (("C 1" < 100)) AND ((c2 = 6))
3306-
(6 rows)
3303+
-> Sort
3304+
Output: c2, c1
3305+
Sort Key: ft2.c1 USING <^
3306+
-> Foreign Scan on public.ft2
3307+
Output: c2, c1
3308+
Remote SQL: SELECT "C 1", c2 FROM "S 1"."T 1" WHERE (("C 1" < 100)) AND ((c2 = 6))
3309+
(9 rows)
33073310

33083311
-- This should not be pushed either.
33093312
explain (verbose, costs off)
@@ -3329,6 +3332,7 @@ alter extension postgres_fdw add operator public.=^(int, int);
33293332
alter extension postgres_fdw add operator public.>^(int, int);
33303333
alter server loopback options (set extensions 'postgres_fdw');
33313334
-- Now this will be pushed as sort operator is part of the extension.
3335+
alter server loopback options (add fdw_tuple_cost '0.5');
33323336
explain (verbose, costs off)
33333337
select array_agg(c1 order by c1 using operator(public.<^)) from ft2 where c2 = 6 and c1 < 100 group by c2;
33343338
QUERY PLAN
@@ -3345,6 +3349,7 @@ select array_agg(c1 order by c1 using operator(public.<^)) from ft2 where c2 = 6
33453349
{6,16,26,36,46,56,66,76,86,96}
33463350
(1 row)
33473351

3352+
alter server loopback options (drop fdw_tuple_cost);
33483353
-- This should be pushed too.
33493354
explain (verbose, costs off)
33503355
select * from ft2 order by c1 using operator(public.<^);
@@ -3366,15 +3371,18 @@ alter server loopback options (set extensions 'postgres_fdw');
33663371
-- This will not be pushed as sort operator is now removed from the extension.
33673372
explain (verbose, costs off)
33683373
select array_agg(c1 order by c1 using operator(public.<^)) from ft2 where c2 = 6 and c1 < 100 group by c2;
3369-
QUERY PLAN
3370-
--------------------------------------------------------------------------------------------
3374+
QUERY PLAN
3375+
--------------------------------------------------------------------------------------------------
33713376
GroupAggregate
33723377
Output: array_agg(c1 ORDER BY c1 USING <^ NULLS LAST), c2
33733378
Group Key: ft2.c2
3374-
-> Foreign Scan on public.ft2
3375-
Output: c1, c2
3376-
Remote SQL: SELECT "C 1", c2 FROM "S 1"."T 1" WHERE (("C 1" < 100)) AND ((c2 = 6))
3377-
(6 rows)
3379+
-> Sort
3380+
Output: c2, c1
3381+
Sort Key: ft2.c1 USING <^
3382+
-> Foreign Scan on public.ft2
3383+
Output: c2, c1
3384+
Remote SQL: SELECT "C 1", c2 FROM "S 1"."T 1" WHERE (("C 1" < 100)) AND ((c2 = 6))
3385+
(9 rows)
33783386

33793387
-- Cleanup
33803388
drop operator class my_op_class using btree;

Diff for: contrib/postgres_fdw/sql/postgres_fdw.sql

+2
Original file line numberDiff line numberDiff line change
@@ -943,9 +943,11 @@ alter extension postgres_fdw add operator public.>^(int, int);
943943
alter server loopback options (set extensions 'postgres_fdw');
944944

945945
-- Now this will be pushed as sort operator is part of the extension.
946+
alter server loopback options (add fdw_tuple_cost '0.5');
946947
explain (verbose, costs off)
947948
select array_agg(c1 order by c1 using operator(public.<^)) from ft2 where c2 = 6 and c1 < 100 group by c2;
948949
select array_agg(c1 order by c1 using operator(public.<^)) from ft2 where c2 = 6 and c1 < 100 group by c2;
950+
alter server loopback options (drop fdw_tuple_cost);
949951

950952
-- This should be pushed too.
951953
explain (verbose, costs off)

Diff for: src/backend/executor/execExpr.c

+45-7
Original file line numberDiff line numberDiff line change
@@ -3666,20 +3666,31 @@ ExecBuildAggTrans(AggState *aggstate, AggStatePerPhase phase,
36663666
scratch.resnull = &state->resnull;
36673667
}
36683668
argno++;
3669+
3670+
Assert(pertrans->numInputs == argno);
36693671
}
3670-
else if (pertrans->numSortCols == 0)
3672+
else if (!pertrans->aggsortrequired)
36713673
{
36723674
ListCell *arg;
36733675

36743676
/*
3675-
* Normal transition function without ORDER BY / DISTINCT.
3677+
* Normal transition function without ORDER BY / DISTINCT or with
3678+
* ORDER BY / DISTINCT but the planner has given us pre-sorted
3679+
* input.
36763680
*/
36773681
strictargs = trans_fcinfo->args + 1;
36783682

36793683
foreach(arg, pertrans->aggref->args)
36803684
{
36813685
TargetEntry *source_tle = (TargetEntry *) lfirst(arg);
36823686

3687+
/*
3688+
* Don't initialize args for any ORDER BY clause that might
3689+
* exist in a presorted aggregate.
3690+
*/
3691+
if (argno == pertrans->numTransInputs)
3692+
break;
3693+
36833694
/*
36843695
* Start from 1, since the 0th arg will be the transition
36853696
* value
@@ -3689,11 +3700,13 @@ ExecBuildAggTrans(AggState *aggstate, AggStatePerPhase phase,
36893700
&trans_fcinfo->args[argno + 1].isnull);
36903701
argno++;
36913702
}
3703+
Assert(pertrans->numTransInputs == argno);
36923704
}
36933705
else if (pertrans->numInputs == 1)
36943706
{
36953707
/*
3696-
* DISTINCT and/or ORDER BY case, with a single column sorted on.
3708+
* Non-presorted DISTINCT and/or ORDER BY case, with a single
3709+
* column sorted on.
36973710
*/
36983711
TargetEntry *source_tle =
36993712
(TargetEntry *) linitial(pertrans->aggref->args);
@@ -3705,11 +3718,14 @@ ExecBuildAggTrans(AggState *aggstate, AggStatePerPhase phase,
37053718
&state->resnull);
37063719
strictnulls = &state->resnull;
37073720
argno++;
3721+
3722+
Assert(pertrans->numInputs == argno);
37083723
}
37093724
else
37103725
{
37113726
/*
3712-
* DISTINCT and/or ORDER BY case, with multiple columns sorted on.
3727+
* Non-presorted DISTINCT and/or ORDER BY case, with multiple
3728+
* columns sorted on.
37133729
*/
37143730
Datum *values = pertrans->sortslot->tts_values;
37153731
bool *nulls = pertrans->sortslot->tts_isnull;
@@ -3725,8 +3741,8 @@ ExecBuildAggTrans(AggState *aggstate, AggStatePerPhase phase,
37253741
&values[argno], &nulls[argno]);
37263742
argno++;
37273743
}
3744+
Assert(pertrans->numInputs == argno);
37283745
}
3729-
Assert(pertrans->numInputs == argno);
37303746

37313747
/*
37323748
* For a strict transfn, nothing happens when there's a NULL input; we
@@ -3748,6 +3764,21 @@ ExecBuildAggTrans(AggState *aggstate, AggStatePerPhase phase,
37483764
state->steps_len - 1);
37493765
}
37503766

3767+
/* Handle DISTINCT aggregates which have pre-sorted input */
3768+
if (pertrans->numDistinctCols > 0 && !pertrans->aggsortrequired)
3769+
{
3770+
if (pertrans->numDistinctCols > 1)
3771+
scratch.opcode = EEOP_AGG_PRESORTED_DISTINCT_MULTI;
3772+
else
3773+
scratch.opcode = EEOP_AGG_PRESORTED_DISTINCT_SINGLE;
3774+
3775+
scratch.d.agg_presorted_distinctcheck.pertrans = pertrans;
3776+
scratch.d.agg_presorted_distinctcheck.jumpdistinct = -1; /* adjust later */
3777+
ExprEvalPushStep(state, &scratch);
3778+
adjust_bailout = lappend_int(adjust_bailout,
3779+
state->steps_len - 1);
3780+
}
3781+
37513782
/*
37523783
* Call transition function (once for each concurrently evaluated
37533784
* grouping set). Do so for both sort and hash based computations, as
@@ -3808,6 +3839,12 @@ ExecBuildAggTrans(AggState *aggstate, AggStatePerPhase phase,
38083839
Assert(as->d.agg_deserialize.jumpnull == -1);
38093840
as->d.agg_deserialize.jumpnull = state->steps_len;
38103841
}
3842+
else if (as->opcode == EEOP_AGG_PRESORTED_DISTINCT_SINGLE ||
3843+
as->opcode == EEOP_AGG_PRESORTED_DISTINCT_MULTI)
3844+
{
3845+
Assert(as->d.agg_presorted_distinctcheck.jumpdistinct == -1);
3846+
as->d.agg_presorted_distinctcheck.jumpdistinct = state->steps_len;
3847+
}
38113848
else
38123849
Assert(false);
38133850
}
@@ -3857,7 +3894,8 @@ ExecBuildAggTransCall(ExprState *state, AggState *aggstate,
38573894
/*
38583895
* Determine appropriate transition implementation.
38593896
*
3860-
* For non-ordered aggregates:
3897+
* For non-ordered aggregates and ORDER BY / DISTINCT aggregates with
3898+
* presorted input:
38613899
*
38623900
* If the initial value for the transition state doesn't exist in the
38633901
* pg_aggregate table then we will let the first non-NULL value returned
@@ -3887,7 +3925,7 @@ ExecBuildAggTransCall(ExprState *state, AggState *aggstate,
38873925
* process_ordered_aggregate_{single, multi} and
38883926
* advance_transition_function.
38893927
*/
3890-
if (pertrans->numSortCols == 0)
3928+
if (!pertrans->aggsortrequired)
38913929
{
38923930
if (pertrans->transtypeByVal)
38933931
{

Diff for: src/backend/executor/execExprInterp.c

+102
Original file line numberDiff line numberDiff line change
@@ -502,6 +502,8 @@ ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull)
502502
&&CASE_EEOP_AGG_PLAIN_TRANS_INIT_STRICT_BYREF,
503503
&&CASE_EEOP_AGG_PLAIN_TRANS_STRICT_BYREF,
504504
&&CASE_EEOP_AGG_PLAIN_TRANS_BYREF,
505+
&&CASE_EEOP_AGG_PRESORTED_DISTINCT_SINGLE,
506+
&&CASE_EEOP_AGG_PRESORTED_DISTINCT_MULTI,
505507
&&CASE_EEOP_AGG_ORDERED_TRANS_DATUM,
506508
&&CASE_EEOP_AGG_ORDERED_TRANS_TUPLE,
507509
&&CASE_EEOP_LAST
@@ -1786,6 +1788,28 @@ ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull)
17861788
EEO_NEXT();
17871789
}
17881790

1791+
EEO_CASE(EEOP_AGG_PRESORTED_DISTINCT_SINGLE)
1792+
{
1793+
AggStatePerTrans pertrans = op->d.agg_presorted_distinctcheck.pertrans;
1794+
AggState *aggstate = castNode(AggState, state->parent);
1795+
1796+
if (ExecEvalPreOrderedDistinctSingle(aggstate, pertrans))
1797+
EEO_NEXT();
1798+
else
1799+
EEO_JUMP(op->d.agg_presorted_distinctcheck.jumpdistinct);
1800+
}
1801+
1802+
EEO_CASE(EEOP_AGG_PRESORTED_DISTINCT_MULTI)
1803+
{
1804+
AggState *aggstate = castNode(AggState, state->parent);
1805+
AggStatePerTrans pertrans = op->d.agg_presorted_distinctcheck.pertrans;
1806+
1807+
if (ExecEvalPreOrderedDistinctMulti(aggstate, pertrans))
1808+
EEO_NEXT();
1809+
else
1810+
EEO_JUMP(op->d.agg_presorted_distinctcheck.jumpdistinct);
1811+
}
1812+
17891813
/* process single-column ordered aggregate datum */
17901814
EEO_CASE(EEOP_AGG_ORDERED_TRANS_DATUM)
17911815
{
@@ -4402,6 +4426,84 @@ ExecAggTransReparent(AggState *aggstate, AggStatePerTrans pertrans,
44024426
return newValue;
44034427
}
44044428

4429+
/*
4430+
* ExecEvalPreOrderedDistinctSingle
4431+
* Returns true when the aggregate transition value Datum is distinct
4432+
* from the previous input Datum and returns false when the input Datum
4433+
* matches the previous input Datum.
4434+
*/
4435+
bool
4436+
ExecEvalPreOrderedDistinctSingle(AggState *aggstate, AggStatePerTrans pertrans)
4437+
{
4438+
Datum value = pertrans->transfn_fcinfo->args[1].value;
4439+
bool isnull = pertrans->transfn_fcinfo->args[1].isnull;
4440+
4441+
if (!pertrans->haslast ||
4442+
pertrans->lastisnull != isnull ||
4443+
!DatumGetBool(FunctionCall2Coll(&pertrans->equalfnOne,
4444+
pertrans->aggCollation,
4445+
pertrans->lastdatum, value)))
4446+
{
4447+
if (pertrans->haslast && !pertrans->inputtypeByVal)
4448+
pfree(DatumGetPointer(pertrans->lastdatum));
4449+
4450+
pertrans->haslast = true;
4451+
if (!isnull)
4452+
{
4453+
MemoryContext oldContext;
4454+
4455+
oldContext = MemoryContextSwitchTo(aggstate->curaggcontext->ecxt_per_tuple_memory);
4456+
4457+
pertrans->lastdatum = datumCopy(value, pertrans->inputtypeByVal,
4458+
pertrans->inputtypeLen);
4459+
4460+
MemoryContextSwitchTo(oldContext);
4461+
}
4462+
else
4463+
pertrans->lastdatum = (Datum) 0;
4464+
pertrans->lastisnull = isnull;
4465+
return true;
4466+
}
4467+
4468+
return false;
4469+
}
4470+
4471+
/*
4472+
* ExecEvalPreOrderedDistinctMulti
4473+
* Returns true when the aggregate input is distinct from the previous
4474+
* input and returns false when the input matches the previous input.
4475+
*/
4476+
bool
4477+
ExecEvalPreOrderedDistinctMulti(AggState *aggstate, AggStatePerTrans pertrans)
4478+
{
4479+
ExprContext *tmpcontext = aggstate->tmpcontext;
4480+
4481+
for (int i = 0; i < pertrans->numTransInputs; i++)
4482+
{
4483+
pertrans->sortslot->tts_values[i] = pertrans->transfn_fcinfo->args[i + 1].value;
4484+
pertrans->sortslot->tts_isnull[i] = pertrans->transfn_fcinfo->args[i + 1].isnull;
4485+
}
4486+
4487+
ExecClearTuple(pertrans->sortslot);
4488+
pertrans->sortslot->tts_nvalid = pertrans->numInputs;
4489+
ExecStoreVirtualTuple(pertrans->sortslot);
4490+
4491+
tmpcontext->ecxt_outertuple = pertrans->sortslot;
4492+
tmpcontext->ecxt_innertuple = pertrans->uniqslot;
4493+
4494+
if (!pertrans->haslast ||
4495+
!ExecQual(pertrans->equalfnMulti, tmpcontext))
4496+
{
4497+
if (pertrans->haslast)
4498+
ExecClearTuple(pertrans->uniqslot);
4499+
4500+
pertrans->haslast = true;
4501+
ExecCopySlot(pertrans->uniqslot, pertrans->sortslot);
4502+
return true;
4503+
}
4504+
return false;
4505+
}
4506+
44054507
/*
44064508
* Invoke ordered transition function, with a datum argument.
44074509
*/

0 commit comments

Comments
 (0)