Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
68cf7ae
feat: enforce WITHIN GROUP support for ordered-set aggregate function…
kosiew Nov 10, 2025
915e3bb
feat: add documentation for WITHIN GROUP support in ordered-set aggre…
kosiew Nov 10, 2025
1679849
feat: enforce restrictions on WITHIN GROUP support for order-insensit…
kosiew Nov 10, 2025
4f1431a
Enforce WITHIN GROUP clause for ordered-set UDAFs
kosiew Nov 11, 2025
5c33aa0
feat: add test case for rejection of WITHIN GROUP in non-ordered-set …
kosiew Nov 11, 2025
fcdce76
feat: update test case for WITHIN GROUP rejection in non-ordered-set …
kosiew Nov 11, 2025
5eaaddc
feat: add rejection test for WITHIN GROUP clause in non-ordered-set U…
kosiew Nov 11, 2025
9e71211
feat: add error handling for WITHIN GROUP clause in array_agg functio…
kosiew Nov 11, 2025
45bd1da
Improve error handling in within_group_udaf functions
kosiew Nov 11, 2025
62cc368
feat: update test case for percentile_cont UDAF to use c3 ordering in…
kosiew Nov 11, 2025
a2968ab
feat: add test for named argument behavior with WITHIN GROUP in perce…
kosiew Nov 11, 2025
494142f
refactor: clean up imports and remove redundant use statements in sql…
kosiew Nov 11, 2025
1811287
Refactor planner and improve readability
kosiew Nov 11, 2025
f79715a
refactor: introduce type alias WithinGroupExtraction to simplify retu…
kosiew Nov 11, 2025
193c6fc
clippy fix
kosiew Nov 11, 2025
a58a27f
Move WithinGroupExtraction type alias closer to fn
kosiew Nov 12, 2025
2c2ff39
refactor: clarify comments on UDAF WITHIN GROUP handling for better u…
kosiew Nov 12, 2025
752c78c
refactor: simplify argument handling in extract_and_prepend_within_gr…
kosiew Nov 12, 2025
333a3b8
tests(sqllogictest): remove unit tests duplicated by aggregate.slt (W…
kosiew Nov 12, 2025
1265c4b
sql: avoid unstable inherent associated type by moving WithinGroupExt…
kosiew Nov 12, 2025
566508b
Remove unused imports
kosiew Nov 12, 2025
c5dd922
cargo fmt
kosiew Nov 12, 2025
a28b463
docs: update upgrading guide for DataFusion 52.0.0 with explicit opt-…
kosiew Nov 13, 2025
6e10ca1
docs: clarify usage of WITHIN GROUP clause in aggregate functions doc…
kosiew Nov 13, 2025
b7183de
docs: enhance documentation generator for WITHIN GROUP clause in aggr…
kosiew Nov 13, 2025
2366a36
refactor: reorganize imports for clarity and consistency in function.rs
kosiew Nov 13, 2025
b11e66d
Merge branch 'main' into within-group-18109
kosiew Nov 13, 2025
6a90e31
npx prettier
kosiew Nov 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 64 additions & 29 deletions datafusion/sql/src/expr/function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,12 @@ use datafusion_common::{
internal_datafusion_err, internal_err, not_impl_err, plan_datafusion_err, plan_err,
DFSchema, Dependency, Diagnostic, Result, Span,
};
use datafusion_expr::expr::{
NullTreatment, ScalarFunction, Unnest, WildcardOptions, WindowFunction,
use datafusion_expr::{
expr,
expr::{NullTreatment, ScalarFunction, Unnest, WildcardOptions, WindowFunction},
planner::{PlannerResult, RawAggregateExpr, RawWindowExpr},
Expr, ExprSchemable, SortExpr, WindowFrame, WindowFunctionDefinition,
};
use datafusion_expr::planner::{PlannerResult, RawAggregateExpr, RawWindowExpr};
use datafusion_expr::{expr, Expr, ExprSchemable, WindowFrame, WindowFunctionDefinition};
use sqlparser::ast::{
DuplicateTreatment, Expr as SQLExpr, Function as SQLFunction, FunctionArg,
FunctionArgExpr, FunctionArgumentClause, FunctionArgumentList, FunctionArguments,
Expand Down Expand Up @@ -212,6 +213,9 @@ impl FunctionArgs {
}
}

// Helper type for extracting WITHIN GROUP ordering and prepended args.
//
// Tuple fields, in the order returned by `extract_and_prepend_within_group_args`:
//   0. the WITHIN GROUP ordering converted to sort expressions
//   1. the function arguments, with the ordering expressions prepended
//   2. the argument names, with one `None` prepended per ordering expression
type WithinGroupExtraction = (Vec<SortExpr>, Vec<Expr>, Vec<Option<String>>);

impl<S: ContextProvider> SqlToRel<'_, S> {
pub(super) fn sql_function_to_expr(
&self,
Expand Down Expand Up @@ -490,31 +494,30 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
let (mut args, mut arg_names) =
self.function_args_to_expr_with_names(args, schema, planner_context)?;

let order_by = if fm.supports_within_group_clause() {
let within_group = self.order_by_to_sort_expr(
within_group,
schema,
planner_context,
false,
None,
)?;

// Add the WITHIN GROUP ordering expressions to the front of the argument list
// So function(arg) WITHIN GROUP (ORDER BY x) becomes function(x, arg)
if !within_group.is_empty() {
// Prepend None arg names for each WITHIN GROUP expression
let within_group_count = within_group.len();
arg_names = std::iter::repeat_n(None, within_group_count)
.chain(arg_names)
.collect();

args = within_group
.iter()
.map(|sort| sort.expr.clone())
.chain(args)
.collect::<Vec<_>>();
}
within_group
// UDAFs must opt-in via `supports_within_group_clause()` to
// accept a WITHIN GROUP clause.
let supports_within_group = fm.supports_within_group_clause();

if !within_group.is_empty() && !supports_within_group {
return plan_err!(
"WITHIN GROUP is only supported for ordered-set aggregate functions"
);
}

// If the UDAF supports WITHIN GROUP, convert the ordering into
// sort expressions and prepend them as unnamed function args.
let order_by = if supports_within_group {
let (within_group_sorts, new_args, new_arg_names) = self
.extract_and_prepend_within_group_args(
within_group,
args,
arg_names,
schema,
planner_context,
)?;
args = new_args;
arg_names = new_arg_names;
within_group_sorts
} else {
let order_by = if !order_by.is_empty() {
order_by
Expand Down Expand Up @@ -807,6 +810,38 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
Ok((exprs, names))
}

/// Converts a `WITHIN GROUP (ORDER BY ...)` clause into sort expressions and
/// prepends the ordering expressions to the function's argument list.
///
/// `function(arg) WITHIN GROUP (ORDER BY x)` is rewritten so that the
/// arguments become `(x, arg)`. Each prepended argument gets a `None` entry
/// in `arg_names`, i.e. ordering expressions are always unnamed.
///
/// Returns `(sort_exprs, args, arg_names)`. When the clause yields no sort
/// expressions, `args` and `arg_names` are handed back untouched.
///
/// # Errors
///
/// Propagates any error returned by `order_by_to_sort_expr`.
fn extract_and_prepend_within_group_args(
    &self,
    within_group: Vec<OrderByExpr>,
    args: Vec<Expr>,
    arg_names: Vec<Option<String>>,
    schema: &DFSchema,
    planner_context: &mut PlannerContext,
) -> Result<WithinGroupExtraction> {
    let sort_exprs =
        self.order_by_to_sort_expr(within_group, schema, planner_context, false, None)?;

    // Nothing to prepend: return the inputs as they were given.
    if sort_exprs.is_empty() {
        return Ok((sort_exprs, args, arg_names));
    }

    let ordering_count = sort_exprs.len();

    // Ordering expressions first, followed by the original arguments.
    let mut prepended_args: Vec<Expr> = Vec::with_capacity(ordering_count + args.len());
    prepended_args.extend(sort_exprs.iter().map(|sort| sort.expr.clone()));
    prepended_args.extend(args);

    // Keep `arg_names` aligned with `args`: one unnamed slot per ordering expression.
    let mut prepended_names: Vec<Option<String>> =
        Vec::with_capacity(ordering_count + arg_names.len());
    prepended_names.extend(std::iter::repeat_n(None, ordering_count));
    prepended_names.extend(arg_names);

    Ok((sort_exprs, prepended_args, prepended_names))
}

pub(crate) fn check_unnest_arg(arg: &Expr, schema: &DFSchema) -> Result<()> {
// Check argument type, array types are supported
match arg.get_type(schema)? {
Expand Down
24 changes: 21 additions & 3 deletions datafusion/sql/tests/sql_integration.rs
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure adding these tests here is necessary; SLTs should be sufficient, and I believe some of the tests added here are already in SLTs, e.g.

# Not supported over sliding windows
query error DataFusion error: Error during planning: OVER and WITHIN GROUP clause cannot be used together. OVER is for window functions, whereas WITHIN GROUP is for ordered set aggregate functions
SELECT approx_percentile_cont(0.5)
WITHIN GROUP (ORDER BY c3)
OVER (ROWS BETWEEN 4 PRECEDING AND CURRENT ROW)
FROM aggregate_test_100

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed the tests that are already covered in slt.

Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,12 @@ use datafusion_sql::{
use crate::common::{CustomExprPlanner, CustomTypePlanner, MockSessionState};
use datafusion_functions::core::planner::CoreFunctionPlanner;
use datafusion_functions_aggregate::{
approx_median::approx_median_udaf, count::count_udaf, min_max::max_udaf,
min_max::min_udaf,
approx_median::approx_median_udaf,
average::avg_udaf,
count::count_udaf,
grouping::grouping_udaf,
min_max::{max_udaf, min_udaf},
};
use datafusion_functions_aggregate::{average::avg_udaf, grouping::grouping_udaf};
use datafusion_functions_nested::make_array::make_array_udf;
use datafusion_functions_window::{rank::rank_udwf, row_number::row_number_udwf};
use insta::{allow_duplicates, assert_snapshot};
Expand Down Expand Up @@ -233,6 +235,22 @@ fn parse_ident_normalization_4() {
);
}

#[test]
fn within_group_rejected_for_non_ordered_set_udaf() {
    // `min` is used here as an aggregate that does not opt in to the
    // ordered-set `WITHIN GROUP` syntax. The planner must reject an explicit
    // `WITHIN GROUP` clause for any function that does not advertise
    // `supports_within_group_clause()`.
    let planned = logical_plan("SELECT min(c1) WITHIN GROUP (ORDER BY c1) FROM person");
    let message = planned
        .expect_err("expected planning to fail for MIN WITHIN GROUP")
        .to_string();
    assert_contains!(
        message,
        "WITHIN GROUP is only supported for ordered-set aggregate functions"
    );
}

#[test]
fn parse_ident_normalization_5() {
let sql = "SELECT AGE FROM PERSON";
Expand Down
20 changes: 14 additions & 6 deletions datafusion/sqllogictest/test_files/aggregate.slt
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,16 @@ CREATE TABLE group_median_table_nullable (
# Error tests
#######

statement error DataFusion error: Error during planning: WITHIN GROUP is only supported for ordered-set aggregate functions
SELECT SUM(c2) WITHIN GROUP (ORDER BY c2) FROM aggregate_test_100

# WITHIN GROUP rejected for non-ordered-set UDAF
# MIN does not implement ordered-set semantics (`supports_within_group_clause()`),
# so the planner should reject the WITHIN GROUP syntax.
statement error DataFusion error: Error during planning: WITHIN GROUP is only supported for ordered-set aggregate functions
SELECT MIN(c) WITHIN GROUP (ORDER BY c) FROM (VALUES (1),(2)) as t(c);


# https://github.com/apache/datafusion/issues/3353
statement error DataFusion error: Schema error: Schema contains duplicate unqualified field name "approx_distinct\(aggregate_test_100\.c9\)"
SELECT approx_distinct(c9) count_c9, approx_distinct(cast(c9 as varchar)) count_c9_str FROM aggregate_test_100
Expand Down Expand Up @@ -7867,17 +7877,15 @@ VALUES
----
x 1

query ?
query error Error during planning: WITHIN GROUP is only supported for ordered-set aggregate functions
SELECT array_agg(a_varchar) WITHIN GROUP (ORDER BY a_varchar)
FROM (VALUES ('a'), ('d'), ('c'), ('a')) t(a_varchar);
----
[a, a, c, d]

query ?

query error Error during planning: WITHIN GROUP is only supported for ordered-set aggregate functions
SELECT array_agg(DISTINCT a_varchar) WITHIN GROUP (ORDER BY a_varchar)
FROM (VALUES ('a'), ('d'), ('c'), ('a')) t(a_varchar);
----
[a, c, d]


query error Error during planning: ORDER BY and WITHIN GROUP clauses cannot be used together in the same aggregate function
SELECT array_agg(a_varchar order by a_varchar) WITHIN GROUP (ORDER BY a_varchar)
Expand Down
30 changes: 30 additions & 0 deletions dev/update_function_docs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,36 @@ FROM employees;
```

Note: When no rows pass the filter, `COUNT` returns `0` while `SUM`/`AVG`/`MIN`/`MAX` return `NULL`.

## WITHIN GROUP / Ordered-set aggregates

Some aggregate functions accept the SQL `WITHIN GROUP (ORDER BY ...)` clause to specify the ordering the
aggregate relies on. In DataFusion this is opt-in: only aggregate functions whose implementation returns
`true` from `AggregateUDFImpl::supports_within_group_clause()` accept the `WITHIN GROUP` clause. Attempting to
use `WITHIN GROUP` with a regular aggregate (for example, `SELECT SUM(x) WITHIN GROUP (ORDER BY x)`) will fail
during planning with an error: "WITHIN GROUP is only supported for ordered-set aggregate functions".

Currently, the built-in aggregate functions that support `WITHIN GROUP` are:

- `percentile_cont` — exact percentile aggregate (also available as `percentile_cont(column, percentile)`)
- `approx_percentile_cont` — approximate percentile using the t-digest algorithm
- `approx_percentile_cont_with_weight` — approximate weighted percentile using the t-digest algorithm

Note: rank-like functions such as `rank()`, `dense_rank()`, and `percent_rank()` are window functions and
use the `OVER (...)` clause; they are not ordered-set aggregates that accept `WITHIN GROUP` in DataFusion.

Example (ordered-set aggregate):

```sql
percentile_cont(0.5) WITHIN GROUP (ORDER BY value)
```

Example (invalid usage — planner will error):

```sql
-- This will fail: SUM is not an ordered-set aggregate
SELECT SUM(x) WITHIN GROUP (ORDER BY x) FROM t;
```
EOF

echo "Running CLI and inserting aggregate function docs table"
Expand Down
21 changes: 21 additions & 0 deletions docs/source/library-user-guide/upgrading.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,27 @@

You can see the current [status of the `52.0.0` release here](https://github.com/apache/datafusion/issues/18566)

### Planner now requires explicit opt-in for WITHIN GROUP syntax

The SQL planner now enforces the aggregate UDF contract more strictly: the
`WITHIN GROUP (ORDER BY ...)` syntax is accepted only if the UDAF explicitly
advertises support by returning `true` from
`AggregateUDFImpl::supports_within_group_clause()`.

Previously the planner forwarded a `WITHIN GROUP` clause to order-sensitive
aggregates even when they did not implement ordered-set semantics, which could
cause queries such as `SUM(x) WITHIN GROUP (ORDER BY x)` to plan successfully.
This behavior was too permissive and has been changed to match PostgreSQL and
the documented semantics.

Migration: If your UDAF intentionally implements ordered-set semantics and
should accept the `WITHIN GROUP` SQL syntax, update your implementation to
return `true` from `supports_within_group_clause()` and handle the ordering
semantics in your accumulator implementation. If your UDAF is merely
order-sensitive (but not an ordered-set aggregate), do not return `true` from
`supports_within_group_clause()`; callers should instead use an alternative
function signature (for example, explicit ordering as a function argument)
rather than `WITHIN GROUP`.

### `AggregateUDFImpl::supports_null_handling_clause` now defaults to `false`

This method specifies whether an aggregate function allows `IGNORE NULLS`/`RESPECT NULLS`
Expand Down
30 changes: 30 additions & 0 deletions docs/source/user-guide/sql/aggregate_functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,36 @@ FROM employees;

Note: When no rows pass the filter, `COUNT` returns `0` while `SUM`/`AVG`/`MIN`/`MAX` return `NULL`.

## WITHIN GROUP / Ordered-set aggregates

Some aggregate functions accept the SQL `WITHIN GROUP (ORDER BY ...)` clause to specify the ordering the
aggregate relies on. In DataFusion this is opt-in: only aggregate functions whose implementation returns
`true` from `AggregateUDFImpl::supports_within_group_clause()` accept the `WITHIN GROUP` clause. Attempting to
use `WITHIN GROUP` with a regular aggregate (for example, `SELECT SUM(x) WITHIN GROUP (ORDER BY x)`) will fail
during planning with an error: "WITHIN GROUP is only supported for ordered-set aggregate functions".

Currently, the built-in aggregate functions that support `WITHIN GROUP` are:

- `percentile_cont` — exact percentile aggregate (also available as `percentile_cont(column, percentile)`)
- `approx_percentile_cont` — approximate percentile using the t-digest algorithm
- `approx_percentile_cont_with_weight` — approximate weighted percentile using the t-digest algorithm

Note: rank-like functions such as `rank()`, `dense_rank()`, and `percent_rank()` are window functions and
use the `OVER (...)` clause; they are not ordered-set aggregates that accept `WITHIN GROUP` in DataFusion.

Example (ordered-set aggregate):

```sql
percentile_cont(0.5) WITHIN GROUP (ORDER BY value)
```

Example (invalid usage — planner will error):

```sql
-- This will fail: SUM is not an ordered-set aggregate
SELECT SUM(x) WITHIN GROUP (ORDER BY x) FROM t;
```

## General Functions

- [array_agg](#array_agg)
Expand Down