apache · Jefffrey · Nov 14, 2025 · Nov 10, 2025 · Nov 10, 2025 · Nov 10, 2025
diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs
@@ -22,11 +22,12 @@ use datafusion_common::{
     internal_datafusion_err, internal_err, not_impl_err, plan_datafusion_err, plan_err,
     DFSchema, Dependency, Diagnostic, Result, Span,
 };
-use datafusion_expr::expr::{
-    NullTreatment, ScalarFunction, Unnest, WildcardOptions, WindowFunction,
+use datafusion_expr::{
+    expr,
+    expr::{NullTreatment, ScalarFunction, Unnest, WildcardOptions, WindowFunction},
+    planner::{PlannerResult, RawAggregateExpr, RawWindowExpr},
+    Expr, ExprSchemable, SortExpr, WindowFrame, WindowFunctionDefinition,
 };
-use datafusion_expr::planner::{PlannerResult, RawAggregateExpr, RawWindowExpr};
-use datafusion_expr::{expr, Expr, ExprSchemable, WindowFrame, WindowFunctionDefinition};
 use sqlparser::ast::{
     DuplicateTreatment, Expr as SQLExpr, Function as SQLFunction, FunctionArg,
     FunctionArgExpr, FunctionArgumentClause, FunctionArgumentList, FunctionArguments,
@@ -212,6 +213,9 @@ impl FunctionArgs {
     }
 }
 
+// Helper type for extracting WITHIN GROUP ordering and prepended args
+type WithinGroupExtraction = (Vec<SortExpr>, Vec<Expr>, Vec<Option<String>>);
+
 impl<S: ContextProvider> SqlToRel<'_, S> {
     pub(super) fn sql_function_to_expr(
         &self,
@@ -490,31 +494,30 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 let (mut args, mut arg_names) =
                     self.function_args_to_expr_with_names(args, schema, planner_context)?;
 
-                let order_by = if fm.supports_within_group_clause() {
-                    let within_group = self.order_by_to_sort_expr(
-                        within_group,
-                        schema,
-                        planner_context,
-                        false,
-                        None,
-                    )?;
-
-                    // Add the WITHIN GROUP ordering expressions to the front of the argument list
-                    // So function(arg) WITHIN GROUP (ORDER BY x) becomes function(x, arg)
-                    if !within_group.is_empty() {
-                        // Prepend None arg names for each WITHIN GROUP expression
-                        let within_group_count = within_group.len();
-                        arg_names = std::iter::repeat_n(None, within_group_count)
-                            .chain(arg_names)
-                            .collect();
-
-                        args = within_group
-                            .iter()
-                            .map(|sort| sort.expr.clone())
-                            .chain(args)
-                            .collect::<Vec<_>>();
-                    }
-                    within_group
+                // UDAFs must opt-in via `supports_within_group_clause()` to
+                // accept a WITHIN GROUP clause.
+                let supports_within_group = fm.supports_within_group_clause();
+
+                if !within_group.is_empty() && !supports_within_group {
+                    return plan_err!(
+                        "WITHIN GROUP is only supported for ordered-set aggregate functions"
+                    );
+                }
+
+                // If the UDAF supports WITHIN GROUP, convert the ordering into
+                // sort expressions and prepend them as unnamed function args.
+                let order_by = if supports_within_group {
+                    let (within_group_sorts, new_args, new_arg_names) = self
+                        .extract_and_prepend_within_group_args(
+                            within_group,
+                            args,
+                            arg_names,
+                            schema,
+                            planner_context,
+                        )?;
+                    args = new_args;
+                    arg_names = new_arg_names;
+                    within_group_sorts
                 } else {
                     let order_by = if !order_by.is_empty() {
                         order_by
@@ -807,6 +810,38 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
         Ok((exprs, names))
     }
 
+    fn extract_and_prepend_within_group_args(
+        &self,
+        within_group: Vec<OrderByExpr>,
+        mut args: Vec<Expr>,
+        mut arg_names: Vec<Option<String>>,
+        schema: &DFSchema,
+        planner_context: &mut PlannerContext,
+    ) -> Result<WithinGroupExtraction> {
+        let within_group = self.order_by_to_sort_expr(
+            within_group,
+            schema,
+            planner_context,
+            false,
+            None,
+        )?;
+
+        if !within_group.is_empty() {
+            let within_group_count = within_group.len();
+            arg_names = std::iter::repeat_n(None, within_group_count)
+                .chain(arg_names)
+                .collect();
+
+            args = within_group
+                .iter()
+                .map(|sort| sort.expr.clone())
+                .chain(args)
+                .collect::<Vec<_>>();
+        }
+
+        Ok((within_group, args, arg_names))
+    }
+
     pub(crate) fn check_unnest_arg(arg: &Expr, schema: &DFSchema) -> Result<()> {
         // Check argument type, array types are supported
         match arg.get_type(schema)? {

diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs
 # Not supported over sliding windows 
 query error DataFusion error: Error during planning: OVER and WITHIN GROUP clause cannot be used together. OVER is for window functions, whereas WITHIN GROUP is for ordered set aggregate functions 
 SELECT approx_percentile_cont(0.5) 
 WITHIN GROUP (ORDER BY c3) 
 OVER (ROWS BETWEEN 4 PRECEDING AND CURRENT ROW) 
 FROM aggregate_test_100 
 # Not supported over sliding windows 
 query error DataFusion error: Error during planning: OVER and WITHIN GROUP clause cannot be used together. OVER is for window functions, whereas WITHIN GROUP is for ordered set aggregate functions 
 SELECT approx_percentile_cont(0.5) 
 WITHIN GROUP (ORDER BY c3) 
 OVER (ROWS BETWEEN 4 PRECEDING AND CURRENT ROW) 
 FROM aggregate_test_100 
@@ -38,10 +38,12 @@ use datafusion_sql::{
 use crate::common::{CustomExprPlanner, CustomTypePlanner, MockSessionState};
 use datafusion_functions::core::planner::CoreFunctionPlanner;
 use datafusion_functions_aggregate::{
-    approx_median::approx_median_udaf, count::count_udaf, min_max::max_udaf,
-    min_max::min_udaf,
+    approx_median::approx_median_udaf,
+    average::avg_udaf,
+    count::count_udaf,
+    grouping::grouping_udaf,
+    min_max::{max_udaf, min_udaf},
 };
-use datafusion_functions_aggregate::{average::avg_udaf, grouping::grouping_udaf};
 use datafusion_functions_nested::make_array::make_array_udf;
 use datafusion_functions_window::{rank::rank_udwf, row_number::row_number_udwf};
 use insta::{allow_duplicates, assert_snapshot};
@@ -233,6 +235,22 @@ fn parse_ident_normalization_4() {
     );
 }
 
+#[test]
+fn within_group_rejected_for_non_ordered_set_udaf() {
+    // MIN is order-sensitive by nature but does not implement the
+    // ordered-set `WITHIN GROUP` opt-in. The planner must reject
+    // explicit `WITHIN GROUP` syntax for functions that do not
+    // advertise `supports_within_group_clause()`.
+    let sql = "SELECT min(c1) WITHIN GROUP (ORDER BY c1) FROM person";
+    let err = logical_plan(sql)
+        .expect_err("expected planning to fail for MIN WITHIN GROUP")
+        .to_string();
+    assert_contains!(
+        err,
+        "WITHIN GROUP is only supported for ordered-set aggregate functions"
+    );
+}
+
 #[test]
 fn parse_ident_normalization_5() {
     let sql = "SELECT AGE FROM PERSON";

diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt
@@ -129,6 +129,16 @@ CREATE TABLE group_median_table_nullable (
 # Error tests
 #######
 
+statement error DataFusion error: Error during planning: WITHIN GROUP is only supported for ordered-set aggregate functions
+SELECT SUM(c2) WITHIN GROUP (ORDER BY c2) FROM aggregate_test_100
+
+# WITHIN GROUP rejected for non-ordered-set UDAF
+# MIN does not implement ordered-set semantics (`supports_within_group_clause()`),
+# so the planner should reject the WITHIN GROUP syntax.
+statement error DataFusion error: Error during planning: WITHIN GROUP is only supported for ordered-set aggregate functions
+SELECT MIN(c) WITHIN GROUP (ORDER BY c) FROM (VALUES (1),(2)) as t(c);
+
+
 # https://github.com/apache/datafusion/issues/3353
 statement error DataFusion error: Schema error: Schema contains duplicate unqualified field name "approx_distinct\(aggregate_test_100\.c9\)"
 SELECT approx_distinct(c9) count_c9, approx_distinct(cast(c9 as varchar)) count_c9_str FROM aggregate_test_100
@@ -7867,17 +7877,15 @@ VALUES
 ----
 x 1
 
-query ?
+query error Error during planning: WITHIN GROUP is only supported for ordered-set aggregate functions
 SELECT array_agg(a_varchar) WITHIN GROUP (ORDER BY a_varchar)
 FROM (VALUES ('a'), ('d'), ('c'), ('a')) t(a_varchar);
-----
-[a, a, c, d]
 
-query ?
+
+query error Error during planning: WITHIN GROUP is only supported for ordered-set aggregate functions
 SELECT array_agg(DISTINCT a_varchar) WITHIN GROUP (ORDER BY a_varchar)
 FROM (VALUES ('a'), ('d'), ('c'), ('a')) t(a_varchar);
-----
-[a, c, d]
+
 
 query error Error during planning: ORDER BY and WITHIN GROUP clauses cannot be used together in the same aggregate function
 SELECT array_agg(a_varchar order by a_varchar) WITHIN GROUP (ORDER BY a_varchar)

diff --git a/dev/update_function_docs.sh b/dev/update_function_docs.sh
@@ -78,6 +78,36 @@ FROM employees;
 ```
 
 Note: When no rows pass the filter, `COUNT` returns `0` while `SUM`/`AVG`/`MIN`/`MAX` return `NULL`.
+
+## WITHIN GROUP / Ordered-set aggregates
+
+Some aggregate functions accept the SQL `WITHIN GROUP (ORDER BY ...)` clause to specify the ordering the
+aggregate relies on. In DataFusion this is opt-in: only aggregate functions whose implementation returns
+`true` from `AggregateUDFImpl::supports_within_group_clause()` accept the `WITHIN GROUP` clause. Attempting to
+use `WITHIN GROUP` with a regular aggregate (for example, `SELECT SUM(x) WITHIN GROUP (ORDER BY x)`) will fail
+during planning with an error: "WITHIN GROUP is only supported for ordered-set aggregate functions".
+
+Currently, the built-in aggregate functions that support `WITHIN GROUP` are:
+
+- `percentile_cont` — exact percentile aggregate (also available as `percentile_cont(column, percentile)`)
+- `approx_percentile_cont` — approximate percentile using the t-digest algorithm
+- `approx_percentile_cont_with_weight` — approximate weighted percentile using the t-digest algorithm
+
+Note: rank-like functions such as `rank()`, `dense_rank()`, and `percent_rank()` are window functions and
+use the `OVER (...)` clause; they are not ordered-set aggregates that accept `WITHIN GROUP` in DataFusion.
+
+Example (ordered-set aggregate):
+
+```sql
+percentile_cont(0.5) WITHIN GROUP (ORDER BY value)
+```
+
+Example (invalid usage — planner will error):
+
+```sql
+-- This will fail: SUM is not an ordered-set aggregate
+SELECT SUM(x) WITHIN GROUP (ORDER BY x) FROM t;
+```
 EOF
 
 echo "Running CLI and inserting aggregate function docs table"

diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md
@@ -25,6 +25,27 @@
 
 You can see the current [status of the `52.0.0` release here](https://github.com/apache/datafusion/issues/18566)
 
+### Planner now requires explicit opt-in for WITHIN GROUP syntax
+
+The SQL planner now enforces the aggregate UDF contract more strictly: the
+`WITHIN GROUP (ORDER BY ...)` syntax is accepted only if the aggregate UDAF
+explicitly advertises support by returning `true` from
+`AggregateUDFImpl::supports_within_group_clause()`.
+
+Previously the planner forwarded a `WITHIN GROUP` clause to order-sensitive
+aggregates even when they did not implement ordered-set semantics, which could
+cause queries such as `SUM(x) WITHIN GROUP (ORDER BY x)` to plan successfully.
+This behavior was too permissive and has been changed to match PostgreSQL and
+the documented semantics.
+
+Migration: If your UDAF intentionally implements ordered-set semantics and
+wants to accept the `WITHIN GROUP` SQL syntax, update your implementation to
+return `true` from `supports_within_group_clause()` and handle the ordering
+semantics in your accumulator implementation. If your UDAF is merely
+order-sensitive (but not an ordered-set aggregate), do not advertise
+`supports_within_group_clause()` and clients should use alternative function
+signatures (for example, explicit ordering as a function argument) instead.
+
 ### `AggregateUDFImpl::supports_null_handling_clause` now defaults to `false`
 
 This method specifies whether an aggregate function allows `IGNORE NULLS`/`RESPECT NULLS`

diff --git a/docs/source/user-guide/sql/aggregate_functions.md b/docs/source/user-guide/sql/aggregate_functions.md
@@ -48,6 +48,36 @@ FROM employees;
 
 Note: When no rows pass the filter, `COUNT` returns `0` while `SUM`/`AVG`/`MIN`/`MAX` return `NULL`.
 
+## WITHIN GROUP / Ordered-set aggregates
+
+Some aggregate functions accept the SQL `WITHIN GROUP (ORDER BY ...)` clause to specify the ordering the
+aggregate relies on. In DataFusion this is opt-in: only aggregate functions whose implementation returns
+`true` from `AggregateUDFImpl::supports_within_group_clause()` accept the `WITHIN GROUP` clause. Attempting to
+use `WITHIN GROUP` with a regular aggregate (for example, `SELECT SUM(x) WITHIN GROUP (ORDER BY x)`) will fail
+during planning with an error: "WITHIN GROUP is only supported for ordered-set aggregate functions".
+
+Currently, the built-in aggregate functions that support `WITHIN GROUP` are:
+
+- `percentile_cont` — exact percentile aggregate (also available as `percentile_cont(column, percentile)`)
+- `approx_percentile_cont` — approximate percentile using the t-digest algorithm
+- `approx_percentile_cont_with_weight` — approximate weighted percentile using the t-digest algorithm
+
+Note: rank-like functions such as `rank()`, `dense_rank()`, and `percent_rank()` are window functions and
+use the `OVER (...)` clause; they are not ordered-set aggregates that accept `WITHIN GROUP` in DataFusion.
+
+Example (ordered-set aggregate):
+
+```sql
+percentile_cont(0.5) WITHIN GROUP (ORDER BY value)
+```
+
+Example (invalid usage — planner will error):
+
+```sql
+-- This will fail: SUM is not an ordered-set aggregate
+SELECT SUM(x) WITHIN GROUP (ORDER BY x) FROM t;
+```
+
 ## General Functions
 
 - [array_agg](#array_agg)