Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 79 additions & 7 deletions datafusion/expr/src/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -512,17 +512,26 @@ pub type SchemaFieldMetadata = std::collections::HashMap<String, String>;
pub fn intersect_metadata_for_union<'a>(
metadatas: impl IntoIterator<Item = &'a SchemaFieldMetadata>,
) -> SchemaFieldMetadata {
let mut metadatas = metadatas.into_iter();
let Some(mut intersected) = metadatas.next().cloned() else {
return Default::default();
};
let mut intersected: Option<SchemaFieldMetadata> = None;

for metadata in metadatas {
// Only keep keys that exist in both with the same value
intersected.retain(|k, v| metadata.get(k) == Some(v));
// Skip empty metadata (e.g. from NULL literals or computed expressions)
// to avoid dropping metadata from branches that have it.
if metadata.is_empty() {
continue;
}
match &mut intersected {
None => {
intersected = Some(metadata.clone());
}
Some(current) => {
// Only keep keys that exist in both with the same value
current.retain(|k, v| metadata.get(k) == Some(v));
}
}
}

intersected
intersected.unwrap_or_default()
}

/// UNNEST expression.
Expand Down Expand Up @@ -4127,4 +4136,67 @@ mod test {
}
}
}

mod intersect_metadata_tests {
use super::super::intersect_metadata_for_union;
use std::collections::HashMap;

#[test]
fn all_branches_same_metadata() {
let m1 = HashMap::from([("key".into(), "val".into())]);
let m2 = HashMap::from([("key".into(), "val".into())]);
let result = intersect_metadata_for_union([&m1, &m2]);
assert_eq!(result, HashMap::from([("key".into(), "val".into())]));
}

#[test]
fn conflicting_metadata_dropped() {
let m1 = HashMap::from([("key".into(), "a".into())]);
let m2 = HashMap::from([("key".into(), "b".into())]);
let result = intersect_metadata_for_union([&m1, &m2]);
assert!(result.is_empty());
}

#[test]
fn empty_metadata_branch_skipped() {
let m1 = HashMap::from([("key".into(), "val".into())]);
let m2 = HashMap::new(); // e.g. NULL literal
let result = intersect_metadata_for_union([&m1, &m2]);
assert_eq!(result, HashMap::from([("key".into(), "val".into())]));
}

#[test]
fn empty_metadata_first_branch_skipped() {
let m1 = HashMap::new();
let m2 = HashMap::from([("key".into(), "val".into())]);
let result = intersect_metadata_for_union([&m1, &m2]);
assert_eq!(result, HashMap::from([("key".into(), "val".into())]));
}

#[test]
fn all_branches_empty_metadata() {
let m1: HashMap<String, String> = HashMap::new();
let m2: HashMap<String, String> = HashMap::new();
let result = intersect_metadata_for_union([&m1, &m2]);
assert!(result.is_empty());
}

#[test]
fn mixed_empty_and_conflicting() {
let m1 = HashMap::from([("key".into(), "a".into())]);
let m2 = HashMap::new();
let m3 = HashMap::from([("key".into(), "b".into())]);
let result = intersect_metadata_for_union([&m1, &m2, &m3]);
// m2 is skipped; m1 and m3 conflict → dropped
assert!(result.is_empty());
}

#[test]
fn no_inputs() {
let result = intersect_metadata_for_union(std::iter::empty::<
&HashMap<String, String>,
>());
assert!(result.is_empty());
}
}
}
17 changes: 17 additions & 0 deletions datafusion/sqllogictest/test_files/metadata.slt
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,23 @@ ORDER BY id, name, l_name;
NULL bar NULL
NULL NULL l_bar

# Regression test: UNION ALL + optimize_projections pruning unused columns causes
# metadata loss when one branch has NULL literals (empty metadata) and the other
# has field metadata. The optimizer prunes unused columns, triggering recompute_schema
# which rebuilds the Union schema via intersect_metadata_for_union. Previously, this
# intersection would drop metadata when any branch had empty metadata (from NULL literals).
# See https://github.com/apache/datafusion/issues/19049
query T
SELECT name FROM (
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I verified that this test fails without the fix:

  Internal error: Physical input schema should be the same as the one converted from logical input schema.
  Differences:
    - schema metadata differs: (physical) {"metadata_key": "the entire schema"} vs (logical) {}
    - field metadata at index 0 [name]: (physical) {"metadata_key": "the name field"} vs (logical) {}.

SELECT id, name FROM "table_with_metadata"
UNION ALL
SELECT NULL::int as id, NULL::string as name
) GROUP BY name ORDER BY name;
----
bar
baz
NULL

# Regression test: missing field metadata from left side of the union when right side is chosen
query T
select name from (
Expand Down
Loading