// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Tests for parquet content-defined chunking (CDC).
//!
//! These tests verify that CDC options are correctly wired through to the
//! parquet writer by inspecting file metadata (column chunk sizes, page
//! boundaries) on the written files.
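//!
//! Content-defined chunking chooses data page boundaries based on the data
//! itself rather than at fixed intervals, so files with mostly identical
//! content tend to produce identical pages (useful for deduplicating,
//! content-addressed storage).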

use arrow::array::{Int32Array, StringArray};
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;
use datafusion::prelude::{ParquetReadOptions, SessionContext};
use datafusion_common::config::{CdcOptions, TableParquetOptions};
use parquet::arrow::ArrowWriter;
use parquet::arrow::arrow_reader::ArrowReaderMetadata;
use parquet::file::properties::WriterProperties;
use std::fs::File;
use std::sync::Arc;
use tempfile::NamedTempFile;

/// Create a RecordBatch with enough data to exercise CDC chunking.
fn make_test_batch(num_rows: usize) -> RecordBatch {
    let ids: Vec<i32> = (0..num_rows as i32).collect();
    // ~100 bytes per row to generate enough data for CDC page splits
    let payloads: Vec<String> = (0..num_rows)
        .map(|i| format!("row-{i:06}-payload-{}", "x".repeat(80)))
        .collect();

    let schema = Arc::new(Schema::new(vec![
        Field::new("id", DataType::Int32, false),
        Field::new("payload", DataType::Utf8, false),
    ]));

    RecordBatch::try_new(
        schema,
        vec![
            Arc::new(Int32Array::from(ids)),
            Arc::new(StringArray::from(payloads)),
        ],
    )
    .unwrap()
}

/// Build WriterProperties from TableParquetOptions, exercising the same
/// code path that DataFusion's parquet sink uses.
fn writer_props(
    opts: &mut TableParquetOptions,
    schema: &Arc<Schema>,
) -> WriterProperties {
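    // Record the Arrow schema in the options before converting them, which
    // mirrors what the parquet sink does so the schema ends up in the file's
    // key-value metadata.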
    opts.arrow_schema(schema);
    parquet::file::properties::WriterPropertiesBuilder::try_from(
        opts as &TableParquetOptions,
    )
    .unwrap()
    .build()
}

/// Write a batch to a temp parquet file and return the file handle.
fn write_parquet_file(batch: &RecordBatch, props: WriterProperties) -> NamedTempFile {
    let tmp = tempfile::Builder::new()
        .suffix(".parquet")
        .tempfile()
        .unwrap();
    let mut writer =
        ArrowWriter::try_new(tmp.reopen().unwrap(), batch.schema(), Some(props)).unwrap();
    writer.write(batch).unwrap();
    writer.close().unwrap();
    tmp
}

/// Read parquet metadata from a file.
fn read_metadata(file: &NamedTempFile) -> parquet::file::metadata::ParquetMetaData {
    let f = File::open(file.path()).unwrap();
    let reader_meta = ArrowReaderMetadata::load(&f, Default::default()).unwrap();
    reader_meta.metadata().as_ref().clone()
}

/// Write parquet with CDC enabled, read it back via DataFusion, and verify
/// the data round-trips correctly.
#[tokio::test]
async fn cdc_data_round_trip() {
    let batch = make_test_batch(5000);

    let mut opts = TableParquetOptions::default();
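    // Enable CDC with its default settings; the exact chunk sizing is
    // whatever CdcOptions::default() provides, which is enough for a
    // round-trip check.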
    opts.global.use_content_defined_chunking = Some(CdcOptions::default());
    let props = writer_props(&mut opts, &batch.schema());

    let tmp = write_parquet_file(&batch, props);

    // Read back via DataFusion and verify the row count and id range
    let ctx = SessionContext::new();
    ctx.register_parquet(
        "data",
        tmp.path().to_str().unwrap(),
        ParquetReadOptions::default(),
    )
    .await
    .unwrap();

    let result = ctx
        .sql("SELECT COUNT(*), MIN(id), MAX(id) FROM data")
        .await
        .unwrap()
        .collect()
        .await
        .unwrap();

    let row = &result[0];
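    // COUNT(*) is produced as Int64, while MIN/MAX over the Int32 "id"
    // column keep the Int32 type, hence the two downcast targets below.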
    let count = row
        .column(0)
        .as_any()
        .downcast_ref::<arrow::array::Int64Array>()
        .unwrap()
        .value(0);
    let min_id = row
        .column(1)
        .as_any()
        .downcast_ref::<Int32Array>()
        .unwrap()
        .value(0);
    let max_id = row
        .column(2)
        .as_any()
        .downcast_ref::<Int32Array>()
        .unwrap()
        .value(0);

    assert_eq!(count, 5000);
    assert_eq!(min_id, 0);
    assert_eq!(max_id, 4999);
}

/// Verify that CDC options are reflected in the parquet file metadata.
/// With small chunk sizes, CDC should produce different page boundaries
/// compared to default (no CDC) writing.
#[tokio::test]
async fn cdc_affects_page_boundaries() {
    let batch = make_test_batch(5000);

    // Write WITHOUT CDC
    let mut no_cdc_opts = TableParquetOptions::default();
    let no_cdc_file =
        write_parquet_file(&batch, writer_props(&mut no_cdc_opts, &batch.schema()));
    let no_cdc_meta = read_metadata(&no_cdc_file);

    // Write WITH CDC using small chunk sizes to maximize the effect
    let mut cdc_opts = TableParquetOptions::default();
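    // Assumed field semantics (typical of FastCDC-style chunkers):
    // min/max_chunk_size bound each chunk's size in bytes, and norm_level
    // controls how tightly chunk sizes cluster around the target.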
    cdc_opts.global.use_content_defined_chunking = Some(CdcOptions {
        min_chunk_size: 512,
        max_chunk_size: 2048,
        norm_level: 0,
    });
    let cdc_file =
        write_parquet_file(&batch, writer_props(&mut cdc_opts, &batch.schema()));
    let cdc_meta = read_metadata(&cdc_file);

    // Both files should have the same number of rows
    assert_eq!(
        no_cdc_meta.file_metadata().num_rows(),
        cdc_meta.file_metadata().num_rows(),
    );

    // Compare the uncompressed sizes of each column chunk across all row
    // groups. A column chunk's uncompressed size includes page headers, so
    // different page boundaries change the page count and header overhead,
    // and CDC with small chunk sizes should therefore yield different totals.
    let no_cdc_sizes: Vec<i64> = no_cdc_meta
        .row_groups()
        .iter()
        .flat_map(|rg| rg.columns().iter().map(|c| c.uncompressed_size()))
        .collect();

    let cdc_sizes: Vec<i64> = cdc_meta
        .row_groups()
        .iter()
        .flat_map(|rg| rg.columns().iter().map(|c| c.uncompressed_size()))
        .collect();

    assert_ne!(
        no_cdc_sizes, cdc_sizes,
        "CDC with small chunk sizes should produce different page layouts \
         than default writing. no_cdc={no_cdc_sizes:?}, cdc={cdc_sizes:?}"
    );
}