diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index 999e118367ac..70fcbdb66f95 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -46,7 +46,7 @@ pub mod variant_get; mod variant_to_arrow; pub use variant_array::{ShreddingState, VariantArray}; -pub use variant_array_builder::VariantArrayBuilder; +pub use variant_array_builder::{VariantArrayBuilder, VariantValueArrayBuilder}; pub use cast_to_variant::{cast_to_variant, cast_to_variant_with_options}; pub use from_json::json_to_variant; diff --git a/parquet-variant-compute/src/variant_array_builder.rs b/parquet-variant-compute/src/variant_array_builder.rs index 9779d4a06d4a..6451e3565802 100644 --- a/parquet-variant-compute/src/variant_array_builder.rs +++ b/parquet-variant-compute/src/variant_array_builder.rs @@ -23,7 +23,9 @@ use arrow_schema::{ArrowError, DataType, Field, Fields}; use parquet_variant::{ BuilderSpecificState, ListBuilder, MetadataBuilder, ObjectBuilder, Variant, VariantBuilderExt, }; -use parquet_variant::{ParentState, ValueBuilder, WritableMetadataBuilder}; +use parquet_variant::{ + ParentState, ReadOnlyMetadataBuilder, ValueBuilder, WritableMetadataBuilder, +}; use std::sync::Arc; /// A builder for [`VariantArray`] @@ -205,6 +207,154 @@ impl VariantBuilderExt for VariantArrayBuilder { } } +/// A builder for creating only the value column of a [`VariantArray`] +/// +/// This builder is used when you have existing metadata and only need to build +/// the value column. It's useful for scenarios like variant unshredding, data +/// transformation, or filtering where you want to reuse existing metadata. +/// +/// The builder produces a [`BinaryViewArray`] that can be combined with existing +/// metadata to create a complete [`VariantArray`]. +/// +/// # Example: +/// ``` +/// # use arrow::array::Array; +/// # use parquet_variant::{Variant}; +/// # use parquet_variant_compute::VariantValueArrayBuilder; +/// // Create a variant value builder for 10 rows +/// let mut builder = VariantValueArrayBuilder::new(10); +/// +/// // Append some values with their corresponding metadata, which the +/// // builder takes advantage of to avoid creating new metadata. +/// builder.append_value(Variant::from(42)); +/// builder.append_null(); +/// builder.append_value(Variant::from("hello")); +/// +/// // Build the final value array +/// let value_array = builder.build().unwrap(); +/// assert_eq!(value_array.len(), 3); +/// ``` +#[derive(Debug)] +pub struct VariantValueArrayBuilder { + value_builder: ValueBuilder, + value_offsets: Vec, + nulls: NullBufferBuilder, +} + +impl VariantValueArrayBuilder { + /// Create a new `VariantValueArrayBuilder` with the specified row capacity + pub fn new(row_capacity: usize) -> Self { + Self { + value_builder: ValueBuilder::new(), + value_offsets: Vec::with_capacity(row_capacity), + nulls: NullBufferBuilder::new(row_capacity), + } + } + + /// Build the final value array + /// + /// Returns a [`BinaryViewArray`] containing the serialized variant values. + /// This can be combined with existing metadata to create a complete [`VariantArray`]. + pub fn build(mut self) -> Result { + let value_buffer = self.value_builder.into_inner(); + let mut array = binary_view_array_from_buffers(value_buffer, self.value_offsets); + if let Some(nulls) = self.nulls.finish() { + let (views, buffers, _) = array.into_parts(); + array = BinaryViewArray::try_new(views, buffers, Some(nulls))?; + } + Ok(array) + } + + /// Append a null row to the builder + /// + /// WARNING: It is only valid to call this method when building the `value` field of a shredded + /// variant column (which is nullable). The `value` field of a binary (unshredded) variant + /// column is non-nullable, and callers should instead invoke [`Self::append_value`] with + /// `Variant::Null`, passing the appropriate metadata value. + pub fn append_null(&mut self) { + self.value_offsets.push(self.value_builder.offset()); + self.nulls.append_null(); + } + + /// Append a variant value with its corresponding metadata + /// + /// # Arguments + /// * `value` - The variant value to append + /// * `metadata` - The metadata dictionary for this variant (used for field name resolution) + /// + /// # Returns + /// * `Ok(())` if the value was successfully appended + /// * `Err(ArrowError)` if the variant contains field names not found in the metadata + /// + /// # Example + /// ``` + /// # use parquet_variant::Variant; + /// # use parquet_variant_compute::VariantValueArrayBuilder; + /// let mut builder = VariantValueArrayBuilder::new(10); + /// builder.append_value(Variant::from(42)); + /// ``` + pub fn append_value(&mut self, value: Variant<'_, '_>) { + let mut metadata_builder = ReadOnlyMetadataBuilder::new(value.metadata().clone()); + ValueBuilder::append_variant_bytes(self.parent_state(&mut metadata_builder), value); + } + + /// Creates a builder-specific parent state. + /// + /// For example, this can be useful for code that wants to copy a subset of fields from an + /// object `value` as a new row of `value_array_builder`: + /// + /// ```no_run + /// # use parquet_variant::{ObjectBuilder, ReadOnlyMetadataBuilder, Variant}; + /// # use parquet_variant_compute::VariantValueArrayBuilder; + /// # let value = Variant::Null; + /// # let mut value_array_builder = VariantValueArrayBuilder::new(0); + /// # fn should_keep(field_name: &str) -> bool { todo!() }; + /// let Variant::Object(obj) = value else { + /// panic!("Not a variant object"); + /// }; + /// let mut metadata_builder = ReadOnlyMetadataBuilder::new(obj.metadata.clone()); + /// let state = value_array_builder.parent_state(&mut metadata_builder); + /// let mut object_builder = ObjectBuilder::new(state, false); + /// for (field_name, field_value) in obj.iter() { + /// if should_keep(field_name) { + /// object_builder.insert_bytes(field_name, field_value); + /// } + /// } + /// object_builder.finish(); // appends the filtered object + /// ``` + pub fn parent_state<'a>( + &'a mut self, + metadata_builder: &'a mut dyn MetadataBuilder, + ) -> ParentState<'a, ValueArrayBuilderState<'a>> { + let state = ValueArrayBuilderState { + value_offsets: &mut self.value_offsets, + nulls: &mut self.nulls, + }; + + ParentState::new(&mut self.value_builder, metadata_builder, state) + } +} + +/// Builder-specific state for array building that manages array-level offsets and nulls. See +/// [`VariantBuilderExt`] for details. +#[derive(Debug)] +pub struct ValueArrayBuilderState<'a> { + value_offsets: &'a mut Vec, + nulls: &'a mut NullBufferBuilder, +} + +// All changes are pending until finalized +impl BuilderSpecificState for ValueArrayBuilderState<'_> { + fn finish( + &mut self, + _metadata_builder: &mut dyn MetadataBuilder, + value_builder: &mut ValueBuilder, + ) { + self.value_offsets.push(value_builder.offset()); + self.nulls.append_non_null(); + } +} + fn binary_view_array_from_buffers(buffer: Vec, offsets: Vec) -> BinaryViewArray { // All offsets are less than or equal to the buffer length, so we can safely cast all offsets // inside the loop below, as long as the buffer length fits in u32. @@ -228,6 +378,7 @@ fn binary_view_array_from_buffers(buffer: Vec, offsets: Vec) -> Binar mod test { use super::*; use arrow::array::Array; + use parquet_variant::Variant; /// Test that both the metadata and value buffers are non nullable #[test] @@ -288,4 +439,92 @@ mod test { let list = variant.as_list().expect("variant to be a list"); assert_eq!(list.len(), 2); } + + #[test] + fn test_variant_value_array_builder_basic() { + let mut builder = VariantValueArrayBuilder::new(10); + + // Append some values + builder.append_value(Variant::from(42i32)); + builder.append_null(); + builder.append_value(Variant::from("hello")); + + let value_array = builder.build().unwrap(); + assert_eq!(value_array.len(), 3); + } + + #[test] + fn test_variant_value_array_builder_with_objects() { + // Populate a variant array with objects + let mut builder = VariantArrayBuilder::new(3); + builder + .new_object() + .with_field("name", "Alice") + .with_field("age", 30i32) + .finish(); + + builder + .new_object() + .with_field("name", "Bob") + .with_field("age", 42i32) + .with_field("city", "Wonderland") + .finish(); + + builder + .new_object() + .with_field("name", "Charlie") + .with_field("age", 1i32) + .finish(); + + let array = builder.build(); + + // Copy (some of) the objects over to the value array builder + // + // NOTE: Because we will reuse the metadata column, we cannot reorder rows. We can only + // filter or manipulate values within a row. + let mut builder = VariantValueArrayBuilder::new(3); + + // straight copy + builder.append_value(array.value(0)); + + // filtering fields takes more work because we need to manually create an object builder + let value = array.value(1); + let mut metadata_builder = ReadOnlyMetadataBuilder::new(value.metadata().clone()); + let state = builder.parent_state(&mut metadata_builder); + ObjectBuilder::new(state, false) + .with_field("name", value.get_object_field("name").unwrap()) + .with_field("age", value.get_object_field("age").unwrap()) + .finish(); + + // same bytes, but now nested and duplicated inside a list + let value = array.value(2); + let mut metadata_builder = ReadOnlyMetadataBuilder::new(value.metadata().clone()); + let state = builder.parent_state(&mut metadata_builder); + ListBuilder::new(state, false) + .with_value(value.clone()) + .with_value(value.clone()) + .finish(); + + let array2 = VariantArray::from_parts( + array.metadata_field().clone(), + Some(builder.build().unwrap()), + None, + None, + ); + + assert_eq!(array2.len(), 3); + assert_eq!(array.value(0), array2.value(0)); + + assert_eq!( + array.value(1).get_object_field("name"), + array2.value(1).get_object_field("name") + ); + assert_eq!( + array.value(1).get_object_field("age"), + array2.value(1).get_object_field("age") + ); + + assert_eq!(array.value(2), array2.value(2).get_list_element(0).unwrap()); + assert_eq!(array.value(2), array2.value(2).get_list_element(1).unwrap()); + } } diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index a5819fc45937..01af46e564f2 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -135,8 +135,13 @@ fn shredded_get_path( let shred_basic_variant = |target: VariantArray, path: VariantPath<'_>, as_field: Option<&Field>| { let as_type = as_field.map(|f| f.data_type()); - let mut builder = - make_variant_to_arrow_row_builder(path, as_type, cast_options, target.len())?; + let mut builder = make_variant_to_arrow_row_builder( + target.metadata_field(), + path, + as_type, + cast_options, + target.len(), + )?; for i in 0..target.len() { if target.is_null(i) { builder.append_null()?; diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index 60f74e365dd4..b24f2d805d3e 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -15,14 +15,14 @@ // specific language governing permissions and limitations // under the License. -use arrow::array::{ArrayRef, PrimitiveBuilder}; +use arrow::array::{ArrayRef, BinaryViewArray, NullBufferBuilder, PrimitiveBuilder}; use arrow::compute::CastOptions; use arrow::datatypes::{self, ArrowPrimitiveType, DataType}; use arrow::error::{ArrowError, Result}; use parquet_variant::{Variant, VariantPath}; use crate::type_conversion::VariantAsPrimitive; -use crate::VariantArrayBuilder; +use crate::{VariantArray, VariantValueArrayBuilder}; use std::sync::Arc; @@ -93,7 +93,7 @@ impl<'a> VariantToArrowRowBuilder<'a> { } pub(crate) fn make_variant_to_arrow_row_builder<'a>( - //metadata: &BinaryViewArray, + metadata: &BinaryViewArray, path: VariantPath<'a>, data_type: Option<&'a DataType>, cast_options: &'a CastOptions, @@ -103,7 +103,10 @@ pub(crate) fn make_variant_to_arrow_row_builder<'a>( let mut builder = match data_type { // If no data type was requested, build an unshredded VariantArray. - None => BinaryVariant(VariantToBinaryVariantArrowRowBuilder::new(capacity)), + None => BinaryVariant(VariantToBinaryVariantArrowRowBuilder::new( + metadata.clone(), + capacity, + )), Some(DataType::Int8) => Int8(VariantToPrimitiveArrowRowBuilder::new( cast_options, capacity, @@ -246,13 +249,17 @@ where /// Builder for creating VariantArray output (for path extraction without type conversion) pub(crate) struct VariantToBinaryVariantArrowRowBuilder { - builder: VariantArrayBuilder, + metadata: BinaryViewArray, + builder: VariantValueArrayBuilder, + nulls: NullBufferBuilder, } impl VariantToBinaryVariantArrowRowBuilder { - fn new(capacity: usize) -> Self { + fn new(metadata: BinaryViewArray, capacity: usize) -> Self { Self { - builder: VariantArrayBuilder::new(capacity), + metadata, + builder: VariantValueArrayBuilder::new(capacity), + nulls: NullBufferBuilder::new(capacity), } } } @@ -260,22 +267,22 @@ impl VariantToBinaryVariantArrowRowBuilder { impl VariantToBinaryVariantArrowRowBuilder { fn append_null(&mut self) -> Result<()> { self.builder.append_null(); + self.nulls.append_null(); Ok(()) } fn append_value(&mut self, value: &Variant<'_, '_>) -> Result { - // TODO: We need a way to convert a Variant directly to bytes. In particular, we want to - // just copy across the underlying value byte slice of a `Variant::Object` or - // `Variant::List`, without any interaction with a `VariantMetadata` (because the shredding - // spec requires us to reuse the existing metadata when unshredding). - // - // One could _probably_ emulate this with parquet_variant::VariantBuilder, but it would do a - // lot of unnecessary work and would also create a new metadata column we don't need. - self.builder.append_variant(value.clone()); + self.builder.append_value(value.clone()); + self.nulls.append_non_null(); Ok(true) } - fn finish(self) -> Result { - Ok(Arc::new(self.builder.build())) + fn finish(mut self) -> Result { + Ok(Arc::new(VariantArray::from_parts( + self.metadata, + Some(self.builder.build()?), + None, // no typed_value column + self.nulls.finish(), + ))) } } diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 93e736285853..1480d6400db1 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -562,7 +562,7 @@ pub struct WritableMetadataBuilder { impl WritableMetadataBuilder { /// Upsert field name to dictionary, return its ID - fn upsert_field_name(&mut self, field_name: &str) -> u32 { + pub fn upsert_field_name(&mut self, field_name: &str) -> u32 { let (id, new_entry) = self.field_names.insert_full(field_name.to_string()); if new_entry { diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index faaab94bc3fd..4aff866335bb 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -1320,14 +1320,12 @@ impl<'m, 'v> Variant<'m, 'v> { } } - /// Return the metadata associated with this variant, if any. - /// - /// Returns `Some(&VariantMetadata)` for object and list variants, - pub fn metadata(&self) -> Option<&'m VariantMetadata<'_>> { + /// Return the metadata dictionary associated with this variant value. + pub fn metadata(&self) -> &VariantMetadata<'m> { match self { Variant::Object(VariantObject { metadata, .. }) - | Variant::List(VariantList { metadata, .. }) => Some(metadata), - _ => None, + | Variant::List(VariantList { metadata, .. }) => metadata, + _ => &EMPTY_VARIANT_METADATA, } } diff --git a/parquet-variant/src/variant/object.rs b/parquet-variant/src/variant/object.rs index df1857846302..aa7714c6d2e1 100644 --- a/parquet-variant/src/variant/object.rs +++ b/parquet-variant/src/variant/object.rs @@ -848,8 +848,8 @@ mod tests { let v2 = Variant::try_new(&m, &v).unwrap(); - let m1 = v1.metadata().unwrap(); - let m2 = v2.metadata().unwrap(); + let m1 = v1.metadata(); + let m2 = v2.metadata(); // metadata would be equal since they contain the same keys assert_eq!(m1, m2); @@ -900,7 +900,7 @@ mod tests { let (m, v) = b.finish(); let v1 = Variant::try_new(&m, &v).unwrap(); - assert!(!v1.metadata().unwrap().is_sorted()); + assert!(!v1.metadata().is_sorted()); // create another object pre-filled with field names, b and a // but insert the fields in the order of a, b @@ -917,7 +917,7 @@ mod tests { let v2 = Variant::try_new(&m, &v).unwrap(); // v2 should also have a unsorted dictionary - assert!(!v2.metadata().unwrap().is_sorted()); + assert!(!v2.metadata().is_sorted()); assert_eq!(v1, v2); } @@ -936,7 +936,7 @@ mod tests { let v1 = Variant::try_new(&meta1, &value1).unwrap(); // v1 is sorted - assert!(v1.metadata().unwrap().is_sorted()); + assert!(v1.metadata().is_sorted()); // create a second object with different insertion order let mut b = VariantBuilder::new().with_field_names(["d", "c", "b", "a"]); @@ -951,7 +951,7 @@ mod tests { let v2 = Variant::try_new(&meta2, &value2).unwrap(); // v2 is not sorted - assert!(!v2.metadata().unwrap().is_sorted()); + assert!(!v2.metadata().is_sorted()); // object metadata are not the same assert_ne!(v1.metadata(), v2.metadata());