-
Notifications
You must be signed in to change notification settings - Fork 1k
[Variant] Implement new VariantValueArrayBuilder #8360
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ab77733
428aae1
97f99da
fb455c0
2238b49
474fa31
5959577
f286c52
a476c72
2126c7a
fa2cded
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,7 +23,9 @@ use arrow_schema::{ArrowError, DataType, Field, Fields}; | |
| use parquet_variant::{ | ||
| BuilderSpecificState, ListBuilder, MetadataBuilder, ObjectBuilder, Variant, VariantBuilderExt, | ||
| }; | ||
| use parquet_variant::{ParentState, ValueBuilder, WritableMetadataBuilder}; | ||
| use parquet_variant::{ | ||
| ParentState, ReadOnlyMetadataBuilder, ValueBuilder, WritableMetadataBuilder, | ||
| }; | ||
| use std::sync::Arc; | ||
|
|
||
| /// A builder for [`VariantArray`] | ||
|
|
@@ -205,6 +207,154 @@ impl VariantBuilderExt for VariantArrayBuilder { | |
| } | ||
| } | ||
|
|
||
| /// A builder for creating only the value column of a [`VariantArray`] | ||
| /// | ||
| /// This builder is used when you have existing metadata and only need to build | ||
| /// the value column. It's useful for scenarios like variant unshredding, data | ||
| /// transformation, or filtering where you want to reuse existing metadata. | ||
| /// | ||
| /// The builder produces a [`BinaryViewArray`] that can be combined with existing | ||
| /// metadata to create a complete [`VariantArray`]. | ||
| /// | ||
| /// # Example: | ||
| /// ``` | ||
| /// # use arrow::array::Array; | ||
| /// # use parquet_variant::{Variant}; | ||
| /// # use parquet_variant_compute::VariantValueArrayBuilder; | ||
| /// // Create a variant value builder for 10 rows | ||
| /// let mut builder = VariantValueArrayBuilder::new(10); | ||
| /// | ||
| /// // Append some values with their corresponding metadata, which the | ||
| /// // builder takes advantage of to avoid creating new metadata. | ||
| /// builder.append_value(Variant::from(42)); | ||
| /// builder.append_null(); | ||
| /// builder.append_value(Variant::from("hello")); | ||
| /// | ||
| /// // Build the final value array | ||
| /// let value_array = builder.build().unwrap(); | ||
| /// assert_eq!(value_array.len(), 3); | ||
| /// ``` | ||
| #[derive(Debug)] | ||
| pub struct VariantValueArrayBuilder { | ||
| value_builder: ValueBuilder, | ||
| value_offsets: Vec<usize>, | ||
| nulls: NullBufferBuilder, | ||
| } | ||
|
|
||
| impl VariantValueArrayBuilder { | ||
| /// Create a new `VariantValueArrayBuilder` with the specified row capacity | ||
| pub fn new(row_capacity: usize) -> Self { | ||
| Self { | ||
| value_builder: ValueBuilder::new(), | ||
| value_offsets: Vec::with_capacity(row_capacity), | ||
| nulls: NullBufferBuilder::new(row_capacity), | ||
| } | ||
| } | ||
|
|
||
| /// Build the final value array | ||
| /// | ||
| /// Returns a [`BinaryViewArray`] containing the serialized variant values. | ||
| /// This can be combined with existing metadata to create a complete [`VariantArray`]. | ||
| pub fn build(mut self) -> Result<BinaryViewArray, ArrowError> { | ||
| let value_buffer = self.value_builder.into_inner(); | ||
| let mut array = binary_view_array_from_buffers(value_buffer, self.value_offsets); | ||
| if let Some(nulls) = self.nulls.finish() { | ||
| let (views, buffers, _) = array.into_parts(); | ||
| array = BinaryViewArray::try_new(views, buffers, Some(nulls))?; | ||
| } | ||
| Ok(array) | ||
| } | ||
|
|
||
| /// Append a null row to the builder | ||
| /// | ||
| /// WARNING: It is only valid to call this method when building the `value` field of a shredded | ||
| /// variant column (which is nullable). The `value` field of a binary (unshredded) variant | ||
| /// column is non-nullable, and callers should instead invoke [`Self::append_value`] with | ||
| /// `Variant::Null`, passing the appropriate metadata value. | ||
| pub fn append_null(&mut self) { | ||
| self.value_offsets.push(self.value_builder.offset()); | ||
| self.nulls.append_null(); | ||
| } | ||
|
|
||
| /// Append a variant value with its corresponding metadata | ||
| /// | ||
| /// # Arguments | ||
| /// * `value` - The variant value to append | ||
| /// * `metadata` - The metadata dictionary for this variant (used for field name resolution) | ||
| /// | ||
| /// # Returns | ||
| /// * `Ok(())` if the value was successfully appended | ||
| /// * `Err(ArrowError)` if the variant contains field names not found in the metadata | ||
| /// | ||
| /// # Example | ||
| /// ``` | ||
| /// # use parquet_variant::Variant; | ||
| /// # use parquet_variant_compute::VariantValueArrayBuilder; | ||
| /// let mut builder = VariantValueArrayBuilder::new(10); | ||
| /// builder.append_value(Variant::from(42)); | ||
| /// ``` | ||
| pub fn append_value(&mut self, value: Variant<'_, '_>) { | ||
| let mut metadata_builder = ReadOnlyMetadataBuilder::new(value.metadata().clone()); | ||
| ValueBuilder::append_variant_bytes(self.parent_state(&mut metadata_builder), value); | ||
| } | ||
|
|
||
| /// Creates a builder-specific parent state. | ||
| /// | ||
| /// For example, this can be useful for code that wants to copy a subset of fields from an | ||
| /// object `value` as a new row of `value_array_builder`: | ||
| /// | ||
| /// ```no_run | ||
| /// # use parquet_variant::{ObjectBuilder, ReadOnlyMetadataBuilder, Variant}; | ||
| /// # use parquet_variant_compute::VariantValueArrayBuilder; | ||
| /// # let value = Variant::Null; | ||
| /// # let mut value_array_builder = VariantValueArrayBuilder::new(0); | ||
| /// # fn should_keep(field_name: &str) -> bool { todo!() }; | ||
| /// let Variant::Object(obj) = value else { | ||
| /// panic!("Not a variant object"); | ||
| /// }; | ||
| /// let mut metadata_builder = ReadOnlyMetadataBuilder::new(obj.metadata.clone()); | ||
| /// let state = value_array_builder.parent_state(&mut metadata_builder); | ||
| /// let mut object_builder = ObjectBuilder::new(state, false); | ||
| /// for (field_name, field_value) in obj.iter() { | ||
| /// if should_keep(field_name) { | ||
| /// object_builder.insert_bytes(field_name, field_value); | ||
| /// } | ||
| /// } | ||
| /// object_builder.finish(); // appends the filtered object | ||
| /// ``` | ||
| pub fn parent_state<'a>( | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. NOTE: This is
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (also see the updated unit test for a realistic example of how to use it) |
||
| &'a mut self, | ||
| metadata_builder: &'a mut dyn MetadataBuilder, | ||
| ) -> ParentState<'a, ValueArrayBuilderState<'a>> { | ||
| let state = ValueArrayBuilderState { | ||
| value_offsets: &mut self.value_offsets, | ||
| nulls: &mut self.nulls, | ||
| }; | ||
|
|
||
| ParentState::new(&mut self.value_builder, metadata_builder, state) | ||
| } | ||
| } | ||
|
|
||
| /// Builder-specific state for array building that manages array-level offsets and nulls. See | ||
| /// [`VariantBuilderExt`] for details. | ||
| #[derive(Debug)] | ||
| pub struct ValueArrayBuilderState<'a> { | ||
| value_offsets: &'a mut Vec<usize>, | ||
| nulls: &'a mut NullBufferBuilder, | ||
| } | ||
|
|
||
| // All changes are pending until finalized | ||
| impl BuilderSpecificState for ValueArrayBuilderState<'_> { | ||
| fn finish( | ||
| &mut self, | ||
| _metadata_builder: &mut dyn MetadataBuilder, | ||
| value_builder: &mut ValueBuilder, | ||
| ) { | ||
| self.value_offsets.push(value_builder.offset()); | ||
| self.nulls.append_non_null(); | ||
| } | ||
| } | ||
|
|
||
| fn binary_view_array_from_buffers(buffer: Vec<u8>, offsets: Vec<usize>) -> BinaryViewArray { | ||
| // All offsets are less than or equal to the buffer length, so we can safely cast all offsets | ||
| // inside the loop below, as long as the buffer length fits in u32. | ||
|
|
@@ -228,6 +378,7 @@ fn binary_view_array_from_buffers(buffer: Vec<u8>, offsets: Vec<usize>) -> Binar | |
| mod test { | ||
| use super::*; | ||
| use arrow::array::Array; | ||
| use parquet_variant::Variant; | ||
|
|
||
| /// Test that both the metadata and value buffers are non nullable | ||
| #[test] | ||
|
|
@@ -288,4 +439,92 @@ mod test { | |
| let list = variant.as_list().expect("variant to be a list"); | ||
| assert_eq!(list.len(), 2); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_variant_value_array_builder_basic() { | ||
| let mut builder = VariantValueArrayBuilder::new(10); | ||
|
|
||
| // Append some values | ||
| builder.append_value(Variant::from(42i32)); | ||
| builder.append_null(); | ||
| builder.append_value(Variant::from("hello")); | ||
|
|
||
| let value_array = builder.build().unwrap(); | ||
| assert_eq!(value_array.len(), 3); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_variant_value_array_builder_with_objects() { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice test! |
||
| // Populate a variant array with objects | ||
| let mut builder = VariantArrayBuilder::new(3); | ||
| builder | ||
| .new_object() | ||
| .with_field("name", "Alice") | ||
| .with_field("age", 30i32) | ||
| .finish(); | ||
|
|
||
| builder | ||
| .new_object() | ||
| .with_field("name", "Bob") | ||
| .with_field("age", 42i32) | ||
| .with_field("city", "Wonderland") | ||
| .finish(); | ||
|
|
||
| builder | ||
| .new_object() | ||
| .with_field("name", "Charlie") | ||
| .with_field("age", 1i32) | ||
| .finish(); | ||
|
|
||
| let array = builder.build(); | ||
|
|
||
| // Copy (some of) the objects over to the value array builder | ||
| // | ||
| // NOTE: Because we will reuse the metadata column, we cannot reorder rows. We can only | ||
| // filter or manipulate values within a row. | ||
| let mut builder = VariantValueArrayBuilder::new(3); | ||
|
|
||
| // straight copy | ||
| builder.append_value(array.value(0)); | ||
|
|
||
| // filtering fields takes more work because we need to manually create an object builder | ||
| let value = array.value(1); | ||
| let mut metadata_builder = ReadOnlyMetadataBuilder::new(value.metadata().clone()); | ||
| let state = builder.parent_state(&mut metadata_builder); | ||
| ObjectBuilder::new(state, false) | ||
| .with_field("name", value.get_object_field("name").unwrap()) | ||
| .with_field("age", value.get_object_field("age").unwrap()) | ||
| .finish(); | ||
|
|
||
| // same bytes, but now nested and duplicated inside a list | ||
| let value = array.value(2); | ||
| let mut metadata_builder = ReadOnlyMetadataBuilder::new(value.metadata().clone()); | ||
| let state = builder.parent_state(&mut metadata_builder); | ||
| ListBuilder::new(state, false) | ||
| .with_value(value.clone()) | ||
| .with_value(value.clone()) | ||
| .finish(); | ||
|
|
||
| let array2 = VariantArray::from_parts( | ||
| array.metadata_field().clone(), | ||
| Some(builder.build().unwrap()), | ||
| None, | ||
| None, | ||
| ); | ||
|
|
||
| assert_eq!(array2.len(), 3); | ||
| assert_eq!(array.value(0), array2.value(0)); | ||
|
|
||
| assert_eq!( | ||
| array.value(1).get_object_field("name"), | ||
| array2.value(1).get_object_field("name") | ||
| ); | ||
| assert_eq!( | ||
| array.value(1).get_object_field("age"), | ||
| array2.value(1).get_object_field("age") | ||
| ); | ||
|
|
||
| assert_eq!(array.value(2), array2.value(2).get_list_element(0).unwrap()); | ||
| assert_eq!(array.value(2), array2.value(2).get_list_element(1).unwrap()); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,14 +15,14 @@ | |
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| use arrow::array::{ArrayRef, PrimitiveBuilder}; | ||
| use arrow::array::{ArrayRef, BinaryViewArray, NullBufferBuilder, PrimitiveBuilder}; | ||
| use arrow::compute::CastOptions; | ||
| use arrow::datatypes::{self, ArrowPrimitiveType, DataType}; | ||
| use arrow::error::{ArrowError, Result}; | ||
| use parquet_variant::{Variant, VariantPath}; | ||
|
|
||
| use crate::type_conversion::VariantAsPrimitive; | ||
| use crate::VariantArrayBuilder; | ||
| use crate::{VariantArray, VariantValueArrayBuilder}; | ||
|
|
||
| use std::sync::Arc; | ||
|
|
||
|
|
@@ -93,7 +93,7 @@ impl<'a> VariantToArrowRowBuilder<'a> { | |
| } | ||
|
|
||
| pub(crate) fn make_variant_to_arrow_row_builder<'a>( | ||
| //metadata: &BinaryViewArray, | ||
| metadata: &BinaryViewArray, | ||
| path: VariantPath<'a>, | ||
| data_type: Option<&'a DataType>, | ||
| cast_options: &'a CastOptions, | ||
|
|
@@ -103,7 +103,10 @@ pub(crate) fn make_variant_to_arrow_row_builder<'a>( | |
|
|
||
| let mut builder = match data_type { | ||
| // If no data type was requested, build an unshredded VariantArray. | ||
| None => BinaryVariant(VariantToBinaryVariantArrowRowBuilder::new(capacity)), | ||
| None => BinaryVariant(VariantToBinaryVariantArrowRowBuilder::new( | ||
| metadata.clone(), | ||
| capacity, | ||
| )), | ||
| Some(DataType::Int8) => Int8(VariantToPrimitiveArrowRowBuilder::new( | ||
| cast_options, | ||
| capacity, | ||
|
|
@@ -246,36 +249,40 @@ where | |
|
|
||
| /// Builder for creating VariantArray output (for path extraction without type conversion) | ||
| pub(crate) struct VariantToBinaryVariantArrowRowBuilder { | ||
| builder: VariantArrayBuilder, | ||
| metadata: BinaryViewArray, | ||
| builder: VariantValueArrayBuilder, | ||
| nulls: NullBufferBuilder, | ||
| } | ||
|
|
||
| impl VariantToBinaryVariantArrowRowBuilder { | ||
| fn new(capacity: usize) -> Self { | ||
| fn new(metadata: BinaryViewArray, capacity: usize) -> Self { | ||
| Self { | ||
| builder: VariantArrayBuilder::new(capacity), | ||
| metadata, | ||
| builder: VariantValueArrayBuilder::new(capacity), | ||
| nulls: NullBufferBuilder::new(capacity), | ||
| } | ||
| } | ||
| } | ||
|
|
||
| impl VariantToBinaryVariantArrowRowBuilder { | ||
| fn append_null(&mut self) -> Result<()> { | ||
| self.builder.append_null(); | ||
| self.nulls.append_null(); | ||
| Ok(()) | ||
| } | ||
|
|
||
| fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> { | ||
| // TODO: We need a way to convert a Variant directly to bytes. In particular, we want to | ||
| // just copy across the underlying value byte slice of a `Variant::Object` or | ||
| // `Variant::List`, without any interaction with a `VariantMetadata` (because the shredding | ||
| // spec requires us to reuse the existing metadata when unshredding). | ||
| // | ||
| // One could _probably_ emulate this with parquet_variant::VariantBuilder, but it would do a | ||
| // lot of unnecessary work and would also create a new metadata column we don't need. | ||
|
Comment on lines
-267
to
-273
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| self.builder.append_variant(value.clone()); | ||
| self.builder.append_value(value.clone()); | ||
| self.nulls.append_non_null(); | ||
| Ok(true) | ||
| } | ||
|
|
||
| fn finish(self) -> Result<ArrayRef> { | ||
| Ok(Arc::new(self.builder.build())) | ||
| fn finish(mut self) -> Result<ArrayRef> { | ||
| Ok(Arc::new(VariantArray::from_parts( | ||
| self.metadata, | ||
| Some(self.builder.build()?), | ||
| None, // no typed_value column | ||
| self.nulls.finish(), | ||
| ))) | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -562,7 +562,7 @@ pub struct WritableMetadataBuilder { | |
|
|
||
| impl WritableMetadataBuilder { | ||
| /// Upsert field name to dictionary, return its ID | ||
| fn upsert_field_name(&mut self, field_name: &str) -> u32 { | ||
| pub fn upsert_field_name(&mut self, field_name: &str) -> u32 { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It probably should have been public all along, but now it's needed. |
||
| let (id, new_entry) = self.field_names.insert_full(field_name.to_string()); | ||
|
|
||
| if new_entry { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
makes sense