Skip to content
Merged
2 changes: 1 addition & 1 deletion parquet-variant-compute/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ pub mod variant_get;
mod variant_to_arrow;

pub use variant_array::{ShreddingState, VariantArray};
pub use variant_array_builder::VariantArrayBuilder;
pub use variant_array_builder::{VariantArrayBuilder, VariantValueArrayBuilder};

pub use cast_to_variant::{cast_to_variant, cast_to_variant_with_options};
pub use from_json::json_to_variant;
Expand Down
241 changes: 240 additions & 1 deletion parquet-variant-compute/src/variant_array_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ use arrow_schema::{ArrowError, DataType, Field, Fields};
use parquet_variant::{
BuilderSpecificState, ListBuilder, MetadataBuilder, ObjectBuilder, Variant, VariantBuilderExt,
};
use parquet_variant::{ParentState, ValueBuilder, WritableMetadataBuilder};
use parquet_variant::{
ParentState, ReadOnlyMetadataBuilder, ValueBuilder, WritableMetadataBuilder,
};
use std::sync::Arc;

/// A builder for [`VariantArray`]
Expand Down Expand Up @@ -205,6 +207,154 @@ impl VariantBuilderExt for VariantArrayBuilder {
}
}

/// A builder for creating only the value column of a [`VariantArray`]
///
/// This builder is used when you have existing metadata and only need to build
/// the value column. It's useful for scenarios like variant unshredding, data
/// transformation, or filtering where you want to reuse existing metadata.
///
/// The builder produces a [`BinaryViewArray`] that can be combined with existing
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

makes sense

/// metadata to create a complete [`VariantArray`].
///
/// # Example:
/// ```
/// # use arrow::array::Array;
/// # use parquet_variant::{Variant};
/// # use parquet_variant_compute::VariantValueArrayBuilder;
/// // Create a variant value builder for 10 rows
/// let mut builder = VariantValueArrayBuilder::new(10);
///
/// // Append some values with their corresponding metadata, which the
/// // builder takes advantage of to avoid creating new metadata.
/// builder.append_value(Variant::from(42));
/// builder.append_null();
/// builder.append_value(Variant::from("hello"));
///
/// // Build the final value array
/// let value_array = builder.build().unwrap();
/// assert_eq!(value_array.len(), 3);
/// ```
#[derive(Debug)]
pub struct VariantValueArrayBuilder {
value_builder: ValueBuilder,
value_offsets: Vec<usize>,
nulls: NullBufferBuilder,
}

impl VariantValueArrayBuilder {
/// Create a new `VariantValueArrayBuilder` with the specified row capacity
pub fn new(row_capacity: usize) -> Self {
Self {
value_builder: ValueBuilder::new(),
value_offsets: Vec::with_capacity(row_capacity),
nulls: NullBufferBuilder::new(row_capacity),
}
}

/// Build the final value array
///
/// Returns a [`BinaryViewArray`] containing the serialized variant values.
/// This can be combined with existing metadata to create a complete [`VariantArray`].
pub fn build(mut self) -> Result<BinaryViewArray, ArrowError> {
let value_buffer = self.value_builder.into_inner();
let mut array = binary_view_array_from_buffers(value_buffer, self.value_offsets);
if let Some(nulls) = self.nulls.finish() {
let (views, buffers, _) = array.into_parts();
array = BinaryViewArray::try_new(views, buffers, Some(nulls))?;
}
Ok(array)
}

/// Append a null row to the builder
///
/// WARNING: It is only valid to call this method when building the `value` field of a shredded
/// variant column (which is nullable). The `value` field of a binary (unshredded) variant
/// column is non-nullable, and callers should instead invoke [`Self::append_value`] with
/// `Variant::Null`, passing the appropriate metadata value.
pub fn append_null(&mut self) {
self.value_offsets.push(self.value_builder.offset());
self.nulls.append_null();
}

/// Append a variant value with its corresponding metadata
///
/// # Arguments
/// * `value` - The variant value to append
/// * `metadata` - The metadata dictionary for this variant (used for field name resolution)
///
/// # Returns
/// * `Ok(())` if the value was successfully appended
/// * `Err(ArrowError)` if the variant contains field names not found in the metadata
///
/// # Example
/// ```
/// # use parquet_variant::Variant;
/// # use parquet_variant_compute::VariantValueArrayBuilder;
/// let mut builder = VariantValueArrayBuilder::new(10);
/// builder.append_value(Variant::from(42));
/// ```
pub fn append_value(&mut self, value: Variant<'_, '_>) {
let mut metadata_builder = ReadOnlyMetadataBuilder::new(value.metadata().clone());
ValueBuilder::append_variant_bytes(self.parent_state(&mut metadata_builder), value);
}

/// Creates a builder-specific parent state.
///
/// For example, this can be useful for code that wants to copy a subset of fields from an
/// object `value` as a new row of `value_array_builder`:
///
/// ```no_run
/// # use parquet_variant::{ObjectBuilder, ReadOnlyMetadataBuilder, Variant};
/// # use parquet_variant_compute::VariantValueArrayBuilder;
/// # let value = Variant::Null;
/// # let mut value_array_builder = VariantValueArrayBuilder::new(0);
/// # fn should_keep(field_name: &str) -> bool { todo!() };
/// let Variant::Object(obj) = value else {
/// panic!("Not a variant object");
/// };
/// let mut metadata_builder = ReadOnlyMetadataBuilder::new(obj.metadata.clone());
/// let state = value_array_builder.parent_state(&mut metadata_builder);
/// let mut object_builder = ObjectBuilder::new(state, false);
/// for (field_name, field_value) in obj.iter() {
/// if should_keep(field_name) {
/// object_builder.insert_bytes(field_name, field_value);
/// }
/// }
/// object_builder.finish(); // appends the filtered object
/// ```
pub fn parent_state<'a>(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NOTE: This is pub to compensate for the fact that VariantValueArrayBuilder cannot impl VariantBuilderExt (because the metadata builder is created on-demand instead of once up front). See PR description for details.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(also see the updated unit test for a realistic example of how to use it)

&'a mut self,
metadata_builder: &'a mut dyn MetadataBuilder,
) -> ParentState<'a, ValueArrayBuilderState<'a>> {
let state = ValueArrayBuilderState {
value_offsets: &mut self.value_offsets,
nulls: &mut self.nulls,
};

ParentState::new(&mut self.value_builder, metadata_builder, state)
}
}

/// Builder-specific state for array building that manages array-level offsets and nulls. See
/// [`VariantBuilderExt`] for details.
#[derive(Debug)]
pub struct ValueArrayBuilderState<'a> {
value_offsets: &'a mut Vec<usize>,
nulls: &'a mut NullBufferBuilder,
}

// All changes are pending until finalized
impl BuilderSpecificState for ValueArrayBuilderState<'_> {
fn finish(
&mut self,
_metadata_builder: &mut dyn MetadataBuilder,
value_builder: &mut ValueBuilder,
) {
self.value_offsets.push(value_builder.offset());
self.nulls.append_non_null();
}
}

fn binary_view_array_from_buffers(buffer: Vec<u8>, offsets: Vec<usize>) -> BinaryViewArray {
// All offsets are less than or equal to the buffer length, so we can safely cast all offsets
// inside the loop below, as long as the buffer length fits in u32.
Expand All @@ -228,6 +378,7 @@ fn binary_view_array_from_buffers(buffer: Vec<u8>, offsets: Vec<usize>) -> Binar
mod test {
use super::*;
use arrow::array::Array;
use parquet_variant::Variant;

/// Test that both the metadata and value buffers are non nullable
#[test]
Expand Down Expand Up @@ -288,4 +439,92 @@ mod test {
let list = variant.as_list().expect("variant to be a list");
assert_eq!(list.len(), 2);
}

#[test]
fn test_variant_value_array_builder_basic() {
let mut builder = VariantValueArrayBuilder::new(10);

// Append some values
builder.append_value(Variant::from(42i32));
builder.append_null();
builder.append_value(Variant::from("hello"));

let value_array = builder.build().unwrap();
assert_eq!(value_array.len(), 3);
}

#[test]
fn test_variant_value_array_builder_with_objects() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice test!

// Populate a variant array with objects
let mut builder = VariantArrayBuilder::new(3);
builder
.new_object()
.with_field("name", "Alice")
.with_field("age", 30i32)
.finish();

builder
.new_object()
.with_field("name", "Bob")
.with_field("age", 42i32)
.with_field("city", "Wonderland")
.finish();

builder
.new_object()
.with_field("name", "Charlie")
.with_field("age", 1i32)
.finish();

let array = builder.build();

// Copy (some of) the objects over to the value array builder
//
// NOTE: Because we will reuse the metadata column, we cannot reorder rows. We can only
// filter or manipulate values within a row.
let mut builder = VariantValueArrayBuilder::new(3);

// straight copy
builder.append_value(array.value(0));

// filtering fields takes more work because we need to manually create an object builder
let value = array.value(1);
let mut metadata_builder = ReadOnlyMetadataBuilder::new(value.metadata().clone());
let state = builder.parent_state(&mut metadata_builder);
ObjectBuilder::new(state, false)
.with_field("name", value.get_object_field("name").unwrap())
.with_field("age", value.get_object_field("age").unwrap())
.finish();

// same bytes, but now nested and duplicated inside a list
let value = array.value(2);
let mut metadata_builder = ReadOnlyMetadataBuilder::new(value.metadata().clone());
let state = builder.parent_state(&mut metadata_builder);
ListBuilder::new(state, false)
.with_value(value.clone())
.with_value(value.clone())
.finish();

let array2 = VariantArray::from_parts(
array.metadata_field().clone(),
Some(builder.build().unwrap()),
None,
None,
);

assert_eq!(array2.len(), 3);
assert_eq!(array.value(0), array2.value(0));

assert_eq!(
array.value(1).get_object_field("name"),
array2.value(1).get_object_field("name")
);
assert_eq!(
array.value(1).get_object_field("age"),
array2.value(1).get_object_field("age")
);

assert_eq!(array.value(2), array2.value(2).get_list_element(0).unwrap());
assert_eq!(array.value(2), array2.value(2).get_list_element(1).unwrap());
}
}
9 changes: 7 additions & 2 deletions parquet-variant-compute/src/variant_get.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,13 @@ fn shredded_get_path(
let shred_basic_variant =
|target: VariantArray, path: VariantPath<'_>, as_field: Option<&Field>| {
let as_type = as_field.map(|f| f.data_type());
let mut builder =
make_variant_to_arrow_row_builder(path, as_type, cast_options, target.len())?;
let mut builder = make_variant_to_arrow_row_builder(
target.metadata_field(),
path,
as_type,
cast_options,
target.len(),
)?;
for i in 0..target.len() {
if target.is_null(i) {
builder.append_null()?;
Expand Down
41 changes: 24 additions & 17 deletions parquet-variant-compute/src/variant_to_arrow.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@
// specific language governing permissions and limitations
// under the License.

use arrow::array::{ArrayRef, PrimitiveBuilder};
use arrow::array::{ArrayRef, BinaryViewArray, NullBufferBuilder, PrimitiveBuilder};
use arrow::compute::CastOptions;
use arrow::datatypes::{self, ArrowPrimitiveType, DataType};
use arrow::error::{ArrowError, Result};
use parquet_variant::{Variant, VariantPath};

use crate::type_conversion::VariantAsPrimitive;
use crate::VariantArrayBuilder;
use crate::{VariantArray, VariantValueArrayBuilder};

use std::sync::Arc;

Expand Down Expand Up @@ -93,7 +93,7 @@ impl<'a> VariantToArrowRowBuilder<'a> {
}

pub(crate) fn make_variant_to_arrow_row_builder<'a>(
//metadata: &BinaryViewArray,
metadata: &BinaryViewArray,
path: VariantPath<'a>,
data_type: Option<&'a DataType>,
cast_options: &'a CastOptions,
Expand All @@ -103,7 +103,10 @@ pub(crate) fn make_variant_to_arrow_row_builder<'a>(

let mut builder = match data_type {
// If no data type was requested, build an unshredded VariantArray.
None => BinaryVariant(VariantToBinaryVariantArrowRowBuilder::new(capacity)),
None => BinaryVariant(VariantToBinaryVariantArrowRowBuilder::new(
metadata.clone(),
capacity,
)),
Some(DataType::Int8) => Int8(VariantToPrimitiveArrowRowBuilder::new(
cast_options,
capacity,
Expand Down Expand Up @@ -246,36 +249,40 @@ where

/// Builder for creating VariantArray output (for path extraction without type conversion)
pub(crate) struct VariantToBinaryVariantArrowRowBuilder {
builder: VariantArrayBuilder,
metadata: BinaryViewArray,
builder: VariantValueArrayBuilder,
nulls: NullBufferBuilder,
}

impl VariantToBinaryVariantArrowRowBuilder {
fn new(capacity: usize) -> Self {
fn new(metadata: BinaryViewArray, capacity: usize) -> Self {
Self {
builder: VariantArrayBuilder::new(capacity),
metadata,
builder: VariantValueArrayBuilder::new(capacity),
nulls: NullBufferBuilder::new(capacity),
}
}
}

impl VariantToBinaryVariantArrowRowBuilder {
fn append_null(&mut self) -> Result<()> {
self.builder.append_null();
self.nulls.append_null();
Ok(())
}

fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
// TODO: We need a way to convert a Variant directly to bytes. In particular, we want to
// just copy across the underlying value byte slice of a `Variant::Object` or
// `Variant::List`, without any interaction with a `VariantMetadata` (because the shredding
// spec requires us to reuse the existing metadata when unshredding).
//
// One could _probably_ emulate this with parquet_variant::VariantBuilder, but it would do a
// lot of unnecessary work and would also create a new metadata column we don't need.
Comment on lines -267 to -273
Copy link
Contributor Author

@scovich scovich Sep 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@alamb -- I just realized while putting up #8366 that the new VariantValueArrayBuilder is perfect for fixing this TODO (in addition to supporting shredding).

So now it has an immediate use, instead of only being a building block for the next PR.

self.builder.append_variant(value.clone());
self.builder.append_value(value.clone());
self.nulls.append_non_null();
Ok(true)
}

fn finish(self) -> Result<ArrayRef> {
Ok(Arc::new(self.builder.build()))
fn finish(mut self) -> Result<ArrayRef> {
Ok(Arc::new(VariantArray::from_parts(
self.metadata,
Some(self.builder.build()?),
None, // no typed_value column
self.nulls.finish(),
)))
}
}
2 changes: 1 addition & 1 deletion parquet-variant/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -562,7 +562,7 @@ pub struct WritableMetadataBuilder {

impl WritableMetadataBuilder {
/// Upsert field name to dictionary, return its ID
fn upsert_field_name(&mut self, field_name: &str) -> u32 {
pub fn upsert_field_name(&mut self, field_name: &str) -> u32 {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It probably should have been public all along, but now it's needed.

let (id, new_entry) = self.field_names.insert_full(field_name.to_string());

if new_entry {
Expand Down
Loading
Loading