Skip to content
Draft
Changes from 3 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
9c25cc4
Add test shredded variant list array
sdf-jkl Sep 15, 2025
ed961a4
Add basic tests
sdf-jkl Sep 16, 2025
03ecb95
Merge branch 'apache:main' into shredded_list_support
sdf-jkl Sep 16, 2025
158d6d7
Merge branch 'apache:main' into shredded_list_support
sdf-jkl Sep 16, 2025
d53c831
Redo test shredded array
sdf-jkl Sep 17, 2025
174e429
Merge branch 'main' of https://github.com/apache/arrow-rs into shredd…
sdf-jkl Sep 18, 2025
69de7d7
Rebuild the shredded list array
sdf-jkl Sep 19, 2025
cc6d787
Use select::take to build the output array
sdf-jkl Sep 23, 2025
8f6ad1b
Merge branch 'main' of https://github.com/apache/arrow-rs into shredd…
sdf-jkl Sep 23, 2025
bc8abd9
Merge branch 'main' of https://github.com/apache/arrow-rs into shredd…
sdf-jkl Sep 25, 2025
c0d2065
Pass one test
sdf-jkl Sep 25, 2025
85aaa3f
Merge branch 'main' of https://github.com/apache/arrow-rs into shredd…
sdf-jkl Sep 25, 2025
40b6311
Get typed values directly
sdf-jkl Sep 25, 2025
f6e88ef
Added support for utf8, largeUtf8, utf8view
sdf-jkl Oct 13, 2025
61ed178
added tests for utf8, largeUtf8, utf8view
sdf-jkl Oct 13, 2025
1fb612d
fix tests
sdf-jkl Oct 13, 2025
2b6d280
Update parquet-variant-compute/src/variant_to_arrow.rs
sdf-jkl Oct 14, 2025
398b52d
Merge branch 'main' into variant_to_arrow_utf8
sdf-jkl Oct 14, 2025
defa07b
Update parquet-variant-compute/src/variant_to_arrow.rs
sdf-jkl Oct 20, 2025
5022acd
Support LargeUtf8, Utf8-View
sdf-jkl Oct 20, 2025
ed66007
Merge branch 'main' into variant_to_arrow_utf8
sdf-jkl Oct 20, 2025
196b5d4
Fix Merge errors
sdf-jkl Oct 20, 2025
642d192
Update arrow-array/src/builder/generic_bytes_builder.rs
sdf-jkl Oct 20, 2025
76b3c80
Add docs for AVERAGE_STRING_LENGTH const
sdf-jkl Oct 21, 2025
35785d6
Merge branch 'variant_to_arrow_utf8' of https://github.com/sdf-jkl/ar…
sdf-jkl Oct 21, 2025
5914218
cargo fmt
sdf-jkl Oct 21, 2025
216d401
cargo fmt
sdf-jkl Oct 21, 2025
3aa6cf3
Merge branch 'variant_to_arrow_utf8' into shredded_list_support
sdf-jkl Oct 22, 2025
15fc8be
Merge branch 'main' of https://github.com/apache/arrow-rs into shredd…
sdf-jkl Oct 22, 2025
04b9941
Quick fix variant_get
sdf-jkl Oct 24, 2025
857f0e2
Merge branch 'main' into shredded_list_support
sdf-jkl Oct 24, 2025
1edfeca
Merge branch 'main' into shredded_list_support
sdf-jkl Nov 11, 2025
6d6793d
fix merge errors
sdf-jkl Nov 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 94 additions & 0 deletions parquet-variant-compute/src/variant_get.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1010,7 +1010,101 @@ mod test {
let expected: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(42)]));
assert_eq!(&result, &expected);
}
/// Runs `variant_get` with the list-index path `0` over the array returned by
/// `shredded_list_variant_array` (documented as the rows ["comedy", "drama"],
/// ["horror", null] and ["comedy", "drama", "romance"]) without requesting a
/// target type, and checks the result as a `VariantArray`.
#[test]
fn test_shredded_list_field_access() {
    let input = shredded_list_variant_array();

    // No `as_type` is set, so the output is downcast to a VariantArray below.
    let first_element = GetOptions::new_with_path(VariantPath::from(0));
    let output = variant_get(&input, first_element).unwrap();

    let variants = output.as_any().downcast_ref::<VariantArray>().unwrap();
    assert_eq!(variants.len(), 3);

    // Expected value at index 0 of each row, in row order.
    for (row, expected) in ["comedy", "horror", "comedy"].into_iter().enumerate() {
        assert_eq!(variants.value(row), Variant::String(expected));
    }
}
/// Test extracting shredded list field with type conversion.
///
/// Requests index 0 of every row as `Utf8`. The fixture from
/// `shredded_list_variant_array` is documented as three rows whose first
/// elements are "comedy", "horror" and "comedy", so the typed result must
/// contain exactly those three strings, one per input row.
#[test]
fn test_shredded_list_as_string() {
    let array = shredded_list_variant_array();

    // Test: Extract the 0 index values as StringArray (type conversion)
    let field = Field::new("typed_value", DataType::Utf8, false);
    let options = GetOptions::new_with_path(VariantPath::from(0))
        .with_as_type(Some(FieldRef::from(field)));
    let result = variant_get(&array, options).unwrap();

    // One output entry per input row: the element at index 0 of each list.
    let expected: ArrayRef = Arc::new(StringArray::from(vec![
        Some("comedy"),
        Some("horror"),
        Some("comedy"),
    ]));
    assert_eq!(&result, &expected);
}
/// Helper function to create a shredded variant array representing lists.
///
/// The array is intended to represent:
/// Row 0: ["comedy", "drama"] ([0] is shredded, [1] is shredded - perfectly shredded)
/// Row 1: ["horror", null] ([0] is shredded, [1] is binary null - partially shredded)
/// Row 2: ["comedy", "drama", "romance"] (perfectly shredded)
///
/// Fields of the struct handed to `VariantArray::try_new`:
/// - metadata: `BinaryViewArray` with one buffer per row, taken from `build_list_metadata`
/// - value: `BinaryViewArray` with a single entry, the bytes of `Variant::Null`
/// - typed_value: flat `StringArray` holding the six non-null strings of all rows
///
/// NOTE(review): as written the three fields have different lengths (3, 1 and 6)
/// and `typed_value` is a flat string array rather than a list of `element`
/// groups with `value` / `typed_value` children; confirm against the Variant
/// shredding spec's array layout before relying on this fixture.
fn shredded_list_variant_array() -> ArrayRef {
// Create the base metadata for lists

// Builds a variant list from `vector` and returns the pair produced by
// `VariantBuilder::finish`; the callers below bind the first component as the
// metadata and discard the second.
// Could add this as an api for VariantList, like VariantList::from()
fn build_list_metadata(vector: Vec<Variant>) -> (Vec<u8>, Vec<u8>) {
let mut builder = parquet_variant::VariantBuilder::new();
let mut list = builder.new_list();
for value in vector {
list.append_value(value);
}
list.finish();
builder.finish()
}
let (metadata1, _) =
build_list_metadata(vec![Variant::String("comedy"), Variant::String("drama")]);

let (metadata2, _) = build_list_metadata(vec![Variant::String("horror"), Variant::Null]);

let (metadata3, _) = build_list_metadata(vec![
Variant::String("comedy"),
Variant::String("drama"),
Variant::String("romance"),
]);

// Create metadata array (one entry per row)
let metadata_array =
BinaryViewArray::from_iter_values(vec![metadata1, metadata2, metadata3]);

// Create the untyped value array (a single entry holding the Variant::Null bytes)
let value_array = BinaryViewArray::from(vec![Variant::Null.as_u8_slice()]);
// The typed values of all rows, flattened in row order; the null at row 1
// index 1 has no entry here.
// TODO: maybe try with an actual primitive array
let typed_value_array = StringArray::from(vec![
Some("comedy"),
Some("drama"),
Some("horror"),
Some("comedy"),
Some("drama"),
Some("romance"),
]);
// Build the main VariantArray
let main_struct = crate::variant_array::StructArrayBuilder::new()
.with_field("metadata", Arc::new(metadata_array))
.with_field("value", Arc::new(value_array))
.with_field("typed_value", Arc::new(typed_value_array))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Check the variant shredding spec for arrays -- the typed_value for a shredded variant array is a non-nullable group called element, with child fields typed_value and value for shredded and unshredded list elements, respectively.

And then we'll need to build an appropriate GenericListArray out of this string array you built, which gives the offsets for each sub-list.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for this too, I was under the wrong impression that the metadata encoding stores the offsets for the actual values. Reading your #8359 and rereading the Variant Encoding spec I see that the values offsets are within the value encoding itself.

So the outermost typed_value should be an GenericListArray of element - VariantObjects with {value and typed_value fields}?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, exactly! And element is non-nullable (**), while the two children are nullable.

(**) As always, in arrow, it can still have null entries, but only if its parent is already NULL for the same row (so nobody can ever observe a non-null element)

.build();

Arc::new(VariantArray::try_new(Arc::new(main_struct)).expect("should create variant array"))
}
/// Helper function to create a shredded variant array representing objects
///
/// This creates an array that represents:
Expand Down
Loading