Skip to content

Commit aa9f8a9

Browse files
committed
[Variant] Support typed access for timestamp(micro&nano)
1 parent 93090d5 commit aa9f8a9

File tree

3 files changed

+234
-72
lines changed

3 files changed

+234
-72
lines changed

parquet-variant-compute/src/variant_array.rs

Lines changed: 50 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,18 @@
1717

1818
//! [`VariantArray`] implementation
1919
20-
use crate::type_conversion::primitive_conversion_single_value;
20+
use crate::type_conversion::{generic_conversion_single_value, primitive_conversion_single_value};
2121
use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray};
2222
use arrow::buffer::NullBuffer;
2323
use arrow::compute::cast;
2424
use arrow::datatypes::{
2525
Date32Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type,
26-
UInt16Type, UInt32Type, UInt64Type, UInt8Type,
26+
TimestampMicrosecondType, TimestampNanosecondType, UInt16Type, UInt32Type, UInt64Type,
27+
UInt8Type,
2728
};
2829
use arrow_schema::extension::ExtensionType;
29-
use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields};
30+
use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit};
31+
use chrono::DateTime;
3032
use parquet_variant::Uuid;
3133
use parquet_variant::Variant;
3234
use std::sync::Arc;
@@ -864,6 +866,51 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, '
864866
DataType::Float64 => {
865867
primitive_conversion_single_value!(Float64Type, typed_value, index)
866868
}
869+
DataType::Timestamp(timeunit, tz) => {
870+
match (timeunit, tz) {
871+
(TimeUnit::Microsecond, Some(_)) => {
872+
generic_conversion_single_value!(
873+
TimestampMicrosecondType,
874+
as_primitive,
875+
|v| DateTime::from_timestamp_micros(v).unwrap(),
876+
typed_value,
877+
index
878+
)
879+
}
880+
(TimeUnit::Microsecond, None) => {
881+
generic_conversion_single_value!(
882+
TimestampMicrosecondType,
883+
as_primitive,
884+
|v| DateTime::from_timestamp_micros(v).unwrap().naive_utc(),
885+
typed_value,
886+
index
887+
)
888+
}
889+
(TimeUnit::Nanosecond, Some(_)) => {
890+
generic_conversion_single_value!(
891+
TimestampNanosecondType,
892+
as_primitive,
893+
DateTime::from_timestamp_nanos,
894+
typed_value,
895+
index
896+
)
897+
}
898+
(TimeUnit::Nanosecond, None) => {
899+
generic_conversion_single_value!(
900+
TimestampNanosecondType,
901+
as_primitive,
902+
|v| DateTime::from_timestamp_nanos(v).naive_utc(),
903+
typed_value,
904+
index
905+
)
906+
}
907+
// Variant timestamp only support time unit with microsecond or nanosecond precision
908+
_ => panic!(
909+
"Variant only support timestamp with microsecond or nanosecond precision"
910+
),
911+
}
912+
}
913+
867914
// todo other types here (note this is very similar to cast_to_variant.rs)
868915
// so it would be great to figure out how to share this code
869916
_ => {

parquet-variant-compute/src/variant_get.rs

Lines changed: 176 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ pub(crate) enum ShreddedPathStep {
3434
/// Path step succeeded, return the new shredding state
3535
Success(ShreddingState),
3636
/// The path element is not present in the `typed_value` column and there is no `value` column,
37-
/// so we we know it does not exist. It, and all paths under it, are all-NULL.
37+
/// so we know it does not exist. It, and all paths under it, are all-NULL.
3838
Missing,
3939
/// The path element is not present in the `typed_value` column and must be retrieved from the `value`
4040
/// column instead. The caller should be prepared to handle any value, including the requested
@@ -294,24 +294,22 @@ impl<'a> GetOptions<'a> {
294294

295295
#[cfg(test)]
296296
mod test {
297-
use std::sync::Arc;
298-
297+
use super::{variant_get, GetOptions};
298+
use crate::json_to_variant;
299+
use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder};
300+
use crate::VariantArray;
299301
use arrow::array::{
300-
Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, FixedSizeBinaryArray,
301-
Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
302-
StringArray, StructArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
302+
Array, ArrayRef, AsArray, BinaryViewArray, Date32Array, Float16Array, Float32Array,
303+
Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, StringArray, StructArray,
304+
UInt16Array, UInt32Array, UInt64Array, UInt8Array,
303305
};
304306
use arrow::buffer::NullBuffer;
305307
use arrow::compute::CastOptions;
306308
use arrow::datatypes::DataType::{Int16, Int32, Int64, UInt16, UInt32, UInt64, UInt8};
307309
use arrow_schema::{DataType, Field, FieldRef, Fields};
310+
use chrono::DateTime;
308311
use parquet_variant::{Variant, VariantPath, EMPTY_VARIANT_METADATA_BYTES};
309-
310-
use crate::json_to_variant;
311-
use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder};
312-
use crate::VariantArray;
313-
314-
use super::{variant_get, GetOptions};
312+
use std::sync::Arc;
315313

316314
fn single_variant_get_test(input_json: &str, path: VariantPath, expected_json: &str) {
317315
// Create input array from JSON string
@@ -454,9 +452,8 @@ mod test {
454452
.with_field("value", Arc::new(values), true)
455453
.with_nulls(nulls)
456454
.build();
457-
Arc::new(
458-
VariantArray::try_new(Arc::new(struct_array))
459-
.expect("should create variant array"),
455+
ArrayRef::from(
456+
VariantArray::try_new(&struct_array).expect("should create variant array"),
460457
)
461458
}
462459
};
@@ -924,6 +921,156 @@ mod test {
924921
f64
925922
);
926923

924+
macro_rules! assert_variant_get_as_variant_array_with_default_option {
925+
($variant_array: expr, $array_expected: expr) => {{
926+
let options = GetOptions::new();
927+
let array = $variant_array;
928+
let result = variant_get(&array, options).unwrap();
929+
930+
// expect the result is a VariantArray
931+
let result = VariantArray::try_new(&result).unwrap();
932+
933+
assert_eq!(result.len(), $array_expected.len());
934+
935+
for (idx, item) in $array_expected.into_iter().enumerate() {
936+
match item {
937+
Some(item) => assert_eq!(result.value(idx), item),
938+
None => assert!(result.is_null(idx)),
939+
}
940+
}
941+
}};
942+
}
943+
944+
partially_shredded_variant_array_gen!(
945+
partially_shredded_timestamp_micro_ntz_variant_array,
946+
|| {
947+
arrow::array::TimestampMicrosecondArray::from(vec![
948+
Some(-456000),
949+
None,
950+
None,
951+
Some(1758602096000000),
952+
])
953+
}
954+
);
955+
956+
#[test]
957+
fn get_variant_partial_shredded_timestamp_micro_ntz_as_variant() {
958+
let array = partially_shredded_timestamp_micro_ntz_variant_array();
959+
assert_variant_get_as_variant_array_with_default_option!(
960+
array,
961+
vec![
962+
Some(Variant::from(
963+
DateTime::from_timestamp_micros(-456000i64)
964+
.unwrap()
965+
.naive_utc(),
966+
)),
967+
None,
968+
Some(Variant::from("n/a")),
969+
Some(Variant::from(
970+
DateTime::parse_from_rfc3339("2025-09-23T12:34:56+08:00")
971+
.unwrap()
972+
.naive_utc(),
973+
)),
974+
]
975+
)
976+
}
977+
978+
partially_shredded_variant_array_gen!(partially_shredded_timestamp_micro_variant_array, || {
979+
arrow::array::TimestampMicrosecondArray::from(vec![
980+
Some(-456000),
981+
None,
982+
None,
983+
Some(1758602096000000),
984+
])
985+
.with_timezone("+00:00")
986+
});
987+
988+
#[test]
989+
fn get_variant_partial_shredded_timestamp_micro_as_variant() {
990+
let array = partially_shredded_timestamp_micro_variant_array();
991+
assert_variant_get_as_variant_array_with_default_option!(
992+
array,
993+
vec![
994+
Some(Variant::from(
995+
DateTime::from_timestamp_micros(-456000i64)
996+
.unwrap()
997+
.to_utc(),
998+
)),
999+
None,
1000+
Some(Variant::from("n/a")),
1001+
Some(Variant::from(
1002+
DateTime::parse_from_rfc3339("2025-09-23T12:34:56+08:00")
1003+
.unwrap()
1004+
.to_utc(),
1005+
)),
1006+
]
1007+
)
1008+
}
1009+
1010+
partially_shredded_variant_array_gen!(
1011+
partially_shredded_timestamp_nano_ntz_variant_array,
1012+
|| {
1013+
arrow::array::TimestampNanosecondArray::from(vec![
1014+
Some(-4999999561),
1015+
None,
1016+
None,
1017+
Some(1758602096000000000),
1018+
])
1019+
}
1020+
);
1021+
1022+
#[test]
1023+
fn get_variant_partial_shredded_timestamp_nano_ntz_as_variant() {
1024+
let array = partially_shredded_timestamp_nano_ntz_variant_array();
1025+
1026+
assert_variant_get_as_variant_array_with_default_option!(
1027+
array,
1028+
vec![
1029+
Some(Variant::from(
1030+
DateTime::from_timestamp(-5, 439).unwrap().naive_utc()
1031+
)),
1032+
None,
1033+
Some(Variant::from("n/a")),
1034+
Some(Variant::from(
1035+
DateTime::parse_from_rfc3339("2025-09-23T12:34:56+08:00")
1036+
.unwrap()
1037+
.naive_utc()
1038+
)),
1039+
]
1040+
)
1041+
}
1042+
1043+
partially_shredded_variant_array_gen!(partially_shredded_timestamp_nano_variant_array, || {
1044+
arrow::array::TimestampNanosecondArray::from(vec![
1045+
Some(-4999999561),
1046+
None,
1047+
None,
1048+
Some(1758602096000000000),
1049+
])
1050+
.with_timezone("+00:00")
1051+
});
1052+
1053+
#[test]
1054+
fn get_variant_partial_shredded_timestamp_nano_as_variant() {
1055+
let array = partially_shredded_timestamp_nano_variant_array();
1056+
1057+
assert_variant_get_as_variant_array_with_default_option!(
1058+
array,
1059+
vec![
1060+
Some(Variant::from(
1061+
DateTime::from_timestamp(-5, 439).unwrap().to_utc()
1062+
)),
1063+
None,
1064+
Some(Variant::from("n/a")),
1065+
Some(Variant::from(
1066+
DateTime::parse_from_rfc3339("2025-09-23T12:34:56+08:00")
1067+
.unwrap()
1068+
.to_utc()
1069+
)),
1070+
]
1071+
)
1072+
}
1073+
9271074
/// Return a VariantArray that represents a normal "shredded" variant
9281075
/// for the following example
9291076
///
@@ -955,10 +1102,11 @@ mod test {
9551102
None, // row 2 is a string, so no typed value
9561103
Some(<$primitive_type>::try_from(100u8).unwrap()), // row 3 is shredded, so it has a value
9571104
]));
1105+
};
9581106
}
959-
960-
macro_rules partially_shredded_variant_array_gen {
961-
($func:ident) => {
1107+
1108+
macro_rules! partially_shredded_variant_array_gen {
1109+
($func:ident, $typed_array_gen: expr) => {
9621110
fn $func() -> ArrayRef {
9631111
// At the time of writing, the `VariantArrayBuilder` does not support shredding.
9641112
// so we must construct the array manually. see https://github.com/apache/arrow-rs/issues/7895
@@ -987,12 +1135,7 @@ mod test {
9871135
None, // row 3 is shredded, so no value
9881136
]);
9891137

990-
let typed_value = $array_type::from(vec![
991-
Some(<$primitive_type>::try_from(34u8).unwrap()), // row 0 is shredded, so it has a value
992-
None, // row 1 is null, so no value
993-
None, // row 2 is a string, so no typed value
994-
Some(<$primitive_type>::try_from(100u8).unwrap()), // row 3 is shredded, so it has a value
995-
]);
1138+
let typed_value = $typed_array_gen();
9961139

9971140
let struct_array = StructArrayBuilder::new()
9981141
.with_field("metadata", Arc::new(metadata), false)
@@ -1001,9 +1144,8 @@ mod test {
10011144
.with_nulls(nulls)
10021145
.build();
10031146

1004-
Arc::new(
1005-
VariantArray::try_new(Arc::new(struct_array))
1006-
.expect("should create variant array"),
1147+
ArrayRef::from(
1148+
VariantArray::try_new(&struct_array).expect("should create variant array"),
10071149
)
10081150
}
10091151
};
@@ -1090,24 +1232,23 @@ mod test {
10901232
false, // row 2 is string
10911233
true, // row 3 has value
10921234
]);
1093-
let typed_value = arrow::array::FixedSizeBinaryArray::try_new(
1235+
arrow::array::FixedSizeBinaryArray::try_new(
10941236
3, // byte width
10951237
arrow::buffer::Buffer::from(data),
10961238
Some(typed_value_nulls),
10971239
)
1098-
.expect("should create fixed size binary array");
1099-
1100-
}
1240+
.expect("should create fixed size binary array")
1241+
}
1242+
);
11011243

1102-
/// Return a VariantArray that represents a partially "shredded" variant for UTF8
1103-
partially_shredded_variant_array_gen!(partially_shredded_utf8_variant_array, ||
1244+
partially_shredded_variant_array_gen!(partially_shredded_utf8_variant_array, || {
11041245
StringArray::from(vec![
11051246
Some("hello"), // row 0 is shredded
11061247
None, // row 1 is null
11071248
None, // row 2 is a string
11081249
Some("world"), // row 3 is shredded
1109-
]);
1110-
);
1250+
])
1251+
});
11111252

11121253
/// Return a VariantArray that represents a partially "shredded" variant for Date32
11131254
fn partially_shredded_date32_variant_array() -> ArrayRef {

0 commit comments

Comments
 (0)