Skip to content

Commit 0452360

Browse files
feat: Add Run-End Encoded array casting with overflow protection
Implement casting between REE arrays and other Arrow types. REE-to-REE casting validates run-end upcasts only (Int16→Int32, Int16→Int64, Int32→Int64) to prevent invalid sequences.
1 parent 5307851 commit 0452360

File tree

2 files changed

+284
-47
lines changed

2 files changed

+284
-47
lines changed

arrow-cast/src/cast/mod.rs

Lines changed: 149 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,9 @@ use crate::cast::decimal::*;
4747
use crate::cast::dictionary::*;
4848
use crate::cast::list::*;
4949
use crate::cast::map::*;
50-
use crate::cast::run_array::{cast_to_run_end_encoded, run_end_encoded_cast};
50+
use crate::cast::run_array::{
51+
can_cast_run_end_encoded, cast_to_run_end_encoded, run_end_encoded_cast,
52+
};
5153
use crate::cast::string::*;
5254

5355
use arrow_buffer::IntervalMonthDayNano;
@@ -140,9 +142,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
140142
}
141143
(Dictionary(_, value_type), _) => can_cast_types(value_type, to_type),
142144
(RunEndEncoded(_, value_type), _) => can_cast_types(value_type.data_type(), to_type),
143-
(_, RunEndEncoded(_, _value_type)) => true,
144-
145-
145+
(_, RunEndEncoded(_, _value_type)) => can_cast_run_end_encoded(from_type, to_type),
146146
(_, Dictionary(_, value_type)) => can_cast_types(from_type, value_type),
147147
(List(list_from) | LargeList(list_from), List(list_to) | LargeList(list_to)) => {
148148
can_cast_types(list_from.data_type(), list_to.data_type())
@@ -745,14 +745,18 @@ pub fn cast_with_options(
745745
| Map(_, _)
746746
| Dictionary(_, _),
747747
) => Ok(new_null_array(to_type, array.len())),
748-
(RunEndEncoded(index_type, _), _) => match index_type.data_type() {
749-
Int16 => run_end_encoded_cast::<Int16Type>(array, to_type, cast_options),
750-
Int32 => run_end_encoded_cast::<Int32Type>(array, to_type, cast_options),
751-
Int64 => run_end_encoded_cast::<Int64Type>(array, to_type, cast_options),
752-
_ => Err(ArrowError::CastError(format!(
753-
"Casting from run end encoded type {from_type:?} to {to_type:?} not supported",
754-
))),
755-
},
748+
(RunEndEncoded(index_type, _), _) => {
749+
let mut new_cast_options = cast_options.clone();
750+
new_cast_options.safe = false;
751+
match index_type.data_type() {
752+
Int16 => run_end_encoded_cast::<Int16Type>(array, to_type, &new_cast_options),
753+
Int32 => run_end_encoded_cast::<Int32Type>(array, to_type, &new_cast_options),
754+
Int64 => run_end_encoded_cast::<Int64Type>(array, to_type, &new_cast_options),
755+
_ => Err(ArrowError::CastError(format!(
756+
"Casting from run end encoded type {from_type:?} to {to_type:?} not supported",
757+
))),
758+
}
759+
}
756760
(_, RunEndEncoded(index_type, value_type)) => match index_type.data_type() {
757761
Int16 => {
758762
cast_to_run_end_encoded::<Int16Type>(array, value_type.data_type(), cast_options)
@@ -10726,16 +10730,14 @@ mod tests {
1072610730
let values = Int32Array::from(vec![1, 2, 3]);
1072710731
let run_array = RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap();
1072810732
let array_ref = Arc::new(run_array) as ArrayRef;
10729-
println!("1");
1073010733
// Cast to Int64
1073110734
let cast_result = cast(&array_ref, &DataType::Int64).unwrap();
10732-
println!("2");
1073310735
// Verify the result is a plain Int64Array (runs expanded), not a RunArray
10734-
let result_run_array = cast_result
10735-
.as_any()
10736-
.downcast_ref::<Int64Array>()
10737-
.unwrap();
10738-
assert_eq!(result_run_array.values(), &[1i64, 1i64, 2i64, 2i64, 2i64, 3i64]);
10736+
let result_run_array = cast_result.as_any().downcast_ref::<Int64Array>().unwrap();
10737+
assert_eq!(
10738+
result_run_array.values(),
10739+
&[1i64, 1i64, 2i64, 2i64, 2i64, 3i64]
10740+
);
1073910741
}
1074010742

1074110743
/// Test casting FROM RunEndEncoded to string
@@ -10751,10 +10753,7 @@ mod tests {
1075110753
let cast_result = cast(&array_ref, &DataType::Utf8).unwrap();
1075210754

1075310755
// Verify the result is a plain StringArray, not a RunArray
10754-
let result_array = cast_result
10755-
.as_any()
10756-
.downcast_ref::<StringArray>()
10757-
.unwrap();
10756+
let result_array = cast_result.as_any().downcast_ref::<StringArray>().unwrap();
1075810757
// Check that values are correct
1075910758
assert_eq!(result_array.value(0), "10");
1076010759
assert_eq!(result_array.value(1), "10");
@@ -10890,10 +10889,7 @@ mod tests {
1089010889
let cast_result = cast(&array_ref, &DataType::Utf8).unwrap();
1089110890

1089210891
// Verify the result preserves nulls
10893-
let result_run_array = cast_result
10894-
.as_any()
10895-
.downcast_ref::<StringArray>()
10896-
.unwrap();
10892+
let result_run_array = cast_result.as_any().downcast_ref::<StringArray>().unwrap();
1089710893
assert_eq!(result_run_array.value(0), "1");
1089810894
assert!(result_run_array.is_null(2));
1089910895
assert_eq!(result_run_array.value(4), "2");
@@ -10939,5 +10935,131 @@ mod tests {
1093910935
// Expect this to fail
1094010936
assert!(cast_result.is_err());
1094110937
}
10938+
#[test]
10939+
fn test_cast_run_end_encoded_int64_to_int16_should_fail() {
10940+
use arrow_array::{Int64Array, RunArray, StringArray};
10941+
use arrow_schema::{DataType, Field};
10942+
use std::sync::Arc;
10943+
10944+
// Construct a valid REE array with Int64 run-ends
10945+
let run_ends = Int64Array::from(vec![100_000, 400_000, 700_000]); // values too large for Int16
10946+
let values = StringArray::from(vec!["a", "b", "c"]);
10947+
10948+
let ree_array = RunArray::<Int64Type>::try_new(&run_ends, &values).unwrap();
10949+
let array_ref = Arc::new(ree_array) as ArrayRef;
10950+
10951+
// Attempt to cast to RunEndEncoded<Int16, Utf8>
10952+
let target_type = DataType::RunEndEncoded(
10953+
Arc::new(Field::new("run_ends", DataType::Int16, false)),
10954+
Arc::new(Field::new("values", DataType::Utf8, true)),
10955+
);
10956+
let cast_options = CastOptions {
10957+
safe: false, // This should make it fail instead of returning nulls
10958+
format_options: FormatOptions::default(),
10959+
};
10960+
10961+
// This should fail due to run-end overflow
10962+
let result: Result<Arc<dyn Array + 'static>, ArrowError> =
10963+
cast_with_options(&array_ref, &target_type, &cast_options);
10964+
10965+
match result {
10966+
Err(e) => {
10967+
assert!(e
10968+
.to_string()
10969+
.contains("Cast error: Can't cast value 100000 to type Int16"));
10970+
}
10971+
Ok(_array_ref) => {
10972+
panic!("This should not happen");
10973+
}
10974+
}
10975+
}
10976+
#[test]
10977+
fn test_cast_run_end_encoded_int16_to_int64_should_succeed() {
10978+
use arrow_array::{Int16Array, RunArray, StringArray};
10979+
use arrow_schema::{DataType, Field};
10980+
use std::sync::Arc;
10981+
10982+
// Construct a valid REE array with Int16 run-ends
10983+
let run_ends = Int16Array::from(vec![2, 5, 8]); // values that fit in Int16
10984+
let values = StringArray::from(vec!["a", "b", "c"]);
10985+
10986+
let ree_array = RunArray::<Int16Type>::try_new(&run_ends, &values).unwrap();
10987+
let array_ref = Arc::new(ree_array) as ArrayRef;
10988+
10989+
// Attempt to cast to RunEndEncoded<Int64, Utf8> (upcast should succeed)
10990+
let target_type = DataType::RunEndEncoded(
10991+
Arc::new(Field::new("run_ends", DataType::Int64, false)),
10992+
Arc::new(Field::new("values", DataType::Utf8, true)),
10993+
);
10994+
let cast_options = CastOptions {
10995+
safe: false,
10996+
format_options: FormatOptions::default(),
10997+
};
10998+
10999+
// This should succeed due to valid upcast
11000+
let result: Result<Arc<dyn Array + 'static>, ArrowError> =
11001+
cast_with_options(&array_ref, &target_type, &cast_options);
11002+
11003+
match result {
11004+
Ok(array_ref) => {
11005+
// Downcast to RunArray<Int64Type>
11006+
let run_array = array_ref
11007+
.as_any()
11008+
.downcast_ref::<RunArray<Int64Type>>()
11009+
.unwrap();
11010+
11011+
// Verify the run ends were widened to Int64 and the string values are unchanged
11012+
// Assert the values were cast correctly
11013+
assert_eq!(run_array.run_ends().values(), &[2i64, 5i64, 8i64]);
11014+
assert_eq!(run_array.values().as_string::<i32>().value(0), "a");
11015+
assert_eq!(run_array.values().as_string::<i32>().value(1), "b");
11016+
assert_eq!(run_array.values().as_string::<i32>().value(2), "c");
11017+
}
11018+
Err(e) => {
11019+
panic!("Cast should have succeeded but failed: {}", e);
11020+
}
11021+
}
11022+
}
11023+
11024+
#[test]
11025+
fn test_cast_run_end_encoded_int32_to_int16_should_fail() {
11026+
use arrow_array::{Int32Array, RunArray, StringArray};
11027+
use arrow_schema::{DataType, Field};
11028+
use std::sync::Arc;
11029+
11030+
// Construct a valid REE array with Int32 run-ends
11031+
let run_ends = Int32Array::from(vec![1000, 50000, 80000]); // values too large for Int16
11032+
let values = StringArray::from(vec!["x", "y", "z"]);
11033+
11034+
println!("Original run_ends null count: {}", run_ends.null_count());
11035+
println!("Original run_ends values: {:?}", run_ends.values());
11036+
11037+
let ree_array = RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap();
11038+
let array_ref = Arc::new(ree_array) as ArrayRef;
11039+
11040+
// Attempt to cast to RunEndEncoded<Int16, Utf8> (downcast should fail)
11041+
let target_type = DataType::RunEndEncoded(
11042+
Arc::new(Field::new("run_ends", DataType::Int16, false)),
11043+
Arc::new(Field::new("values", DataType::Utf8, true)),
11044+
);
11045+
let cast_options = CastOptions {
11046+
safe: false,
11047+
format_options: FormatOptions::default(),
11048+
};
11049+
11050+
// This should fail due to run-end overflow
11051+
let result: Result<Arc<dyn Array + 'static>, ArrowError> =
11052+
cast_with_options(&array_ref, &target_type, &cast_options);
11053+
11054+
match result {
11055+
Ok(_) => {
11056+
panic!("Cast should have failed due to overflow but succeeded");
11057+
}
11058+
Err(e) => {
11059+
// Verify the error is about overflow/out of range
11060+
assert!(e.to_string().contains("Can't cast value"));
11061+
}
11062+
}
11063+
}
1094211064
}
1094311065
}

0 commit comments

Comments
 (0)