diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index a93e70e714e8..d1882e47b33c 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -15,7 +15,11 @@ // specific language governing permissions and limitations // under the License. -use arrow::array::{as_largestring_array, Array}; +use arrow::array::{ + as_largestring_array, Array, FixedSizeListArray, LargeListArray, ListArray, +}; +use arrow::buffer::OffsetBuffer; +use arrow::compute; use arrow::datatypes::DataType; use datafusion_expr::sort_properties::ExprProperties; use std::any::Any; @@ -35,8 +39,8 @@ use datafusion_macros::user_doc; #[user_doc( doc_section(label = "String Functions"), - description = "Concatenates multiple strings together.", - syntax_example = "concat(str[, ..., str_n])", + description = "Concatenates multiple strings or arrays together.", + syntax_example = "concat(str[, ..., str_n]) or concat(array[, ..., array_n])", sql_example = r#"```sql > select concat('data', 'f', 'us', 'ion'); +-------------------------------------------------------+ @@ -44,11 +48,17 @@ use datafusion_macros::user_doc; +-------------------------------------------------------+ | datafusion | +-------------------------------------------------------+ +> select concat(make_array(1, 2), make_array(3, 4)); ++--------------------------------------------------+ +| concat(make_array(1, 2), make_array(3, 4)) | ++--------------------------------------------------+ +| [1, 2, 3, 4] | ++--------------------------------------------------+ ```"#, - standard_argument(name = "str", prefix = "String"), + standard_argument(name = "str", prefix = "String or Array"), argument( name = "str_n", - description = "Subsequent string expressions to concatenate." + description = "Subsequent string or array expressions to concatenate." ), related_udf(name = "concat_ws") )] @@ -65,13 +75,405 @@ impl Default for ConcatFunc { impl ConcatFunc { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::variadic( - vec![Utf8View, Utf8, LargeUtf8], - Volatility::Immutable, - ), + signature: Signature::user_defined(Volatility::Immutable), + } + } + + /// Infer the element type from input arguments, even when arrays are null. + /// This helps maintain type consistency between planning and execution. + fn infer_element_type_from_args(&self, args: &[ColumnarValue]) -> Result { + // Look for any non-null array to get the element type + for arg in args { + match arg { + ColumnarValue::Array(array) => { + if let DataType::List(field) + | DataType::LargeList(field) + | DataType::FixedSizeList(field, _) = array.data_type() + { + return Ok(field.data_type().clone()); + } + } + ColumnarValue::Scalar(scalar) => match scalar { + ScalarValue::List(list_array) => { + if let DataType::List(field) = list_array.data_type() { + return Ok(field.data_type().clone()); + } + } + ScalarValue::LargeList(list_array) => { + if let DataType::LargeList(field) = list_array.data_type() { + return Ok(field.data_type().clone()); + } + } + ScalarValue::FixedSizeList(list_array) => { + if let DataType::FixedSizeList(field, _) = list_array.data_type() + { + return Ok(field.data_type().clone()); + } + } + _ => {} + }, + } + } + + // If no array type found, default to Int32 (common for cast operations) + Ok(DataType::Int32) + } + + /// Concatenates array arguments into a single array. + /// + /// This function handles array concatenation by extracting elements from each input array + /// and combining them into a single result array. It optimizes for single-row vs multi-row + /// processing to avoid unnecessary scalar-to-array conversions. + /// + /// # Arguments + /// * `args` - Array of ColumnarValue inputs (Arrays or Scalar arrays) + /// * `num_rows` - Number of rows to process + /// + /// # Returns + /// A ColumnarValue containing a ListArray with concatenated elements + fn concat_arrays( + &self, + args: &[ColumnarValue], + num_rows: usize, + ) -> Result { + if args.is_empty() { + return plan_err!("concat requires at least one argument"); + } + + if num_rows == 1 { + return self.concat_arrays_single_row(args); + } + + // Multi-row case: process each row individually to avoid scalar-to-array conversion + self.concat_arrays_multi_row(args, num_rows) + } + + /// Fast path for single-row array concatenation. + /// + /// When processing only a single row, this optimized path avoids the overhead + /// of row-by-row iteration and directly processes all input arrays to extract + /// their elements and concatenate them into a single result array. + /// + /// This method handles both Array and Scalar array inputs by extracting elements + /// from each non-null input and combining them using Arrow's compute::concat. + /// + /// # Arguments + /// * `args` - Array of ColumnarValue inputs to concatenate + /// + /// # Returns + /// A ColumnarValue containing a single-element ListArray with all concatenated elements + fn concat_arrays_single_row(&self, args: &[ColumnarValue]) -> Result { + let mut all_elements = Vec::new(); + + for arg in args { + match arg { + ColumnarValue::Array(array) => { + if !array.is_null(0) { + let elements = self.extract_row_elements(array.as_ref(), 0)?; + all_elements.extend(elements); + } + } + ColumnarValue::Scalar(scalar) => { + if !scalar.is_null() { + // For scalars, create a single-element array directly without conversion + match scalar { + ScalarValue::List(list_array) => { + let elements = + self.extract_row_elements(list_array.as_ref(), 0)?; + all_elements.extend(elements); + } + ScalarValue::LargeList(list_array) => { + let elements = + self.extract_row_elements(list_array.as_ref(), 0)?; + all_elements.extend(elements); + } + ScalarValue::FixedSizeList(list_array) => { + let elements = + self.extract_row_elements(list_array.as_ref(), 0)?; + all_elements.extend(elements); + } + _ => { + return plan_err!( + "Expected array scalar, got {}", + scalar.data_type() + ) + } + } + } + } + } + } + + if all_elements.is_empty() { + // Return empty array when all inputs are null + // Try to infer element type from input arguments + let element_type = self.infer_element_type_from_args(args)?; + let field = Arc::new(arrow::datatypes::Field::new_list_field( + element_type.clone(), + true, + )); + let offsets = OffsetBuffer::from_lengths([0]); + let empty_values = arrow::array::new_empty_array(&element_type); + let result = ListArray::new(field, offsets, empty_values, None); + return Ok(ColumnarValue::Array(Arc::new(result))); } + + let element_refs: Vec<&dyn Array> = + all_elements.iter().map(|a| a.as_ref()).collect(); + let concatenated = compute::concat(&element_refs)?; + + // Build single-element ListArray + let field = Arc::new(arrow::datatypes::Field::new_list_field( + concatenated.data_type().clone(), + true, + )); + let offsets = OffsetBuffer::from_lengths([concatenated.len()]); + let result = ListArray::new(field, offsets, concatenated, None); + + Ok(ColumnarValue::Array(Arc::new(result))) + } + + /// Extract elements from a specific row of an array, optimized for performance. + /// + /// This function handles the extraction of individual elements from different types + /// of list arrays (List, LargeList, FixedSizeList) at a specific row index. + /// It returns a vector of single-element arrays for each non-null element found. + /// + /// The extraction process: + /// 1. Checks if the array at the given row is null (returns empty if so) + /// 2. Gets the list value at the specified row using the appropriate array type + /// 3. Filters out null elements and creates single-element arrays for each + /// 4. Returns a vector of arrays ready for concatenation + /// + /// # Arguments + /// * `array` - The input array (must be a List, LargeList, or FixedSizeList) + /// * `row_idx` - The row index to extract elements from + /// + /// # Returns + /// A vector of single-element arrays containing the non-null elements from the specified row + fn extract_row_elements( + &self, + array: &dyn Array, + row_idx: usize, + ) -> Result>> { + if array.is_null(row_idx) { + return Ok(Vec::new()); + } + + let list_value = match array.data_type() { + DataType::List(_) => { + let list_array = array.as_any().downcast_ref::().unwrap(); + list_array.value(row_idx) + } + DataType::LargeList(_) => { + let list_array = array.as_any().downcast_ref::().unwrap(); + list_array.value(row_idx) + } + DataType::FixedSizeList(_, _) => { + let list_array = + array.as_any().downcast_ref::().unwrap(); + list_array.value(row_idx) + } + _ => return plan_err!("Expected array type, got {}", array.data_type()), + }; + + // Extract non-null elements efficiently + Ok((0..list_value.len()) + .filter(|&i| !list_value.is_null(i)) + .map(|i| list_value.slice(i, 1)) + .collect()) + } + + /// Multi-row array concatenation with efficient batching. + /// + /// For multiple rows, this method processes each row individually to avoid + /// the performance penalty of converting scalar arrays to full arrays for + /// the entire batch. It iterates through each row, extracts elements from + /// all input arrays at that row index, concatenates them, and builds the + /// final result array. + /// + /// This approach is more memory-efficient for large batches as it processes + /// one row at a time rather than materializing all scalar arrays upfront. + /// + /// # Arguments + /// * `args` - Array of ColumnarValue inputs (Arrays or Scalar arrays) + /// * `num_rows` - Number of rows to process + /// + /// # Returns + /// A ColumnarValue containing a ListArray with concatenated elements for each row + fn concat_arrays_multi_row( + &self, + args: &[ColumnarValue], + num_rows: usize, + ) -> Result { + let mut result_arrays = Vec::with_capacity(num_rows); + let mut sample_array = None; + + for row_idx in 0..num_rows { + let mut row_elements = Vec::new(); + + // Collect elements from this row across all args + // Process each input argument for the current row, accumulating elements + for arg in args { + match arg { + ColumnarValue::Array(array) => { + // Keep track of a sample array for result type inference + if sample_array.is_none() { + sample_array = Some(Arc::clone(array)); + } + // Extract elements from this row if not null + if !array.is_null(row_idx) { + let elements = + self.extract_row_elements(array.as_ref(), row_idx)?; + row_elements.extend(elements); + } + } + ColumnarValue::Scalar(scalar) => { + // Scalar arrays are repeated for each row - extract from index 0 + if !scalar.is_null() { + match scalar { + ScalarValue::List(list_array) => { + if sample_array.is_none() { + sample_array = Some( + Arc::clone(list_array) as Arc + ); + } + let elements = self + .extract_row_elements(list_array.as_ref(), 0)?; + row_elements.extend(elements); + } + ScalarValue::LargeList(list_array) => { + if sample_array.is_none() { + sample_array = Some( + Arc::clone(list_array) as Arc + ); + } + let elements = self + .extract_row_elements(list_array.as_ref(), 0)?; + row_elements.extend(elements); + } + ScalarValue::FixedSizeList(list_array) => { + if sample_array.is_none() { + sample_array = Some( + Arc::clone(list_array) as Arc + ); + } + let elements = self + .extract_row_elements(list_array.as_ref(), 0)?; + row_elements.extend(elements); + } + _ => { + return plan_err!( + "Expected array scalar, got {}", + scalar.data_type() + ) + } + } + } + } + } + } + + // Build concatenated result for this row + if row_elements.is_empty() { + // No elements found - record as null/empty for this row + result_arrays.push(None); + } else { + // Concatenate all collected elements using Arrow's efficient concat + let element_refs: Vec<&dyn Array> = + row_elements.iter().map(|a| a.as_ref()).collect(); + let concatenated = compute::concat(&element_refs)?; + result_arrays.push(Some(concatenated)); + } + } + + // Build the final result array + if let Some(sample) = sample_array { + self.build_list_array_result(result_arrays, &sample) + } else { + plan_err!("No sample array found for result construction") + } + } + + /// Build a ListArray result from concatenated elements. + /// + /// This function constructs the final ListArray from a vector of concatenated + /// arrays (one per row). It handles the complex process of: + /// 1. Determining the element type from a sample input array + /// 2. Building efficient offset arrays to track list boundaries + /// 3. Concatenating all values into a single flat values array + /// 4. Constructing the final ListArray with proper metadata + /// + /// The function handles null rows (empty concatenations) by using empty + /// ranges in the offset array, which Arrow interprets as empty lists. + /// + /// # Arguments + /// * `result_arrays` - Vector of concatenated arrays for each row (None for empty rows) + /// * `sample_array` - Sample input array used to determine element type + /// + /// # Returns + /// A ColumnarValue containing a ListArray with all concatenated results + fn build_list_array_result( + &self, + result_arrays: Vec>>, + sample_array: &dyn Array, + ) -> Result { + // Determine element type from sample array + // Extract the inner element type from the list wrapper to create the result field + let element_type = match sample_array.data_type() { + DataType::List(field) + | DataType::LargeList(field) + | DataType::FixedSizeList(field, _) => field.data_type().clone(), + _ => return plan_err!("Expected array type for element type determination"), + }; + + let field = Arc::new(arrow::datatypes::Field::new_list_field( + element_type.clone(), + true, + )); + + // Build values and offsets efficiently + // Create the flat values array and offset array that defines list boundaries + let mut values_vec = Vec::new(); + let mut offsets = vec![0i32]; // Start with offset 0 + let mut current_offset = 0i32; + + for result in result_arrays { + match result { + Some(array) => { + // Add this array's length to the current offset + current_offset += array.len() as i32; + values_vec.push(array); + } + None => { + // Empty array for null result - offset doesn't change + // This creates an empty list in the final result + } + } + // Record the ending offset for this list + offsets.push(current_offset); + } + + // Create the final flat values array containing all concatenated elements + let values = if values_vec.is_empty() { + // All rows were empty - create an empty array of the correct type + arrow::array::new_empty_array(&element_type) + } else { + // Concatenate all row results into a single flat array + let array_refs: Vec<&dyn Array> = + values_vec.iter().map(|a| a.as_ref()).collect(); + compute::concat(&array_refs)? + }; + + let result = ListArray::new( + field, + OffsetBuffer::new(offsets.into()), + values, + None, // Let nulls be determined by empty ranges + ); + + Ok(ColumnarValue::Array(Arc::new(result))) } } @@ -88,25 +490,149 @@ impl ScalarUDFImpl for ConcatFunc { &self.signature } + fn coerce_types(&self, arg_types: &[DataType]) -> Result> { + use DataType::*; + + if arg_types.is_empty() { + return plan_err!("concat requires at least one argument"); + } + + // Check if we have array types - all inputs must be arrays if any are arrays + let has_arrays = arg_types + .iter() + .any(|dt| matches!(dt, List(_) | LargeList(_) | FixedSizeList(_, _))); + let _has_strings = arg_types + .iter() + .any(|dt| matches!(dt, Utf8View | Utf8 | LargeUtf8 | _)); + + if has_arrays { + // If we have arrays, validate that ALL inputs are arrays or NULL + for dt in arg_types { + if !matches!(dt, List(_) | LargeList(_) | FixedSizeList(_, _) | Null) { + return plan_err!("Cannot mix array and non-array arguments in concat function. Found array type and {}", dt); + } + } + + // Coerce arrays to a common element type + // Find the first non-null, non-empty array type to use as target + let mut target_element_type = None; + for dt in arg_types { + if let List(field) | LargeList(field) | FixedSizeList(field, _) = dt { + if !matches!(field.data_type(), Null) { + target_element_type = Some(field.data_type().clone()); + break; + } + } + } + + // If we found a target type, coerce all List to that type + if let Some(target_type) = target_element_type { + let coerced_types: Vec = arg_types + .iter() + .map(|dt| match dt { + List(field) if matches!(field.data_type(), Null) => { + List(Arc::new(arrow::datatypes::Field::new( + "item", + target_type.clone(), + true, + ))) + } + _ => dt.clone(), + }) + .collect(); + return Ok(coerced_types); + } + + // No target type found (all are null or empty), return as-is + return Ok(arg_types.to_vec()); + } + + let coerced_types = arg_types + .iter() + .map(|data_type| match data_type { + Utf8View | Utf8 | LargeUtf8 => data_type.clone(), + _ => Utf8, + }) + .collect(); + Ok(coerced_types) + } + fn return_type(&self, arg_types: &[DataType]) -> Result { use DataType::*; - let mut dt = &Utf8; - arg_types.iter().for_each(|data_type| { - if data_type == &Utf8View { - dt = data_type; + + // Check if we have any arrays (ignoring nulls) + let array_types: Vec<&DataType> = arg_types + .iter() + .filter(|dt| matches!(dt, List(_) | LargeList(_) | FixedSizeList(_, _))) + .collect(); + + if !array_types.is_empty() { + // We have arrays - return list type based on first non-null array + for data_type in array_types { + if let List(field) | LargeList(field) | FixedSizeList(field, _) = + data_type + { + return Ok(List(Arc::new(arrow::datatypes::Field::new( + "item", + field.data_type().clone(), + true, + )))); + } } - if data_type == &LargeUtf8 && dt != &Utf8View { + } + + // Check if all arguments are null (for cast operations like CAST(NULL AS INT[])) + if arg_types.iter().all(|dt| matches!(dt, Null)) { + // When all are null, we need to determine from context or use a default + // For now, return List as a reasonable default + return Ok(List(Arc::new(arrow::datatypes::Field::new( + "item", Int32, true, + )))); + } + + let mut dt = &Utf8; + for data_type in arg_types.iter() { + if data_type == &Utf8View || (data_type == &LargeUtf8 && dt != &Utf8View) { dt = data_type; } - }); - - Ok(dt.to_owned()) + } + Ok(dt.clone()) } /// Concatenates the text representations of all the arguments. NULL arguments are ignored. /// concat('abcde', 2, NULL, 22) = 'abcde222' + /// + /// Also supports array concatenation: concat([1, 2], [3, 4]) = [1, 2, 3, 4] fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { - let ScalarFunctionArgs { args, .. } = args; + let ScalarFunctionArgs { + args, number_rows, .. + } = args; + + if args.is_empty() { + return plan_err!("concat requires at least one argument"); + } + + // Fast array detection - if ANY argument is an array type, route to array concatenation logic + // This allows the function to handle both string concat and array concat seamlessly + for arg in &args { + let is_array = match arg { + ColumnarValue::Array(array) => matches!( + array.data_type(), + DataType::List(_) + | DataType::LargeList(_) + | DataType::FixedSizeList(_, _) + ), + ColumnarValue::Scalar(scalar) => matches!( + scalar.data_type(), + DataType::List(_) + | DataType::LargeList(_) + | DataType::FixedSizeList(_, _) + ), + }; + if is_array { + return self.concat_arrays(&args, number_rows); + } + } let mut return_datatype = DataType::Utf8; args.iter().for_each(|col| { @@ -139,10 +665,14 @@ impl ScalarUDFImpl for ConcatFunc { match scalar.try_as_str() { Some(Some(v)) => result.push_str(v), Some(None) => {} // null literal - None => plan_err!( - "Concat function does not support scalar type {}", - scalar - )?, + None => { + // For non-string types, convert to string representation + if scalar.is_null() { + // Skip null values + } else { + result.push_str(&format!("{scalar}")); + } + } } } @@ -189,7 +719,7 @@ impl ScalarUDFImpl for ConcatFunc { ColumnarValueRef::NonNullableArray(string_array) }; columns.push(column); - }, + } DataType::LargeUtf8 => { let string_array = as_largestring_array(array); @@ -197,10 +727,12 @@ impl ScalarUDFImpl for ConcatFunc { let column = if array.is_nullable() { ColumnarValueRef::NullableLargeStringArray(string_array) } else { - ColumnarValueRef::NonNullableLargeStringArray(string_array) + ColumnarValueRef::NonNullableLargeStringArray( + string_array, + ) }; columns.push(column); - }, + } DataType::Utf8View => { let string_array = as_string_view_array(array)?; @@ -211,13 +743,13 @@ impl ScalarUDFImpl for ConcatFunc { ColumnarValueRef::NonNullableStringViewArray(string_array) }; columns.push(column); - }, + } other => { return plan_err!("Input was {other} which is not a supported datatype for concat function") } }; } - _ => unreachable!("concat"), + _ => return plan_err!("Unsupported argument type: {}", arg.data_type()), } } @@ -258,7 +790,7 @@ impl ScalarUDFImpl for ConcatFunc { let string_array = builder.finish(None); Ok(ColumnarValue::Array(Arc::new(string_array))) } - _ => unreachable!(), + _ => plan_err!("Unsupported return datatype: {return_datatype}"), } } @@ -288,6 +820,10 @@ impl ScalarUDFImpl for ConcatFunc { } pub fn simplify_concat(args: Vec) -> Result { + if args.is_empty() { + return plan_err!("concat requires at least one argument"); + } + let mut new_args = Vec::with_capacity(args.len()); let mut contiguous_scalar = "".to_string(); @@ -302,45 +838,63 @@ pub fn simplify_concat(args: Vec) -> Result { ConcatFunc::new().return_type(&data_types) }?; - for arg in args.clone() { + for arg in args.iter() { match arg { Expr::Literal(ScalarValue::Utf8(None), _) => {} - Expr::Literal(ScalarValue::LargeUtf8(None), _) => { - } - Expr::Literal(ScalarValue::Utf8View(None), _) => { } + Expr::Literal(ScalarValue::LargeUtf8(None), _) => {} + Expr::Literal(ScalarValue::Utf8View(None), _) => {} - // filter out `null` args - // All literals have been converted to Utf8 or LargeUtf8 in type_coercion. - // Concatenate it with the `contiguous_scalar`. Expr::Literal(ScalarValue::Utf8(Some(v)), _) => { - contiguous_scalar += &v; + contiguous_scalar += v; } Expr::Literal(ScalarValue::LargeUtf8(Some(v)), _) => { - contiguous_scalar += &v; + contiguous_scalar += v; } Expr::Literal(ScalarValue::Utf8View(Some(v)), _) => { - contiguous_scalar += &v; + contiguous_scalar += v; } - Expr::Literal(x, _) => { - return internal_err!( - "The scalar {x} should be casted to string type during the type coercion." - ) + Expr::Literal(scalar_val, _) => { + // Skip array literals - they should be handled at runtime + if matches!( + scalar_val.data_type(), + DataType::List(_) + | DataType::LargeList(_) + | DataType::FixedSizeList(_, _) + ) { + if !contiguous_scalar.is_empty() { + match return_type { + DataType::Utf8 => new_args.push(lit(contiguous_scalar)), + DataType::LargeUtf8 => new_args.push(lit( + ScalarValue::LargeUtf8(Some(contiguous_scalar)), + )), + DataType::Utf8View => new_args.push(lit( + ScalarValue::Utf8View(Some(contiguous_scalar)), + )), + _ => return Ok(ExprSimplifyResult::Original(args)), + } + contiguous_scalar = "".to_string(); + } + new_args.push(arg.clone()); + } else { + // Convert non-string, non-array literals to their string representation + let string_repr = format!("{scalar_val}"); + contiguous_scalar += &string_repr; + } } - // If the arg is not a literal, we should first push the current `contiguous_scalar` - // to the `new_args` (if it is not empty) and reset it to empty string. - // Then pushing this arg to the `new_args`. arg => { if !contiguous_scalar.is_empty() { match return_type { DataType::Utf8 => new_args.push(lit(contiguous_scalar)), - DataType::LargeUtf8 => new_args.push(lit(ScalarValue::LargeUtf8(Some(contiguous_scalar)))), - DataType::Utf8View => new_args.push(lit(ScalarValue::Utf8View(Some(contiguous_scalar)))), - _ => unreachable!(), + DataType::LargeUtf8 => new_args + .push(lit(ScalarValue::LargeUtf8(Some(contiguous_scalar)))), + DataType::Utf8View => new_args + .push(lit(ScalarValue::Utf8View(Some(contiguous_scalar)))), + _ => return Ok(ExprSimplifyResult::Original(args)), } contiguous_scalar = "".to_string(); } - new_args.push(arg); + new_args.push(arg.clone()); } } } @@ -354,7 +908,7 @@ pub fn simplify_concat(args: Vec) -> Result { DataType::Utf8View => { new_args.push(lit(ScalarValue::Utf8View(Some(contiguous_scalar)))) } - _ => unreachable!(), + _ => return Ok(ExprSimplifyResult::Original(args)), } } @@ -376,6 +930,7 @@ mod tests { use crate::utils::test::test_function; use arrow::array::{Array, LargeStringArray, StringViewArray}; use arrow::array::{ArrayRef, StringArray}; + use arrow::buffer::NullBuffer; use arrow::datatypes::Field; use datafusion_common::config::ConfigOptions; use DataType::*; @@ -479,7 +1034,7 @@ mod tests { ] .into_iter() .map(Arc::new) - .collect::>(); + .collect(); let args = ScalarFunctionArgs { args: vec![c0, c1, c2, c3, c4], @@ -501,4 +1056,181 @@ mod tests { } Ok(()) } + + #[test] + fn test_concat_with_integers() -> Result<()> { + use datafusion_common::config::ConfigOptions; + + let args = vec![ + ColumnarValue::Scalar(ScalarValue::Utf8(Some("abc".to_string()))), + ColumnarValue::Scalar(ScalarValue::Int64(Some(123))), + ColumnarValue::Scalar(ScalarValue::Utf8(None)), // NULL + ColumnarValue::Scalar(ScalarValue::Int64(Some(456))), + ]; + + let arg_fields = vec![ + Field::new("a", Utf8, true), + Field::new("b", Int64, true), + Field::new("c", Utf8, true), + Field::new("d", Int64, true), + ] + .into_iter() + .map(Arc::new) + .collect(); + + let func_args = ScalarFunctionArgs { + args, + arg_fields, + number_rows: 1, + return_field: Field::new("f", Utf8, true).into(), + config_options: Arc::new(ConfigOptions::default()), + }; + + let result = ConcatFunc::new().invoke_with_args(func_args)?; + + // Expected result should be "abc123456" + match result { + ColumnarValue::Scalar(ScalarValue::Utf8(Some(s))) => { + assert_eq!(s, "abc123456"); + } + _ => panic!("Expected scalar UTF8 result, got {result:?}"), + } + + Ok(()) + } + + #[test] + fn test_concat_arrays_basic() -> Result<()> { + use arrow::array::{Int32Array, ListArray}; + use datafusion_common::config::ConfigOptions; + + let field = Arc::new(Field::new("item", Int32, true)); + let array1 = ListArray::new( + Arc::clone(&field), + OffsetBuffer::from_lengths([3]), + Arc::new(Int32Array::from(vec![1, 2, 3])), + None, + ); + let array2 = ListArray::new( + Arc::clone(&field), + OffsetBuffer::from_lengths([2]), + Arc::new(Int32Array::from(vec![4, 5])), + None, + ); + + let args = ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(Arc::new(array1)), + ColumnarValue::Array(Arc::new(array2)), + ], + arg_fields: vec![ + Arc::new(Field::new("a", List(Arc::clone(&field)), true)), + Arc::new(Field::new("b", List(Arc::clone(&field)), true)), + ], + number_rows: 1, + return_field: Arc::new(Field::new("f", List(field), true)), + config_options: Arc::new(ConfigOptions::default()), + }; + + let result = ConcatFunc::new().invoke_with_args(args)?; + if let ColumnarValue::Array(array) = result { + let list_array = array.as_any().downcast_ref::().unwrap(); + let array_value = list_array.value(0); + let values = array_value.as_any().downcast_ref::().unwrap(); + assert_eq!(values.values(), &[1, 2, 3, 4, 5]); + } + Ok(()) + } + + #[test] + fn test_concat_arrays_multi_row() -> Result<()> { + use arrow::array::{Int32Array, ListArray}; + use datafusion_common::config::ConfigOptions; + + let field = Arc::new(Field::new("item", Int32, true)); + let array1 = ListArray::new( + Arc::clone(&field), + OffsetBuffer::from_lengths([2, 2]), + Arc::new(Int32Array::from(vec![1, 2, 10, 20])), + None, + ); + let array2 = ListArray::new( + Arc::clone(&field), + OffsetBuffer::from_lengths([1, 1]), + Arc::new(Int32Array::from(vec![3, 30])), + None, + ); + + let args = ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(Arc::new(array1)), + ColumnarValue::Array(Arc::new(array2)), + ], + arg_fields: vec![ + Arc::new(Field::new("a", List(Arc::clone(&field)), true)), + Arc::new(Field::new("b", List(Arc::clone(&field)), true)), + ], + number_rows: 2, + return_field: Arc::new(Field::new("f", List(field), true)), + config_options: Arc::new(ConfigOptions::default()), + }; + + let result = ConcatFunc::new().invoke_with_args(args)?; + if let ColumnarValue::Array(array) = result { + let list_array = array.as_any().downcast_ref::().unwrap(); + assert_eq!(list_array.len(), 2); + + let array_value1 = list_array.value(0); + let row1 = array_value1.as_any().downcast_ref::().unwrap(); + assert_eq!(row1.values(), &[1, 2, 3]); + + let array_value2 = list_array.value(1); + let row2 = array_value2.as_any().downcast_ref::().unwrap(); + assert_eq!(row2.values(), &[10, 20, 30]); + } + Ok(()) + } + + #[test] + fn test_concat_arrays_with_nulls() -> Result<()> { + use arrow::array::{Int32Array, ListArray}; + use datafusion_common::config::ConfigOptions; + + let field = Arc::new(Field::new("item", Int32, true)); + let array1 = ListArray::new( + Arc::clone(&field), + OffsetBuffer::from_lengths([2]), + Arc::new(Int32Array::from(vec![1, 2])), + Some(NullBuffer::new_null(1)), + ); + let array2 = ListArray::new( + Arc::clone(&field), + OffsetBuffer::from_lengths([2]), + Arc::new(Int32Array::from(vec![3, 4])), + None, + ); + + let args = ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(Arc::new(array1)), + ColumnarValue::Array(Arc::new(array2)), + ], + arg_fields: vec![ + Arc::new(Field::new("a", List(Arc::clone(&field)), true)), + Arc::new(Field::new("b", List(Arc::clone(&field)), true)), + ], + number_rows: 1, + return_field: Arc::new(Field::new("f", List(field), true)), + config_options: Arc::new(ConfigOptions::default()), + }; + + let result = ConcatFunc::new().invoke_with_args(args)?; + if let ColumnarValue::Array(array) = result { + let list_array = array.as_any().downcast_ref::().unwrap(); + let array_value = list_array.value(0); + let values = array_value.as_any().downcast_ref::().unwrap(); + assert_eq!(values.values(), &[3, 4]); + } + Ok(()) + } } diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index b15ec026372d..c8c2d87fd4e1 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -770,8 +770,7 @@ datafusion public string_agg 1 OUT NULL String NULL false 1 query TTTBI rowsort select specific_name, data_type, parameter_mode, is_variadic, rid from information_schema.parameters where specific_name = 'concat'; ---- -concat String IN true 0 -concat String OUT false 0 +concat NULL NULL true 0 # test ceorcion signature query TTITI rowsort diff --git a/datafusion/sqllogictest/test_files/spark/string/concat.slt b/datafusion/sqllogictest/test_files/spark/string/concat.slt index 258cb829d7d4..7b70eabffa61 100644 --- a/datafusion/sqllogictest/test_files/spark/string/concat.slt +++ b/datafusion/sqllogictest/test_files/spark/string/concat.slt @@ -46,3 +46,43 @@ SELECT concat(a, b, c) from (select 'a' a, 'b' b, 'c' c union all select null a, ---- abc NULL + +# Test array concatenation +query ? +SELECT concat([1, 2], [3, 4]); +---- +[1, 2, 3, 4] + +query ? +SELECT concat([1, 2], [3, 4], [5, 6]); +---- +[1, 2, 3, 4, 5, 6] + +# Test array concatenation with nulls +query ? +SELECT concat([1, 2], NULL, [3, 4]); +---- +[1, 2, 3, 4] + +# Test array concatenation with empty arrays +query ? +SELECT concat([], [1, 2]); +---- +[1, 2] + +query ? +SELECT concat([1, 2], []); +---- +[1, 2] + +# Test concatenation of all null arrays should return empty array +query ? +SELECT concat(CAST(NULL AS INT[]), CAST(NULL AS INT[])); +---- +[] + +# Test string arrays +query ? +SELECT concat(['a', 'b'], ['c', 'd']); +---- +[a, b, c, d] diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index d2e7066191f9..303aebb00df8 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -1332,16 +1332,16 @@ chr(expression) ### `concat` -Concatenates multiple strings together. +Concatenates multiple strings or arrays together. ```sql -concat(str[, ..., str_n]) +concat(str[, ..., str_n]) or concat(array[, ..., array_n]) ``` #### Arguments -- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. -- **str_n**: Subsequent string expressions to concatenate. +- **str**: String or Array expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **str_n**: Subsequent string or array expressions to concatenate. #### Example @@ -1352,6 +1352,12 @@ concat(str[, ..., str_n]) +-------------------------------------------------------+ | datafusion | +-------------------------------------------------------+ +> select concat(make_array(1, 2), make_array(3, 4)); ++--------------------------------------------------+ +| concat(make_array(1, 2), make_array(3, 4)) | ++--------------------------------------------------+ +| [1, 2, 3, 4] | ++--------------------------------------------------+ ``` **Related functions**: