Skip to content

Commit d9a5386

Browse files
adriangbclaudemartin-g
authored andcommitted
Add arrow_field(expr) scalar UDF (apache#21389)
## Which issue does this PR close?

- Related to apache#21387 and apache#19435

## Rationale for this change

DataFusion has individual introspection functions (`arrow_typeof`, `arrow_metadata`, `is_nullable`) but no single function that returns the complete Arrow `Field` representation. Having `arrow_field(expr)` that returns a struct with all field info avoids multiple function calls when you need the full picture, and provides a natural complement to the existing introspection suite.

## What changes are included in this PR?

Adds `arrow_field(expr)` scalar UDF that returns a struct:

```sql
> SELECT arrow_field(x) FROM my_table;
{name: x, data_type: Int32, nullable: false, metadata: {}}

> SELECT arrow_field(x)['data_type'] FROM my_table;
Int32
```

The returned struct has four fields:

- `name` (Utf8) — the field name
- `data_type` (Utf8) — the Arrow data type as string
- `nullable` (Boolean) — whether the field is nullable
- `metadata` (Map<Utf8, Utf8>) — the field metadata

Individual fields are accessible via bracket syntax.

**Files:**

- `datafusion/functions/src/core/arrow_field.rs` — new UDF implementation
- `datafusion/functions/src/core/mod.rs` — registration
- `datafusion/sqllogictest/test_files/arrow_field.slt` — tests

## Are these changes tested?

Yes, sqllogictest covering literals (int, null, bool, string, float, list), table columns, nullability, and struct field access.

## Are there any user-facing changes?

New SQL function `arrow_field(expr)` is available.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Co-authored-by: Martin Grigorov <martin-g@users.noreply.github.com>
1 parent 7258781 commit d9a5386

File tree

5 files changed

+347
-1
lines changed

5 files changed

+347
-1
lines changed
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use arrow::array::{
19+
Array, BooleanArray, MapBuilder, StringArray, StringBuilder, StructArray,
20+
};
21+
use arrow::datatypes::{DataType, Field, Fields};
22+
use datafusion_common::{Result, ScalarValue, utils::take_function_args};
23+
use datafusion_expr::{
24+
ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
25+
Volatility,
26+
};
27+
use datafusion_macros::user_doc;
28+
use std::sync::Arc;
29+
30+
#[user_doc(
31+
doc_section(label = "Other Functions"),
32+
description = "Returns a struct containing the Arrow field information of the expression, including name, data type, nullability, and metadata.",
33+
syntax_example = "arrow_field(expression)",
34+
sql_example = r#"```sql
35+
> select arrow_field(1);
36+
+-------------------------------------------------------------+
37+
| arrow_field(Int64(1)) |
38+
+-------------------------------------------------------------+
39+
| {name: lit, data_type: Int64, nullable: false, metadata: {}} |
40+
+-------------------------------------------------------------+
41+
42+
> select arrow_field(1)['data_type'];
43+
+-----------------------------------+
44+
| arrow_field(Int64(1))[data_type] |
45+
+-----------------------------------+
46+
| Int64 |
47+
+-----------------------------------+
48+
```"#,
49+
argument(
50+
name = "expression",
51+
description = "Expression to evaluate. The expression can be a constant, column, or function, and any combination of operators."
52+
)
53+
)]
54+
/// Scalar UDF behind the SQL function `arrow_field(expression)`.
///
/// It reports the Arrow field of its single argument (name, data type,
/// nullability and metadata) as one struct value.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
55+
pub struct ArrowFieldFunc {
56+
// Signature returned by `ScalarUDFImpl::signature`; `new()` builds it as
// `Signature::any(1, Volatility::Immutable)`.
signature: Signature,
57+
}
58+
59+
impl Default for ArrowFieldFunc {
60+
fn default() -> Self {
61+
Self::new()
62+
}
63+
}
64+
65+
impl ArrowFieldFunc {
66+
pub fn new() -> Self {
67+
Self {
68+
signature: Signature::any(1, Volatility::Immutable),
69+
}
70+
}
71+
72+
fn return_struct_type() -> DataType {
73+
DataType::Struct(Fields::from(vec![
74+
Field::new("name", DataType::Utf8, false),
75+
Field::new("data_type", DataType::Utf8, false),
76+
Field::new("nullable", DataType::Boolean, false),
77+
Field::new(
78+
"metadata",
79+
DataType::Map(
80+
Arc::new(Field::new(
81+
"entries",
82+
DataType::Struct(Fields::from(vec![
83+
Field::new("keys", DataType::Utf8, false),
84+
Field::new("values", DataType::Utf8, true),
85+
])),
86+
false,
87+
)),
88+
false,
89+
),
90+
false,
91+
),
92+
]))
93+
}
94+
}
95+
96+
impl ScalarUDFImpl for ArrowFieldFunc {
97+
fn name(&self) -> &str {
98+
"arrow_field"
99+
}
100+
101+
fn signature(&self) -> &Signature {
102+
&self.signature
103+
}
104+
105+
fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
106+
Ok(Self::return_struct_type())
107+
}
108+
109+
fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
110+
let return_type = args.return_type().clone();
111+
let [field] = take_function_args(self.name(), args.arg_fields)?;
112+
113+
// Build the name array
114+
let name_array =
115+
Arc::new(StringArray::from(vec![field.name().as_str()])) as Arc<dyn Array>;
116+
117+
// Build the data_type array
118+
let data_type_str = field.data_type().to_string();
119+
let data_type_array =
120+
Arc::new(StringArray::from(vec![data_type_str.as_str()])) as Arc<dyn Array>;
121+
122+
// Build the nullable array
123+
let nullable_array =
124+
Arc::new(BooleanArray::from(vec![field.is_nullable()])) as Arc<dyn Array>;
125+
126+
// Build the metadata map array (same pattern as arrow_metadata.rs)
127+
let metadata = field.metadata();
128+
let mut map_builder =
129+
MapBuilder::new(None, StringBuilder::new(), StringBuilder::new());
130+
131+
let mut entries: Vec<_> = metadata.iter().collect();
132+
entries.sort_by_key(|(k, _)| *k);
133+
134+
for (k, v) in entries {
135+
map_builder.keys().append_value(k);
136+
map_builder.values().append_value(v);
137+
}
138+
map_builder.append(true)?;
139+
140+
let metadata_array = Arc::new(map_builder.finish()) as Arc<dyn Array>;
141+
142+
// Build the struct
143+
let DataType::Struct(fields) = return_type else {
144+
unreachable!()
145+
};
146+
147+
let struct_array = StructArray::new(
148+
fields,
149+
vec![name_array, data_type_array, nullable_array, metadata_array],
150+
None,
151+
);
152+
153+
Ok(ColumnarValue::Scalar(ScalarValue::try_from_array(
154+
&struct_array,
155+
0,
156+
)?))
157+
}
158+
159+
fn documentation(&self) -> Option<&Documentation> {
160+
self.doc()
161+
}
162+
}

datafusion/functions/src/core/mod.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use datafusion_expr::ScalarUDF;
2121
use std::sync::Arc;
2222

2323
pub mod arrow_cast;
24+
pub mod arrow_field;
2425
pub mod arrow_metadata;
2526
pub mod arrow_try_cast;
2627
pub mod arrowtypeof;
@@ -63,6 +64,7 @@ make_udf_function!(union_extract::UnionExtractFun, union_extract);
6364
make_udf_function!(union_tag::UnionTagFunc, union_tag);
6465
make_udf_function!(version::VersionFunc, version);
6566
make_udf_function!(arrow_metadata::ArrowMetadataFunc, arrow_metadata);
67+
make_udf_function!(arrow_field::ArrowFieldFunc, arrow_field);
6668

6769
pub mod expr_fn {
6870
use datafusion_expr::{Expr, Literal};
@@ -103,6 +105,10 @@ pub mod expr_fn {
103105
arrow_typeof,
104106
"Returns the Arrow type of the input expression.",
105107
arg1
108+
),(
109+
arrow_field,
110+
"Returns the Arrow field info (name, data_type, nullable, metadata) of the input expression.",
111+
arg1
106112
),(
107113
arrow_metadata,
108114
"Returns the metadata of the input expression",
@@ -158,6 +164,7 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
158164
vec![
159165
nullif(),
160166
arrow_cast(),
167+
arrow_field(),
161168
arrow_try_cast(),
162169
cast_to_type(),
163170
try_cast_to_type(),

datafusion/sqllogictest/src/test_context.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ impl TestContext {
165165
info!("Registering table with many types");
166166
register_table_with_many_types(test_ctx.session_ctx()).await;
167167
}
168-
"metadata.slt" => {
168+
"metadata.slt" | "arrow_field.slt" => {
169169
info!("Registering metadata table tables");
170170
register_metadata_tables(test_ctx.session_ctx()).await;
171171
}
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
# arrow_field on integer literal
19+
query ?
20+
SELECT arrow_field(1)
21+
----
22+
{name: lit, data_type: Int64, nullable: false, metadata: {}}
23+
24+
# arrow_field on null literal
25+
query ?
26+
SELECT arrow_field(null)
27+
----
28+
{name: lit, data_type: Null, nullable: true, metadata: {}}
29+
30+
# arrow_field on boolean literal
31+
query ?
32+
SELECT arrow_field(true)
33+
----
34+
{name: lit, data_type: Boolean, nullable: false, metadata: {}}
35+
36+
# arrow_field on string literal
37+
query ?
38+
SELECT arrow_field('foo')
39+
----
40+
{name: lit, data_type: Utf8, nullable: false, metadata: {}}
41+
42+
# arrow_field on float literal
43+
query ?
44+
SELECT arrow_field(1.0)
45+
----
46+
{name: lit, data_type: Float64, nullable: false, metadata: {}}
47+
48+
# arrow_field on list
49+
query ?
50+
SELECT arrow_field(ARRAY[1,2,3])
51+
----
52+
{name: lit, data_type: List(Int64), nullable: false, metadata: {}}
53+
54+
# arrow_field on map
55+
query ?
56+
SELECT arrow_field(MAP {'a': 1, 'b': 2})
57+
----
58+
{name: lit, data_type: Map("entries": non-null Struct("key": non-null Utf8, "value": Int64), unsorted), nullable: false, metadata: {}}
59+
60+
# arrow_field on struct
61+
query ?
62+
SELECT arrow_field({a: 1, b: 'foo'})
63+
----
64+
{name: lit, data_type: Struct("a": Int64, "b": Utf8), nullable: false, metadata: {}}
65+
66+
# arrow_field on dictionary
67+
query ?
68+
SELECT arrow_field(arrow_cast('foo', 'Dictionary(Int32, Utf8)'))
69+
----
70+
{name: lit, data_type: Dictionary(Int32, Utf8), nullable: false, metadata: {}}
71+
72+
# arrow_field struct field access - data_type
73+
query T
74+
SELECT arrow_field(1)['data_type']
75+
----
76+
Int64
77+
78+
# arrow_field struct field access - nullable
79+
query B
80+
SELECT arrow_field(1)['nullable']
81+
----
82+
false
83+
84+
# arrow_field struct field access - name
85+
query T
86+
SELECT arrow_field(1)['name']
87+
----
88+
lit
89+
90+
# arrow_field with table columns
91+
statement ok
92+
CREATE TABLE arrow_field_test(x INT NOT NULL, y TEXT) AS VALUES (1, 'a');
93+
94+
query ?
95+
SELECT arrow_field(x) FROM arrow_field_test
96+
----
97+
{name: x, data_type: Int32, nullable: false, metadata: {}}
98+
99+
query ?
100+
SELECT arrow_field(y) FROM arrow_field_test
101+
----
102+
{name: y, data_type: Utf8View, nullable: true, metadata: {}}
103+
104+
# arrow_field column access - name reflects column name
105+
query T
106+
SELECT arrow_field(x)['name'] FROM arrow_field_test
107+
----
108+
x
109+
110+
# arrow_field column access - nullability
111+
query B
112+
SELECT arrow_field(x)['nullable'] FROM arrow_field_test
113+
----
114+
false
115+
116+
query B
117+
SELECT arrow_field(y)['nullable'] FROM arrow_field_test
118+
----
119+
true
120+
121+
statement ok
122+
DROP TABLE arrow_field_test;
123+
124+
# arrow_field on a column that carries field metadata
125+
# (table_with_metadata is registered by the Rust test harness)
126+
query ?
127+
SELECT arrow_field(id) FROM table_with_metadata LIMIT 1
128+
----
129+
{name: id, data_type: Int32, nullable: true, metadata: {metadata_key: the id field}}
130+
131+
# arrow_field metadata field access
132+
query ?
133+
SELECT arrow_field(id)['metadata'] FROM table_with_metadata LIMIT 1
134+
----
135+
{metadata_key: the id field}
136+
137+
# arrow_field nullability field access
138+
query I
139+
SELECT count(*) FROM table_with_metadata WHERE NOT arrow_field(id)['nullable']
140+
----
141+
0
142+
143+
query I
144+
SELECT count(*) FROM table_with_metadata WHERE arrow_field(id)['nullable']
145+
----
146+
3

docs/source/user-guide/sql/scalar_functions.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5282,6 +5282,7 @@ union_tag(union_expression)
52825282
## Other Functions
52835283

52845284
- [arrow_cast](#arrow_cast)
5285+
- [arrow_field](#arrow_field)
52855286
- [arrow_metadata](#arrow_metadata)
52865287
- [arrow_try_cast](#arrow_try_cast)
52875288
- [arrow_typeof](#arrow_typeof)
@@ -5328,6 +5329,36 @@ arrow_cast(expression, datatype)
53285329
+---------------------------+---------------------+
53295330
```
53305331

5332+
### `arrow_field`
5333+
5334+
Returns a struct containing the Arrow field information of the expression, including name, data type, nullability, and metadata.
5335+
5336+
```sql
5337+
arrow_field(expression)
5338+
```
5339+
5340+
#### Arguments
5341+
5342+
- **expression**: Expression to evaluate. The expression can be a constant, column, or function, and any combination of operators.
5343+
5344+
#### Example
5345+
5346+
```sql
5347+
> select arrow_field(1);
5348+
+-------------------------------------------------------------+
5349+
| arrow_field(Int64(1)) |
5350+
+-------------------------------------------------------------+
5351+
| {name: lit, data_type: Int64, nullable: false, metadata: {}} |
5352+
+-------------------------------------------------------------+
5353+
5354+
> select arrow_field(1)['data_type'];
5355+
+-----------------------------------+
5356+
| arrow_field(Int64(1))[data_type] |
5357+
+-----------------------------------+
5358+
| Int64 |
5359+
+-----------------------------------+
5360+
```
5361+
53315362
### `arrow_metadata`
53325363

53335364
Returns the metadata of the input expression. If a key is provided, returns the value for that key. If no key is provided, returns a Map of all metadata.

0 commit comments

Comments
 (0)