From 79b6f7e6b79e7210b514f3acc96b64a4ddfc46bf Mon Sep 17 00:00:00 2001 From: Khanh Duong Date: Sat, 18 Oct 2025 14:15:28 +0900 Subject: [PATCH 01/13] support parse list --- arrow-schema/src/datatype_parse.rs | 96 +++++++++++++++++++++++++----- 1 file changed, 80 insertions(+), 16 deletions(-) diff --git a/arrow-schema/src/datatype_parse.rs b/arrow-schema/src/datatype_parse.rs index 48b7089e8ecc..d62a84becabb 100644 --- a/arrow-schema/src/datatype_parse.rs +++ b/arrow-schema/src/datatype_parse.rs @@ -96,11 +96,29 @@ impl<'a> Parser<'a> { /// Parses the List type fn parse_list(&mut self) -> ArrowResult { self.expect_token(Token::LParen)?; + let nullable = self.nullable(); let data_type = self.parse_next_type()?; - self.expect_token(Token::RParen)?; - Ok(DataType::List(Arc::new(Field::new_list_field( - data_type, true, - )))) + + match self.next_token()? { + // default field name + Token::RParen => Ok(DataType::List(Arc::new(Field::new_list_field( + data_type, nullable, + )))), + // expects: field: 'field_name' + Token::Comma => { + self.expect_token(Token::Field)?; + self.expect_token(Token::Colon)?; + let field_name = self.parse_single_quoted_string("List's field")?; + self.expect_token(Token::RParen)?; + Ok(DataType::List(Arc::new(Field::new( + field_name, data_type, nullable, + )))) + } + tok => Err(make_error( + self.val, + &format!("Expected a single string for a field name; got {tok:?}"), + )), + } } /// Parses the LargeList type @@ -150,6 +168,19 @@ impl<'a> Parser<'a> { } } + /// Parses the next single quoted string + fn parse_single_quoted_string(&mut self, context: &str) -> ArrowResult { + let token = self.next_token()?; + if let Token::SingleQuotedString(string) = token { + Ok(string) + } else { + Err(make_error( + self.val, + &format!("expected single quoted string for {context}, got '{token}'"), + )) + } + } + /// Parses the next integer value fn parse_i64(&mut self, context: &str) -> ArrowResult { match self.next_token()? { @@ -354,16 +385,13 @@ impl<'a> Parser<'a> { tok => { return Err(make_error( self.val, - &format!("Expected a quoted string for a field name; got {tok:?}"), + &format!("Expected a double quoted string for a field name; got {tok:?}"), )); } }; self.expect_token(Token::Colon)?; - let nullable = self - .tokenizer - .next_if(|next| matches!(next, Ok(Token::Nullable))) - .is_some(); + let nullable = self.nullable(); let field_type = self.parse_next_type()?; fields.push(Arc::new(Field::new(field_name, field_type, nullable))); match self.next_token()? { @@ -382,6 +410,12 @@ impl<'a> Parser<'a> { Ok(DataType::Struct(Fields::from(fields))) } + fn nullable(&mut self) -> bool { + self.tokenizer + .next_if(|next| matches!(next, Ok(Token::Nullable))) + .is_some() + } + /// return the next token, or an error if there are none left fn next_token(&mut self) -> ArrowResult { match self.tokenizer.next() { @@ -406,6 +440,11 @@ fn is_separator(c: char) -> bool { c == '(' || c == ')' || c == ',' || c == ':' || c == ' ' } +enum QuoteType { + Double, + Single, +} + #[derive(Debug)] /// Splits a strings like Dictionary(Int32, Int64) into tokens sutable for parsing /// @@ -527,6 +566,7 @@ impl<'a> Tokenizer<'a> { "None" => Token::None, "nullable" => Token::Nullable, + "field" => Token::Field, "Struct" => Token::Struct, @@ -537,9 +577,14 @@ impl<'a> Tokenizer<'a> { Ok(token) } - /// Parses e.g. `"foo bar"` - fn parse_quoted_string(&mut self) -> ArrowResult { - if self.next_char() != Some('\"') { + /// Parses e.g. `"foo bar"`, `'foo bar'` + fn parse_quoted_string(&mut self, quote_type: QuoteType) -> ArrowResult { + let quote = match quote_type { + QuoteType::Double => '\"', + QuoteType::Single => '\'', + }; + + if self.next_char() != Some(quote) { return Err(make_error(self.val, "Expected \"")); } @@ -561,7 +606,7 @@ impl<'a> Tokenizer<'a> { is_escaped = true; self.word.push(c); } - '"' => { + c if c == quote => { if is_escaped { self.word.push(c); is_escaped = false; @@ -585,7 +630,10 @@ impl<'a> Tokenizer<'a> { return Err(make_error(self.val, "empty strings aren't allowed")); } - Ok(Token::DoubleQuotedString(val)) + match quote_type { + QuoteType::Double => Ok(Token::DoubleQuotedString(val)), + QuoteType::Single => Ok(Token::SingleQuotedString(val)), + } } } @@ -601,7 +649,10 @@ impl Iterator for Tokenizer<'_> { continue; } '"' => { - return Some(self.parse_quoted_string()); + return Some(self.parse_quoted_string(QuoteType::Double)); + } + '\'' => { + return Some(self.parse_quoted_string(QuoteType::Single)); } '(' => { self.next_char(); @@ -652,11 +703,13 @@ enum Token { None, Integer(i64), DoubleQuotedString(String), + SingleQuotedString(String), List, LargeList, FixedSizeList, Struct, Nullable, + Field, } impl Display for Token { @@ -687,8 +740,10 @@ impl Display for Token { Token::Dictionary => write!(f, "Dictionary"), Token::Integer(v) => write!(f, "Integer({v})"), Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"), + Token::SingleQuotedString(s) => write!(f, "SingleQuotedString({s})"), Token::Struct => write!(f, "Struct"), Token::Nullable => write!(f, "nullable"), + Token::Field => write!(f, "field"), } } } @@ -828,7 +883,16 @@ mod test { ), ])), DataType::Struct(Fields::empty()), - // TODO support more structured types (List, LargeList, Union, Map, RunEndEncoded, etc) + DataType::List(Arc::new(Field::new_list_field(DataType::Int64, true))), + DataType::List(Arc::new(Field::new_list_field(DataType::Int64, false))), + DataType::List(Arc::new(Field::new("Int64", DataType::Int64, true))), + DataType::List(Arc::new(Field::new("Int64", DataType::Int64, false))), + DataType::List(Arc::new(Field::new( + "nested_list", + DataType::List(Arc::new(Field::new("Int64", DataType::Int64, true))), + true, + ))), + // TODO support more structured types (LargeList, Union, Map, RunEndEncoded, etc) ] } From 28c639799fa740d5796289f4a705d415cbedc8e8 Mon Sep 17 00:00:00 2001 From: Khanh Duong Date: Sat, 18 Oct 2025 15:24:57 +0900 Subject: [PATCH 02/13] docs --- arrow-schema/src/datatype_parse.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arrow-schema/src/datatype_parse.rs b/arrow-schema/src/datatype_parse.rs index d62a84becabb..3fda20f79517 100644 --- a/arrow-schema/src/datatype_parse.rs +++ b/arrow-schema/src/datatype_parse.rs @@ -371,6 +371,8 @@ impl<'a> Parser<'a> { Box::new(value_type), )) } + + /// Parses the next Struct (called after `Struct` has been consumed) fn parse_struct(&mut self) -> ArrowResult { self.expect_token(Token::LParen)?; let mut fields = Vec::new(); @@ -410,6 +412,7 @@ impl<'a> Parser<'a> { Ok(DataType::Struct(Fields::from(fields))) } + /// return and consume if the next token is `Token::Nullable` fn nullable(&mut self) -> bool { self.tokenizer .next_if(|next| matches!(next, Ok(Token::Nullable))) From e25e4a1449a865136fa41db3cc1eee238152f5b3 Mon Sep 17 00:00:00 2001 From: Khanh Duong Date: Sat, 18 Oct 2025 16:23:42 +0900 Subject: [PATCH 03/13] refactor field name out parse list --- arrow-schema/src/datatype_parse.rs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arrow-schema/src/datatype_parse.rs b/arrow-schema/src/datatype_parse.rs index 3fda20f79517..98b1546ea86c 100644 --- a/arrow-schema/src/datatype_parse.rs +++ b/arrow-schema/src/datatype_parse.rs @@ -93,6 +93,14 @@ impl<'a> Parser<'a> { } } + /// Parses list field name: `field: 'field_name'` + fn parse_list_field_name(&mut self, context: &str) -> ArrowResult { + self.expect_token(Token::Field)?; + self.expect_token(Token::Colon)?; + let field_name = self.parse_single_quoted_string(context)?; + Ok(field_name) + } + /// Parses the List type fn parse_list(&mut self) -> ArrowResult { self.expect_token(Token::LParen)?; @@ -100,18 +108,16 @@ impl<'a> Parser<'a> { let data_type = self.parse_next_type()?; match self.next_token()? { - // default field name + // list with default field name Token::RParen => Ok(DataType::List(Arc::new(Field::new_list_field( data_type, nullable, )))), - // expects: field: 'field_name' + // list with field name Token::Comma => { - self.expect_token(Token::Field)?; - self.expect_token(Token::Colon)?; - let field_name = self.parse_single_quoted_string("List's field")?; + let field = self.parse_list_field_name("List's field")?; self.expect_token(Token::RParen)?; Ok(DataType::List(Arc::new(Field::new( - field_name, data_type, nullable, + field, data_type, nullable, )))) } tok => Err(make_error( From 08a65771c972b87930d8df79cb306270e55428d8 Mon Sep 17 00:00:00 2001 From: Khanh Duong Date: Sat, 18 Oct 2025 18:54:56 +0900 Subject: [PATCH 04/13] make `parse_list` easy to read --- arrow-schema/src/datatype_parse.rs | 38 ++++++++++++++++-------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/arrow-schema/src/datatype_parse.rs b/arrow-schema/src/datatype_parse.rs index 98b1546ea86c..3db1a7cbea23 100644 --- a/arrow-schema/src/datatype_parse.rs +++ b/arrow-schema/src/datatype_parse.rs @@ -93,12 +93,22 @@ impl<'a> Parser<'a> { } } - /// Parses list field name: `field: 'field_name'` - fn parse_list_field_name(&mut self, context: &str) -> ArrowResult { + /// Parses list field name, return `None` if the list doesn't have field name. + fn parse_list_field_name(&mut self, context: &str) -> ArrowResult> { + // `field` must be after a comma + if self + .tokenizer + .next_if(|next| matches!(next, Ok(Token::Comma))) + .is_none() + { + return Ok(None); + } + + // field name: `field: 'field_name'`. self.expect_token(Token::Field)?; self.expect_token(Token::Colon)?; let field_name = self.parse_single_quoted_string(context)?; - Ok(field_name) + Ok(Some(field_name)) } /// Parses the List type @@ -106,24 +116,16 @@ impl<'a> Parser<'a> { self.expect_token(Token::LParen)?; let nullable = self.nullable(); let data_type = self.parse_next_type()?; + let field = self.parse_list_field_name("List's field")?; + self.expect_token(Token::RParen)?; - match self.next_token()? { - // list with default field name - Token::RParen => Ok(DataType::List(Arc::new(Field::new_list_field( + match field { + Some(field) => Ok(DataType::List(Arc::new(Field::new( + field, data_type, nullable, + )))), + None => Ok(DataType::List(Arc::new(Field::new_list_field( data_type, nullable, )))), - // list with field name - Token::Comma => { - let field = self.parse_list_field_name("List's field")?; - self.expect_token(Token::RParen)?; - Ok(DataType::List(Arc::new(Field::new( - field, data_type, nullable, - )))) - } - tok => Err(make_error( - self.val, - &format!("Expected a single string for a field name; got {tok:?}"), - )), } } From a8822e5b46c52a8f31029a7f1db42bd0ea5c253c Mon Sep 17 00:00:00 2001 From: Khanh Duong Date: Sat, 18 Oct 2025 18:59:18 +0900 Subject: [PATCH 05/13] support `ListView` --- arrow-schema/src/datatype_parse.rs | 31 ++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/arrow-schema/src/datatype_parse.rs b/arrow-schema/src/datatype_parse.rs index 3db1a7cbea23..d3ed2dfcb4bd 100644 --- a/arrow-schema/src/datatype_parse.rs +++ b/arrow-schema/src/datatype_parse.rs @@ -83,6 +83,7 @@ impl<'a> Parser<'a> { Token::Decimal256 => self.parse_decimal_256(), Token::Dictionary => self.parse_dictionary(), Token::List => self.parse_list(), + Token::ListView => self.parse_list_view(), Token::LargeList => self.parse_large_list(), Token::FixedSizeList => self.parse_fixed_size_list(), Token::Struct => self.parse_struct(), @@ -129,6 +130,24 @@ impl<'a> Parser<'a> { } } + /// Parses the ListView type + fn parse_list_view(&mut self) -> ArrowResult { + self.expect_token(Token::LParen)?; + let nullable = self.nullable(); + let data_type = self.parse_next_type()?; + let field = self.parse_list_field_name("ListView's field")?; + self.expect_token(Token::RParen)?; + + match field { + Some(field) => Ok(DataType::ListView(Arc::new(Field::new( + field, data_type, nullable, + )))), + None => Ok(DataType::ListView(Arc::new(Field::new_list_field( + data_type, nullable, + )))), + } + } + /// Parses the LargeList type fn parse_large_list(&mut self) -> ArrowResult { self.expect_token(Token::LParen)?; @@ -547,6 +566,7 @@ impl<'a> Tokenizer<'a> { "Date64" => Token::SimpleType(DataType::Date64), "List" => Token::List, + "ListView" => Token::ListView, "LargeList" => Token::LargeList, "FixedSizeList" => Token::FixedSizeList, @@ -716,6 +736,7 @@ enum Token { DoubleQuotedString(String), SingleQuotedString(String), List, + ListView, LargeList, FixedSizeList, Struct, @@ -728,6 +749,7 @@ impl Display for Token { match self { Token::SimpleType(t) => write!(f, "{t}"), Token::List => write!(f, "List"), + Token::ListView => write!(f, "ListView"), Token::LargeList => write!(f, "LargeList"), Token::FixedSizeList => write!(f, "FixedSizeList"), Token::Timestamp => write!(f, "Timestamp"), @@ -903,6 +925,15 @@ mod test { DataType::List(Arc::new(Field::new("Int64", DataType::Int64, true))), true, ))), + DataType::ListView(Arc::new(Field::new_list_field(DataType::Int64, true))), + DataType::ListView(Arc::new(Field::new_list_field(DataType::Int64, false))), + DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, true))), + DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, false))), + DataType::ListView(Arc::new(Field::new( + "nested_list_view", + DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, true))), + true, + ))), // TODO support more structured types (LargeList, Union, Map, RunEndEncoded, etc) ] } From 776288e6178840dba5bb3311b66c553fee68c8d3 Mon Sep 17 00:00:00 2001 From: Khanh Duong Date: Sat, 18 Oct 2025 19:07:10 +0900 Subject: [PATCH 06/13] support `LargeList`, `LargeListView` --- arrow-schema/src/datatype_parse.rs | 54 ++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 3 deletions(-) diff --git a/arrow-schema/src/datatype_parse.rs b/arrow-schema/src/datatype_parse.rs index d3ed2dfcb4bd..99662627ed83 100644 --- a/arrow-schema/src/datatype_parse.rs +++ b/arrow-schema/src/datatype_parse.rs @@ -85,6 +85,7 @@ impl<'a> Parser<'a> { Token::List => self.parse_list(), Token::ListView => self.parse_list_view(), Token::LargeList => self.parse_large_list(), + Token::LargeListView => self.parse_large_list_view(), Token::FixedSizeList => self.parse_fixed_size_list(), Token::Struct => self.parse_struct(), tok => Err(make_error( @@ -151,11 +152,37 @@ impl<'a> Parser<'a> { /// Parses the LargeList type fn parse_large_list(&mut self) -> ArrowResult { self.expect_token(Token::LParen)?; + let nullable = self.nullable(); let data_type = self.parse_next_type()?; + let field = self.parse_list_field_name("LargeList's field")?; self.expect_token(Token::RParen)?; - Ok(DataType::LargeList(Arc::new(Field::new_list_field( - data_type, true, - )))) + + match field { + Some(field) => Ok(DataType::LargeList(Arc::new(Field::new( + field, data_type, nullable, + )))), + None => Ok(DataType::LargeList(Arc::new(Field::new_list_field( + data_type, nullable, + )))), + } + } + + /// Parses the LargeListView type + fn parse_large_list_view(&mut self) -> ArrowResult { + self.expect_token(Token::LParen)?; + let nullable = self.nullable(); + let data_type = self.parse_next_type()?; + let field = self.parse_list_field_name("LargeListView's field")?; + self.expect_token(Token::RParen)?; + + match field { + Some(field) => Ok(DataType::LargeListView(Arc::new(Field::new( + field, data_type, nullable, + )))), + None => Ok(DataType::LargeListView(Arc::new(Field::new_list_field( + data_type, nullable, + )))), + } } /// Parses the FixedSizeList type @@ -568,6 +595,7 @@ impl<'a> Tokenizer<'a> { "List" => Token::List, "ListView" => Token::ListView, "LargeList" => Token::LargeList, + "LargeListView" => Token::LargeListView, "FixedSizeList" => Token::FixedSizeList, "s" | "Second" => Token::TimeUnit(TimeUnit::Second), @@ -738,6 +766,7 @@ enum Token { List, ListView, LargeList, + LargeListView, FixedSizeList, Struct, Nullable, @@ -751,6 +780,7 @@ impl Display for Token { Token::List => write!(f, "List"), Token::ListView => write!(f, "ListView"), Token::LargeList => write!(f, "LargeList"), + Token::LargeListView => write!(f, "LargeListView"), Token::FixedSizeList => write!(f, "FixedSizeList"), Token::Timestamp => write!(f, "Timestamp"), Token::Time32 => write!(f, "Time32"), @@ -934,6 +964,24 @@ mod test { DataType::ListView(Arc::new(Field::new("Int64", DataType::Int64, true))), true, ))), + DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int64, true))), + DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int64, false))), + DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, true))), + DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, false))), + DataType::LargeList(Arc::new(Field::new( + "nested_large_list", + DataType::LargeList(Arc::new(Field::new("Int64", DataType::Int64, true))), + true, + ))), + DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int64, true))), + DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int64, false))), + DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, true))), + DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, false))), + DataType::LargeListView(Arc::new(Field::new( + "nested_large_list_view", + DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, true))), + true, + ))), // TODO support more structured types (LargeList, Union, Map, RunEndEncoded, etc) ] } From 1c2edb4bdb08ddd15259cf1c231fe1bbfe160b3b Mon Sep 17 00:00:00 2001 From: Khanh Duong Date: Sat, 18 Oct 2025 19:10:32 +0900 Subject: [PATCH 07/13] return default list field name instead of None --- arrow-schema/src/datatype_parse.rs | 61 +++++++++--------------------- 1 file changed, 18 insertions(+), 43 deletions(-) diff --git a/arrow-schema/src/datatype_parse.rs b/arrow-schema/src/datatype_parse.rs index 99662627ed83..d616268e4164 100644 --- a/arrow-schema/src/datatype_parse.rs +++ b/arrow-schema/src/datatype_parse.rs @@ -95,22 +95,21 @@ impl<'a> Parser<'a> { } } - /// Parses list field name, return `None` if the list doesn't have field name. - fn parse_list_field_name(&mut self, context: &str) -> ArrowResult> { - // `field` must be after a comma + /// Parses list field name + fn parse_list_field_name(&mut self, context: &str) -> ArrowResult { + // field must be after a comma if self .tokenizer .next_if(|next| matches!(next, Ok(Token::Comma))) .is_none() { - return Ok(None); + return Ok(Field::LIST_FIELD_DEFAULT_NAME.into()); } - // field name: `field: 'field_name'`. + // expects: `field: 'field_name'`. self.expect_token(Token::Field)?; self.expect_token(Token::Colon)?; - let field_name = self.parse_single_quoted_string(context)?; - Ok(Some(field_name)) + self.parse_single_quoted_string(context) } /// Parses the List type @@ -120,15 +119,9 @@ impl<'a> Parser<'a> { let data_type = self.parse_next_type()?; let field = self.parse_list_field_name("List's field")?; self.expect_token(Token::RParen)?; - - match field { - Some(field) => Ok(DataType::List(Arc::new(Field::new( - field, data_type, nullable, - )))), - None => Ok(DataType::List(Arc::new(Field::new_list_field( - data_type, nullable, - )))), - } + Ok(DataType::List(Arc::new(Field::new( + field, data_type, nullable, + )))) } /// Parses the ListView type @@ -138,15 +131,9 @@ impl<'a> Parser<'a> { let data_type = self.parse_next_type()?; let field = self.parse_list_field_name("ListView's field")?; self.expect_token(Token::RParen)?; - - match field { - Some(field) => Ok(DataType::ListView(Arc::new(Field::new( - field, data_type, nullable, - )))), - None => Ok(DataType::ListView(Arc::new(Field::new_list_field( - data_type, nullable, - )))), - } + Ok(DataType::ListView(Arc::new(Field::new( + field, data_type, nullable, + )))) } /// Parses the LargeList type @@ -156,15 +143,9 @@ impl<'a> Parser<'a> { let data_type = self.parse_next_type()?; let field = self.parse_list_field_name("LargeList's field")?; self.expect_token(Token::RParen)?; - - match field { - Some(field) => Ok(DataType::LargeList(Arc::new(Field::new( - field, data_type, nullable, - )))), - None => Ok(DataType::LargeList(Arc::new(Field::new_list_field( - data_type, nullable, - )))), - } + Ok(DataType::LargeList(Arc::new(Field::new( + field, data_type, nullable, + )))) } /// Parses the LargeListView type @@ -174,15 +155,9 @@ impl<'a> Parser<'a> { let data_type = self.parse_next_type()?; let field = self.parse_list_field_name("LargeListView's field")?; self.expect_token(Token::RParen)?; - - match field { - Some(field) => Ok(DataType::LargeListView(Arc::new(Field::new( - field, data_type, nullable, - )))), - None => Ok(DataType::LargeListView(Arc::new(Field::new_list_field( - data_type, nullable, - )))), - } + Ok(DataType::LargeListView(Arc::new(Field::new( + field, data_type, nullable, + )))) } /// Parses the FixedSizeList type From 2bc42d9d3b2fd1901fc3be178124184f52af4eff Mon Sep 17 00:00:00 2001 From: Khanh Duong Date: Sat, 18 Oct 2025 19:22:36 +0900 Subject: [PATCH 08/13] remove uneeded context name --- arrow-schema/src/datatype_parse.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arrow-schema/src/datatype_parse.rs b/arrow-schema/src/datatype_parse.rs index d616268e4164..595dca25fab3 100644 --- a/arrow-schema/src/datatype_parse.rs +++ b/arrow-schema/src/datatype_parse.rs @@ -117,7 +117,7 @@ impl<'a> Parser<'a> { self.expect_token(Token::LParen)?; let nullable = self.nullable(); let data_type = self.parse_next_type()?; - let field = self.parse_list_field_name("List's field")?; + let field = self.parse_list_field_name("List")?; self.expect_token(Token::RParen)?; Ok(DataType::List(Arc::new(Field::new( field, data_type, nullable, @@ -129,7 +129,7 @@ impl<'a> Parser<'a> { self.expect_token(Token::LParen)?; let nullable = self.nullable(); let data_type = self.parse_next_type()?; - let field = self.parse_list_field_name("ListView's field")?; + let field = self.parse_list_field_name("ListView")?; self.expect_token(Token::RParen)?; Ok(DataType::ListView(Arc::new(Field::new( field, data_type, nullable, @@ -141,7 +141,7 @@ impl<'a> Parser<'a> { self.expect_token(Token::LParen)?; let nullable = self.nullable(); let data_type = self.parse_next_type()?; - let field = self.parse_list_field_name("LargeList's field")?; + let field = self.parse_list_field_name("LargeList")?; self.expect_token(Token::RParen)?; Ok(DataType::LargeList(Arc::new(Field::new( field, data_type, nullable, @@ -153,7 +153,7 @@ impl<'a> Parser<'a> { self.expect_token(Token::LParen)?; let nullable = self.nullable(); let data_type = self.parse_next_type()?; - let field = self.parse_list_field_name("LargeListView's field")?; + let field = self.parse_list_field_name("LargeListView")?; self.expect_token(Token::RParen)?; Ok(DataType::LargeListView(Arc::new(Field::new( field, data_type, nullable, From b00d84967735f6d4432fea098d69ff89ad02c4c3 Mon Sep 17 00:00:00 2001 From: Khanh Duong Date: Sat, 18 Oct 2025 19:24:36 +0900 Subject: [PATCH 09/13] support `FixedSizeList` --- arrow-schema/src/datatype_parse.rs | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/arrow-schema/src/datatype_parse.rs b/arrow-schema/src/datatype_parse.rs index 595dca25fab3..5f72ddc914d6 100644 --- a/arrow-schema/src/datatype_parse.rs +++ b/arrow-schema/src/datatype_parse.rs @@ -163,12 +163,15 @@ impl<'a> Parser<'a> { /// Parses the FixedSizeList type fn parse_fixed_size_list(&mut self) -> ArrowResult { self.expect_token(Token::LParen)?; + // expects: `length x #data_type [field]` let length = self.parse_i32("FixedSizeList")?; - self.expect_token(Token::Comma)?; + self.expect_token(Token::Count)?; + let nullable = self.nullable(); let data_type = self.parse_next_type()?; + let field = self.parse_list_field_name("FixedSizeList")?; self.expect_token(Token::RParen)?; Ok(DataType::FixedSizeList( - Arc::new(Field::new_list_field(data_type, true)), + Arc::new(Field::new(field, data_type, nullable)), length, )) } @@ -601,6 +604,7 @@ impl<'a> Tokenizer<'a> { "nullable" => Token::Nullable, "field" => Token::Field, + "x" => Token::Count, "Struct" => Token::Struct, @@ -746,6 +750,7 @@ enum Token { Struct, Nullable, Field, + Count, } impl Display for Token { @@ -782,6 +787,7 @@ impl Display for Token { Token::Struct => write!(f, "Struct"), Token::Nullable => write!(f, "nullable"), Token::Field => write!(f, "field"), + Token::Count => write!(f, "x"), } } } @@ -957,7 +963,22 @@ mod test { DataType::LargeListView(Arc::new(Field::new("Int64", DataType::Int64, true))), true, ))), - // TODO support more structured types (LargeList, Union, Map, RunEndEncoded, etc) + DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int64, true)), 2), + DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int64, false)), 2), + DataType::FixedSizeList(Arc::new(Field::new("Int64", DataType::Int64, true)), 2), + DataType::FixedSizeList(Arc::new(Field::new("Int64", DataType::Int64, false)), 2), + DataType::FixedSizeList( + Arc::new(Field::new( + "nested_large_list_view", + DataType::FixedSizeList( + Arc::new(Field::new("Int64", DataType::Int64, true)), + 2, + ), + true, + )), + 2, + ), + // TODO support more structured types (Union, Map, RunEndEncoded, etc) ] } From 5b6cbd6b4b482e4610fec8c486c142b7b62afcb6 Mon Sep 17 00:00:00 2001 From: Khanh Duong Date: Sat, 18 Oct 2025 19:29:58 +0900 Subject: [PATCH 10/13] rename `Token::Count` to `Token::X` --- arrow-schema/src/datatype_parse.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arrow-schema/src/datatype_parse.rs b/arrow-schema/src/datatype_parse.rs index 5f72ddc914d6..f1865cee7aa8 100644 --- a/arrow-schema/src/datatype_parse.rs +++ b/arrow-schema/src/datatype_parse.rs @@ -165,7 +165,7 @@ impl<'a> Parser<'a> { self.expect_token(Token::LParen)?; // expects: `length x #data_type [field]` let length = self.parse_i32("FixedSizeList")?; - self.expect_token(Token::Count)?; + self.expect_token(Token::X)?; let nullable = self.nullable(); let data_type = self.parse_next_type()?; let field = self.parse_list_field_name("FixedSizeList")?; @@ -604,7 +604,7 @@ impl<'a> Tokenizer<'a> { "nullable" => Token::Nullable, "field" => Token::Field, - "x" => Token::Count, + "x" => Token::X, "Struct" => Token::Struct, @@ -750,7 +750,7 @@ enum Token { Struct, Nullable, Field, - Count, + X, } impl Display for Token { @@ -787,7 +787,7 @@ impl Display for Token { Token::Struct => write!(f, "Struct"), Token::Nullable => write!(f, "nullable"), Token::Field => write!(f, "field"), - Token::Count => write!(f, "x"), + Token::X => write!(f, "x"), } } } From da856f96a50c3542d3195868a9a1ad6600eac2ee Mon Sep 17 00:00:00 2001 From: Khanh Duong Date: Sat, 18 Oct 2025 19:50:21 +0900 Subject: [PATCH 11/13] docs and example --- arrow-schema/src/datatype_parse.rs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/arrow-schema/src/datatype_parse.rs b/arrow-schema/src/datatype_parse.rs index f1865cee7aa8..49c5767aca9b 100644 --- a/arrow-schema/src/datatype_parse.rs +++ b/arrow-schema/src/datatype_parse.rs @@ -95,7 +95,7 @@ impl<'a> Parser<'a> { } } - /// Parses list field name + /// Parses list field name. Returns default field name if not found. fn parse_list_field_name(&mut self, context: &str) -> ArrowResult { // field must be after a comma if self @@ -112,7 +112,8 @@ impl<'a> Parser<'a> { self.parse_single_quoted_string(context) } - /// Parses the List type + /// Parses the List type (called after `List` has been consumed) + /// E.g: List(nullable Int64, field: 'foo') fn parse_list(&mut self) -> ArrowResult { self.expect_token(Token::LParen)?; let nullable = self.nullable(); @@ -124,7 +125,8 @@ impl<'a> Parser<'a> { )))) } - /// Parses the ListView type + /// Parses the ListView type (called after `ListView` has been consumed) + /// E.g: ListView(nullable Int64, field: 'foo') fn parse_list_view(&mut self) -> ArrowResult { self.expect_token(Token::LParen)?; let nullable = self.nullable(); @@ -136,7 +138,8 @@ impl<'a> Parser<'a> { )))) } - /// Parses the LargeList type + /// Parses the LargeList type (called after `LargeList` has been consumed) + /// E.g: LargeList(nullable Int64, field: 'foo') fn parse_large_list(&mut self) -> ArrowResult { self.expect_token(Token::LParen)?; let nullable = self.nullable(); @@ -148,7 +151,8 @@ impl<'a> Parser<'a> { )))) } - /// Parses the LargeListView type + /// Parses the LargeListView type (called after `LargeListView` has been consumed) + /// E.g: LargeListView(nullable Int64, field: 'foo') fn parse_large_list_view(&mut self) -> ArrowResult { self.expect_token(Token::LParen)?; let nullable = self.nullable(); @@ -160,10 +164,10 @@ impl<'a> Parser<'a> { )))) } - /// Parses the FixedSizeList type + /// Parses the FixedSizeList type (called after `FixedSizeList` has been consumed) + /// E.g: FixedSizeList(5 x nullable Int64, field: 'foo') fn parse_fixed_size_list(&mut self) -> ArrowResult { self.expect_token(Token::LParen)?; - // expects: `length x #data_type [field]` let length = self.parse_i32("FixedSizeList")?; self.expect_token(Token::X)?; let nullable = self.nullable(); From 82197b524b5d094d27c1e3eda2b140062935a37c Mon Sep 17 00:00:00 2001 From: Khanh Duong Date: Sat, 18 Oct 2025 20:00:06 +0900 Subject: [PATCH 12/13] typo --- arrow-schema/src/datatype_parse.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-schema/src/datatype_parse.rs b/arrow-schema/src/datatype_parse.rs index 49c5767aca9b..a437f72feab7 100644 --- a/arrow-schema/src/datatype_parse.rs +++ b/arrow-schema/src/datatype_parse.rs @@ -973,7 +973,7 @@ mod test { DataType::FixedSizeList(Arc::new(Field::new("Int64", DataType::Int64, false)), 2), DataType::FixedSizeList( Arc::new(Field::new( - "nested_large_list_view", + "nested_fixed_size_list", DataType::FixedSizeList( Arc::new(Field::new("Int64", DataType::Int64, true)), 2, From 49495a6e4c4d238150332c5db52696cf08db3196 Mon Sep 17 00:00:00 2001 From: Khanh Duong Date: Wed, 29 Oct 2025 06:54:19 +0900 Subject: [PATCH 13/13] rename `nullable` to `parse_opt_nullable` Co-authored-by: Andrew Lamb --- arrow-schema/src/datatype_parse.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arrow-schema/src/datatype_parse.rs b/arrow-schema/src/datatype_parse.rs index a437f72feab7..56d8fb56a533 100644 --- a/arrow-schema/src/datatype_parse.rs +++ b/arrow-schema/src/datatype_parse.rs @@ -116,7 +116,7 @@ impl<'a> Parser<'a> { /// E.g: List(nullable Int64, field: 'foo') fn parse_list(&mut self) -> ArrowResult { self.expect_token(Token::LParen)?; - let nullable = self.nullable(); + let nullable = self.parse_opt_nullable(); let data_type = self.parse_next_type()?; let field = self.parse_list_field_name("List")?; self.expect_token(Token::RParen)?; @@ -129,7 +129,7 @@ impl<'a> Parser<'a> { /// E.g: ListView(nullable Int64, field: 'foo') fn parse_list_view(&mut self) -> ArrowResult { self.expect_token(Token::LParen)?; - let nullable = self.nullable(); + let nullable = self.parse_opt_nullable(); let data_type = self.parse_next_type()?; let field = self.parse_list_field_name("ListView")?; self.expect_token(Token::RParen)?; @@ -142,7 +142,7 @@ impl<'a> Parser<'a> { /// E.g: LargeList(nullable Int64, field: 'foo') fn parse_large_list(&mut self) -> ArrowResult { self.expect_token(Token::LParen)?; - let nullable = self.nullable(); + let nullable = self.parse_opt_nullable(); let data_type = self.parse_next_type()?; let field = self.parse_list_field_name("LargeList")?; self.expect_token(Token::RParen)?; @@ -155,7 +155,7 @@ impl<'a> Parser<'a> { /// E.g: LargeListView(nullable Int64, field: 'foo') fn parse_large_list_view(&mut self) -> ArrowResult { self.expect_token(Token::LParen)?; - let nullable = self.nullable(); + let nullable = self.parse_opt_nullable(); let data_type = self.parse_next_type()?; let field = self.parse_list_field_name("LargeListView")?; self.expect_token(Token::RParen)?; @@ -170,7 +170,7 @@ impl<'a> Parser<'a> { self.expect_token(Token::LParen)?; let length = self.parse_i32("FixedSizeList")?; self.expect_token(Token::X)?; - let nullable = self.nullable(); + let nullable = self.parse_opt_nullable(); let data_type = self.parse_next_type()?; let field = self.parse_list_field_name("FixedSizeList")?; self.expect_token(Token::RParen)?; @@ -429,7 +429,7 @@ impl<'a> Parser<'a> { }; self.expect_token(Token::Colon)?; - let nullable = self.nullable(); + let nullable = self.parse_opt_nullable(); let field_type = self.parse_next_type()?; fields.push(Arc::new(Field::new(field_name, field_type, nullable))); match self.next_token()? { @@ -449,7 +449,7 @@ impl<'a> Parser<'a> { } /// return and consume if the next token is `Token::Nullable` - fn nullable(&mut self) -> bool { + fn parse_opt_nullable(&mut self) -> bool { self.tokenizer .next_if(|next| matches!(next, Ok(Token::Nullable))) .is_some()