From 81283607b263d85a17abcfc2125b2b20e1aefc8a Mon Sep 17 00:00:00 2001 From: Allen Xu Date: Wed, 20 May 2026 19:05:39 +0800 Subject: [PATCH 1/2] [AutoSparkUT] Recover SPARK-10136 nested-list parquet reads Refs #11589, #11592. ParquetSchemaUtils.clipSparkArrayType already follows the Parquet LIST backward-compatibility rules, but the guard for the unannotated 1-level legacy branch checked only isRepetition(REPEATED) and missed the additional getLogicalTypeAnnotation == null clause that Spark CPU applies (ParquetReadSupport.clipParquetListType, lines 268-269). For a Thrift- or parquet-avro 1.7-written 2-level nested LIST shape: required group f (LIST) { repeated group f_tuple (LIST) { repeated int32 f_tuple_tuple; } } the outer call descended into f_tuple (the inner LIST-annotated REPEATED group) and the recursive call short-circuited at the missing guard, passing f_tuple to clipSparkType as if it were a primitive element type. clipSparkType then called f_tuple.asPrimitiveType() and threw ClassCastException: repeated group f_tuple (LIST) { repeated int32 f_tuple_tuple; } is not primitive. The fix adds the getOriginalType == null guard so a LIST-annotated REPEATED group correctly routes to the LIST-wrapper branch. The plugin fix alone surfaces a downstream cuDF issue: cuDF's SchemaElement::is_stub() also treats a LIST-annotated REPEATED group as a stub and collapses one nesting level (list<list<int>> -> list<int>). This PR is paired with rapidsai/cudf#22597 (closes rapidsai/cudf#22596), which excludes LIST/MAP-annotated REPEATED groups from is_stub(). 
Local Maven validation with both fixes applied: mvn package -pl tests -am -Dbuildver=330 \ -Dmaven.repo.local=./.mvn-repo \ -DwildcardSuites=org.apache.spark.sql.rapids.suites.RapidsParquetThriftCompatibilitySuite,org.apache.spark.sql.rapids.suites.RapidsParquetAvroCompatibilitySuite \ -Drapids.test.gpu.allocFraction=0.3 \ -Drapids.test.gpu.maxAllocFraction=0.3 \ -Drapids.test.gpu.minAllocFraction=0 RapidsParquetThriftCompatibilitySuite: - SPARK-10136 list of primitive list RapidsParquetAvroCompatibilitySuite: - SPARK-10136 array of primitive array Tests: succeeded 9, failed 0, canceled 0, ignored 2, pending 0 All tests passed. Recovered tests: - RapidsParquetThriftCompatibilitySuite.SPARK-10136 list of primitive list (Spark original: ParquetThriftCompatibilitySuite.scala:74-147) - RapidsParquetAvroCompatibilitySuite.SPARK-10136 array of primitive array (Spark original: ParquetAvroCompatibilitySuite.scala:172-191) Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Allen Xu --- .../spark/rapids/parquet/ParquetSchemaUtils.scala | 15 ++++++++------- .../sql/rapids/utils/RapidsTestSettings.scala | 2 -- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/parquet/ParquetSchemaUtils.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/parquet/ParquetSchemaUtils.scala index eaee85453c7..5fc31cc6950 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/parquet/ParquetSchemaUtils.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/parquet/ParquetSchemaUtils.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2025, NVIDIA CORPORATION. + * Copyright (c) 2022-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -366,12 +366,13 @@ object ParquetSchemaUtils { caseSensitive: Boolean, useFieldId: Boolean): DataType = { val elementType = sparkType.elementType - // Unannotated repeated group should be interpreted as required list of required element, so - // list element type is just the group itself. - // TODO: When we drop Spark 3.1.x, this should use Parquet's LogicalTypeAnnotation - // Note that the original type is not null for leaf nodes. - //if (parquetList.getLogicalTypeAnnotation == null && - val newSparkType = if (parquetList.isRepetition(Repetition.REPEATED)) { + // A REPEATED group with no LIST/MAP annotation is the legacy 1-level list: the element type + // is the group/primitive itself. A REPEATED group that IS LIST-annotated (Thrift / Avro 1.7 + // nested-list style) must go through the LIST-wrapper branch below, otherwise the wrapper + // gets passed to clipSparkType as if it were the primitive element and asPrimitiveType() + // throws ClassCastException (issues #11589, #11592). + val newSparkType = if (parquetList.getOriginalType == null && + parquetList.isRepetition(Repetition.REPEATED)) { clipSparkType(elementType, parquetList, caseSensitive, useFieldId) } else { val parquetListGroup = parquetList.asGroupType() diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala index 3a66b42b332..3dac86a3216 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala @@ -141,7 +141,6 @@ class RapidsTestSettings extends BackendTestSettings { enableSuite[RapidsMathFunctionsSuite] enableSuite[RapidsMiscFunctionsSuite] enableSuite[RapidsParquetAvroCompatibilitySuite] - .exclude("SPARK-10136 array of primitive array", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11592")) 
enableSuite[RapidsParquetColumnIndexSuite] enableSuite[RapidsParquetCompressionCodecPrecedenceSuite] .exclude("Create parquet table with compression", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11416")) @@ -178,7 +177,6 @@ class RapidsTestSettings extends BackendTestSettings { .exclude("schema mismatch failure error message for parquet reader", WONT_FIX_ISSUE("GPU uses a unified parquet reader path; the non-vectorized CPU error variant rooted in ParquetDecodingException is not reachable by design. See https://github.com/NVIDIA/spark-rapids/issues/11434")) enableSuite[RapidsParquetThriftCompatibilitySuite] .exclude("Read Parquet file generated by parquet-thrift", ADJUST_UT("https://github.com/NVIDIA/spark-rapids/pull/11591")) - .exclude("SPARK-10136 list of primitive list", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/11589")) enableSuite[RapidsParquetVectorizedSuite] enableSuite[RapidsRandomSuite] .exclude("random", ADJUST_UT("Replaced by testRapids version that considers partitionIndex offset")) From 49dba9f3b17afa6e2873239e72724a0726444da2 Mon Sep 17 00:00:00 2001 From: Allen Xu Date: Thu, 21 May 2026 13:27:34 +0800 Subject: [PATCH 2/2] Broaden legacy-list guard to cover REPEATED primitives with annotation The previous predicate `getOriginalType == null && isRepetition(REPEATED)` was too strict: legacy 1-level lists can be encoded as a REPEATED primitive with a non-null original type (e.g. `repeated binary x (UTF8)` for array<string>, `repeated fixed_len_byte_array (DECIMAL)` for array<decimal>). Those shapes would route into the LIST-wrapper branch and parquetList.asGroupType() would throw ClassCastException because the type is primitive. Per the Parquet spec backward-compatibility rules, any REPEATED field that isn't explicitly LIST- or MAP-annotated is the legacy 1-level encoding. Updated the predicate accordingly. Caught by @copilot-pull-request-reviewer. 
Signed-off-by: Allen Xu --- .../rapids/parquet/ParquetSchemaUtils.scala | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/parquet/ParquetSchemaUtils.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/parquet/ParquetSchemaUtils.scala index 5fc31cc6950..b4680ccde29 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/parquet/ParquetSchemaUtils.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/parquet/ParquetSchemaUtils.scala @@ -366,13 +366,17 @@ object ParquetSchemaUtils { caseSensitive: Boolean, useFieldId: Boolean): DataType = { val elementType = sparkType.elementType - // A REPEATED group with no LIST/MAP annotation is the legacy 1-level list: the element type - // is the group/primitive itself. A REPEATED group that IS LIST-annotated (Thrift / Avro 1.7 - // nested-list style) must go through the LIST-wrapper branch below, otherwise the wrapper - // gets passed to clipSparkType as if it were the primitive element and asPrimitiveType() - // throws ClassCastException (issues #11589, #11592). - val newSparkType = if (parquetList.getOriginalType == null && - parquetList.isRepetition(Repetition.REPEATED)) { + // A REPEATED field that is neither LIST- nor MAP-annotated is the legacy 1-level list: + // the element type is the field itself (which may be primitive — e.g. `repeated binary x + // (UTF8)` for array<string> — or a group). A REPEATED group that IS LIST-annotated (Thrift + // / Avro 1.7 nested-list style) must go through the LIST-wrapper branch below, otherwise + // the wrapper gets passed to clipSparkType as if it were the primitive element and + // asPrimitiveType() throws ClassCastException (issues #11589, #11592). Predicate matches + // the Parquet spec's "backward-compatibility rules": treat REPEATED as legacy unless + // explicitly annotated LIST or MAP. 
+ val newSparkType = if (parquetList.isRepetition(Repetition.REPEATED) && + parquetList.getOriginalType != OriginalType.LIST && + parquetList.getOriginalType != OriginalType.MAP) { clipSparkType(elementType, parquetList, caseSensitive, useFieldId) } else { val parquetListGroup = parquetList.asGroupType()