Iceberg: extract version-divergent scan APIs behind a shim

Chong Gao · Chong Gao · commit 4647fc33ec5b · 2026-05-29T11:09:56.000+08:00
Refactors iceberg/common so the {SparkScan, SparkBatchQueryScan,
SparkCopyOnWriteScan, SparkBatch, DataWriteResult} APIs that diverge
between Iceberg 1.10.x and 1.11.x are hidden behind a small interface,
with per-version implementations in iceberg-1-6-x / iceberg-1-9-x /
iceberg-1-10-x. No behavior change for the existing Iceberg versions
this PR ships; sets the stage for a follow-up that adds iceberg-1-11-x.

Common:
- GpuSparkCopyOnWriteScan -> renamed to GpuSparkCopyOnWriteScanBase
  (abstract); per-version concrete subclass mixes in the right runtime-
  filter trait (SupportsRuntimeFiltering vs SupportsRuntimeV2Filtering)
  and the matching filter() signature.
- GpuSparkScan: rewrite hasNestedType via Spark's readSchema() + Spark
  types so it no longer depends on the Iceberg 1.10-only
  cpuScan.expectedSchema(); dispatch SparkCopyOnWriteScan construction
  through ShimUtils.newCopyOnWriteScan.
- GpuSparkBatchQueryScan: toString uses cpuScan.description() (public,
  available in both Iceberg 1.10 and 1.11) instead of branch /
  expectedSchema / filterExpressions which 1.11 removed.
  runtimeFilterExpressions field read tolerates both 1.10 name
  (runtimeFilterExpressions) and 1.11 name (runtimeFilters) — a tactical
  fallback to be replaced with proper per-version shim methods.
- GpuSparkBatch: same tolerance for expectedSchema (1.10) vs projection
  (1.11).
- GpuSparkWrite: type-annotate `new Array[DataFile](0)` so Scala 2.13
  doesn't infer Array[Nothing] under 1.11's wildcarded
  DataWriteResult.dataFiles().
- IcebergShimUtils / ShimUtils: add newCopyOnWriteScan(Scan, ...) factory
  whose parameter is Spark's public Scan because Iceberg's
  SparkCopyOnWriteScan is package-private — cross-package callers cannot
  reference it directly.

Per-Iceberg-version module:
- New GpuSparkCopyOnWriteScan in org.apache.iceberg.spark.source (so it
  can reference the package-private SparkCopyOnWriteScan). Companion
  object exposes create(Scan, ...): GpuScan for cross-package callers.
  1.6/1.9/1.10 mix in SupportsRuntimeFiltering + filter(Filter[]).
- ShimUtilsImpl.java: implement newCopyOnWriteScan via
  GpuSparkCopyOnWriteScan.create.

Signed-off-by: Chong Gao <res_life@163.com>
diff --git a/iceberg/common/src/main/java/com/nvidia/spark/rapids/iceberg/IcebergShimUtils.java b/iceberg/common/src/main/java/com/nvidia/spark/rapids/iceberg/IcebergShimUtils.java
@@ -17,7 +17,9 @@
 package com.nvidia.spark.rapids.iceberg;
 
 import com.nvidia.spark.rapids.GpuMetric;
+import com.nvidia.spark.rapids.GpuScan;
 import com.nvidia.spark.rapids.NoopMetric$;
+import com.nvidia.spark.rapids.RapidsConf;
 import com.nvidia.spark.rapids.fileio.iceberg.IcebergInputFile;
 import org.apache.hadoop.fs.Path;
 import org.apache.iceberg.ContentFile;
@@ -28,6 +30,7 @@
 import org.apache.iceberg.parquet.GpuParquetIO;
 import org.apache.iceberg.shaded.org.apache.parquet.ParquetReadOptions;
 import org.apache.iceberg.shaded.org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.spark.sql.connector.read.Scan;
 import scala.Option;
 
 import java.io.IOException;
@@ -98,4 +101,23 @@ default ParquetFileReader openParquetReader(
         missCounter.$plus$eq(1L);
         return ParquetFileReader.open(GpuParquetIO.file(inputFile.getDelegate()), options);
     }
+
+    /**
+     * Constructs the version-appropriate {@code GpuSparkCopyOnWriteScan} subclass.
+     *
+     * <p>Iceberg 1.6.x, 1.9.x, and 1.10.x have {@code SparkCopyOnWriteScan} implementing
+     * {@code SupportsRuntimeFiltering} with {@code filter(Filter[])}; Iceberg 1.11.x
+     * switched to {@code SupportsRuntimeV2Filtering} with {@code filter(Predicate[])}.
+     * The concrete class therefore differs per Iceberg version and is constructed
+     * here rather than directly in common code.
+     *
+     * <p>The parameter is declared as the public {@code Scan} interface because
+     * Iceberg's {@code SparkCopyOnWriteScan} is package-private — callers outside
+     * {@code org.apache.iceberg.spark.source} cannot reference it directly. Each
+     * impl downcasts inside a helper that lives in the right package.
+     */
+    GpuScan newCopyOnWriteScan(
+            Scan cpuScan,
+            RapidsConf rapidsConf,
+            boolean queryUsesInputFile);
 }
diff --git a/iceberg/common/src/main/java/com/nvidia/spark/rapids/iceberg/ShimUtils.java b/iceberg/common/src/main/java/com/nvidia/spark/rapids/iceberg/ShimUtils.java
@@ -17,6 +17,8 @@
 package com.nvidia.spark.rapids.iceberg;
 
 import com.nvidia.spark.rapids.GpuMetric;
+import com.nvidia.spark.rapids.GpuScan;
+import com.nvidia.spark.rapids.RapidsConf;
 import com.nvidia.spark.rapids.ShimLoader;
 import com.nvidia.spark.rapids.fileio.iceberg.IcebergInputFile;
 
@@ -28,6 +30,7 @@
 import org.apache.iceberg.io.FileIO;
 import org.apache.iceberg.shaded.org.apache.parquet.ParquetReadOptions;
 import org.apache.iceberg.shaded.org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.spark.sql.connector.read.Scan;
 
 import java.io.IOException;
 import java.util.Map;
@@ -71,4 +74,11 @@ public static ParquetFileReader openParquetReader(
             scala.collection.immutable.Map<String, GpuMetric> metrics) throws IOException {
         return IMPL.openParquetReader(inputFile, filePath, options, metrics);
     }
+
+    public static GpuScan newCopyOnWriteScan(
+            Scan cpuScan,
+            RapidsConf rapidsConf,
+            boolean queryUsesInputFile) {
+        return IMPL.newCopyOnWriteScan(cpuScan, rapidsConf, queryUsesInputFile);
+    }
 }
diff --git a/iceberg/common/src/main/scala/org/apache/iceberg/spark/source/GpuSparkBatch.scala b/iceberg/common/src/main/scala/org/apache/iceberg/spark/source/GpuSparkBatch.scala
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -37,8 +37,13 @@ class GpuSparkBatch(
   }
 
   override def planInputPartitions(): Array[InputPartition] = {
-    val expectedSchema = FieldUtils.readField(cpuBatch, "expectedSchema", true)
-      .asInstanceOf[Schema]
+    // Iceberg 1.10.x: SparkBatch.expectedSchema. Iceberg 1.11.x: SparkBatch.projection.
+    val expectedSchema = (try {
+      FieldUtils.readField(cpuBatch, "projection", true)
+    } catch {
+      case _: IllegalArgumentException =>
+        FieldUtils.readField(cpuBatch, "expectedSchema", true)
+    }).asInstanceOf[Schema]
     val expectedSchemaString = SchemaParser.toJson(expectedSchema)
 
     val sparkContext = SparkSession.getActiveSession.get.sparkContext
diff --git a/iceberg/common/src/main/scala/org/apache/iceberg/spark/source/GpuSparkBatchQueryScan.scala b/iceberg/common/src/main/scala/org/apache/iceberg/spark/source/GpuSparkBatchQueryScan.scala
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -36,11 +36,17 @@ class GpuSparkBatchQueryScan(
   GpuSparkPartitioningAwareScan[PartitionScanTask](cpuScan, rapidsConf, queryUsesInputFile)
   with SupportsRuntimeV2Filtering {
 
-  private val runtimeFilterExpressions: List[Expression] = FieldUtils.readField(
-      cpuScan, "runtimeFilterExpressions", true)
-    .asInstanceOf[java.util.List[Expression]]
-    .asScala
-    .toList
+  // Iceberg 1.10.x: SparkBatchQueryScan.runtimeFilterExpressions.
+  // Iceberg 1.11.x: SparkRuntimeFilterableScan.runtimeFilters (renamed + moved to parent).
+  private val runtimeFilterExpressions: List[Expression] = {
+    val raw = try {
+      FieldUtils.readField(cpuScan, "runtimeFilters", true)
+    } catch {
+      case _: IllegalArgumentException =>
+        FieldUtils.readField(cpuScan, "runtimeFilterExpressions", true)
+    }
+    raw.asInstanceOf[java.util.List[Expression]].asScala.toList
+  }
 
   override def filterAttributes(): Array[NamedReference] = cpuScan.filterAttributes()
 
@@ -62,13 +68,9 @@ class GpuSparkBatchQueryScan(
   }
 
   override def toString: String = {
-    s"GpuSparkBatchQueryScan(table=${cpuScan.table()}, )" +
-      s"branch=${cpuScan.branch()}, " +
-      s"type=${cpuScan.expectedSchema().asStruct()}, " +
-      s"filters=${cpuScan.filterExpressions()}, " +
+    s"GpuSparkBatchQueryScan(${cpuScan.description()}, " +
       s"runtimeFilters=$runtimeFilterExpressions, " +
-      s"caseSensitive=${cpuScan.caseSensitive()}, " +
-      s"queryUseInputFile=$queryUsesInputFile"
+      s"queryUseInputFile=$queryUsesInputFile)"
   }
 
   /** Create a version of this scan with input file name support */
diff --git a/iceberg/common/src/main/scala/org/apache/iceberg/spark/source/GpuSparkCopyOnWriteScanBase.scala b/iceberg/common/src/main/scala/org/apache/iceberg/spark/source/GpuSparkCopyOnWriteScanBase.scala
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,27 +18,34 @@ package org.apache.iceberg.spark.source
 
 import java.util.Objects
 
-import com.nvidia.spark.rapids.{GpuScan, RapidsConf}
+import com.nvidia.spark.rapids.RapidsConf
 import org.apache.iceberg.FileScanTask
 
 import org.apache.spark.sql.connector.expressions.NamedReference
-import org.apache.spark.sql.connector.read.{Statistics, SupportsRuntimeFiltering}
-import org.apache.spark.sql.sources.Filter
-
-class GpuSparkCopyOnWriteScan(
+import org.apache.spark.sql.connector.read.Statistics
+
+/**
+ * Version-agnostic base for the GPU copy-on-write scan. Iceberg 1.10.x has
+ * {@code SparkCopyOnWriteScan implements SupportsRuntimeFiltering} with
+ * {@code filter(Filter[])}; Iceberg 1.11.x switched to
+ * {@code SupportsRuntimeV2Filtering} with {@code filter(Predicate[])}. The
+ * per-version concrete subclass lives in each {@code iceberg-1-N-x} module and mixes
+ * in the matching Spark runtime-filter trait + delegates {@code filter}
+ * to the matching Iceberg API.
+ */
+abstract class GpuSparkCopyOnWriteScanBase(
     override val cpuScan: SparkCopyOnWriteScan,
     override val rapidsConf: RapidsConf,
     override val queryUsesInputFile: Boolean) extends
-  GpuSparkPartitioningAwareScan[FileScanTask](cpuScan, rapidsConf, queryUsesInputFile)
-  with SupportsRuntimeFiltering {
+  GpuSparkPartitioningAwareScan[FileScanTask](cpuScan, rapidsConf, queryUsesInputFile) {
 
-  override def filterAttributes(): Array[NamedReference] = cpuScan.filterAttributes()
+  def filterAttributes(): Array[NamedReference] = cpuScan.filterAttributes()
 
   override def estimateStatistics(): Statistics = cpuScan.estimateStatistics()
 
   override def equals(obj: Any): Boolean = {
     obj match {
-      case that: GpuSparkCopyOnWriteScan =>
+      case that: GpuSparkCopyOnWriteScanBase =>
         this.cpuScan == that.cpuScan &&
           this.queryUsesInputFile == that.queryUsesInputFile
       case _ => false
@@ -50,18 +57,7 @@ class GpuSparkCopyOnWriteScan(
   }
 
   override def toString: String = {
-    s"GpuSparkCopyOnWriteScan(table=${cpuScan.table()}, " +
-      s"branch=${cpuScan.branch()}, " +
-      s"type=${cpuScan.expectedSchema().asStruct()}, " +
-      s"filters=${cpuScan.filterExpressions()}, " +
-      s"caseSensitive=${cpuScan.caseSensitive()}, " +
+    s"GpuSparkCopyOnWriteScan(${cpuScan.description()}, " +
       s"queryUseInputFile=$queryUsesInputFile)"
   }
-
-  /** Create a version of this scan with input file name support */
-  override def withInputFile(): GpuScan = {
-    new GpuSparkCopyOnWriteScan(cpuScan, rapidsConf, true)
-  }
-
-  override def filter(filters: Array[Filter]): Unit = cpuScan.filter(filters)
-}
+}
diff --git a/iceberg/common/src/main/scala/org/apache/iceberg/spark/source/GpuSparkScan.scala b/iceberg/common/src/main/scala/org/apache/iceberg/spark/source/GpuSparkScan.scala
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,10 +16,10 @@
 
 package org.apache.iceberg.spark.source
 
-import scala.collection.JavaConverters._
 import scala.util.{Failure, Success, Try}
 
 import com.nvidia.spark.rapids._
+import com.nvidia.spark.rapids.iceberg.ShimUtils
 import org.apache.hadoop.shaded.org.apache.commons.lang3.reflect.FieldUtils
 import org.apache.iceberg.{BaseMetadataTable, ScanTaskGroup}
 import org.apache.iceberg.spark.{GpuSparkReadConf, SparkReadConf}
@@ -28,7 +28,7 @@ import org.apache.iceberg.types.Types
 import org.apache.spark.sql.connector.metric.{CustomMetric, CustomTaskMetric}
 import org.apache.spark.sql.connector.read.{Batch, Scan, Statistics, SupportsReportStatistics}
 import org.apache.spark.sql.connector.read.streaming.MicroBatchStream
-import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.types.{ArrayType, MapType, StructType}
 
 
 abstract class GpuSparkScan(val cpuScan: SparkScan,
@@ -65,11 +65,12 @@ abstract class GpuSparkScan(val cpuScan: SparkScan,
   protected def taskGroups(): Seq[_ <: ScanTaskGroup[_]]
 
   def hasNestedType: Boolean = {
-    cpuScan.expectedSchema()
-      .asStruct()
-      .fields()
-      .asScala
-      .exists { field => field.`type`().isNestedType }
+    cpuScan.readSchema().fields.exists { field =>
+      field.dataType match {
+        case _: StructType | _: ArrayType | _: MapType => true
+        case _ => false
+      }
+    }
   }
 }
 
@@ -85,7 +86,7 @@ object GpuSparkScan {
         case icebergScan: SparkBatchQueryScan =>
           new GpuSparkBatchQueryScan(icebergScan, rapidsConf, false)
         case s: SparkCopyOnWriteScan =>
-          new GpuSparkCopyOnWriteScan(s, rapidsConf, false)
+          ShimUtils.newCopyOnWriteScan(s, rapidsConf, false).asInstanceOf[GpuSparkScan]
         case _ =>
           throw new IllegalArgumentException(
             s"Currently iceberg support only supports batch query scan and copy-on-write scan, " +
diff --git a/iceberg/common/src/main/scala/org/apache/iceberg/spark/source/GpuSparkWrite.scala b/iceberg/common/src/main/scala/org/apache/iceberg/spark/source/GpuSparkWrite.scala
@@ -391,7 +391,7 @@ class GpuUnpartitionedDataWriter(
     close()
 
     val result = delegate.result()
-    val taskCommit = new TaskCommit(result.dataFiles().toArray(new Array(0)))
+    val taskCommit = new TaskCommit(result.dataFiles().toArray(new Array[DataFile](0)))
     taskCommit.reportOutputMetrics()
     taskCommit
   }
@@ -441,7 +441,7 @@ class GpuPartitionedDataWriter(
     close()
 
     val result = delegate.result()
-    val taskCommit = new TaskCommit(result.dataFiles().toArray(new Array(0)))
+    val taskCommit = new TaskCommit(result.dataFiles().toArray(new Array[DataFile](0)))
     taskCommit.reportOutputMetrics()
     taskCommit
   }
diff --git a/iceberg/iceberg-1-10-x/src/main/java/com/nvidia/spark/rapids/iceberg/iceberg110x/ShimUtilsImpl.java b/iceberg/iceberg-1-10-x/src/main/java/com/nvidia/spark/rapids/iceberg/iceberg110x/ShimUtilsImpl.java
@@ -17,6 +17,8 @@
 package com.nvidia.spark.rapids.iceberg.iceberg110x;
 
 import com.nvidia.spark.rapids.GpuMetric;
+import com.nvidia.spark.rapids.GpuScan;
+import com.nvidia.spark.rapids.RapidsConf;
 import com.nvidia.spark.rapids.fileio.iceberg.IcebergInputFile;
 import com.nvidia.spark.rapids.iceberg.IcebergShimUtils;
 import org.apache.hadoop.fs.Path;
@@ -27,7 +29,9 @@
 import org.apache.iceberg.shaded.org.apache.parquet.ParquetReadOptions;
 import org.apache.iceberg.shaded.org.apache.parquet.hadoop.ParquetFileReader;
 import org.apache.iceberg.spark.SparkUtil;
+import org.apache.iceberg.spark.source.GpuSparkCopyOnWriteScan;
 import org.apache.iceberg.types.Types;
+import org.apache.spark.sql.connector.read.Scan;
 import org.apache.iceberg.util.PartitionUtil;
 
 import java.io.IOException;
@@ -74,4 +78,12 @@ public ParquetFileReader openParquetReader(
             scala.collection.immutable.Map<String, GpuMetric> metrics) throws IOException {
         return GpuParquetIOShim.openReader(inputFile, filePath, options, metrics);
     }
+
+    @Override
+    public GpuScan newCopyOnWriteScan(
+            Scan cpuScan,
+            RapidsConf rapidsConf,
+            boolean queryUsesInputFile) {
+        return GpuSparkCopyOnWriteScan.create(cpuScan, rapidsConf, queryUsesInputFile);
+    }
 }
diff --git a/iceberg/iceberg-1-10-x/src/main/scala/org/apache/iceberg/spark/source/GpuSparkCopyOnWriteScan.scala b/iceberg/iceberg-1-10-x/src/main/scala/org/apache/iceberg/spark/source/GpuSparkCopyOnWriteScan.scala
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.iceberg.spark.source
+
+import com.nvidia.spark.rapids.{GpuScan, RapidsConf}
+
+import org.apache.spark.sql.connector.read.{Scan, SupportsRuntimeFiltering}
+import org.apache.spark.sql.sources.Filter
+
+/** Iceberg 1.10.x copy-on-write scan: SupportsRuntimeFiltering with Array[Filter]. */
+class GpuSparkCopyOnWriteScan(
+    cpuScanArg: SparkCopyOnWriteScan,
+    rapidsConfArg: RapidsConf,
+    queryUsesInputFileArg: Boolean)
+  extends GpuSparkCopyOnWriteScanBase(cpuScanArg, rapidsConfArg, queryUsesInputFileArg)
+  with SupportsRuntimeFiltering {
+
+  override def filter(filters: Array[Filter]): Unit = cpuScan.filter(filters)
+
+  override def withInputFile(): GpuScan =
+    new GpuSparkCopyOnWriteScan(cpuScan, rapidsConf, true)
+}
+
+object GpuSparkCopyOnWriteScan {
+  def create(cpuScan: Scan, rapidsConf: RapidsConf, queryUsesInputFile: Boolean): GpuScan =
+    new GpuSparkCopyOnWriteScan(
+      cpuScan.asInstanceOf[SparkCopyOnWriteScan], rapidsConf, queryUsesInputFile)
+}
diff --git a/iceberg/iceberg-1-6-x/src/main/java/com/nvidia/spark/rapids/iceberg/iceberg16x/ShimUtilsImpl.java b/iceberg/iceberg-1-6-x/src/main/java/com/nvidia/spark/rapids/iceberg/iceberg16x/ShimUtilsImpl.java
@@ -16,11 +16,15 @@
 
 package com.nvidia.spark.rapids.iceberg.iceberg16x;
 
+import com.nvidia.spark.rapids.GpuScan;
+import com.nvidia.spark.rapids.RapidsConf;
 import com.nvidia.spark.rapids.iceberg.IcebergShimUtils;
 import org.apache.iceberg.*;
 import org.apache.iceberg.io.FileIO;
 import org.apache.iceberg.spark.source.GpuBaseReader;
+import org.apache.iceberg.spark.source.GpuSparkCopyOnWriteScan;
 import org.apache.iceberg.types.Types;
+import org.apache.spark.sql.connector.read.Scan;
 import org.apache.iceberg.util.PartitionUtil;
 
 import java.util.Collections;
@@ -54,4 +58,12 @@ public Map<String, Map<String, String>> storageCredentialOverlays(FileIO fileIO)
     // openParquetReader: inherits the no-cache default from IcebergShimUtils. The shaded
     // ParquetFileReader in 1.6.x has no public API to inject pre-parsed footer metadata,
     // so file-cache routing is not possible here.
+
+    @Override
+    public GpuScan newCopyOnWriteScan(
+            Scan cpuScan,
+            RapidsConf rapidsConf,
+            boolean queryUsesInputFile) {
+        return GpuSparkCopyOnWriteScan.create(cpuScan, rapidsConf, queryUsesInputFile);
+    }
 }
diff --git a/iceberg/iceberg-1-6-x/src/main/scala/org/apache/iceberg/spark/source/GpuSparkCopyOnWriteScan.scala b/iceberg/iceberg-1-6-x/src/main/scala/org/apache/iceberg/spark/source/GpuSparkCopyOnWriteScan.scala
diff --git a/iceberg/iceberg-1-9-x/src/main/java/com/nvidia/spark/rapids/iceberg/iceberg19x/ShimUtilsImpl.java b/iceberg/iceberg-1-9-x/src/main/java/com/nvidia/spark/rapids/iceberg/iceberg19x/ShimUtilsImpl.java
diff --git a/iceberg/iceberg-1-9-x/src/main/scala/org/apache/iceberg/spark/source/GpuSparkCopyOnWriteScan.scala b/iceberg/iceberg-1-9-x/src/main/scala/org/apache/iceberg/spark/source/GpuSparkCopyOnWriteScan.scala