API, Core, Spark: Pass Table with FileIO from Scan on Spark's read path

nastra · nastra · commit 4279c5add179 · 2026-02-24T10:17:00.000+01:00
diff --git a/api/src/main/java/org/apache/iceberg/BatchScan.java b/api/src/main/java/org/apache/iceberg/BatchScan.java
@@ -18,6 +18,8 @@
  */
 package org.apache.iceberg;
 
+import org.apache.iceberg.io.FileIO;
+
 /** API for configuring a batch scan. */
 public interface BatchScan extends Scan<BatchScan, ScanTask, ScanTaskGroup<ScanTask>> {
   /**
@@ -68,4 +70,14 @@ public interface BatchScan extends Scan<BatchScan, ScanTask, ScanTaskGroup<ScanT
    * @return the Snapshot this scan will use
    */
   Snapshot snapshot();
+
+  /**
+   * The {@link FileIO} instance to use for the scan.
+   *
+   * @return The {@link FileIO} instance to use for the scan.
+   */
+  @Override
+  default FileIO io() {
+    return table().io();
+  }
 }
diff --git a/api/src/main/java/org/apache/iceberg/BatchScanAdapter.java b/api/src/main/java/org/apache/iceberg/BatchScanAdapter.java
@@ -22,6 +22,7 @@
 import java.util.concurrent.ExecutorService;
 import org.apache.iceberg.expressions.Expression;
 import org.apache.iceberg.io.CloseableIterable;
+import org.apache.iceberg.io.FileIO;
 import org.apache.iceberg.metrics.MetricsReporter;
 
 /** An adapter that allows using {@link TableScan} as {@link BatchScan}. */
@@ -156,4 +157,9 @@ public BatchScan metricsReporter(MetricsReporter reporter) {
   public BatchScan minRowsRequested(long numRows) {
     return new BatchScanAdapter(scan.minRowsRequested(numRows));
   }
+
+  @Override
+  public FileIO io() {
+    return scan.io();
+  }
 }
diff --git a/api/src/main/java/org/apache/iceberg/Scan.java b/api/src/main/java/org/apache/iceberg/Scan.java
@@ -22,6 +22,7 @@
 import java.util.concurrent.ExecutorService;
 import org.apache.iceberg.expressions.Expression;
 import org.apache.iceberg.io.CloseableIterable;
+import org.apache.iceberg.io.FileIO;
 import org.apache.iceberg.metrics.MetricsReporter;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 
@@ -208,4 +209,13 @@ default ThisT minRowsRequested(long numRows) {
     throw new UnsupportedOperationException(
         this.getClass().getName() + " doesn't implement minRowsRequested");
   }
+
+  /**
+   * The {@link FileIO} instance to use for the scan.
+   *
+   * @return The {@link FileIO} instance to use for the scan.
+   */
+  default FileIO io() {
+    throw new UnsupportedOperationException(this.getClass().getName() + " doesn't implement io");
+  }
 }
diff --git a/core/src/main/java/org/apache/iceberg/BaseScan.java b/core/src/main/java/org/apache/iceberg/BaseScan.java
@@ -102,7 +102,8 @@ public Table table() {
     return table;
   }
 
-  protected FileIO io() {
+  @Override
+  public FileIO io() {
     return table.io();
   }
 
diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTTable.java b/core/src/main/java/org/apache/iceberg/rest/RESTTable.java
@@ -20,7 +20,6 @@
 
 import java.util.Map;
 import java.util.Set;
-import java.util.concurrent.atomic.AtomicReference;
 import java.util.function.Supplier;
 import org.apache.iceberg.BaseTable;
 import org.apache.iceberg.BatchScan;
@@ -30,7 +29,6 @@
 import org.apache.iceberg.TableOperations;
 import org.apache.iceberg.TableScan;
 import org.apache.iceberg.catalog.TableIdentifier;
-import org.apache.iceberg.io.FileIO;
 import org.apache.iceberg.metrics.MetricsReporter;
 
 class RESTTable extends BaseTable implements SupportsDistributedScanPlanning {
@@ -42,7 +40,6 @@ class RESTTable extends BaseTable implements SupportsDistributedScanPlanning {
   private final Set<Endpoint> supportedEndpoints;
   private final Map<String, String> catalogProperties;
   private final Object hadoopConf;
-  private final AtomicReference<FileIO> ioReference;
 
   RESTTable(
       TableOperations ops,
@@ -64,7 +61,6 @@ class RESTTable extends BaseTable implements SupportsDistributedScanPlanning {
     this.supportedEndpoints = supportedEndpoints;
     this.catalogProperties = catalogProperties;
     this.hadoopConf = hadoopConf;
-    this.ioReference = new AtomicReference<>(ops.io());
   }
 
   @Override
@@ -79,20 +75,11 @@ public TableScan newScan() {
         tableIdentifier,
         resourcePaths,
         supportedEndpoints,
-        ioReference,
+        io(),
         catalogProperties,
         hadoopConf);
   }
 
-  @Override
-  public FileIO io() {
-    if (null != ioReference.get()) {
-      return ioReference.get();
-    }
-
-    return super.io();
-  }
-
   @Override
   public BatchScan newBatchScan() {
     return new BatchScanAdapter(newScan());
diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTTableScan.java b/core/src/main/java/org/apache/iceberg/rest/RESTTableScan.java
@@ -25,7 +25,6 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
-import java.util.concurrent.atomic.AtomicReference;
 import java.util.stream.Collectors;
 import org.apache.iceberg.CatalogProperties;
 import org.apache.iceberg.CatalogUtil;
@@ -70,8 +69,9 @@ class RESTTableScan extends DataTableScan {
   private final ParserContext parserContext;
   private final Map<String, String> catalogProperties;
   private final Object hadoopConf;
-  private final AtomicReference<FileIO> ioReference;
+  private final FileIO tableIo;
   private String planId = null;
+  private FileIO fileIOForPlanId = null;
 
   RESTTableScan(
       Table table,
@@ -83,7 +83,7 @@ class RESTTableScan extends DataTableScan {
       TableIdentifier tableIdentifier,
       ResourcePaths resourcePaths,
       Set<Endpoint> supportedEndpoints,
-      AtomicReference<FileIO> ioReference,
+      FileIO tableIo,
       Map<String, String> catalogProperties,
       Object hadoopConf) {
     super(table, schema, context);
@@ -99,7 +99,7 @@ class RESTTableScan extends DataTableScan {
             .add("specsById", table.specs())
             .add("caseSensitive", context().caseSensitive())
             .build();
-    this.ioReference = ioReference;
+    this.tableIo = tableIo;
     this.catalogProperties = catalogProperties;
     this.hadoopConf = hadoopConf;
   }
@@ -117,14 +117,14 @@ protected TableScan newRefinedScan(
         tableIdentifier,
         resourcePaths,
         supportedEndpoints,
-        ioReference,
+        io(),
         catalogProperties,
         hadoopConf);
   }
 
   @Override
-  protected FileIO io() {
-    return ioReference.get();
+  public FileIO io() {
+    return null != fileIOForPlanId ? fileIOForPlanId : tableIo;
   }
 
   @Override
@@ -177,8 +177,7 @@ private CloseableIterable<FileScanTask> planTableScan(PlanTableScanRequest planT
     this.planId = response.planId();
     PlanStatus planStatus = response.planStatus();
     if (null != planId && !response.credentials().isEmpty()) {
-      // update FileIO for RESTTable
-      ioReference.set(fileIOForPlanId(response.credentials()));
+      this.fileIOForPlanId = fileIOForPlanId(response.credentials());
     }
 
     switch (planStatus) {
@@ -260,8 +259,7 @@ private CloseableIterable<FileScanTask> fetchPlanningResult() {
           planId);
 
       if (!response.credentials().isEmpty()) {
-        // update FileIO for RESTTable
-        ioReference.set(fileIOForPlanId(response.credentials()));
+        this.fileIOForPlanId = fileIOForPlanId(response.credentials());
       }
 
       return scanTasksIterable(response.planTasks(), response.fileScanTasks());
diff --git a/core/src/test/java/org/apache/iceberg/rest/TestRESTScanPlanning.java b/core/src/test/java/org/apache/iceberg/rest/TestRESTScanPlanning.java
@@ -996,8 +996,13 @@ public <T extends RESTResponse> T execute(
 
     assertThat(table.io().properties()).doesNotContainKey(RESTCatalogProperties.REST_SCAN_PLAN_ID);
     // make sure remote scan planning is called and FileIO gets the planId
-    assertThat(table.newScan().planFiles()).hasSize(1);
-    assertThat(table.io().properties()).containsKey(RESTCatalogProperties.REST_SCAN_PLAN_ID);
+    TableScan tableScan = table.newScan();
+    assertThat(tableScan.io().properties())
+        .isSameAs(table.io().properties())
+        .doesNotContainKey(RESTCatalogProperties.REST_SCAN_PLAN_ID);
+    assertThat(tableScan.planFiles()).hasSize(1);
+    assertThat(table.io().properties()).doesNotContainKey(RESTCatalogProperties.REST_SCAN_PLAN_ID);
+    assertThat(tableScan.io().properties()).containsKey(RESTCatalogProperties.REST_SCAN_PLAN_ID);
   }
 
   @SuppressWarnings("unchecked")
diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitioningAwareScan.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkPartitioningAwareScan.java
@@ -27,13 +27,15 @@
 import java.util.function.Supplier;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
+import org.apache.iceberg.BaseTable;
 import org.apache.iceberg.PartitionField;
 import org.apache.iceberg.PartitionScanTask;
 import org.apache.iceberg.PartitionSpec;
 import org.apache.iceberg.Scan;
 import org.apache.iceberg.ScanTask;
 import org.apache.iceberg.ScanTaskGroup;
 import org.apache.iceberg.Schema;
+import org.apache.iceberg.SparkDistributedDataScan;
 import org.apache.iceberg.Table;
 import org.apache.iceberg.exceptions.ValidationException;
 import org.apache.iceberg.expressions.Expression;
@@ -79,7 +81,15 @@ abstract class SparkPartitioningAwareScan<T extends PartitionScanTask> extends S
       Schema projection,
       List<Expression> filters,
       Supplier<ScanReport> scanReportSupplier) {
-    super(spark, table, readConf, projection, filters, scanReportSupplier);
+    super(
+        spark,
+        table instanceof BaseTable && null != scan && !(scan instanceof SparkDistributedDataScan)
+            ? new TableWithIO(table, scan::io)
+            : table,
+        readConf,
+        projection,
+        filters,
+        scanReportSupplier);
 
     this.scan = scan;
     this.preserveDataGrouping = readConf.preserveDataGrouping();
diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/TableWithIO.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/TableWithIO.java

[Rendered excerpt: repeats the core/src/main/java/org/apache/iceberg/BaseScan.java hunk shown earlier; the diff body for TableWithIO.java is not included here.]
@@ -102,7 +102,8 @@ public Table table() {
     return table;
   }
 
-  protected FileIO io() {
+  @Override
+  public FileIO io() {
     return table.io();
   }
 
`108`	`109`