From d8e2530db99b7082a9efe86056bbe321e18b4fc9 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Tue, 26 May 2026 14:26:37 +0800
Subject: [PATCH 1/6] Add protobuf integration-test dependency infrastructure
 (plugin-0)

Wires the optional `spark-protobuf` module into the integration-test
classpath so subsequent from_protobuf PRs have a CPU baseline. Follows
the same pattern as `spark-avro`.

* `integration_tests/pom.xml`: declare `spark-protobuf_${scala.binary.version}`
  and an unshaded `protobuf-java` (3.25.5) in `maven-dependency-plugin` so
  they are copied into `target/dependency/` during the `package` phase.
  spark-protobuf is a Spark 3.4.0+ module, so the protobuf copy lives in
  its own execution gated by `spark.protobuf.skipCopy` (set to `true` by
  the `release33x` profiles in the root pom). The unshaded `protobuf-java`
  is required because spark-protobuf shades its own copy into
  `org.sparkproject.spark_protobuf.protobuf` and Spark itself does not
  bundle the unshaded jar.
* `run_pyspark_from_build.sh`: glob both jars from `target/dependency/`
  (or `LOCAL_JAR_PATH`), gate them behind `INCLUDE_SPARK_PROTOBUF_JAR`
  (default `true`), and append them to `ALL_JARS`. `--jars` already
  reaches both driver and executor classpath in client mode, so no
  separate `--driver-class-path` plumbing is needed.
* `protobuf_test.py` (new): two minimal fallback-only smoke tests that
  build a `FileDescriptorSet` through the JVM, hand-encode a couple of
  proto2 messages, and exercise both the path-based and (Spark 3.5+)
  `binaryDescriptorSet` `from_protobuf` API variants. GPU support is not
  enabled yet, so they use `@allow_non_gpu` + `assert_gpu_fallback_collect`
  to verify the plugin falls back to CPU while still producing correct
  results.

No GPU code is changed in this PR.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 integration_tests/pom.xml                     |  32 ++++
 integration_tests/run_pyspark_from_build.sh   |  22 ++-
 .../src/main/python/protobuf_test.py          | 141 ++++++++++++++++++
 pom.xml                                       |   9 ++
 scala2.13/integration_tests/pom.xml           |  32 ++++
 scala2.13/pom.xml                             |   9 ++
 6 files changed, 243 insertions(+), 2 deletions(-)
 create mode 100644 integration_tests/src/main/python/protobuf_test.py
diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml
index ea075d7a2dc..fdf585895a5 100644
--- a/integration_tests/pom.xml
+++ b/integration_tests/pom.xml
@@ -132,6 +132,8 @@
                                     <includes>
                                         <include>parquet-hadoop*.jar</include>
                                         <include>spark-avro*.jar</include>
+                                        <include>spark-protobuf*.jar</include>
+                                        <include>protobuf-java-*.jar</include>
                                     </includes>
                                 </filesets>
                             </filesets>
@@ -166,6 +168,36 @@
                             </artifactItems>
                         </configuration>
                     </execution>
+                    <!--
+                        spark-protobuf is a Spark 3.4.0+ module, and spark-protobuf shades its
+                        own `com.google.protobuf` into `org.sparkproject.spark_protobuf.protobuf`
+                        so we ship an unshaded protobuf-java alongside it for the integration
+                        tests. release33x profiles set `spark.protobuf.skipCopy=true` because
+                        spark-protobuf does not exist for Spark 3.3.x.
+                    -->
+                    <execution>
+                        <id>copy-spark-protobuf</id>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>copy</goal>
+                        </goals>
+                        <configuration>
+                            <skip>${spark.protobuf.skipCopy}</skip>
+                            <useBaseVersion>true</useBaseVersion>
+                            <artifactItems>
+                                <artifactItem>
+                                    <groupId>org.apache.spark</groupId>
+                                    <artifactId>spark-protobuf_${scala.binary.version}</artifactId>
+                                    <version>${spark.version}</version>
+                                </artifactItem>
+                                <artifactItem>
+                                    <groupId>com.google.protobuf</groupId>
+                                    <artifactId>protobuf-java</artifactId>
+                                    <version>3.25.5</version>
+                                </artifactItem>
+                            </artifactItems>
+                        </configuration>
+                    </execution>
                 </executions>
             </plugin>
             <plugin>
diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh
index 2f1b96d901d..58599d32bcd 100755
--- a/integration_tests/run_pyspark_from_build.sh
+++ b/integration_tests/run_pyspark_from_build.sh
@@ -46,6 +46,9 @@
 #   To run all tests, including Avro tests:
 #     INCLUDE_SPARK_AVRO_JAR=true ./run_pyspark_from_build.sh
 #
+#   To run all tests except the Protobuf tests (they are included by default):
+#     INCLUDE_SPARK_PROTOBUF_JAR=false ./run_pyspark_from_build.sh
+#
 #   To run a specific test:
 #     TEST=my_test ./run_pyspark_from_build.sh
 #
@@ -100,6 +103,7 @@ else
     # support alternate local jars NOT building from the source code
     if [ -d "$LOCAL_JAR_PATH" ]; then
         AVRO_JARS=$(echo "$LOCAL_JAR_PATH"/spark-avro*.jar)
+        PROTOBUF_JARS=$(echo "$LOCAL_JAR_PATH"/spark-protobuf*.jar "$LOCAL_JAR_PATH"/protobuf-java-*.jar)
         PLUGIN_JAR=$(echo "$LOCAL_JAR_PATH"/rapids-4-spark_*.jar)
         if [ -f $(echo $LOCAL_JAR_PATH/parquet-hadoop*.jar) ]; then
             export INCLUDE_PARQUET_HADOOP_TEST_JAR=true
@@ -116,6 +120,7 @@ else
     else
         [[ "$SCALA_VERSION" != "2.12"  ]] && TARGET_DIR=${TARGET_DIR/integration_tests/scala$SCALA_VERSION\/integration_tests}
         AVRO_JARS=$(echo "$TARGET_DIR"/dependency/spark-avro*.jar)
+        PROTOBUF_JARS=$(echo "$TARGET_DIR"/dependency/spark-protobuf*.jar "$TARGET_DIR"/dependency/protobuf-java-*.jar)
         PARQUET_HADOOP_TESTS=$(echo "$TARGET_DIR"/dependency/parquet-hadoop*.jar)
         # remove the log4j.properties file so it doesn't conflict with ours, ignore errors
         # if it isn't present or already removed
@@ -141,9 +146,22 @@ else
         AVRO_JARS=""
     fi
 
-    # ALL_JARS includes dist.jar integration-test.jar avro.jar parquet.jar if they exist
+    # Set INCLUDE_SPARK_PROTOBUF_JAR=false to skip protobuf_test.py. Both `spark-protobuf` and
+    # the unshaded `protobuf-java` come from maven-dependency-plugin and must both be present
+    # -- spark-protobuf shades `com.google.protobuf.*` internally and Spark does not bundle
+    # the unshaded jar.
+    if [[ $( echo ${INCLUDE_SPARK_PROTOBUF_JAR} | tr '[:upper:]' '[:lower:]' ) != "false" \
+          && $(readlink -e $PROTOBUF_JARS 2>/dev/null | wc -l) -eq 2 ]];
+    then
+        export INCLUDE_SPARK_PROTOBUF_JAR=true
+    else
+        export INCLUDE_SPARK_PROTOBUF_JAR=false
+        PROTOBUF_JARS=""
+    fi
+
+    # ALL_JARS includes dist.jar integration-test.jar avro.jar parquet.jar protobuf.jar if they exist
     # Remove non-existing paths and canonicalize the paths including get rid of links and `..`
-    ALL_JARS=$(readlink -e $PLUGIN_JAR $TEST_JARS $AVRO_JARS $PARQUET_HADOOP_TESTS || true)
+    ALL_JARS=$(readlink -e $PLUGIN_JAR $TEST_JARS $AVRO_JARS $PARQUET_HADOOP_TESTS $PROTOBUF_JARS || true)
     # `:` separated jars
     ALL_JARS="${ALL_JARS//$'\n'/:}"
 
diff --git a/integration_tests/src/main/python/protobuf_test.py b/integration_tests/src/main/python/protobuf_test.py
new file mode 100644
index 00000000000..525180c2811
--- /dev/null
+++ b/integration_tests/src/main/python/protobuf_test.py
@@ -0,0 +1,141 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import os
+
+import pytest
+
+from asserts import assert_gpu_fallback_collect
+from marks import allow_non_gpu
+from spark_session import is_before_spark_340, with_cpu_session
+import pyspark.sql.functions as f
+
+if os.environ.get('INCLUDE_SPARK_PROTOBUF_JAR', 'true').lower() == 'false':
+    pytestmark = pytest.mark.skip(reason="INCLUDE_SPARK_PROTOBUF_JAR is disabled")
+else:
+    pytestmark = pytest.mark.skipif(
+        is_before_spark_340(), reason="from_protobuf is Spark 3.4.0+")
+
+
+def _try_import_from_protobuf():
+    try:
+        from pyspark.sql.protobuf.functions import from_protobuf
+        return from_protobuf
+    except Exception:
+        return None
+
+
+@pytest.fixture(scope="module")
+def from_protobuf_fn():
+    fn = _try_import_from_protobuf()
+    if fn is None:
+        pytest.skip("from_protobuf not available")
+    return fn
+
+
+def _encode_varint(value):
+    out = bytearray()
+    value &= 0xFFFFFFFFFFFFFFFF
+    while True:
+        bits = value & 0x7F
+        value >>= 7
+        if value:
+            out.append(bits | 0x80)
+        else:
+            out.append(bits)
+            return bytes(out)
+
+
+def _encode_simple_message(i32_value, s_value):
+    buf = bytearray()
+    buf += _encode_varint((1 << 3) | 0)  # field 1, VARINT
+    buf += _encode_varint(i32_value)
+    s_bytes = s_value.encode("utf-8")
+    buf += _encode_varint((2 << 3) | 2)  # field 2, LENGTH-DELIMITED
+    buf += _encode_varint(len(s_bytes))
+    buf += s_bytes
+    return bytes(buf)
+
+
+def _build_simple_descriptor_bytes(spark):
+    D = spark.sparkContext._jvm.com.google.protobuf.DescriptorProtos
+    i32_field = D.FieldDescriptorProto.newBuilder() \
+        .setName("i32").setNumber(1) \
+        .setLabel(D.FieldDescriptorProto.Label.LABEL_OPTIONAL) \
+        .setType(D.FieldDescriptorProto.Type.TYPE_INT32).build()
+    s_field = D.FieldDescriptorProto.newBuilder() \
+        .setName("s").setNumber(2) \
+        .setLabel(D.FieldDescriptorProto.Label.LABEL_OPTIONAL) \
+        .setType(D.FieldDescriptorProto.Type.TYPE_STRING).build()
+    msg = D.DescriptorProto.newBuilder() \
+        .setName("Simple").addField(i32_field).addField(s_field).build()
+    file_builder = D.FileDescriptorProto.newBuilder() \
+        .setName("simple.proto").setPackage("test").addMessageType(msg) \
+        .setSyntax("proto2")
+    fds = D.FileDescriptorSet.newBuilder().addFile(file_builder.build()).build()
+    return bytes(fds.toByteArray())
+
+
+def _write_bytes_to_hadoop_path(spark, path_str, data_bytes):
+    sc = spark.sparkContext
+    config = sc._jsc.hadoopConfiguration()
+    jpath = sc._jvm.org.apache.hadoop.fs.Path(path_str)
+    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(config)
+    out = fs.create(jpath, True)
+    try:
+        out.write(bytearray(data_bytes))
+    finally:
+        out.close()
+
+
+def _setup_simple_desc(spark_tmp_path):
+    desc_path = spark_tmp_path + "/simple.desc"
+    desc_bytes = with_cpu_session(_build_simple_descriptor_bytes)
+    with_cpu_session(
+        lambda spark: _write_bytes_to_hadoop_path(spark, desc_path, desc_bytes))
+    return desc_path, desc_bytes
+
+
+_smoke_rows = [(1, "a"), (-2, "bb"), (0, ""), (12345, "hello")]
+
+
+def _make_smoke_df(spark):
+    encoded = [(_encode_simple_message(i, s),) for (i, s) in _smoke_rows]
+    return spark.createDataFrame(encoded, ["bin"])
+
+
+@allow_non_gpu("ProjectExec", "ProtobufDataToCatalyst")
+def test_from_protobuf_smoke_path_api(spark_tmp_path, from_protobuf_fn):
+    desc_path, _ = _setup_simple_desc(spark_tmp_path)
+
+    def run(spark):
+        return _make_smoke_df(spark).select(
+            from_protobuf_fn(f.col("bin"), "test.Simple", desc_path).alias("d"))
+
+    assert_gpu_fallback_collect(run, "ProtobufDataToCatalyst")
+
+
+@allow_non_gpu("ProjectExec", "ProtobufDataToCatalyst")
+def test_from_protobuf_smoke_binary_descriptor_api(spark_tmp_path, from_protobuf_fn):
+    if "binaryDescriptorSet" not in inspect.signature(from_protobuf_fn).parameters:
+        pytest.skip("binaryDescriptorSet kwarg is Spark 3.5+ only")
+    _, desc_bytes = _setup_simple_desc(spark_tmp_path)
+
+    def run(spark):
+        return _make_smoke_df(spark).select(
+            from_protobuf_fn(f.col("bin"), "test.Simple",
+                             binaryDescriptorSet=bytearray(desc_bytes)).alias("d"))
+
+    assert_gpu_fallback_collect(run, "ProtobufDataToCatalyst")
diff --git a/pom.xml b/pom.xml
index 450211bcc4a..abe463d4ed7 100644
--- a/pom.xml
+++ b/pom.xml
@@ -94,6 +94,7 @@
                 <rapids.delta.artifactId1>rapids-4-spark-delta-21x</rapids.delta.artifactId1>
                 <rapids.delta.artifactId2>rapids-4-spark-delta-22x</rapids.delta.artifactId2>
                 <rapids.delta.artifactId3>rapids-4-spark-delta-23x</rapids.delta.artifactId3>
+                <spark.protobuf.skipCopy>true</spark.protobuf.skipCopy>
             </properties>
             <modules>
                 <module>delta-lake/delta-21x</module>
@@ -118,6 +119,7 @@
                 <rapids.delta.artifactId1>rapids-4-spark-delta-21x</rapids.delta.artifactId1>
                 <rapids.delta.artifactId2>rapids-4-spark-delta-22x</rapids.delta.artifactId2>
                 <rapids.delta.artifactId3>rapids-4-spark-delta-23x</rapids.delta.artifactId3>
+                <spark.protobuf.skipCopy>true</spark.protobuf.skipCopy>
             </properties>
             <modules>
                 <module>delta-lake/delta-21x</module>
@@ -142,6 +144,7 @@
                 <rapids.delta.artifactId1>rapids-4-spark-delta-21x</rapids.delta.artifactId1>
                 <rapids.delta.artifactId2>rapids-4-spark-delta-22x</rapids.delta.artifactId2>
                 <rapids.delta.artifactId3>rapids-4-spark-delta-23x</rapids.delta.artifactId3>
+                <spark.protobuf.skipCopy>true</spark.protobuf.skipCopy>
             </properties>
             <modules>
                 <module>delta-lake/delta-21x</module>
@@ -166,6 +169,7 @@
                 <rapids.delta.artifactId1>rapids-4-spark-delta-21x</rapids.delta.artifactId1>
                 <rapids.delta.artifactId2>rapids-4-spark-delta-22x</rapids.delta.artifactId2>
                 <rapids.delta.artifactId3>rapids-4-spark-delta-23x</rapids.delta.artifactId3>
+                <spark.protobuf.skipCopy>true</spark.protobuf.skipCopy>
             </properties>
             <modules>
                 <module>delta-lake/delta-21x</module>
@@ -190,6 +194,7 @@
                 <rapids.delta.artifactId1>rapids-4-spark-delta-21x</rapids.delta.artifactId1>
                 <rapids.delta.artifactId2>rapids-4-spark-delta-22x</rapids.delta.artifactId2>
                 <rapids.delta.artifactId3>rapids-4-spark-delta-23x</rapids.delta.artifactId3>
+                <spark.protobuf.skipCopy>true</spark.protobuf.skipCopy>
             </properties>
             <modules>
                 <module>delta-lake/delta-21x</module>
@@ -806,6 +811,10 @@
         <rapids.secondaryCacheDir>${spark.rapids.project.basedir}/target/${spark.version.classifier}/.sbt/1.0/zinc/org.scala-sbt</rapids.secondaryCacheDir>
         <allowConventionalDistJar>false</allowConventionalDistJar>
         <buildver>330</buildver>
+        <!-- Default to copying spark-protobuf and protobuf-java into target/dependency. The
+             release33x profiles override this to `true` because spark-protobuf is a Spark
+             3.4.0+ module and the artifact does not exist for Spark 3.3.x. -->
+        <spark.protobuf.skipCopy>false</spark.protobuf.skipCopy>
         <maven.compiler.source>1.8</maven.compiler.source>
         <java.major.version>8</java.major.version>
         <scala.compiler.release>${java.major.version}</scala.compiler.release>
diff --git a/scala2.13/integration_tests/pom.xml b/scala2.13/integration_tests/pom.xml
index e1d2e133210..198455fd02f 100644
--- a/scala2.13/integration_tests/pom.xml
+++ b/scala2.13/integration_tests/pom.xml
@@ -132,6 +132,8 @@
                                     <includes>
                                         <include>parquet-hadoop*.jar</include>
                                         <include>spark-avro*.jar</include>
+                                        <include>spark-protobuf*.jar</include>
+                                        <include>protobuf-java-*.jar</include>
                                     </includes>
                                 </filesets>
                             </filesets>
@@ -166,6 +168,36 @@
                             </artifactItems>
                         </configuration>
                     </execution>
+                    <!--
+                        spark-protobuf is a Spark 3.4.0+ module, and spark-protobuf shades its
+                        own `com.google.protobuf` into `org.sparkproject.spark_protobuf.protobuf`
+                        so we ship an unshaded protobuf-java alongside it for the integration
+                        tests. release33x profiles set `spark.protobuf.skipCopy=true` because
+                        spark-protobuf does not exist for Spark 3.3.x.
+                    -->
+                    <execution>
+                        <id>copy-spark-protobuf</id>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>copy</goal>
+                        </goals>
+                        <configuration>
+                            <skip>${spark.protobuf.skipCopy}</skip>
+                            <useBaseVersion>true</useBaseVersion>
+                            <artifactItems>
+                                <artifactItem>
+                                    <groupId>org.apache.spark</groupId>
+                                    <artifactId>spark-protobuf_${scala.binary.version}</artifactId>
+                                    <version>${spark.version}</version>
+                                </artifactItem>
+                                <artifactItem>
+                                    <groupId>com.google.protobuf</groupId>
+                                    <artifactId>protobuf-java</artifactId>
+                                    <version>3.25.5</version>
+                                </artifactItem>
+                            </artifactItems>
+                        </configuration>
+                    </execution>
                 </executions>
             </plugin>
             <plugin>
diff --git a/scala2.13/pom.xml b/scala2.13/pom.xml
index 6b9a9aa8d68..169e1d5685d 100644
--- a/scala2.13/pom.xml
+++ b/scala2.13/pom.xml
@@ -94,6 +94,7 @@
                 <rapids.delta.artifactId1>rapids-4-spark-delta-21x</rapids.delta.artifactId1>
                 <rapids.delta.artifactId2>rapids-4-spark-delta-22x</rapids.delta.artifactId2>
                 <rapids.delta.artifactId3>rapids-4-spark-delta-23x</rapids.delta.artifactId3>
+                <spark.protobuf.skipCopy>true</spark.protobuf.skipCopy>
             </properties>
             <modules>
                 <module>delta-lake/delta-21x</module>
@@ -118,6 +119,7 @@
                 <rapids.delta.artifactId1>rapids-4-spark-delta-21x</rapids.delta.artifactId1>
                 <rapids.delta.artifactId2>rapids-4-spark-delta-22x</rapids.delta.artifactId2>
                 <rapids.delta.artifactId3>rapids-4-spark-delta-23x</rapids.delta.artifactId3>
+                <spark.protobuf.skipCopy>true</spark.protobuf.skipCopy>
             </properties>
             <modules>
                 <module>delta-lake/delta-21x</module>
@@ -142,6 +144,7 @@
                 <rapids.delta.artifactId1>rapids-4-spark-delta-21x</rapids.delta.artifactId1>
                 <rapids.delta.artifactId2>rapids-4-spark-delta-22x</rapids.delta.artifactId2>
                 <rapids.delta.artifactId3>rapids-4-spark-delta-23x</rapids.delta.artifactId3>
+                <spark.protobuf.skipCopy>true</spark.protobuf.skipCopy>
             </properties>
             <modules>
                 <module>delta-lake/delta-21x</module>
@@ -166,6 +169,7 @@
                 <rapids.delta.artifactId1>rapids-4-spark-delta-21x</rapids.delta.artifactId1>
                 <rapids.delta.artifactId2>rapids-4-spark-delta-22x</rapids.delta.artifactId2>
                 <rapids.delta.artifactId3>rapids-4-spark-delta-23x</rapids.delta.artifactId3>
+                <spark.protobuf.skipCopy>true</spark.protobuf.skipCopy>
             </properties>
             <modules>
                 <module>delta-lake/delta-21x</module>
@@ -190,6 +194,7 @@
                 <rapids.delta.artifactId1>rapids-4-spark-delta-21x</rapids.delta.artifactId1>
                 <rapids.delta.artifactId2>rapids-4-spark-delta-22x</rapids.delta.artifactId2>
                 <rapids.delta.artifactId3>rapids-4-spark-delta-23x</rapids.delta.artifactId3>
+                <spark.protobuf.skipCopy>true</spark.protobuf.skipCopy>
             </properties>
             <modules>
                 <module>delta-lake/delta-21x</module>
@@ -806,6 +811,10 @@
         <rapids.secondaryCacheDir>${spark.rapids.project.basedir}/target/${spark.version.classifier}/.sbt/1.0/zinc/org.scala-sbt</rapids.secondaryCacheDir>
         <allowConventionalDistJar>false</allowConventionalDistJar>
         <buildver>330</buildver>
+        <!-- Default to copying spark-protobuf and protobuf-java into target/dependency. The
+             release33x profiles override this to `true` because spark-protobuf is a Spark
+             3.4.0+ module and the artifact does not exist for Spark 3.3.x. -->
+        <spark.protobuf.skipCopy>false</spark.protobuf.skipCopy>
         <maven.compiler.source>1.8</maven.compiler.source>
         <java.major.version>8</java.major.version>
         <scala.compiler.release>${java.major.version}</scala.compiler.release>

From ae0e557b29dc157c3b248e3cc3050f0aec3004fe Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Wed, 27 May 2026 16:30:41 +0800
Subject: [PATCH 2/6] signoff

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

From 8a7b4e2c71d19ba7bd203328482901808af29bd9 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Wed, 27 May 2026 17:09:15 +0800
Subject: [PATCH 3/6] Warn when INCLUDE_SPARK_PROTOBUF_JAR=true but jars are
 missing

Surface a stderr warning when INCLUDE_SPARK_PROTOBUF_JAR=true is
explicitly requested but the spark-protobuf/protobuf-java jars are not
both present, so a CI misconfiguration is not masked as a silent skip.
If the variable is unset (and the jars are missing) or set to false,
the protobuf tests are still skipped without a warning.

Addresses greptile review feedback on #14885.

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 integration_tests/run_pyspark_from_build.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh
index 58599d32bcd..b000c350d26 100755
--- a/integration_tests/run_pyspark_from_build.sh
+++ b/integration_tests/run_pyspark_from_build.sh
@@ -150,11 +150,15 @@ else
     # the unshaded `protobuf-java` come from maven-dependency-plugin and must both be present
     # -- spark-protobuf shades `com.google.protobuf.*` internally and Spark does not bundle
     # the unshaded jar.
-    if [[ $( echo ${INCLUDE_SPARK_PROTOBUF_JAR} | tr '[:upper:]' '[:lower:]' ) != "false" \
+    INCLUDE_SPARK_PROTOBUF_JAR_REQUESTED=$(echo "${INCLUDE_SPARK_PROTOBUF_JAR}" | tr '[:upper:]' '[:lower:]')
+    if [[ "$INCLUDE_SPARK_PROTOBUF_JAR_REQUESTED" != "false" \
           && $(readlink -e $PROTOBUF_JARS 2>/dev/null | wc -l) -eq 2 ]];
     then
         export INCLUDE_SPARK_PROTOBUF_JAR=true
     else
+        if [[ "$INCLUDE_SPARK_PROTOBUF_JAR_REQUESTED" == "true" ]]; then
+            >&2 echo "WARNING: INCLUDE_SPARK_PROTOBUF_JAR=true was requested but the spark-protobuf and protobuf-java jars were not both found (searched: $PROTOBUF_JARS); disabling protobuf tests."
+        fi
         export INCLUDE_SPARK_PROTOBUF_JAR=false
         PROTOBUF_JARS=""
     fi

From 3dc8dbb9f606475a14ee227d73e352015c3566b8 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Thu, 28 May 2026 15:28:07 +0800
Subject: [PATCH 4/6] Write descFilePath with plain Python open(), not Hadoop
 FS

spark-protobuf's path-based API reads `descFilePath` with
`new File(...)` + `FileUtils.readFileToByteArray` (driver-local read),
not via Hadoop FileSystem. The original implementation wrote the
descriptor through Hadoop FS, which only worked in local mode because
the default fs is `file://` and resolves to the same driver-local
path; on a distributed setup `spark_tmp_path` would resolve to
HDFS / GCS and the driver's `new File()` would fail.

Switch to plain Python `open()` against `spark_tmp_path`, mirroring
the convention already used by `json_fuzz_test.py` and
`delta_lake_test.py` (both write driver-local files into
`spark_tmp_path` the same way).

Addresses #14885 review feedback from revans2.
---
 .../src/main/python/protobuf_test.py          | 31 +++++++------------
 1 file changed, 12 insertions(+), 19 deletions(-)

diff --git a/integration_tests/src/main/python/protobuf_test.py b/integration_tests/src/main/python/protobuf_test.py
index 525180c2811..4e2db38f69a 100644
--- a/integration_tests/src/main/python/protobuf_test.py
+++ b/integration_tests/src/main/python/protobuf_test.py
@@ -88,23 +88,16 @@ def _build_simple_descriptor_bytes(spark):
     return bytes(fds.toByteArray())
 
 
-def _write_bytes_to_hadoop_path(spark, path_str, data_bytes):
-    sc = spark.sparkContext
-    config = sc._jsc.hadoopConfiguration()
-    jpath = sc._jvm.org.apache.hadoop.fs.Path(path_str)
-    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(config)
-    out = fs.create(jpath, True)
-    try:
-        out.write(bytearray(data_bytes))
-    finally:
-        out.close()
-
-
-def _setup_simple_desc(spark_tmp_path):
+@pytest.fixture
+def simple_desc(spark_tmp_path):
+    # spark-protobuf reads descFilePath with `new File(...)` + FileUtils
+    # (driver-local), not via Hadoop FileSystem -- write the descriptor with
+    # plain Python `open` like the other integration tests that share this
+    # assumption about `spark_tmp_path` (e.g. json_fuzz_test, delta_lake_test).
     desc_path = spark_tmp_path + "/simple.desc"
     desc_bytes = with_cpu_session(_build_simple_descriptor_bytes)
-    with_cpu_session(
-        lambda spark: _write_bytes_to_hadoop_path(spark, desc_path, desc_bytes))
+    with open(desc_path, "wb") as fp:
+        fp.write(desc_bytes)
     return desc_path, desc_bytes
 
 
@@ -117,8 +110,8 @@ def _make_smoke_df(spark):
 
 
 @allow_non_gpu("ProjectExec", "ProtobufDataToCatalyst")
-def test_from_protobuf_smoke_path_api(spark_tmp_path, from_protobuf_fn):
-    desc_path, _ = _setup_simple_desc(spark_tmp_path)
+def test_from_protobuf_smoke_path_api(simple_desc, from_protobuf_fn):
+    desc_path, _ = simple_desc
 
     def run(spark):
         return _make_smoke_df(spark).select(
@@ -128,10 +121,10 @@ def run(spark):
 
 
 @allow_non_gpu("ProjectExec", "ProtobufDataToCatalyst")
-def test_from_protobuf_smoke_binary_descriptor_api(spark_tmp_path, from_protobuf_fn):
+def test_from_protobuf_smoke_binary_descriptor_api(simple_desc, from_protobuf_fn):
     if "binaryDescriptorSet" not in inspect.signature(from_protobuf_fn).parameters:
         pytest.skip("binaryDescriptorSet kwarg is Spark 3.5+ only")
-    _, desc_bytes = _setup_simple_desc(spark_tmp_path)
+    _, desc_bytes = simple_desc
 
     def run(spark):
         return _make_smoke_df(spark).select(

From f1f780f9226818d4f8fb2faed109a712d39b7de6 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Thu, 28 May 2026 15:57:45 +0800
Subject: [PATCH 5/6] Trim comments to WHY-only

Drop the WHAT/recap halves from the comments introduced earlier in this
PR; keep only the WHY parts (spark-protobuf shading and the Spark 3.4.0+
module constraint).
---
 integration_tests/pom.xml                          | 9 ++-------
 integration_tests/run_pyspark_from_build.sh        | 6 ++----
 integration_tests/src/main/python/protobuf_test.py | 5 +----
 pom.xml                                            | 4 +---
 scala2.13/integration_tests/pom.xml                | 9 ++-------
 scala2.13/pom.xml                                  | 4 +---
 6 files changed, 9 insertions(+), 28 deletions(-)

diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml
index fdf585895a5..ec15acdd66d 100644
--- a/integration_tests/pom.xml
+++ b/integration_tests/pom.xml
@@ -168,13 +168,8 @@
                             </artifactItems>
                         </configuration>
                     </execution>
-                    <!--
-                        spark-protobuf is a Spark 3.4.0+ module, and spark-protobuf shades its
-                        own `com.google.protobuf` into `org.sparkproject.spark_protobuf.protobuf`
-                        so we ship an unshaded protobuf-java alongside it for the integration
-                        tests. release33x profiles set `spark.protobuf.skipCopy=true` because
-                        spark-protobuf does not exist for Spark 3.3.x.
-                    -->
+                    <!-- spark-protobuf shades `com.google.protobuf` internally, so the
+                         unshaded jar is shipped separately for the tests to use. -->
                     <execution>
                         <id>copy-spark-protobuf</id>
                         <phase>package</phase>
diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh
index b000c350d26..db7b3126f4d 100755
--- a/integration_tests/run_pyspark_from_build.sh
+++ b/integration_tests/run_pyspark_from_build.sh
@@ -146,10 +146,8 @@ else
         AVRO_JARS=""
     fi
 
-    # Set INCLUDE_SPARK_PROTOBUF_JAR=false to skip protobuf_test.py. Both `spark-protobuf` and
-    # the unshaded `protobuf-java` come from maven-dependency-plugin and must both be present
-    # -- spark-protobuf shades `com.google.protobuf.*` internally and Spark does not bundle
-    # the unshaded jar.
+    # spark-protobuf shades `com.google.protobuf.*` internally and Spark does not bundle the
+    # unshaded jar, so we must ship both jars to the test classpath.
     INCLUDE_SPARK_PROTOBUF_JAR_REQUESTED=$(echo "${INCLUDE_SPARK_PROTOBUF_JAR}" | tr '[:upper:]' '[:lower:]')
     if [[ "$INCLUDE_SPARK_PROTOBUF_JAR_REQUESTED" != "false" \
           && $(readlink -e $PROTOBUF_JARS 2>/dev/null | wc -l) -eq 2 ]];
diff --git a/integration_tests/src/main/python/protobuf_test.py b/integration_tests/src/main/python/protobuf_test.py
index 4e2db38f69a..e8cda38c372 100644
--- a/integration_tests/src/main/python/protobuf_test.py
+++ b/integration_tests/src/main/python/protobuf_test.py
@@ -90,10 +90,7 @@ def _build_simple_descriptor_bytes(spark):
 
 @pytest.fixture
 def simple_desc(spark_tmp_path):
-    # spark-protobuf reads descFilePath with `new File(...)` + FileUtils
-    # (driver-local), not via Hadoop FileSystem -- write the descriptor with
-    # plain Python `open` like the other integration tests that share this
-    # assumption about `spark_tmp_path` (e.g. json_fuzz_test, delta_lake_test).
+    # spark-protobuf reads descFilePath via `new File(...)`, not Hadoop FileSystem.
     desc_path = spark_tmp_path + "/simple.desc"
     desc_bytes = with_cpu_session(_build_simple_descriptor_bytes)
     with open(desc_path, "wb") as fp:
diff --git a/pom.xml b/pom.xml
index abe463d4ed7..0dc708fba65 100644
--- a/pom.xml
+++ b/pom.xml
@@ -811,9 +811,7 @@
         <rapids.secondaryCacheDir>${spark.rapids.project.basedir}/target/${spark.version.classifier}/.sbt/1.0/zinc/org.scala-sbt</rapids.secondaryCacheDir>
         <allowConventionalDistJar>false</allowConventionalDistJar>
         <buildver>330</buildver>
-        <!-- Default to copying spark-protobuf and protobuf-java into target/dependency. The
-             release33x profiles override this to `true` because spark-protobuf is a Spark
-             3.4.0+ module and the artifact does not exist for Spark 3.3.x. -->
+        <!-- spark-protobuf is a Spark 3.4.0+ module; release33x profiles override to `true`. -->
         <spark.protobuf.skipCopy>false</spark.protobuf.skipCopy>
         <maven.compiler.source>1.8</maven.compiler.source>
         <java.major.version>8</java.major.version>
diff --git a/scala2.13/integration_tests/pom.xml b/scala2.13/integration_tests/pom.xml
index 198455fd02f..77081a9ea47 100644
--- a/scala2.13/integration_tests/pom.xml
+++ b/scala2.13/integration_tests/pom.xml
@@ -168,13 +168,8 @@
                             </artifactItems>
                         </configuration>
                     </execution>
-                    <!--
-                        spark-protobuf is a Spark 3.4.0+ module, and spark-protobuf shades its
-                        own `com.google.protobuf` into `org.sparkproject.spark_protobuf.protobuf`
-                        so we ship an unshaded protobuf-java alongside it for the integration
-                        tests. release33x profiles set `spark.protobuf.skipCopy=true` because
-                        spark-protobuf does not exist for Spark 3.3.x.
-                    -->
+                    <!-- spark-protobuf shades `com.google.protobuf` internally, so the
+                         unshaded jar is shipped separately for the tests to use. -->
                     <execution>
                         <id>copy-spark-protobuf</id>
                         <phase>package</phase>
diff --git a/scala2.13/pom.xml b/scala2.13/pom.xml
index 169e1d5685d..09d9e74cf68 100644
--- a/scala2.13/pom.xml
+++ b/scala2.13/pom.xml
@@ -811,9 +811,7 @@
         <rapids.secondaryCacheDir>${spark.rapids.project.basedir}/target/${spark.version.classifier}/.sbt/1.0/zinc/org.scala-sbt</rapids.secondaryCacheDir>
         <allowConventionalDistJar>false</allowConventionalDistJar>
         <buildver>330</buildver>
-        <!-- Default to copying spark-protobuf and protobuf-java into target/dependency. The
-             release33x profiles override this to `true` because spark-protobuf is a Spark
-             3.4.0+ module and the artifact does not exist for Spark 3.3.x. -->
+        <!-- spark-protobuf is a Spark 3.4.0+ module; release33x profiles override to `true`. -->
         <spark.protobuf.skipCopy>false</spark.protobuf.skipCopy>
         <maven.compiler.source>1.8</maven.compiler.source>
         <java.major.version>8</java.major.version>

From 899625a93a08fedd0d6590a4d97eb3e9d98a677d Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Thu, 28 May 2026 16:05:16 +0800
Subject: [PATCH 6/6] Drop stale review-context comment

---
 integration_tests/src/main/python/protobuf_test.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/integration_tests/src/main/python/protobuf_test.py b/integration_tests/src/main/python/protobuf_test.py
index e8cda38c372..6b2606f06b6 100644
--- a/integration_tests/src/main/python/protobuf_test.py
+++ b/integration_tests/src/main/python/protobuf_test.py
@@ -90,7 +90,6 @@ def _build_simple_descriptor_bytes(spark):
 
 @pytest.fixture
 def simple_desc(spark_tmp_path):
-    # spark-protobuf reads descFilePath via `new File(...)`, not Hadoop FileSystem.
     desc_path = spark_tmp_path + "/simple.desc"
     desc_bytes = with_cpu_session(_build_simple_descriptor_bytes)
     with open(desc_path, "wb") as fp: