From d8e2530db99b7082a9efe86056bbe321e18b4fc9 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Tue, 26 May 2026 14:26:37 +0800 Subject: [PATCH 1/6] Add protobuf integration-test dependency infrastructure (plugin-0) Wires the optional `spark-protobuf` module into the integration-test classpath so subsequent from_protobuf PRs have a CPU baseline. Follows the same pattern as `spark-avro`. * `integration_tests/pom.xml`: declare `spark-protobuf_${scala.binary.version}` and an unshaded `protobuf-java` (3.25.5) in `maven-dependency-plugin` so they are copied into `target/dependency/` during the `package` phase. spark-protobuf is a Spark 3.4.0+ module, so the protobuf copy lives in its own execution gated by `spark.protobuf.skipCopy` (set to `true` by the `release33x` profiles in the root pom). The unshaded `protobuf-java` is required because spark-protobuf shades its own copy into `org.sparkproject.spark_protobuf.protobuf` and Spark itself does not bundle the unshaded jar. * `run_pyspark_from_build.sh`: glob both jars from `target/dependency/` (or `LOCAL_JAR_PATH`), gate them behind `INCLUDE_SPARK_PROTOBUF_JAR` (default `true`), and append them to `ALL_JARS`. `--jars` already reaches both driver and executor classpath in client mode, so no separate `--driver-class-path` plumbing is needed. * `protobuf_test.py` (new): two minimal fallback-only smoke tests that build a `FileDescriptorSet` through the JVM, hand-encode a couple of proto2 messages, and exercise both the path-based and (Spark 3.5+) `binaryDescriptorSet` `from_protobuf` API variants. GPU support is not enabled yet, so they use `@allow_non_gpu` + `assert_gpu_fallback_collect` to verify the plugin falls back to CPU while still producing correct results. No GPU code is changed in this PR. Co-Authored-By: Claude Opus 4.7 (1M context) --- integration_tests/pom.xml | 32 ++++ integration_tests/run_pyspark_from_build.sh | 22 ++- .../src/main/python/protobuf_test.py | 141 ++++++++++++++++++ pom.xml | 9 ++ scala2.13/integration_tests/pom.xml | 32 ++++ scala2.13/pom.xml | 9 ++ 6 files changed, 243 insertions(+), 2 deletions(-) create mode 100644 integration_tests/src/main/python/protobuf_test.py diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml index ea075d7a2dc..fdf585895a5 100644 --- a/integration_tests/pom.xml +++ b/integration_tests/pom.xml @@ -132,6 +132,8 @@ parquet-hadoop*.jar spark-avro*.jar + spark-protobuf*.jar + protobuf-java-*.jar @@ -166,6 +168,36 @@ + + + copy-spark-protobuf + package + + copy + + + ${spark.protobuf.skipCopy} + true + + + org.apache.spark + spark-protobuf_${scala.binary.version} + ${spark.version} + + + com.google.protobuf + protobuf-java + 3.25.5 + + + + diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh index 2f1b96d901d..58599d32bcd 100755 --- a/integration_tests/run_pyspark_from_build.sh +++ b/integration_tests/run_pyspark_from_build.sh @@ -46,6 +46,9 @@ # To run all tests, including Avro tests: # INCLUDE_SPARK_AVRO_JAR=true ./run_pyspark_from_build.sh # +# To run tests WITHOUT Protobuf tests (protobuf is included by default): +# INCLUDE_SPARK_PROTOBUF_JAR=false ./run_pyspark_from_build.sh +# # To run a specific test: # TEST=my_test ./run_pyspark_from_build.sh # @@ -100,6 +103,7 @@ else # support alternate local jars NOT building from the source code if [ -d "$LOCAL_JAR_PATH" ]; then AVRO_JARS=$(echo "$LOCAL_JAR_PATH"/spark-avro*.jar) + PROTOBUF_JARS=$(echo "$LOCAL_JAR_PATH"/spark-protobuf*.jar "$LOCAL_JAR_PATH"/protobuf-java-*.jar) PLUGIN_JAR=$(echo "$LOCAL_JAR_PATH"/rapids-4-spark_*.jar) if [ -f $(echo $LOCAL_JAR_PATH/parquet-hadoop*.jar) ]; then export INCLUDE_PARQUET_HADOOP_TEST_JAR=true @@ -116,6 +120,7 @@ else else [[ "$SCALA_VERSION" != "2.12" ]] && TARGET_DIR=${TARGET_DIR/integration_tests/scala$SCALA_VERSION\/integration_tests} AVRO_JARS=$(echo "$TARGET_DIR"/dependency/spark-avro*.jar) + PROTOBUF_JARS=$(echo "$TARGET_DIR"/dependency/spark-protobuf*.jar "$TARGET_DIR"/dependency/protobuf-java-*.jar) PARQUET_HADOOP_TESTS=$(echo "$TARGET_DIR"/dependency/parquet-hadoop*.jar) # remove the log4j.properties file so it doesn't conflict with ours, ignore errors # if it isn't present or already removed @@ -141,9 +146,22 @@ else AVRO_JARS="" fi - # ALL_JARS includes dist.jar integration-test.jar avro.jar parquet.jar if they exist + # Set INCLUDE_SPARK_PROTOBUF_JAR=false to skip protobuf_test.py. Both `spark-protobuf` and + # the unshaded `protobuf-java` come from maven-dependency-plugin and must both be present + # -- spark-protobuf shades `com.google.protobuf.*` internally and Spark does not bundle + # the unshaded jar. + if [[ $( echo ${INCLUDE_SPARK_PROTOBUF_JAR} | tr '[:upper:]' '[:lower:]' ) != "false" \ + && $(readlink -e $PROTOBUF_JARS 2>/dev/null | wc -l) -eq 2 ]]; + then + export INCLUDE_SPARK_PROTOBUF_JAR=true + else + export INCLUDE_SPARK_PROTOBUF_JAR=false + PROTOBUF_JARS="" + fi + + # ALL_JARS includes dist.jar integration-test.jar avro.jar parquet.jar protobuf.jar if they exist # Remove non-existing paths and canonicalize the paths including get rid of links and `..` - ALL_JARS=$(readlink -e $PLUGIN_JAR $TEST_JARS $AVRO_JARS $PARQUET_HADOOP_TESTS || true) + ALL_JARS=$(readlink -e $PLUGIN_JAR $TEST_JARS $AVRO_JARS $PARQUET_HADOOP_TESTS $PROTOBUF_JARS || true) # `:` separated jars ALL_JARS="${ALL_JARS//$'\n'/:}" diff --git a/integration_tests/src/main/python/protobuf_test.py b/integration_tests/src/main/python/protobuf_test.py new file mode 100644 index 00000000000..525180c2811 --- /dev/null +++ b/integration_tests/src/main/python/protobuf_test.py @@ -0,0 +1,141 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import os + +import pytest + +from asserts import assert_gpu_fallback_collect +from marks import allow_non_gpu +from spark_session import is_before_spark_340, with_cpu_session +import pyspark.sql.functions as f + +if os.environ.get('INCLUDE_SPARK_PROTOBUF_JAR', 'true').lower() == 'false': + pytestmark = pytest.mark.skip(reason="INCLUDE_SPARK_PROTOBUF_JAR is disabled") +else: + pytestmark = pytest.mark.skipif( + is_before_spark_340(), reason="from_protobuf is Spark 3.4.0+") + + +def _try_import_from_protobuf(): + try: + from pyspark.sql.protobuf.functions import from_protobuf + return from_protobuf + except Exception: + return None + + +@pytest.fixture(scope="module") +def from_protobuf_fn(): + fn = _try_import_from_protobuf() + if fn is None: + pytest.skip("from_protobuf not available") + return fn + + +def _encode_varint(value): + out = bytearray() + value &= 0xFFFFFFFFFFFFFFFF + while True: + bits = value & 0x7F + value >>= 7 + if value: + out.append(bits | 0x80) + else: + out.append(bits) + return bytes(out) + + +def _encode_simple_message(i32_value, s_value): + buf = bytearray() + buf += _encode_varint((1 << 3) | 0) # field 1, VARINT + buf += _encode_varint(i32_value) + s_bytes = s_value.encode("utf-8") + buf += _encode_varint((2 << 3) | 2) # field 2, LENGTH-DELIMITED + buf += _encode_varint(len(s_bytes)) + buf += s_bytes + return bytes(buf) + + +def _build_simple_descriptor_bytes(spark): + D = spark.sparkContext._jvm.com.google.protobuf.DescriptorProtos + i32_field = D.FieldDescriptorProto.newBuilder() \ + .setName("i32").setNumber(1) \ + .setLabel(D.FieldDescriptorProto.Label.LABEL_OPTIONAL) \ + .setType(D.FieldDescriptorProto.Type.TYPE_INT32).build() + s_field = D.FieldDescriptorProto.newBuilder() \ + .setName("s").setNumber(2) \ + .setLabel(D.FieldDescriptorProto.Label.LABEL_OPTIONAL) \ + .setType(D.FieldDescriptorProto.Type.TYPE_STRING).build() + msg = D.DescriptorProto.newBuilder() \ + .setName("Simple").addField(i32_field).addField(s_field).build() + file_builder = D.FileDescriptorProto.newBuilder() \ + .setName("simple.proto").setPackage("test").addMessageType(msg) \ + .setSyntax("proto2") + fds = D.FileDescriptorSet.newBuilder().addFile(file_builder.build()).build() + return bytes(fds.toByteArray()) + + +def _write_bytes_to_hadoop_path(spark, path_str, data_bytes): + sc = spark.sparkContext + config = sc._jsc.hadoopConfiguration() + jpath = sc._jvm.org.apache.hadoop.fs.Path(path_str) + fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(config) + out = fs.create(jpath, True) + try: + out.write(bytearray(data_bytes)) + finally: + out.close() + + +def _setup_simple_desc(spark_tmp_path): + desc_path = spark_tmp_path + "/simple.desc" + desc_bytes = with_cpu_session(_build_simple_descriptor_bytes) + with_cpu_session( + lambda spark: _write_bytes_to_hadoop_path(spark, desc_path, desc_bytes)) + return desc_path, desc_bytes + + +_smoke_rows = [(1, "a"), (-2, "bb"), (0, ""), (12345, "hello")] + + +def _make_smoke_df(spark): + encoded = [(_encode_simple_message(i, s),) for (i, s) in _smoke_rows] + return spark.createDataFrame(encoded, ["bin"]) + + +@allow_non_gpu("ProjectExec", "ProtobufDataToCatalyst") +def test_from_protobuf_smoke_path_api(spark_tmp_path, from_protobuf_fn): + desc_path, _ = _setup_simple_desc(spark_tmp_path) + + def run(spark): + return _make_smoke_df(spark).select( + from_protobuf_fn(f.col("bin"), "test.Simple", desc_path).alias("d")) + + assert_gpu_fallback_collect(run, "ProtobufDataToCatalyst") + + +@allow_non_gpu("ProjectExec", "ProtobufDataToCatalyst") +def test_from_protobuf_smoke_binary_descriptor_api(spark_tmp_path, from_protobuf_fn): + if "binaryDescriptorSet" not in inspect.signature(from_protobuf_fn).parameters: + pytest.skip("binaryDescriptorSet kwarg is Spark 3.5+ only") + _, desc_bytes = _setup_simple_desc(spark_tmp_path) + + def run(spark): + return _make_smoke_df(spark).select( + from_protobuf_fn(f.col("bin"), "test.Simple", + binaryDescriptorSet=bytearray(desc_bytes)).alias("d")) + + assert_gpu_fallback_collect(run, "ProtobufDataToCatalyst") diff --git a/pom.xml b/pom.xml index 450211bcc4a..abe463d4ed7 100644 --- a/pom.xml +++ b/pom.xml @@ -94,6 +94,7 @@ rapids-4-spark-delta-21x rapids-4-spark-delta-22x rapids-4-spark-delta-23x + true delta-lake/delta-21x @@ -118,6 +119,7 @@ rapids-4-spark-delta-21x rapids-4-spark-delta-22x rapids-4-spark-delta-23x + true delta-lake/delta-21x @@ -142,6 +144,7 @@ rapids-4-spark-delta-21x rapids-4-spark-delta-22x rapids-4-spark-delta-23x + true delta-lake/delta-21x @@ -166,6 +169,7 @@ rapids-4-spark-delta-21x rapids-4-spark-delta-22x rapids-4-spark-delta-23x + true delta-lake/delta-21x @@ -190,6 +194,7 @@ rapids-4-spark-delta-21x rapids-4-spark-delta-22x rapids-4-spark-delta-23x + true delta-lake/delta-21x @@ -806,6 +811,10 @@ ${spark.rapids.project.basedir}/target/${spark.version.classifier}/.sbt/1.0/zinc/org.scala-sbt false 330 + + false 1.8 8 ${java.major.version} diff --git a/scala2.13/integration_tests/pom.xml b/scala2.13/integration_tests/pom.xml index e1d2e133210..198455fd02f 100644 --- a/scala2.13/integration_tests/pom.xml +++ b/scala2.13/integration_tests/pom.xml @@ -132,6 +132,8 @@ parquet-hadoop*.jar spark-avro*.jar + spark-protobuf*.jar + protobuf-java-*.jar @@ -166,6 +168,36 @@ + + + copy-spark-protobuf + package + + copy + + + ${spark.protobuf.skipCopy} + true + + + org.apache.spark + spark-protobuf_${scala.binary.version} + ${spark.version} + + + com.google.protobuf + protobuf-java + 3.25.5 + + + + diff --git a/scala2.13/pom.xml b/scala2.13/pom.xml index 6b9a9aa8d68..169e1d5685d 100644 --- a/scala2.13/pom.xml +++ b/scala2.13/pom.xml @@ -94,6 +94,7 @@ rapids-4-spark-delta-21x rapids-4-spark-delta-22x rapids-4-spark-delta-23x + true delta-lake/delta-21x @@ -118,6 +119,7 @@ rapids-4-spark-delta-21x rapids-4-spark-delta-22x rapids-4-spark-delta-23x + true delta-lake/delta-21x @@ -142,6 +144,7 @@ rapids-4-spark-delta-21x rapids-4-spark-delta-22x rapids-4-spark-delta-23x + true delta-lake/delta-21x @@ -166,6 +169,7 @@ rapids-4-spark-delta-21x rapids-4-spark-delta-22x rapids-4-spark-delta-23x + true delta-lake/delta-21x @@ -190,6 +194,7 @@ rapids-4-spark-delta-21x rapids-4-spark-delta-22x rapids-4-spark-delta-23x + true delta-lake/delta-21x @@ -806,6 +811,10 @@ ${spark.rapids.project.basedir}/target/${spark.version.classifier}/.sbt/1.0/zinc/org.scala-sbt false 330 + + false 1.8 8 ${java.major.version} From ae0e557b29dc157c3b248e3cc3050f0aec3004fe Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Wed, 27 May 2026 16:30:41 +0800 Subject: [PATCH 2/6] signoff Signed-off-by: Haoyang Li From 8a7b4e2c71d19ba7bd203328482901808af29bd9 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Wed, 27 May 2026 17:09:15 +0800 Subject: [PATCH 3/6] Warn when INCLUDE_SPARK_PROTOBUF_JAR=true but jars are missing Surface a stderr warning when the variable is explicitly requested but the spark-protobuf/protobuf-java jars are not present, so a CI misconfiguration is not masked as a silent skip. Default opt-out (unset or false) stays silent. Addresses greptile review feedback on #14885. Signed-off-by: Haoyang Li --- integration_tests/run_pyspark_from_build.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh index 58599d32bcd..b000c350d26 100755 --- a/integration_tests/run_pyspark_from_build.sh +++ b/integration_tests/run_pyspark_from_build.sh @@ -150,11 +150,15 @@ else # the unshaded `protobuf-java` come from maven-dependency-plugin and must both be present # -- spark-protobuf shades `com.google.protobuf.*` internally and Spark does not bundle # the unshaded jar. - if [[ $( echo ${INCLUDE_SPARK_PROTOBUF_JAR} | tr '[:upper:]' '[:lower:]' ) != "false" \ + INCLUDE_SPARK_PROTOBUF_JAR_REQUESTED=$(echo "${INCLUDE_SPARK_PROTOBUF_JAR}" | tr '[:upper:]' '[:lower:]') + if [[ "$INCLUDE_SPARK_PROTOBUF_JAR_REQUESTED" != "false" \ && $(readlink -e $PROTOBUF_JARS 2>/dev/null | wc -l) -eq 2 ]]; then export INCLUDE_SPARK_PROTOBUF_JAR=true else + if [[ "$INCLUDE_SPARK_PROTOBUF_JAR_REQUESTED" == "true" ]]; then + >&2 echo "WARNING: INCLUDE_SPARK_PROTOBUF_JAR=true was requested but spark-protobuf/protobuf-java jars were not found under $TARGET_DIR/dependency; disabling protobuf tests." + fi export INCLUDE_SPARK_PROTOBUF_JAR=false PROTOBUF_JARS="" fi From 3dc8dbb9f606475a14ee227d73e352015c3566b8 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Thu, 28 May 2026 15:28:07 +0800 Subject: [PATCH 4/6] Write descFilePath with plain Python open(), not Hadoop FS spark-protobuf's path-based API reads `descFilePath` with `new File(...)` + `FileUtils.readFileToByteArray` (driver-local read), not via Hadoop FileSystem. The original implementation wrote the descriptor through Hadoop FS, which only worked in local mode because the default fs is `file://` and resolves to the same driver-local path; on a distributed setup `spark_tmp_path` would resolve to HDFS / GCS and the driver's `new File()` would fail. Switch to plain Python `open()` against `spark_tmp_path`, mirroring the convention already used by `json_fuzz_test.py` and `delta_lake_test.py` (both write driver-local files into `spark_tmp_path` the same way). Addresses #14885 review feedback from revans2. --- .../src/main/python/protobuf_test.py | 31 +++++++------------ 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/integration_tests/src/main/python/protobuf_test.py b/integration_tests/src/main/python/protobuf_test.py index 525180c2811..4e2db38f69a 100644 --- a/integration_tests/src/main/python/protobuf_test.py +++ b/integration_tests/src/main/python/protobuf_test.py @@ -88,23 +88,16 @@ def _build_simple_descriptor_bytes(spark): return bytes(fds.toByteArray()) -def _write_bytes_to_hadoop_path(spark, path_str, data_bytes): - sc = spark.sparkContext - config = sc._jsc.hadoopConfiguration() - jpath = sc._jvm.org.apache.hadoop.fs.Path(path_str) - fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(config) - out = fs.create(jpath, True) - try: - out.write(bytearray(data_bytes)) - finally: - out.close() - - -def _setup_simple_desc(spark_tmp_path): +@pytest.fixture +def simple_desc(spark_tmp_path): + # spark-protobuf reads descFilePath with `new File(...)` + FileUtils + # (driver-local), not via Hadoop FileSystem -- write the descriptor with + # plain Python `open` like the other integration tests that share this + # assumption about `spark_tmp_path` (e.g. json_fuzz_test, delta_lake_test). desc_path = spark_tmp_path + "/simple.desc" desc_bytes = with_cpu_session(_build_simple_descriptor_bytes) - with_cpu_session( - lambda spark: _write_bytes_to_hadoop_path(spark, desc_path, desc_bytes)) + with open(desc_path, "wb") as fp: + fp.write(desc_bytes) return desc_path, desc_bytes @@ -117,8 +110,8 @@ def _make_smoke_df(spark): @allow_non_gpu("ProjectExec", "ProtobufDataToCatalyst") -def test_from_protobuf_smoke_path_api(spark_tmp_path, from_protobuf_fn): - desc_path, _ = _setup_simple_desc(spark_tmp_path) +def test_from_protobuf_smoke_path_api(simple_desc, from_protobuf_fn): + desc_path, _ = simple_desc def run(spark): return _make_smoke_df(spark).select( @@ -128,10 +121,10 @@ def run(spark): @allow_non_gpu("ProjectExec", "ProtobufDataToCatalyst") -def test_from_protobuf_smoke_binary_descriptor_api(spark_tmp_path, from_protobuf_fn): +def test_from_protobuf_smoke_binary_descriptor_api(simple_desc, from_protobuf_fn): if "binaryDescriptorSet" not in inspect.signature(from_protobuf_fn).parameters: pytest.skip("binaryDescriptorSet kwarg is Spark 3.5+ only") - _, desc_bytes = _setup_simple_desc(spark_tmp_path) + _, desc_bytes = simple_desc def run(spark): return _make_smoke_df(spark).select( From f1f780f9226818d4f8fb2faed109a712d39b7de6 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Thu, 28 May 2026 15:57:45 +0800 Subject: [PATCH 5/6] Trim comments to WHY-only Drop the WHAT/recap halves from the comments introduced earlier in this PR; keep only the WHY parts (spark-protobuf shading and the Spark 3.4.0+ module constraint). --- integration_tests/pom.xml | 9 ++------- integration_tests/run_pyspark_from_build.sh | 6 ++---- integration_tests/src/main/python/protobuf_test.py | 5 +---- pom.xml | 4 +--- scala2.13/integration_tests/pom.xml | 9 ++------- scala2.13/pom.xml | 4 +--- 6 files changed, 9 insertions(+), 28 deletions(-) diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml index fdf585895a5..ec15acdd66d 100644 --- a/integration_tests/pom.xml +++ b/integration_tests/pom.xml @@ -168,13 +168,8 @@ - + copy-spark-protobuf package diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh index b000c350d26..db7b3126f4d 100755 --- a/integration_tests/run_pyspark_from_build.sh +++ b/integration_tests/run_pyspark_from_build.sh @@ -146,10 +146,8 @@ else AVRO_JARS="" fi - # Set INCLUDE_SPARK_PROTOBUF_JAR=false to skip protobuf_test.py. Both `spark-protobuf` and - # the unshaded `protobuf-java` come from maven-dependency-plugin and must both be present - # -- spark-protobuf shades `com.google.protobuf.*` internally and Spark does not bundle - # the unshaded jar. + # spark-protobuf shades `com.google.protobuf.*` internally and Spark does not bundle the + # unshaded jar, so we must ship both jars to the test classpath. INCLUDE_SPARK_PROTOBUF_JAR_REQUESTED=$(echo "${INCLUDE_SPARK_PROTOBUF_JAR}" | tr '[:upper:]' '[:lower:]') if [[ "$INCLUDE_SPARK_PROTOBUF_JAR_REQUESTED" != "false" \ && $(readlink -e $PROTOBUF_JARS 2>/dev/null | wc -l) -eq 2 ]]; diff --git a/integration_tests/src/main/python/protobuf_test.py b/integration_tests/src/main/python/protobuf_test.py index 4e2db38f69a..e8cda38c372 100644 --- a/integration_tests/src/main/python/protobuf_test.py +++ b/integration_tests/src/main/python/protobuf_test.py @@ -90,10 +90,7 @@ def _build_simple_descriptor_bytes(spark): @pytest.fixture def simple_desc(spark_tmp_path): - # spark-protobuf reads descFilePath with `new File(...)` + FileUtils - # (driver-local), not via Hadoop FileSystem -- write the descriptor with - # plain Python `open` like the other integration tests that share this - # assumption about `spark_tmp_path` (e.g. json_fuzz_test, delta_lake_test). + # spark-protobuf reads descFilePath via `new File(...)`, not Hadoop FileSystem. desc_path = spark_tmp_path + "/simple.desc" desc_bytes = with_cpu_session(_build_simple_descriptor_bytes) with open(desc_path, "wb") as fp: diff --git a/pom.xml b/pom.xml index abe463d4ed7..0dc708fba65 100644 --- a/pom.xml +++ b/pom.xml @@ -811,9 +811,7 @@ ${spark.rapids.project.basedir}/target/${spark.version.classifier}/.sbt/1.0/zinc/org.scala-sbt false 330 - + false 1.8 8 diff --git a/scala2.13/integration_tests/pom.xml b/scala2.13/integration_tests/pom.xml index 198455fd02f..77081a9ea47 100644 --- a/scala2.13/integration_tests/pom.xml +++ b/scala2.13/integration_tests/pom.xml @@ -168,13 +168,8 @@ - + copy-spark-protobuf package diff --git a/scala2.13/pom.xml b/scala2.13/pom.xml index 169e1d5685d..09d9e74cf68 100644 --- a/scala2.13/pom.xml +++ b/scala2.13/pom.xml @@ -811,9 +811,7 @@ ${spark.rapids.project.basedir}/target/${spark.version.classifier}/.sbt/1.0/zinc/org.scala-sbt false 330 - + false 1.8 8 From 899625a93a08fedd0d6590a4d97eb3e9d98a677d Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Thu, 28 May 2026 16:05:16 +0800 Subject: [PATCH 6/6] Drop stale review-context comment --- integration_tests/src/main/python/protobuf_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/integration_tests/src/main/python/protobuf_test.py b/integration_tests/src/main/python/protobuf_test.py index e8cda38c372..6b2606f06b6 100644 --- a/integration_tests/src/main/python/protobuf_test.py +++ b/integration_tests/src/main/python/protobuf_test.py @@ -90,7 +90,6 @@ def _build_simple_descriptor_bytes(spark): @pytest.fixture def simple_desc(spark_tmp_path): - # spark-protobuf reads descFilePath via `new File(...)`, not Hadoop FileSystem. desc_path = spark_tmp_path + "/simple.desc" desc_bytes = with_cpu_session(_build_simple_descriptor_bytes) with open(desc_path, "wb") as fp: