Skip to content

Commit c04d075

Browse files
committed
Warn when INCLUDE_SPARK_PROTOBUF_JAR=true but jars are missing
Surface a stderr warning when the variable is explicitly requested but the spark-protobuf/protobuf-java jars are not present, so a CI misconfiguration is not masked as a silent skip. Default opt-out (unset or false) stays silent. Addresses greptile review feedback on #14885. Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
1 parent ae0e557 commit c04d075

2 files changed

Lines changed: 20 additions & 21 deletions

File tree

integration_tests/run_pyspark_from_build.sh

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,11 +150,15 @@ else
150150
# the unshaded `protobuf-java` come from maven-dependency-plugin and must both be present
151151
# -- spark-protobuf shades `com.google.protobuf.*` internally and Spark does not bundle
152152
# the unshaded jar.
153-
if [[ $( echo ${INCLUDE_SPARK_PROTOBUF_JAR} | tr '[:upper:]' '[:lower:]' ) != "false" \
153+
INCLUDE_SPARK_PROTOBUF_JAR_REQUESTED=$(echo "${INCLUDE_SPARK_PROTOBUF_JAR}" | tr '[:upper:]' '[:lower:]')
154+
if [[ "$INCLUDE_SPARK_PROTOBUF_JAR_REQUESTED" != "false" \
154155
&& $(readlink -e $PROTOBUF_JARS 2>/dev/null | wc -l) -eq 2 ]];
155156
then
156157
export INCLUDE_SPARK_PROTOBUF_JAR=true
157158
else
159+
if [[ "$INCLUDE_SPARK_PROTOBUF_JAR_REQUESTED" == "true" ]]; then
160+
>&2 echo "WARNING: INCLUDE_SPARK_PROTOBUF_JAR=true was requested but spark-protobuf/protobuf-java jars were not found under $TARGET_DIR/dependency; disabling protobuf tests."
161+
fi
158162
export INCLUDE_SPARK_PROTOBUF_JAR=false
159163
PROTOBUF_JARS=""
160164
fi

integration_tests/src/main/python/protobuf_test.py

Lines changed: 15 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
import inspect
1616
import os
17+
import tempfile
1718

1819
import pytest
1920

@@ -88,23 +89,17 @@ def _build_simple_descriptor_bytes(spark):
8889
return bytes(fds.toByteArray())
8990

9091

91-
def _write_bytes_to_hadoop_path(spark, path_str, data_bytes):
92-
sc = spark.sparkContext
93-
config = sc._jsc.hadoopConfiguration()
94-
jpath = sc._jvm.org.apache.hadoop.fs.Path(path_str)
95-
fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(config)
96-
out = fs.create(jpath, True)
97-
try:
98-
out.write(bytearray(data_bytes))
99-
finally:
100-
out.close()
101-
102-
103-
def _setup_simple_desc(spark_tmp_path):
104-
desc_path = spark_tmp_path + "/simple.desc"
92+
@pytest.fixture
93+
def simple_desc(request):
94+
# spark-protobuf reads descFilePath with `new File(...)` + FileUtils
95+
# (driver-local), not via Hadoop FileSystem -- so the descriptor must live
96+
# on the driver's local disk. Writing to `spark_tmp_path` would break in
97+
# distributed setups where it resolves to HDFS / GCS / etc.
10598
desc_bytes = with_cpu_session(_build_simple_descriptor_bytes)
106-
with_cpu_session(
107-
lambda spark: _write_bytes_to_hadoop_path(spark, desc_path, desc_bytes))
99+
fd, desc_path = tempfile.mkstemp(suffix=".desc")
100+
with os.fdopen(fd, "wb") as fp:
101+
fp.write(desc_bytes)
102+
request.addfinalizer(lambda: os.path.exists(desc_path) and os.unlink(desc_path))
108103
return desc_path, desc_bytes
109104

110105

@@ -117,8 +112,8 @@ def _make_smoke_df(spark):
117112

118113

119114
@allow_non_gpu("ProjectExec", "ProtobufDataToCatalyst")
120-
def test_from_protobuf_smoke_path_api(spark_tmp_path, from_protobuf_fn):
121-
desc_path, _ = _setup_simple_desc(spark_tmp_path)
115+
def test_from_protobuf_smoke_path_api(simple_desc, from_protobuf_fn):
116+
desc_path, _ = simple_desc
122117

123118
def run(spark):
124119
return _make_smoke_df(spark).select(
@@ -128,10 +123,10 @@ def run(spark):
128123

129124

130125
@allow_non_gpu("ProjectExec", "ProtobufDataToCatalyst")
131-
def test_from_protobuf_smoke_binary_descriptor_api(spark_tmp_path, from_protobuf_fn):
126+
def test_from_protobuf_smoke_binary_descriptor_api(simple_desc, from_protobuf_fn):
132127
if "binaryDescriptorSet" not in inspect.signature(from_protobuf_fn).parameters:
133128
pytest.skip("binaryDescriptorSet kwarg is Spark 3.5+ only")
134-
_, desc_bytes = _setup_simple_desc(spark_tmp_path)
129+
_, desc_bytes = simple_desc
135130

136131
def run(spark):
137132
return _make_smoke_df(spark).select(

0 commit comments

Comments
 (0)