Skip to content

Commit dc4e10b

Browse files
committed
Enable Spark query runner in aggregate fuzzer test
1 parent b44ffc9 commit dc4e10b

File tree

4 files changed

+25
-75
lines changed

4 files changed

+25
-75
lines changed

.github/workflows/experimental.yml

-53
Original file line numberDiff line numberDiff line change
@@ -85,18 +85,6 @@ jobs:
8585
name: aggregation
8686
path: velox/_build/debug/velox/functions/prestosql/fuzzer/velox_aggregation_fuzzer_test
8787

88-
- name: Upload spark aggregation fuzzer
89-
uses: actions/upload-artifact@v3
90-
with:
91-
name: spark_aggregation_fuzzer
92-
path: velox/_build/debug/velox/functions/sparksql/fuzzer/spark_aggregation_fuzzer_test
93-
94-
- name: Upload aggregation fuzzer
95-
uses: actions/upload-artifact@v3
96-
with:
97-
name: aggregation
98-
path: velox/_build/debug/velox/functions/prestosql/fuzzer/velox_aggregation_fuzzer_test
99-
10088
- name: Upload join fuzzer
10189
uses: actions/upload-artifact@v3
10290
with:
@@ -180,47 +168,6 @@ jobs:
180168
/tmp/aggregate_fuzzer_repro
181169
/tmp/server.log
182170
183-
linux-spark-fuzzer-run:
184-
runs-on: ubuntu-latest
185-
needs: compile
186-
timeout-minutes: 120
187-
steps:
188-
189-
- name: "Checkout Repo"
190-
uses: actions/checkout@v3
191-
with:
192-
ref: "${{ inputs.ref || 'main' }}"
193-
194-
- name: "Install dependencies"
195-
run: source ./scripts/setup-ubuntu.sh && install_apt_deps
196-
197-
- name: Download spark aggregation fuzzer
198-
uses: actions/download-artifact@v3
199-
with:
200-
name: spark_aggregation_fuzzer
201-
202-
- name: "Run Spark Aggregate Fuzzer"
203-
run: |
204-
mkdir -p /tmp/spark_aggregate_fuzzer_repro/
205-
chmod -R 777 /tmp/spark_aggregate_fuzzer_repro
206-
chmod +x spark_aggregation_fuzzer_test
207-
./spark_aggregation_fuzzer_test \
208-
--seed ${RANDOM} \
209-
--duration_sec 1800 \
210-
--logtostderr=1 \
211-
--minloglevel=0 \
212-
--repro_persist_path=/tmp/spark_aggregate_fuzzer_repro \
213-
--enable_sorted_aggregations=true \
214-
&& echo -e "\n\nSpark Aggregation Fuzzer run finished successfully."
215-
216-
- name: Archive Spark aggregate production artifacts
217-
if: always()
218-
uses: actions/upload-artifact@v3
219-
with:
220-
name: spark-agg-fuzzer-failure-artifacts
221-
path: |
222-
/tmp/spark_aggregate_fuzzer_repro
223-
224171
linux-join-fuzzer-run:
225172
runs-on: ubuntu-latest
226173
needs: compile

.github/workflows/scheduled.yml

+5-1
Original file line numberDiff line numberDiff line change
@@ -457,7 +457,7 @@ jobs:
457457
spark-aggregate-fuzzer-run:
458458
name: Spark Aggregate Fuzzer
459459
runs-on: ubuntu-latest
460-
container: ghcr.io/facebookincubator/velox-dev:centos9
460+
container: ghcr.io/facebookincubator/velox-dev:spark-server
461461
needs: compile
462462
timeout-minutes: 60
463463
steps:
@@ -469,12 +469,16 @@ jobs:
469469

470470
- name: Run Spark Aggregate Fuzzer
471471
run: |
472+
bash /opt/start-spark.sh
473+
# Sleep for 60 seconds to allow Spark server to start.
474+
sleep 60
472475
mkdir -p /tmp/spark_aggregate_fuzzer_repro/logs/
473476
chmod -R 777 /tmp/spark_aggregate_fuzzer_repro
474477
chmod +x spark_aggregation_fuzzer_test
475478
./spark_aggregation_fuzzer_test \
476479
--seed ${RANDOM} \
477480
--duration_sec $DURATION \
481+
--enable_sorted_aggregations=false \
478482
--minloglevel=0 \
479483
--stderrthreshold=2 \
480484
--log_dir=/tmp/spark_aggregate_fuzzer_repro/logs \

velox/exec/fuzzer/FuzzerUtil.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,8 @@ void setupMemory(
355355
void registerHiveConnector(
356356
const std::unordered_map<std::string, std::string>& hiveConfigs) {
357357
auto configs = hiveConfigs;
358+
// Make sure not to run out of open file descriptors.
359+
configs[connector::hive::HiveConfig::kNumCacheFileHandles] = "1000";
358360
if (!connector::hasConnectorFactory(
359361
connector::hive::HiveConnectorFactory::kHiveConnectorName)) {
360362
connector::registerConnectorFactory(

velox/functions/sparksql/fuzzer/SparkAggregationFuzzerTest.cpp

+18-21
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,13 @@
1919
#include <gtest/gtest.h>
2020
#include <unordered_set>
2121

22+
#include "velox/dwio/parquet/RegisterParquetWriter.h"
2223
#include "velox/exec/fuzzer/AggregationFuzzerOptions.h"
2324
#include "velox/exec/fuzzer/AggregationFuzzerRunner.h"
24-
#include "velox/exec/fuzzer/DuckQueryRunner.h"
2525
#include "velox/exec/fuzzer/TransformResultVerifier.h"
2626
#include "velox/functions/prestosql/registration/RegistrationFunctions.h"
2727
#include "velox/functions/sparksql/aggregates/Register.h"
28+
#include "velox/functions/sparksql/fuzzer/SparkQueryRunner.h"
2829
#include "velox/serializers/CompactRowSerializer.h"
2930
#include "velox/serializers/PrestoSerializer.h"
3031
#include "velox/serializers/UnsafeRowSerializer.h"
@@ -45,6 +46,7 @@ DEFINE_string(
4546
int main(int argc, char** argv) {
4647
facebook::velox::functions::aggregate::sparksql::registerAggregateFunctions(
4748
"", false);
49+
facebook::velox::parquet::registerParquetWriterFactory();
4850

4951
::testing::InitGoogleTest(&argc, argv);
5052

@@ -71,10 +73,13 @@ int main(int argc, char** argv) {
7173
}
7274
facebook::velox::memory::MemoryManager::initialize({});
7375

74-
// TODO: List of the functions that at some point crash or fail and need to
75-
// be fixed before we can enable. Constant argument of bloom_filter_agg cause
76-
// fuzzer test fail.
77-
std::unordered_set<std::string> skipFunctions = {"bloom_filter_agg"};
76+
// Spark does not provide user-accessible aggregate functions with the
77+
// following names.
78+
std::unordered_set<std::string> skipFunctions = {
79+
"bloom_filter_agg",
80+
"first_ignore_null",
81+
"last_ignore_null",
82+
"regr_replacement"};
7883

7984
using facebook::velox::exec::test::TransformResultVerifier;
8085

@@ -113,21 +118,9 @@ int main(int argc, char** argv) {
113118
size_t initialSeed = FLAGS_seed == 0 ? std::time(nullptr) : FLAGS_seed;
114119
std::shared_ptr<facebook::velox::memory::MemoryPool> rootPool{
115120
facebook::velox::memory::memoryManager()->addRootPool()};
116-
auto duckQueryRunner =
117-
std::make_unique<facebook::velox::exec::test::DuckQueryRunner>(
118-
rootPool.get());
119-
duckQueryRunner->disableAggregateFunctions(
120-
{// https://github.com/facebookincubator/velox/issues/7677
121-
"max_by",
122-
"min_by",
123-
// The skewness functions of Velox and DuckDB use different
124-
// algorithms.
125-
// https://github.com/facebookincubator/velox/issues/4845
126-
"skewness",
127-
// Spark's kurtosis uses Pearson's formula for calculating the kurtosis
128-
// coefficient. Meanwhile, DuckDB employs the sample kurtosis calculation
129-
// formula. The results from the two methods are completely different.
130-
"kurtosis"});
121+
auto sparkQueryRunner = std::make_unique<
122+
facebook::velox::functions::sparksql::fuzzer::SparkQueryRunner>(
123+
rootPool.get(), "localhost:15002", "fuzzer", "aggregate");
131124

132125
using Runner = facebook::velox::exec::test::AggregationFuzzerRunner;
133126
using Options = facebook::velox::exec::test::AggregationFuzzerOptions;
@@ -137,5 +130,9 @@ int main(int argc, char** argv) {
137130
options.skipFunctions = skipFunctions;
138131
options.customVerificationFunctions = customVerificationFunctions;
139132
options.orderableGroupKeys = true;
140-
return Runner::run(initialSeed, std::move(duckQueryRunner), options);
133+
options.timestampPrecision =
134+
facebook::velox::VectorFuzzer::Options::TimestampPrecision::kMicroSeconds;
135+
options.hiveConfigs = {
136+
{facebook::velox::connector::hive::HiveConfig::kReadTimestampUnit, "6"}};
137+
return Runner::run(initialSeed, std::move(sparkQueryRunner), options);
141138
}

Comments (0)