Commit 1c63e85

Prep for 0.1.0rc2 (#86)
* squash merge in the benchmarking scripts
* merge in another change required before subsequent merges
* update readme with more benchmarking instructions
* rename to DataFusion for Ray. black format python code
* updates to better support s3 and local for benchmarking
* add more benchmark results
* revert broken change in CI TPCH testing
* additional fix for CI
1 parent dcea736 · commit 1c63e85

35 files changed: +2123 −199 lines

.github/workflows/main.yml

+1 −1

```diff
@@ -131,5 +131,5 @@ jobs:
           --concurrency 3 \
           --partitions-per-processor 2 \
           --batch-size=8192 \
-          --worker-pool-min=20 \
+          --processor-pool-min=20 \
           --validate
```
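
This flag rename in the CI benchmark invocation mirrors the `worker` → `processor` rename in `datafusion_ray/core.py` below. As a hypothetical sketch of how such flags reach the runtime — the benchmark script itself is not part of this diff, and only the `DFRayContext` keyword names are taken from this commit:

```python
# Hypothetical flag wiring; the actual benchmark script is not shown in
# this commit. Only the DFRayContext keyword names come from core.py.
import argparse

from datafusion_ray.core import DFRayContext

parser = argparse.ArgumentParser()
parser.add_argument("--partitions-per-processor", type=int, default=2)
parser.add_argument("--batch-size", type=int, default=8192)
# renamed in this commit from --worker-pool-min
parser.add_argument("--processor-pool-min", type=int, default=20)
args = parser.parse_args()

ctx = DFRayContext(
    batch_size=args.batch_size,
    partitions_per_processor=args.partitions_per_processor,
    processor_pool_min=args.processor_pool_min,
)
```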

Cargo.lock

+1 −1
Some generated files are not rendered by default.

Cargo.toml

+1 −1

```diff
@@ -21,7 +21,7 @@ description = "DataFusion on Ray"
 homepage = "https://github.com/apache/datafusion-ray"
 repository = "https://github.com/apache/datafusion-ray"
 authors = ["Apache DataFusion <[email protected]>"]
-version = "0.1.0"
+version = "0.1.0-rc2"
 edition = "2024"
 readme = "README.md"
 license = "Apache-2.0"
```
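
Note that SemVer orders the pre-release `0.1.0-rc2` before the final `0.1.0`, so the eventual release still version-sorts above this candidate.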

README.md

+5 −5

```diff
@@ -17,7 +17,7 @@
   under the License.
 -->
 
-# DataFusion Ray
+# DataFusion for Ray
 
 [![Apache licensed][license-badge]][license-url]
 [![Python Tests][actions-badge]][actions-url]
@@ -32,13 +32,13 @@
 
 ## Overview
 
-DataFusion Ray is a distributed execution framework that enables DataFusion DataFrame and SQL queries to run on a
+DataFusion for Ray is a distributed execution framework that enables DataFusion DataFrame and SQL queries to run on a
 Ray cluster. This integration allows users to leverage Ray's dynamic scheduling capabilities while executing
 queries in a distributed fashion.
 
 ## Execution Modes
 
-DataFusion Ray supports two execution modes:
+DataFusion for Ray supports two execution modes:
 
 ### Streaming Execution
 
@@ -54,7 +54,7 @@ intermediate shuffle files that are persisted and used as input for the next sta
 
 ## Getting Started
 
-See the [contributor guide] for instructions on building DataFusion Ray.
+See the [contributor guide] for instructions on building DataFusion for Ray.
 
 Once installed, you can run queries using DataFusion's familiar API while leveraging the distributed execution
 capabilities of Ray.
@@ -84,6 +84,6 @@ Contributions are welcome! Please open an issue or submit a pull request if you
 
 ## License
 
-DataFusion Ray is licensed under Apache 2.0.
+DataFusion for Ray is licensed under Apache 2.0.
 
 [contributor guide]: docs/contributing.md
```
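
To make the Getting Started section concrete, here is a minimal usage sketch — not part of this diff. It assumes Ray is installed and a cluster is reachable, uses `DFRayContext` as it appears in `datafusion_ray/core.py` after this commit, and the table name and path are placeholders:

```python
# Minimal usage sketch. Assumptions: ray is installed, a cluster is
# reachable, and "/data/trips/" is a placeholder directory of parquet files.
import ray

from datafusion_ray.core import DFRayContext

ray.init()  # connect to (or start) a Ray cluster

# keyword names reflect this commit's worker -> processor rename
ctx = DFRayContext(batch_size=8192, processor_pool_min=4)
ctx.register_listing_table("trips", "/data/trips/")

df = ctx.sql("SELECT count(*) FROM trips")
for batch in df.collect():  # collect() returns a list of pyarrow RecordBatches
    print(batch)
```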

datafusion_ray/core.py

+53 −32

```diff
@@ -86,7 +86,9 @@ async def wait_for(coros, name=""):
     # wrap the coro in a task to work with python 3.10 and 3.11+ where asyncio.wait semantics
     # changed to not accept any awaitable
     start = time.time()
-    done, _ = await asyncio.wait([asyncio.create_task(_ensure_coro(c)) for c in coros])
+    done, _ = await asyncio.wait(
+        [asyncio.create_task(_ensure_coro(c)) for c in coros]
+    )
     end = time.time()
     log.info(f"waiting for {name} took {end - start}s")
     for d in done:
@@ -108,9 +110,9 @@ class DFRayProcessorPool:
     #
     # This is simple though and will suffice for now
 
-    def __init__(self, min_workers: int, max_workers: int):
-        self.min_workers = min_workers
-        self.max_workers = max_workers
+    def __init__(self, min_processors: int, max_processors: int):
+        self.min_processors = min_processors
+        self.max_processors = max_processors
 
         # a map of processor_key (a random identifier) to stage actor reference
         self.pool = {}
@@ -137,11 +139,11 @@ def __init__(self, min_workers: int, max_workers: int):
         # processors available
         self.available = set()
 
-        for _ in range(min_workers):
+        for _ in range(min_processors):
             self._new_processor()
 
         log.info(
-            f"created ray processor pool (min_workers: {min_workers}, max_workers: {max_workers})"
+            f"created ray processor pool (min_processors: {min_processors}, max_processors: {max_processors})"
         )
 
     async def start(self):
@@ -159,12 +161,14 @@ async def acquire(self, need=1):
 
         have = len(self.available)
         total = len(self.available) + len(self.acquired)
-        can_make = self.max_workers - total
+        can_make = self.max_processors - total
 
         need_to_make = need - have
 
         if need_to_make > can_make:
-            raise Exception(f"Cannot allocate workers above {self.max_workers}")
+            raise Exception(
+                f"Cannot allocate processors above {self.max_processors}"
+            )
 
         if need_to_make > 0:
             log.debug(f"creating {need_to_make} additional processors")
@@ -193,9 +197,9 @@ def _new_processor(self):
         self.processors_ready.clear()
         processor_key = new_friendly_name()
         log.debug(f"starting processor: {processor_key}")
-        processor = DFRayProcessor.options(name=f"Processor : {processor_key}").remote(
-            processor_key
-        )
+        processor = DFRayProcessor.options(
+            name=f"Processor : {processor_key}"
+        ).remote(processor_key)
         self.pool[processor_key] = processor
         self.processors_started.add(processor.start_up.remote())
         self.available.add(processor_key)
@@ -244,7 +248,9 @@ async def _wait_for_serve(self):
 
     async def all_done(self):
         log.info("calling processor all done")
-        refs = [processor.all_done.remote() for processor in self.pool.values()]
+        refs = [
+            processor.all_done.remote() for processor in self.pool.values()
+        ]
         await wait_for(refs, "processors to be all done")
         log.info("all processors shutdown")
 
@@ -287,7 +293,9 @@ async def update_plan(
         )
 
     async def serve(self):
-        log.info(f"[{self.processor_key}] serving on {self.processor_service.addr()}")
+        log.info(
+            f"[{self.processor_key}] serving on {self.processor_service.addr()}"
+        )
         await self.processor_service.serve()
         log.info(f"[{self.processor_key}] done serving")
 
@@ -321,11 +329,13 @@ def __str__(self):
 class DFRayContextSupervisor:
     def __init__(
         self,
-        worker_pool_min: int,
-        worker_pool_max: int,
+        processor_pool_min: int,
+        processor_pool_max: int,
     ) -> None:
-        log.info(f"Creating DFRayContextSupervisor worker_pool_min: {worker_pool_min}")
-        self.pool = DFRayProcessorPool(worker_pool_min, worker_pool_max)
+        log.info(
+            f"Creating DFRayContextSupervisor processor_pool_min: {processor_pool_min}"
+        )
+        self.pool = DFRayProcessorPool(processor_pool_min, processor_pool_max)
         self.stages: dict[str, InternalStageData] = {}
         log.info("Created DFRayContextSupervisor")
 
@@ -337,7 +347,9 @@ async def wait_for_ready(self):
 
     async def get_stage_addrs(self, stage_id: int):
         addrs = [
-            sd.remote_addr for sd in self.stages.values() if sd.stage_id == stage_id
+            sd.remote_addr
+            for sd in self.stages.values()
+            if sd.stage_id == stage_id
         ]
         return addrs
 
@@ -387,7 +399,10 @@ async def new_query(
         refs.append(
             isd.remote_processor.update_plan.remote(
                 isd.stage_id,
-                {stage_id: val["child_addrs"] for (stage_id, val) in kid.items()},
+                {
+                    stage_id: val["child_addrs"]
+                    for (stage_id, val) in kid.items()
+                },
                 isd.partition_group,
                 isd.plan_bytes,
             )
@@ -419,7 +434,9 @@ async def sort_out_addresses(self):
         ]
 
         # sanity check
-        assert all([op == output_partitions[0] for op in output_partitions])
+        assert all(
+            [op == output_partitions[0] for op in output_partitions]
+        )
         output_partitions = output_partitions[0]
 
         for child_stage_isd in child_stage_datas:
@@ -452,15 +469,15 @@ def __init__(
         internal_df: DFRayDataFrameInternal,
         supervisor,  # ray.actor.ActorHandle[DFRayContextSupervisor],
         batch_size=8192,
-        partitions_per_worker: int | None = None,
+        partitions_per_processor: int | None = None,
         prefetch_buffer_size=0,
     ):
         self.df = internal_df
         self.supervisor = supervisor
         self._stages = None
         self._batches = None
         self.batch_size = batch_size
-        self.partitions_per_worker = partitions_per_worker
+        self.partitions_per_processor = partitions_per_processor
         self.prefetch_buffer_size = prefetch_buffer_size
 
     def stages(self):
@@ -469,7 +486,7 @@ def stages(self):
             self._stages = self.df.stages(
                 self.batch_size,
                 self.prefetch_buffer_size,
-                self.partitions_per_worker,
+                self.partitions_per_processor,
             )
 
         return self._stages
@@ -503,7 +520,9 @@ def collect(self) -> list[pa.RecordBatch]:
         )
         log.debug(f"last stage addrs {last_stage_addrs}")
 
-        reader = self.df.read_final_stage(last_stage_id, last_stage_addrs[0])
+        reader = self.df.read_final_stage(
+            last_stage_id, last_stage_addrs[0]
+        )
         log.debug("got reader")
         self._batches = list(reader)
         return self._batches
@@ -541,20 +560,20 @@ def __init__(
         self,
         batch_size: int = 8192,
         prefetch_buffer_size: int = 0,
-        partitions_per_worker: int | None = None,
-        worker_pool_min: int = 1,
-        worker_pool_max: int = 100,
+        partitions_per_processor: int | None = None,
+        processor_pool_min: int = 1,
+        processor_pool_max: int = 100,
     ) -> None:
         self.ctx = DFRayContextInternal()
         self.batch_size = batch_size
-        self.partitions_per_worker = partitions_per_worker
+        self.partitions_per_processor = partitions_per_processor
         self.prefetch_buffer_size = prefetch_buffer_size
 
         self.supervisor = DFRayContextSupervisor.options(
             name="RayContextSupersisor",
         ).remote(
-            worker_pool_min,
-            worker_pool_max,
+            processor_pool_min,
+            processor_pool_max,
         )
 
         # start up our super visor and don't check in on it until its
@@ -603,7 +622,9 @@ def register_csv(self, name: str, path: str):
         """
         self.ctx.register_csv(name, path)
 
-    def register_listing_table(self, name: str, path: str, file_extention="parquet"):
+    def register_listing_table(
+        self, name: str, path: str, file_extention="parquet"
+    ):
         """
         Register a directory of parquet files with the given name.
         The path can be a local filesystem path, absolute filesystem path, or a url.
@@ -629,7 +650,7 @@ def sql(self, query: str) -> DFRayDataFrame:
             df,
             self.supervisor,
             self.batch_size,
-            self.partitions_per_worker,
+            self.partitions_per_processor,
             self.prefetch_buffer_size,
         )
```
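
For callers of `datafusion_ray/core.py`, the net effect is a keyword rename with unchanged semantics. A before/after sketch, using only constructor arguments visible in this diff:

```python
from datafusion_ray.core import DFRayContext

# Before this commit:
# ctx = DFRayContext(
#     partitions_per_worker=2,
#     worker_pool_min=20,
#     worker_pool_max=100,
# )

# After this commit -- same behavior, "processor" terminology:
ctx = DFRayContext(
    partitions_per_processor=2,
    processor_pool_min=20,
    processor_pool_max=100,
)
```
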
dev/release/README.md

+8 −8

````diff
@@ -17,7 +17,7 @@
   under the License.
 -->
 
-# DataFusion Ray Release Process
+# DataFusion for Ray Release Process
 
 Development happens on the `main` branch, and most of the time, we depend on DataFusion using GitHub dependencies
 rather than using an official release from crates.io. This allows us to pick up new features and bug fixes frequently
@@ -43,7 +43,7 @@ You will need a GitHub Personal Access Token. Follow
 [these instructions](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token)
 to generate one if you do not already have one.
 
-You will need a PyPI API token. Create one at https://test.pypi.org/manage/account/#api-tokens, setting the “Scope” to
+You will need a PyPI API token. Create one at <https://test.pypi.org/manage/account/#api-tokens>, setting the “Scope” to
 “Entire account”.
 
 You will also need access to the [datafusion-ray](https://test.pypi.org/project/datafusion-ray/) project on testpypi.
@@ -63,7 +63,7 @@ We maintain a `CHANGELOG.md` so our users know what has been changed between rel
 The changelog is generated using a Python script:
 
 ```bash
-$ GITHUB_TOKEN=<TOKEN> ./dev/release/generate-changelog.py 0.1.0 HEAD 0.2.0 > dev/changelog/0.2.0.md
+GITHUB_TOKEN=<TOKEN> ./dev/release/generate-changelog.py 0.1.0 HEAD 0.2.0 > dev/changelog/0.2.0.md
 ```
 
 This script creates a changelog from GitHub PRs based on the labels associated with them as well as looking for
@@ -91,7 +91,7 @@ git push apache 0.2.0-rc1
 ./dev/release/create-tarball.sh 0.2.0 1
 ```
 
-This will also create the email template to send to the mailing list. 
+This will also create the email template to send to the mailing list.
 
 Create a draft email using this content, but do not send until after completing the next step.
 
@@ -104,7 +104,7 @@ This section assumes some familiarity with publishing Python packages to PyPi. F
 
 Pushing an `rc` tag to the release branch will cause a GitHub Workflow to run that will build the Python wheels.
 
-Go to https://github.com/apache/datafusion-ray/actions and look for an action named "Python Release Build"
+Go to <https://github.com/apache/datafusion-ray/actions> and look for an action named "Python Release Build"
 that has run against the pushed tag.
 
 Click on the action and scroll down to the bottom of the page titled "Artifacts". Download `dist.zip`. It should
@@ -207,10 +207,10 @@ git push apache 0.2.0
 
 ### Add the release to Apache Reporter
 
-Add the release to https://reporter.apache.org/addrelease.html?datafusion with a version name prefixed with `DATAFUSION-RAY`,
+Add the release to <https://reporter.apache.org/addrelease.html?datafusion> with a version name prefixed with `DATAFUSION-RAY`,
 for example `DATAFUSION-RAY-0.2.0`.
 
-The release information is used to generate a template for a board report (see example from Apache Arrow 
+The release information is used to generate a template for a board report (see example from Apache Arrow
 [here](https://github.com/apache/arrow/pull/14357)).
 
 ### Delete old RCs and Releases
@@ -222,7 +222,7 @@ for more information.
 
 Release candidates should be deleted once the release is published.
 
-Get a list of DataFusion release candidates:
+Get a list of DataFusion for Ray release candidates:
 
 ```bash
 svn ls https://dist.apache.org/repos/dist/dev/datafusion | grep datafusion-ray
````

dev/release/check-rat-report.py

+3 −1

```diff
@@ -23,7 +23,9 @@
 import xml.etree.ElementTree as ET
 
 if len(sys.argv) != 3:
-    sys.stderr.write("Usage: %s exclude_globs.lst rat_report.xml\n" % sys.argv[0])
+    sys.stderr.write(
+        "Usage: %s exclude_globs.lst rat_report.xml\n" % sys.argv[0]
+    )
     sys.exit(1)
 
 exclude_globs_filename = sys.argv[1]
```
