Commit a00dfcf

Add a streaming read_parquet node

1 parent 59f1583 · commit a00dfcf

File tree

3 files changed: +223 −0 lines changed

    cpp/CMakeLists.txt
    cpp/include/rapidsmpf/streaming/cudf/parquet.hpp
    cpp/src/streaming/cudf/parquet.cpp

cpp/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -174,6 +174,7 @@ if(RAPIDSMPF_HAVE_STREAMING)
     src/streaming/core/leaf_node.cpp
     src/streaming/core/node.cpp
     src/streaming/cudf/partition.cpp
+    src/streaming/cudf/parquet.cpp
     src/streaming/cudf/table_chunk.cpp
   )
 endif()
cpp/include/rapidsmpf/streaming/cudf/parquet.hpp

Lines changed: 58 additions & 0 deletions

/**
 * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
 * SPDX-License-Identifier: Apache-2.0
 */

#pragma once

#include <cstddef>
#include <memory>
#include <string>
#include <vector>

#include <cudf/ast/expressions.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <rapidsmpf/streaming/core/channel.hpp>
#include <rapidsmpf/streaming/core/context.hpp>
#include <rapidsmpf/streaming/core/node.hpp>

namespace rapidsmpf::streaming::node {

/**
 * @brief A container for an expression with a stream.
 */
struct Filter {
    std::unique_ptr<cudf::ast::expression> expression;  ///< Filter expression.
    rmm::cuda_stream_view
        stream;  ///< CUDA stream any allocations in the expression are valid on.
};

/**
 * @brief Asynchronously read parquet files into an output channel.
 *
 * @param ctx The execution context to use.
 * @param ch_out Channel to which `TableChunk`s are sent.
 * @param max_tickets Maximum number of tickets used to throttle chunk production. Up to
 * this many tasks can be producing data simultaneously.
 * @param files Vector of file names to read from. These must all have the same schema.
 * @param columns Vector of column names to read from the files.
 * @param num_rows_per_chunk Target (maximum) number of rows in any sent `TableChunk`.
 * @param predicate Optional predicate to apply during the read.
 *
 * @return Streaming node representing the asynchronous read.
 */
Node read_parquet(
    std::shared_ptr<Context> ctx,
    std::shared_ptr<Channel> ch_out,
    std::ptrdiff_t max_tickets,
    // TODO: use parquet_reader_options, but that is more complicated...
    // See https://github.com/rapidsai/cudf/issues/20052
    std::vector<std::string> files,
    std::vector<std::string> columns,
    // TODO: use byte count, not row count?
    cudf::size_type num_rows_per_chunk,
    std::shared_ptr<Filter> predicate = nullptr
);

}  // namespace rapidsmpf::streaming::node
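
For orientation, a minimal usage sketch of this API follows. It is not part of the commit: how a Context and output Channel are obtained is application-specific and assumed here, the file and column names are placeholders, and the optional Filter predicate is omitted.

// Hypothetical usage sketch (not part of this commit). Assumes the application
// already holds a Context and an output Channel from the streaming core.
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include <rapidsmpf/streaming/cudf/parquet.hpp>

namespace rs = rapidsmpf::streaming;

rs::Node build_scan(
    std::shared_ptr<rs::Context> ctx,   // assumed to come from the application
    std::shared_ptr<rs::Channel> ch_out
) {
    std::vector<std::string> files{"part-0.parquet", "part-1.parquet"};  // placeholders
    std::vector<std::string> columns{"a", "b"};  // placeholder column names
    // Allow at most 4 in-flight chunk-producing tasks and emit chunks of at
    // most one million rows; no predicate pushdown in this sketch.
    return rs::node::read_parquet(
        std::move(ctx), std::move(ch_out), 4, std::move(files), std::move(columns), 1'000'000
    );
}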

cpp/src/streaming/cudf/parquet.cpp

Lines changed: 164 additions & 0 deletions

/**
 * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
 * SPDX-License-Identifier: Apache-2.0
 */

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <vector>

#include <cudf/ast/expressions.hpp>
#include <cudf/io/datasource.hpp>
#include <cudf/io/parquet.hpp>
#include <cudf/io/parquet_metadata.hpp>
#include <cudf/io/types.hpp>
#include <rmm/cuda_stream_view.hpp>

#include <rapidsmpf/cuda_stream.hpp>
#include <rapidsmpf/streaming/core/channel.hpp>
#include <rapidsmpf/streaming/core/node.hpp>
#include <rapidsmpf/streaming/cudf/parquet.hpp>
#include <rapidsmpf/streaming/cudf/table_chunk.hpp>

namespace rapidsmpf::streaming::node {
namespace {

/**
 * @brief Read a single chunk from a parquet source and send it to an output channel.
 *
 * @param ctx The execution context to use.
 * @param ch_out Channel to which `TableChunk`s are sent.
 * @param stream The stream on which to read the chunk.
 * @param source The `cudf::io::source_info` describing the data to read.
 * @param columns Named columns to read from the file.
 * @param skip_rows Number of rows to skip from the beginning of the file.
 * @param num_rows Number of rows to read.
 * @param predicate Optional predicate to apply during the read.
 * @param sequence_number The ordered chunk id used to reconstruct the original ordering
 * of the data.
 *
 * @note The caller is responsible for scheduling this coroutine onto a thread pool for
 * execution.
 *
 * @return Streaming node representing the asynchronous read of a chunk and its send to
 * the output channel.
 */
Node read_parquet_chunk(
    std::shared_ptr<Context> ctx,
    std::shared_ptr<ThrottlingAdaptor> ch_out,
    rmm::cuda_stream_view stream,
    cudf::io::source_info source,
    std::vector<std::string> columns,
    std::int64_t skip_rows,
    cudf::size_type num_rows,
    std::shared_ptr<Filter> predicate,
    std::uint64_t sequence_number
) {
    auto ticket = co_await ch_out->acquire();
    auto builder = cudf::io::parquet_reader_options::builder(source)
                       .columns(columns)
                       .num_rows(num_rows)
                       .skip_rows(skip_rows);
    if (predicate != nullptr) {
        cuda_stream_join(stream, predicate->stream);
        builder.filter(*predicate->expression);
    }
    auto options = builder.build();
    auto result = std::make_unique<TableChunk>(
        sequence_number,
        cudf::io::read_parquet(options, stream, ctx->br()->device_mr()).tbl,
        stream
    );
    if (predicate != nullptr) {
        cuda_stream_join(predicate->stream, stream);
    }
    auto [_, receipt] = co_await ticket.send(std::move(result));
    // Move this coroutine to the back of the queue so that releasing the semaphore is
    // likely to happen on a different thread. Releasing the semaphore resumes any
    // waiters on the current thread, which is not what we typically want for throttled
    // reads: we want the next waiting read task to run on a different thread.
    co_await ctx->executor()->yield();
    co_await receipt;
}

}  // namespace

Node read_parquet(
    std::shared_ptr<Context> ctx,
    std::shared_ptr<Channel> ch_out,
    std::ptrdiff_t max_tickets,
    std::vector<std::string> files,
    std::vector<std::string> columns,
    cudf::size_type num_rows_per_chunk,
    std::shared_ptr<Filter> predicate
) {
    ShutdownAtExit c{ch_out};
    auto throttle = std::make_shared<ThrottlingAdaptor>(ch_out, max_tickets);
    co_await ctx->executor()->schedule();
    auto size = static_cast<std::size_t>(ctx->comm()->nranks());
    auto rank = static_cast<std::size_t>(ctx->comm()->rank());
    RAPIDSMPF_EXPECTS(
        files.size() < std::numeric_limits<int>::max(), "Trying to read too many files"
    );
    // Each rank reads a contiguous slice of the file list: floor(nfiles / nranks)
    // files, plus one extra on the first nfiles % nranks ranks.
    int files_per_rank =
        static_cast<int>(files.size() / size + (rank < (files.size() % size)));
    int file_offset = 0;
    for (auto i = std::size_t{0}; i < rank; i++) {
        file_offset +=
            static_cast<int>(files.size() / size + (i < (files.size() % size)));
    }
    files = std::vector(
        files.begin() + file_offset, files.begin() + file_offset + files_per_rank
    );
    int files_per_split = 1;
    // TODO: Handle the case where multiple ranks are reading from a single file.
    // TODO: We could be smarter here. Suppose we end up wanting one file per split,
    // but each file is marginally larger than num_rows_per_chunk; we would then
    // produce many small chunks.
    if (files_per_rank > 1) {
        // Figure out a guesstimated splitting.
        auto source = cudf::io::source_info(files[0]);
        auto metadata = cudf::io::read_parquet_metadata(source);
        auto const num_rows = metadata.num_rows();
        files_per_split =
            std::max(static_cast<int>(num_rows_per_chunk / num_rows), files_per_split);
    }
    std::vector<Node> read_tasks;
    // Sequence numbers must be unique across all chunks this rank produces, so the
    // counter lives outside the per-split loop.
    std::uint64_t sequence_number = 0;
    for (file_offset = 0; file_offset < files_per_rank; file_offset += files_per_split) {
        auto nfiles = std::min(files_per_split, files_per_rank - file_offset);
        std::vector<std::string> chunk;
        chunk.reserve(static_cast<std::size_t>(nfiles));
        std::ranges::move(
            files.begin() + file_offset,
            files.begin() + file_offset + nfiles,
            std::back_inserter(chunk)
        );
        auto source = cudf::io::source_info(std::move(chunk));
        std::int64_t skip_rows = 0;
        auto metadata = cudf::io::read_parquet_metadata(source);
        auto const source_num_rows = metadata.num_rows();
        while (skip_rows < source_num_rows) {
            auto num_rows = static_cast<cudf::size_type>(std::min(
                static_cast<std::int64_t>(num_rows_per_chunk), source_num_rows - skip_rows
            ));
            read_tasks.push_back(ctx->executor()->schedule(read_parquet_chunk(
                ctx,
                throttle,
                ctx->br()->stream_pool().get_stream(),
                source,
                columns,
                skip_rows,
                num_rows,
                predicate,
                // TODO: sequence number being correct relies on read_parquet_chunk
                // sending only one chunk.
                sequence_number++
            )));
            skip_rows += num_rows;
        }
    }
    co_await when_all_or_throw(std::move(read_tasks));
    co_await ch_out->drain(ctx->executor());
}
}  // namespace rapidsmpf::streaming::node
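
As a sanity check on the file-assignment arithmetic in read_parquet, here is a small self-contained sketch (illustrative only, not part of the commit) that reproduces the per-rank count and offset computation. With 10 files across 4 ranks it yields counts 3, 3, 2, 2 and offsets 0, 3, 6, 8.

// Standalone illustration of the contiguous per-rank file assignment used above.
#include <cstddef>
#include <iostream>

// Rank r gets floor(n / size) files, plus one extra if r < n % size.
std::size_t files_for_rank(std::size_t n, std::size_t size, std::size_t rank) {
    return n / size + (rank < n % size ? 1 : 0);
}

// Files are assigned contiguously, so each rank's offset is the prefix sum of
// the counts of all lower ranks.
std::size_t offset_for_rank(std::size_t n, std::size_t size, std::size_t rank) {
    std::size_t offset = 0;
    for (std::size_t i = 0; i < rank; ++i) {
        offset += files_for_rank(n, size, i);
    }
    return offset;
}

int main() {
    // 10 files across 4 ranks: counts 3, 3, 2, 2; offsets 0, 3, 6, 8.
    for (std::size_t rank = 0; rank < 4; ++rank) {
        std::cout << "rank " << rank << ": offset " << offset_for_rank(10, 4, rank)
                  << ", count " << files_for_rank(10, 4, rank) << "\n";
    }
}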
