Skip to content

Commit e1edb18

Browse files
committed
Expose read_parquet node to python
1 parent 679c4f4 commit e1edb18

File tree

4 files changed

+186
-1
lines changed

4 files changed

+186
-1
lines changed

python/rapidsmpf/rapidsmpf/streaming/cudf/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
# cmake-format: on
66
# =================================================================================
77

8-
set(cython_modules partition.pyx table_chunk.pyx)
8+
set(cython_modules parquet.pyx partition.pyx table_chunk.pyx)
99

1010
rapids_cython_create_modules(
1111
CXX
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from pylibcudf.io.parquet import ParquetReaderOptions
5+
6+
from rapidsmpf.streaming.core.channel import Channel
7+
from rapidsmpf.streaming.core.context import Context
8+
from rapidsmpf.streaming.core.node import CppNode
9+
from rapidsmpf.streaming.cudf.table_chunk import TableChunk
10+
11+
def read_parquet(
    ctx: Context,
    ch_out: Channel[TableChunk],
    num_producers: int,
    options: ParquetReaderOptions,
    num_rows_per_chunk: int,
) -> CppNode:
    """
    Create a streaming node that reads parquet data.

    Parameters
    ----------
    ctx
        Streaming execution context.
    ch_out
        Output channel that receives the ``TableChunk`` messages.
    num_producers
        Number of concurrent producers of output chunks.
    options
        Reader options.
    num_rows_per_chunk
        Target (maximum) number of rows per output chunk.

    Returns
    -------
    A C++ streaming node to be executed in a streaming pipeline.

    Notes
    -----
    This is a collective operation: all ranks participating via the
    execution context's communicator must call it with the same options.
    """
    ...
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from libc.stddef cimport size_t
5+
from libcpp.memory cimport make_unique, shared_ptr
6+
from libcpp.utility cimport move
7+
from pylibcudf.io.parquet cimport ParquetReaderOptions
8+
from pylibcudf.libcudf.io.parquet cimport parquet_reader_options
9+
from pylibcudf.libcudf.types cimport size_type
10+
11+
from rapidsmpf.streaming.core.channel cimport Channel, cpp_Channel
12+
from rapidsmpf.streaming.core.context cimport Context, cpp_Context
13+
from rapidsmpf.streaming.core.node cimport CppNode, cpp_Node
14+
15+
16+
# Declaration of the C++ factory rapidsmpf::streaming::node::read_parquet,
# which builds the parquet-reading streaming node wrapped by the Python
# read_parquet() below.  C++ exceptions propagate as Python exceptions
# via `except +`.
cdef extern from "<rapidsmpf/streaming/cudf/parquet.hpp>" nogil:
    cdef cpp_Node cpp_read_parquet \
        "rapidsmpf::streaming::node::read_parquet"(
            shared_ptr[cpp_Context] ctx,
            shared_ptr[cpp_Channel] ch_out,
            size_t num_producers,
            parquet_reader_options options,
            size_type num_rows_per_chunk,
        ) except +
25+
26+
27+
def read_parquet(
    Context ctx not None,
    Channel ch_out not None,
    size_t num_producers,
    ParquetReaderOptions options not None,
    size_type num_rows_per_chunk
):
    """
    Build a streaming node that reads parquet data into a channel.

    Parameters
    ----------
    ctx
        Streaming execution context
    ch_out
        Output channel to receive the TableChunks.
    num_producers
        Number of concurrent producers of output chunks.
    options
        Reader options
    num_rows_per_chunk
        Target (maximum) number of rows per output chunk.

    Notes
    -----
    This is a collective operation, all ranks participating via the
    execution context's communicator must call it with the same options.
    """
    cdef cpp_Node node
    # Release the GIL while the C++ node is constructed; only cdef
    # attributes (handles and c_obj) are touched inside the block.
    with nogil:
        node = cpp_read_parquet(
            ctx._handle,
            ch_out._handle,
            num_producers,
            options.c_obj,
            num_rows_per_chunk,
        )
    # Move the node onto the heap and hand ownership to the Python wrapper.
    return CppNode.from_handle(make_unique[cpp_Node](move(node)), owner=None)
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from __future__ import annotations
5+
6+
import itertools
7+
from typing import TYPE_CHECKING
8+
9+
import numpy as np
10+
import pytest
11+
12+
import pylibcudf as plc
13+
14+
from rapidsmpf.streaming.core.channel import Channel
15+
from rapidsmpf.streaming.core.leaf_node import pull_from_channel
16+
from rapidsmpf.streaming.core.node import run_streaming_pipeline
17+
from rapidsmpf.streaming.cudf.parquet import read_parquet
18+
from rapidsmpf.streaming.cudf.table_chunk import TableChunk
19+
20+
if TYPE_CHECKING:
21+
from typing import Literal
22+
23+
from rapidsmpf.streaming.core.context import Context
24+
25+
26+
@pytest.fixture(scope="module")
def source(
    tmp_path_factory: pytest.TempPathFactory,
) -> plc.io.SourceInfo:
    """Write ten small single-column parquet files and return a SourceInfo over them.

    Each file holds ``nrows`` consecutive int32 values; the start value then
    jumps by an extra ``nrows // 2`` so the concatenated column has gaps.
    """
    path = tmp_path_factory.mktemp("read_parquet")

    nrows = 10
    start = 0
    sources = []
    for i in range(10):
        table = plc.Table(
            [plc.Column.from_array(np.arange(start, start + nrows, dtype="int32"))]
        )
        # gaps in the column numbering we produce
        start += nrows + nrows // 2
        # Zero-pad the index ("003.pq"): the original "{i:3d}" space-pads,
        # producing filenames with embedded leading spaces ("  0.pq").
        filename = path / f"{i:03d}.pq"
        sink = plc.io.SinkInfo([filename])
        options = plc.io.parquet.ParquetWriterOptions.builder(sink, table).build()
        plc.io.parquet.write_parquet(options)
        sources.append(filename)
    return plc.io.SourceInfo(sources)
47+
48+
49+
@pytest.mark.parametrize(
    "skip_rows", ["none", 7, 19, 113], ids=lambda s: f"skip_rows_{s}"
)
@pytest.mark.parametrize("num_rows", ["all", 0, 3, 31, 83], ids=lambda s: f"nrows_{s}")
def test_read_parquet(
    context: Context,
    source: plc.io.SourceInfo,
    skip_rows: int | Literal["none"],
    num_rows: int | Literal["all"],
) -> None:
    """Streamed read_parquet must match a single-shot pylibcudf read of the same options."""
    ch = Channel[TableChunk]()

    options = plc.io.parquet.ParquetReaderOptions.builder(source).build()

    # The "none"/"all" sentinels leave the corresponding option at its default.
    if skip_rows != "none":
        options.set_skip_rows(skip_rows)
    if num_rows != "all":
        options.set_num_rows(num_rows)
    # Reference result: eager, non-streaming read with identical options.
    expected = plc.io.parquet.read_parquet(options).tbl

    # 4 concurrent producers, at most 3 rows per output chunk, so the
    # expected table is split across many small chunks.
    producer = read_parquet(context, ch, 4, options, 3)

    consumer, deferred_messages = pull_from_channel(context, ch)

    run_streaming_pipeline(nodes=[producer, consumer])

    messages = deferred_messages.release()
    # Chunks must be delivered in strictly increasing sequence order.
    assert all(
        m1.sequence_number < m2.sequence_number
        for m1, m2 in itertools.pairwise(messages)
    )
    chunks = [TableChunk.from_message(m) for m in messages]
    # Wait for each chunk's device work before touching its table data.
    for chunk in chunks:
        chunk.stream.synchronize()

    got = plc.concatenate.concatenate([chunk.table_view() for chunk in chunks])
    # NOTE(review): second synchronize after concatenate — presumably keeps
    # the chunks' streams quiescent while `got` is validated; confirm whether
    # it is required or redundant.
    for chunk in chunks:
        chunk.stream.synchronize()

    assert got.num_rows() == expected.num_rows()
    assert got.num_columns() == expected.num_columns()
    assert got.num_columns() == 1

    # Element-wise EQUAL reduced with ALL: true iff every row matches.
    all_equal = plc.reduce.reduce(
        plc.binaryop.binary_operation(
            got.columns()[0],
            expected.columns()[0],
            plc.binaryop.BinaryOperator.EQUAL,
            plc.DataType(plc.TypeId.BOOL8),
        ),
        plc.aggregation.all(),
        plc.DataType(plc.TypeId.BOOL8),
    )
    assert all_equal.to_py()

0 commit comments

Comments
 (0)