
Commit 17a4099

Add read_parquet test
Edge cases still need to be sorted out.
1 parent: e399465


2 files changed: +112 -0 lines changed


cpp/src/streaming/cudf/parquet.cpp

Lines changed: 2 additions & 0 deletions
@@ -129,6 +129,8 @@ Node read_parquet(
   auto options_num_rows =
     options.get_num_rows().value_or(std::numeric_limits<int64_t>::max());
   std::uint64_t sequence_number = 0;
+  // TODO: Handle case where total num rows is zero and/or where we skip all the rows in
+  // the file.
   for (file_offset = 0; file_offset < files_per_rank; file_offset += files_per_split) {
     if (options_num_rows == 0) {
       break;
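
The new TODO covers the case where the effective row window is empty: the caller either asks for zero rows, or sets skip_rows past the last row of the selected files. A minimal sketch of how such reader options are built with pylibcudf (the calls mirror the test added below; the file path is a placeholder, and 200 is simply assumed to exceed the total row count):

    import pylibcudf as plc

    # Placeholder source for illustration; any parquet file(s) would do.
    source = plc.io.SourceInfo(["example.pq"])
    options = plc.io.parquet.ParquetReaderOptions.builder(source).build()

    # Either call leaves read_parquet with nothing to emit:
    options.set_num_rows(0)     # request zero rows outright
    options.set_skip_rows(200)  # or skip past the end of every file

In the test below, the parametrizations num_rows=0 and skip_rows=113 exercise exactly these paths, since the fixture writes 10 files of 10 rows each (100 rows in total).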
Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import operator
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pytest
+
+import pylibcudf as plc
+
+from rapidsmpf.streaming.core.channel import Channel
+from rapidsmpf.streaming.core.leaf_node import pull_from_channel
+from rapidsmpf.streaming.core.node import run_streaming_pipeline
+from rapidsmpf.streaming.cudf.parquet import read_parquet
+from rapidsmpf.streaming.cudf.table_chunk import TableChunk
+
+if TYPE_CHECKING:
+    from rapidsmpf.streaming.core.context import Context
+
+
+@pytest.fixture(scope="module")
+def source_full_table(
+    tmp_path_factory: pytest.TempPathFactory,
+) -> tuple[plc.io.SourceInfo, plc.Table]:
+    path = tmp_path_factory.mktemp("read_parquet")
+
+    nrows = 10
+    start = 0
+    sources = []
+    tables = []
+    for i in range(10):
+        table = plc.Table(
+            [plc.Column.from_array(np.arange(start, start + nrows, dtype="int32"))]
+        )
+        tables.append(table)
+        # gaps in the column numbering we produce
+        start += nrows + nrows // 2
+        filename = path / f"{i:3d}.pq"
+        sink = plc.io.SinkInfo([filename])
+        options = plc.io.parquet.ParquetWriterOptions.builder(sink, table).build()
+        plc.io.parquet.write_parquet(options)
+        sources.append(filename)
+    return plc.io.SourceInfo(sources), plc.concatenate.concatenate(tables)
+
+
+@pytest.mark.parametrize("skip_rows", [None, 7, 19, 113], ids=lambda s: f"skip_{s}")
+@pytest.mark.parametrize("num_rows", [None, 0, 3, 31, 83], ids=lambda s: f"nrows_{s}")
+def test_read_parquet(
+    context: Context,
+    source_full_table: tuple[plc.io.SourceInfo, plc.Table],
+    skip_rows: int | None,
+    num_rows: int | None,
+) -> None:
+    ch = Channel[TableChunk]()
+
+    source, expected = source_full_table
+    options = plc.io.parquet.ParquetReaderOptions.builder(source).build()
+
+    if skip_rows is not None:
+        options.set_skip_rows(skip_rows)
+        (expected,) = plc.copying.slice(
+            expected, [min(skip_rows, expected.num_rows()), expected.num_rows()]
+        )
+    if num_rows is not None:
+        options.set_num_rows(num_rows)
+        (expected,) = plc.copying.slice(
+            expected, [0, min(num_rows, expected.num_rows())]
+        )
+    producer = read_parquet(context, ch, 4, options, 3)
+
+    consumer, messages = pull_from_channel(context, ch)
+
+    run_streaming_pipeline(nodes=[producer, consumer])
+
+    chunks = [TableChunk.from_message(m) for m in messages.release()]
+    for chunk in chunks:
+        chunk.stream.synchronize()
+
+    views = [
+        chunk.table_view()
+        for chunk in sorted(chunks, key=operator.attrgetter("sequence_number"))
+    ]
+    if views:
+        result = plc.concatenate.concatenate(views)
+    else:
+        result = plc.Table([])
+    for chunk in chunks:
+        chunk.stream.synchronize()
+
+    assert result.num_rows() == expected.num_rows()
+    assert result.num_columns() == expected.num_columns()
+    assert result.num_columns() == 1
+
+    (got,) = result.columns()
+    (expect,) = expected.columns()
+
+    all_equal = plc.reduce.reduce(
+        plc.binaryop.binary_operation(
+            got,
+            expect,
+            plc.binaryop.BinaryOperator.EQUAL,
+            plc.DataType(plc.TypeId.BOOL8),
+        ),
+        plc.aggregation.all(),
+        plc.DataType(plc.TypeId.BOOL8),
+    )
+    assert all_equal.to_py()
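
Chunks may be pulled from the channel in any order, so the test sorts them by sequence_number (assigned by read_parquet, see the C++ change above) before concatenating. A condensed, hypothetical helper capturing that reassembly step (not part of this commit; it only reuses calls already present in the test):

    import operator

    import pylibcudf as plc

    from rapidsmpf.streaming.cudf.table_chunk import TableChunk


    def assemble(chunks: list[TableChunk]) -> plc.Table:
        # Restore production order; sequence_number increases with the file/row offset.
        ordered = sorted(chunks, key=operator.attrgetter("sequence_number"))
        views = [chunk.table_view() for chunk in ordered]
        # With an empty row window (e.g. num_rows=0) there may be no chunks at all.
        return plc.concatenate.concatenate(views) if views else plc.Table([])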
