14 changes: 10 additions & 4 deletions cpp/include/cudf/io/parquet.hpp
@@ -86,7 +86,7 @@ class parquet_reader_options {
// Number of rows to skip from the start; Parquet stores the number of rows as int64_t
int64_t _skip_rows = 0;
// Number of rows to read; `nullopt` is all
std::optional<size_type> _num_rows;
std::optional<int64_t> _num_rows;

// Read row groups that start at or after this byte offset into the source
size_t _skip_bytes = 0;
@@ -204,7 +204,7 @@ class parquet_reader_options {
* @return Number of rows to read; `nullopt` if the option hasn't been set (in which case the file
* is read until the end)
*/
[[nodiscard]] std::optional<size_type> const& get_num_rows() const { return _num_rows; }
[[nodiscard]] std::optional<int64_t> const& get_num_rows() const { return _num_rows; }

/**
* @brief Returns bytes to skip before starting reading row groups
@@ -387,9 +387,12 @@ class parquet_reader_options {
/**
* @brief Sets number of rows to read.
*
* @note Although this allows one to request more than `size_type::max()` rows, if any
* single read would produce a table larger than this row limit, an error is thrown.
*
* @param val Number of rows to read after skip
*/
void set_num_rows(size_type val);
void set_num_rows(int64_t val);

/**
* @brief Sets bytes to skip before starting reading row groups.
@@ -547,10 +550,13 @@ class parquet_reader_options_builder {
/**
* @brief Sets number of rows to read.
*
* @note Although this allows one to request more than `size_type::max()` rows, if any
* single read would produce a table larger than this row limit, an error is thrown.
*
* @param val Number of rows to read after skip
* @return this for chaining
*/
parquet_reader_options_builder& num_rows(size_type val)
parquet_reader_options_builder& num_rows(int64_t val)
{
options.set_num_rows(val);
return *this;
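To illustrate the widened setter, here is a minimal sketch of requesting more rows than `size_type` can hold through the builder; the file names are hypothetical and the pattern simply follows the note documented above (a single materialized table still may not exceed `size_type::max()` rows).

#include <cudf/io/parquet.hpp>

#include <cstdint>
#include <limits>
#include <string>
#include <vector>

void request_rows_beyond_size_type()
{
  // Hypothetical inputs whose combined row count exceeds size_type::max().
  std::vector<std::string> files{"part0.parquet", "part1.parquet"};
  auto const source = cudf::io::source_info(files);

  // num_rows now takes int64_t, so the request itself is representable...
  auto const requested_rows =
    static_cast<int64_t>(std::numeric_limits<cudf::size_type>::max()) + 1;

  auto const options =
    cudf::io::parquet_reader_options::builder(source).num_rows(requested_rows).build();

  // ...but a single read_parquet() call that would produce a table larger than
  // size_type::max() rows throws (see the reader_impl.cpp change below).
  (void)options;
}
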
2 changes: 1 addition & 1 deletion cpp/src/io/functions.cpp
@@ -820,7 +820,7 @@ void parquet_reader_options::set_skip_rows(int64_t val)
_skip_rows = val;
}

void parquet_reader_options::set_num_rows(size_type val)
void parquet_reader_options::set_num_rows(int64_t val)
{
CUDF_EXPECTS(val >= 0, "num_rows cannot be negative");
CUDF_EXPECTS(_row_groups.empty(), "num_rows can't be set along with a non-empty row_groups");
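As a quick orientation for the unchanged preconditions in this setter, a hedged sketch of how they surface to callers; it assumes the usual CUDF_EXPECTS behavior of throwing cudf::logic_error, and the helper name is made up.

#include <cudf/io/parquet.hpp>
#include <cudf/utilities/error.hpp>

// Hypothetical helper: probes the validation performed by set_num_rows().
void probe_num_rows_preconditions(cudf::io::parquet_reader_options& opts)
{
  try {
    opts.set_num_rows(-1);  // rejected: "num_rows cannot be negative"
  } catch (cudf::logic_error const&) {
    // CUDF_EXPECTS without an explicit exception type throws cudf::logic_error.
  }
  // Likewise, setting num_rows after a non-empty row_groups selection is rejected.
}
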
6 changes: 5 additions & 1 deletion cpp/src/io/parquet/reader_impl.cpp
@@ -34,6 +34,7 @@
#include <bitset>
#include <limits>
#include <numeric>
#include <stdexcept>
#include <utility>

namespace cudf::io::parquet::detail {
@@ -870,7 +871,10 @@ table_with_metadata reader_impl::read()
{
CUDF_EXPECTS(_output_chunk_read_limit == 0,
"Reading the whole file must not have non-zero byte_limit.");

CUDF_EXPECTS(!_options.num_rows.has_value() ||
_options.num_rows.value() <= std::numeric_limits<size_type>::max(),
"Requested number of rows to read exceeds column size limit",
std::overflow_error);
prepare_data(read_mode::READ_ALL);
return read_chunk_internal(read_mode::READ_ALL);
}
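A sketch of how a caller might react to the new std::overflow_error by falling back to the chunked reader, the same recovery path exercised by the test added below; the zero chunk and pass limits follow the test's usage, and the helper name is made up.

#include <cudf/io/parquet.hpp>

#include <cstdint>
#include <stdexcept>

// Hypothetical helper: read everything the options request, in one shot if it fits,
// otherwise chunk by chunk.
int64_t read_all_requested_rows(cudf::io::parquet_reader_options const& options)
{
  try {
    auto result = cudf::io::read_parquet(options);
    return result.tbl->num_rows();
  } catch (std::overflow_error const&) {
    // Too many rows for a single table; stream the result in chunks instead.
    auto reader = cudf::io::chunked_parquet_reader(0, 0, options);
    int64_t num_rows_read{0};
    while (reader.has_next()) {
      auto chunk = reader.read_chunk();
      num_rows_read += chunk.tbl->num_rows();
    }
    return num_rows_read;
  }
}
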
43 changes: 43 additions & 0 deletions cpp/tests/io/parquet_reader_test.cpp
@@ -26,15 +26,20 @@

#include <cudf/column/column.hpp>
#include <cudf/io/parquet.hpp>
#include <cudf/io/parquet_metadata.hpp>
#include <cudf/stream_compaction.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/transform.hpp>

#include <thrust/iterator/constant_iterator.h>

#include <src/io/parquet/parquet_gpu.hpp>

#include <array>
#include <limits>
#include <memory>
#include <stdexcept>

using ParquetDecompressionTest = DecompressionTest<ParquetReaderTest>;

@@ -3682,6 +3687,44 @@ TEST_F(ParquetReaderTest, ByteBoundsAndFilters)
}
}

TEST_F(ParquetReaderTest, TableTooLargeOverflows)
{
using T = bool;
constexpr int64_t per_file_num_rows = std::numeric_limits<cudf::size_type>::max() / 2 + 1000;
static_assert(per_file_num_rows <= std::numeric_limits<cudf::size_type>::max(),
"Number of rows per file should be less than size_type::max()");
static_assert(2 * per_file_num_rows > std::numeric_limits<cudf::size_type>::max(),
"Twice number of rows per file should be greather than size_type::max()");
auto value = thrust::make_constant_iterator(true);
auto column = cudf::test::fixed_width_column_wrapper<T>(value, value + per_file_num_rows);

auto filepath = temp_env->get_temp_filepath("TableTooLargeOverflows.parquet");
{
auto sink = cudf::io::sink_info{filepath};
auto options =
cudf::io::parquet_writer_options::builder(sink, cudf::table_view{{column}}).build();
std::ignore = cudf::io::write_parquet(options);
}
std::vector<std::string> files{{filepath, filepath}};
auto source = cudf::io::source_info(files);
auto metadata = cudf::io::read_parquet_metadata(source);
auto const num_rows_to_read = metadata.num_rows() - 1000;
EXPECT_EQ(metadata.num_rows(), per_file_num_rows * 2);
auto options = cudf::io::parquet_reader_options::builder(source)
.num_rows(num_rows_to_read)
.skip_rows(10)
.build();

EXPECT_THROW(cudf::io::read_parquet(options), std::overflow_error);
auto reader = cudf::io::chunked_parquet_reader(0, 0, options);
int64_t num_rows_read{0};
while (reader.has_next()) {
auto chunk = reader.read_chunk();
num_rows_read += chunk.tbl->num_rows();
}
EXPECT_EQ(num_rows_read, num_rows_to_read);
}

TEST_F(ParquetReaderTest, LateBindSourceInfo)
{
srand(31337);
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/io/parquet.pxd
@@ -42,7 +42,7 @@ cdef class ParquetReaderOptions:
cdef parquet_reader_options c_obj
cdef SourceInfo source
cpdef void set_row_groups(self, list row_groups)
cpdef void set_num_rows(self, size_type nrows)
cpdef void set_num_rows(self, int64_t nrows)
cpdef void set_skip_rows(self, int64_t skip_rows)
cpdef void set_columns(self, list col_names)
cpdef void set_filter(self, Expression filter)
10 changes: 8 additions & 2 deletions python/pylibcudf/pylibcudf/io/parquet.pyx
@@ -111,15 +111,21 @@ cdef class ParquetReaderOptions:

self.c_obj.set_row_groups(outer)

cpdef void set_num_rows(self, size_type nrows):
cpdef void set_num_rows(self, int64_t nrows):
"""
Sets number of rows to read.

Parameters
----------
nrows : size_type
nrows : int64_t
Number of rows to read after skip

Notes
-----
Although this allows one to request more than `size_type::max()`
rows, if any single read would produce a table larger than this row
limit, an error is thrown.

Returns
-------
None
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd
@@ -41,7 +41,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
void set_source(source_info src) except +libcudf_exception_handler
void set_filter(expression &filter) except +libcudf_exception_handler
void set_columns(vector[string] col_names) except +libcudf_exception_handler
void set_num_rows(size_type val) except +libcudf_exception_handler
void set_num_rows(int64_t val) except +libcudf_exception_handler
void set_row_groups(
vector[vector[size_type]] row_grp
) except +libcudf_exception_handler