diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index e7f2245ea52..b7fc68ae99c 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -86,7 +86,7 @@ class parquet_reader_options {
   // Number of rows to skip from the start; Parquet stores the number of rows as int64_t
   int64_t _skip_rows = 0;
   // Number of rows to read; `nullopt` is all
-  std::optional<size_type> _num_rows;
+  std::optional<int64_t> _num_rows;
 
   // Read row groups that start at or after this byte offset into the source
   size_t _skip_bytes = 0;
@@ -204,7 +204,7 @@ class parquet_reader_options {
    * @return Number of rows to read; `nullopt` if the option hasn't been set (in which case the file
    * is read until the end)
    */
-  [[nodiscard]] std::optional<size_type> const& get_num_rows() const { return _num_rows; }
+  [[nodiscard]] std::optional<int64_t> const& get_num_rows() const { return _num_rows; }
 
   /**
    * @brief Returns bytes to skip before starting reading row groups
@@ -387,9 +387,12 @@ class parquet_reader_options {
   /**
    * @brief Sets number of rows to read.
    *
+   * @note Although this allows one to request more than `size_type::max()` rows, if any
+   * single read would produce a table larger than this row limit, an error is thrown.
+   *
    * @param val Number of rows to read after skip
    */
-  void set_num_rows(size_type val);
+  void set_num_rows(int64_t val);
 
   /**
    * @brief Sets bytes to skip before starting reading row groups.
@@ -547,10 +550,13 @@ class parquet_reader_options_builder {
   /**
    * @brief Sets number of rows to read.
    *
+   * @note Although this allows one to request more than `size_type::max()` rows, if any
+   * single read would produce a table larger than this row limit, an error is thrown.
+   *
    * @param val Number of rows to read after skip
    * @return this for chaining
    */
-  parquet_reader_options_builder& num_rows(size_type val)
+  parquet_reader_options_builder& num_rows(int64_t val)
   {
     options.set_num_rows(val);
     return *this;
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index a43ccf3f934..865ebd25e45 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -820,7 +820,7 @@ void parquet_reader_options::set_skip_rows(int64_t val)
   _skip_rows = val;
 }
 
-void parquet_reader_options::set_num_rows(size_type val)
+void parquet_reader_options::set_num_rows(int64_t val)
 {
   CUDF_EXPECTS(val >= 0, "num_rows cannot be negative");
   CUDF_EXPECTS(_row_groups.empty(), "num_rows can't be set along with a non-empty row_groups");
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index 1c0ebdfb0b4..cf2eff87847 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -34,6 +34,7 @@
 #include
 #include
 #include
+#include <limits>
 #include
 
 namespace cudf::io::parquet::detail {
@@ -870,7 +871,10 @@ table_with_metadata reader_impl::read()
 {
   CUDF_EXPECTS(_output_chunk_read_limit == 0,
                "Reading the whole file must not have non-zero byte_limit.");
-
+  CUDF_EXPECTS(!_options.num_rows.has_value() ||
+                 _options.num_rows.value() <= std::numeric_limits<size_type>::max(),
+               "Requested number of rows to read exceeds column size limit",
+               std::overflow_error);
   prepare_data(read_mode::READ_ALL);
   return read_chunk_internal(read_mode::READ_ALL);
 }
diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp
index 0d5e658e3be..42b031a4d49 100644
--- a/cpp/tests/io/parquet_reader_test.cpp
+++ b/cpp/tests/io/parquet_reader_test.cpp
@@ -26,15 +26,20 @@
 #include
 #include
+#include <cudf/io/parquet_metadata.hpp>
 #include
 #include
 #include
 #include
+#include <thrust/iterator/constant_iterator.h>
+
 #include
 #include
+#include <limits>
 #include
+#include <tuple>
 
 using ParquetDecompressionTest = DecompressionTest;
 
@@ -3682,6 +3687,44 @@ TEST_F(ParquetReaderTest,
ByteBoundsAndFilters)
 {
 }
 }
 
+TEST_F(ParquetReaderTest, TableTooLargeOverflows)
+{
+  using T = bool;
+  constexpr int64_t per_file_num_rows = std::numeric_limits<cudf::size_type>::max() / 2 + 1000;
+  static_assert(per_file_num_rows <= std::numeric_limits<cudf::size_type>::max(),
+                "Number of rows per file should be less than size_type::max()");
+  static_assert(2 * per_file_num_rows > std::numeric_limits<cudf::size_type>::max(),
+                "Twice number of rows per file should be greater than size_type::max()");
+  auto value  = thrust::make_constant_iterator(true);
+  auto column = cudf::test::fixed_width_column_wrapper<T>(value, value + per_file_num_rows);
+
+  auto filepath = temp_env->get_temp_filepath("TableTooLargeOverflows.parquet");
+  {
+    auto sink    = cudf::io::sink_info{filepath};
+    auto options =
+      cudf::io::parquet_writer_options::builder(sink, cudf::table_view{{column}}).build();
+    std::ignore = cudf::io::write_parquet(options);
+  }
+  std::vector<std::string> files{{filepath, filepath}};
+  auto source   = cudf::io::source_info(files);
+  auto metadata = cudf::io::read_parquet_metadata(source);
+  auto const num_rows_to_read = metadata.num_rows() - 1000;
+  EXPECT_EQ(metadata.num_rows(), per_file_num_rows * 2);
+  auto options = cudf::io::parquet_reader_options::builder(source)
+                   .num_rows(num_rows_to_read)
+                   .skip_rows(10)
+                   .build();
+
+  EXPECT_THROW(cudf::io::read_parquet(options), std::overflow_error);
+  auto reader = cudf::io::chunked_parquet_reader(0, 0, options);
+  int64_t num_rows_read{0};
+  while (reader.has_next()) {
+    auto chunk = reader.read_chunk();
+    num_rows_read += chunk.tbl->num_rows();
+  }
+  EXPECT_EQ(num_rows_read, num_rows_to_read);
+}
+
 TEST_F(ParquetReaderTest, LateBindSourceInfo)
 {
   srand(31337);
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/io/parquet.pxd
index 4657ec795b8..8b2e2bcb65a 100644
--- a/python/pylibcudf/pylibcudf/io/parquet.pxd
+++ b/python/pylibcudf/pylibcudf/io/parquet.pxd
@@ -42,7 +42,7 @@ cdef class ParquetReaderOptions:
     cdef parquet_reader_options c_obj
     cdef
SourceInfo source
     cpdef void set_row_groups(self, list row_groups)
-    cpdef void set_num_rows(self, size_type nrows)
+    cpdef void set_num_rows(self, int64_t nrows)
     cpdef void set_skip_rows(self, int64_t skip_rows)
     cpdef void set_columns(self, list col_names)
     cpdef void set_filter(self, Expression filter)
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx
index 146da465793..f9edbc4e693 100644
--- a/python/pylibcudf/pylibcudf/io/parquet.pyx
+++ b/python/pylibcudf/pylibcudf/io/parquet.pyx
@@ -111,15 +111,21 @@ cdef class ParquetReaderOptions:
 
         self.c_obj.set_row_groups(outer)
 
-    cpdef void set_num_rows(self, size_type nrows):
+    cpdef void set_num_rows(self, int64_t nrows):
         """
         Sets number of rows to read.
 
         Parameters
         ----------
-        nrows : size_type
+        nrows : int64_t
             Number of rows to read after skip
 
+        Notes
+        -----
+        Although this allows one to request more than `size_type::max()`
+        rows, if any single read would produce a table larger than this row
+        limit, an error is thrown.
+
         Returns
         -------
         None
diff --git a/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd
index 46597850604..94c822862bc 100644
--- a/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd
@@ -41,7 +41,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         void set_source(source_info src) except +libcudf_exception_handler
         void set_filter(expression &filter) except +libcudf_exception_handler
        void set_columns(vector[string] col_names) except +libcudf_exception_handler
-        void set_num_rows(size_type val) except +libcudf_exception_handler
+        void set_num_rows(int64_t val) except +libcudf_exception_handler
         void set_row_groups(
             vector[vector[size_type]] row_grp
         ) except +libcudf_exception_handler