From 08982bffa725996d745211c17edbb4b8926d1337 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 24 Jul 2024 19:08:04 +0000 Subject: [PATCH] Disable skip rows in chunked pq reader --- cpp/src/io/parquet/reader.cpp | 5 ++++ cpp/tests/io/parquet_chunked_reader_test.cu | 29 ++++++++++++++------- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp index 8dfd68cd9b8..65dafb568c0 100644 --- a/cpp/src/io/parquet/reader.cpp +++ b/cpp/src/io/parquet/reader.cpp @@ -41,6 +41,11 @@ chunked_reader::chunked_reader(std::size_t chunk_read_limit, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + // TODO: skip_rows not currently supported in chunked parquet reader until + // https://github.com/rapidsai/cudf/issues/16186 is closed + CUDF_EXPECTS(options.get_skip_rows() == 0, + "skip_rows > 0 is not currently supported in the Chunked Parquet reader."); + _impl = std::make_unique<impl>( chunk_read_limit, pass_read_limit, std::move(sources), options, stream, mr); } diff --git a/cpp/tests/io/parquet_chunked_reader_test.cu b/cpp/tests/io/parquet_chunked_reader_test.cu index 2917852235c..66b36aeed63 100644 --- a/cpp/tests/io/parquet_chunked_reader_test.cu +++ b/cpp/tests/io/parquet_chunked_reader_test.cu @@ -1544,7 +1544,8 @@ TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSource) // Chunked-read rows_to_read rows skipping rows_to_skip from single data source { - auto const rows_to_skip = 1'237; + // TODO: rows_to_skip = 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved + auto const rows_to_skip = 0; // 1'237 auto const rows_to_read = 7'232; auto constexpr output_read_limit = 1'500; auto constexpr pass_read_limit = 3'500; @@ -1571,7 +1572,8 @@ TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSource) // Chunked-read two data sources skipping the first entire file completely { - auto constexpr rows_to_skip = 15'723; + // TODO: rows_to_skip
= 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved + auto constexpr rows_to_skip = 0; // 15'723; auto constexpr output_read_limit = 1'024'000; auto constexpr pass_read_limit = 1'024'000; @@ -1588,20 +1590,25 @@ TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSource) auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader); + // TODO: Enable code inside /* */ when https://github.com/rapidsai/cudf/issues/16186 is resolved auto int64_col_selected = - int64s_col(int64_data.begin() + rows_to_skip - num_rows, int64_data.end()).release(); + int64s_col(int64_data.begin() /* + rows_to_skip - num_rows */, int64_data.end()).release(); cudf::table_view const expected_selected({int64_col_selected->view()}); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result->view()); + // TODO: Enable the following check when https://github.com/rapidsai/cudf/issues/16186 + // is resolved + // CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result->view()); + EXPECT_EQ(num_rows_per_source.size(), 2); - EXPECT_EQ(num_rows_per_source[0], 0); - EXPECT_EQ(num_rows_per_source[1], nsources * num_rows - rows_to_skip); + EXPECT_EQ(num_rows_per_source[0], num_rows /* 0 */); + EXPECT_EQ(num_rows_per_source[1], num_rows /* nsources * num_rows - rows_to_skip */); } // Chunked-read from single data source skipping rows_to_skip { - auto const rows_to_skip = 1'237; + // TODO: rows_to_skip = 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved + auto const rows_to_skip = 0; // 1'237; auto constexpr output_read_limit = 1'500; auto constexpr pass_read_limit = 1'800; @@ -1736,7 +1743,8 @@ TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSourceMultipleSources) // Chunked-read rows_to_read rows skipping rows_to_skip from eight data sources { - auto const rows_to_skip = 25'571; + // TODO: rows_to_skip = 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved + auto const rows_to_skip = 0; // 25'571; auto const rows_to_read = 41'232; 
auto constexpr output_read_limit = 15'000; auto constexpr pass_read_limit = 35'000; @@ -1782,8 +1790,9 @@ TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSourceMultipleSources) // Chunked-read four data sources skipping three files completely { - auto const nsources = 4; - int constexpr rows_to_skip = num_rows * 3 + 1; + auto const nsources = 4; + // TODO: rows_to_skip = 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved + int constexpr rows_to_skip = 0; // num_rows * 3 + 1; auto constexpr output_read_limit = 15'000; auto constexpr pass_read_limit = 35'000; std::vector<int64_t> int64_selected_data{};