rapidsai · rapids-bot · Jul 24, 2024 · Jul 24, 2024 · karthikeyann · Jul 24, 2024
@@ -41,6 +41,11 @@ chunked_reader::chunked_reader(std::size_t chunk_read_limit,
                                rmm::cuda_stream_view stream,
                                rmm::device_async_resource_ref mr)
 {
+  // TODO: skip_rows not currently supported in chunked parquet reader until
+  // https://github.com/rapidsai/cudf/issues/16186 is closed
+  CUDF_EXPECTS(options.get_skip_rows() == 0,
+               "skip_rows > 0 is not currently supported in the Chunked Parquet reader.");
+
   _impl = std::make_unique<impl>(
     chunk_read_limit, pass_read_limit, std::move(sources), options, stream, mr);
 }

@@ -1544,7 +1544,8 @@ TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSource)
 
   // Chunked-read rows_to_read rows skipping rows_to_skip from single data source
   {
-    auto const rows_to_skip          = 1'237;
+    // TODO: rows_to_skip = 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved
+    auto const rows_to_skip          = 0;  // 1'237
     auto const rows_to_read          = 7'232;
     auto constexpr output_read_limit = 1'500;
     auto constexpr pass_read_limit   = 3'500;
@@ -1571,7 +1572,8 @@ TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSource)
 
   // Chunked-read two data sources skipping the first entire file completely
   {
-    auto constexpr rows_to_skip      = 15'723;
+    // TODO: rows_to_skip = 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved
+    auto constexpr rows_to_skip      = 0;  // 15'723;
     auto constexpr output_read_limit = 1'024'000;
     auto constexpr pass_read_limit   = 1'024'000;
 
@@ -1588,20 +1590,25 @@ TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSource)
 
     auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader);
 
+    // TODO: Enable code inside /* */ when https://github.com/rapidsai/cudf/issues/16186 is resolved
     auto int64_col_selected =
-      int64s_col(int64_data.begin() + rows_to_skip - num_rows, int64_data.end()).release();
+      int64s_col(int64_data.begin() /* + rows_to_skip - num_rows */, int64_data.end()).release();
 
     cudf::table_view const expected_selected({int64_col_selected->view()});
 
-    CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result->view());
+    // TODO: Enable the following check when https://github.com/rapidsai/cudf/issues/16186
+    // is resolved
+    // CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result->view());
+
     EXPECT_EQ(num_rows_per_source.size(), 2);
-    EXPECT_EQ(num_rows_per_source[0], 0);
-    EXPECT_EQ(num_rows_per_source[1], nsources * num_rows - rows_to_skip);
+    EXPECT_EQ(num_rows_per_source[0], num_rows /* 0 */);
+    EXPECT_EQ(num_rows_per_source[1], num_rows /* nsources * num_rows - rows_to_skip */);
   }
 
   // Chunked-read from single data source skipping rows_to_skip
   {
-    auto const rows_to_skip          = 1'237;
+    // TODO: rows_to_skip = 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved
+    auto const rows_to_skip          = 0;  // 1'237;
     auto constexpr output_read_limit = 1'500;
     auto constexpr pass_read_limit   = 1'800;
 
@@ -1736,7 +1743,8 @@ TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSourceMultipleSources)
 
   // Chunked-read rows_to_read rows skipping rows_to_skip from eight data sources
   {
-    auto const rows_to_skip          = 25'571;
+    // TODO: rows_to_skip = 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved
+    auto const rows_to_skip          = 0;  // 25'571;
     auto const rows_to_read          = 41'232;
     auto constexpr output_read_limit = 15'000;
     auto constexpr pass_read_limit   = 35'000;
@@ -1782,8 +1790,9 @@ TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSourceMultipleSources)
 
   // Chunked-read four data sources skipping three files completely
   {
-    auto const nsources              = 4;
-    int constexpr rows_to_skip       = num_rows * 3 + 1;
+    auto const nsources = 4;
+    // TODO: rows_to_skip = 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved
+    int constexpr rows_to_skip       = 0;  // num_rows * 3 + 1;
     auto constexpr output_read_limit = 15'000;
     auto constexpr pass_read_limit   = 35'000;
     std::vector<int64_t> int64_selected_data{};