Return empty dataframe when reading a Parquet file using empty `columns` option (#11018)

Fixes #8668
Store the columns option as a `std::optional`:

- `nullopt` when the caller does not pass columns: read all columns.
- An empty vector when the caller explicitly passes an empty list/vector: return an empty dataframe.
- A vector of column names: read only the columns with the given names (see the sketch below).
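
A minimal caller-side sketch of the three states. This code is not part of the diff, and the file name is hypothetical; it only uses the builder calls shown in the changed tests below:

```cpp
#include <cudf/io/parquet.hpp>

#include <string>
#include <vector>

void three_states_of_columns()
{
  auto const src = cudf::io::source_info{"example.parquet"};  // hypothetical input file

  // 1. `columns` never called: the option stays `nullopt`, so all columns are read.
  auto whole_table =
    cudf::io::read_parquet(cudf::io::parquet_reader_options::builder(src).build());

  // 2. Explicit empty vector: the reader returns an empty dataframe (0 columns, 0 rows).
  auto empty_table = cudf::io::read_parquet(
    cudf::io::parquet_reader_options::builder(src).columns({}).build());

  // 3. Explicit names: only the named columns are read.
  auto two_columns =
    cudf::io::read_parquet(cudf::io::parquet_reader_options::builder(src)
                             .columns(std::vector<std::string>{"a", "b"})
                             .build());
}
```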

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Andy Grove (https://github.com/andygrove)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Yunsong Wang (https://github.com/PointKernel)
  - Devavret Makkar (https://github.com/devavret)
  - Jason Lowe (https://github.com/jlowe)

URL: #11018
vuule authored Jun 17, 2022
1 parent ef6a390 commit 379faf9
Showing 8 changed files with 63 additions and 20 deletions.
11 changes: 7 additions & 4 deletions cpp/include/cudf/io/parquet.hpp
@@ -51,7 +51,7 @@ class parquet_reader_options {
   source_info _source;
 
   // Path in schema of column to read; empty is all
-  std::vector<std::string> _columns;
+  std::optional<std::vector<std::string>> _columns;
 
   // List of individual row groups to read (ignored if empty)
   std::vector<std::vector<size_type>> _row_groups;
@@ -132,11 +132,14 @@ class parquet_reader_options {
   [[nodiscard]] size_type get_num_rows() const { return _num_rows; }
 
   /**
-   * @brief Returns names of column to be read.
+   * @brief Returns names of column to be read, if set.
    *
-   * @return Names of column to be read
+   * @return Names of column to be read; `nullopt` if the option is not set
    */
-  [[nodiscard]] std::vector<std::string> const& get_columns() const { return _columns; }
+  [[nodiscard]] std::optional<std::vector<std::string>> const& get_columns() const
+  {
+    return _columns;
+  }
 
   /**
    * @brief Returns list of individual row groups to be read.
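A hedged sketch, not in this commit, of how a caller might branch on the `std::optional` now returned by `get_columns()`:

```cpp
#include <cudf/io/parquet.hpp>

#include <iostream>

// Sketch: report what a given reader-options object will select.
void describe_selection(cudf::io::parquet_reader_options const& opts)
{
  auto const& cols = opts.get_columns();
  if (not cols.has_value()) {
    std::cout << "no selection set: all columns will be read\n";
  } else if (cols->empty()) {
    std::cout << "empty selection: an empty dataframe will be returned\n";
  } else {
    std::cout << cols->size() << " named column(s) will be read\n";
  }
}
```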
9 changes: 5 additions & 4 deletions cpp/src/io/parquet/reader_impl.cu
@@ -590,15 +590,16 @@ class aggregate_reader_metadata {
   /**
    * @brief Filters and reduces down to a selection of columns
    *
-   * @param use_names List of paths of column names to select
+   * @param use_names List of paths of column names to select; `nullopt` if user did not select
+   * columns to read
    * @param include_index Whether to always include the PANDAS index column(s)
    * @param strings_to_categorical Type conversion parameter
    * @param timestamp_type_id Type conversion parameter
    *
    * @return input column information, output column information, list of output column schema
    * indices
    */
-  [[nodiscard]] auto select_columns(std::vector<std::string> const& use_names,
+  [[nodiscard]] auto select_columns(std::optional<std::vector<std::string>> const& use_names,
                                     bool include_index,
                                     bool strings_to_categorical,
                                     type_id timestamp_type_id) const
@@ -724,7 +725,7 @@ class aggregate_reader_metadata {
     // ["name", "firstname"]
     //
     auto const& root = get_schema(0);
-    if (use_names.empty()) {
+    if (not use_names.has_value()) {
       for (auto const& schema_idx : root.children_idx) {
         build_column(nullptr, schema_idx, output_columns);
         output_column_schemas.push_back(schema_idx);
@@ -752,7 +753,7 @@ class aggregate_reader_metadata {
 
     // Find which of the selected paths are valid and get their schema index
     std::vector<path_info> valid_selected_paths;
-    for (auto const& selected_path : use_names) {
+    for (auto const& selected_path : *use_names) {
       auto found_path =
         std::find_if(all_paths.begin(), all_paths.end(), [&](path_info& valid_path) {
           return valid_path.full_path == selected_path;
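Note that the option is dereferenced with `*use_names` only after the `has_value()` check; an explicitly empty vector therefore reaches the selection loop, matches nothing, and yields the empty output table that the new tests below expect.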
20 changes: 20 additions & 0 deletions cpp/tests/io/parquet_test.cpp
@@ -3398,4 +3398,24 @@ TEST_F(ParquetWriterTest, CheckPageRows)
   EXPECT_EQ(nvals, page_rows);
 }
 
+TEST_F(ParquetReaderTest, EmptyColumnsParam)
+{
+  srand(31337);
+  auto const expected = create_random_fixed_table<int>(2, 4, false);
+
+  std::vector<char> out_buffer;
+  cudf_io::parquet_writer_options args =
+    cudf_io::parquet_writer_options::builder(cudf_io::sink_info{&out_buffer}, *expected);
+  cudf_io::write_parquet(args);
+
+  cudf_io::parquet_reader_options read_opts =
+    cudf_io::parquet_reader_options::builder(
+      cudf_io::source_info{out_buffer.data(), out_buffer.size()})
+      .columns({});
+  auto const result = cudf_io::read_parquet(read_opts);
+
+  EXPECT_EQ(result.tbl->num_columns(), 0);
+  EXPECT_EQ(result.tbl->num_rows(), 0);
+}
+
 CUDF_TEST_PROGRAM_MAIN()
3 changes: 1 addition & 2 deletions java/src/main/java/ai/rapids/cudf/ColumnFilterOptions.java
@@ -1,6 +1,6 @@
 /*
  *
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,7 +21,6 @@
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
-import java.util.function.Supplier;
 
 /**
  * Base options class for input formats that can filter columns.
9 changes: 6 additions & 3 deletions java/src/main/native/src/TableJni.cpp
@@ -1497,10 +1497,13 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv *env,
                                  static_cast<std::size_t>(buffer_length)) :
                              cudf::io::source_info(filename.get());
 
+    auto builder = cudf::io::parquet_reader_options::builder(source);
+    if (n_filter_col_names.size() > 0) {
+      builder = builder.columns(n_filter_col_names.as_cpp_vector());
+    }
+
     cudf::io::parquet_reader_options opts =
-        cudf::io::parquet_reader_options::builder(source)
-            .columns(n_filter_col_names.as_cpp_vector())
-            .convert_strings_to_categories(false)
+        builder.convert_strings_to_categories(false)
             .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
             .build();
     return convert_table_for_return(env, cudf::io::read_parquet(opts).tbl);
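This preserves the Java API's existing contract, where an empty filter list means "read all columns": `.columns()` is now applied only when the caller actually supplied names, so the new empty-vector semantics cannot turn an unfiltered Java read into an empty table.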
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/cpp/io/parquet.pxd
@@ -16,7 +16,6 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
     cdef cppclass parquet_reader_options:
         parquet_reader_options() except +
         cudf_io_types.source_info get_source_info() except +
-        vector[string] get_columns() except +
         vector[vector[size_type]] get_row_groups() except +
         data_type get_timestamp_type() except +
         bool is_enabled_convert_strings_to_categories() except +
17 changes: 11 additions & 6 deletions python/cudf/cudf/_lib/parquet.pyx
@@ -135,7 +135,6 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
     cdef cudf_io_types.source_info source = make_source_info(
         filepaths_or_buffers)
 
-    cdef vector[string] cpp_columns
     cdef bool cpp_strings_to_categorical = strings_to_categorical
     cdef bool cpp_use_pandas_metadata = use_pandas_metadata
     cdef size_type cpp_skiprows = skiprows if skiprows is not None else 0
@@ -145,18 +144,13 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
         cudf_types.type_id.EMPTY
     )
 
-    if columns is not None:
-        cpp_columns.reserve(len(columns))
-        for col in columns or []:
-            cpp_columns.push_back(str(col).encode())
     if row_groups is not None:
         cpp_row_groups = row_groups
 
     cdef parquet_reader_options args
     # Setup parquet reader arguments
     args = move(
         parquet_reader_options.builder(source)
-        .columns(cpp_columns)
         .row_groups(cpp_row_groups)
         .convert_strings_to_categories(cpp_strings_to_categorical)
         .use_pandas_metadata(cpp_use_pandas_metadata)
@@ -165,6 +159,15 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
         .timestamp_type(cpp_timestamp_type)
         .build()
     )
+    cdef vector[string] cpp_columns
+    allow_range_index = True
+    if columns is not None:
+        cpp_columns.reserve(len(columns))
+        if len(cpp_columns) == 0:
+            allow_range_index = False
+        for col in columns or []:
+            cpp_columns.push_back(str(col).encode())
+        args.set_columns(cpp_columns)
 
     # Read Parquet
     cdef cudf_io_types.table_with_metadata c_out_table
@@ -218,6 +221,8 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
     # Set the index column
     if index_col is not None and len(index_col) > 0:
         if is_range_index:
+            if not allow_range_index:
+                return df
             range_index_meta = index_col[0]
             if row_groups is not None:
                 per_file_metadata = [
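The `allow_range_index` flag exists so that, when the caller asked for zero columns, the reader skips reapplying the pandas RangeIndex metadata and the result stays a truly empty dataframe (0 rows), matching the new C++ behavior and the pandas comparison in the test below.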
13 changes: 13 additions & 0 deletions python/cudf/cudf/tests/test_parquet.py
@@ -2550,3 +2550,16 @@ def test_parquet_reader_zstd_compression(datadir):
         assert_eq(df, pdf)
     except RuntimeError:
         pytest.mark.xfail(reason="zstd support is not enabled")
+
+
+@pytest.mark.parametrize("index", [True, False, None])
+@pytest.mark.parametrize("columns", [None, [], ["b", "a"]])
+def test_parquet_columns_and_index_param(index, columns):
+    buffer = BytesIO()
+    df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+    df.to_parquet(buffer, index=index)
+
+    expected = pd.read_parquet(buffer, columns=columns)
+    got = cudf.read_parquet(buffer, columns=columns)
+
+    assert_eq(expected, got, check_index_type=True)
