Return empty dataframe when reading an ORC file using empty columns option (NVIDIA#11446)

Changes are mostly equivalent to the Parquet changes in rapidsai/cudf#11018.

Store the `columns` option as `optional` (see the usage sketch below):

- `nullopt` when the caller does not pass columns - read all columns.
- Empty vector when the caller explicitly passes an empty list/vector - return an empty dataframe.
- Vector of column names - read the columns with the given names.

Also includes a small cleanup of the equivalent code in the Parquet reader.

Fixes rapidsai/cudf#11021
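
The three cases above map onto the C++ reader options as follows. This is a minimal usage sketch, not code from the PR; the file path and the column names "a" and "b" are illustrative:

#include <cudf/io/orc.hpp>

void read_examples()
{
  auto const src = cudf::io::source_info{"example.orc"};

  // `columns` never set -> every column in the file is read
  auto const all_cols =
    cudf::io::read_orc(cudf::io::orc_reader_options::builder(src).build());

  // explicitly empty list -> empty table (0 columns, 0 rows)
  auto const none =
    cudf::io::read_orc(cudf::io::orc_reader_options::builder(src).columns({}).build());

  // explicit names -> only the named columns are read
  auto const subset =
    cudf::io::read_orc(cudf::io::orc_reader_options::builder(src).columns({"a", "b"}).build());
}

The second case is what the new C++ and Python tests in this change exercise.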

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - MithunR (https://github.com/mythrocks)
  - Nghia Truong (https://github.com/ttnghia)

URL: rapidsai/cudf#11446
vuule authored Aug 4, 2022
1 parent d86bb39 commit 9429099
Showing 10 changed files with 75 additions and 30 deletions.
13 changes: 7 additions & 6 deletions cpp/include/cudf/io/orc.hpp
@@ -24,6 +24,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 
 #include <memory>
+#include <optional>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -51,8 +52,8 @@ class orc_reader_options_builder;
 class orc_reader_options {
   source_info _source;
 
-  // Names of column to read; empty is all
-  std::vector<std::string> _columns;
+  // Names of column to read; `nullopt` is all
+  std::optional<std::vector<std::string>> _columns;
 
   // List of individual stripes to read (ignored if empty)
   std::vector<std::vector<size_type>> _stripes;
@@ -105,18 +106,18 @@ class orc_reader_options {
   [[nodiscard]] source_info const& get_source() const { return _source; }
 
   /**
-   * @brief Returns names of the columns to read.
+   * @brief Returns names of the columns to read, if set.
    *
-   * @return Names of the columns to read
+   * @return Names of the columns to read; `nullopt` if the option is not set
    */
-  [[nodiscard]] std::vector<std::string> const& get_columns() const { return _columns; }
+  [[nodiscard]] auto const& get_columns() const { return _columns; }
 
   /**
    * @brief Returns vector of vectors, stripes to read for each input source
    *
    * @return Vector of vectors, stripes to read for each input source
    */
-  std::vector<std::vector<size_type>> const& get_stripes() const { return _stripes; }
+  [[nodiscard]] auto const& get_stripes() const { return _stripes; }
 
   /**
    * @brief Returns number of rows to skip from the start.
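
With `_columns` now held in a `std::optional`, code that consumes `get_columns()` can tell "option not set" apart from "set to an empty list". A sketch of that consuming pattern, assuming an `orc_reader_options` instance named `opts` (the comments describe intent, not library code):

auto const& cols = opts.get_columns();
if (not cols.has_value()) {
  // option was never set: read every column in the file
} else if (cols->empty()) {
  // caller explicitly asked for no columns: produce an empty table
} else {
  // read only the columns named in *cols
}
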
9 changes: 3 additions & 6 deletions cpp/include/cudf/io/parquet.hpp
@@ -51,7 +51,7 @@ class parquet_reader_options_builder;
 class parquet_reader_options {
   source_info _source;
 
-  // Path in schema of column to read; empty is all
+  // Path in schema of column to read; `nullopt` is all
   std::optional<std::vector<std::string>> _columns;
 
   // List of individual row groups to read (ignored if empty)
@@ -152,17 +152,14 @@
    *
    * @return Names of column to be read; `nullopt` if the option is not set
    */
-  [[nodiscard]] std::optional<std::vector<std::string>> const& get_columns() const
-  {
-    return _columns;
-  }
+  [[nodiscard]] auto const& get_columns() const { return _columns; }
 
   /**
    * @brief Returns list of individual row groups to be read.
    *
    * @return List of individual row groups to be read
    */
-  std::vector<std::vector<size_type>> const& get_row_groups() const { return _row_groups; }
+  [[nodiscard]] auto const& get_row_groups() const { return _row_groups; }
 
   /**
    * @brief Returns timestamp type used to cast timestamp columns.
7 changes: 4 additions & 3 deletions cpp/src/io/orc/aggregate_orc_metadata.cpp
@@ -18,6 +18,7 @@
 
 #include <algorithm>
 #include <numeric>
+#include <optional>
 
 namespace cudf::io::orc::detail {
 
@@ -249,17 +250,17 @@ std::vector<metadata::stripe_source_mapping> aggregate_orc_metadata::select_stri
 }
 
 column_hierarchy aggregate_orc_metadata::select_columns(
-  std::vector<std::string> const& column_paths)
+  std::optional<std::vector<std::string>> const& column_paths)
 {
   auto const& pfm = per_file_metadata[0];
 
   column_hierarchy::nesting_map selected_columns;
-  if (column_paths.empty()) {
+  if (not column_paths.has_value()) {
     for (auto const& col_id : pfm.ff.types[0].subtypes) {
       add_column_to_mapping(selected_columns, pfm, col_id);
     }
   } else {
-    for (const auto& path : column_paths) {
+    for (const auto& path : column_paths.value()) {
       bool name_found = false;
       for (auto col_id = 1; col_id < pfm.get_num_columns(); ++col_id) {
         if (pfm.column_path(col_id) == path) {
6 changes: 4 additions & 2 deletions cpp/src/io/orc/aggregate_orc_metadata.hpp
@@ -17,6 +17,7 @@
 #include "orc.hpp"
 
 #include <map>
+#include <optional>
 #include <vector>
 
 namespace cudf::io::orc::detail {
@@ -126,10 +127,11 @@
    * Paths are in format "grandparent_col.parent_col.child_col", where the root ORC column is
    * omitted to match the cuDF table hierarchy.
    *
-   * @param column_paths List of full column names (i.e. paths) to select from the ORC file
+   * @param column_paths List of full column names (i.e. paths) to select from the ORC file;
+   * `nullopt` if user did not select columns to read
    * @return Columns hierarchy - lists of children columns and sorted columns in each nesting level
    */
-  column_hierarchy select_columns(std::vector<std::string> const& column_paths);
+  column_hierarchy select_columns(std::optional<std::vector<std::string>> const& column_paths);
 };
 
 } // namespace cudf::io::orc::detail
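
The doc comment above describes column paths of the form "grandparent_col.parent_col.child_col". As a hedged illustration of how such a path reaches `select_columns()` through the public reader options (the file name and the column names "s" and "a" are invented for the example):

// Request only the child "a" of a struct column "s" via its dot-separated path.
auto const opts =
  cudf::io::orc_reader_options::builder(cudf::io::source_info{"nested.orc"})
    .columns({"s.a"})
    .build();
auto const result = cudf::io::read_orc(opts);
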
19 changes: 19 additions & 0 deletions cpp/tests/io/orc_test.cpp
@@ -1514,4 +1514,23 @@ TEST_F(OrcWriterTest, DecimalOptionsNested)
                                  result.tbl->view().column(0).child(1).child(0).child(1));
 }
 
+TEST_F(OrcReaderTest, EmptyColumnsParam)
+{
+  srand(31337);
+  auto const expected = create_random_fixed_table<int>(2, 4, false);
+
+  std::vector<char> out_buffer;
+  cudf_io::orc_writer_options args =
+    cudf_io::orc_writer_options::builder(cudf_io::sink_info{&out_buffer}, *expected);
+  cudf_io::write_orc(args);
+
+  cudf_io::orc_reader_options read_opts =
+    cudf_io::orc_reader_options::builder(cudf_io::source_info{out_buffer.data(), out_buffer.size()})
+      .columns({});
+  auto const result = cudf_io::read_orc(read_opts);
+
+  EXPECT_EQ(result.tbl->num_columns(), 0);
+  EXPECT_EQ(result.tbl->num_rows(), 0);
+}
+
 CUDF_TEST_PROGRAM_MAIN()
9 changes: 6 additions & 3 deletions java/src/main/native/src/TableJni.cpp
@@ -1751,10 +1751,13 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC(
                            cudf::io::source_info(reinterpret_cast<char *>(buffer), buffer_length) :
                            cudf::io::source_info(filename.get());
 
+    auto builder = cudf::io::orc_reader_options::builder(source);
+    if (n_filter_col_names.size() > 0) {
+      builder = builder.columns(n_filter_col_names.as_cpp_vector());
+    }
+
     cudf::io::orc_reader_options opts =
-        cudf::io::orc_reader_options::builder(source)
-            .columns(n_filter_col_names.as_cpp_vector())
-            .use_index(false)
+        builder.use_index(false)
             .use_np_dtypes(static_cast<bool>(usingNumPyTypes))
             .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
             .decimal128_columns(n_dec128_col_names.as_cpp_vector())
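
The JNI path keeps the Java-side convention that an empty filter array means "read all columns", so it now forwards `columns()` only when names were actually supplied; passing the empty vector through would request an empty table under the new semantics. A condensed sketch of the guard, assuming a `source` and a possibly empty `std::vector<std::string> names`:

auto builder = cudf::io::orc_reader_options::builder(source);
if (not names.empty()) {
  builder = builder.columns(names);  // only narrow the selection when names were given
}
auto const table = cudf::io::read_orc(builder.build());
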
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/cpp/io/orc.pxd
@@ -19,7 +19,6 @@ cdef extern from "cudf/io/orc.hpp" \
         orc_reader_options() except+
 
         cudf_io_types.source_info get_source() except+
-        vector[string] get_columns() except+
         vector[vector[size_type]] get_stripes() except+
         size_type get_skip_rows() except+
         size_type get_num_rows() except+
14 changes: 8 additions & 6 deletions python/cudf/cudf/_lib/orc.pyx
@@ -103,7 +103,7 @@ cpdef read_orc(object filepaths_or_buffers,
     """
     cdef orc_reader_options c_orc_reader_options = make_orc_reader_options(
         filepaths_or_buffers,
-        columns or [],
+        columns,
        stripes or [],
         get_size_t_arg(skip_rows, "skip_rows"),
         get_size_t_arg(num_rows, "num_rows"),
@@ -325,16 +325,11 @@ cdef orc_reader_options make_orc_reader_options(
     for i, datasource in enumerate(filepaths_or_buffers):
         if isinstance(datasource, NativeFile):
            filepaths_or_buffers[i] = NativeFileDatasource(datasource)
-    cdef vector[string] c_column_names
     cdef vector[vector[size_type]] strps = stripes
-    c_column_names.reserve(len(column_names))
-    for col in column_names:
-        c_column_names.push_back(str(col).encode())
     cdef orc_reader_options opts
     cdef source_info src = make_source_info(filepaths_or_buffers)
     opts = move(
         orc_reader_options.builder(src)
-        .columns(c_column_names)
         .stripes(strps)
         .skip_rows(skip_rows)
         .num_rows(num_rows)
@@ -343,6 +338,13 @@
         .build()
     )
 
+    cdef vector[string] c_column_names
+    if column_names is not None:
+        c_column_names.reserve(len(column_names))
+        for col in column_names:
+            c_column_names.push_back(str(col).encode())
+        opts.set_columns(c_column_names)
+
     return opts
 
 
5 changes: 2 additions & 3 deletions python/cudf/cudf/_lib/parquet.pyx
@@ -177,9 +177,8 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
     allow_range_index = True
     if columns is not None:
         cpp_columns.reserve(len(columns))
-        if len(cpp_columns) == 0:
-            allow_range_index = False
-        for col in columns or []:
+        allow_range_index = False
+        for col in columns:
             cpp_columns.push_back(str(col).encode())
         args.set_columns(cpp_columns)
 
22 changes: 22 additions & 0 deletions python/cudf/cudf/tests/test_orc.py
@@ -1758,3 +1758,25 @@ def test_orc_writer_zlib_compression(list_struct_buff):
             pytest.mark.xfail(reason="nvcomp build doesn't have deflate")
         else:
             raise e
+
+
+@pytest.mark.parametrize("index", [True, False, None])
+@pytest.mark.parametrize("columns", [None, [], ["b", "a"]])
+def test_orc_columns_and_index_param(index, columns):
+    buffer = BytesIO()
+    df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+    df.to_orc(buffer, index=index)
+
+    expected = pd.read_orc(buffer, columns=columns)
+    got = cudf.read_orc(buffer, columns=columns)
+
+    if columns:
+        # TODO: Remove workaround after this issue is fixed:
+        # https://github.com/pandas-dev/pandas/issues/47944
+        assert_eq(
+            expected.sort_index(axis=1),
+            got.sort_index(axis=1),
+            check_index_type=True,
+        )
+    else:
+        assert_eq(expected, got, check_index_type=True)
