From 9429099a07a9b761cba7f7100dbeb29afef19994 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vukasin.milovanovic.87@gmail.com>
Date: Wed, 3 Aug 2022 18:39:10 -0700
Subject: [PATCH] Return empty dataframe when reading an ORC file using empty
 `columns` option (#11446)

Changes are mostly equivalent to Parquet changes in https://github.com/rapidsai/cudf/pull/11018.

Store the `columns` option as `optional`:

- `nullopt` when columns are not passed by caller - read all columns.
- Empty vector when caller explicitly passes an empty list/vector - return empty dataframe.
- Vector of column names - read columns with given names.

Also includes a small cleanup of the code equivalent in the Parquet reader.

Fixes https://github.com/rapidsai/cudf/issues/11021

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - MithunR (https://github.com/mythrocks)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/11446
---
 cpp/include/cudf/io/orc.hpp               | 13 +++++++------
 cpp/include/cudf/io/parquet.hpp           |  9 +++------
 cpp/src/io/orc/aggregate_orc_metadata.cpp |  7 ++++---
 cpp/src/io/orc/aggregate_orc_metadata.hpp |  6 ++++--
 cpp/tests/io/orc_test.cpp                 | 19 +++++++++++++++++++
 java/src/main/native/src/TableJni.cpp     |  9 ++++++---
 python/cudf/cudf/_lib/cpp/io/orc.pxd      |  1 -
 python/cudf/cudf/_lib/orc.pyx             | 14 ++++++++------
 python/cudf/cudf/_lib/parquet.pyx         |  5 ++---
 python/cudf/cudf/tests/test_orc.py        | 22 ++++++++++++++++++++++
 10 files changed, 75 insertions(+), 30 deletions(-)
diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp
index 30acf80548b7..7f3cb95e4b2a 100644
--- a/cpp/include/cudf/io/orc.hpp
+++ b/cpp/include/cudf/io/orc.hpp
@@ -24,6 +24,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 
 #include <memory>
+#include <optional>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -51,8 +52,8 @@ class orc_reader_options_builder;
 class orc_reader_options {
   source_info _source;
 
-  // Names of column to read; empty is all
-  std::vector<std::string> _columns;
+  // Names of column to read; `nullopt` is all
+  std::optional<std::vector<std::string>> _columns;
 
   // List of individual stripes to read (ignored if empty)
   std::vector<std::vector<size_type>> _stripes;
@@ -105,18 +106,18 @@ class orc_reader_options {
   [[nodiscard]] source_info const& get_source() const { return _source; }
 
   /**
-   * @brief Returns names of the columns to read.
+   * @brief Returns names of the columns to read, if set.
    *
-   * @return Names of the columns to read
+   * @return Names of the columns to read; `nullopt` if the option is not set
    */
-  [[nodiscard]] std::vector<std::string> const& get_columns() const { return _columns; }
+  [[nodiscard]] auto const& get_columns() const { return _columns; }
 
   /**
    * @brief Returns vector of vectors, stripes to read for each input source
    *
    * @return Vector of vectors, stripes to read for each input source
    */
-  std::vector<std::vector<size_type>> const& get_stripes() const { return _stripes; }
+  [[nodiscard]] auto const& get_stripes() const { return _stripes; }
 
   /**
    * @brief Returns number of rows to skip from the start.
diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index 10368f84824c..19156e01c1ed 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -51,7 +51,7 @@ class parquet_reader_options_builder;
 class parquet_reader_options {
   source_info _source;
 
-  // Path in schema of column to read; empty is all
+  // Path in schema of column to read; `nullopt` is all
   std::optional<std::vector<std::string>> _columns;
 
   // List of individual row groups to read (ignored if empty)
@@ -152,17 +152,14 @@ class parquet_reader_options {
    *
    * @return Names of column to be read; `nullopt` if the option is not set
    */
-  [[nodiscard]] std::optional<std::vector<std::string>> const& get_columns() const
-  {
-    return _columns;
-  }
+  [[nodiscard]] auto const& get_columns() const { return _columns; }
 
   /**
    * @brief Returns list of individual row groups to be read.
    *
    * @return List of individual row groups to be read
    */
-  std::vector<std::vector<size_type>> const& get_row_groups() const { return _row_groups; }
+  [[nodiscard]] auto const& get_row_groups() const { return _row_groups; }
 
   /**
    * @brief Returns timestamp type used to cast timestamp columns.
diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp
index 82765c60c1e8..df3dfca5fa95 100644
--- a/cpp/src/io/orc/aggregate_orc_metadata.cpp
+++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp
@@ -18,6 +18,7 @@
 
 #include <algorithm>
 #include <numeric>
+#include <optional>
 
 namespace cudf::io::orc::detail {
 
@@ -249,17 +250,17 @@ std::vector<metadata::stripe_source_mapping> aggregate_orc_metadata::select_stri
 }
 
 column_hierarchy aggregate_orc_metadata::select_columns(
-  std::vector<std::string> const& column_paths)
+  std::optional<std::vector<std::string>> const& column_paths)
 {
   auto const& pfm = per_file_metadata[0];
 
   column_hierarchy::nesting_map selected_columns;
-  if (column_paths.empty()) {
+  if (not column_paths.has_value()) {
     for (auto const& col_id : pfm.ff.types[0].subtypes) {
       add_column_to_mapping(selected_columns, pfm, col_id);
     }
   } else {
-    for (const auto& path : column_paths) {
+    for (const auto& path : column_paths.value()) {
       bool name_found = false;
       for (auto col_id = 1; col_id < pfm.get_num_columns(); ++col_id) {
         if (pfm.column_path(col_id) == path) {
diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp
index 9d2380c00976..3ce1a922f315 100644
--- a/cpp/src/io/orc/aggregate_orc_metadata.hpp
+++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp
@@ -17,6 +17,7 @@
 #include "orc.hpp"
 
 #include <map>
+#include <optional>
 #include <vector>
 
 namespace cudf::io::orc::detail {
@@ -126,10 +127,11 @@ class aggregate_orc_metadata {
    * Paths are in format "grandparent_col.parent_col.child_col", where the root ORC column is
    * omitted to match the cuDF table hierarchy.
    *
-   * @param column_paths List of full column names (i.e. paths) to select from the ORC file
+   * @param column_paths List of full column names (i.e. paths) to select from the ORC file;
+   * `nullopt` if user did not select columns to read
    * @return Columns hierarchy - lists of children columns and sorted columns in each nesting level
    */
-  column_hierarchy select_columns(std::vector<std::string> const& column_paths);
+  column_hierarchy select_columns(std::optional<std::vector<std::string>> const& column_paths);
 };
 
 }  // namespace cudf::io::orc::detail
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index b3df2c8a8ddd..76ffc92e2435 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -1514,4 +1514,23 @@ TEST_F(OrcWriterTest, DecimalOptionsNested)
                                       result.tbl->view().column(0).child(1).child(0).child(1));
 }
 
+TEST_F(OrcReaderTest, EmptyColumnsParam)
+{
+  srand(31337);
+  auto const expected = create_random_fixed_table<int>(2, 4, false);
+
+  std::vector<char> out_buffer;
+  cudf_io::orc_writer_options args =
+    cudf_io::orc_writer_options::builder(cudf_io::sink_info{&out_buffer}, *expected);
+  cudf_io::write_orc(args);
+
+  cudf_io::orc_reader_options read_opts =
+    cudf_io::orc_reader_options::builder(cudf_io::source_info{out_buffer.data(), out_buffer.size()})
+      .columns({});
+  auto const result = cudf_io::read_orc(read_opts);
+
+  EXPECT_EQ(result.tbl->num_columns(), 0);
+  EXPECT_EQ(result.tbl->num_rows(), 0);
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index 877d77eeeef4..44c08aec1104 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1751,10 +1751,13 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC(
                       cudf::io::source_info(reinterpret_cast<char *>(buffer), buffer_length) :
                       cudf::io::source_info(filename.get());
 
+    auto builder = cudf::io::orc_reader_options::builder(source);
+    if (n_filter_col_names.size() > 0) {
+      builder = builder.columns(n_filter_col_names.as_cpp_vector());
+    }
+
     cudf::io::orc_reader_options opts =
-        cudf::io::orc_reader_options::builder(source)
-            .columns(n_filter_col_names.as_cpp_vector())
-            .use_index(false)
+        builder.use_index(false)
             .use_np_dtypes(static_cast<bool>(usingNumPyTypes))
             .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
             .decimal128_columns(n_dec128_col_names.as_cpp_vector())
diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd
index 62ff5eb4f539..3e44ef983480 100644
--- a/python/cudf/cudf/_lib/cpp/io/orc.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd
@@ -19,7 +19,6 @@ cdef extern from "cudf/io/orc.hpp" \
         orc_reader_options() except+
 
         cudf_io_types.source_info get_source() except+
-        vector[string] get_columns() except+
         vector[vector[size_type]] get_stripes() except+
         size_type get_skip_rows() except+
         size_type get_num_rows() except+
diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
index 4d1090d84340..11c70317a395 100644
--- a/python/cudf/cudf/_lib/orc.pyx
+++ b/python/cudf/cudf/_lib/orc.pyx
@@ -103,7 +103,7 @@ cpdef read_orc(object filepaths_or_buffers,
     """
     cdef orc_reader_options c_orc_reader_options = make_orc_reader_options(
         filepaths_or_buffers,
-        columns or [],
+        columns,
         stripes or [],
         get_size_t_arg(skip_rows, "skip_rows"),
         get_size_t_arg(num_rows, "num_rows"),
@@ -325,16 +325,11 @@ cdef orc_reader_options make_orc_reader_options(
     for i, datasource in enumerate(filepaths_or_buffers):
         if isinstance(datasource, NativeFile):
             filepaths_or_buffers[i] = NativeFileDatasource(datasource)
-    cdef vector[string] c_column_names
     cdef vector[vector[size_type]] strps = stripes
-    c_column_names.reserve(len(column_names))
-    for col in column_names:
-        c_column_names.push_back(str(col).encode())
     cdef orc_reader_options opts
     cdef source_info src = make_source_info(filepaths_or_buffers)
     opts = move(
         orc_reader_options.builder(src)
-        .columns(c_column_names)
         .stripes(strps)
         .skip_rows(skip_rows)
         .num_rows(num_rows)
@@ -343,6 +338,13 @@ cdef orc_reader_options make_orc_reader_options(
         .build()
     )
 
+    cdef vector[string] c_column_names
+    if column_names is not None:
+        c_column_names.reserve(len(column_names))
+        for col in column_names:
+            c_column_names.push_back(str(col).encode())
+        opts.set_columns(c_column_names)
+
     return opts
 
 
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 264b1fb507b9..c25360b307d5 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -177,9 +177,8 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
     allow_range_index = True
     if columns is not None:
         cpp_columns.reserve(len(columns))
-        if len(cpp_columns) == 0:
-            allow_range_index = False
-        for col in columns or []:
+        allow_range_index = False
+        for col in columns:
             cpp_columns.push_back(str(col).encode())
         args.set_columns(cpp_columns)
 
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 132eb528cd0e..4373ef9afdf8 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -1758,3 +1758,25 @@ def test_orc_writer_zlib_compression(list_struct_buff):
             pytest.mark.xfail(reason="nvcomp build doesn't have deflate")
         else:
             raise e
+
+
+@pytest.mark.parametrize("index", [True, False, None])
+@pytest.mark.parametrize("columns", [None, [], ["b", "a"]])
+def test_orc_columns_and_index_param(index, columns):
+    buffer = BytesIO()
+    df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+    df.to_orc(buffer, index=index)
+
+    expected = pd.read_orc(buffer, columns=columns)
+    got = cudf.read_orc(buffer, columns=columns)
+
+    if columns:
+        # TODO: Remove workaround after this issue is fixed:
+        # https://github.com/pandas-dev/pandas/issues/47944
+        assert_eq(
+            expected.sort_index(axis=1),
+            got.sort_index(axis=1),
+            check_index_type=True,
+        )
+    else:
+        assert_eq(expected, got, check_index_type=True)