From 9429099a07a9b761cba7f7100dbeb29afef19994 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 3 Aug 2022 18:39:10 -0700 Subject: [PATCH] Return empty dataframe when reading an ORC file using empty `columns` option (#11446) Changes are mostly equivalent to Parquet changes in https://github.com/rapidsai/cudf/pull/11018. Store the `columns` option as `optional`: - `nullopt` when columns are not passed by caller - read all columns. - Empty vector when caller explicitly passes an empty list/vector - return empty dataframe. - Vector of column names - read columns with given names. Also includes a small cleanup of the code equivalent in the Parquet reader. Fixes https://github.com/rapidsai/cudf/issues/11021 Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - MithunR (https://github.com/mythrocks) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/11446 --- cpp/include/cudf/io/orc.hpp | 13 +++++++------ cpp/include/cudf/io/parquet.hpp | 9 +++------ cpp/src/io/orc/aggregate_orc_metadata.cpp | 7 ++++--- cpp/src/io/orc/aggregate_orc_metadata.hpp | 6 ++++-- cpp/tests/io/orc_test.cpp | 19 +++++++++++++++++++ java/src/main/native/src/TableJni.cpp | 9 ++++++--- python/cudf/cudf/_lib/cpp/io/orc.pxd | 1 - python/cudf/cudf/_lib/orc.pyx | 14 ++++++++------ python/cudf/cudf/_lib/parquet.pyx | 5 ++--- python/cudf/cudf/tests/test_orc.py | 22 ++++++++++++++++++++++ 10 files changed, 75 insertions(+), 30 deletions(-) diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 30acf80548b7..7f3cb95e4b2a 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -51,8 +52,8 @@ class orc_reader_options_builder; class orc_reader_options { source_info _source; - // Names of column to read; empty is all - std::vector _columns; + // Names of column to read; `nullopt` is all + std::optional> _columns; // List of individual stripes to read (ignored if empty) std::vector> _stripes; @@ -105,18 +106,18 @@ class orc_reader_options { [[nodiscard]] source_info const& get_source() const { return _source; } /** - * @brief Returns names of the columns to read. + * @brief Returns names of the columns to read, if set. * - * @return Names of the columns to read + * @return Names of the columns to read; `nullopt` if the option is not set */ - [[nodiscard]] std::vector const& get_columns() const { return _columns; } + [[nodiscard]] auto const& get_columns() const { return _columns; } /** * @brief Returns vector of vectors, stripes to read for each input source * * @return Vector of vectors, stripes to read for each input source */ - std::vector> const& get_stripes() const { return _stripes; } + [[nodiscard]] auto const& get_stripes() const { return _stripes; } /** * @brief Returns number of rows to skip from the start. diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 10368f84824c..19156e01c1ed 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -51,7 +51,7 @@ class parquet_reader_options_builder; class parquet_reader_options { source_info _source; - // Path in schema of column to read; empty is all + // Path in schema of column to read; `nullopt` is all std::optional> _columns; // List of individual row groups to read (ignored if empty) @@ -152,17 +152,14 @@ class parquet_reader_options { * * @return Names of column to be read; `nullopt` if the option is not set */ - [[nodiscard]] std::optional> const& get_columns() const - { - return _columns; - } + [[nodiscard]] auto const& get_columns() const { return _columns; } /** * @brief Returns list of individual row groups to be read. * * @return List of individual row groups to be read */ - std::vector> const& get_row_groups() const { return _row_groups; } + [[nodiscard]] auto const& get_row_groups() const { return _row_groups; } /** * @brief Returns timestamp type used to cast timestamp columns. diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index 82765c60c1e8..df3dfca5fa95 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -18,6 +18,7 @@ #include #include +#include namespace cudf::io::orc::detail { @@ -249,17 +250,17 @@ std::vector aggregate_orc_metadata::select_stri } column_hierarchy aggregate_orc_metadata::select_columns( - std::vector const& column_paths) + std::optional> const& column_paths) { auto const& pfm = per_file_metadata[0]; column_hierarchy::nesting_map selected_columns; - if (column_paths.empty()) { + if (not column_paths.has_value()) { for (auto const& col_id : pfm.ff.types[0].subtypes) { add_column_to_mapping(selected_columns, pfm, col_id); } } else { - for (const auto& path : column_paths) { + for (const auto& path : column_paths.value()) { bool name_found = false; for (auto col_id = 1; col_id < pfm.get_num_columns(); ++col_id) { if (pfm.column_path(col_id) == path) { diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp index 9d2380c00976..3ce1a922f315 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.hpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp @@ -17,6 +17,7 @@ #include "orc.hpp" #include +#include #include namespace cudf::io::orc::detail { @@ -126,10 +127,11 @@ class aggregate_orc_metadata { * Paths are in format "grandparent_col.parent_col.child_col", where the root ORC column is * omitted to match the cuDF table hierarchy. * - * @param column_paths List of full column names (i.e. paths) to select from the ORC file + * @param column_paths List of full column names (i.e. paths) to select from the ORC file; + * `nullopt` if user did not select columns to read * @return Columns hierarchy - lists of children columns and sorted columns in each nesting level */ - column_hierarchy select_columns(std::vector const& column_paths); + column_hierarchy select_columns(std::optional> const& column_paths); }; } // namespace cudf::io::orc::detail diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index b3df2c8a8ddd..76ffc92e2435 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1514,4 +1514,23 @@ TEST_F(OrcWriterTest, DecimalOptionsNested) result.tbl->view().column(0).child(1).child(0).child(1)); } +TEST_F(OrcReaderTest, EmptyColumnsParam) +{ + srand(31337); + auto const expected = create_random_fixed_table(2, 4, false); + + std::vector out_buffer; + cudf_io::orc_writer_options args = + cudf_io::orc_writer_options::builder(cudf_io::sink_info{&out_buffer}, *expected); + cudf_io::write_orc(args); + + cudf_io::orc_reader_options read_opts = + cudf_io::orc_reader_options::builder(cudf_io::source_info{out_buffer.data(), out_buffer.size()}) + .columns({}); + auto const result = cudf_io::read_orc(read_opts); + + EXPECT_EQ(result.tbl->num_columns(), 0); + EXPECT_EQ(result.tbl->num_rows(), 0); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 877d77eeeef4..44c08aec1104 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1751,10 +1751,13 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC( cudf::io::source_info(reinterpret_cast(buffer), buffer_length) : cudf::io::source_info(filename.get()); + auto builder = cudf::io::orc_reader_options::builder(source); + if (n_filter_col_names.size() > 0) { + builder = builder.columns(n_filter_col_names.as_cpp_vector()); + } + cudf::io::orc_reader_options opts = - cudf::io::orc_reader_options::builder(source) - .columns(n_filter_col_names.as_cpp_vector()) - .use_index(false) + builder.use_index(false) .use_np_dtypes(static_cast(usingNumPyTypes)) .timestamp_type(cudf::data_type(static_cast(unit))) .decimal128_columns(n_dec128_col_names.as_cpp_vector()) diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index 62ff5eb4f539..3e44ef983480 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -19,7 +19,6 @@ cdef extern from "cudf/io/orc.hpp" \ orc_reader_options() except+ cudf_io_types.source_info get_source() except+ - vector[string] get_columns() except+ vector[vector[size_type]] get_stripes() except+ size_type get_skip_rows() except+ size_type get_num_rows() except+ diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 4d1090d84340..11c70317a395 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -103,7 +103,7 @@ cpdef read_orc(object filepaths_or_buffers, """ cdef orc_reader_options c_orc_reader_options = make_orc_reader_options( filepaths_or_buffers, - columns or [], + columns, stripes or [], get_size_t_arg(skip_rows, "skip_rows"), get_size_t_arg(num_rows, "num_rows"), @@ -325,16 +325,11 @@ cdef orc_reader_options make_orc_reader_options( for i, datasource in enumerate(filepaths_or_buffers): if isinstance(datasource, NativeFile): filepaths_or_buffers[i] = NativeFileDatasource(datasource) - cdef vector[string] c_column_names cdef vector[vector[size_type]] strps = stripes - c_column_names.reserve(len(column_names)) - for col in column_names: - c_column_names.push_back(str(col).encode()) cdef orc_reader_options opts cdef source_info src = make_source_info(filepaths_or_buffers) opts = move( orc_reader_options.builder(src) - .columns(c_column_names) .stripes(strps) .skip_rows(skip_rows) .num_rows(num_rows) @@ -343,6 +338,13 @@ cdef orc_reader_options make_orc_reader_options( .build() ) + cdef vector[string] c_column_names + if column_names is not None: + c_column_names.reserve(len(column_names)) + for col in column_names: + c_column_names.push_back(str(col).encode()) + opts.set_columns(c_column_names) + return opts diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 264b1fb507b9..c25360b307d5 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -177,9 +177,8 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, allow_range_index = True if columns is not None: cpp_columns.reserve(len(columns)) - if len(cpp_columns) == 0: - allow_range_index = False - for col in columns or []: + allow_range_index = False + for col in columns: cpp_columns.push_back(str(col).encode()) args.set_columns(cpp_columns) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 132eb528cd0e..4373ef9afdf8 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1758,3 +1758,25 @@ def test_orc_writer_zlib_compression(list_struct_buff): pytest.mark.xfail(reason="nvcomp build doesn't have deflate") else: raise e + + +@pytest.mark.parametrize("index", [True, False, None]) +@pytest.mark.parametrize("columns", [None, [], ["b", "a"]]) +def test_orc_columns_and_index_param(index, columns): + buffer = BytesIO() + df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + df.to_orc(buffer, index=index) + + expected = pd.read_orc(buffer, columns=columns) + got = cudf.read_orc(buffer, columns=columns) + + if columns: + # TODO: Remove workaround after this issue is fixed: + # https://github.com/pandas-dev/pandas/issues/47944 + assert_eq( + expected.sort_index(axis=1), + got.sort_index(axis=1), + check_index_type=True, + ) + else: + assert_eq(expected, got, check_index_type=True)