Return empty dataframe when reading an ORC file using empty columns option (NVIDIA#11446)

Changes are mostly equivalent to the Parquet changes in rapidsai/cudf#11018.

Store the `columns` option as `optional` (see the usage sketch below):

- `nullopt` when the caller does not pass columns - read all columns.
- Empty vector when the caller explicitly passes an empty list/vector - return an empty dataframe.
- Vector of column names - read the columns with the given names.

Also includes a small cleanup of the equivalent code in the Parquet reader.

Fixes rapidsai/cudf#11021
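
The three cases above map onto the C++ reader options as follows. This is a minimal usage sketch, not code from the PR; the file path and the column names "a" and "b" are illustrative:

#include <cudf/io/orc.hpp>

void read_examples()
{
  auto const src = cudf::io::source_info{"example.orc"};

  // `columns` never set -> every column in the file is read
  auto const all_cols =
    cudf::io::read_orc(cudf::io::orc_reader_options::builder(src).build());

  // explicitly empty list -> empty table (0 columns, 0 rows)
  auto const none =
    cudf::io::read_orc(cudf::io::orc_reader_options::builder(src).columns({}).build());

  // explicit names -> only the named columns are read
  auto const subset =
    cudf::io::read_orc(cudf::io::orc_reader_options::builder(src).columns({"a", "b"}).build());
}

The second case is what the new C++ and Python tests in this change exercise.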

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - MithunR (https://github.com/mythrocks)
  - Nghia Truong (https://github.com/ttnghia)

URL: rapidsai/cudf#11446
vuule authored Aug 4, 2022
1 parent d86bb39 commit 9429099
Showing 10 changed files with 75 additions and 30 deletions.
13 changes: 7 additions & 6 deletions cpp/include/cudf/io/orc.hpp
@@ -24,6 +24,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 
 #include <memory>
+#include <optional>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -51,8 +52,8 @@ class orc_reader_options_builder;
 class orc_reader_options {
   source_info _source;
 
-  // Names of column to read; empty is all
-  std::vector<std::string> _columns;
+  // Names of column to read; `nullopt` is all
+  std::optional<std::vector<std::string>> _columns;
 
   // List of individual stripes to read (ignored if empty)
   std::vector<std::vector<size_type>> _stripes;
@@ -105,18 +106,18 @@ class orc_reader_options {
   [[nodiscard]] source_info const& get_source() const { return _source; }
 
   /**
-   * @brief Returns names of the columns to read.
+   * @brief Returns names of the columns to read, if set.
    *
-   * @return Names of the columns to read
+   * @return Names of the columns to read; `nullopt` if the option is not set
    */
-  [[nodiscard]] std::vector<std::string> const& get_columns() const { return _columns; }
+  [[nodiscard]] auto const& get_columns() const { return _columns; }
 
   /**
    * @brief Returns vector of vectors, stripes to read for each input source
    *
    * @return Vector of vectors, stripes to read for each input source
    */
-  std::vector<std::vector<size_type>> const& get_stripes() const { return _stripes; }
+  [[nodiscard]] auto const& get_stripes() const { return _stripes; }
 
   /**
    * @brief Returns number of rows to skip from the start.
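
With `_columns` now held in a `std::optional`, code that consumes `get_columns()` can tell "option not set" apart from "set to an empty list". A sketch of that consuming pattern, assuming an `orc_reader_options` instance named `opts` (the comments describe intent, not library code):

auto const& cols = opts.get_columns();
if (not cols.has_value()) {
  // option was never set: read every column in the file
} else if (cols->empty()) {
  // caller explicitly asked for no columns: produce an empty table
} else {
  // read only the columns named in *cols
}
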
9 changes: 3 additions & 6 deletions cpp/include/cudf/io/parquet.hpp
@@ -51,7 +51,7 @@ class parquet_reader_options_builder;
 class parquet_reader_options {
   source_info _source;
 
-  // Path in schema of column to read; empty is all
+  // Path in schema of column to read; `nullopt` is all
   std::optional<std::vector<std::string>> _columns;
 
   // List of individual row groups to read (ignored if empty)
@@ -152,17 +152,14 @@
    *
    * @return Names of column to be read; `nullopt` if the option is not set
    */
-  [[nodiscard]] std::optional<std::vector<std::string>> const& get_columns() const
-  {
-    return _columns;
-  }
+  [[nodiscard]] auto const& get_columns() const { return _columns; }
 
   /**
    * @brief Returns list of individual row groups to be read.
    *
    * @return List of individual row groups to be read
    */
-  std::vector<std::vector<size_type>> const& get_row_groups() const { return _row_groups; }
+  [[nodiscard]] auto const& get_row_groups() const { return _row_groups; }
 
   /**
    * @brief Returns timestamp type used to cast timestamp columns.
7 changes: 4 additions & 3 deletions cpp/src/io/orc/aggregate_orc_metadata.cpp
@@ -18,6 +18,7 @@
 
 #include <algorithm>
 #include <numeric>
+#include <optional>
 
 namespace cudf::io::orc::detail {
 
@@ -249,17 +250,17 @@ std::vector<metadata::stripe_source_mapping> aggregate_orc_metadata::select_stri
 }
 
 column_hierarchy aggregate_orc_metadata::select_columns(
-  std::vector<std::string> const& column_paths)
+  std::optional<std::vector<std::string>> const& column_paths)
 {
   auto const& pfm = per_file_metadata[0];
 
   column_hierarchy::nesting_map selected_columns;
-  if (column_paths.empty()) {
+  if (not column_paths.has_value()) {
     for (auto const& col_id : pfm.ff.types[0].subtypes) {
       add_column_to_mapping(selected_columns, pfm, col_id);
     }
   } else {
-    for (const auto& path : column_paths) {
+    for (const auto& path : column_paths.value()) {
       bool name_found = false;
       for (auto col_id = 1; col_id < pfm.get_num_columns(); ++col_id) {
         if (pfm.column_path(col_id) == path) {
6 changes: 4 additions & 2 deletions cpp/src/io/orc/aggregate_orc_metadata.hpp
@@ -17,6 +17,7 @@
 #include "orc.hpp"
 
 #include <map>
+#include <optional>
 #include <vector>
 
 namespace cudf::io::orc::detail {
@@ -126,10 +127,11 @@
    * Paths are in format "grandparent_col.parent_col.child_col", where the root ORC column is
    * omitted to match the cuDF table hierarchy.
    *
-   * @param column_paths List of full column names (i.e. paths) to select from the ORC file
+   * @param column_paths List of full column names (i.e. paths) to select from the ORC file;
+   * `nullopt` if user did not select columns to read
    * @return Columns hierarchy - lists of children columns and sorted columns in each nesting level
    */
-  column_hierarchy select_columns(std::vector<std::string> const& column_paths);
+  column_hierarchy select_columns(std::optional<std::vector<std::string>> const& column_paths);
 };
 
 } // namespace cudf::io::orc::detail
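
The doc comment above describes column paths of the form "grandparent_col.parent_col.child_col". As a hedged illustration of how such a path reaches `select_columns()` through the public reader options (the file name and the column names "s" and "a" are invented for the example):

// Request only the child "a" of a struct column "s" via its dot-separated path.
auto const opts =
  cudf::io::orc_reader_options::builder(cudf::io::source_info{"nested.orc"})
    .columns({"s.a"})
    .build();
auto const result = cudf::io::read_orc(opts);
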
19 changes: 19 additions & 0 deletions cpp/tests/io/orc_test.cpp
@@ -1514,4 +1514,23 @@ TEST_F(OrcWriterTest, DecimalOptionsNested)
                                  result.tbl->view().column(0).child(1).child(0).child(1));
 }
 
+TEST_F(OrcReaderTest, EmptyColumnsParam)
+{
+  srand(31337);
+  auto const expected = create_random_fixed_table<int>(2, 4, false);
+
+  std::vector<char> out_buffer;
+  cudf_io::orc_writer_options args =
+    cudf_io::orc_writer_options::builder(cudf_io::sink_info{&out_buffer}, *expected);
+  cudf_io::write_orc(args);
+
+  cudf_io::orc_reader_options read_opts =
+    cudf_io::orc_reader_options::builder(cudf_io::source_info{out_buffer.data(), out_buffer.size()})
+      .columns({});
+  auto const result = cudf_io::read_orc(read_opts);
+
+  EXPECT_EQ(result.tbl->num_columns(), 0);
+  EXPECT_EQ(result.tbl->num_rows(), 0);
+}
+
 CUDF_TEST_PROGRAM_MAIN()
9 changes: 6 additions & 3 deletions java/src/main/native/src/TableJni.cpp
@@ -1751,10 +1751,13 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC(
                            cudf::io::source_info(reinterpret_cast<char *>(buffer), buffer_length) :
                            cudf::io::source_info(filename.get());
 
+    auto builder = cudf::io::orc_reader_options::builder(source);
+    if (n_filter_col_names.size() > 0) {
+      builder = builder.columns(n_filter_col_names.as_cpp_vector());
+    }
+
     cudf::io::orc_reader_options opts =
-        cudf::io::orc_reader_options::builder(source)
-            .columns(n_filter_col_names.as_cpp_vector())
-            .use_index(false)
+        builder.use_index(false)
             .use_np_dtypes(static_cast<bool>(usingNumPyTypes))
             .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
             .decimal128_columns(n_dec128_col_names.as_cpp_vector())
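
The JNI path keeps the Java-side convention that an empty filter array means "read all columns", so it now forwards `columns()` only when names were actually supplied; passing the empty vector through would request an empty table under the new semantics. A condensed sketch of the guard, assuming a `source` and a possibly empty `std::vector<std::string> names`:

auto builder = cudf::io::orc_reader_options::builder(source);
if (not names.empty()) {
  builder = builder.columns(names);  // only narrow the selection when names were given
}
auto const table = cudf::io::read_orc(builder.build());
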
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/cpp/io/orc.pxd
@@ -19,7 +19,6 @@ cdef extern from "cudf/io/orc.hpp" \
         orc_reader_options() except+
 
         cudf_io_types.source_info get_source() except+
-        vector[string] get_columns() except+
         vector[vector[size_type]] get_stripes() except+
         size_type get_skip_rows() except+
         size_type get_num_rows() except+
14 changes: 8 additions & 6 deletions python/cudf/cudf/_lib/orc.pyx
@@ -103,7 +103,7 @@ cpdef read_orc(object filepaths_or_buffers,
     """
     cdef orc_reader_options c_orc_reader_options = make_orc_reader_options(
         filepaths_or_buffers,
-        columns or [],
+        columns,
        stripes or [],
         get_size_t_arg(skip_rows, "skip_rows"),
         get_size_t_arg(num_rows, "num_rows"),
@@ -325,16 +325,11 @@ cdef orc_reader_options make_orc_reader_options(
     for i, datasource in enumerate(filepaths_or_buffers):
         if isinstance(datasource, NativeFile):
            filepaths_or_buffers[i] = NativeFileDatasource(datasource)
-    cdef vector[string] c_column_names
     cdef vector[vector[size_type]] strps = stripes
-    c_column_names.reserve(len(column_names))
-    for col in column_names:
-        c_column_names.push_back(str(col).encode())
     cdef orc_reader_options opts
     cdef source_info src = make_source_info(filepaths_or_buffers)
     opts = move(
         orc_reader_options.builder(src)
-        .columns(c_column_names)
         .stripes(strps)
         .skip_rows(skip_rows)
         .num_rows(num_rows)
@@ -343,6 +338,13 @@
         .build()
     )
 
+    cdef vector[string] c_column_names
+    if column_names is not None:
+        c_column_names.reserve(len(column_names))
+        for col in column_names:
+            c_column_names.push_back(str(col).encode())
+        opts.set_columns(c_column_names)
+
     return opts
 
 
5 changes: 2 additions & 3 deletions python/cudf/cudf/_lib/parquet.pyx
@@ -177,9 +177,8 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
     allow_range_index = True
     if columns is not None:
         cpp_columns.reserve(len(columns))
-        if len(cpp_columns) == 0:
-            allow_range_index = False
-        for col in columns or []:
+        allow_range_index = False
+        for col in columns:
             cpp_columns.push_back(str(col).encode())
         args.set_columns(cpp_columns)
 
22 changes: 22 additions & 0 deletions python/cudf/cudf/tests/test_orc.py
@@ -1758,3 +1758,25 @@ def test_orc_writer_zlib_compression(list_struct_buff):
             pytest.mark.xfail(reason="nvcomp build doesn't have deflate")
         else:
             raise e
+
+
+@pytest.mark.parametrize("index", [True, False, None])
+@pytest.mark.parametrize("columns", [None, [], ["b", "a"]])
+def test_orc_columns_and_index_param(index, columns):
+    buffer = BytesIO()
+    df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+    df.to_orc(buffer, index=index)
+
+    expected = pd.read_orc(buffer, columns=columns)
+    got = cudf.read_orc(buffer, columns=columns)
+
+    if columns:
+        # TODO: Remove workaround after this issue is fixed:
+        # https://github.com/pandas-dev/pandas/issues/47944
+        assert_eq(
+            expected.sort_index(axis=1),
+            got.sort_index(axis=1),
+            check_index_type=True,
+        )
+    else:
+        assert_eq(expected, got, check_index_type=True)
