Skip to content

Commit

Permalink
Fix ORC reader for empty DataFrame/Table (#7624)
Browse files Browse the repository at this point in the history
By default, `ff.types` has a [main type as struct](https://github.com/rapidsai/cudf/blob/0146f743987a6f2a51aab08f34771eb4d3531afc/cpp/src/io/orc/writer_impl.cu#L1278) as its first entry, under which all other columns originate. That first entry is not a column itself, so we need to skip it and start from index 1.
(Look for `Type Information` in the [ORC Specification](https://orc.apache.org/specification/ORCv1/).)
Along with that, we should also handle the scenario where the user specifies a column name to retrieve, but that column does not exist, as is the case for an empty DataFrame/table.

Added a test case to validate both scenarios.

closes #7356

Authors:
  - Ram (Ramakrishna Prabhu) (@rgsl888prabhu)

Approvers:
  - Devavret Makkar (@devavret)
  - @nvdbaranec
  - Vukasin Milovanovic (@vuule)
  - Michael Wang (@isVoid)

URL: #7624
  • Loading branch information
rgsl888prabhu authored Mar 22, 2021
1 parent c21bd0e commit 8632ca0
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 2 deletions.
6 changes: 4 additions & 2 deletions cpp/src/io/orc/orc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -538,26 +538,28 @@ std::vector<int> metadata::select_columns(std::vector<std::string> use_names,
if (not use_names.empty()) {
int index = 0;
for (const auto &use_name : use_names) {
bool name_found = false;
for (int i = 0; i < get_num_columns(); ++i, ++index) {
if (index >= get_num_columns()) { index = 0; }
if (get_column_name(index) == use_name) {
name_found = true;
selection.emplace_back(index);
if (ff.types[index].kind == orc::TIMESTAMP) { has_timestamp_column = true; }
index++;
break;
}
}
CUDF_EXPECTS(name_found, "Unknown column name : " + std::string(use_name));
}
} else {
// For now, only select all leaf nodes
for (int i = 0; i < get_num_columns(); ++i) {
for (int i = 1; i < get_num_columns(); ++i) {
if (ff.types[i].subtypes.empty()) {
selection.emplace_back(i);
if (ff.types[i].kind == orc::TIMESTAMP) { has_timestamp_column = true; }
}
}
}
CUDF_EXPECTS(selection.size() > 0, "Filtered out all columns");

return selection;
}
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/io/orc/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,9 @@ table_with_metadata reader::impl::read(size_type skip_rows,
std::vector<std::unique_ptr<column>> out_columns;
table_metadata out_metadata;

// There are no columns in table
if (_selected_columns.size() == 0) return {std::make_unique<table>(), std::move(out_metadata)};

// Select only stripes required (aka row groups)
const auto selected_stripes = _metadata->select_stripes(stripes, skip_rows, num_rows);

Expand Down
16 changes: 16 additions & 0 deletions python/cudf/cudf/tests/test_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -740,6 +740,22 @@ def test_nanoseconds_overflow():
assert_eq(expected.to_pandas(), pyarrow_got.to_pandas())


def test_empty_dataframe():
    """Write an empty cudf DataFrame to ORC in memory and read it back.

    Checks that asking for a column by name raises ``RuntimeError`` when the
    file has no such column, and that an unrestricted read matches both the
    original empty frame and the result of the pandas ORC reader.
    """
    orc_buffer = BytesIO()
    empty_gdf = cudf.DataFrame()
    empty_gdf.to_orc(orc_buffer)

    # A named column that is absent from the file must be reported as an error.
    with pytest.raises(RuntimeError):
        cudf.read_orc(orc_buffer, columns=["a"])

    # Read with no column selection through both cudf and pandas.
    result_gdf = cudf.read_orc(orc_buffer)
    pandas_result = pd.read_orc(orc_buffer)

    assert_eq(empty_gdf, result_gdf)
    assert_eq(pandas_result, result_gdf)


@pytest.mark.parametrize(
"data", [[None, ""], ["", None], [None, None], ["", ""]]
)
Expand Down

0 comments on commit 8632ca0

Please sign in to comment.