From 8632ca0da31b49fd318d8cc5972c7e65f1b7ced3 Mon Sep 17 00:00:00 2001 From: "Ram (Ramakrishna Prabhu)" <42624703+rgsl888prabhu@users.noreply.github.com> Date: Mon, 22 Mar 2021 23:30:26 +0530 Subject: [PATCH] Fix ORC reader for empty DataFrame/Table (#7624) `ff.types` always starts with a [top-level struct type](https://github.com/rapidsai/cudf/blob/0146f743987a6f2a51aab08f34771eb4d3531afc/cpp/src/io/orc/writer_impl.cu#L1278) under which all other columns originate. That first entry (index 0) is not a column, so when selecting all columns we need to skip it and start at index 1. (Look for `Type Information` in the [ORC Specification](https://orc.apache.org/specification/ORCv1/)) Along with that, we also handle the scenario where the user asks for a specific column name that does not exist in the file, as happens with an empty DataFrame/table: the reader now raises an error for the unknown column name. Added a test case to validate both scenarios. closes #7356 Authors: - Ram (Ramakrishna Prabhu) (@rgsl888prabhu) Approvers: - Devavret Makkar (@devavret) - @nvdbaranec - Vukasin Milovanovic (@vuule) - Michael Wang (@isVoid) URL: https://github.com/rapidsai/cudf/pull/7624 --- cpp/src/io/orc/orc.cpp | 6 ++++-- cpp/src/io/orc/reader_impl.cu | 3 +++ python/cudf/cudf/tests/test_orc.py | 16 ++++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index 6c0216a6d6b..e1b6c3ace6c 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -538,26 +538,28 @@ std::vector metadata::select_columns(std::vector use_names, if (not use_names.empty()) { int index = 0; for (const auto &use_name : use_names) { + bool name_found = false; for (int i = 0; i < get_num_columns(); ++i, ++index) { if (index >= get_num_columns()) { index = 0; } if (get_column_name(index) == use_name) { + name_found = true; selection.emplace_back(index); if (ff.types[index].kind == orc::TIMESTAMP) { has_timestamp_column = true; } index++; break; } } + CUDF_EXPECTS(name_found, "Unknown column name : " + std::string(use_name)); } } 
else { // For now, only select all leaf nodes - for (int i = 0; i < get_num_columns(); ++i) { + for (int i = 1; i < get_num_columns(); ++i) { if (ff.types[i].subtypes.empty()) { selection.emplace_back(i); if (ff.types[i].kind == orc::TIMESTAMP) { has_timestamp_column = true; } } } } - CUDF_EXPECTS(selection.size() > 0, "Filtered out all columns"); return selection; } diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index e6e3ec69e43..9f88c6584ce 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -419,6 +419,9 @@ table_with_metadata reader::impl::read(size_type skip_rows, std::vector> out_columns; table_metadata out_metadata; + // There are no columns in table + if (_selected_columns.size() == 0) return {std::make_unique(), std::move(out_metadata)}; + // Select only stripes required (aka row groups) const auto selected_stripes = _metadata->select_stripes(stripes, skip_rows, num_rows); diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index fa14a0a9690..faad489da86 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -740,6 +740,22 @@ def test_nanoseconds_overflow(): assert_eq(expected.to_pandas(), pyarrow_got.to_pandas()) +def test_empty_dataframe(): + buffer = BytesIO() + expected = cudf.DataFrame() + expected.to_orc(buffer) + + # Raise error if column name is mentioned, but it doesn't exist. + with pytest.raises(RuntimeError): + cudf.read_orc(buffer, columns=["a"]) + + got_df = cudf.read_orc(buffer) + expected_pdf = pd.read_orc(buffer) + + assert_eq(expected, got_df) + assert_eq(expected_pdf, got_df) + + @pytest.mark.parametrize( "data", [[None, ""], ["", None], [None, None], ["", ""]] )