diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index 6c0216a6d6b..e1b6c3ace6c 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -538,26 +538,28 @@ std::vector metadata::select_columns(std::vector use_names, if (not use_names.empty()) { int index = 0; for (const auto &use_name : use_names) { + bool name_found = false; for (int i = 0; i < get_num_columns(); ++i, ++index) { if (index >= get_num_columns()) { index = 0; } if (get_column_name(index) == use_name) { + name_found = true; selection.emplace_back(index); if (ff.types[index].kind == orc::TIMESTAMP) { has_timestamp_column = true; } index++; break; } } + CUDF_EXPECTS(name_found, "Unknown column name : " + std::string(use_name)); } } else { // For now, only select all leaf nodes - for (int i = 0; i < get_num_columns(); ++i) { + for (int i = 1; i < get_num_columns(); ++i) { if (ff.types[i].subtypes.empty()) { selection.emplace_back(i); if (ff.types[i].kind == orc::TIMESTAMP) { has_timestamp_column = true; } } } } - CUDF_EXPECTS(selection.size() > 0, "Filtered out all columns"); return selection; } diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index e6e3ec69e43..9f88c6584ce 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -419,6 +419,9 @@ table_with_metadata reader::impl::read(size_type skip_rows, std::vector> out_columns; table_metadata out_metadata; + // There are no columns in table + if (_selected_columns.size() == 0) return {std::make_unique(), std::move(out_metadata)}; + // Select only stripes required (aka row groups) const auto selected_stripes = _metadata->select_stripes(stripes, skip_rows, num_rows); diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index fa14a0a9690..faad489da86 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -740,6 +740,22 @@ def test_nanoseconds_overflow(): assert_eq(expected.to_pandas(), pyarrow_got.to_pandas()) +def test_empty_dataframe(): + buffer = BytesIO() + expected = cudf.DataFrame() + expected.to_orc(buffer) + + # Raise error if column name is mentioned, but it doesn't exist. + with pytest.raises(RuntimeError): + cudf.read_orc(buffer, columns=["a"]) + + got_df = cudf.read_orc(buffer) + expected_pdf = pd.read_orc(buffer) + + assert_eq(expected, got_df) + assert_eq(expected_pdf, got_df) + + @pytest.mark.parametrize( "data", [[None, ""], ["", None], [None, None], ["", ""]] )