From 217d702c9cb546dc8c175defc8876fd473b5c8a0 Mon Sep 17 00:00:00 2001 From: "Ram (Ramakrishna Prabhu)" <42624703+rgsl888prabhu@users.noreply.github.com> Date: Sat, 20 Mar 2021 03:28:46 +0530 Subject: [PATCH] Fix ORC reader issue with reading empty string columns (#7656) The reader had a [condition that skipped updating the stream pointer when the data size is zero](https://github.com/rapidsai/cudf/blob/8773a40f4c8ce63f56ed6eb67b4eaf959106939f/cpp/src/io/orc/reader_impl.cu#L538). However, `["", ""]` is valid data with a size of 0, and because of that condition it was being read as `[null, null]`. The condition that caused this issue has been removed. I have also added test cases to validate the fix. closes #7620 Authors: - Ram (Ramakrishna Prabhu) (@rgsl888prabhu) Approvers: - Devavret Makkar (@devavret) - Vukasin Milovanovic (@vuule) - Keith Kraus (@kkraus14) URL: https://github.com/rapidsai/cudf/pull/7656 --- cpp/src/io/orc/reader_impl.cu | 4 +--- python/cudf/cudf/tests/test_orc.py | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 61adef26dab..2567b2579d7 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -535,9 +535,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, chunk.ts_clock_rate = to_clockrate(_timestamp_type.id()); } for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { - if (chunk.strm_len[k] > 0) { - chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; - } + chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; } } stripe_start_row += stripe_info->numberOfRows; diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index ca8aa00f80c..fa14a0a9690 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -738,3 +738,19 @@ def test_nanoseconds_overflow(): pyarrow_got = pa.orc.ORCFile(buffer).read() assert_eq(expected.to_pandas(), 
pyarrow_got.to_pandas()) + + +@pytest.mark.parametrize( + "data", [[None, ""], ["", None], [None, None], ["", ""]] +) +def test_empty_string_columns(data): + buffer = BytesIO() + + expected = cudf.DataFrame({"string": data}, dtype="str") + expected.to_orc(buffer) + + expected_pdf = pd.read_orc(buffer) + got_df = cudf.read_orc(buffer) + + assert_eq(expected, got_df) + assert_eq(expected_pdf, got_df)