From a08ec0e1b0609716e72d81b266eeb54670ed2e33 Mon Sep 17 00:00:00 2001 From: "Ram (Ramakrishna Prabhu)" <42624703+rgsl888prabhu@users.noreply.github.com> Date: Mon, 15 Feb 2021 16:39:42 -0600 Subject: [PATCH] Fix skiprows issue with ORC Reader (#7359) closes #7343 The validity bits in streams are placed MSB to LSB in a byte, [True, False, True, False, True, True, True, False] -> 10101110. So, when they are analyzed as a 32-bit chunk, we can't apply the mask directly, which caused this issue. `__byte_perm(bits, 0, 0x0123)` takes care of that issue by reversing the byte order of the chunk, which rearranges the bits as expected so that the mask can be applied to the high-order bits. Authors: - Ram (Ramakrishna Prabhu) (@rgsl888prabhu) Approvers: - GALI PREM SAGAR (@galipremsagar) - Vukasin Milovanovic (@vuule) URL: https://github.com/rapidsai/cudf/pull/7359 --- cpp/src/io/orc/stripe_data.cu | 6 ++++-- python/cudf/cudf/tests/test_orc.py | 25 +++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 6f326fc0576..1af5e088c22 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1209,8 +1209,10 @@ __global__ void __launch_bounds__(block_size) uint32_t skippedrows = min(static_cast(first_row - row_in), nrows); uint32_t skip_count = 0; for (uint32_t i = t * 32; i < skippedrows; i += 32 * 32) { - uint32_t bits = s->vals.u32[i >> 5]; - if (i + 32 > skippedrows) { bits &= (1 << (skippedrows - i)) - 1; } + // Need to arrange the bytes to apply mask properly. + uint32_t bits = (i + 32 <= skippedrows) ? 
s->vals.u32[i >> 5] + : (__byte_perm(s->vals.u32[i >> 5], 0, 0x0123) & + (0xffffffffu << (0x20 - skippedrows + i))); skip_count += __popc(bits); } skip_count = warp_reduce(temp_storage.wr_storage[t / 32]).Sum(skip_count); diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 62696303ecb..ed91e909f25 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -8,6 +8,7 @@ import pandas as pd import pyarrow as pa import pyarrow.orc +import pyorc import pytest import cudf @@ -318,6 +319,30 @@ def test_orc_read_rows(datadir, skiprows, num_rows): np.testing.assert_allclose(pdf, gdf) +def test_orc_read_skiprows(tmpdir): + buff = BytesIO() + df = pd.DataFrame( + {"a": [1, 0, 1, 0, None, 1, 1, 1, 0, None, 0, 0, 1, 1, 1, 1]}, + dtype=pd.BooleanDtype(), + ) + writer = pyorc.Writer(buff, pyorc.Struct(a=pyorc.Boolean())) + tuples = list( + map( + lambda x: (None,) if x[0] is pd.NA else x, + list(df.itertuples(index=False, name=None)), + ) + ) + writer.writerows(tuples) + writer.close() + + skiprows = 10 + + expected = cudf.read_orc(buff)[skiprows::].reset_index(drop=True) + got = cudf.read_orc(buff, skiprows=skiprows) + + assert_eq(expected, got) + + def test_orc_reader_uncompressed_block(datadir): path = datadir / "uncompressed_snappy.orc" try: