diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 856c23c0f55..cf28ee12af7 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1211,8 +1211,10 @@ __global__ void __launch_bounds__(block_size) uint32_t skippedrows = min(static_cast<uint32_t>(first_row - row_in), nrows); uint32_t skip_count = 0; for (uint32_t i = t * 32; i < skippedrows; i += 32 * 32) { - uint32_t bits = s->vals.u32[i >> 5]; - if (i + 32 > skippedrows) { bits &= (1 << (skippedrows - i)) - 1; } + // Need to arrange the bytes to apply mask properly. + uint32_t bits = (i + 32 <= skippedrows) ? s->vals.u32[i >> 5] + : (__byte_perm(s->vals.u32[i >> 5], 0, 0x0123) & + (0xffffffffu << (0x20 - skippedrows + i))); skip_count += __popc(bits); } skip_count = warp_reduce(temp_storage[t / 32]).Sum(skip_count); diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 62696303ecb..ed91e909f25 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -8,6 +8,7 @@ import pandas as pd import pyarrow as pa import pyarrow.orc +import pyorc import pytest import cudf @@ -318,6 +319,30 @@ def test_orc_read_rows(datadir, skiprows, num_rows): np.testing.assert_allclose(pdf, gdf) +def test_orc_read_skiprows(tmpdir): + buff = BytesIO() + df = pd.DataFrame( + {"a": [1, 0, 1, 0, None, 1, 1, 1, 0, None, 0, 0, 1, 1, 1, 1]}, + dtype=pd.BooleanDtype(), + ) + writer = pyorc.Writer(buff, pyorc.Struct(a=pyorc.Boolean())) + tuples = list( + map( + lambda x: (None,) if x[0] is pd.NA else x, + list(df.itertuples(index=False, name=None)), + ) + ) + writer.writerows(tuples) + writer.close() + + skiprows = 10 + + expected = cudf.read_orc(buff)[skiprows::].reset_index(drop=True) + got = cudf.read_orc(buff, skiprows=skiprows) + + assert_eq(expected, got) + + def test_orc_reader_uncompressed_block(datadir): path = datadir / "uncompressed_snappy.orc" try: