Skip to content

Commit

Permalink
Fix skiprows issue with ORC Reader (#7359)
Browse files Browse the repository at this point in the history
closes #7343

The validity bits in the streams are placed MSB to LSB within a byte, e.g. [True, False, True, False, True, True, True, False] -> 10101110.
So, when the bits are analyzed as a 32-bit chunk, we can't apply the mask directly, which caused this issue. `__brev(__byte_perm(bits, 0, 0x0123))` takes care of that issue and rearranges the bits as expected.

Authors:
  - Ram (Ramakrishna Prabhu) (@rgsl888prabhu)

Approvers:
  - GALI PREM SAGAR (@galipremsagar)
  - Vukasin Milovanovic (@vuule)

URL: #7359
  • Loading branch information
rgsl888prabhu authored Feb 15, 2021
1 parent 92c4b26 commit a08ec0e
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 2 deletions.
6 changes: 4 additions & 2 deletions cpp/src/io/orc/stripe_data.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1209,8 +1209,10 @@ __global__ void __launch_bounds__(block_size)
uint32_t skippedrows = min(static_cast<uint32_t>(first_row - row_in), nrows);
uint32_t skip_count = 0;
for (uint32_t i = t * 32; i < skippedrows; i += 32 * 32) {
uint32_t bits = s->vals.u32[i >> 5];
if (i + 32 > skippedrows) { bits &= (1 << (skippedrows - i)) - 1; }
// Need to arrange the bytes to apply mask properly.
uint32_t bits = (i + 32 <= skippedrows) ? s->vals.u32[i >> 5]
: (__byte_perm(s->vals.u32[i >> 5], 0, 0x0123) &
(0xffffffffu << (0x20 - skippedrows + i)));
skip_count += __popc(bits);
}
skip_count = warp_reduce(temp_storage.wr_storage[t / 32]).Sum(skip_count);
Expand Down
25 changes: 25 additions & 0 deletions python/cudf/cudf/tests/test_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pandas as pd
import pyarrow as pa
import pyarrow.orc
import pyorc
import pytest

import cudf
Expand Down Expand Up @@ -318,6 +319,30 @@ def test_orc_read_rows(datadir, skiprows, num_rows):
np.testing.assert_allclose(pdf, gdf)


def test_orc_read_skiprows(tmpdir):
    """Compare ``read_orc(skiprows=...)`` against slicing a full read.

    A 16-row nullable boolean column is written to an in-memory ORC
    buffer with pyorc; reading it with ``skiprows=10`` must equal reading
    everything and dropping the first 10 rows.
    """
    source = pd.DataFrame(
        {"a": [1, 0, 1, 0, None, 1, 1, 1, 0, None, 0, 0, 1, 1, 1, 1]},
        dtype=pd.BooleanDtype(),
    )

    # Rows whose single value is pd.NA are handed to the writer as (None,).
    rows = [
        (None,) if row[0] is pd.NA else row
        for row in source.itertuples(index=False, name=None)
    ]

    buff = BytesIO()
    writer = pyorc.Writer(buff, pyorc.Struct(a=pyorc.Boolean()))
    writer.writerows(rows)
    writer.close()

    skiprows = 10

    # Reference result: full read, then discard the leading rows.
    full_read = cudf.read_orc(buff)
    expected = full_read[skiprows:].reset_index(drop=True)
    got = cudf.read_orc(buff, skiprows=skiprows)

    assert_eq(expected, got)


def test_orc_reader_uncompressed_block(datadir):
path = datadir / "uncompressed_snappy.orc"
try:
Expand Down

0 comments on commit a08ec0e

Please sign in to comment.