Fix skiprows issue with ORC Reader (#7359)

Closes #7343. The validity bits in streams are placed MSB to LSB within each byte, e.g. [True, False, True, False, True, True, True, False] -> 10101110. So when the bits are analyzed as a 32-bit chunk, the mask cannot be applied directly, which caused this issue. `__brev(__byte_perm(bits, 0, 0x0123))` takes care of that issue and rearranges the bits as expected. Authors: - Ram (Ramakrishna Prabhu) (@rgsl888prabhu) Approvers: - GALI PREM SAGAR (@galipremsagar) - Vukasin Milovanovic (@vuule) URL: #7359
rapidsai · Feb 15, 2021 · a08ec0e · a08ec0e
1 parent 92c4b26
commit a08ec0e
Show file tree

Hide file tree

Showing 2 changed files with 29 additions and 2 deletions.
diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu
@@ -1209,8 +1209,10 @@ __global__ void __launch_bounds__(block_size)
         uint32_t skippedrows = min(static_cast<uint32_t>(first_row - row_in), nrows);
         uint32_t skip_count  = 0;
         for (uint32_t i = t * 32; i < skippedrows; i += 32 * 32) {
-          uint32_t bits = s->vals.u32[i >> 5];
-          if (i + 32 > skippedrows) { bits &= (1 << (skippedrows - i)) - 1; }
+          // Need to arrange the bytes to apply mask properly.
+          uint32_t bits = (i + 32 <= skippedrows) ? s->vals.u32[i >> 5]
+                                                  : (__byte_perm(s->vals.u32[i >> 5], 0, 0x0123) &
+                                                     (0xffffffffu << (0x20 - skippedrows + i)));
           skip_count += __popc(bits);
         }
         skip_count = warp_reduce(temp_storage.wr_storage[t / 32]).Sum(skip_count);

diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
@@ -8,6 +8,7 @@
 import pandas as pd
 import pyarrow as pa
 import pyarrow.orc
+import pyorc
 import pytest
 
 import cudf
@@ -318,6 +319,30 @@ def test_orc_read_rows(datadir, skiprows, num_rows):
     np.testing.assert_allclose(pdf, gdf)
 
 
+def test_orc_read_skiprows(tmpdir):  # regression test for reading ORC with skiprows; tmpdir is unused here
+    buff = BytesIO()  # in-memory target the ORC writer writes into
+    df = pd.DataFrame(
+        {"a": [1, 0, 1, 0, None, 1, 1, 1, 0, None, 0, 0, 1, 1, 1, 1]},  # 16 values, None at positions 4 and 9
+        dtype=pd.BooleanDtype(),
+    )
+    writer = pyorc.Writer(buff, pyorc.Struct(a=pyorc.Boolean()))  # schema: one boolean column named "a"
+    tuples = list(
+        map(
+            lambda x: (None,) if x[0] is pd.NA else x,  # pyorc rows get None in place of pd.NA
+            list(df.itertuples(index=False, name=None)),  # plain tuples, index excluded
+        )
+    )
+    writer.writerows(tuples)
+    writer.close()
+
+    skiprows = 10  # 10 of the 16 rows are skipped (not a multiple of 8); both None entries fall in the skipped range
+
+    expected = cudf.read_orc(buff)[skiprows::].reset_index(drop=True)  # reference: read everything, then drop the leading rows
+    got = cudf.read_orc(buff, skiprows=skiprows)  # under test: the reader is asked to skip the rows itself
+
+    assert_eq(expected, got)
+
+
 def test_orc_reader_uncompressed_block(datadir):
     path = datadir / "uncompressed_snappy.orc"
     try: