Skip to content

Commit

Permalink
Merge pull request #8542 from rapidsai/branch-21.06
Browse files Browse the repository at this point in the history
[RELEASE] cudf v21.06.01
  • Loading branch information
raydouglass authored Jun 17, 2021
2 parents 3b831c3 + b9a1165 commit d56ac1d
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 1 deletion.
9 changes: 8 additions & 1 deletion cpp/src/io/orc/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -565,6 +565,8 @@ orc_streams::orc_stream_offsets orc_streams::compute_offsets(
// Everything else uses RLE
return true;
}();
// non-RLE and RLE streams are separated in the buffer that stores encoded data
// The computed offsets do not take the streams of the other type into account
if (is_rle_data) {
strm_offsets[i] = rle_data_size;
rle_data_size += (stream.length * num_rowgroups + 7) & ~7;
Expand Down Expand Up @@ -681,6 +683,10 @@ encoded_data writer::impl::encode_columns(const table_device_view &view,
: (((stripe_dict->num_strings + 0x1ff) >> 9) * (512 * 4 + 2));
if (stripe.id == 0) {
strm.data_ptrs[strm_type] = encoded_data.data() + stream_offsets.offsets[strm_id];
// Dictionary lengths are encoded as RLE, which are all stored after non-RLE data:
// include non-RLE data size in the offset only in that case
if (strm_type == gpu::CI_DATA2 && ck.encoding_kind == DICTIONARY_V2)
strm.data_ptrs[strm_type] += stream_offsets.non_rle_data_size;
} else {
auto const &strm_up = col_streams[stripe_dict[-dict_stride].start_chunk];
strm.data_ptrs[strm_type] =
Expand Down Expand Up @@ -710,7 +716,8 @@ encoded_data writer::impl::encode_columns(const table_device_view &view,
: (col_streams[rg_idx - 1].data_ptrs[strm_type] +
col_streams[rg_idx - 1].lengths[strm_type]);
} else {
strm.lengths[strm_type] = streams[strm_id].length;
strm.lengths[strm_type] = streams[strm_id].length;
// RLE encoded streams are stored after all non-RLE streams
strm.data_ptrs[strm_type] = encoded_data.data() + stream_offsets.non_rle_data_size +
stream_offsets.offsets[strm_id] +
streams[strm_id].length * rg_idx;
Expand Down
15 changes: 15 additions & 0 deletions python/cudf/cudf/tests/test_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import pyarrow.orc
import pyorc
import pytest
import decimal

import cudf
from cudf.io.orc import ORCWriter
Expand Down Expand Up @@ -780,3 +781,17 @@ def test_orc_writer_decimal(tmpdir, scale):

got = pd.read_orc(fname)
assert_eq(expected.to_pandas()["dec_val"], got["dec_val"])


def test_orc_string_stream_offset_issue():
size = 30000
vals = {
str(x): [decimal.Decimal(1)] * size if x != 0 else ["XYZ"] * size
for x in range(0, 5)
}
df = cudf.DataFrame(vals)

buffer = BytesIO()
df.to_orc(buffer)

assert_eq(df, cudf.read_orc(buffer))

0 comments on commit d56ac1d

Please sign in to comment.