Skip to content

Commit

Permalink
Throw if bool column would cause incorrect result when writing to ORC (#7261)
Browse files Browse the repository at this point in the history

Issue #6763

Authors:
  - Vukasin Milovanovic (@vuule)

Approvers:
  - Ram (Ramakrishna Prabhu) (@rgsl888prabhu)
  - @nvdbaranec
  - GALI PREM SAGAR (@galipremsagar)
  - Keith Kraus (@kkraus14)

URL: #7261
  • Loading branch information
vuule authored Feb 4, 2021
1 parent 369ec98 commit 4f87a59
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 0 deletions.
30 changes: 30 additions & 0 deletions cpp/src/io/orc/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,11 @@ std::vector<Stream> writer::impl::gather_streams(orc_column_view *columns,
return streams;
}

// Inputs for one `segmented_count_set_bits` call over a single column's null mask.
// Built with aggregate initialization from just the mask, so member order matters.
struct segmented_valid_cnt_input {
// Null mask whose set bits are counted; callers store `columns[i].nulls()` here.
bitmask_type const *mask;
// Flat list of segment bounds appended in pairs: a row group's `start_row` followed by
// `start_row + num_rows`.
std::vector<size_type> indices;
};

rmm::device_buffer writer::impl::encode_columns(orc_column_view *columns,
size_t num_columns,
size_t num_rows,
Expand Down Expand Up @@ -555,6 +560,7 @@ rmm::device_buffer writer::impl::encode_columns(orc_column_view *columns,
// Initialize column chunks' descriptions
size_t stripe_start = 0;
size_t stripe_id = 0;
std::map<size_type, segmented_valid_cnt_input> validity_check_inputs;
for (size_t j = 0; j < num_rowgroups; j++) {
for (size_t i = 0; i < num_columns; i++) {
auto *ck = &chunks[j * num_columns + i];
Expand All @@ -578,6 +584,20 @@ rmm::device_buffer writer::impl::encode_columns(orc_column_view *columns,
}
ck->scale = columns[i].clockscale();

// Row groups that close out a stripe are exempt from the check; every other row group of a
// nullable boolean column has its row range recorded for the valid-count check.
if (ck->type_kind == TypeKind::BOOLEAN && columns[i].nullable() &&
    j + 1 != stripe_start + stripe_list[stripe_id]) {
  auto input_it = validity_check_inputs.find(i);
  if (input_it == validity_check_inputs.end()) {
    // First row group seen for this column: create its entry, seeded with the null mask
    input_it = validity_check_inputs.insert({i, {columns[i].nulls()}}).first;
  }
  // Record this row group's [start, end) bounds as two consecutive indices
  auto &segment_bounds = input_it->second.indices;
  segment_bounds.push_back(ck->start_row);
  segment_bounds.push_back(ck->start_row + ck->num_rows);
}

for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) {
const auto strm_id = strm_ids[i * gpu::CI_NUM_STREAMS + k];

Expand Down Expand Up @@ -631,6 +651,16 @@ rmm::device_buffer writer::impl::encode_columns(orc_column_view *columns,
}
}

// For every column collected above, count the valid bits in each recorded row group and
// refuse to continue unless each count is a multiple of 8 (see the linked issue).
for (auto &column_input : validity_check_inputs) {
  auto &check_input = column_input.second;
  auto const valid_counts = segmented_count_set_bits(check_input.mask, check_input.indices);
  bool const all_counts_multiple_of_8 =
    std::all_of(valid_counts.cbegin(), valid_counts.cend(), [](auto valid_count) {
      return valid_count % 8 == 0;
    });
  CUDF_EXPECTS(
    all_counts_multiple_of_8,
    "There's currently a bug in encoding boolean columns. Suggested workaround is to convert to "
    "int8 type. Please see https://github.com/rapidsai/cudf/issues/6763 for more information.");
}

chunks.host_to_device(stream);
if (!str_col_ids.empty()) {
auto d_stripe_dict = columns[str_col_ids[0]].device_stripe_dict();
Expand Down
29 changes: 29 additions & 0 deletions python/cudf/cudf/tests/test_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,9 @@ def normalized_equals(value1, value2):
@pytest.mark.parametrize("nrows", [1, 100, 6000000])
def test_orc_write_statistics(tmpdir, datadir, nrows):
supported_stat_types = supported_numpy_dtypes + ["str"]
# Can't write random bool columns until issue #6763 is fixed
if nrows == 6000000:
supported_stat_types.remove("bool")

# Make a dataframe
gdf = cudf.DataFrame(
Expand Down Expand Up @@ -670,3 +673,29 @@ def test_orc_reader_gmt_timestamps(datadir):
pdf = orcfile.read().to_pandas()
gdf = cudf.read_orc(path, engine="cudf").to_pandas()
assert_eq(pdf, gdf)


def test_orc_bool_encode_fail():
    """Exercise the ORC writer's guard for nullable boolean columns (issue #6763).

    Writing must raise ``RuntimeError`` when a null lands where the boolean
    encoding would be incorrect, and must succeed (and round-trip through
    pyarrow) when the null is in the last row group of a stripe.

    All output goes to a temporary directory so the test leaves no files
    behind in the current working directory.
    """
    # Function-scope imports keep this test independent of the module's import block
    import os
    import tempfile

    np.random.seed(0)

    with tempfile.TemporaryDirectory() as out_dir:
        # Generate a boolean column longer than a single stripe
        fail_df = cudf.DataFrame({"col": gen_rand_series("bool", 600000)})
        # Invalidate the first row in the second stripe to break encoding
        fail_df["col"][500000] = None

        # Should throw instead of generating a file that is incompatible
        # with other readers (see issue #6763)
        with pytest.raises(RuntimeError):
            fail_df.to_orc(os.path.join(out_dir, "should_throw.orc"))

        # Generate a boolean column that fits into a single stripe
        okay_df = cudf.DataFrame({"col": gen_rand_series("bool", 500000)})
        okay_df["col"][500000 - 1] = None
        fname = os.path.join(out_dir, "single_stripe.orc")
        # Invalid row is in the last row group of the stripe;
        # encoding is assumed to be correct
        okay_df.to_orc(fname)

        # Also validate data (read back before the temporary directory is removed)
        pdf = pa.orc.ORCFile(fname).read().to_pandas()
        assert_eq(okay_df, pdf)

0 comments on commit 4f87a59

Please sign in to comment.