From 4f87a59e4fa9303ce67ade3a7d572ce73dca12a9 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 3 Feb 2021 23:16:13 -0800 Subject: [PATCH] Throw if bool column would cause incorrect result when writing to ORC (#7261) Issue #6763 Authors: - Vukasin Milovanovic (@vuule) Approvers: - Ram (Ramakrishna Prabhu) (@rgsl888prabhu) - @nvdbaranec - GALI PREM SAGAR (@galipremsagar) - Keith Kraus (@kkraus14) URL: https://github.com/rapidsai/cudf/pull/7261 --- cpp/src/io/orc/writer_impl.cu | 30 ++++++++++++++++++++++++++++++ python/cudf/cudf/tests/test_orc.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index f1f3d1de9ed..7198539a460 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -516,6 +516,11 @@ std::vector writer::impl::gather_streams(orc_column_view *columns, return streams; } +struct segmented_valid_cnt_input { + bitmask_type const *mask; + std::vector indices; +}; + rmm::device_buffer writer::impl::encode_columns(orc_column_view *columns, size_t num_columns, size_t num_rows, @@ -555,6 +560,7 @@ rmm::device_buffer writer::impl::encode_columns(orc_column_view *columns, // Initialize column chunks' descriptions size_t stripe_start = 0; size_t stripe_id = 0; + std::map validity_check_inputs; for (size_t j = 0; j < num_rowgroups; j++) { for (size_t i = 0; i < num_columns; i++) { auto *ck = &chunks[j * num_columns + i]; @@ -578,6 +584,20 @@ rmm::device_buffer writer::impl::encode_columns(orc_column_view *columns, } ck->scale = columns[i].clockscale(); + // Only need to check row groups that end within the stripe + if (ck->type_kind == TypeKind::BOOLEAN && columns[i].nullable() && + j + 1 != stripe_start + stripe_list[stripe_id]) { + auto curr_cnt_in = validity_check_inputs.find(i); + if (curr_cnt_in == validity_check_inputs.end()) { + bool unused; + // add new object + std::tie(curr_cnt_in, unused) = validity_check_inputs.insert({i, {columns[i].nulls()}}); + } + // append row group start and end to existing object + curr_cnt_in->second.indices.push_back(ck->start_row); + curr_cnt_in->second.indices.push_back(ck->start_row + ck->num_rows); + } + for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { const auto strm_id = strm_ids[i * gpu::CI_NUM_STREAMS + k]; @@ -631,6 +651,16 @@ rmm::device_buffer writer::impl::encode_columns(orc_column_view *columns, } } + for (auto &cnt_in : validity_check_inputs) { + auto const valid_counts = segmented_count_set_bits(cnt_in.second.mask, cnt_in.second.indices); + CUDF_EXPECTS( + std::none_of(valid_counts.cbegin(), + valid_counts.cend(), + [](auto valid_count) { return valid_count % 8; }), + "There's currently a bug in encoding boolean columns. Suggested workaround is to convert to " + "int8 type. Please see https://github.com/rapidsai/cudf/issues/6763 for more information."); + } + chunks.host_to_device(stream); if (!str_col_ids.empty()) { auto d_stripe_dict = columns[str_col_ids[0]].device_stripe_dict(); diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 85e61acd8e6..cf3da26258e 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -571,6 +571,9 @@ def normalized_equals(value1, value2): @pytest.mark.parametrize("nrows", [1, 100, 6000000]) def test_orc_write_statistics(tmpdir, datadir, nrows): supported_stat_types = supported_numpy_dtypes + ["str"] + # Can't write random bool columns until issue #6763 is fixed + if nrows == 6000000: + supported_stat_types.remove("bool") # Make a dataframe gdf = cudf.DataFrame( @@ -670,3 +673,29 @@ def test_orc_reader_gmt_timestamps(datadir): pdf = orcfile.read().to_pandas() gdf = cudf.read_orc(path, engine="cudf").to_pandas() assert_eq(pdf, gdf) + + +def test_orc_bool_encode_fail(): + np.random.seed(0) + + # Generate a boolean column longer than a single stripe + fail_df = cudf.DataFrame({"col": gen_rand_series("bool", 600000)}) + # Invalidate the first row in the second stripe to break encoding + fail_df["col"][500000] = None + + # Should throw instead of generating a file that is incompatible + # with other readers (see issue #6763) + with pytest.raises(RuntimeError): + fail_df.to_orc("should_throw.orc") + + # Generate a boolean column that fits into a single stripe + okay_df = cudf.DataFrame({"col": gen_rand_series("bool", 500000)}) + okay_df["col"][500000 - 1] = None + fname = "single_stripe.orc" + # Invalid row is in the last row group of the stripe; + # encoding is assumed to be correct + okay_df.to_orc(fname) + + # Also validate data + pdf = pa.orc.ORCFile(fname).read().to_pandas() + assert_eq(okay_df, pdf)