From 4f87a59e4fa9303ce67ade3a7d572ce73dca12a9 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Wed, 3 Feb 2021 23:16:13 -0800
Subject: [PATCH] Throw if bool column would cause incorrect result when
 writing to ORC (#7261)

Issue #6763

Authors:
  - Vukasin Milovanovic (@vuule)

Approvers:
  - Ram (Ramakrishna Prabhu) (@rgsl888prabhu)
  - @nvdbaranec
  - GALI PREM SAGAR (@galipremsagar)
  - Keith Kraus (@kkraus14)

URL: https://github.com/rapidsai/cudf/pull/7261
---
 cpp/src/io/orc/writer_impl.cu      | 30 ++++++++++++++++++++++++++++++
 python/cudf/cudf/tests/test_orc.py | 29 +++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+)
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index f1f3d1de9ed..7198539a460 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -516,6 +516,11 @@ std::vector<Stream> writer::impl::gather_streams(orc_column_view *columns,
   return streams;
 }
 
+struct segmented_valid_cnt_input {
+  bitmask_type const *mask;
+  std::vector<size_type> indices;
+};
+
 rmm::device_buffer writer::impl::encode_columns(orc_column_view *columns,
                                                 size_t num_columns,
                                                 size_t num_rows,
@@ -555,6 +560,7 @@ rmm::device_buffer writer::impl::encode_columns(orc_column_view *columns,
   // Initialize column chunks' descriptions
   size_t stripe_start = 0;
   size_t stripe_id    = 0;
+  std::map<size_type, segmented_valid_cnt_input> validity_check_inputs;
   for (size_t j = 0; j < num_rowgroups; j++) {
     for (size_t i = 0; i < num_columns; i++) {
       auto *ck          = &chunks[j * num_columns + i];
@@ -578,6 +584,20 @@ rmm::device_buffer writer::impl::encode_columns(orc_column_view *columns,
       }
       ck->scale = columns[i].clockscale();
 
+      // Only need to check row groups that end within the stripe
+      if (ck->type_kind == TypeKind::BOOLEAN && columns[i].nullable() &&
+          j + 1 != stripe_start + stripe_list[stripe_id]) {
+        auto curr_cnt_in = validity_check_inputs.find(i);
+        if (curr_cnt_in == validity_check_inputs.end()) {
+          bool unused;
+          // add new object
+          std::tie(curr_cnt_in, unused) = validity_check_inputs.insert({i, {columns[i].nulls()}});
+        }
+        // append row group start and end to existing object
+        curr_cnt_in->second.indices.push_back(ck->start_row);
+        curr_cnt_in->second.indices.push_back(ck->start_row + ck->num_rows);
+      }
+
       for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) {
         const auto strm_id = strm_ids[i * gpu::CI_NUM_STREAMS + k];
 
@@ -631,6 +651,16 @@ rmm::device_buffer writer::impl::encode_columns(orc_column_view *columns,
     }
   }
 
+  for (auto &cnt_in : validity_check_inputs) {
+    auto const valid_counts = segmented_count_set_bits(cnt_in.second.mask, cnt_in.second.indices);
+    CUDF_EXPECTS(
+      std::none_of(valid_counts.cbegin(),
+                   valid_counts.cend(),
+                   [](auto valid_count) { return valid_count % 8; }),
+      "There's currently a bug in encoding boolean columns. Suggested workaround is to convert to "
+      "int8 type. Please see https://github.com/rapidsai/cudf/issues/6763 for more information.");
+  }
+
   chunks.host_to_device(stream);
   if (!str_col_ids.empty()) {
     auto d_stripe_dict = columns[str_col_ids[0]].device_stripe_dict();
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 85e61acd8e6..cf3da26258e 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -571,6 +571,9 @@ def normalized_equals(value1, value2):
 @pytest.mark.parametrize("nrows", [1, 100, 6000000])
 def test_orc_write_statistics(tmpdir, datadir, nrows):
     supported_stat_types = supported_numpy_dtypes + ["str"]
+    # Can't write random bool columns until issue #6763 is fixed
+    if nrows == 6000000:
+        supported_stat_types.remove("bool")
 
     # Make a dataframe
     gdf = cudf.DataFrame(
@@ -670,3 +673,29 @@ def test_orc_reader_gmt_timestamps(datadir):
     pdf = orcfile.read().to_pandas()
     gdf = cudf.read_orc(path, engine="cudf").to_pandas()
     assert_eq(pdf, gdf)
+
+
+def test_orc_bool_encode_fail():
+    np.random.seed(0)
+
+    # Generate a boolean column longer than a single stripe
+    fail_df = cudf.DataFrame({"col": gen_rand_series("bool", 600000)})
+    # Invalidate the first row in the second stripe to break encoding
+    fail_df["col"][500000] = None
+
+    # Should throw instead of generating a file that is incompatible
+    # with other readers (see issue #6763)
+    with pytest.raises(RuntimeError):
+        fail_df.to_orc("should_throw.orc")
+
+    # Generate a boolean column that fits into a single stripe
+    okay_df = cudf.DataFrame({"col": gen_rand_series("bool", 500000)})
+    okay_df["col"][500000 - 1] = None
+    fname = "single_stripe.orc"
+    # Invalid row is in the last row group of the stripe;
+    # encoding is assumed to be correct
+    okay_df.to_orc(fname)
+
+    # Also validate data
+    pdf = pa.orc.ORCFile(fname).read().to_pandas()
+    assert_eq(okay_df, pdf)