From add4b4535999dcc200b7fdf83298b90d0495af96 Mon Sep 17 00:00:00 2001 From: Kumar Aatish Date: Fri, 26 Mar 2021 22:26:02 -0400 Subject: [PATCH] Fix string length in stripe dictionary building (#7744) In PR #7676 the length of the current string being referred to while building stripe dictionaries was always set to 0 while incrementing the dictionary character count of a StripeDictionary. This led to corrupted strings when the dictionary encoding was used as noted in issue #7741. This has been fixed in this PR. Fixes #7741 Authors: - Kumar Aatish (@kaatish) Approvers: - Vukasin Milovanovic (@vuule) - Nghia Truong (@ttnghia) URL: https://github.com/rapidsai/cudf/pull/7744 --- cpp/src/io/orc/dict_enc.cu | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 5695e882a95..e69a61bde66 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -396,7 +396,10 @@ __global__ void __launch_bounds__(block_size) uint32_t cur = (i + t < num_strings) ? dict_data[i + t] : 0; uint32_t cur_len = 0; bool is_dupe = false; - if (i + t < num_strings) { current_string = s->stripe.leaf_column->element(cur); } + if (i + t < num_strings) { + current_string = s->stripe.leaf_column->element(cur); + cur_len = current_string.size_bytes(); + } if (i + t != 0 && i + t < num_strings) { uint32_t prev = dict_data[i + t - 1]; is_dupe = (current_string == (s->stripe.leaf_column->element(prev)));