From ff0c5378ae99df797fdeffda9ac8728a8c63d579 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 12 Mar 2021 13:08:46 -0800 Subject: [PATCH] Fix ORC writer output corruption with string columns (#7565) Closes: #7346 Fixes an issue in ORC writer where null counting would not read the mask for every row. The issue occurs when the column offset is not divisible by 32 so that two words are always read to get 32bits of mask (each read is effectively offset by the columns offset, so when reading the mask for 32 rows, we need to get two words to account for the offset). Namely, the second word is not read when the row is closer than 32 to the end of the chunk. This condition is incorrect for most column offsets, as the current row is not really the first bit of the mask word. The fix is to adjust the condition when the second mask word is read (assuming that mask in padded to multiple of 32). Authors: - Vukasin Milovanovic (@vuule) Approvers: - @nvdbaranec - Mike Wilson (@hyperbolic2346) - Devavret Makkar (@devavret) URL: https://github.com/rapidsai/cudf/pull/7565 --- cpp/src/io/orc/dict_enc.cu | 36 +++++++++++++++++++++--------------- cpp/tests/io/orc_test.cpp | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 15 deletions(-) diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index de20af1bff4..99157a23fcb 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -70,24 +70,28 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s, Storage &temp_storage) { if (t == 0) { s->nnz = 0; } - for (uint32_t i = 0; i < s->chunk.num_rows; i += 512) { + for (uint32_t i = 0; i < s->chunk.num_rows; i += block_size) { const uint32_t *valid_map = s->chunk.valid_map_base; uint32_t is_valid, nz_pos; - if (t < 16) { + if (t < block_size / 32) { if (!valid_map) { s->scratch_red[t] = 0xffffffffu; } else { - uint32_t row = s->chunk.start_row + i + t * 32; - uint32_t v = (row < s->chunk.start_row + s->chunk.num_rows) - ? valid_map[(row + s->chunk.column_offset) / 32] - : 0; - if (row & 0x1f) { - uint32_t v1 = (row + 32 < s->chunk.start_row + s->chunk.num_rows) - ? valid_map[((row + s->chunk.column_offset) / 32) + 1] - : 0; - v = __funnelshift_r(v, v1, row + s->chunk.column_offset); + uint32_t const row = s->chunk.start_row + i + t * 32; + auto const chunk_end = s->chunk.start_row + s->chunk.num_rows; + + auto const valid_map_idx = (row + s->chunk.column_offset) / 32; + uint32_t valid = (row < chunk_end) ? valid_map[valid_map_idx] : 0; + + auto const rows_in_next_word = (row + s->chunk.column_offset) & 0x1f; + if (rows_in_next_word != 0) { + auto const rows_in_current_word = 32 - rows_in_next_word; + // Read next word if any rows are within the chunk + uint32_t const valid_next = + (row + rows_in_current_word < chunk_end) ? valid_map[valid_map_idx + 1] : 0; + valid = __funnelshift_r(valid, valid_next, rows_in_next_word); } - s->scratch_red[t] = v; + s->scratch_red[t] = valid; } } __syncthreads(); @@ -109,7 +113,7 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s, * @param[in] chunks DictionaryChunk device array [rowgroup][column] * @param[in] num_columns Number of columns */ -// blockDim {512,1,1} +// blockDim {block_size,1,1} template __global__ void __launch_bounds__(block_size, 2) gpuInitDictionaryIndices(DictionaryChunk *chunks, uint32_t num_columns) @@ -411,9 +415,11 @@ void InitDictionaryIndices(DictionaryChunk *chunks, uint32_t num_rowgroups, rmm::cuda_stream_view stream) { - dim3 dim_block(512, 1); // 512 threads per chunk + static constexpr int block_size = 512; + dim3 dim_block(block_size, 1); dim3 dim_grid(num_columns, num_rowgroups); - gpuInitDictionaryIndices<512><<>>(chunks, num_columns); + gpuInitDictionaryIndices + <<>>(chunks, num_columns); } /** diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index cc4c9b700af..b0dc01ea001 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1025,4 +1025,40 @@ TEST_F(OrcStatisticsTest, Basic) validate_statistics(stats.stripes_stats[0]); } +TEST_F(OrcWriterTest, SlicedValidMask) +{ + std::vector strings; + // Need more than 32 elements to reproduce the issue + for (int i = 0; i < 34; ++i) + strings.emplace_back("a long string to make sure overflow affects the output"); + // An element is null only to enforce the output column to be nullable + auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 32; }); + + column_wrapper col{strings.begin(), strings.end(), validity}; + + std::vector> cols; + cols.push_back(col.release()); + + cudf_io::table_metadata expected_metadata; + expected_metadata.column_names.emplace_back("col_string"); + + // Bug tested here is easiest to reproduce when column_offset % 32 is 31 + std::vector indices{31, 34}; + std::vector sliced_col = cudf::slice(cols[0]->view(), indices); + cudf::table_view tbl{sliced_col}; + + auto filepath = temp_env->get_temp_filepath("OrcStrings.orc"); + cudf_io::orc_writer_options out_opts = + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, tbl) + .metadata(&expected_metadata); + cudf_io::write_orc(out_opts); + + cudf_io::orc_reader_options in_opts = + cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); + auto result = cudf_io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(tbl, result.tbl->view()); + EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); +} + CUDF_TEST_PROGRAM_MAIN()