From c9588bb33a5572e53a25d69bed23857d851a6bb6 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 10 Mar 2021 17:22:58 -0800 Subject: [PATCH 1/4] LoadNonNullIndices - fix condition under which we fetch the next velid mask word --- cpp/src/io/orc/dict_enc.cu | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index de20af1bff4..5fdf541b7ca 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -73,21 +73,25 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s, for (uint32_t i = 0; i < s->chunk.num_rows; i += 512) { const uint32_t *valid_map = s->chunk.valid_map_base; uint32_t is_valid, nz_pos; - if (t < 16) { + if (t < 512 / 32) { if (!valid_map) { s->scratch_red[t] = 0xffffffffu; } else { - uint32_t row = s->chunk.start_row + i + t * 32; - uint32_t v = (row < s->chunk.start_row + s->chunk.num_rows) - ? valid_map[(row + s->chunk.column_offset) / 32] - : 0; - if (row & 0x1f) { - uint32_t v1 = (row + 32 < s->chunk.start_row + s->chunk.num_rows) - ? valid_map[((row + s->chunk.column_offset) / 32) + 1] - : 0; - v = __funnelshift_r(v, v1, row + s->chunk.column_offset); + uint32_t const row = s->chunk.start_row + i + t * 32; + auto const chunk_end = s->chunk.start_row + s->chunk.num_rows; + + auto const valid_map_idx = (row + s->chunk.column_offset) / 32; + uint32_t valid = (row < chunk_end) ? valid_map[valid_map_idx] : 0; + + auto const rows_in_next_word = (row + s->chunk.column_offset) & 0x1f; + if (rows_in_next_word != 0) { + auto const rows_in_current_word = 32 - rows_in_next_word; + // Read next word if any rows are within the chunk + uint32_t const valid_next = + (row + rows_in_current_word < chunk_end) ? valid_map[valid_map_idx + 1] : 0; + valid = __funnelshift_r(valid, valid_next, rows_in_next_word); } - s->scratch_red[t] = v; + s->scratch_red[t] = valid; } } __syncthreads(); From 07ca6413282b480e7b98df3281011f29ebc4d940 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 11 Mar 2021 13:14:14 -0800 Subject: [PATCH 2/4] add test --- cpp/tests/io/orc_test.cpp | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index cc4c9b700af..cde00d8799f 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1025,4 +1025,37 @@ TEST_F(OrcStatisticsTest, Basic) validate_statistics(stats.stripes_stats[0]); } +TEST_F(OrcWriterTest, SlicedValidMask) +{ + std::vector strings; + for (int i = 0; i < 34; ++i) + strings.emplace_back("a long string to make sure overflow affects the output"); + auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 32; }); + + column_wrapper col{strings.begin(), strings.end(), validity}; + + std::vector> cols; + cols.push_back(col.release()); + + cudf_io::table_metadata expected_metadata; + expected_metadata.column_names.emplace_back("col_string"); + + std::vector indices{31, 34}; + std::vector sliced_col = cudf::slice(cols[0]->view(), indices); + cudf::table_view tbl{sliced_col}; + + auto filepath = temp_env->get_temp_filepath("OrcStrings.orc"); + cudf_io::orc_writer_options out_opts = + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, tbl) + .metadata(&expected_metadata); + cudf_io::write_orc(out_opts); + + cudf_io::orc_reader_options in_opts = + cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); + auto result = cudf_io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(tbl, result.tbl->view()); + EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); +} + CUDF_TEST_PROGRAM_MAIN() From c230cee373456b5f1f29e8ab465ea7a8f699a2f7 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 11 Mar 2021 13:23:11 -0800 Subject: [PATCH 3/4] use block_size instead of 512 --- cpp/src/io/orc/dict_enc.cu | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 5fdf541b7ca..99157a23fcb 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -70,10 +70,10 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s, Storage &temp_storage) { if (t == 0) { s->nnz = 0; } - for (uint32_t i = 0; i < s->chunk.num_rows; i += 512) { + for (uint32_t i = 0; i < s->chunk.num_rows; i += block_size) { const uint32_t *valid_map = s->chunk.valid_map_base; uint32_t is_valid, nz_pos; - if (t < 512 / 32) { + if (t < block_size / 32) { if (!valid_map) { s->scratch_red[t] = 0xffffffffu; } else { @@ -113,7 +113,7 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s, * @param[in] chunks DictionaryChunk device array [rowgroup][column] * @param[in] num_columns Number of columns */ -// blockDim {512,1,1} +// blockDim {block_size,1,1} template __global__ void __launch_bounds__(block_size, 2) gpuInitDictionaryIndices(DictionaryChunk *chunks, uint32_t num_columns) @@ -415,9 +415,11 @@ void InitDictionaryIndices(DictionaryChunk *chunks, uint32_t num_rowgroups, rmm::cuda_stream_view stream) { - dim3 dim_block(512, 1); // 512 threads per chunk + static constexpr int block_size = 512; + dim3 dim_block(block_size, 1); dim3 dim_grid(num_columns, num_rowgroups); - gpuInitDictionaryIndices<512><<>>(chunks, num_columns); + gpuInitDictionaryIndices + <<>>(chunks, num_columns); } /** From 15c354d5d3ab61591108f37f8b6eb9b7be6f3719 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 11 Mar 2021 14:54:49 -0800 Subject: [PATCH 4/4] test comments --- cpp/tests/io/orc_test.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index cde00d8799f..b0dc01ea001 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1028,8 +1028,10 @@ TEST_F(OrcStatisticsTest, Basic) TEST_F(OrcWriterTest, SlicedValidMask) { std::vector strings; + // Need more than 32 elements to reproduce the issue for (int i = 0; i < 34; ++i) strings.emplace_back("a long string to make sure overflow affects the output"); + // An element is null only to enforce the output column to be nullable auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 32; }); column_wrapper col{strings.begin(), strings.end(), validity}; @@ -1040,6 +1042,7 @@ TEST_F(OrcWriterTest, SlicedValidMask) cudf_io::table_metadata expected_metadata; expected_metadata.column_names.emplace_back("col_string"); + // Bug tested here is easiest to reproduce when column_offset % 32 is 31 std::vector indices{31, 34}; std::vector sliced_col = cudf::slice(cols[0]->view(), indices); cudf::table_view tbl{sliced_col};