diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index de20af1bff4..99157a23fcb 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -70,24 +70,28 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s, Storage &temp_storage) { if (t == 0) { s->nnz = 0; } - for (uint32_t i = 0; i < s->chunk.num_rows; i += 512) { + for (uint32_t i = 0; i < s->chunk.num_rows; i += block_size) { const uint32_t *valid_map = s->chunk.valid_map_base; uint32_t is_valid, nz_pos; - if (t < 16) { + if (t < block_size / 32) { if (!valid_map) { s->scratch_red[t] = 0xffffffffu; } else { - uint32_t row = s->chunk.start_row + i + t * 32; - uint32_t v = (row < s->chunk.start_row + s->chunk.num_rows) - ? valid_map[(row + s->chunk.column_offset) / 32] - : 0; - if (row & 0x1f) { - uint32_t v1 = (row + 32 < s->chunk.start_row + s->chunk.num_rows) - ? valid_map[((row + s->chunk.column_offset) / 32) + 1] - : 0; - v = __funnelshift_r(v, v1, row + s->chunk.column_offset); + uint32_t const row = s->chunk.start_row + i + t * 32; + auto const chunk_end = s->chunk.start_row + s->chunk.num_rows; + + auto const valid_map_idx = (row + s->chunk.column_offset) / 32; + uint32_t valid = (row < chunk_end) ? valid_map[valid_map_idx] : 0; + + auto const rows_in_next_word = (row + s->chunk.column_offset) & 0x1f; + if (rows_in_next_word != 0) { + auto const rows_in_current_word = 32 - rows_in_next_word; + // Read next word if any rows are within the chunk + uint32_t const valid_next = + (row + rows_in_current_word < chunk_end) ? valid_map[valid_map_idx + 1] : 0; + valid = __funnelshift_r(valid, valid_next, rows_in_next_word); } - s->scratch_red[t] = v; + s->scratch_red[t] = valid; } } __syncthreads(); @@ -109,7 +113,7 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s, * @param[in] chunks DictionaryChunk device array [rowgroup][column] * @param[in] num_columns Number of columns */ -// blockDim {512,1,1} +// blockDim {block_size,1,1} template __global__ void __launch_bounds__(block_size, 2) gpuInitDictionaryIndices(DictionaryChunk *chunks, uint32_t num_columns) @@ -411,9 +415,11 @@ void InitDictionaryIndices(DictionaryChunk *chunks, uint32_t num_rowgroups, rmm::cuda_stream_view stream) { - dim3 dim_block(512, 1); // 512 threads per chunk + static constexpr int block_size = 512; + dim3 dim_block(block_size, 1); dim3 dim_grid(num_columns, num_rowgroups); - gpuInitDictionaryIndices<512><<>>(chunks, num_columns); + gpuInitDictionaryIndices + <<>>(chunks, num_columns); } /** diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index cc4c9b700af..b0dc01ea001 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1025,4 +1025,40 @@ TEST_F(OrcStatisticsTest, Basic) validate_statistics(stats.stripes_stats[0]); } +TEST_F(OrcWriterTest, SlicedValidMask) +{ + std::vector strings; + // Need more than 32 elements to reproduce the issue + for (int i = 0; i < 34; ++i) + strings.emplace_back("a long string to make sure overflow affects the output"); + // An element is null only to enforce the output column to be nullable + auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 32; }); + + column_wrapper col{strings.begin(), strings.end(), validity}; + + std::vector> cols; + cols.push_back(col.release()); + + cudf_io::table_metadata expected_metadata; + expected_metadata.column_names.emplace_back("col_string"); + + // Bug tested here is easiest to reproduce when column_offset % 32 is 31 + std::vector indices{31, 34}; + std::vector sliced_col = cudf::slice(cols[0]->view(), indices); + cudf::table_view tbl{sliced_col}; + + auto filepath = temp_env->get_temp_filepath("OrcStrings.orc"); + cudf_io::orc_writer_options out_opts = + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, tbl) + .metadata(&expected_metadata); + cudf_io::write_orc(out_opts); + + cudf_io::orc_reader_options in_opts = + cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); + auto result = cudf_io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(tbl, result.tbl->view()); + EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); +} + CUDF_TEST_PROGRAM_MAIN()