Fix ORC writer output corruption with string columns (#7565)
Closes: #7346

Fixes an issue in the ORC writer where null counting did not read the validity mask for every row.
The issue occurs when the column offset is not divisible by 32. Each read of the mask is then effectively offset by the column offset, so fetching the mask bits for a batch of 32 rows requires reading two adjacent 32-bit words and splicing them together. The second word was only read when the current row was at least 32 rows away from the end of the chunk. That condition is incorrect for most column offsets: the current row does not correspond to the first bit of the mask word, so some of the 32 rows can fall into the second word even when the row is close to the end of the chunk, and those rows' validity bits were read as zero.
The fix adjusts the condition under which the second mask word is read (assuming the mask is padded to a multiple of 32 bits).
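
To make the word-boundary issue concrete, here is a minimal, self-contained host-side sketch of the corrected read (illustrative only; `read_mask_word` and its parameter names are made up for this example and are not cudf code). It mirrors the scenario exercised by the new test below: three rows, all valid, with column_offset % 32 == 31.

// Sketch only: reads 32 validity bits for rows [row, row + 32), where the bit
// for row r lives at absolute bit position (r + column_offset) in `mask`.
#include <cstdint>
#include <cstdio>
#include <vector>

uint32_t read_mask_word(std::vector<uint32_t> const& mask,
                        uint32_t row,
                        uint32_t column_offset,
                        uint32_t chunk_end)
{
  auto const bit_pos   = row + column_offset;
  auto const word_idx  = bit_pos / 32;
  auto const bit_shift = bit_pos % 32;

  uint32_t lo = (row < chunk_end) ? mask[word_idx] : 0;
  if (bit_shift != 0) {
    // Rows [row, row + 32) straddle two mask words: the first (32 - bit_shift)
    // rows come from `lo`, the remaining rows from the next word.
    auto const rows_in_first_word = 32 - bit_shift;
    // Fixed condition: read the second word if *any* of its rows are still
    // inside the chunk. The buggy check was (row + 32 < chunk_end), which
    // skips the second word for the last rows of the chunk even though some
    // of their validity bits live there.
    uint32_t const hi = (row + rows_in_first_word < chunk_end) ? mask[word_idx + 1] : 0;
    lo = (lo >> bit_shift) | (hi << rows_in_first_word);  // emulates __funnelshift_r
  }
  return lo;
}

int main()
{
  // 3 rows, column_offset % 32 == 31, all rows valid; only the low 3 result
  // bits are meaningful.
  std::vector<uint32_t> mask(4, 0xffffffffu);
  printf("%08x\n", read_mask_word(mask, 0, /*column_offset=*/31, /*chunk_end=*/3));
  // Prints ffffffff (rows 0-2 valid). With the old condition the second word
  // would not be read and rows 1 and 2 would incorrectly appear null.
  return 0;
}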

Authors:
  - Vukasin Milovanovic (@vuule)

Approvers:
  - @nvdbaranec
  - Mike Wilson (@hyperbolic2346)
  - Devavret Makkar (@devavret)

URL: #7565
vuule authored Mar 12, 2021
1 parent 8aeb14e commit ff0c537
Showing 2 changed files with 57 additions and 15 deletions.
36 changes: 21 additions & 15 deletions cpp/src/io/orc/dict_enc.cu
@@ -70,24 +70,28 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s,
                                           Storage &temp_storage)
 {
   if (t == 0) { s->nnz = 0; }
-  for (uint32_t i = 0; i < s->chunk.num_rows; i += 512) {
+  for (uint32_t i = 0; i < s->chunk.num_rows; i += block_size) {
     const uint32_t *valid_map = s->chunk.valid_map_base;
     uint32_t is_valid, nz_pos;
-    if (t < 16) {
+    if (t < block_size / 32) {
       if (!valid_map) {
         s->scratch_red[t] = 0xffffffffu;
       } else {
-        uint32_t row = s->chunk.start_row + i + t * 32;
-        uint32_t v   = (row < s->chunk.start_row + s->chunk.num_rows)
-                         ? valid_map[(row + s->chunk.column_offset) / 32]
-                         : 0;
-        if (row & 0x1f) {
-          uint32_t v1 = (row + 32 < s->chunk.start_row + s->chunk.num_rows)
-                          ? valid_map[((row + s->chunk.column_offset) / 32) + 1]
-                          : 0;
-          v = __funnelshift_r(v, v1, row + s->chunk.column_offset);
+        uint32_t const row = s->chunk.start_row + i + t * 32;
+        auto const chunk_end = s->chunk.start_row + s->chunk.num_rows;
+
+        auto const valid_map_idx = (row + s->chunk.column_offset) / 32;
+        uint32_t valid = (row < chunk_end) ? valid_map[valid_map_idx] : 0;
+
+        auto const rows_in_next_word = (row + s->chunk.column_offset) & 0x1f;
+        if (rows_in_next_word != 0) {
+          auto const rows_in_current_word = 32 - rows_in_next_word;
+          // Read next word if any rows are within the chunk
+          uint32_t const valid_next =
+            (row + rows_in_current_word < chunk_end) ? valid_map[valid_map_idx + 1] : 0;
+          valid = __funnelshift_r(valid, valid_next, rows_in_next_word);
         }
-        s->scratch_red[t] = v;
+        s->scratch_red[t] = valid;
       }
     }
     __syncthreads();
@@ -109,7 +113,7 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s,
  * @param[in] chunks DictionaryChunk device array [rowgroup][column]
  * @param[in] num_columns Number of columns
  */
-// blockDim {512,1,1}
+// blockDim {block_size,1,1}
 template <int block_size>
 __global__ void __launch_bounds__(block_size, 2)
   gpuInitDictionaryIndices(DictionaryChunk *chunks, uint32_t num_columns)
@@ -411,9 +415,11 @@ void InitDictionaryIndices(DictionaryChunk *chunks,
                            uint32_t num_rowgroups,
                            rmm::cuda_stream_view stream)
 {
-  dim3 dim_block(512, 1);  // 512 threads per chunk
+  static constexpr int block_size = 512;
+  dim3 dim_block(block_size, 1);
   dim3 dim_grid(num_columns, num_rowgroups);
-  gpuInitDictionaryIndices<512><<<dim_grid, dim_block, 0, stream.value()>>>(chunks, num_columns);
+  gpuInitDictionaryIndices<block_size>
+    <<<dim_grid, dim_block, 0, stream.value()>>>(chunks, num_columns);
 }

 /**
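For reference, `__funnelshift_r(lo, hi, shift)` returns the low 32 bits of the 64-bit concatenation hi:lo shifted right by `shift` (taken modulo 32); the kernel above uses it to splice together the 32 mask bits that straddle two words. Below is a rough host-side emulation, as a sketch only (the function name is made up and is not part of cudf or CUDA):

#include <cstdint>

// Emulates the CUDA funnel shift right on the host for illustration.
uint32_t funnelshift_r_emulated(uint32_t lo, uint32_t hi, uint32_t shift)
{
  shift &= 31;
  if (shift == 0) return lo;
  // Low 32 bits of the 64-bit value (hi:lo) shifted right by `shift`.
  return (lo >> shift) | (hi << (32 - shift));
}
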
36 changes: 36 additions & 0 deletions cpp/tests/io/orc_test.cpp
@@ -1025,4 +1025,40 @@ TEST_F(OrcStatisticsTest, Basic)
   validate_statistics(stats.stripes_stats[0]);
 }

+TEST_F(OrcWriterTest, SlicedValidMask)
+{
+  std::vector<const char*> strings;
+  // Need more than 32 elements to reproduce the issue
+  for (int i = 0; i < 34; ++i)
+    strings.emplace_back("a long string to make sure overflow affects the output");
+  // An element is null only to enforce the output column to be nullable
+  auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 32; });
+
+  column_wrapper<cudf::string_view> col{strings.begin(), strings.end(), validity};
+
+  std::vector<std::unique_ptr<column>> cols;
+  cols.push_back(col.release());
+
+  cudf_io::table_metadata expected_metadata;
+  expected_metadata.column_names.emplace_back("col_string");
+
+  // Bug tested here is easiest to reproduce when column_offset % 32 is 31
+  std::vector<cudf::size_type> indices{31, 34};
+  std::vector<cudf::column_view> sliced_col = cudf::slice(cols[0]->view(), indices);
+  cudf::table_view tbl{sliced_col};
+
+  auto filepath = temp_env->get_temp_filepath("OrcStrings.orc");
+  cudf_io::orc_writer_options out_opts =
+    cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, tbl)
+      .metadata(&expected_metadata);
+  cudf_io::write_orc(out_opts);
+
+  cudf_io::orc_reader_options in_opts =
+    cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false);
+  auto result = cudf_io::read_orc(in_opts);
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(tbl, result.tbl->view());
+  EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names);
+}
+
 CUDF_TEST_PROGRAM_MAIN()
