diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 3ff3bb4cf3c..1baa97f2186 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -66,9 +66,11 @@ __global__ void offset_bitmask_binop(Binop op, size_type source_size_bits, size_type* count_ptr) { - constexpr auto const word_size{detail::size_in_bits()}; auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto const last_bit_index = source_size_bits - 1; + auto const last_word_index = cudf::word_index(last_bit_index); + size_type thread_count = 0; for (size_type destination_word_index = tid; destination_word_index < destination.size(); @@ -86,20 +88,19 @@ __global__ void offset_bitmask_binop(Binop op, source_begin_bits[i] + source_size_bits)); } + if (destination_word_index == last_word_index) { + // mask out any bits not part of this word + auto const num_bits_in_last_word = intra_word_index(last_bit_index); + if (num_bits_in_last_word < + static_cast(detail::size_in_bits() - 1)) { + destination_word &= set_least_significant_bits(num_bits_in_last_word + 1); + } + } + destination[destination_word_index] = destination_word; thread_count += __popc(destination_word); } - // Subtract any slack bits from the last word - if (tid == 0) { - size_type const last_bit_index = source_size_bits - 1; - size_type const num_slack_bits = word_size - (last_bit_index % word_size) - 1; - if (num_slack_bits > 0) { - size_type const word_index = cudf::word_index(last_bit_index); - thread_count -= __popc(destination[word_index] & set_most_significant_bits(num_slack_bits)); - } - } - using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; size_type block_count = BlockReduce(temp_storage).Sum(thread_count);