Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix issue in slice() where columns with a positive offset were computing null counts incorrectly. #8738

Merged
merged 4 commits into from
Jul 19, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ add_library(cudf
src/copying/sample.cu
src/copying/scatter.cu
src/copying/shift.cu
src/copying/slice.cpp
src/copying/slice.cu
src/copying/split.cpp
src/copying/segmented_shift.cu
src/datetime/datetime_ops.cu
Expand Down
280 changes: 280 additions & 0 deletions cpp/include/cudf/detail/null_mask.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/null_mask.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/span.hpp>
Expand Down Expand Up @@ -141,6 +142,285 @@ void inplace_bitmask_binop(
stream.synchronize();
}

namespace {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jrhemstad @nvdbaranec JFYI this PR looks like it got merged without addressing this. It's not a big deal but may be worth fixing in a follow-up to avoid the possibility of unexpected UB somewhere down the line.


/**
* For each range `[first_bit_indices[i], last_bit_indices[i])`
* (where 0 <= i < `num_ranges`), count the number of bits set outside the range
* in the boundary words (i.e. words that include either
* `first_bit_indices[i]'th` bit or `(last_bit_indices[i] - 1)'th` bit) and
* subtract the count from the range's null count.
*
* Expects `0 <= first_bit_indices[i] <= last_bit_indices[i]`.
*
* @param[in] bitmask The bitmask whose non-zero bits outside the range in the
* boundary words will be counted.
* @param[in] num_ranges The number of ranges
* @param[in] first_bit_indices The indices (inclusive) of the first bit in each
* range
* @param[in] last_bit_indices The indices (exclusive) of the last bit in each
* range
* @param[in,out] null_counts The number of non-zero bits in each range to be
* updated
*/
template <typename OffsetIterator, typename OutputIterator>
__global__ void subtract_set_bits_range_boundaries_kerenel(bitmask_type const* bitmask,
nvdbaranec marked this conversation as resolved.
Show resolved Hide resolved
size_type num_ranges,
OffsetIterator first_bit_indices,
OffsetIterator last_bit_indices,
OutputIterator null_counts)
{
constexpr size_type const word_size_in_bits{detail::size_in_bits<bitmask_type>()};

cudf::size_type const tid = threadIdx.x + blockIdx.x * blockDim.x;
cudf::size_type range_id = tid;

while (range_id < num_ranges) {
size_type const first_bit_index = *(first_bit_indices + range_id);
size_type const last_bit_index = *(last_bit_indices + range_id);
size_type delta = 0;
size_type num_slack_bits = 0;

// compute delta due to the preceding bits in the first word in the range

num_slack_bits = intra_word_index(first_bit_index);
if (num_slack_bits > 0) {
bitmask_type word = bitmask[word_index(first_bit_index)];
bitmask_type slack_mask = set_least_significant_bits(num_slack_bits);
delta -= __popc(word & slack_mask);
}

// compute delta due to the following bits in the last word in the range

num_slack_bits = (last_bit_index % word_size_in_bits) == 0
? 0
: word_size_in_bits - intra_word_index(last_bit_index);
if (num_slack_bits > 0) {
bitmask_type word = bitmask[word_index(last_bit_index)];
bitmask_type slack_mask = set_most_significant_bits(num_slack_bits);
delta -= __popc(word & slack_mask);
}

size_type updated_null_count = *(null_counts + range_id) + delta;
*(null_counts + range_id) = updated_null_count;

range_id += blockDim.x * gridDim.x;
}
}

// convert [first_bit_index,last_bit_index) to
// [first_word_index,last_word_index)
struct to_word_index : public thrust::unary_function<size_type, size_type> {
const bool _inclusive = false;
size_type const* const _d_bit_indices = nullptr;

/**
* @brief Constructor of a functor that converts bit indices to bitmask word
* indices.
*
* @param[in] inclusive Flag that indicates whether bit indices are inclusive
* or exclusive.
* @param[in] d_bit_indices Pointer to an array of bit indices
*/
__host__ to_word_index(bool inclusive, size_type const* d_bit_indices)
: _inclusive(inclusive), _d_bit_indices(d_bit_indices)
{
}

__device__ size_type operator()(const size_type& i) const
{
auto bit_index = _d_bit_indices[i];
return word_index(bit_index) + ((_inclusive || intra_word_index(bit_index) == 0) ? 0 : 1);
}
};

/**
* @brief Functor that returns the number of set bits for a specified word
* of a bitmask array.
*
*/
struct word_num_set_bits_functor {
word_num_set_bits_functor(bitmask_type const* bitmask_) : bitmask(bitmask_) {}
__device__ size_type operator()(size_type i) const
{
return static_cast<size_type>(__popc(bitmask[i]));
}
bitmask_type const* bitmask;
};

} // anonymous namespace

/**
* @brief Given a bitmask, counts the number of set (1) bits in every range
* `[indices_begin[2*i], indices_begin[(2*i)+1])` (where 0 <= i < std::distance(indices_begin,
* indices_end) / 2).
*
* Returns an empty vector if `bitmask == nullptr`.
*
* @throws cudf::logic_error if `std::distance(indices_begin, indices_end) % 2 != 0`
* @throws cudf::logic_error if `indices_begin[2*i] < 0 or indices_begin[2*i] >
* indices_begin[(2*i)+1]`
*
* @param bitmask Bitmask residing in device memory whose bits will be counted
* @param indices_begin An iterator representing the beginning of the range of indices specifying
* ranges to count the number of set bits within
* @param indices_end An iterator representing the end of the range of indices specifying ranges to
* count the number of set bits within
* @param streaam CUDA stream used for device memory operations and kernel launches
*
* @return A vector storing the number of non-zero bits in the specified ranges
*/
template <typename IndexIterator>
std::vector<size_type> segmented_count_set_bits(bitmask_type const* bitmask,
IndexIterator indices_begin,
IndexIterator indices_end,
rmm::cuda_stream_view stream)
{
size_t const num_indices = std::distance(indices_begin, indices_end);

CUDF_EXPECTS(num_indices % 2 == 0, "Array of indices needs to have an even number of elements.");
for (size_t i = 0; i < num_indices / 2; i++) {
auto begin = indices_begin[i * 2];
auto end = indices_begin[i * 2 + 1];
CUDF_EXPECTS(begin >= 0, "Starting index cannot be negative.");
CUDF_EXPECTS(end >= begin, "End index cannot be smaller than the starting index.");
}

if (num_indices == 0) {
return std::vector<size_type>{};
} else if (bitmask == nullptr) {
std::vector<size_type> ret(num_indices / 2);
for (size_t i = 0; i < num_indices / 2; i++) {
ret[i] = indices_begin[2 * i + 1] - indices_begin[2 * i];
}
return ret;
}

size_type num_ranges = num_indices / 2;
std::vector<size_type> h_first_indices(num_ranges);
std::vector<size_type> h_last_indices(num_ranges);
thrust::stable_partition_copy(thrust::seq,
indices_begin,
indices_end,
thrust::make_counting_iterator(0),
h_first_indices.begin(),
h_last_indices.begin(),
[](auto i) { return (i % 2) == 0; });

auto d_first_indices = make_device_uvector_async(h_first_indices, stream);
auto d_last_indices = make_device_uvector_async(h_last_indices, stream);
rmm::device_uvector<size_type> d_null_counts(num_ranges, stream);

auto word_num_set_bits = thrust::make_transform_iterator(thrust::make_counting_iterator(0),
word_num_set_bits_functor{bitmask});
auto first_word_indices = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
// We cannot use lambda as cub::DeviceSegmentedReduce::Sum() requires
// first_word_indices and last_word_indices to have the same type.
to_word_index(true, d_first_indices.data()));
auto last_word_indices = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
// We cannot use lambda as cub::DeviceSegmentedReduce::Sum() requires
// first_word_indices and last_word_indices to have the same type.
to_word_index(false, d_last_indices.data()));

// first allocate temporary memory

size_t temp_storage_bytes{0};
CUDA_TRY(cub::DeviceSegmentedReduce::Sum(nullptr,
temp_storage_bytes,
word_num_set_bits,
d_null_counts.begin(),
num_ranges,
first_word_indices,
last_word_indices,
stream.value()));
rmm::device_buffer d_temp_storage(temp_storage_bytes, stream);

// second perform segmented reduction

CUDA_TRY(cub::DeviceSegmentedReduce::Sum(d_temp_storage.data(),
temp_storage_bytes,
word_num_set_bits,
d_null_counts.begin(),
num_ranges,
first_word_indices,
last_word_indices,
stream.value()));

CHECK_CUDA(stream.value());

// third, adjust counts in segment boundaries (if segments are not
// word-aligned)

constexpr size_type block_size{256};

cudf::detail::grid_1d grid(num_ranges, block_size);

subtract_set_bits_range_boundaries_kerenel<<<grid.num_blocks,
nvdbaranec marked this conversation as resolved.
Show resolved Hide resolved
grid.num_threads_per_block,
0,
stream.value()>>>(
bitmask, num_ranges, d_first_indices.begin(), d_last_indices.begin(), d_null_counts.begin());

CHECK_CUDA(stream.value());

std::vector<size_type> ret(num_ranges);
CUDA_TRY(cudaMemcpyAsync(ret.data(),
d_null_counts.data(),
num_ranges * sizeof(size_type),
cudaMemcpyDeviceToHost,
stream.value()));

stream.synchronize(); // now ret is valid.

return ret;
}

/**
* @brief Given a bitmask, counts the number of unset (0) bits in every range
* `[indices_begin[2*i], indices_begin[(2*i)+1])` (where 0 <= i < std::distance(indices_begin,
* indices_end) / 2).
*
* Returns an empty vector if `bitmask == nullptr`.
*
* @throws cudf::logic_error if `std::distance(indices_begin, indices_end) % 2 != 0`
* @throws cudf::logic_error if `indices_begin[2*i] < 0 or indices_begin[2*i] >
* indices_begin[(2*i)+1]`
*
* @param bitmask Bitmask residing in device memory whose bits will be counted
* @param indices_begin An iterator representing the beginning of the range of indices specifying
* ranges to count the number of unset bits within
* @param indices_end An iterator representing the end of the range of indices specifying ranges to
* count the number of unset bits within
* @param streaam CUDA stream used for device memory operations and kernel launches
*
* @return A vector storing the number of non-zero bits in the specified ranges
*/
template <typename IndexIterator>
std::vector<size_type> segmented_count_unset_bits(bitmask_type const* bitmask,
IndexIterator indices_begin,
IndexIterator indices_end,
rmm::cuda_stream_view stream)
{
size_t const num_indices = std::distance(indices_begin, indices_end);

if (num_indices == 0) {
return std::vector<size_type>{};
} else if (bitmask == nullptr) {
return std::vector<size_type>(num_indices / 2, 0);
}

auto ret = segmented_count_set_bits(bitmask, indices_begin, indices_end, stream);
for (size_t i = 0; i < ret.size(); i++) {
auto begin = indices_begin[i * 2];
auto end = indices_begin[i * 2 + 1];
ret[i] = (end - begin) - ret[i];
}

return ret;
}

} // namespace detail

} // namespace cudf
Loading