Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace device_vector with device_uvector in null_mask #7715

Merged
merged 1 commit into from
Mar 25, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion cpp/include/cudf/column/column_factories.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
#include <cudf/utilities/traits.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_vector.hpp>

namespace cudf {
/**
Expand Down
4 changes: 2 additions & 2 deletions cpp/include/cudf/detail/null_mask.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ void set_null_mask(bitmask_type *bitmask,
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
std::vector<size_type> segmented_count_set_bits(bitmask_type const *bitmask,
std::vector<size_type> const &indices,
host_span<size_type const> indices,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume the host_span being passed by value here now is not a performance concern since it's much smaller than a vector (just a wrapper around a raw pointer)? Perhaps more for my edification than anything else: is there any particular reason to prefer not using the reference in this case?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think Vukasin covered this in his Better Code presentation on spans, but in general one of the points of views is you pass by value, though I don't remember why. It's just a pointer and a size. We don't pass column_view by value because they are not true views. There is an open issue for that one...

rmm::cuda_stream_view stream);

/**
Expand All @@ -62,7 +62,7 @@ std::vector<size_type> segmented_count_set_bits(bitmask_type const *bitmask,
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
std::vector<size_type> segmented_count_unset_bits(bitmask_type const *bitmask,
std::vector<size_type> const &indices,
host_span<size_type const> indices,
rmm::cuda_stream_view stream);

/**
Expand Down
35 changes: 15 additions & 20 deletions cpp/include/cudf/null_mask.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#pragma once

#include <cudf/types.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/device_buffer.hpp>

Expand Down Expand Up @@ -136,38 +137,32 @@ cudf::size_type count_unset_bits(bitmask_type const* bitmask, size_type start, s
* `[indices[2*i], indices[(2*i)+1])` (where 0 <= i < indices.size() / 2).
*
* Returns an empty vector if `bitmask == nullptr`.
*
* @throws cudf::logic_error if `indices.size() % 2 != 0`
* @throws cudf::logic_error if `indices[2*i] < 0 or
* indices[2*i] > indices[(2*i)+1]`
*
* @param[in] bitmask Bitmask residing in device memory whose bits will be
* counted
* @param[in] indices A vector of indices used to specify ranges to count the
* number of set bits
* @return std::vector<size_type> A vector storing the number of non-zero bits
* in the specified ranges
* @throws cudf::logic_error if `indices[2*i] < 0 or indices[2*i] > indices[(2*i)+1]`
*
* @param[in] bitmask Bitmask residing in device memory whose bits will be counted
* @param[in] indices A host_span of indices specifying ranges to count the number of set bits
* @return A vector storing the number of non-zero bits in the specified ranges
*/
std::vector<size_type> segmented_count_set_bits(bitmask_type const* bitmask,
std::vector<cudf::size_type> const& indices);
host_span<cudf::size_type const> indices);

/**
* @brief Given a bitmask, counts the number of unset (0) bits in every range
* `[indices[2*i], indices[(2*i)+1])` (where 0 <= i < indices.size() / 2).
*
* Returns an empty vector if `bitmask == nullptr`.
*
* @throws cudf::logic_error if `indices.size() % 2 != 0`
* @throws cudf::logic_error if `indices[2*i] < 0 or
* indices[2*i] > indices[(2*i)+1]`
*
* @param[in] bitmask Bitmask residing in device memory whose bits will be
* counted
* @param[in] indices A vector of indices used to specify ranges to count the
* number of unset bits
* @return std::vector<size_type> A vector storing the number of zero bits in
* the specified ranges
* @throws cudf::logic_error if `indices[2*i] < 0 or indices[2*i] > indices[(2*i)+1]`
*
* @param[in] bitmask Bitmask residing in device memory whose bits will be counted
* @param[in] indices A host_span of indices specifying ranges to count the number of unset bits
* @return A vector storing the number of zero bits in the specified ranges
*/
std::vector<size_type> segmented_count_unset_bits(bitmask_type const* bitmask,
std::vector<cudf::size_type> const& indices);
host_span<cudf::size_type const> indices);

/**
* @brief Creates a `device_buffer` from a slice of bitmask defined by a range
Expand Down
26 changes: 13 additions & 13 deletions cpp/src/bitmask/null_mask.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/null_mask.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/bit.hpp>
Expand All @@ -30,7 +31,6 @@
#include <rmm/device_buffer.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/device_vector.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

#include <thrust/binary_search.h>
Expand Down Expand Up @@ -466,7 +466,7 @@ cudf::size_type count_unset_bits(bitmask_type const *bitmask,
}

std::vector<size_type> segmented_count_set_bits(bitmask_type const *bitmask,
std::vector<size_type> const &indices,
host_span<size_type const> indices,
rmm::cuda_stream_view stream)
{
CUDF_EXPECTS(indices.size() % 2 == 0,
Expand All @@ -489,8 +489,8 @@ std::vector<size_type> segmented_count_set_bits(bitmask_type const *bitmask,
}

size_type num_ranges = indices.size() / 2;
thrust::host_vector<size_type> h_first_indices(num_ranges);
thrust::host_vector<size_type> h_last_indices(num_ranges);
std::vector<size_type> h_first_indices(num_ranges);
std::vector<size_type> h_last_indices(num_ranges);
thrust::stable_partition_copy(thrust::seq,
std::begin(indices),
std::end(indices),
Expand All @@ -499,9 +499,9 @@ std::vector<size_type> segmented_count_set_bits(bitmask_type const *bitmask,
h_last_indices.begin(),
[](auto i) { return (i % 2) == 0; });

rmm::device_vector<size_type> d_first_indices = h_first_indices;
rmm::device_vector<size_type> d_last_indices = h_last_indices;
rmm::device_vector<size_type> d_null_counts(num_ranges, 0);
auto d_first_indices = make_device_uvector_async(h_first_indices, stream);
auto d_last_indices = make_device_uvector_async(h_last_indices, stream);
rmm::device_uvector<size_type> d_null_counts(num_ranges, stream);

auto word_num_set_bits = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
Expand All @@ -510,12 +510,12 @@ std::vector<size_type> segmented_count_set_bits(bitmask_type const *bitmask,
thrust::make_counting_iterator(0),
// We cannot use lambda as cub::DeviceSegmentedReduce::Sum() requires
// first_word_indices and last_word_indices to have the same type.
to_word_index(true, d_first_indices.data().get()));
to_word_index(true, d_first_indices.data()));
auto last_word_indices = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
// We cannot use lambda as cub::DeviceSegmentedReduce::Sum() requires
// first_word_indices and last_word_indices to have the same type.
to_word_index(false, d_last_indices.data().get()));
to_word_index(false, d_last_indices.data()));

// first allocate temporary memroy

Expand Down Expand Up @@ -560,7 +560,7 @@ std::vector<size_type> segmented_count_set_bits(bitmask_type const *bitmask,

std::vector<size_type> ret(num_ranges);
CUDA_TRY(cudaMemcpyAsync(ret.data(),
d_null_counts.data().get(),
d_null_counts.data(),
num_ranges * sizeof(size_type),
cudaMemcpyDeviceToHost,
stream.value()));
Expand All @@ -571,7 +571,7 @@ std::vector<size_type> segmented_count_set_bits(bitmask_type const *bitmask,
}

std::vector<size_type> segmented_count_unset_bits(bitmask_type const *bitmask,
std::vector<size_type> const &indices,
host_span<size_type const> indices,
rmm::cuda_stream_view stream)
{
if (indices.empty()) {
Expand Down Expand Up @@ -669,15 +669,15 @@ cudf::size_type count_unset_bits(bitmask_type const *bitmask, size_type start, s

// Count non-zero bits in the specified ranges
std::vector<size_type> segmented_count_set_bits(bitmask_type const *bitmask,
std::vector<size_type> const &indices)
host_span<size_type const> indices)
{
CUDF_FUNC_RANGE();
return detail::segmented_count_set_bits(bitmask, indices, rmm::cuda_stream_default);
}

// Count zero bits in the specified ranges
std::vector<size_type> segmented_count_unset_bits(bitmask_type const *bitmask,
std::vector<size_type> const &indices)
host_span<size_type const> indices)
{
CUDF_FUNC_RANGE();
return detail::segmented_count_unset_bits(bitmask, indices, rmm::cuda_stream_default);
Expand Down