Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement segmented_row_bit_count for computing row sizes by segments of rows #15169

Merged
merged 22 commits into from
Mar 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion cpp/include/cudf/detail/transform.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -100,5 +100,15 @@ std::unique_ptr<column> row_bit_count(table_view const& t,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
* @copydoc cudf::segmented_row_bit_count
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> segmented_row_bit_count(table_view const& t,
size_type segment_length,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

} // namespace detail
} // namespace cudf
25 changes: 24 additions & 1 deletion cpp/include/cudf/transform.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -224,5 +224,28 @@ std::unique_ptr<column> row_bit_count(
table_view const& t,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns an approximate cumulative size in bits of all columns in the `table_view` for
* each segment of rows.
*
* This is similar to counting bit size per row for the input table in `cudf::row_bit_count`,
* except that row sizes are accumulated by segments.
*
* Currently, only fixed-length segments are supported. In case the input table has number of rows
* not divisible by `segment_length`, its last segment is considered as shorter than the others.
*
* @throw std::invalid_argument if the input `segment_length` is non-positive or larger than the
* number of rows in the input table.
*
* @param t The table view to perform the computation on
* @param segment_length The number of rows in each segment for which the total size is computed
* @param mr Device memory resource used to allocate the returned columns' device memory
* @return A 32-bit integer column containing the bit counts for each segment of rows
*/
std::unique_ptr<column> segmented_row_bit_count(
table_view const& t,
size_type segment_length,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
} // namespace cudf
95 changes: 64 additions & 31 deletions cpp/src/transform/row_bit_count.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/offsets_iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/structs/structs_column_view.hpp>
Expand All @@ -31,8 +32,10 @@
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/fill.h>
#include <cuda/functional>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/optional.h>
#include <thrust/tabulate.h>

namespace cudf {
namespace detail {
Expand Down Expand Up @@ -398,26 +401,32 @@ __device__ size_type row_size_functor::operator()<struct_view>(column_device_vie
* @param cols An span of column_device_views representing a column hierarchy
* @param info An span of column_info structs corresponding the elements in `cols`
* @param output Output span of size (# rows) where per-row bit sizes are stored
* @param segment_length The number of rows in each segment for which the total size is computed
* @param max_branch_depth Maximum depth of the span stack needed per-thread
*/
CUDF_KERNEL void compute_row_sizes(device_span<column_device_view const> cols,
device_span<column_info const> info,
device_span<size_type> output,
size_type max_branch_depth)
CUDF_KERNEL void compute_segment_sizes(device_span<column_device_view const> cols,
device_span<column_info const> info,
device_span<size_type> output,
size_type segment_length,
size_type max_branch_depth)
{
extern __shared__ row_span thread_branch_stacks[];
int const tid = threadIdx.x + blockIdx.x * blockDim.x;

auto const num_rows = output.size();
if (tid >= num_rows) { return; }
auto const num_segments = static_cast<size_type>(output.size());
if (tid >= num_segments) { return; }

// my_branch_stack points to the last span prior to branching. a branch occurs only
// when we are inside of a list contained within a struct column.
row_span* my_branch_stack = thread_branch_stacks + (threadIdx.x * max_branch_depth);
size_type branch_depth{0};

// current row span - always starts at 1 row.
row_span cur_span{tid, tid + 1};
// current row span - always starts at spanning over `segment_length` rows.
auto const num_rows = cols[0].size();
auto const get_default_row_span = [=] {
return row_span{tid * segment_length, cuda::std::min((tid + 1) * segment_length, num_rows)};
};
auto cur_span = get_default_row_span();

// output size
size_type& size = output[tid];
Expand All @@ -444,7 +453,7 @@ CUDF_KERNEL void compute_row_sizes(device_span<column_device_view const> cols,
if (info[idx].depth == 0) {
branch_depth = 0;
last_branch_depth = 0;
cur_span = row_span{tid, tid + 1};
cur_span = get_default_row_span();
}

// add the contributing size of this row
Expand All @@ -465,17 +474,18 @@ CUDF_KERNEL void compute_row_sizes(device_span<column_device_view const> cols,

} // anonymous namespace

/**
* @copydoc cudf::detail::row_bit_count
*
*/
std::unique_ptr<column> row_bit_count(table_view const& t,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
std::unique_ptr<column> segmented_row_bit_count(table_view const& t,
size_type segment_length,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
// no rows
// If there is no rows, segment_length will not be checked.
if (t.num_rows() <= 0) { return cudf::make_empty_column(type_id::INT32); }

CUDF_EXPECTS(segment_length >= 1 && segment_length <= t.num_rows(),
"Invalid segment length.",
std::invalid_argument);

// flatten the hierarchy and determine some information about it.
std::vector<cudf::column_view> cols;
std::vector<column_info> info;
Expand All @@ -484,17 +494,28 @@ std::unique_ptr<column> row_bit_count(table_view const& t,
CUDF_EXPECTS(info.size() == cols.size(), "Size/info mismatch");

// create output buffer and view
auto output = cudf::make_fixed_width_column(
data_type{type_id::INT32}, t.num_rows(), mask_state::UNALLOCATED, stream, mr);
auto const num_segments = cudf::util::div_rounding_up_safe(t.num_rows(), segment_length);
auto output = cudf::make_fixed_width_column(
data_type{type_id::INT32}, num_segments, mask_state::UNALLOCATED, stream, mr);
mutable_column_view mcv = output->mutable_view();

// simple case. if we have no complex types (lists, strings, etc), the per-row size is already
// trivially computed
if (h_info.complex_type_count <= 0) {
thrust::fill(rmm::exec_policy(stream),
mcv.begin<size_type>(),
mcv.end<size_type>(),
h_info.simple_per_row_size);
thrust::tabulate(
rmm::exec_policy_nosync(stream),
mcv.begin<size_type>(),
mcv.end<size_type>(),
cuda::proclaim_return_type<size_type>(
[segment_length,
num_rows = t.num_rows(),
per_row_size = h_info.simple_per_row_size] __device__(size_type const segment_idx) {
// Since the number of rows may not divisible by segment_length,
// the last segment may be shorter than the others.
auto const current_length =
cuda::std::min(segment_length, num_rows - segment_length * segment_idx);
return per_row_size * current_length;
}));
return output;
}

Expand Down Expand Up @@ -523,22 +544,34 @@ std::unique_ptr<column> row_bit_count(table_view const& t,
// should we be aborting if we reach some extremely small block size, or just if we hit 0?
CUDF_EXPECTS(block_size > 0, "Encountered a column hierarchy too complex for row_bit_count");

cudf::detail::grid_1d grid{t.num_rows(), block_size, 1};
compute_row_sizes<<<grid.num_blocks, block_size, shared_mem_size, stream.value()>>>(
cudf::detail::grid_1d grid{num_segments, block_size, 1};
compute_segment_sizes<<<grid.num_blocks, block_size, shared_mem_size, stream.value()>>>(
{std::get<1>(d_cols), cols.size()},
{d_info.data(), info.size()},
{mcv.data<size_type>(), static_cast<std::size_t>(t.num_rows())},
{mcv.data<size_type>(), static_cast<std::size_t>(mcv.size())},
segment_length,
h_info.max_branch_depth);

return output;
}

std::unique_ptr<column> row_bit_count(table_view const& t,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
return segmented_row_bit_count(t, 1, stream, mr);
}

} // namespace detail

/**
* @copydoc cudf::row_bit_count
*
*/
std::unique_ptr<column> segmented_row_bit_count(table_view const& t,
size_type segment_length,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::segmented_row_bit_count(t, segment_length, cudf::get_default_stream(), mr);
}

std::unique_ptr<column> row_bit_count(table_view const& t, rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
Expand Down
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ ConfigureTest(
transform/mask_to_bools_test.cpp
transform/bools_to_mask_test.cpp
transform/row_bit_count_test.cu
transform/segmented_row_bit_count_test.cu
transform/one_hot_encode_tests.cpp
)

Expand Down
Loading
Loading