Skip to content

Commit

Permalink
Simplify type dispatch with device_storage_dispatch (rapidsai#7419)
Browse files Browse the repository at this point in the history
Resolves rapidsai#7390

Compile times:
```
// Before
real	33m29.842s
user	300m0.478s
sys	10m46.871s

// After
real	33m20.127s
user	299m24.825s
sys	10m35.779s
```

Binary sizes:
```
Before: -rwxr-xr-x  1 rapids rapids 328M Feb 22 15:10 libcudf_base.so
After:  -rwxr-xr-x  1 rapids rapids 327M Feb 23 07:49 libcudf_base.so
```

Authors:
  - Conor Hoekstra (@codereport)

Approvers:
  - David (@davidwendt)
  - Jake Hemstad (@jrhemstad)

URL: rapidsai#7419
  • Loading branch information
codereport authored Feb 23, 2021
1 parent c3b3b96 commit 2234554
Show file tree
Hide file tree
Showing 19 changed files with 229 additions and 254 deletions.
24 changes: 11 additions & 13 deletions cpp/include/cudf/detail/gather.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -176,11 +176,9 @@ struct column_gatherer_impl {
auto destination_column =
cudf::detail::allocate_like(source_column, num_rows, policy, stream, mr);

using Type = device_storage_type_t<Element>;

gather_helper(source_column.data<Type>(),
gather_helper(source_column.data<Element>(),
source_column.size(),
destination_column->mutable_view().template begin<Type>(),
destination_column->mutable_view().template begin<Element>(),
gather_map_begin,
gather_map_end,
nullify_out_of_bounds,
Expand Down Expand Up @@ -633,14 +631,14 @@ std::unique_ptr<table> gather(
for (auto const& source_column : source_table) {
// The data gather for n columns will be put on the first n streams
destination_columns.push_back(
cudf::type_dispatcher(source_column.type(),
column_gatherer{},
source_column,
gather_map_begin,
gather_map_end,
bounds_policy == out_of_bounds_policy::NULLIFY,
stream,
mr));
cudf::type_dispatcher<dispatch_storage_type>(source_column.type(),
column_gatherer{},
source_column,
gather_map_begin,
gather_map_end,
bounds_policy == out_of_bounds_policy::NULLIFY,
stream,
mr));
}

gather_bitmask_op const op = bounds_policy == out_of_bounds_policy::NULLIFY
Expand Down
24 changes: 11 additions & 13 deletions cpp/include/cudf/detail/scatter.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -91,15 +91,13 @@ struct column_scatterer_impl {
auto result = std::make_unique<column>(target, stream, mr);
auto result_view = result->mutable_view();

using Type = device_storage_type_t<Element>;

// NOTE use source.begin + scatter rows rather than source.end in case the
// scatter map is smaller than the number of source rows
thrust::scatter(rmm::exec_policy(stream),
source.begin<Type>(),
source.begin<Type>() + cudf::distance(scatter_map_begin, scatter_map_end),
source.begin<Element>(),
source.begin<Element>() + cudf::distance(scatter_map_begin, scatter_map_end),
scatter_map_begin,
result_view.begin<Type>());
result_view.begin<Element>());

return result;
}
Expand Down Expand Up @@ -285,14 +283,14 @@ std::unique_ptr<table> scatter(
target.begin(),
result.begin(),
[=](auto const& source_col, auto const& target_col) {
return type_dispatcher(source_col.type(),
scatter_functor,
source_col,
updated_scatter_map_begin,
updated_scatter_map_end,
target_col,
stream,
mr);
return type_dispatcher<dispatch_storage_type>(source_col.type(),
scatter_functor,
source_col,
updated_scatter_map_begin,
updated_scatter_map_end,
target_col,
stream,
mr);
});

auto gather_map = scatter_to_gather(
Expand Down
13 changes: 13 additions & 0 deletions cpp/include/cudf/utilities/type_dispatcher.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,19 @@ CUDF_TYPE_MAPPING(numeric::decimal32, type_id::DECIMAL32);
CUDF_TYPE_MAPPING(numeric::decimal64, type_id::DECIMAL64);
CUDF_TYPE_MAPPING(cudf::struct_view, type_id::STRUCT);

/**
* @brief Use this specialization on `type_dispatcher` whenever you only need to operate on the
* underlying stored type.
*
* For example, `cudf::sort` in sort.cu uses `cudf::type_dispatcher<dispatch_storage_type>(...)`.
* `cudf::gather` in gather.cuh also uses `cudf::type_dispatcher<dispatch_storage_type>(...)`.
* However, reductions needs both `data_type` and underlying type, so cannot use this.
*/
template <cudf::type_id Id>
struct dispatch_storage_type {
using type = device_storage_type_t<typename id_to_type_impl<Id>::type>;
};

template <typename T>
struct type_to_scalar_type_impl {
using ScalarType = cudf::scalar;
Expand Down
12 changes: 5 additions & 7 deletions cpp/src/copying/concatenate.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -300,13 +300,11 @@ struct concatenate_dispatch {
bool const has_nulls =
std::any_of(views.cbegin(), views.cend(), [](auto const& col) { return col.has_nulls(); });

using Type = device_storage_type_t<T>;

// Use a heuristic to guess when the fused kernel will be faster
if (use_fused_kernel_heuristic(has_nulls, views.size())) {
return fused_concatenate<Type>(views, has_nulls, stream, mr);
return fused_concatenate<T>(views, has_nulls, stream, mr);
} else {
return for_each_concatenate<Type>(views, has_nulls, stream, mr);
return for_each_concatenate<T>(views, has_nulls, stream, mr);
}
}
};
Expand Down Expand Up @@ -409,8 +407,8 @@ std::unique_ptr<column> concatenate(std::vector<column_view> const& columns_to_c
return empty_like(columns_to_concat.front());
}

return type_dispatcher(columns_to_concat.front().type(),
concatenate_dispatch{columns_to_concat, stream, mr});
return type_dispatcher<dispatch_storage_type>(
columns_to_concat.front().type(), concatenate_dispatch{columns_to_concat, stream, mr});
}

std::unique_ptr<table> concatenate(std::vector<table_view> const& tables_to_concat,
Expand Down
60 changes: 29 additions & 31 deletions cpp/src/copying/copy.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -41,28 +41,26 @@ struct copy_if_else_functor_impl {
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
using Type = device_storage_type_t<T>;

if (left_nullable) {
if (right_nullable) {
auto lhs_iter = cudf::detail::make_pair_iterator<Type, true>(lhs);
auto rhs_iter = cudf::detail::make_pair_iterator<Type, true>(rhs);
auto lhs_iter = cudf::detail::make_pair_iterator<T, true>(lhs);
auto rhs_iter = cudf::detail::make_pair_iterator<T, true>(rhs);
return detail::copy_if_else(
true, lhs_iter, lhs_iter + size, rhs_iter, filter, lhs.type(), stream, mr);
}
auto lhs_iter = cudf::detail::make_pair_iterator<Type, true>(lhs);
auto rhs_iter = cudf::detail::make_pair_iterator<Type, false>(rhs);
auto lhs_iter = cudf::detail::make_pair_iterator<T, true>(lhs);
auto rhs_iter = cudf::detail::make_pair_iterator<T, false>(rhs);
return detail::copy_if_else(
true, lhs_iter, lhs_iter + size, rhs_iter, filter, lhs.type(), stream, mr);
}
if (right_nullable) {
auto lhs_iter = cudf::detail::make_pair_iterator<Type, false>(lhs);
auto rhs_iter = cudf::detail::make_pair_iterator<Type, true>(rhs);
auto lhs_iter = cudf::detail::make_pair_iterator<T, false>(lhs);
auto rhs_iter = cudf::detail::make_pair_iterator<T, true>(rhs);
return detail::copy_if_else(
true, lhs_iter, lhs_iter + size, rhs_iter, filter, lhs.type(), stream, mr);
}
auto lhs_iter = cudf::detail::make_pair_iterator<Type, false>(lhs);
auto rhs_iter = cudf::detail::make_pair_iterator<Type, false>(rhs);
auto lhs_iter = cudf::detail::make_pair_iterator<T, false>(lhs);
auto rhs_iter = cudf::detail::make_pair_iterator<T, false>(rhs);
return detail::copy_if_else(
false, lhs_iter, lhs_iter + size, rhs_iter, filter, lhs.type(), stream, mr);
}
Expand Down Expand Up @@ -182,30 +180,30 @@ std::unique_ptr<column> copy_if_else(Left const& lhs,
auto filter = [bool_mask_device] __device__(cudf::size_type i) {
return bool_mask_device.is_valid_nocheck(i) and bool_mask_device.element<bool>(i);
};
return cudf::type_dispatcher(lhs.type(),
copy_if_else_functor{},
lhs,
rhs,
boolean_mask.size(),
left_nullable,
right_nullable,
filter,
stream,
mr);
return cudf::type_dispatcher<dispatch_storage_type>(lhs.type(),
copy_if_else_functor{},
lhs,
rhs,
boolean_mask.size(),
left_nullable,
right_nullable,
filter,
stream,
mr);
} else {
auto filter = [bool_mask_device] __device__(cudf::size_type i) {
return bool_mask_device.element<bool>(i);
};
return cudf::type_dispatcher(lhs.type(),
copy_if_else_functor{},
lhs,
rhs,
boolean_mask.size(),
left_nullable,
right_nullable,
filter,
stream,
mr);
return cudf::type_dispatcher<dispatch_storage_type>(lhs.type(),
copy_if_else_functor{},
lhs,
rhs,
boolean_mask.size(),
left_nullable,
right_nullable,
filter,
stream,
mr);
}
}

Expand Down
20 changes: 10 additions & 10 deletions cpp/src/copying/copy_range.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -108,9 +108,8 @@ struct out_of_place_copy_range_dispatch {
}

if (source_end != source_begin) { // otherwise no-op
using Type = cudf::device_storage_type_t<T>;
auto ret_view = p_ret->mutable_view();
in_place_copy_range<Type>(source, ret_view, source_begin, source_end, target_begin, stream);
in_place_copy_range<T>(source, ret_view, source_begin, source_end, target_begin, stream);
}

return p_ret;
Expand Down Expand Up @@ -261,13 +260,14 @@ std::unique_ptr<column> copy_range(column_view const& source,
"Range is out of bounds.");
CUDF_EXPECTS(target.type() == source.type(), "Data type mismatch.");

return cudf::type_dispatcher(target.type(),
out_of_place_copy_range_dispatch{source, target},
source_begin,
source_end,
target_begin,
stream,
mr);
return cudf::type_dispatcher<dispatch_storage_type>(
target.type(),
out_of_place_copy_range_dispatch{source, target},
source_begin,
source_end,
target_begin,
stream,
mr);
}

} // namespace detail
Expand Down
24 changes: 11 additions & 13 deletions cpp/src/copying/scatter.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -106,18 +106,16 @@ struct column_scalar_scatterer_impl {
auto result = std::make_unique<column>(target, stream, mr);
auto result_view = result->mutable_view();

using Type = device_storage_type_t<Element>;

// Use permutation iterator with constant index to dereference scalar data
auto scalar_impl = static_cast<const scalar_type_t<Type>*>(&source.get());
auto scalar_impl = static_cast<const scalar_type_t<Element>*>(&source.get());
auto scalar_iter =
thrust::make_permutation_iterator(scalar_impl->data(), thrust::make_constant_iterator(0));

thrust::scatter(rmm::exec_policy(stream),
scalar_iter,
scalar_iter + scatter_rows,
scatter_iter,
result_view.begin<Type>());
result_view.begin<Element>());

return result;
}
Expand Down Expand Up @@ -300,14 +298,14 @@ std::unique_ptr<table> scatter(std::vector<std::reference_wrapper<const scalar>>
target.begin(),
result.begin(),
[=](auto const& source_scalar, auto const& target_col) {
return type_dispatcher(target_col.type(),
scatter_functor,
source_scalar,
scatter_iter,
scatter_rows,
target_col,
stream,
mr);
return type_dispatcher<dispatch_storage_type>(target_col.type(),
scatter_functor,
source_scalar,
scatter_iter,
scatter_rows,
target_col,
stream,
mr);
});

scatter_scalar_bitmask(source, scatter_iter, scatter_rows, result, stream, mr);
Expand Down
12 changes: 6 additions & 6 deletions cpp/src/copying/shift.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -60,8 +60,7 @@ struct shift_functor {
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
using Type = device_storage_type_t<T>;
using ScalarType = cudf::scalar_type_t<Type>;
using ScalarType = cudf::scalar_type_t<T>;
auto& scalar = static_cast<ScalarType const&>(fill_value);

auto device_input = column_device_view::create(input);
Expand All @@ -88,7 +87,7 @@ struct shift_functor {
output->set_null_count(std::get<1>(mask_pair));
}

auto data = device_output->data<Type>();
auto data = device_output->data<T>();

// avoid assigning elements we know to be invalid.
if (not scalar.is_valid()) {
Expand All @@ -103,7 +102,7 @@ struct shift_functor {
auto func_value =
[size, offset, fill = scalar.data(), input = *device_input] __device__(size_type idx) {
auto src_idx = idx - offset;
return out_of_bounds(size, src_idx) ? *fill : input.element<Type>(src_idx);
return out_of_bounds(size, src_idx) ? *fill : input.element<T>(src_idx);
};

thrust::transform(rmm::exec_policy(stream), index_begin, index_end, data, func_value);
Expand All @@ -128,7 +127,8 @@ std::unique_ptr<column> shift(column_view const& input,

if (input.is_empty()) { return empty_like(input); }

return type_dispatcher(input.type(), shift_functor{}, input, offset, fill_value, stream, mr);
return type_dispatcher<dispatch_storage_type>(
input.type(), shift_functor{}, input, offset, fill_value, stream, mr);
}

} // namespace detail
Expand Down
Loading

0 comments on commit 2234554

Please sign in to comment.