Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rework some code logic to reduce iterator and comparator inlining to improve compile time #12900

Merged
merged 40 commits into from
Mar 27, 2023
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
bd63a2b
Disable inline of row operators for nested column types
davidwendt Mar 7, 2023
1446e08
change attribute(noinline) to noinline
davidwendt Mar 7, 2023
8da8470
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 7, 2023
bcc2a14
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 8, 2023
2453e7d
use transform for row-ops intermediate result
davidwendt Mar 9, 2023
edd2583
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 9, 2023
993c36d
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 9, 2023
47f1332
fix bool logic from count() return
davidwendt Mar 9, 2023
fb50223
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 9, 2023
8cdf813
add transform/count to unique_count
davidwendt Mar 9, 2023
29601dd
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 9, 2023
b842763
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 10, 2023
2a7f2de
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 10, 2023
b77db9d
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 11, 2023
f8fe035
undo no-inline declaration
davidwendt Mar 13, 2023
76445b7
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 13, 2023
9c5ee0c
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 14, 2023
4cca497
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 15, 2023
3ccd0a4
rework group-nunique to use intermediate buffer
davidwendt Mar 16, 2023
f994e53
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 16, 2023
321c4d4
use temp buffer for reduce-by-key calls
davidwendt Mar 16, 2023
7f8ed2c
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 17, 2023
0370e24
cleanup comments
davidwendt Mar 17, 2023
5748ad8
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 17, 2023
76f48da
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 20, 2023
9c3b473
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 21, 2023
e4b7c8e
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 21, 2023
6784431
prefer using counting-iterator over factory call
davidwendt Mar 21, 2023
25912ca
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 21, 2023
7868b0e
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 22, 2023
69da72a
add benchmarks for unique_count
davidwendt Mar 22, 2023
2a4f7de
add comments for new code patterns
davidwendt Mar 22, 2023
c5870e9
fix style violation
davidwendt Mar 22, 2023
03e8835
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 23, 2023
c142598
test cmake change
davidwendt Mar 23, 2023
00f5130
revert temp cmake change
davidwendt Mar 23, 2023
52f8c29
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 23, 2023
80803a1
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 24, 2023
5089cfe
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 24, 2023
bfdf963
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 24, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 11 additions & 6 deletions cpp/src/groupby/sort/group_m2.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -25,10 +25,12 @@
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/iterator/discard_iterator.h>
#include <thrust/reduce.h>
#include <thrust/transform.h>

namespace cudf {
namespace groupby {
Expand Down Expand Up @@ -62,15 +64,18 @@ void compute_m2_fn(column_device_view const& values,
ResultType* d_result,
rmm::cuda_stream_view stream)
{
auto const var_iter = cudf::detail::make_counting_transform_iterator(
size_type{0},
m2_transform<ResultType, decltype(values_iter)>{
values, values_iter, d_means, group_labels.data()});
auto m2_fn = m2_transform<ResultType, decltype(values_iter)>{
values, values_iter, d_means, group_labels.data()};
auto const itr = thrust::counting_iterator<size_type>(0);
// using a temporary buffer for the transform instead of a transform-iterator
// improves compile-time significantly
auto m2_vals = rmm::device_uvector<ResultType>(values.size(), stream);
thrust::transform(rmm::exec_policy(stream), itr, itr + values.size(), m2_vals.begin(), m2_fn);

thrust::reduce_by_key(rmm::exec_policy(stream),
group_labels.begin(),
group_labels.end(),
var_iter,
m2_vals.begin(),
thrust::make_discard_iterator(),
d_result);
}
Expand Down
36 changes: 22 additions & 14 deletions cpp/src/groupby/sort/group_nunique.cu
Original file line number Diff line number Diff line change
Expand Up @@ -94,21 +94,20 @@ std::unique_ptr<column> group_nunique(column_view const& values,

auto const d_values_view = column_device_view::create(values, stream);

auto d_result = rmm::device_uvector<size_type>(group_labels.size(), stream);

auto const comparator_helper = [&](auto const d_equal) {
auto const is_unique_iterator =
thrust::make_transform_iterator(thrust::counting_iterator<cudf::size_type>(0),
is_unique_iterator_fn{nullate::DYNAMIC{values.has_nulls()},
*d_values_view,
d_equal,
null_handling,
group_offsets.data(),
group_labels.data()});
thrust::reduce_by_key(rmm::exec_policy(stream),
group_labels.begin(),
group_labels.end(),
is_unique_iterator,
thrust::make_discard_iterator(),
result->mutable_view().begin<size_type>());
auto fn = is_unique_iterator_fn{nullate::DYNAMIC{values.has_nulls()},
*d_values_view,
d_equal,
null_handling,
group_offsets.data(),
group_labels.data()};
thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(values.size()),
d_result.begin(),
fn);
};

if (cudf::detail::has_nested_columns(values_view)) {
Expand All @@ -121,6 +120,15 @@ std::unique_ptr<column> group_nunique(column_view const& values,
comparator_helper(d_equal);
}

// calling this with a vector instead of a transform iterator is 10x faster to compile;
// it also helps that we are only calling it once for both conditions
thrust::reduce_by_key(rmm::exec_policy(stream),
group_labels.begin(),
group_labels.end(),
d_result.begin(),
thrust::make_discard_iterator(),
result->mutable_view().begin<size_type>());

return result;
}

Expand Down
17 changes: 11 additions & 6 deletions cpp/src/groupby/sort/group_std.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,15 @@
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>
#include <thrust/transform.h>

namespace cudf {
namespace groupby {
Expand All @@ -48,7 +50,7 @@ struct var_transform {
size_type const* d_group_labels;
size_type ddof;

__device__ ResultType operator()(size_type i)
__device__ ResultType operator()(size_type i) const
{
if (d_values.is_null(i)) return 0.0;

Expand All @@ -75,15 +77,18 @@ void reduce_by_key_fn(column_device_view const& values,
ResultType* d_result,
rmm::cuda_stream_view stream)
{
auto var_iter = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
var_transform<ResultType, decltype(values_iter)>{
values, values_iter, d_means, d_group_sizes, group_labels.data(), ddof});
auto var_fn = var_transform<ResultType, decltype(values_iter)>{
values, values_iter, d_means, d_group_sizes, group_labels.data(), ddof};
auto const itr = thrust::make_counting_iterator<size_type>(0);
// using a temporary buffer for the transform instead of a transform-iterator
// improves compile-time significantly
auto vars = rmm::device_uvector<ResultType>(values.size(), stream);
thrust::transform(rmm::exec_policy(stream), itr, itr + values.size(), vars.begin(), var_fn);

thrust::reduce_by_key(rmm::exec_policy(stream),
group_labels.begin(),
group_labels.end(),
var_iter,
vars.begin(),
thrust::make_discard_iterator(),
d_result);
}
Expand Down
26 changes: 18 additions & 8 deletions cpp/src/groupby/sort/sort_helper.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

#include "common_utils.cuh"

#include <stream_compaction/stream_compaction_common.cuh>
davidwendt marked this conversation as resolved.
Show resolved Hide resolved

#include <cudf/column/column_factories.hpp>
#include <cudf/copying.hpp>
#include <cudf/detail/copy.hpp>
Expand Down Expand Up @@ -144,7 +146,8 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets(
{
if (_group_offsets) return *_group_offsets;

_group_offsets = std::make_unique<index_vector>(num_keys(stream) + 1, stream);
auto const size = num_keys(stream);
_group_offsets = std::make_unique<index_vector>(size + 1, stream);

auto const comparator = cudf::experimental::row::equality::self_comparator{_keys, stream};

Expand All @@ -154,23 +157,30 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets(
if (cudf::detail::has_nested_columns(_keys)) {
auto const d_key_equal = comparator.equal_to<true>(
cudf::nullate::DYNAMIC{cudf::has_nested_nulls(_keys)}, null_equality::EQUAL);
result_end = thrust::unique_copy(rmm::exec_policy(stream),
thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(num_keys(stream)),
_group_offsets->begin(),
permuted_row_equality_comparator(d_key_equal, sorted_order));
auto result = rmm::device_uvector<bool>(size, stream);
auto const itr = thrust::make_counting_iterator<size_type>(0);
auto const row_eq = permuted_row_equality_comparator(d_key_equal, sorted_order);
auto const ufn = cudf::detail::unique_copy_fn<decltype(itr), decltype(row_eq)>{
itr, duplicate_keep_option::KEEP_FIRST, row_eq, size - 1};
thrust::transform(rmm::exec_policy(stream), itr, itr + size, result.begin(), ufn);
result_end = thrust::copy_if(rmm::exec_policy(stream),
itr,
itr + size,
result.begin(),
_group_offsets->begin(),
thrust::identity<bool>{});
} else {
auto const d_key_equal = comparator.equal_to<false>(
cudf::nullate::DYNAMIC{cudf::has_nested_nulls(_keys)}, null_equality::EQUAL);
result_end = thrust::unique_copy(rmm::exec_policy(stream),
thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(num_keys(stream)),
thrust::counting_iterator<size_type>(size),
_group_offsets->begin(),
permuted_row_equality_comparator(d_key_equal, sorted_order));
}

size_type num_groups = thrust::distance(_group_offsets->begin(), result_end);
_group_offsets->set_element(num_groups, num_keys(stream), stream);
_group_offsets->set_element(num_groups, size, stream);
_group_offsets->resize(num_groups + 1, stream);

return *_group_offsets;
Expand Down
26 changes: 16 additions & 10 deletions cpp/src/search/contains_scalar.cu
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

#include <thrust/count.h>
#include <thrust/pair.h>
#include <thrust/transform.h>

namespace cudf {
namespace detail {
Expand Down Expand Up @@ -108,16 +109,21 @@ struct contains_scalar_dispatch {
auto const haystack_cdv_ptr = column_device_view::create(haystack, stream);

auto const d_comp = comparator.equal_to<true>(nullate::DYNAMIC{has_nulls});
return thrust::count_if(
rmm::exec_policy(stream),
begin,
end,
[d_comp, check_nulls, d_haystack = *haystack_cdv_ptr] __device__(auto const idx) {
if (check_nulls && d_haystack.is_null_nocheck(static_cast<size_type>(idx))) {
return false;
}
return d_comp(idx, rhs_index_type{0}); // compare haystack[idx] == needle[0].
}) > 0;

auto d_results = rmm::device_uvector<bool>(haystack.size(), stream);
thrust::transform(
rmm::exec_policy(stream),
begin,
end,
d_results.begin(),
[d_comp, check_nulls, d_haystack = *haystack_cdv_ptr] __device__(auto const idx) {
if (check_nulls && d_haystack.is_null_nocheck(static_cast<size_type>(idx))) {
return false;
}
return d_comp(idx, rhs_index_type{0}); // compare haystack[idx] == needle[0].
});

return thrust::count(rmm::exec_policy(stream), d_results.begin(), d_results.end(), true) > 0;
}
};

Expand Down
22 changes: 15 additions & 7 deletions cpp/src/sort/is_sorted.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,15 @@
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/count.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/sort.h>
#include <thrust/transform.h>

namespace cudf {
namespace detail {

auto is_sorted(cudf::table_view const& in,
bool is_sorted(cudf::table_view const& in,
std::vector<order> const& column_order,
std::vector<null_order> const& null_precedence,
rmm::cuda_stream_view stream)
Expand All @@ -44,16 +46,22 @@ auto is_sorted(cudf::table_view const& in,
if (cudf::detail::has_nested_columns(in)) {
auto const device_comparator = comparator.less<true>(has_nested_nulls(in));

return thrust::is_sorted(rmm::exec_policy(stream),
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(in.num_rows()),
device_comparator);
auto d_results = rmm::device_uvector<bool>(in.num_rows(), stream);
thrust::transform(rmm::exec_policy(stream),
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(in.num_rows()),
d_results.begin(),
[device_comparator] __device__(auto idx) -> bool {
return (idx == 0) || device_comparator(idx - 1, idx);
});

return thrust::count(rmm::exec_policy(stream), d_results.begin(), d_results.end(), false) == 0;
} else {
auto const device_comparator = comparator.less<false>(has_nested_nulls(in));

return thrust::is_sorted(rmm::exec_policy(stream),
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(in.num_rows()),
thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(in.num_rows()),
device_comparator);
}
}
Expand Down
70 changes: 39 additions & 31 deletions cpp/src/stream_compaction/unique.cu
Original file line number Diff line number Diff line change
Expand Up @@ -67,38 +67,46 @@ std::unique_ptr<table> unique(table_view const& input,

auto comp = cudf::experimental::row::equality::self_comparator(keys_view, stream);

auto const comparator_helper = [&](auto const row_equal) {
// get indices of unique rows
auto result_end = unique_copy(thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(num_rows),
mutable_view->begin<size_type>(),
row_equal,
keep,
stream);
size_type const unique_size = [&] {
if (cudf::detail::has_nested_columns(keys_view)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this compiles faster for nested types, does it also compile faster for non-nested types? If it's possible to unify these and have a single implementation of the algorithms, I would prefer that (rather than one transform + copy_if for nested types and one unique_copy for non-nested types).

If there are considerations like runtime, memory usage, etc. that warrant two separate implementations, then let's inform the reader with some comments explaining this decision.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does compile faster for non-nested types but the performance impact was too large (20-50% increase) for this path.

auto row_equal =
comp.equal_to<true>(nullate::DYNAMIC{has_nested_nulls(keys_view)}, nulls_equal);
auto d_results = rmm::device_uvector<bool>(num_rows, stream);
auto itr = thrust::make_counting_iterator<size_type>(0);
thrust::transform(
rmm::exec_policy(stream),
itr,
itr + num_rows,
d_results.begin(),
unique_copy_fn<decltype(itr), decltype(row_equal)>{itr, keep, row_equal, num_rows - 1});
auto result_end = thrust::copy_if(rmm::exec_policy(stream),
itr,
itr + num_rows,
d_results.begin(),
mutable_view->begin<size_type>(),
thrust::identity<bool>{});
return static_cast<size_type>(thrust::distance(mutable_view->begin<size_type>(), result_end));
} else {
auto row_equal =
comp.equal_to<false>(nullate::DYNAMIC{has_nested_nulls(keys_view)}, nulls_equal);
auto result_end = unique_copy(thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(num_rows),
mutable_view->begin<size_type>(),
row_equal,
keep,
stream);
return static_cast<size_type>(thrust::distance(mutable_view->begin<size_type>(), result_end));
}
}();
auto indices_view = cudf::detail::slice(column_view(*unique_indices), 0, unique_size);

auto indices_view =
cudf::detail::slice(column_view(*unique_indices),
0,
thrust::distance(mutable_view->begin<size_type>(), result_end));

// gather unique rows and return
return detail::gather(input,
indices_view,
out_of_bounds_policy::DONT_CHECK,
detail::negative_index_policy::NOT_ALLOWED,
stream,
mr);
};

if (cudf::detail::has_nested_columns(keys_view)) {
auto row_equal =
comp.equal_to<true>(nullate::DYNAMIC{has_nested_nulls(keys_view)}, nulls_equal);
return comparator_helper(row_equal);
} else {
auto row_equal =
comp.equal_to<false>(nullate::DYNAMIC{has_nested_nulls(keys_view)}, nulls_equal);
return comparator_helper(row_equal);
}
// gather unique rows and return
return detail::gather(input,
indices_view,
out_of_bounds_policy::DONT_CHECK,
detail::negative_index_policy::NOT_ALLOWED,
stream,
mr);
}
} // namespace detail

Expand Down
16 changes: 11 additions & 5 deletions cpp/src/stream_compaction/unique_count.cu
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include <thrust/execution_policy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/logical.h>
#include <thrust/transform.h>

#include <cmath>
#include <cstddef>
Expand Down Expand Up @@ -76,11 +77,16 @@ cudf::size_type unique_count(table_view const& keys,
if (cudf::detail::has_nested_columns(keys)) {
auto const comp =
row_comp.equal_to<true>(nullate::DYNAMIC{has_nested_nulls(keys)}, nulls_equal);
return thrust::count_if(
rmm::exec_policy(stream),
thrust::counting_iterator<cudf::size_type>(0),
thrust::counting_iterator<cudf::size_type>(keys.num_rows()),
[comp] __device__(cudf::size_type i) { return (i == 0 or not comp(i, i - 1)); });

auto d_results = rmm::device_uvector<bool>(keys.num_rows(), stream);
thrust::transform(rmm::exec_policy(stream),
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(keys.num_rows()),
d_results.begin(),
[comp] __device__(auto i) { return (i == 0 or not comp(i, i - 1)); });

return static_cast<size_type>(
thrust::count(rmm::exec_policy(stream), d_results.begin(), d_results.end(), true));
} else {
auto const comp =
row_comp.equal_to<false>(nullate::DYNAMIC{has_nested_nulls(keys)}, nulls_equal);
Expand Down