Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rework some code logic to reduce iterator and comparator inlining to improve compile time #12900

Merged
merged 40 commits into from
Mar 27, 2023
Merged
Show file tree
Hide file tree
Changes from 37 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
bd63a2b
Disable inline of row operators for nested column types
davidwendt Mar 7, 2023
1446e08
change attribute(noinline) to noinline
davidwendt Mar 7, 2023
8da8470
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 7, 2023
bcc2a14
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 8, 2023
2453e7d
use transform for row-ops intermediate result
davidwendt Mar 9, 2023
edd2583
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 9, 2023
993c36d
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 9, 2023
47f1332
fix bool logic from count() return
davidwendt Mar 9, 2023
fb50223
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 9, 2023
8cdf813
add transform/count to unique_count
davidwendt Mar 9, 2023
29601dd
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 9, 2023
b842763
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 10, 2023
2a7f2de
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 10, 2023
b77db9d
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 11, 2023
f8fe035
undo no-inline declaration
davidwendt Mar 13, 2023
76445b7
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 13, 2023
9c5ee0c
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 14, 2023
4cca497
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 15, 2023
3ccd0a4
rework group-nunique to use intermediate buffer
davidwendt Mar 16, 2023
f994e53
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 16, 2023
321c4d4
use temp buffer for reduce-by-key calls
davidwendt Mar 16, 2023
7f8ed2c
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 17, 2023
0370e24
cleanup comments
davidwendt Mar 17, 2023
5748ad8
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 17, 2023
76f48da
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 20, 2023
9c3b473
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 21, 2023
e4b7c8e
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 21, 2023
6784431
prefer using counting-iterator over factory call
davidwendt Mar 21, 2023
25912ca
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 21, 2023
7868b0e
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 22, 2023
69da72a
add benchmarks for unique_count
davidwendt Mar 22, 2023
2a4f7de
add comments for new code patterns
davidwendt Mar 22, 2023
c5870e9
fix style violation
davidwendt Mar 22, 2023
03e8835
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 23, 2023
c142598
test cmake change
davidwendt Mar 23, 2023
00f5130
revert temp cmake change
davidwendt Mar 23, 2023
52f8c29
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 23, 2023
80803a1
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 24, 2023
5089cfe
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 24, 2023
bfdf963
Merge branch 'branch-23.04' into row-ops-no-inline
davidwendt Mar 24, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp
# * stream_compaction benchmark -------------------------------------------------------------------
ConfigureNVBench(
STREAM_COMPACTION_NVBENCH stream_compaction/distinct.cpp stream_compaction/unique.cpp
stream_compaction/unique_count.cpp
)

# ##################################################################################################
Expand Down
53 changes: 53 additions & 0 deletions cpp/benchmarks/stream_compaction/unique_count.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/rmm_pool_raii.hpp>

#include <cudf/column/column_view.hpp>
#include <cudf/sorting.hpp>
#include <cudf/stream_compaction.hpp>
#include <cudf/types.hpp>

#include <nvbench/nvbench.cuh>

template <typename Type>
void nvbench_unique_count(nvbench::state& state, nvbench::type_list<Type>)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("NumRows"));
auto const nulls = state.get_float64("NullProbability");

data_profile profile = data_profile_builder().cardinality(0).null_probability(nulls).distribution(
cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, num_rows / 100);

auto source_column = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);
auto sorted_table = cudf::sort(cudf::table_view({source_column->view()}));

auto input = sorted_table->view();

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
cudf::unique_count(input, cudf::null_equality::EQUAL);
});
}

using data_type = nvbench::type_list<bool, int8_t, int32_t, int64_t, float, cudf::timestamp_ms>;

NVBENCH_BENCH_TYPES(nvbench_unique_count, NVBENCH_TYPE_AXES(data_type))
.set_name("unique_count")
.set_type_axes_names({"Type"})
.add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000})
.add_float64_axis("NullProbability", {0.0, 0.1});
18 changes: 12 additions & 6 deletions cpp/src/groupby/sort/group_m2.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -25,10 +25,12 @@
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/iterator/discard_iterator.h>
#include <thrust/reduce.h>
#include <thrust/transform.h>

namespace cudf {
namespace groupby {
Expand Down Expand Up @@ -62,15 +64,19 @@ void compute_m2_fn(column_device_view const& values,
ResultType* d_result,
rmm::cuda_stream_view stream)
{
auto const var_iter = cudf::detail::make_counting_transform_iterator(
size_type{0},
m2_transform<ResultType, decltype(values_iter)>{
values, values_iter, d_means, group_labels.data()});
auto m2_fn = m2_transform<ResultType, decltype(values_iter)>{
values, values_iter, d_means, group_labels.data()};
auto const itr = thrust::counting_iterator<size_type>(0);
// Using a temporary buffer for intermediate transform results instead of
// using the transform-iterator directly in thrust::reduce_by_key
// improves compile-time significantly.
auto m2_vals = rmm::device_uvector<ResultType>(values.size(), stream);
thrust::transform(rmm::exec_policy(stream), itr, itr + values.size(), m2_vals.begin(), m2_fn);

thrust::reduce_by_key(rmm::exec_policy(stream),
group_labels.begin(),
group_labels.end(),
var_iter,
m2_vals.begin(),
thrust::make_discard_iterator(),
d_result);
}
Expand Down
36 changes: 22 additions & 14 deletions cpp/src/groupby/sort/group_nunique.cu
Original file line number Diff line number Diff line change
Expand Up @@ -94,21 +94,20 @@ std::unique_ptr<column> group_nunique(column_view const& values,

auto const d_values_view = column_device_view::create(values, stream);

auto d_result = rmm::device_uvector<size_type>(group_labels.size(), stream);

auto const comparator_helper = [&](auto const d_equal) {
auto const is_unique_iterator =
thrust::make_transform_iterator(thrust::counting_iterator<cudf::size_type>(0),
is_unique_iterator_fn{nullate::DYNAMIC{values.has_nulls()},
*d_values_view,
d_equal,
null_handling,
group_offsets.data(),
group_labels.data()});
thrust::reduce_by_key(rmm::exec_policy(stream),
group_labels.begin(),
group_labels.end(),
is_unique_iterator,
thrust::make_discard_iterator(),
result->mutable_view().begin<size_type>());
auto fn = is_unique_iterator_fn{nullate::DYNAMIC{values.has_nulls()},
*d_values_view,
d_equal,
null_handling,
group_offsets.data(),
group_labels.data()};
thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(values.size()),
d_result.begin(),
fn);
};

if (cudf::detail::has_nested_columns(values_view)) {
Expand All @@ -121,6 +120,15 @@ std::unique_ptr<column> group_nunique(column_view const& values,
comparator_helper(d_equal);
}

// calling this with a vector instead of a transform iterator is 10x faster to compile;
// it also helps that we are only calling it once for both conditions
thrust::reduce_by_key(rmm::exec_policy(stream),
group_labels.begin(),
group_labels.end(),
d_result.begin(),
thrust::make_discard_iterator(),
result->mutable_view().begin<size_type>());

return result;
}

Expand Down
18 changes: 12 additions & 6 deletions cpp/src/groupby/sort/group_std.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,15 @@
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>
#include <thrust/transform.h>

namespace cudf {
namespace groupby {
Expand All @@ -48,7 +50,7 @@ struct var_transform {
size_type const* d_group_labels;
size_type ddof;

__device__ ResultType operator()(size_type i)
__device__ ResultType operator()(size_type i) const
{
if (d_values.is_null(i)) return 0.0;

Expand All @@ -75,15 +77,19 @@ void reduce_by_key_fn(column_device_view const& values,
ResultType* d_result,
rmm::cuda_stream_view stream)
{
auto var_iter = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
var_transform<ResultType, decltype(values_iter)>{
values, values_iter, d_means, d_group_sizes, group_labels.data(), ddof});
auto var_fn = var_transform<ResultType, decltype(values_iter)>{
values, values_iter, d_means, d_group_sizes, group_labels.data(), ddof};
auto const itr = thrust::make_counting_iterator<size_type>(0);
// Using a temporary buffer for intermediate transform results instead of
// using the transform-iterator directly in thrust::reduce_by_key
// improves compile-time significantly.
auto vars = rmm::device_uvector<ResultType>(values.size(), stream);
thrust::transform(rmm::exec_policy(stream), itr, itr + values.size(), vars.begin(), var_fn);

thrust::reduce_by_key(rmm::exec_policy(stream),
group_labels.begin(),
group_labels.end(),
var_iter,
vars.begin(),
thrust::make_discard_iterator(),
d_result);
}
Expand Down
29 changes: 21 additions & 8 deletions cpp/src/groupby/sort/sort_helper.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

#include "common_utils.cuh"

#include <stream_compaction/stream_compaction_common.cuh>
davidwendt marked this conversation as resolved.
Show resolved Hide resolved

#include <cudf/column/column_factories.hpp>
#include <cudf/copying.hpp>
#include <cudf/detail/copy.hpp>
Expand Down Expand Up @@ -144,7 +146,8 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets(
{
if (_group_offsets) return *_group_offsets;

_group_offsets = std::make_unique<index_vector>(num_keys(stream) + 1, stream);
auto const size = num_keys(stream);
_group_offsets = std::make_unique<index_vector>(size + 1, stream);

auto const comparator = cudf::experimental::row::equality::self_comparator{_keys, stream};

Expand All @@ -154,23 +157,33 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets(
if (cudf::detail::has_nested_columns(_keys)) {
auto const d_key_equal = comparator.equal_to<true>(
cudf::nullate::DYNAMIC{cudf::has_nested_nulls(_keys)}, null_equality::EQUAL);
result_end = thrust::unique_copy(rmm::exec_policy(stream),
thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(num_keys(stream)),
_group_offsets->begin(),
permuted_row_equality_comparator(d_key_equal, sorted_order));
// Using a temporary buffer for intermediate transform results from the iterator containing
// the comparator speeds up compile-time significantly without much degradation in
// runtime performance over using the comparator directly in thrust::unique_copy.
auto result = rmm::device_uvector<bool>(size, stream);
auto const itr = thrust::make_counting_iterator<size_type>(0);
auto const row_eq = permuted_row_equality_comparator(d_key_equal, sorted_order);
auto const ufn = cudf::detail::unique_copy_fn<decltype(itr), decltype(row_eq)>{
itr, duplicate_keep_option::KEEP_FIRST, row_eq, size - 1};
thrust::transform(rmm::exec_policy(stream), itr, itr + size, result.begin(), ufn);
result_end = thrust::copy_if(rmm::exec_policy(stream),
itr,
itr + size,
result.begin(),
_group_offsets->begin(),
thrust::identity<bool>{});
} else {
auto const d_key_equal = comparator.equal_to<false>(
cudf::nullate::DYNAMIC{cudf::has_nested_nulls(_keys)}, null_equality::EQUAL);
result_end = thrust::unique_copy(rmm::exec_policy(stream),
thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(num_keys(stream)),
thrust::counting_iterator<size_type>(size),
_group_offsets->begin(),
permuted_row_equality_comparator(d_key_equal, sorted_order));
}

size_type num_groups = thrust::distance(_group_offsets->begin(), result_end);
_group_offsets->set_element(num_groups, num_keys(stream), stream);
_group_offsets->set_element(num_groups, size, stream);
_group_offsets->resize(num_groups + 1, stream);

return *_group_offsets;
Expand Down
29 changes: 19 additions & 10 deletions cpp/src/search/contains_scalar.cu
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

#include <thrust/count.h>
#include <thrust/pair.h>
#include <thrust/transform.h>

namespace cudf {
namespace detail {
Expand Down Expand Up @@ -108,16 +109,24 @@ struct contains_scalar_dispatch {
auto const haystack_cdv_ptr = column_device_view::create(haystack, stream);

auto const d_comp = comparator.equal_to<true>(nullate::DYNAMIC{has_nulls});
return thrust::count_if(
rmm::exec_policy(stream),
begin,
end,
[d_comp, check_nulls, d_haystack = *haystack_cdv_ptr] __device__(auto const idx) {
if (check_nulls && d_haystack.is_null_nocheck(static_cast<size_type>(idx))) {
return false;
}
return d_comp(idx, rhs_index_type{0}); // compare haystack[idx] == needle[0].
}) > 0;

// Using a temporary buffer for intermediate transform results from the lambda containing
// the comparator speeds up compile-time significantly without much degradation in
// runtime performance over using the comparator in a transform iterator with thrust::count_if.
auto d_results = rmm::device_uvector<bool>(haystack.size(), stream);
thrust::transform(
rmm::exec_policy(stream),
begin,
end,
d_results.begin(),
[d_comp, check_nulls, d_haystack = *haystack_cdv_ptr] __device__(auto const idx) {
if (check_nulls && d_haystack.is_null_nocheck(static_cast<size_type>(idx))) {
return false;
}
return d_comp(idx, rhs_index_type{0}); // compare haystack[idx] == needle[0].
});

return thrust::count(rmm::exec_policy(stream), d_results.begin(), d_results.end(), true) > 0;
}
};

Expand Down
25 changes: 18 additions & 7 deletions cpp/src/sort/is_sorted.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,15 @@
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/count.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/sort.h>
#include <thrust/transform.h>

namespace cudf {
namespace detail {

auto is_sorted(cudf::table_view const& in,
bool is_sorted(cudf::table_view const& in,
std::vector<order> const& column_order,
std::vector<null_order> const& null_precedence,
rmm::cuda_stream_view stream)
Expand All @@ -44,16 +46,25 @@ auto is_sorted(cudf::table_view const& in,
if (cudf::detail::has_nested_columns(in)) {
auto const device_comparator = comparator.less<true>(has_nested_nulls(in));

return thrust::is_sorted(rmm::exec_policy(stream),
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(in.num_rows()),
device_comparator);
// Using a temporary buffer for intermediate transform results from the lambda containing
// the comparator speeds up compile-time significantly over using the comparator directly
// in thrust::is_sorted.
auto d_results = rmm::device_uvector<bool>(in.num_rows(), stream);
thrust::transform(rmm::exec_policy(stream),
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(in.num_rows()),
d_results.begin(),
[device_comparator] __device__(auto idx) -> bool {
return (idx == 0) || device_comparator(idx - 1, idx);
});

return thrust::count(rmm::exec_policy(stream), d_results.begin(), d_results.end(), false) == 0;
} else {
auto const device_comparator = comparator.less<false>(has_nested_nulls(in));

return thrust::is_sorted(rmm::exec_policy(stream),
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(in.num_rows()),
thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(in.num_rows()),
device_comparator);
}
}
Expand Down
Loading