Skip to content

Commit

Permalink
Rework some code logic to reduce iterator and comparator inlining to …
Browse files Browse the repository at this point in the history
…improve compile time (#12900)

Disables inlining the device code logic for the row operators for nested column types did not work as hoped.
Some files took longer to compile and some functions ran 20% slower for large rows.

Reworking individual source files to break up the code logic into multiple kernels seems to work well for compile time while having a smaller effect on performance. The goal is to only rework the nested column code paths.
Here are some source files that have compile time issues and are improved in this PR.

| source file  | current | PR |
|:--- | ---:| ---:|
|   stream_compaction/unique_count.cu | 18 min |  13 min |
|   groupby/sort/group_nunique.cu   | 16 min |  2 min  |
|   stream_compaction/unique.cu  | 16 min | 5 min |
|   groupby/sort/sort_helper.cu | 10 min | 6.5 min |
|   search/contains_scalar.cu | 12 min | 4.7 min  |
|   sort/is_sorted.cu | 9 min | 7 min |
|   groupby/sort/group_std.cu | 7 min | 1.2 min |
|   groupby/sort/group_m2.cu | 6 min | 1.2 min  |

Available benchmarks showed minimal impact to performance.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: #12900
  • Loading branch information
davidwendt authored Mar 27, 2023
1 parent 698fcf6 commit 12dc130
Show file tree
Hide file tree
Showing 10 changed files with 217 additions and 87 deletions.
1 change: 1 addition & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp
# * stream_compaction benchmark -------------------------------------------------------------------
ConfigureNVBench(
STREAM_COMPACTION_NVBENCH stream_compaction/distinct.cpp stream_compaction/unique.cpp
stream_compaction/unique_count.cpp
)

# ##################################################################################################
Expand Down
53 changes: 53 additions & 0 deletions cpp/benchmarks/stream_compaction/unique_count.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/rmm_pool_raii.hpp>

#include <cudf/column/column_view.hpp>
#include <cudf/sorting.hpp>
#include <cudf/stream_compaction.hpp>
#include <cudf/types.hpp>

#include <nvbench/nvbench.cuh>

template <typename Type>
void nvbench_unique_count(nvbench::state& state, nvbench::type_list<Type>)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("NumRows"));
auto const nulls = state.get_float64("NullProbability");

data_profile profile = data_profile_builder().cardinality(0).null_probability(nulls).distribution(
cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, num_rows / 100);

auto source_column = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);
auto sorted_table = cudf::sort(cudf::table_view({source_column->view()}));

auto input = sorted_table->view();

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
cudf::unique_count(input, cudf::null_equality::EQUAL);
});
}

using data_type = nvbench::type_list<bool, int8_t, int32_t, int64_t, float, cudf::timestamp_ms>;

NVBENCH_BENCH_TYPES(nvbench_unique_count, NVBENCH_TYPE_AXES(data_type))
.set_name("unique_count")
.set_type_axes_names({"Type"})
.add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000})
.add_float64_axis("NullProbability", {0.0, 0.1});
18 changes: 12 additions & 6 deletions cpp/src/groupby/sort/group_m2.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -25,10 +25,12 @@
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/iterator/discard_iterator.h>
#include <thrust/reduce.h>
#include <thrust/transform.h>

namespace cudf {
namespace groupby {
Expand Down Expand Up @@ -62,15 +64,19 @@ void compute_m2_fn(column_device_view const& values,
ResultType* d_result,
rmm::cuda_stream_view stream)
{
auto const var_iter = cudf::detail::make_counting_transform_iterator(
size_type{0},
m2_transform<ResultType, decltype(values_iter)>{
values, values_iter, d_means, group_labels.data()});
auto m2_fn = m2_transform<ResultType, decltype(values_iter)>{
values, values_iter, d_means, group_labels.data()};
auto const itr = thrust::counting_iterator<size_type>(0);
// Using a temporary buffer for intermediate transform results instead of
// using the transform-iterator directly in thrust::reduce_by_key
// improves compile-time significantly.
auto m2_vals = rmm::device_uvector<ResultType>(values.size(), stream);
thrust::transform(rmm::exec_policy(stream), itr, itr + values.size(), m2_vals.begin(), m2_fn);

thrust::reduce_by_key(rmm::exec_policy(stream),
group_labels.begin(),
group_labels.end(),
var_iter,
m2_vals.begin(),
thrust::make_discard_iterator(),
d_result);
}
Expand Down
36 changes: 22 additions & 14 deletions cpp/src/groupby/sort/group_nunique.cu
Original file line number Diff line number Diff line change
Expand Up @@ -94,21 +94,20 @@ std::unique_ptr<column> group_nunique(column_view const& values,

auto const d_values_view = column_device_view::create(values, stream);

auto d_result = rmm::device_uvector<size_type>(group_labels.size(), stream);

auto const comparator_helper = [&](auto const d_equal) {
auto const is_unique_iterator =
thrust::make_transform_iterator(thrust::counting_iterator<cudf::size_type>(0),
is_unique_iterator_fn{nullate::DYNAMIC{values.has_nulls()},
*d_values_view,
d_equal,
null_handling,
group_offsets.data(),
group_labels.data()});
thrust::reduce_by_key(rmm::exec_policy(stream),
group_labels.begin(),
group_labels.end(),
is_unique_iterator,
thrust::make_discard_iterator(),
result->mutable_view().begin<size_type>());
auto fn = is_unique_iterator_fn{nullate::DYNAMIC{values.has_nulls()},
*d_values_view,
d_equal,
null_handling,
group_offsets.data(),
group_labels.data()};
thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(values.size()),
d_result.begin(),
fn);
};

if (cudf::detail::has_nested_columns(values_view)) {
Expand All @@ -121,6 +120,15 @@ std::unique_ptr<column> group_nunique(column_view const& values,
comparator_helper(d_equal);
}

// calling this with a vector instead of a transform iterator is 10x faster to compile;
// it also helps that we are only calling it once for both conditions
thrust::reduce_by_key(rmm::exec_policy(stream),
group_labels.begin(),
group_labels.end(),
d_result.begin(),
thrust::make_discard_iterator(),
result->mutable_view().begin<size_type>());

return result;
}

Expand Down
18 changes: 12 additions & 6 deletions cpp/src/groupby/sort/group_std.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,15 @@
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>
#include <thrust/transform.h>

namespace cudf {
namespace groupby {
Expand All @@ -48,7 +50,7 @@ struct var_transform {
size_type const* d_group_labels;
size_type ddof;

__device__ ResultType operator()(size_type i)
__device__ ResultType operator()(size_type i) const
{
if (d_values.is_null(i)) return 0.0;

Expand All @@ -75,15 +77,19 @@ void reduce_by_key_fn(column_device_view const& values,
ResultType* d_result,
rmm::cuda_stream_view stream)
{
auto var_iter = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
var_transform<ResultType, decltype(values_iter)>{
values, values_iter, d_means, d_group_sizes, group_labels.data(), ddof});
auto var_fn = var_transform<ResultType, decltype(values_iter)>{
values, values_iter, d_means, d_group_sizes, group_labels.data(), ddof};
auto const itr = thrust::make_counting_iterator<size_type>(0);
// Using a temporary buffer for intermediate transform results instead of
// using the transform-iterator directly in thrust::reduce_by_key
// improves compile-time significantly.
auto vars = rmm::device_uvector<ResultType>(values.size(), stream);
thrust::transform(rmm::exec_policy(stream), itr, itr + values.size(), vars.begin(), var_fn);

thrust::reduce_by_key(rmm::exec_policy(stream),
group_labels.begin(),
group_labels.end(),
var_iter,
vars.begin(),
thrust::make_discard_iterator(),
d_result);
}
Expand Down
29 changes: 21 additions & 8 deletions cpp/src/groupby/sort/sort_helper.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

#include "common_utils.cuh"

#include <stream_compaction/stream_compaction_common.cuh>

#include <cudf/column/column_factories.hpp>
#include <cudf/copying.hpp>
#include <cudf/detail/copy.hpp>
Expand Down Expand Up @@ -144,7 +146,8 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets(
{
if (_group_offsets) return *_group_offsets;

_group_offsets = std::make_unique<index_vector>(num_keys(stream) + 1, stream);
auto const size = num_keys(stream);
_group_offsets = std::make_unique<index_vector>(size + 1, stream);

auto const comparator = cudf::experimental::row::equality::self_comparator{_keys, stream};

Expand All @@ -154,23 +157,33 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets(
if (cudf::detail::has_nested_columns(_keys)) {
auto const d_key_equal = comparator.equal_to<true>(
cudf::nullate::DYNAMIC{cudf::has_nested_nulls(_keys)}, null_equality::EQUAL);
result_end = thrust::unique_copy(rmm::exec_policy(stream),
thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(num_keys(stream)),
_group_offsets->begin(),
permuted_row_equality_comparator(d_key_equal, sorted_order));
// Using a temporary buffer for intermediate transform results from the iterator containing
// the comparator speeds up compile-time significantly without much degradation in
// runtime performance over using the comparator directly in thrust::unique_copy.
auto result = rmm::device_uvector<bool>(size, stream);
auto const itr = thrust::make_counting_iterator<size_type>(0);
auto const row_eq = permuted_row_equality_comparator(d_key_equal, sorted_order);
auto const ufn = cudf::detail::unique_copy_fn<decltype(itr), decltype(row_eq)>{
itr, duplicate_keep_option::KEEP_FIRST, row_eq, size - 1};
thrust::transform(rmm::exec_policy(stream), itr, itr + size, result.begin(), ufn);
result_end = thrust::copy_if(rmm::exec_policy(stream),
itr,
itr + size,
result.begin(),
_group_offsets->begin(),
thrust::identity<bool>{});
} else {
auto const d_key_equal = comparator.equal_to<false>(
cudf::nullate::DYNAMIC{cudf::has_nested_nulls(_keys)}, null_equality::EQUAL);
result_end = thrust::unique_copy(rmm::exec_policy(stream),
thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(num_keys(stream)),
thrust::counting_iterator<size_type>(size),
_group_offsets->begin(),
permuted_row_equality_comparator(d_key_equal, sorted_order));
}

size_type num_groups = thrust::distance(_group_offsets->begin(), result_end);
_group_offsets->set_element(num_groups, num_keys(stream), stream);
_group_offsets->set_element(num_groups, size, stream);
_group_offsets->resize(num_groups + 1, stream);

return *_group_offsets;
Expand Down
29 changes: 19 additions & 10 deletions cpp/src/search/contains_scalar.cu
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

#include <thrust/count.h>
#include <thrust/pair.h>
#include <thrust/transform.h>

namespace cudf {
namespace detail {
Expand Down Expand Up @@ -108,16 +109,24 @@ struct contains_scalar_dispatch {
auto const haystack_cdv_ptr = column_device_view::create(haystack, stream);

auto const d_comp = comparator.equal_to<true>(nullate::DYNAMIC{has_nulls});
return thrust::count_if(
rmm::exec_policy(stream),
begin,
end,
[d_comp, check_nulls, d_haystack = *haystack_cdv_ptr] __device__(auto const idx) {
if (check_nulls && d_haystack.is_null_nocheck(static_cast<size_type>(idx))) {
return false;
}
return d_comp(idx, rhs_index_type{0}); // compare haystack[idx] == needle[0].
}) > 0;

// Using a temporary buffer for intermediate transform results from the lambda containing
// the comparator speeds up compile-time significantly without much degradation in
// runtime performance over using the comparator in a transform iterator with thrust::count_if.
auto d_results = rmm::device_uvector<bool>(haystack.size(), stream);
thrust::transform(
rmm::exec_policy(stream),
begin,
end,
d_results.begin(),
[d_comp, check_nulls, d_haystack = *haystack_cdv_ptr] __device__(auto const idx) {
if (check_nulls && d_haystack.is_null_nocheck(static_cast<size_type>(idx))) {
return false;
}
return d_comp(idx, rhs_index_type{0}); // compare haystack[idx] == needle[0].
});

return thrust::count(rmm::exec_policy(stream), d_results.begin(), d_results.end(), true) > 0;
}
};

Expand Down
25 changes: 18 additions & 7 deletions cpp/src/sort/is_sorted.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,15 @@
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/count.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/sort.h>
#include <thrust/transform.h>

namespace cudf {
namespace detail {

auto is_sorted(cudf::table_view const& in,
bool is_sorted(cudf::table_view const& in,
std::vector<order> const& column_order,
std::vector<null_order> const& null_precedence,
rmm::cuda_stream_view stream)
Expand All @@ -44,16 +46,25 @@ auto is_sorted(cudf::table_view const& in,
if (cudf::detail::has_nested_columns(in)) {
auto const device_comparator = comparator.less<true>(has_nested_nulls(in));

return thrust::is_sorted(rmm::exec_policy(stream),
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(in.num_rows()),
device_comparator);
// Using a temporary buffer for intermediate transform results from the lambda containing
// the comparator speeds up compile-time significantly over using the comparator directly
// in thrust::is_sorted.
auto d_results = rmm::device_uvector<bool>(in.num_rows(), stream);
thrust::transform(rmm::exec_policy(stream),
thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(in.num_rows()),
d_results.begin(),
[device_comparator] __device__(auto idx) -> bool {
return (idx == 0) || device_comparator(idx - 1, idx);
});

return thrust::count(rmm::exec_policy(stream), d_results.begin(), d_results.end(), false) == 0;
} else {
auto const device_comparator = comparator.less<false>(has_nested_nulls(in));

return thrust::is_sorted(rmm::exec_policy(stream),
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(in.num_rows()),
thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(in.num_rows()),
device_comparator);
}
}
Expand Down
Loading

0 comments on commit 12dc130

Please sign in to comment.