diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 073f0d62c0a..6fe9e2616e1 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -182,6 +182,7 @@ ConfigureBench(TYPE_DISPATCHER_BENCH "${TD_BENCH_SRC}") set(REDUCTION_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/reduction/anyall_benchmark.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/reduction/dictionary_benchmark.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/reduction/reduce_benchmark.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/reduction/minmax_benchmark.cpp") diff --git a/cpp/benchmarks/reduction/anyall_benchmark.cpp b/cpp/benchmarks/reduction/anyall_benchmark.cpp index 72379725abf..97d66585f8c 100644 --- a/cpp/benchmarks/reduction/anyall_benchmark.cpp +++ b/cpp/benchmarks/reduction/anyall_benchmark.cpp @@ -22,7 +22,6 @@ #include #include -#include #include class Reduction : public cudf::benchmark { @@ -33,18 +32,18 @@ void BM_reduction_anyall(benchmark::State& state, std::unique_ptr(state.range(0))}; - cudf::test::UniformRandomGenerator rand_gen(0, 100); + cudf::test::UniformRandomGenerator rand_gen( + (agg->kind == cudf::aggregation::ALL ? 1 : 0), (agg->kind == cudf::aggregation::ANY ? 0 : 100)); auto data_it = cudf::detail::make_counting_transform_iterator( 0, [&rand_gen](cudf::size_type row) { return rand_gen.generate(); }); cudf::test::fixed_width_column_wrapper values( data_it, data_it + column_size); - auto input_column = cudf::column_view(values); cudf::data_type output_dtype{cudf::type_id::BOOL8}; for (auto _ : state) { cuda_event_timer timer(state, true); - auto result = cudf::reduce(input_column, agg, output_dtype); + auto result = cudf::reduce(values, agg, output_dtype); } } diff --git a/cpp/benchmarks/reduction/dictionary_benchmark.cpp b/cpp/benchmarks/reduction/dictionary_benchmark.cpp new file mode 100644 index 00000000000..3622b36eb66 --- /dev/null +++ b/cpp/benchmarks/reduction/dictionary_benchmark.cpp @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +class ReductionDictionary : public cudf::benchmark { +}; + +template +void BM_reduction_dictionary(benchmark::State& state, std::unique_ptr const& agg) +{ + const cudf::size_type column_size{static_cast(state.range(0))}; + + cudf::test::UniformRandomGenerator rand_gen( + (agg->kind == cudf::aggregation::ALL ? 1 : 0), (agg->kind == cudf::aggregation::ANY ? 0 : 100)); + auto data_it = cudf::detail::make_counting_transform_iterator( + 0, [&rand_gen](cudf::size_type row) { return rand_gen.generate(); }); + cudf::test::dictionary_column_wrapper values( + data_it, data_it + column_size); + + cudf::data_type output_dtype = [&] { + if (agg->kind == cudf::aggregation::ANY || agg->kind == cudf::aggregation::ALL) + return cudf::data_type{cudf::type_id::BOOL8}; + if (agg->kind == cudf::aggregation::MEAN) return cudf::data_type{cudf::type_id::FLOAT64}; + return cudf::data_type{cudf::type_to_id()}; + }(); + + for (auto _ : state) { + cuda_event_timer timer(state, true); + auto result = cudf::reduce(values, agg, output_dtype); + } +} + +#define concat(a, b, c) a##b##c +#define get_agg(op) concat(cudf::make_, op, _aggregation()) + +// TYPE, OP +#define RBM_BENCHMARK_DEFINE(name, type, aggregation) \ + BENCHMARK_DEFINE_F(ReductionDictionary, name)(::benchmark::State & state) \ + { \ + BM_reduction_dictionary(state, get_agg(aggregation)); \ + } \ + BENCHMARK_REGISTER_F(ReductionDictionary, name) \ + ->UseManualTime() \ + ->Arg(10000) /* 10k */ \ + ->Arg(100000) /* 100k */ \ + ->Arg(1000000) /* 1M */ \ + ->Arg(10000000) /* 10M */ \ + ->Arg(100000000); /* 100M */ + +#define REDUCE_BENCHMARK_DEFINE(type, aggregation) \ + RBM_BENCHMARK_DEFINE(concat(type, _, aggregation), type, aggregation) + +REDUCE_BENCHMARK_DEFINE(int32_t, all); +REDUCE_BENCHMARK_DEFINE(float, all); +REDUCE_BENCHMARK_DEFINE(int32_t, any); +REDUCE_BENCHMARK_DEFINE(float, any); +REDUCE_BENCHMARK_DEFINE(int32_t, min); +REDUCE_BENCHMARK_DEFINE(float, min); +REDUCE_BENCHMARK_DEFINE(int32_t, max); +REDUCE_BENCHMARK_DEFINE(float, max); +REDUCE_BENCHMARK_DEFINE(int32_t, mean); +REDUCE_BENCHMARK_DEFINE(float, mean); diff --git a/cpp/include/cudf/dictionary/detail/iterator.cuh b/cpp/include/cudf/dictionary/detail/iterator.cuh index 88563f2334b..5c9abfa69ce 100644 --- a/cpp/include/cudf/dictionary/detail/iterator.cuh +++ b/cpp/include/cudf/dictionary/detail/iterator.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -66,13 +66,13 @@ auto make_dictionary_iterator(column_device_view const& dictionary_column) * @brief Accessor functor for returning a dictionary pair iterator. * * @tparam KeyType The type of the dictionary's key element. - * @tparam has_nulls Set to `true` if `d_dictionary` has nulls. * * @throw cudf::logic_error if `has_nulls==true` and `d_dictionary` is not nullable. */ -template +template struct dictionary_access_pair_fn { - dictionary_access_pair_fn(column_device_view const& d_dictionary) : d_dictionary{d_dictionary} + dictionary_access_pair_fn(column_device_view const& d_dictionary, bool has_nulls = true) + : d_dictionary{d_dictionary}, has_nulls{has_nulls} { if (has_nulls) { CUDF_EXPECTS(d_dictionary.nullable(), "unexpected non-nullable column"); } } @@ -80,13 +80,14 @@ struct dictionary_access_pair_fn { __device__ thrust::pair operator()(size_type idx) const { if (has_nulls && d_dictionary.is_null(idx)) return {KeyType{}, false}; - auto keys = d_dictionary.child(1); + auto keys = d_dictionary.child(dictionary_column_view::keys_column_index); return {keys.element(static_cast(d_dictionary.element(idx))), true}; }; private: column_device_view const d_dictionary; + bool has_nulls; }; /** @@ -100,19 +101,20 @@ struct dictionary_access_pair_fn { * @throw cudf::logic_error if `dictionary_column` is not a dictionary column. * * @tparam KeyType The type of the dictionary's key element. - * @tparam has_nulls Set to `true` if the dictionary_column has nulls. * * @param dictionary_column The dictionary device view to iterate. + * @param has_nulls Set to `true` if the `dictionary_column` has nulls. * @return Pair iterator with `{value,valid}` */ -template -auto make_dictionary_pair_iterator(column_device_view const& dictionary_column) +template +auto make_dictionary_pair_iterator(column_device_view const& dictionary_column, + bool has_nulls = true) { CUDF_EXPECTS(is_dictionary(dictionary_column.type()), "Dictionary iterator is only for dictionary columns"); return thrust::make_transform_iterator( thrust::make_counting_iterator(0), - dictionary_access_pair_fn{dictionary_column}); + dictionary_access_pair_fn{dictionary_column, has_nulls}); } } // namespace detail diff --git a/cpp/src/reductions/all.cu b/cpp/src/reductions/all.cu index 79390441e0a..81e63278e4e 100644 --- a/cpp/src/reductions/all.cu +++ b/cpp/src/reductions/all.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,11 +15,65 @@ */ #include +#include #include #include namespace cudf { namespace reduction { +namespace detail { +namespace { + +/** + * @brief Compute reduction all() for dictionary columns. + * + * This compiles 10x faster than using thrust::reduce or the + * cudf::simple::reduction::detail::reduce utility. + * Both of these use the CUB DeviceReduce which aggressively inlines + * the input iterator logic. + */ +struct all_fn { + template + struct all_true_fn { + __device__ void operator()(size_type idx) + { + if (*d_result && (iter[idx] != *d_result)) atomicAnd(d_result, false); + } + Iterator iter; + bool* d_result; + }; + + template ::value>* = nullptr> + std::unique_ptr operator()(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto const d_dict = cudf::column_device_view::create(input, stream); + auto const iter = [&] { + auto null_iter = + cudf::reduction::op::min{}.template get_null_replacing_element_transformer(); + auto pair_iter = + cudf::dictionary::detail::make_dictionary_pair_iterator(*d_dict, input.has_nulls()); + return thrust::make_transform_iterator(pair_iter, null_iter); + }(); + auto result = std::make_unique>(true, true, stream, mr); + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + input.size(), + all_true_fn{iter, result->data()}); + return result; + } + template ::value>* = nullptr> + std::unique_ptr operator()(column_view const&, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) + { + CUDF_FAIL("Unexpected key type for dictionary in reduction all()"); + } +}; + +} // namespace +} // namespace detail std::unique_ptr all(column_view const& col, cudf::data_type const output_dtype, @@ -28,9 +82,13 @@ std::unique_ptr all(column_view const& col, { CUDF_EXPECTS(output_dtype == cudf::data_type(cudf::type_id::BOOL8), "all() operation can be applied with output type `BOOL8` only"); - auto const dispatch_type = - cudf::is_dictionary(col.type()) ? dictionary_column_view(col).keys().type() : col.type(); - return cudf::type_dispatcher(dispatch_type, + + if (cudf::is_dictionary(col.type())) { + return cudf::type_dispatcher( + dictionary_column_view(col).keys().type(), detail::all_fn{}, col, stream, mr); + } + // dispatch for non-dictionary types + return cudf::type_dispatcher(col.type(), simple::bool_result_element_dispatcher{}, col, stream, diff --git a/cpp/src/reductions/any.cu b/cpp/src/reductions/any.cu index 78a1e4635e7..ceeef017768 100644 --- a/cpp/src/reductions/any.cu +++ b/cpp/src/reductions/any.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,11 +15,65 @@ */ #include +#include #include #include namespace cudf { namespace reduction { +namespace detail { +namespace { + +/** + * @brief Compute reduction any() for dictionary columns. + * + * This compiles 10x faster than using thrust::reduce or the + * cudf::simple::reduction::detail::reduce utility. + * Both of these use the CUB DeviceReduce which aggressively inlines + * the input iterator logic. + */ +struct any_fn { + template + struct any_true_fn { + __device__ void operator()(size_type idx) + { + if (!*d_result && (iter[idx] != *d_result)) atomicOr(d_result, true); + } + Iterator iter; + bool* d_result; + }; + + template ::value>* = nullptr> + std::unique_ptr operator()(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto const d_dict = cudf::column_device_view::create(input, stream); + auto const iter = [&] { + auto null_iter = + cudf::reduction::op::max{}.template get_null_replacing_element_transformer(); + auto pair_iter = + cudf::dictionary::detail::make_dictionary_pair_iterator(*d_dict, input.has_nulls()); + return thrust::make_transform_iterator(pair_iter, null_iter); + }(); + auto result = std::make_unique>(false, true, stream, mr); + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + input.size(), + any_true_fn{iter, result->data()}); + return result; + } + template ::value>* = nullptr> + std::unique_ptr operator()(column_view const&, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) + { + CUDF_FAIL("Unexpected key type for dictionary in reduction any()"); + } +}; + +} // namespace +} // namespace detail std::unique_ptr any(column_view const& col, cudf::data_type const output_dtype, @@ -28,9 +82,13 @@ std::unique_ptr any(column_view const& col, { CUDF_EXPECTS(output_dtype == cudf::data_type(cudf::type_id::BOOL8), "any() operation can be applied with output type `bool8` only"); - auto const dispatch_type = - cudf::is_dictionary(col.type()) ? dictionary_column_view(col).keys().type() : col.type(); - return cudf::type_dispatcher(dispatch_type, + + if (cudf::is_dictionary(col.type())) { + return cudf::type_dispatcher( + dictionary_column_view(col).keys().type(), detail::any_fn{}, col, stream, mr); + } + // dispatch for non-dictionary types + return cudf::type_dispatcher(col.type(), simple::bool_result_element_dispatcher{}, col, stream, diff --git a/cpp/src/reductions/compound.cuh b/cpp/src/reductions/compound.cuh index 011af214d45..09d812e5d94 100644 --- a/cpp/src/reductions/compound.cuh +++ b/cpp/src/reductions/compound.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -70,19 +70,11 @@ std::unique_ptr compound_reduction(column_view const& col, it, col.size(), compound_op, valid_count, ddof, stream, mr); } } else { - if (col.has_nulls()) { - auto it = thrust::make_transform_iterator( - cudf::dictionary::detail::make_dictionary_pair_iterator(*dcol), - compound_op.template get_null_replacing_element_transformer()); - result = detail::reduce( - it, col.size(), compound_op, valid_count, ddof, stream, mr); - } else { - auto it = thrust::make_transform_iterator( - cudf::dictionary::detail::make_dictionary_iterator(*dcol), - compound_op.template get_element_transformer()); - result = detail::reduce( - it, col.size(), compound_op, valid_count, ddof, stream, mr); - } + auto it = thrust::make_transform_iterator( + cudf::dictionary::detail::make_dictionary_pair_iterator(*dcol, col.has_nulls()), + compound_op.template get_null_replacing_element_transformer()); + result = detail::reduce( + it, col.size(), compound_op, valid_count, ddof, stream, mr); } // set scalar is valid diff --git a/cpp/src/reductions/simple.cuh b/cpp/src/reductions/simple.cuh index f1c68e30dc9..baaedda7d63 100644 --- a/cpp/src/reductions/simple.cuh +++ b/cpp/src/reductions/simple.cuh @@ -139,17 +139,11 @@ std::unique_ptr dictionary_reduction(column_view const& col, auto simple_op = Op{}; auto result = [&] { - if (col.has_nulls()) { - auto f = simple_op.template get_null_replacing_element_transformer(); - auto p = cudf::dictionary::detail::make_dictionary_pair_iterator(*dcol); - auto it = thrust::make_transform_iterator(p, f); - return detail::reduce(it, col.size(), simple_op, stream, mr); - } else { - auto f = simple_op.template get_element_transformer(); - auto p = cudf::dictionary::detail::make_dictionary_iterator(*dcol); - auto it = thrust::make_transform_iterator(p, f); - return detail::reduce(it, col.size(), simple_op, stream, mr); - } + auto f = simple_op.template get_null_replacing_element_transformer(); + auto p = + cudf::dictionary::detail::make_dictionary_pair_iterator(*dcol, col.has_nulls()); + auto it = thrust::make_transform_iterator(p, f); + return detail::reduce(it, col.size(), simple_op, stream, mr); }(); // set scalar is valid @@ -232,9 +226,7 @@ struct bool_result_element_dispatcher { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return cudf::is_dictionary(col.type()) - ? dictionary_reduction(col, stream, mr) - : simple_reduction(col, stream, mr); + return simple_reduction(col, stream, mr); } template