From 4d4632af1033b910d715be09dc03af68a8d3931d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 8 Jul 2022 06:44:58 -0700 Subject: [PATCH] Improve performance for `cudf::contains` when searching for a scalar (#11202) The current implementation of `cudf::contains(column_view, scalar)` uses `thrust::find` and `thrust::any_of` (which also calls `thrust::find_if` under the hood). These thrust APIs are known to suffer from a performance regression (https://github.com/NVIDIA/thrust/issues/1016). This PR replaces `thrust::find` and `thrust::any_of` in `cudf::contains` with `thrust::count_if`, which improves performance significantly. Benchmarks show that the run time can be reduced by as much as 80% after this change, i.e., up to a 5X speedup. Closes https://github.com/rapidsai/cudf/issues/3806. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/11202 --- cpp/benchmarks/CMakeLists.txt | 1 + cpp/benchmarks/search/contains.cpp | 70 ++++++++++++++++++++++++++++++ cpp/src/search/contains.cu | 37 +++++++++++++--- cpp/src/search/contains_nested.cu | 22 ++++++---- 4 files changed, 114 insertions(+), 16 deletions(-) create mode 100644 cpp/benchmarks/search/contains.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 4d0a16c5372..a635e409a95 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -160,6 +160,7 @@ ConfigureBench(ITERATOR_BENCH iterator/iterator.cu) # ################################################################################################## # * search benchmark ------------------------------------------------------------------------------ ConfigureBench(SEARCH_BENCH search/search.cpp) +ConfigureNVBench(SEARCH_NVBENCH search/contains.cpp) # ################################################################################################## # * sort benchmark 
-------------------------------------------------------------------------------- diff --git a/cpp/benchmarks/search/contains.cpp b/cpp/benchmarks/search/contains.cpp new file mode 100644 index 00000000000..ac986e8c5fc --- /dev/null +++ b/cpp/benchmarks/search/contains.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +#include + +namespace { +template +std::unique_ptr create_table_data(cudf::size_type n_rows, + cudf::size_type n_cols, + bool has_nulls = false) +{ + data_profile profile; + profile.set_cardinality(0); + profile.set_null_frequency(has_nulls ? 
std::optional{0.1} : std::nullopt); + profile.set_distribution_params( + cudf::type_to_id(), distribution_id::UNIFORM, Type{0}, Type{1000}); + + return create_random_table( + cycle_dtypes({cudf::type_to_id()}, n_cols), row_count{n_rows}, profile); +} + +template +std::unique_ptr create_column_data(cudf::size_type n_rows, bool has_nulls = false) +{ + return std::move(create_table_data(n_rows, 1, has_nulls)->release().front()); +} + +} // namespace + +static void nvbench_contains_scalar(nvbench::state& state) +{ + cudf::rmm_pool_raii pool_raii; + using Type = int; + + auto const has_nulls = static_cast(state.get_int64("has_nulls")); + auto const size = state.get_int64("data_size"); + + auto const haystack = create_column_data(size, has_nulls); + auto const needle = cudf::make_fixed_width_scalar(size / 2); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto const stream_view = rmm::cuda_stream_view{launch.get_stream()}; + [[maybe_unused]] auto const result = cudf::detail::contains(*haystack, *needle, stream_view); + }); +} + +NVBENCH_BENCH(nvbench_contains_scalar) + .set_name("contains_scalar") + .add_int64_power_of_two_axis("data_size", {10, 12, 14, 16, 18, 20, 22, 24, 26}) + .add_int64_axis("has_nulls", {0, 1}); diff --git a/cpp/src/search/contains.cu b/cpp/src/search/contains.cu index de75c4a0567..5d068e72584 100644 --- a/cpp/src/search/contains.cu +++ b/cpp/src/search/contains.cu @@ -23,13 +23,14 @@ #include #include #include +#include #include #include #include #include -#include +#include #include #include #include @@ -40,6 +41,23 @@ namespace detail { namespace { +/** + * @brief Get the underlying value of a scalar through a scalar device view. 
+ * + * @tparam Type The scalar's value type + * @tparam ScalarDView Type of the input scalar device view + * @param d_scalar The input scalar device view + */ +template +__device__ auto inline get_scalar_value(ScalarDView d_scalar) +{ + if constexpr (cudf::is_fixed_point()) { + return d_scalar.rep(); + } else { + return d_scalar.value(); + } +} + struct contains_scalar_dispatch { template bool operator()(column_view const& haystack, @@ -49,22 +67,27 @@ struct contains_scalar_dispatch { CUDF_EXPECTS(haystack.type() == needle.type(), "scalar and column types must match"); using DType = device_storage_type_t; - using ScalarType = cudf::scalar_type_t; auto const d_haystack = column_device_view::create(haystack, stream); - auto const s = static_cast(&needle); + auto const d_needle = + get_scalar_device_view(static_cast&>(const_cast(needle))); if (haystack.has_nulls()) { auto const begin = d_haystack->pair_begin(); auto const end = d_haystack->pair_end(); - auto const val = thrust::make_pair(s->value(stream), true); - return thrust::find(rmm::exec_policy(stream), begin, end, val) != end; + return thrust::count_if( + rmm::exec_policy(stream), begin, end, [d_needle] __device__(auto const val_pair) { + auto const needle_pair = thrust::make_pair(get_scalar_value(d_needle), true); + return val_pair == needle_pair; + }) > 0; } else { auto const begin = d_haystack->begin(); auto const end = d_haystack->end(); - auto const val = s->value(stream); - return thrust::find(rmm::exec_policy(stream), begin, end, val) != end; + return thrust::count_if( + rmm::exec_policy(stream), begin, end, [d_needle] __device__(auto const val) { + return val == get_scalar_value(d_needle); + }) > 0; } } }; diff --git a/cpp/src/search/contains_nested.cu b/cpp/src/search/contains_nested.cu index f4332efb23f..6767b27a918 100644 --- a/cpp/src/search/contains_nested.cu +++ b/cpp/src/search/contains_nested.cu @@ -21,7 +21,7 @@ #include #include -#include +#include namespace cudf::detail { @@ -47,16 +47,20 
@@ bool contains_nested_element(column_view const& haystack, auto const haystack_cdv_ptr = column_device_view::create(haystack, stream); auto const haystack_valid_it = cudf::detail::make_validity_iterator(*haystack_cdv_ptr); - return thrust::any_of( - rmm::exec_policy(stream), begin, end, [d_comp, haystack_valid_it] __device__(auto const idx) { - if (!haystack_valid_it[static_cast(idx)]) { return false; } - return d_comp(idx, rhs_index_type{0}); // compare haystack[idx] == needle[0]. - }); + return thrust::count_if(rmm::exec_policy(stream), + begin, + end, + [d_comp, haystack_valid_it] __device__(auto const idx) { + if (!haystack_valid_it[static_cast(idx)]) { return false; } + return d_comp( + idx, rhs_index_type{0}); // compare haystack[idx] == needle[0]. + }) > 0; } - return thrust::any_of(rmm::exec_policy(stream), begin, end, [d_comp] __device__(auto const idx) { - return d_comp(idx, rhs_index_type{0}); // compare haystack[idx] == needle[0]. - }); + return thrust::count_if( + rmm::exec_policy(stream), begin, end, [d_comp] __device__(auto const idx) { + return d_comp(idx, rhs_index_type{0}); // compare haystack[idx] == needle[0]. + }) > 0; } } // namespace cudf::detail