From 122da2002418983f9eca5e241c78d4247d33ed24 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 4 Oct 2021 23:54:45 -0400 Subject: [PATCH] Move rank scan implementations from scan_inclusive.cu to rank_scan.cu (#9351) This change was mainly to improve the compile time for `reductions/scan/scan_inclusive.cu` by refactoring out the rank-scan functions into a separate file `rank.cu`. Although the overall compile time improvement for `scan_inclusive.cu` is only 25%, the source code is better organized with this change. The code function has changed. The detail `inclusive_rank_scan` and `inclusive_dense_rank_scan` declarations were moved from `src/reductions/scan/scan.cuh` to `include/cudf/detail/scan.hpp` and dispatching of the RANK and DENSE_RANK aggregation is done in `scan.cpp` instead of handled by `scan_inclusive.cu` and also `scan_exclusive.cu` (which just throws an exception anyway). Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Jake Hemstad (https://github.com/jrhemstad) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/9351 --- cpp/CMakeLists.txt | 1 + cpp/include/cudf/detail/scan.hpp | 24 ++++ cpp/src/reductions/scan/rank_scan.cu | 130 ++++++++++++++++++++++ cpp/src/reductions/scan/scan.cpp | 12 ++ cpp/src/reductions/scan/scan.cuh | 18 +-- cpp/src/reductions/scan/scan_exclusive.cu | 2 - cpp/src/reductions/scan/scan_inclusive.cu | 102 +---------------- cpp/tests/reductions/scan_tests.cpp | 2 +- 8 files changed, 172 insertions(+), 119 deletions(-) create mode 100644 cpp/src/reductions/scan/rank_scan.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 82bc5bfba93..3b7bc8a223c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -331,6 +331,7 @@ add_library(cudf src/reductions/nth_element.cu src/reductions/product.cu src/reductions/reductions.cpp + src/reductions/scan/rank_scan.cu src/reductions/scan/scan.cpp src/reductions/scan/scan_exclusive.cu src/reductions/scan/scan_inclusive.cu diff --git a/cpp/include/cudf/detail/scan.hpp b/cpp/include/cudf/detail/scan.hpp index 5691adecb5e..113c15f19a1 100644 --- a/cpp/include/cudf/detail/scan.hpp +++ b/cpp/include/cudf/detail/scan.hpp @@ -75,5 +75,29 @@ std::unique_ptr scan_inclusive(column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Generate row ranks for a column + * + * @param order_by Input column to generate ranks for + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return rank values + */ +std::unique_ptr inclusive_rank_scan(column_view const& order_by, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Generate row dense ranks for a column + * + * @param order_by Input column to generate ranks for + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return rank values + */ +std::unique_ptr inclusive_dense_rank_scan(column_view const& order_by, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace detail } // namespace cudf diff --git a/cpp/src/reductions/scan/rank_scan.cu b/cpp/src/reductions/scan/rank_scan.cu new file mode 100644 index 00000000000..566b9aadea8 --- /dev/null +++ b/cpp/src/reductions/scan/rank_scan.cu @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf { +namespace detail { +namespace { + +/** + * @brief generate row ranks or dense ranks using a row comparison then scan the results + * + * @tparam has_nulls if the order_by column has nulls + * @tparam value_resolver flag value resolver with boolean first and row number arguments + * @tparam scan_operator scan function ran on the flag values + * @param order_by input column to generate ranks for + * @param resolver flag value resolver + * @param scan_op scan operation ran on the flag results + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return std::unique_ptr rank values + */ +template +std::unique_ptr rank_generator(column_view const& order_by, + value_resolver resolver, + scan_operator scan_op, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const superimposed = structs::detail::superimpose_parent_nulls(order_by, stream, mr); + table_view const order_table{{std::get<0>(superimposed)}}; + auto const flattener = cudf::structs::detail::flatten_nested_columns( + order_table, {}, {}, structs::detail::column_nullability::MATCH_INCOMING); + auto const d_flat_order = table_device_view::create(std::get<0>(flattener), stream); + row_equality_comparator comparator(*d_flat_order, *d_flat_order, true); + auto ranks = make_fixed_width_column(data_type{type_to_id()}, + order_table.num_rows(), + mask_state::UNALLOCATED, + stream, + mr); + auto mutable_ranks = ranks->mutable_view(); + + thrust::tabulate(rmm::exec_policy(stream), + mutable_ranks.begin(), + mutable_ranks.end(), + [comparator, resolver] __device__(size_type row_index) { + return resolver(row_index == 0 || !comparator(row_index, row_index - 1), + row_index); + }); + + thrust::inclusive_scan(rmm::exec_policy(stream), + mutable_ranks.begin(), + mutable_ranks.end(), + mutable_ranks.begin(), + scan_op); + return ranks; +} + +} // namespace + +std::unique_ptr inclusive_dense_rank_scan(column_view const& order_by, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(!cudf::structs::detail::is_or_has_nested_lists(order_by), + "Unsupported list type in dense_rank scan."); + if (has_nested_nulls(table_view{{order_by}})) { + return rank_generator( + order_by, + [] __device__(bool equality, auto row_index) { return equality; }, + DeviceSum{}, + stream, + mr); + } + return rank_generator( + order_by, + [] __device__(bool equality, auto row_index) { return equality; }, + DeviceSum{}, + stream, + mr); +} + +std::unique_ptr inclusive_rank_scan(column_view const& order_by, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(!cudf::structs::detail::is_or_has_nested_lists(order_by), + "Unsupported list type in rank scan."); + if (has_nested_nulls(table_view{{order_by}})) { + return rank_generator( + order_by, + [] __device__(bool equality, auto row_index) { return equality ? row_index + 1 : 0; }, + DeviceMax{}, + stream, + mr); + } + return rank_generator( + order_by, + [] __device__(bool equality, auto row_index) { return equality ? row_index + 1 : 0; }, + DeviceMax{}, + stream, + mr); +} + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/reductions/scan/scan.cpp b/cpp/src/reductions/scan/scan.cpp index f40a3fd5c75..5f10707d7fb 100644 --- a/cpp/src/reductions/scan/scan.cpp +++ b/cpp/src/reductions/scan/scan.cpp @@ -31,6 +31,18 @@ std::unique_ptr scan(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); + + if (agg->kind == aggregation::RANK) { + CUDF_EXPECTS(inclusive == scan_type::INCLUSIVE, + "Unsupported rank aggregation operator for exclusive scan"); + return inclusive_rank_scan(input, rmm::cuda_stream_default, mr); + } + if (agg->kind == aggregation::DENSE_RANK) { + CUDF_EXPECTS(inclusive == scan_type::INCLUSIVE, + "Unsupported dense rank aggregation operator for exclusive scan"); + return inclusive_dense_rank_scan(input, rmm::cuda_stream_default, mr); + } + return inclusive == scan_type::EXCLUSIVE ? detail::scan_exclusive(input, agg, null_handling, rmm::cuda_stream_default, mr) : detail::scan_inclusive(input, agg, null_handling, rmm::cuda_stream_default, mr); diff --git a/cpp/src/reductions/scan/scan.cuh b/cpp/src/reductions/scan/scan.cuh index 3853e34e97b..8352d16b2d0 100644 --- a/cpp/src/reductions/scan/scan.cuh +++ b/cpp/src/reductions/scan/scan.cuh @@ -33,14 +33,6 @@ rmm::device_buffer mask_scan(column_view const& input_view, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); -std::unique_ptr inclusive_rank_scan(column_view const& order_by, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - -std::unique_ptr inclusive_dense_rank_scan(column_view const& order_by, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - template