diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f743bc022f8..82689deb5e7 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -265,7 +265,6 @@ add_library( src/groupby/sort/group_rank_scan.cu src/groupby/sort/group_replace_nulls.cu src/groupby/sort/group_sum_scan.cu - src/groupby/sort/group_tdigest.cu src/groupby/sort/sort_helper.cu src/hash/hashing.cu src/hash/md5_hash.cu @@ -357,6 +356,7 @@ add_library( src/partitioning/partitioning.cu src/partitioning/round_robin.cu src/quantiles/tdigest/tdigest.cu + src/quantiles/tdigest/tdigest_aggregation.cu src/quantiles/tdigest/tdigest_column_view.cpp src/quantiles/quantile.cu src/quantiles/quantiles.cu diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index b3624282c24..4a8aaf025c3 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -35,6 +35,8 @@ #include #include +#include + #include #include @@ -700,7 +702,7 @@ void aggregate_result_functor::operator()(aggregation cons cache.add_result(values, agg, - detail::group_tdigest( + cudf::detail::tdigest::group_tdigest( get_sorted_values(), helper.group_offsets(stream), helper.group_labels(stream), @@ -744,13 +746,13 @@ void aggregate_result_functor::operator()(aggregatio dynamic_cast(agg).max_centroids; cache.add_result(values, agg, - detail::group_merge_tdigest(get_grouped_values(), - helper.group_offsets(stream), - helper.group_labels(stream), - helper.num_groups(stream), - max_centroids, - stream, - mr)); + cudf::detail::tdigest::group_merge_tdigest(get_grouped_values(), + helper.group_offsets(stream), + helper.group_labels(stream), + helper.num_groups(stream), + max_centroids, + stream, + mr)); }; } // namespace detail diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index 75708c7b01c..fc24b679db5 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -483,94 +483,6 @@ std::unique_ptr group_correlation(column_view const& covariance, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); -/** - * @brief Generate a tdigest column from a grouped set of numeric input values. - * - * The tdigest column produced is of the following structure: - * - * struct { - * // centroids for the digest - * list { - * struct { - * double // mean - * double // weight - * }, - * ... - * } - * // these are from the input stream, not the centroids. 
they are used - * // during the percentile_approx computation near the beginning or - * // end of the quantiles - * double // min - * double // max - * } - * - * Each output row is a single tdigest. The length of the row is the "size" of the - * tdigest, each element of which represents a weighted centroid (mean, weight). - * - * @param values Grouped (and sorted) values to merge. - * @param group_offsets Offsets of groups' starting points within @p values. - * @param group_labels 0-based ID of group that the corresponding value belongs to - * @param group_valid_counts Per-group counts of valid elements. - * @param num_groups Number of groups. - * @param max_centroids Parameter controlling the level of compression of the tdigest. Higher - * values result in a larger, more precise tdigest. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory - * - * @returns tdigest column, with 1 tdigest per row - */ -std::unique_ptr group_tdigest(column_view const& values, - cudf::device_span group_offsets, - cudf::device_span group_labels, - cudf::device_span group_valid_counts, - size_type num_groups, - int max_centroids, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - -/** - * @brief Merges tdigests within the same group to generate a new tdigest. - * - * The tdigest column produced is of the following structure: - * - * struct { - * // centroids for the digest - * list { - * struct { - * double // mean - * double // weight - * }, - * ... - * } - * // these are from the input stream, not the centroids. they are used - * // during the percentile_approx computation near the beginning or - * // end of the quantiles - * double // min - * double // max - * } - * - * Each output row is a single tdigest. The length of the row is the "size" of the - * tdigest, each element of which represents a weighted centroid (mean, weight). 
- * - * @param values Grouped tdigests to merge. - * @param group_offsets Offsets of groups' starting points within @p values. - * @param group_labels 0-based ID of group that the corresponding value belongs to - * @param num_groups Number of groups. - * @param max_centroids Parameter controlling the level of compression of the tdigest. Higher - * values result in a larger, more precise tdigest. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory - * - * @returns tdigest column, with 1 tdigest per row - */ -std::unique_ptr group_merge_tdigest(column_view const& values, - cudf::device_span group_offsets, - cudf::device_span group_labels, - size_type num_groups, - int max_centroids, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - } // namespace detail } // namespace groupby } // namespace cudf diff --git a/cpp/src/quantiles/tdigest/tdigest.hpp b/cpp/src/quantiles/tdigest/tdigest.hpp new file mode 100644 index 00000000000..52b19821b90 --- /dev/null +++ b/cpp/src/quantiles/tdigest/tdigest.hpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace cudf { +namespace detail { +namespace tdigest { + +/** + * @brief Generate a tdigest column from a grouped set of numeric input values. 
+ * + * The tdigest column produced is of the following structure: + * + * struct { + * // centroids for the digest + * list { + * struct { + * double // mean + * double // weight + * }, + * ... + * } + * // these are from the input stream, not the centroids. they are used + * // during the percentile_approx computation near the beginning or + * // end of the quantiles + * double // min + * double // max + * } + * + * Each output row is a single tdigest. The length of the row is the "size" of the + * tdigest, each element of which represents a weighted centroid (mean, weight). + * + * @param values Grouped (and sorted) numeric input values to generate the tdigests from. + * @param group_offsets Offsets of groups' starting points within @p values. + * @param group_labels 0-based ID of group that the corresponding value belongs to + * @param group_valid_counts Per-group counts of valid elements. + * @param num_groups Number of groups. + * @param max_centroids Parameter controlling the level of compression of the tdigest. Higher + * values result in a larger, more precise tdigest. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns tdigest column, with 1 tdigest per row + */ +std::unique_ptr group_tdigest(column_view const& values, + cudf::device_span group_offsets, + cudf::device_span group_labels, + cudf::device_span group_valid_counts, + size_type num_groups, + int max_centroids, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Merges tdigests within the same group to generate a new tdigest. + * + * The tdigest column produced is of the following structure: + * + * struct { + * // centroids for the digest + * list { + * struct { + * double // mean + * double // weight + * }, + * ... + * } + * // these are from the input stream, not the centroids. 
they are used + * // during the percentile_approx computation near the beginning or + * // end of the quantiles + * double // min + * double // max + * } + * + * Each output row is a single tdigest. The length of the row is the "size" of the + * tdigest, each element of which represents a weighted centroid (mean, weight). + * + * @param values Grouped tdigests to merge. + * @param group_offsets Offsets of groups' starting points within @p values. + * @param group_labels 0-based ID of group that the corresponding value belongs to + * @param num_groups Number of groups. + * @param max_centroids Parameter controlling the level of compression of the tdigest. Higher + * values result in a larger, more precise tdigest. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns tdigest column, with 1 tdigest per row + */ +std::unique_ptr group_merge_tdigest(column_view const& values, + cudf::device_span group_offsets, + cudf::device_span group_labels, + size_type num_groups, + int max_centroids, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +} // namespace tdigest +} // namespace detail +} // namespace cudf diff --git a/cpp/src/groupby/sort/group_tdigest.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu similarity index 99% rename from cpp/src/groupby/sort/group_tdigest.cu rename to cpp/src/quantiles/tdigest/tdigest_aggregation.cu index f726de9bf3c..0198bd11107 100644 --- a/cpp/src/groupby/sort/group_tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -37,8 +37,8 @@ #include namespace cudf { -namespace groupby { namespace detail { +namespace tdigest { using namespace cudf::tdigest; @@ -1008,6 +1008,6 @@ std::unique_ptr group_merge_tdigest(column_view const& input, mr); } +} // namespace tdigest } // namespace detail -} // namespace groupby } // namespace cudf