Skip to content

Commit

Permalink
Centralization of tdigest aggregation code. (#10422)
Browse files Browse the repository at this point in the history
This PR is prep-work for adding support for invoking the tdigest aggregations through `cudf::reduce`.  It doesn't change any code, just moves some things around:

- `groupby/sort/group_tdigest.cu` -> `quantiles/tdigest/tdigest_aggregation.cu`
- Moves exposure of the aggregation functions from `group_reductions.hpp` to `quantiles/tdigest/tdigest.hpp`

Doing this as a separate PR so the actual changes for the reduction work show up as a reasonable diff.

Authors:
  - https://github.com/nvdbaranec

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Conor Hoekstra (https://github.com/codereport)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)

URL: #10422
  • Loading branch information
nvdbaranec authored Mar 14, 2022
1 parent 749295d commit cf936b6
Show file tree
Hide file tree
Showing 5 changed files with 128 additions and 101 deletions.
2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,6 @@ add_library(
src/groupby/sort/group_rank_scan.cu
src/groupby/sort/group_replace_nulls.cu
src/groupby/sort/group_sum_scan.cu
src/groupby/sort/group_tdigest.cu
src/groupby/sort/sort_helper.cu
src/hash/hashing.cu
src/hash/md5_hash.cu
Expand Down Expand Up @@ -357,6 +356,7 @@ add_library(
src/partitioning/partitioning.cu
src/partitioning/round_robin.cu
src/quantiles/tdigest/tdigest.cu
src/quantiles/tdigest/tdigest_aggregation.cu
src/quantiles/tdigest/tdigest_column_view.cpp
src/quantiles/quantile.cu
src/quantiles/quantiles.cu
Expand Down
20 changes: 11 additions & 9 deletions cpp/src/groupby/sort/aggregate.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -35,6 +35,8 @@
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>

#include <quantiles/tdigest/tdigest.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <memory>
Expand Down Expand Up @@ -700,7 +702,7 @@ void aggregate_result_functor::operator()<aggregation::TDIGEST>(aggregation cons

cache.add_result(values,
agg,
detail::group_tdigest(
cudf::detail::tdigest::group_tdigest(
get_sorted_values(),
helper.group_offsets(stream),
helper.group_labels(stream),
Expand Down Expand Up @@ -744,13 +746,13 @@ void aggregate_result_functor::operator()<aggregation::MERGE_TDIGEST>(aggregatio
dynamic_cast<cudf::detail::merge_tdigest_aggregation const&>(agg).max_centroids;
cache.add_result(values,
agg,
detail::group_merge_tdigest(get_grouped_values(),
helper.group_offsets(stream),
helper.group_labels(stream),
helper.num_groups(stream),
max_centroids,
stream,
mr));
cudf::detail::tdigest::group_merge_tdigest(get_grouped_values(),
helper.group_offsets(stream),
helper.group_labels(stream),
helper.num_groups(stream),
max_centroids,
stream,
mr));
};

} // namespace detail
Expand Down
90 changes: 1 addition & 89 deletions cpp/src/groupby/sort/group_reductions.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -483,94 +483,6 @@ std::unique_ptr<column> group_correlation(column_view const& covariance,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
* @brief Generate a tdigest column from a grouped set of numeric input values.
*
* The tdigest column produced is of the following structure:
*
* struct {
*   // centroids for the digest
*   list {
*     struct {
*       double // mean
*       double // weight
*     },
*     ...
*   }
*   // These are from the input stream, not the centroids. They are used
*   // during the percentile_approx computation near the beginning or
*   // end of the quantiles.
*   double // min
*   double // max
* }
*
* Each output row is a single tdigest. The length of the row is the "size" of the
* tdigest, each element of which represents a weighted centroid (mean, weight).
*
* @param values Grouped (and sorted) numeric values from which the tdigests are generated.
* @param group_offsets Offsets of groups' starting points within @p values.
* @param group_labels 0-based ID of group that the corresponding value belongs to.
* @param group_valid_counts Per-group counts of valid elements.
* @param num_groups Number of groups.
* @param max_centroids Parameter controlling the level of compression of the tdigest. Higher
* values result in a larger, more precise tdigest.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory.
*
* @returns tdigest column, with 1 tdigest per row.
*/
std::unique_ptr<column> group_tdigest(column_view const& values,
cudf::device_span<size_type const> group_offsets,
cudf::device_span<size_type const> group_labels,
cudf::device_span<size_type const> group_valid_counts,
size_type num_groups,
int max_centroids,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
* @brief Merges tdigests within the same group to generate a new tdigest.
*
* The tdigest column produced is of the following structure:
*
* struct {
*   // centroids for the digest
*   list {
*     struct {
*       double // mean
*       double // weight
*     },
*     ...
*   }
*   // These are from the input stream, not the centroids. They are used
*   // during the percentile_approx computation near the beginning or
*   // end of the quantiles.
*   double // min
*   double // max
* }
*
* Each output row is a single tdigest. The length of the row is the "size" of the
* tdigest, each element of which represents a weighted centroid (mean, weight).
*
* @param values Grouped tdigests to merge.
* @param group_offsets Offsets of groups' starting points within @p values.
* @param group_labels 0-based ID of group that the corresponding value belongs to.
* @param num_groups Number of groups.
* @param max_centroids Parameter controlling the level of compression of the tdigest. Higher
* values result in a larger, more precise tdigest.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory.
*
* @returns tdigest column, with 1 tdigest per row.
*/
std::unique_ptr<column> group_merge_tdigest(column_view const& values,
cudf::device_span<size_type const> group_offsets,
cudf::device_span<size_type const> group_labels,
size_type num_groups,
int max_centroids,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

} // namespace detail
} // namespace groupby
} // namespace cudf
113 changes: 113 additions & 0 deletions cpp/src/quantiles/tdigest/tdigest.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

namespace cudf {
namespace detail {
namespace tdigest {

/**
* @brief Generate a tdigest column from a grouped set of numeric input values.
*
* The tdigest column produced is of the following structure:
*
* struct {
* // centroids for the digest
* list {
* struct {
* double // mean
* double // weight
* },
* ...
* }
* // these are from the input stream, not the centroids. they are used
* // during the percentile_approx computation near the beginning or
* // end of the quantiles
* double // min
* double // max
* }
*
* Each output row is a single tdigest. The length of the row is the "size" of the
* tdigest, each element of which represents a weighted centroid (mean, weight).
*
* @param values Grouped (and sorted) values to merge.
* @param group_offsets Offsets of groups' starting points within @p values.
* @param group_labels 0-based ID of group that the corresponding value belongs to
* @param group_valid_counts Per-group counts of valid elements.
* @param num_groups Number of groups.
* @param max_centroids Parameter controlling the level of compression of the tdigest. Higher
* values result in a larger, more precise tdigest.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory
*
* @returns tdigest column, with 1 tdigest per row
*/
std::unique_ptr<column> group_tdigest(column_view const& values,
cudf::device_span<size_type const> group_offsets,
cudf::device_span<size_type const> group_labels,
cudf::device_span<size_type const> group_valid_counts,
size_type num_groups,
int max_centroids,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
* @brief Merges tdigests within the same group to generate a new tdigest.
*
* The tdigest column produced is of the following structure:
*
* struct {
* // centroids for the digest
* list {
* struct {
* double // mean
* double // weight
* },
* ...
* }
* // these are from the input stream, not the centroids. they are used
* // during the percentile_approx computation near the beginning or
* // end of the quantiles
* double // min
* double // max
* }
*
* Each output row is a single tdigest. The length of the row is the "size" of the
* tdigest, each element of which represents a weighted centroid (mean, weight).
*
* @param values Grouped tdigests to merge.
* @param group_offsets Offsets of groups' starting points within @p values.
* @param group_labels 0-based ID of group that the corresponding value belongs to
* @param num_groups Number of groups.
* @param max_centroids Parameter controlling the level of compression of the tdigest. Higher
* values result in a larger, more precise tdigest.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory
*
* @returns tdigest column, with 1 tdigest per row
*/
std::unique_ptr<column> group_merge_tdigest(column_view const& values,
cudf::device_span<size_type const> group_offsets,
cudf::device_span<size_type const> group_labels,
size_type num_groups,
int max_centroids,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

} // namespace tdigest
} // namespace detail
} // namespace cudf
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@
#include <thrust/iterator/discard_iterator.h>

namespace cudf {
namespace groupby {
namespace detail {
namespace tdigest {

using namespace cudf::tdigest;

Expand Down Expand Up @@ -1008,6 +1008,6 @@ std::unique_ptr<column> group_merge_tdigest(column_view const& input,
mr);
}

} // namespace tdigest
} // namespace detail
} // namespace groupby
} // namespace cudf

0 comments on commit cf936b6

Please sign in to comment.