Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for tdigest and merge_tdigest aggregations through cudf::reduce #10433

Merged
merged 25 commits into from
Mar 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
245e68c
Add scan_aggregation and reduce_aggregations. C++ side only.
nvdbaranec Feb 22, 2022
c884d5c
Java bindings.
nvdbaranec Feb 23, 2022
321c9b2
Merge branch 'branch-22.04' into scan_reduce_aggregations
nvdbaranec Feb 24, 2022
900d55c
Python bindings.
nvdbaranec Feb 25, 2022
0398a0d
Copyright updates.
nvdbaranec Feb 25, 2022
a3a71b8
PR review comments.
nvdbaranec Mar 7, 2022
56a6c0f
Formatting
nvdbaranec Mar 7, 2022
8917445
Centralize tdigest aggregation code to quantiles/tdigest.
nvdbaranec Mar 7, 2022
e693562
Clean up some test code.
nvdbaranec Mar 9, 2022
f49e2c9
Merge branch 'scan_reduce_aggregations' into tdigest_code_move
nvdbaranec Mar 9, 2022
23cae44
Small test tweak.
nvdbaranec Mar 9, 2022
7fdc9f5
Merge branch 'scan_reduce_aggregations' into tdigest_code_move
nvdbaranec Mar 9, 2022
3088ec8
tdigest reduce_aggregation functionality and tests.
nvdbaranec Mar 10, 2022
6f940fd
Merge branch 'branch-22.04' into scan_reduce_aggregations
nvdbaranec Mar 11, 2022
13c776a
Merge branch 'scan_reduce_aggregations' into tdigest_code_move
nvdbaranec Mar 11, 2022
3140f5f
Merge branch 'tdigest_code_move' into tdigest_reduction
nvdbaranec Mar 11, 2022
27a854e
Merge branch 'branch-22.04' into tdigest_code_move
nvdbaranec Mar 11, 2022
6827e8f
Copyright update.
nvdbaranec Mar 11, 2022
25c1849
cmake format fixes.
nvdbaranec Mar 14, 2022
b86b3db
Merge branch 'tdigest_code_move' into tdigest_reduction
nvdbaranec Mar 14, 2022
83f4d31
Merge branch 'branch-22.04' into tdigest_reduction
nvdbaranec Mar 14, 2022
6a2d50e
Merge tdigest aggregation for cudf::reduce
nvdbaranec Mar 14, 2022
0fdd74e
Formatting fixes.
nvdbaranec Mar 14, 2022
98e76ef
Simplified the conversion in to_tdigest_scalar.
nvdbaranec Mar 15, 2022
eb6a0c4
Add enforcement that the output_dtype parameter passed to reduce for …
nvdbaranec Mar 16, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -624,8 +624,12 @@ add_library(cudf::cudf ALIAS cudf)

add_library(
cudftestutil STATIC
tests/utilities/base_fixture.cpp tests/utilities/column_utilities.cu
tests/utilities/table_utilities.cu tests/io/metadata_utilities.cpp tests/strings/utilities.cpp
tests/io/metadata_utilities.cpp
tests/quantiles/tdigest_utilities.cu
tests/utilities/base_fixture.cpp
tests/utilities/column_utilities.cu
tests/utilities/table_utilities.cu
tests/strings/utilities.cpp
)

set_target_properties(
Expand Down
4 changes: 2 additions & 2 deletions cpp/include/cudf/detail/aggregation/aggregation.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1048,7 +1048,7 @@ class correlation_aggregation final : public groupby_aggregation {
/**
* @brief Derived aggregation class for specifying TDIGEST aggregation
*/
class tdigest_aggregation final : public groupby_aggregation {
class tdigest_aggregation final : public groupby_aggregation, public reduce_aggregation {
public:
explicit tdigest_aggregation(int max_centroids_)
: aggregation{TDIGEST}, max_centroids{max_centroids_}
Expand All @@ -1072,7 +1072,7 @@ class tdigest_aggregation final : public groupby_aggregation {
/**
* @brief Derived aggregation class for specifying MERGE_TDIGEST aggregation
*/
class merge_tdigest_aggregation final : public groupby_aggregation {
class merge_tdigest_aggregation final : public groupby_aggregation, public reduce_aggregation {
public:
explicit merge_tdigest_aggregation(int max_centroids_)
: aggregation{MERGE_TDIGEST}, max_centroids{max_centroids_}
Expand Down
269 changes: 266 additions & 3 deletions cpp/include/cudf/detail/tdigest/tdigest.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,7 +14,10 @@
* limitations under the License.
*/

#pragma once

#include <cudf/types.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>

Expand All @@ -24,7 +27,95 @@ namespace detail {
namespace tdigest {

/**
* @brief Create a tdigest column from it's constituent components.
* @brief Generate a tdigest column from a grouped set of numeric input values.
*
* The tdigest column produced is of the following structure:
*
* struct {
* // centroids for the digest
* list {
* struct {
* double // mean
* double // weight
* },
* ...
* }
* // these are from the input stream, not the centroids. they are used
* // during the percentile_approx computation near the beginning or
* // end of the quantiles
* double // min
* double // max
* }
*
* Each output row is a single tdigest. The length of the row is the "size" of the
* tdigest, each element of which represents a weighted centroid (mean, weight).
*
* @param values Grouped (and sorted) values to merge.
* @param group_offsets Offsets of groups' starting points within @p values.
* @param group_labels 0-based ID of group that the corresponding value belongs to
* @param group_valid_counts Per-group counts of valid elements.
* @param num_groups Number of groups.
* @param max_centroids Parameter controlling the level of compression of the tdigest. Higher
* values result in a larger, more precise tdigest.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory
*
* @returns tdigest column, with 1 tdigest per row
*/
std::unique_ptr<column> group_tdigest(column_view const& values,
cudf::device_span<size_type const> group_offsets,
cudf::device_span<size_type const> group_labels,
cudf::device_span<size_type const> group_valid_counts,
size_type num_groups,
int max_centroids,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
* @brief Merges tdigests within the same group to generate a new tdigest.
*
* The tdigest column produced is of the following structure:
*
* struct {
* // centroids for the digest
* list {
* struct {
* double // mean
* double // weight
* },
* ...
* }
* // these are from the input stream, not the centroids. they are used
* // during the percentile_approx computation near the beginning or
* // end of the quantiles
* double // min
* double // max
* }
*
* Each output row is a single tdigest. The length of the row is the "size" of the
* tdigest, each element of which represents a weighted centroid (mean, weight).
*
* @param values Grouped tdigests to merge.
* @param group_offsets Offsets of groups' starting points within @p values.
* @param group_labels 0-based ID of group that the corresponding value belongs to
* @param num_groups Number of groups.
* @param max_centroids Parameter controlling the level of compression of the tdigest. Higher
* values result in a larger, more precise tdigest.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory
*
* @returns tdigest column, with 1 tdigest per row
*/
std::unique_ptr<column> group_merge_tdigest(column_view const& values,
cudf::device_span<size_type const> group_offsets,
cudf::device_span<size_type const> group_labels,
size_type num_groups,
int max_centroids,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
* @brief Create a tdigest column from its constituent components.
*
* @param num_rows The number of rows in the output column.
* @param centroid_means The inner means column. These values are partitioned into lists by the
Expand Down Expand Up @@ -64,6 +155,178 @@ std::unique_ptr<column> make_empty_tdigest_column(
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create an empty tdigest scalar.
*
* An empty tdigest scalar is a struct_scalar that contains a single row of length 0.
*
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned scalar's device memory.
*
* @returns An empty tdigest scalar.
*/
std::unique_ptr<scalar> make_empty_tdigest_scalar(
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Generate a tdigest column from a grouped, sorted set of numeric input values.
*
* The input is expected to be sorted in ascending order within each group, with
* nulls at the end.
*
* The tdigest column produced is of the following structure:
*
* struct {
* // centroids for the digest
* list {
* struct {
* double // mean
* double // weight
* },
* ...
* }
* // these are from the input stream, not the centroids. they are used
* // during the percentile_approx computation near the beginning or
* // end of the quantiles
* double // min
* double // max
* }
*
* Each output row is a single tdigest. The length of the row is the "size" of the
* tdigest, each element of which represents a weighted centroid (mean, weight).
*
* @param values Grouped (and sorted) values to merge.
* @param group_offsets Offsets of groups' starting points within @p values.
* @param group_labels 0-based ID of group that the corresponding value belongs to
* @param group_valid_counts Per-group counts of valid elements.
* @param num_groups Number of groups.
* @param max_centroids Parameter controlling the level of compression of the tdigest. Higher
* values result in a larger, more precise tdigest.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory
*
* @returns tdigest column, with 1 tdigest per row
*/
std::unique_ptr<column> group_tdigest(column_view const& values,
cudf::device_span<size_type const> group_offsets,
cudf::device_span<size_type const> group_labels,
cudf::device_span<size_type const> group_valid_counts,
size_type num_groups,
int max_centroids,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
* @brief Merges tdigests within the same group to generate a new tdigest.
*
* The tdigest column produced is of the following structure:
*
* struct {
* // centroids for the digest
* list {
* struct {
* double // mean
* double // weight
* },
* ...
* }
* // these are from the input stream, not the centroids. they are used
* // during the percentile_approx computation near the beginning or
* // end of the quantiles
* double // min
* double // max
* }
*
* Each output row is a single tdigest. The length of the row is the "size" of the
* tdigest, each element of which represents a weighted centroid (mean, weight).
*
* @param values Grouped tdigests to merge.
* @param group_offsets Offsets of groups' starting points within @p values.
* @param group_labels 0-based ID of group that the corresponding value belongs to
* @param num_groups Number of groups.
* @param max_centroids Parameter controlling the level of compression of the tdigest. Higher
* values result in a larger, more precise tdigest.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory
*
* @returns tdigest column, with 1 tdigest per row
*/
std::unique_ptr<column> group_merge_tdigest(column_view const& values,
cudf::device_span<size_type const> group_offsets,
cudf::device_span<size_type const> group_labels,
size_type num_groups,
int max_centroids,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
* @brief Generate a tdigest scalar from a set of numeric input values.
*
* The tdigest scalar produced is of the following structure:
*
* struct {
* // centroids for the digest
* list {
* struct {
* double // mean
* double // weight
* },
* ...
* }
* // these are from the input stream, not the centroids. they are used
* // during the percentile_approx computation near the beginning or
* // end of the quantiles
* double // min
* double // max
* }
*
* @param values Values to merge.
* @param max_centroids Parameter controlling the level of compression of the tdigest. Higher
* values result in a larger, more precise tdigest.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned scalar's device memory
*
* @returns tdigest scalar
*/
std::unique_ptr<scalar> reduce_tdigest(column_view const& values,
int max_centroids,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
* @brief Merges the tdigests contained in the input column to generate a new tdigest scalar.
*
* The tdigest scalar produced is of the following structure:
*
* struct {
* // centroids for the digest
* list {
* struct {
* double // mean
* double // weight
* },
* ...
* }
* // these are from the input stream, not the centroids. they are used
* // during the percentile_approx computation near the beginning or
* // end of the quantiles
* double // min
* double // max
* }
*
* @param input tdigests to merge.
* @param max_centroids Parameter controlling the level of compression of the tdigest. Higher
* values result in a larger, more precise tdigest.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned scalar's device memory
*
* @returns tdigest scalar
*/
std::unique_ptr<scalar> reduce_merge_tdigest(column_view const& input,
int max_centroids,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

} // namespace tdigest
} // namespace detail
} // namespace cudf
} // namespace cudf
Loading