Skip to content

Commit

Permalink
Improvements to tdigest aggregation code. (#9403)
Browse files Browse the repository at this point in the history
Addresses comments from the initial PR (#8983). Specifically, implements a tdigest_column_view for more cleanly accessing the various sub-columns of a tdigest column.

Includes several bounds-checking fixes for empty groups. Addresses an issue where entirely empty digests could potentially lead to incorrect min/max values, which isn't technically _wrong_ but makes constructing test cases tricky.

Authors:
  - https://github.com/nvdbaranec

Approvers:
  - AJ Schmidt (https://github.com/ajschmidt8)
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Jake Hemstad (https://github.com/jrhemstad)
  - MithunR (https://github.com/mythrocks)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: #9403
  • Loading branch information
nvdbaranec authored Oct 18, 2021
1 parent 410efd9 commit 4beee70
Show file tree
Hide file tree
Showing 12 changed files with 871 additions and 237 deletions.
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ test:
- test -f $PREFIX/include/cudf/structs/detail/concatenate.hpp
- test -f $PREFIX/include/cudf/table/table.hpp
- test -f $PREFIX/include/cudf/table/table_view.hpp
- test -f $PREFIX/include/cudf/tdigest/tdigest_column_view.cuh
- test -f $PREFIX/include/cudf/transform.hpp
- test -f $PREFIX/include/cudf/transpose.hpp
- test -f $PREFIX/include/cudf/types.hpp
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,7 @@ add_library(cudf
src/partitioning/partitioning.cu
src/partitioning/round_robin.cu
src/quantiles/tdigest/tdigest.cu
src/quantiles/tdigest/tdigest_column_view.cpp
src/quantiles/quantile.cu
src/quantiles/quantiles.cu
src/reductions/all.cu
Expand Down
5 changes: 3 additions & 2 deletions cpp/include/cudf/detail/quantiles.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#pragma once

#include <cudf/quantiles.hpp>
#include <cudf/tdigest/tdigest_column_view.cuh>

#include <rmm/cuda_stream_view.hpp>

Expand Down Expand Up @@ -52,13 +53,13 @@ std::unique_ptr<table> quantiles(
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::percentile_approx(column_view const&, column_view const&,
* @copydoc cudf::percentile_approx(tdigest_column_view const&, column_view const&,
* rmm::mr::device_memory_resource*)
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> percentile_approx(
column_view const& input,
tdigest::tdigest_column_view const& input,
column_view const& percentiles,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
Expand Down
54 changes: 22 additions & 32 deletions cpp/include/cudf/detail/tdigest/tdigest.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,42 +23,32 @@ namespace detail {

namespace tdigest {

// mean and weight column indices within tdigest inner struct columns
constexpr size_type mean_column_index = 0;
constexpr size_type weight_column_index = 1;

// min and max column indices within tdigest outer struct columns
constexpr size_type centroid_column_index = 0;
constexpr size_type min_column_index = 1;
constexpr size_type max_column_index = 2;

/**
* @brief Verifies that the input column is a valid tdigest column.
*
* struct {
* // centroids for the digest
* list {
* struct {
* double // mean
* double // weight
* },
* ...
* }
* // these are from the input stream, not the centroids. they are used
* // during the percentile_approx computation near the beginning or
* // end of the quantiles
* double // min
* double // max
* }
*
* Each output row is a single tdigest. The length of the row is the "size" of the
* tdigest, each element of which represents a weighted centroid (mean, weight).
* @brief Create a tdigest column from its constituent components.
*
* @param col Column to be checked
* @param num_rows The number of rows in the output column.
* @param centroid_means The inner means column. These values are partitioned into lists by the
* `tdigest_offsets` column.
* @param centroid_weights The inner weights column. These values are partitioned into lists by the
* `tdigest_offsets` column.
* @param tdigest_offsets Offsets representing each individual tdigest in the output column. The
* offsets partition the centroid means and weights.
* @param min_values Column representing the minimum input value for each tdigest.
* @param max_values Column representing the maximum input value for each tdigest.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory.
*
* @throws cudf::logic_error if the column is not a valid tdigest column.
* @returns The constructed tdigest column.
*/
void check_is_valid_tdigest_column(column_view const& col);
std::unique_ptr<column> make_tdigest_column(
size_type num_rows,
std::unique_ptr<column>&& centroid_means,
std::unique_ptr<column>&& centroid_weights,
std::unique_ptr<column>&& tdigest_offsets,
std::unique_ptr<column>&& min_values,
std::unique_ptr<column>&& max_values,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create an empty tdigest column.
Expand Down
4 changes: 2 additions & 2 deletions cpp/include/cudf/quantiles.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
#pragma once

#include <cudf/scalar/scalar.hpp>
#include <cudf/structs/structs_column_view.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/tdigest/tdigest_column_view.cuh>
#include <cudf/types.hpp>

namespace cudf {
Expand Down Expand Up @@ -121,7 +121,7 @@ std::unique_ptr<table> quantiles(
* @returns LIST Column containing requested percentile values as FLOAT64.
*/
std::unique_ptr<column> percentile_approx(
structs_column_view const& input,
tdigest::tdigest_column_view const& input,
column_view const& percentiles,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

Expand Down
126 changes: 126 additions & 0 deletions cpp/include/cudf/tdigest/tdigest_column_view.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/column/column_view.hpp>
#include <cudf/detail/iterator.cuh>
#include <cudf/lists/lists_column_view.hpp>

namespace cudf {
namespace tdigest {

/**
 * @brief Functor that computes the size of a single tdigest from an offsets array.
 *
 * The call operator reads `offsets[tdigest_index]` and `offsets[tdigest_index + 1]`,
 * so `offsets` must reference at least `tdigest_index + 2` readable elements.
 */
struct tdigest_size {
  size_type const* offsets;  ///< Offsets array; consecutive entries bound one tdigest

  /**
   * @brief Returns the size of the tdigest at the given index.
   *
   * Declared `const` because the call only reads `offsets`; this allows the functor
   * to be invoked through a const object or a const reference.
   *
   * @param tdigest_index Index of the tdigest to query.
   * @return The difference between the offsets at `tdigest_index + 1` and `tdigest_index`.
   */
  __device__ size_type operator()(size_type tdigest_index) const
  {
    auto const first = offsets[tdigest_index];
    auto const last  = offsets[tdigest_index + 1];
    return last - first;
  }
};

/**
* @brief Given a column_view containing tdigest data, an instance of this class
* provides a wrapper on the compound column for tdigest operations.
*
* A tdigest is a "compressed" set of input scalars represented as a sorted
* set of centroids (https://arxiv.org/pdf/1902.04023.pdf).
* This data can be queried for quantile information. Each row in a tdigest
* column represents an entire tdigest.
*
* The column has the following structure:
*
* struct {
* // centroids for the digest
* list {
* struct {
* double // mean
* double // weight
* }
* }
* // these are from the input stream, not the centroids. they are used
* // during the percentile_approx computation near the beginning or
* // end of the quantiles
* double // min
* double // max
* }
*/
class tdigest_column_view : private column_view {
public:
tdigest_column_view(column_view const& col);
tdigest_column_view(tdigest_column_view&& tdigest_view) = default;
tdigest_column_view(const tdigest_column_view& tdigest_view) = default;
~tdigest_column_view() = default;
tdigest_column_view& operator=(tdigest_column_view const&) = default;
tdigest_column_view& operator=(tdigest_column_view&&) = default;

using column_view::size;
static_assert(std::is_same_v<offset_type, size_type>,
"offset_type is expected to be the same as size_type.");
using offset_iterator = offset_type const*;

// mean and weight column indices within tdigest inner struct columns
static constexpr size_type mean_column_index{0};
static constexpr size_type weight_column_index{1};

// min and max column indices within tdigest outer struct columns
static constexpr size_type centroid_column_index{0};
static constexpr size_type min_column_index{1};
static constexpr size_type max_column_index{2};

/**
* @brief Returns the parent column.
*/
column_view parent() const;

/**
* @brief Returns the column of centroids
*/
lists_column_view centroids() const;

/**
* @brief Returns the internal column of mean values
*/
column_view means() const;

/**
* @brief Returns the internal column of weight values
*/
column_view weights() const;

/**
* @brief Returns an iterator that returns the size of each tdigest
* in the column (each row is 1 digest)
*/
auto size_begin() const
{
return cudf::detail::make_counting_transform_iterator(
0, tdigest_size{centroids().offsets_begin()});
}

/**
* @brief Returns the first min value for the column. Each row corresponds
* to the minimum value for the accompanying digest.
*/
double const* min_begin() const;

/**
* @brief Returns the first max value for the column. Each row corresponds
* to the maximum value for the accompanying digest.
*/
double const* max_begin() const;
};

} // namespace tdigest
} // namespace cudf
Loading

0 comments on commit 4beee70

Please sign in to comment.