From ba763105e006494a536c1a2fafc5112ab3dae362 Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Fri, 24 Sep 2021 09:37:11 -0500 Subject: [PATCH] Support for using tdigests to compute approximate percentiles. (#8983) Addresses https://github.com/rapidsai/cudf/issues/7170 Adds 3 pieces of new functionality: - A `TDIGEST` aggregation which creates a tdigest column (https://arxiv.org/pdf/1902.04023.pdf) from a stream of input scalars. - A `MERGE_TDIGEST` aggregation which merges multiple tdigest columns into a new one. - a `percentile_approx` function which performs percentile queries on tdigest data. Also exposes several ::detail functions (`sort`, `merge`, `slice`) in detail headers. Ready for review. I do need to add more tests though. Authors: - https://github.com/nvdbaranec Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Jake Hemstad (https://github.com/jrhemstad) - MithunR (https://github.com/mythrocks) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/8983 --- conda/recipes/libcudf/meta.yaml | 1 + cpp/CMakeLists.txt | 4 +- cpp/include/cudf/aggregation.hpp | 79 +- .../cudf/detail/aggregation/aggregation.hpp | 76 ++ cpp/include/cudf/detail/copy.hpp | 9 + cpp/include/cudf/detail/merge.cuh | 17 + cpp/include/cudf/detail/quantiles.hpp | 18 +- cpp/include/cudf/detail/sorting.hpp | 16 +- cpp/include/cudf/detail/tdigest/tdigest.hpp | 79 ++ cpp/include/cudf/quantiles.hpp | 28 + cpp/include/cudf/sorting.hpp | 6 +- cpp/include/cudf_test/column_utilities.hpp | 7 +- cpp/src/aggregation/aggregation.cpp | 41 + cpp/src/copying/slice.cu | 34 +- cpp/src/groupby/sort/aggregate.cpp | 91 ++ cpp/src/groupby/sort/group_reductions.hpp | 88 ++ cpp/src/groupby/sort/group_tdigest.cu | 841 ++++++++++++++++++ cpp/src/quantiles/tdigest/tdigest.cu | 383 ++++++++ cpp/src/sort/sort.cu | 8 +- cpp/src/sort/stable_sort.cu | 4 +- cpp/tests/CMakeLists.txt | 2 + cpp/tests/groupby/groupby_test_util.hpp | 55 ++ cpp/tests/groupby/tdigest_tests.cu | 584 ++++++++++++ cpp/tests/quantiles/percentile_approx_test.cu | 435 +++++++++ cpp/tests/utilities/column_utilities.cu | 61 +- 25 files changed, 2919 insertions(+), 48 deletions(-) create mode 100644 cpp/include/cudf/detail/tdigest/tdigest.hpp create mode 100644 cpp/src/groupby/sort/group_tdigest.cu create mode 100644 cpp/src/quantiles/tdigest/tdigest.cu create mode 100644 cpp/tests/groupby/tdigest_tests.cu create mode 100644 cpp/tests/quantiles/percentile_approx_test.cu diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index c3450fe8d88..fd687de6698 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -93,6 +93,7 @@ test: - test -f $PREFIX/include/cudf/detail/sequence.hpp - test -f $PREFIX/include/cudf/detail/sorting.hpp - test -f $PREFIX/include/cudf/detail/stream_compaction.hpp + - test -f $PREFIX/include/cudf/detail/tdigest/tdigest.hpp - test -f $PREFIX/include/cudf/detail/transform.hpp - test -f $PREFIX/include/cudf/detail/transpose.hpp - test -f $PREFIX/include/cudf/detail/unary.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2df35aa0971..00af1973cfe 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -236,8 +236,9 @@ add_library(cudf src/groupby/sort/group_max_scan.cu src/groupby/sort/group_min_scan.cu src/groupby/sort/group_rank_scan.cu - src/groupby/sort/group_sum_scan.cu src/groupby/sort/group_replace_nulls.cu + 
src/groupby/sort/group_sum_scan.cu + src/groupby/sort/group_tdigest.cu src/groupby/sort/sort_helper.cu src/hash/hashing.cu src/hash/md5_hash.cu @@ -318,6 +319,7 @@ add_library(cudf src/merge/merge.cu src/partitioning/partitioning.cu src/partitioning/round_robin.cu + src/quantiles/tdigest/tdigest.cu src/quantiles/quantile.cu src/quantiles/quantiles.cu src/reductions/all.cu diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index c302895880d..fb6401a3cc1 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -87,7 +87,9 @@ class aggregation { CUDA, ///< CUDA UDF based reduction MERGE_LISTS, ///< merge multiple lists values into one list MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries - MERGE_M2 ///< merge partial values of M2 aggregation + MERGE_M2, ///< merge partial values of M2 aggregation + TDIGEST, ///< create a tdigest from a set of input values + MERGE_TDIGEST ///< create a tdigest by merging multiple tdigests together }; aggregation() = delete; @@ -493,5 +495,80 @@ std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal = nu template std::unique_ptr make_merge_m2_aggregation(); +/** + * @brief Factory to create a TDIGEST aggregation + * + * Produces a tdigest (https://arxiv.org/pdf/1902.04023.pdf) column from input values. + * The input aggregation values are expected to be fixed-width numeric types. + * + * The tdigest column produced is of the following structure: + * + * struct { + * // centroids for the digest + * list { + * struct { + * double // mean + * double // weight + * }, + * ... + * } + * // these are from the input stream, not the centroids. they are used + * // during the percentile_approx computation near the beginning or + * // end of the quantiles + * double // min + * double // max + * } + * + * Each output row is a single tdigest. The length of the row is the "size" of the + * tdigest, each element of which represents a weighted centroid (mean, weight). + * + * @param max_centroids Parameter controlling compression level and accuracy on subsequent + * queries on the output tdigest data. `max_centroids` places an upper bound on the size of + * the computed tdigests: A value of 1000 will result in a tdigest containing no + * more than 1000 centroids (32 bytes each). Higher result in more accurate tdigest information. + * + * @returns A TDIGEST aggregation object. + */ +template +std::unique_ptr make_tdigest_aggregation(int max_centroids = 1000); + +/** + * @brief Factory to create a MERGE_TDIGEST aggregation + * + * Merges the results from a previous aggregation resulting from a `make_tdigest_aggregation` + * or `make_merge_tdigest_aggregation` to produce a new a tdigest + * (https://arxiv.org/pdf/1902.04023.pdf) column. + * + * The tdigest column produced is of the following structure: + * + * struct { + * // centroids for the digest + * list { + * struct { + * double // mean + * double // weight + * }, + * ... + * } + * // these are from the input stream, not the centroids. they are used + * // during the percentile_approx computation near the beginning or + * // end of the quantiles + * double // min + * double // max + * } + * + * Each output row is a single tdigest. The length of the row is the "size" of the + * tdigest, each element of which represents a weighted centroid (mean, weight). + * + * @param max_centroids Parameter controlling compression level and accuracy on subsequent + * queries on the output tdigest data. 
`max_centroids` places an upper bound on the size of + * the computed tdigests: A value of 1000 will result in a tdigest containing no + * more than 1000 centroids (32 bytes each). Higher result in more accurate tdigest information. + * + * @returns A MERGE_TDIGEST aggregation object. + */ +template +std::unique_ptr make_merge_tdigest_aggregation(int max_centroids = 1000); + /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 5a1fc3b9398..05d1bf3e595 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -91,6 +91,10 @@ class simple_aggregations_collector { // Declares the interface for the simple class merge_sets_aggregation const& agg); virtual std::vector> visit(data_type col_type, class merge_m2_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class tdigest_aggregation const& agg); + virtual std::vector> visit( + data_type col_type, class merge_tdigest_aggregation const& agg); }; class aggregation_finalizer { // Declares the interface for the finalizer @@ -125,6 +129,8 @@ class aggregation_finalizer { // Declares the interface for the finalizer virtual void visit(class merge_lists_aggregation const& agg); virtual void visit(class merge_sets_aggregation const& agg); virtual void visit(class merge_m2_aggregation const& agg); + virtual void visit(class tdigest_aggregation const& agg); + virtual void visit(class merge_tdigest_aggregation const& agg); }; /** @@ -884,6 +890,54 @@ class merge_m2_aggregation final : public groupby_aggregation { void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } }; +/** + * @brief Derived aggregation class for specifying TDIGEST aggregation + */ +class tdigest_aggregation final : public groupby_aggregation { + public: + explicit tdigest_aggregation(int max_centroids_) + : aggregation{TDIGEST}, max_centroids{max_centroids_} + { + } + + int const max_centroids; + + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + +/** + * @brief Derived aggregation class for specifying MERGE_TDIGEST aggregation + */ +class merge_tdigest_aggregation final : public groupby_aggregation { + public: + explicit merge_tdigest_aggregation(int max_centroids_) + : aggregation{MERGE_TDIGEST}, max_centroids{max_centroids_} + { + } + + int const max_centroids; + + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + /** * @brief Sentinel value used for `ARGMAX` aggregation. * @@ -1120,6 +1174,24 @@ struct target_type_impl { using type = struct_view; }; +// Always use numeric types for TDIGEST +template +struct target_type_impl() || is_fixed_point())>> { + using type = struct_view; +}; + +// TDIGEST_MERGE. The root column type for a tdigest column is a list_view. 
Strictly +// speaking, this check is not sufficient to guarantee we are actually being given a +// real tdigest column, but we will do further verification inside the aggregation code. +template +struct target_type_impl>> { + using type = struct_view; +}; + /** * @brief Helper alias to get the accumulator type for performing aggregation * `k` on elements of type `Source` @@ -1224,6 +1296,10 @@ CUDA_HOST_DEVICE_CALLABLE decltype(auto) aggregation_dispatcher(aggregation::Kin return f.template operator()(std::forward(args)...); case aggregation::MERGE_M2: return f.template operator()(std::forward(args)...); + case aggregation::TDIGEST: + return f.template operator()(std::forward(args)...); + case aggregation::MERGE_TDIGEST: + return f.template operator()(std::forward(args)...); default: { #ifndef __CUDA_ARCH__ CUDF_FAIL("Unsupported aggregation."); diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp index fb5cfad6186..9f06661c8d1 100644 --- a/cpp/include/cudf/detail/copy.hpp +++ b/cpp/include/cudf/detail/copy.hpp @@ -75,6 +75,15 @@ std::vector slice(column_view const& input, std::vector const& indices, rmm::cuda_stream_view stream = rmm::cuda_stream_default); +/** + * @copydoc cudf::slice(table_view const&,std::vector const&) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::vector slice(table_view const& input, + std::vector const& indices, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + /** * @copydoc cudf::shift(column_view const&,size_type,scalar const&, * rmm::mr::device_memory_resource*) diff --git a/cpp/include/cudf/detail/merge.cuh b/cpp/include/cudf/detail/merge.cuh index a779c3defbb..ec83e348e33 100644 --- a/cpp/include/cudf/detail/merge.cuh +++ b/cpp/include/cudf/detail/merge.cuh @@ -145,5 +145,22 @@ struct row_lexicographic_tagged_comparator { order const* _column_order{}; }; +/** + * @copydoc std::unique_ptr merge( + * std::vector const& tables_to_merge, + * std::vector const& key_cols, + * std::vector const& column_order, + * std::vector const& null_precedence, + * rmm::mr::device_memory_resource* mr) + * + * @param stream CUDA stream used for device memory operations and kernel launches + */ +std::unique_ptr merge(std::vector const& tables_to_merge, + std::vector const& key_cols, + std::vector const& column_order, + std::vector const& null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/quantiles.hpp b/cpp/include/cudf/detail/quantiles.hpp index 5fb2ce4cbe6..7a76f9cab88 100644 --- a/cpp/include/cudf/detail/quantiles.hpp +++ b/cpp/include/cudf/detail/quantiles.hpp @@ -22,7 +22,8 @@ namespace cudf { namespace detail { -/** @copydoc cudf::quantile() +/** + * @copydoc cudf::quantile() * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -35,7 +36,8 @@ std::unique_ptr quantile( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** @copydoc cudf::quantiles() +/** + * @copydoc cudf::quantiles() * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ @@ -49,5 +51,17 @@ std::unique_ptr quantiles( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::percentile_approx(column_view const&, column_view const&, + * rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr percentile_approx( + column_view const& input, + column_view const& percentiles, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/sorting.hpp b/cpp/include/cudf/detail/sorting.hpp index 3127a5f89f1..b5dfb34c043 100644 --- a/cpp/include/cudf/detail/sorting.hpp +++ b/cpp/include/cudf/detail/sorting.hpp @@ -32,7 +32,7 @@ namespace detail { * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr sorted_order( - table_view input, + table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::cuda_stream_view stream = rmm::cuda_stream_default, @@ -44,7 +44,7 @@ std::unique_ptr sorted_order( * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr stable_sorted_order( - table_view input, + table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::cuda_stream_view stream = rmm::cuda_stream_default, @@ -90,5 +90,17 @@ std::unique_ptr
segmented_sort_by_key( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::sort + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr
sort( + table_view const& values, + std::vector const& column_order = {}, + std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp new file mode 100644 index 00000000000..94c22911c1e --- /dev/null +++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +namespace cudf { +namespace detail { + +namespace tdigest { + +// mean and weight column indices within tdigest inner struct columns +constexpr size_type mean_column_index = 0; +constexpr size_type weight_column_index = 1; + +// min and max column indices within tdigest outer struct columns +constexpr size_type centroid_column_index = 0; +constexpr size_type min_column_index = 1; +constexpr size_type max_column_index = 2; + +/** + * @brief Verifies that the input column is a valid tdigest column. + * + * struct { + * // centroids for the digest + * list { + * struct { + * double // mean + * double // weight + * }, + * ... + * } + * // these are from the input stream, not the centroids. they are used + * // during the percentile_approx computation near the beginning or + * // end of the quantiles + * double // min + * double // max + * } + * + * Each output row is a single tdigest. The length of the row is the "size" of the + * tdigest, each element of which represents a weighted centroid (mean, weight). + * + * @param col Column to be checkeed + * + * @throws cudf::logic error if the column is not a valid tdigest column. + */ +void check_is_valid_tdigest_column(column_view const& col); + +/** + * @brief Create an empty tdigest column. + * + * An empty tdigest column contains a single row of length 0 + * + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + * + * @returns An empty tdigest column. + */ +std::unique_ptr make_empty_tdigest_column( + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace tdigest +} // namespace detail +} // namespace cudf \ No newline at end of file diff --git a/cpp/include/cudf/quantiles.hpp b/cpp/include/cudf/quantiles.hpp index 94b5c344f4f..d21f6dff79c 100644 --- a/cpp/include/cudf/quantiles.hpp +++ b/cpp/include/cudf/quantiles.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -94,5 +95,32 @@ std::unique_ptr
quantiles( std::vector const& null_precedence = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Calculate approximate percentiles on an input tdigest column. + * + * tdigest (https://arxiv.org/pdf/1902.04023.pdf) columns are produced specifically + * by the TDIGEST and MERGE_TDIGEST aggregations. These columns represent + * compressed representations of a very large input data set that can be + * queried for quantile information. + * + * Produces a LIST column where each row `i` represents output from querying the + * corresponding tdigest from `input` row `i`. The length of each output list + * is the number of percentages specified in `percentages`. + * + * @param input tdigest input data. One tdigest per row. + * @param percentiles Desired percentiles in range [0, 1]. + * @param mr Device memory resource used to allocate the returned column's device + * memory + * + * @throws cudf::logic_error if `input` is not a valid tdigest column. + * @throws cudf::logic_error if `percentiles` is not a FLOAT64 column. + * + * @returns LIST Column containing requested percentile values as FLOAT64. + */ +std::unique_ptr percentile_approx( + structs_column_view const& input, + column_view const& percentiles, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp index 36a8131a78e..69eb8b3490a 100644 --- a/cpp/include/cudf/sorting.hpp +++ b/cpp/include/cudf/sorting.hpp @@ -58,7 +58,7 @@ enum class rank_method { * `input` if it were sorted */ std::unique_ptr sorted_order( - table_view input, + table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -72,7 +72,7 @@ std::unique_ptr sorted_order( * @copydoc cudf::sorted_order */ std::unique_ptr stable_sorted_order( - table_view input, + table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -112,7 +112,7 @@ bool is_sorted(cudf::table_view const& table, * @return New table containing the desired sorted order of `input` */ std::unique_ptr
sort( - table_view input, + table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp index 553d8a97bd2..aa77686fee4 100644 --- a/cpp/include/cudf_test/column_utilities.hpp +++ b/cpp/include/cudf_test/column_utilities.hpp @@ -38,6 +38,8 @@ enum class debug_output_level { QUIET // no debug output }; +constexpr size_type default_ulp = 4; + /** * @brief Verifies the property equality of two columns. * @@ -93,12 +95,15 @@ bool expect_columns_equal(cudf::column_view const& lhs, * @param lhs The first column * @param rhs The second column * @param verbosity Level of debug output verbosity + * @param fp_ulps # of ulps of tolerance to allow when comparing + * floating point values * * @returns True if the columns (and their properties) are equivalent, false otherwise */ bool expect_columns_equivalent(cudf::column_view const& lhs, cudf::column_view const& rhs, - debug_output_level verbosity = debug_output_level::FIRST_ERROR); + debug_output_level verbosity = debug_output_level::FIRST_ERROR, + size_type fp_ulps = cudf::test::default_ulp); /** * @brief Verifies the bitwise equality of two device memory buffers. diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index c3d992e1181..b550b61785b 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -202,6 +202,18 @@ std::vector> simple_aggregations_collector::visit( return visit(col_type, static_cast(agg)); } +std::vector> simple_aggregations_collector::visit( + data_type col_type, tdigest_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, merge_tdigest_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + // aggregation_finalizer ---------------------------------------- void aggregation_finalizer::visit(aggregation const& agg) {} @@ -346,6 +358,16 @@ void aggregation_finalizer::visit(merge_m2_aggregation const& agg) visit(static_cast(agg)); } +void aggregation_finalizer::visit(tdigest_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(merge_tdigest_aggregation const& agg) +{ + visit(static_cast(agg)); +} + } // namespace detail std::vector> aggregation::get_simple_aggregations( @@ -668,6 +690,25 @@ std::unique_ptr make_merge_m2_aggregation() template std::unique_ptr make_merge_m2_aggregation(); template std::unique_ptr make_merge_m2_aggregation(); +template +std::unique_ptr make_tdigest_aggregation(int max_centroids) +{ + return std::make_unique(max_centroids); +} +template std::unique_ptr make_tdigest_aggregation(int max_centroids); +template std::unique_ptr make_tdigest_aggregation( + int max_centroids); + +template +std::unique_ptr make_merge_tdigest_aggregation(int max_centroids) +{ + return std::make_unique(max_centroids); +} +template std::unique_ptr make_merge_tdigest_aggregation( + int max_centroids); +template std::unique_ptr make_merge_tdigest_aggregation( + int max_centroids); + namespace detail { namespace { struct target_type_functor { diff --git a/cpp/src/copying/slice.cu b/cpp/src/copying/slice.cu index 0e41689dc4b..d1c12056393 100644 --- a/cpp/src/copying/slice.cu +++ b/cpp/src/copying/slice.cu @@ -63,17 +63,9 @@ std::vector slice(column_view const& input, return 
std::vector{begin, begin + indices.size() / 2}; } -} // namespace detail - -std::vector slice(cudf::column_view const& input, - std::vector const& indices) -{ - CUDF_FUNC_RANGE(); - return detail::slice(input, indices, rmm::cuda_stream_default); -} - -std::vector slice(cudf::table_view const& input, - std::vector const& indices) +std::vector slice(table_view const& input, + std::vector const& indices, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(indices.size() % 2 == 0, "indices size must be even"); @@ -81,7 +73,7 @@ std::vector slice(cudf::table_view const& input, // 2d arrangement of column_views that represent the outgoing table_views sliced_table[i][j] // where i is the i'th column of the j'th table_view - auto op = [&indices](auto const& c) { return cudf::slice(c, indices); }; + auto op = [&indices, stream](auto const& c) { return cudf::detail::slice(c, indices, stream); }; auto f = thrust::make_transform_iterator(input.begin(), op); auto sliced_table = std::vector>(f, f + input.num_columns()); @@ -99,6 +91,22 @@ std::vector slice(cudf::table_view const& input, } return result; -}; +} + +} // namespace detail + +std::vector slice(cudf::column_view const& input, + std::vector const& indices) +{ + CUDF_FUNC_RANGE(); + return detail::slice(input, indices, rmm::cuda_stream_default); +} + +std::vector slice(cudf::table_view const& input, + std::vector const& indices) +{ + CUDF_FUNC_RANGE(); + return detail::slice(input, indices, rmm::cuda_stream_default); +} } // namespace cudf diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 726b51b7702..9f3d67ac38b 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -525,6 +525,97 @@ void aggregate_result_functor::operator()(aggregation con get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr)); }; +/** + * @brief Generate a tdigest column from a grouped set of numeric input values. + * + * The tdigest column produced is of the following structure: + * + * struct { + * // centroids for the digest + * list { + * struct { + * double // mean + * double // weight + * }, + * ... + * } + * // these are from the input stream, not the centroids. they are used + * // during the percentile_approx computation near the beginning or + * // end of the quantiles + * double // min + * double // max + * } + * + * Each output row is a single tdigest. The length of the row is the "size" of the + * tdigest, each element of which represents a weighted centroid (mean, weight). + */ +template <> +void aggregate_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(col_idx, agg)) { return; } + + auto const max_centroids = + dynamic_cast(agg).max_centroids; + + auto count_agg = make_count_aggregation(); + operator()(*count_agg); + column_view valid_counts = cache.get_result(col_idx, *count_agg); + + cache.add_result(col_idx, + agg, + detail::group_tdigest( + get_sorted_values(), + helper.group_offsets(stream), + helper.group_labels(stream), + {valid_counts.begin(), static_cast(valid_counts.size())}, + helper.num_groups(stream), + max_centroids, + stream, + mr)); +}; + +/** + * @brief Generate a merged tdigest column from a grouped set of input tdigest columns. + * + * The tdigest column produced is of the following structure: + * + * struct { + * // centroids for the digest + * list { + * struct { + * double // mean + * double // weight + * }, + * ... + * } + * // these are from the input stream, not the centroids. 
they are used + * // during the percentile_approx computation near the beginning or + * // end of the quantiles + * double // min + * double // max + * } + * + * Each output row is a single tdigest. The length of the row is the "size" of the + * tdigest, each element of which represents a weighted centroid (mean, weight). + */ +template <> +void aggregate_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(col_idx, agg)) { return; } + + auto const max_centroids = + dynamic_cast(agg).max_centroids; + cache.add_result(col_idx, + agg, + detail::group_merge_tdigest(get_grouped_values(), + helper.group_offsets(stream), + helper.group_labels(stream), + helper.num_groups(stream), + max_centroids, + stream, + mr)); +}; + } // namespace detail // Sort-based groupby diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index 2770162da2d..cb01ee8e053 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -442,6 +442,94 @@ std::unique_ptr group_merge_m2(column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Generate a tdigest column from a grouped set of numeric input values. + * + * The tdigest column produced is of the following structure: + * + * struct { + * // centroids for the digest + * list { + * struct { + * double // mean + * double // weight + * }, + * ... + * } + * // these are from the input stream, not the centroids. they are used + * // during the percentile_approx computation near the beginning or + * // end of the quantiles + * double // min + * double // max + * } + * + * Each output row is a single tdigest. The length of the row is the "size" of the + * tdigest, each element of which represents a weighted centroid (mean, weight). + * + * @param values Grouped (and sorted) values to merge. + * @param group_offsets Offsets of groups' starting points within @p values. + * @param group_labels 0-based ID of group that the corresponding value belongs to + * @param group_valid_counts Per-group counts of valid elements. + * @param num_groups Number of groups. + * @param max_centroids Parameter controlling the level of compression of the tdigest. Higher + * values result in a larger, more precise tdigest. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns tdigest column, with 1 tdigest per row + */ +std::unique_ptr group_tdigest(column_view const& values, + cudf::device_span group_offsets, + cudf::device_span group_labels, + cudf::device_span group_valid_counts, + size_type num_groups, + int max_centroids, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Merges tdigests within the same group to generate a new tdigest. + * + * The tdigest column produced is of the following structure: + * + * struct { + * // centroids for the digest + * list { + * struct { + * double // mean + * double // weight + * }, + * ... + * } + * // these are from the input stream, not the centroids. they are used + * // during the percentile_approx computation near the beginning or + * // end of the quantiles + * double // min + * double // max + * } + * + * Each output row is a single tdigest. The length of the row is the "size" of the + * tdigest, each element of which represents a weighted centroid (mean, weight). + * + * @param values Grouped tdigests to merge. 
+ * @param group_offsets Offsets of groups' starting points within @p values. + * @param group_labels 0-based ID of group that the corresponding value belongs to + * @param num_groups Number of groups. + * @param max_centroids Parameter controlling the level of compression of the tdigest. Higher + * values result in a larger, more precise tdigest. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns tdigest column, with 1 tdigest per row + */ +std::unique_ptr group_merge_tdigest(column_view const& values, + cudf::device_span group_offsets, + cudf::device_span group_labels, + size_type num_groups, + int max_centroids, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + /** @endinternal * */ diff --git a/cpp/src/groupby/sort/group_tdigest.cu b/cpp/src/groupby/sort/group_tdigest.cu new file mode 100644 index 00000000000..5b4252a9063 --- /dev/null +++ b/cpp/src/groupby/sort/group_tdigest.cu @@ -0,0 +1,841 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +namespace cudf { +namespace groupby { +namespace detail { + +namespace { + +// the most representative point within a cluster of similar +// values. {mean, weight} +// NOTE: Using a tuple here instead of a struct to take advantage of +// thrust zip iterators for output. +using centroid = thrust::tuple; + +// make a centroid from a scalar with a weight of 1. +template +struct make_centroid { + column_device_view const col; + + centroid operator() __device__(size_type index) + { + return {static_cast(col.element(index)), 1, col.is_valid(index)}; + } +}; + +// make a centroid from an input stream of mean/weight values. +struct make_weighted_centroid { + double const* mean; + double const* weight; + + centroid operator() __device__(size_type index) { return {mean[index], weight[index], true}; } +}; + +// merge two centroids +struct merge_centroids { + centroid operator() __device__(centroid const& lhs, centroid const& rhs) + { + bool const lhs_valid = thrust::get<2>(lhs); + bool const rhs_valid = thrust::get<2>(rhs); + if (!lhs_valid && !rhs_valid) { return {0, 0, false}; } + if (!lhs_valid) { return rhs; } + if (!rhs_valid) { return lhs; } + + double const lhs_mean = thrust::get<0>(lhs); + double const rhs_mean = thrust::get<0>(rhs); + double const lhs_weight = thrust::get<1>(lhs); + double const rhs_weight = thrust::get<1>(rhs); + double const new_weight = lhs_weight + rhs_weight; + return {(lhs_mean * lhs_weight + rhs_mean * rhs_weight) / new_weight, new_weight, true}; + } +}; + +/** + * @brief A functor which returns the nearest cumulative weight in the input stream prior to the + * specified next weight limit. 
+ * + * This functor assumes the weight for all scalars is simply 1. Under this assumption, + * the nearest weight that will be <= the next limit is simply the nearest integer < the limit, + * which we can get by just taking floor(next_limit). For example if our next limit is 3.56, the + * nearest whole number <= it is floor(3.56) == 3. + */ +struct nearest_value_scalar_weights { + thrust::pair operator() __device__(double next_limit, size_type) + { + double const f = floor(next_limit); + return {f, max(0, static_cast(next_limit) - 1)}; + } +}; + +/** + * @brief A functor which returns the nearest cumulative weight in the input stream prior to the + * specified next weight limit. + * + * This functor assumes we are dealing with grouped, sorted, weighted centroids. + */ +struct nearest_value_centroid_weights { + double const* cumulative_weights; + offset_type const* outer_offsets; // groups + offset_type const* inner_offsets; // tdigests within a group + + thrust::pair operator() __device__(double next_limit, size_type group_index) + { + auto const tdigest_begin = outer_offsets[group_index]; + auto const tdigest_end = outer_offsets[group_index + 1]; + auto const num_weights = inner_offsets[tdigest_end] - inner_offsets[tdigest_begin]; + double const* group_cumulative_weights = cumulative_weights + inner_offsets[tdigest_begin]; + + auto const index = ((thrust::lower_bound(thrust::seq, + group_cumulative_weights, + group_cumulative_weights + num_weights, + next_limit)) - + group_cumulative_weights); + + return index == 0 ? thrust::pair{0, 0} + : thrust::pair{group_cumulative_weights[index - 1], index - 1}; + } +}; + +/** + * @brief A functor which returns the cumulative input weight for a given index in a + * set of grouped input values. + * + * This functor assumes the weight for all scalars is simply 1. Under this assumption, + * the cumulative weight for a given value index I is simply I+1. + */ +struct cumulative_scalar_weight { + cudf::device_span group_offsets; + cudf::device_span group_labels; + std::tuple operator() __device__(size_type value_index) const + { + auto const group_index = group_labels[value_index]; + auto const relative_value_index = value_index - group_offsets[group_index]; + return {group_index, relative_value_index, relative_value_index + 1}; + } +}; + +/** + * @brief A functor which returns the cumulative input weight for a given index in a + * set of grouped input centroids. + * + * This functor assumes we are dealing with grouped, weighted centroids. 
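+ * For example (a purely hypothetical layout): with inner_offsets = {0, 3, 7}, a value_index of 5
+ * falls in the second tdigest (found via upper_bound), and its cumulative weight is read from the
+ * precomputed running sum of that group's centroid weights, rather than being relative index + 1
+ * as in the scalar case.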
+ */ +struct cumulative_centroid_weight { + double const* cumulative_weights; + cudf::device_span group_labels; + offset_type const* outer_offsets; // groups + cudf::device_span inner_offsets; // tdigests with a group + + std::tuple operator() __device__(size_type value_index) const + { + auto const tdigest_index = + static_cast( + thrust::upper_bound(thrust::seq, inner_offsets.begin(), inner_offsets.end(), value_index) - + inner_offsets.begin()) - + 1; + auto const group_index = group_labels[tdigest_index]; + auto const first_tdigest_index = outer_offsets[group_index]; + auto const first_weight_index = inner_offsets[first_tdigest_index]; + auto const relative_value_index = value_index - first_weight_index; + double const* group_cumulative_weights = cumulative_weights + first_weight_index; + + return {group_index, relative_value_index, group_cumulative_weights[relative_value_index]}; + } +}; + +// a monotonically increasing scale function which produces a distribution +// of centroids that is more densely packed in the middle of the input +// than at the ends. +__device__ double scale_func_k1(double quantile, double delta_norm) +{ + double k = delta_norm * asin(2.0 * quantile - 1.0); + k += 1.0; + double q = (sin(k / delta_norm) + 1.0) / 2.0; + return q; +} + +/** + * @brief Compute a set of cluster limits (brackets, essentially) for a + * given tdigest based on the specified delta and the total weight of values + * to be added. + * + * The number of clusters generated will always be <= delta_, where delta_ is + * a reasonably small number likely << 10000. + * + * Each input group gets an independent set of clusters generated. 1 thread + * per group. + * + * This kernel is called in a two-pass style. Once to compute the per-group + * cluster sizes and total # of clusters, and once to compute the actual + * weight limits per cluster. + * + * @param delta_ tdigest compression level + * @param num_groups The number of input groups + * @param nearest_weight_ A functor which returns the nearest weight in the input + * stream that falls before our current cluster limit + * @param total_weight_ A functor which returns the expected total weight for + * the entire stream of input values for the specified group. + * @param group_cluster_wl Output. The set of cluster weight limits for each group. + * @param group_num_clusters Output. The number of output clusters for each input group. + * @param group_cluster_offsets Offsets per-group to the start of it's clusters + * + */ +template +__global__ void generate_cluster_limits_kernel(int delta_, + size_type num_groups, + NearestWeightFunc nearest_weight, + TotalWeightIter total_weight_, + CumulativeWeight cumulative_weight, + double* group_cluster_wl, + size_type* group_num_clusters, + offset_type const* group_cluster_offsets) +{ + int const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto const group_index = tid; + if (group_index >= num_groups) { return; } + + // we will generate at most delta clusters. + double const delta = static_cast(delta_); + double const delta_norm = delta / (2.0 * M_PI); + double const total_weight = total_weight_[group_index]; + group_num_clusters[group_index] = 0; + // a group with nothing in it. + if (total_weight <= 0) { return; } + + // start at the correct place based on our cluster offset. + double* cluster_wl = + group_cluster_wl ? 
group_cluster_wl + group_cluster_offsets[group_index] : nullptr; + + double cur_limit = 0.0; + double cur_weight = 0.0; + double next_limit = -1.0; + int last_inserted_index = -1; + + // compute the first cluster limit + double nearest_w; + int nearest_w_index; + while (1) { + cur_weight = next_limit < 0 ? 0 : max(cur_weight + 1, nearest_w); + if (cur_weight >= total_weight) { break; } + + // based on where we are closing the cluster off (not including the incoming weight), + // compute the next cluster limit + double const quantile = cur_weight / total_weight; + next_limit = total_weight * scale_func_k1(quantile, delta_norm); + + // if the next limit is < the cur limit, we're past the end of the distribution, so we're done. + if (next_limit <= cur_limit) { + if (cluster_wl) { cluster_wl[group_num_clusters[group_index]] = total_weight; } + group_num_clusters[group_index]++; + break; + } + + // compute the weight we will be at in the input values just before closing off the current + // cluster (because adding the next value will cross the current limit). + // NOTE: can't use structured bindings here. + thrust::tie(nearest_w, nearest_w_index) = nearest_weight(next_limit, group_index); + + if (cluster_wl) { + // because of the way the scale functions work, it is possible to generate clusters + // in such a way that we end up with "gaps" where there are no input values that + // fall into a given cluster. An example would be this: + // + // cluster weight limits = 0.00003, 1.008, 3.008 + // + // input values(weight) = A(1), B(2), C(3) + // + // naively inserting these values into the clusters simply by taking a lower_bound, + // we would get the following distribution of input values into those 3 clusters. + // (), (A), (B,C) + // + // whereas what we really want is: + // + // (A), (B), (C) + // + // to fix this, we will artificially adjust the output cluster limits to guarantee + // at least 1 input value will be put in each cluster during the reduction step. + // this does not affect final centroid results as we still use the "real" weight limits + // to compute subsequent clusters - the purpose is only to allow cluster selection + // during the reduction step to be trivial. + // + double adjusted_next_limit = next_limit; + if (nearest_w_index == last_inserted_index || last_inserted_index < 0) { + nearest_w_index = last_inserted_index + 1; + auto [r, i, adjusted] = cumulative_weight(nearest_w_index); + adjusted_next_limit = max(next_limit, adjusted); + } + cluster_wl[group_num_clusters[group_index]] = adjusted_next_limit; + last_inserted_index = nearest_w_index; + } + group_num_clusters[group_index]++; + cur_limit = next_limit; + } +} + +/** + * @brief Compute a set of cluster limits (brackets, essentially) for a + * given tdigest based on the specified delta and the total weight of values + * to be added. + * + * The number of clusters generated will always be <= delta_, where delta_ is + * a reasonably small number likely << 10000. + * + * Each input group gets an independent set of clusters generated. + * + * @param delta_ tdigest compression level + * @param num_groups The number of input groups + * @param nearest_weight A functor which returns the nearest weight in the input + * stream that falls before our current cluster limit + * @param total_weight A functor which returns the expected total weight for + * the entire stream of input values for the specified group. + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A tuple containing the set of cluster weight limits for each group, a set of + * list-style offsets indicating group sizes, and the total number of clusters + */ +template +std::tuple, std::unique_ptr, size_type> +generate_group_cluster_info(int delta, + size_type num_groups, + NearestWeight nearest_weight, + TotalWeightIter total_weight, + CumulativeWeight cumulative_weight, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + constexpr size_type block_size = 256; + cudf::detail::grid_1d const grid(num_groups, block_size); + + // compute number of clusters per group + // each thread computes 1 set of clusters (# of cluster sets == # of groups) + rmm::device_uvector group_num_clusters(num_groups, stream); + generate_cluster_limits_kernel<<>>( + delta, + num_groups, + nearest_weight, + total_weight, + cumulative_weight, + nullptr, + group_num_clusters.begin(), + nullptr); + + // generate group cluster offsets (where the clusters for a given group start and end) + auto group_cluster_offsets = cudf::make_fixed_width_column( + data_type{type_id::INT32}, num_groups + 1, mask_state::UNALLOCATED, stream, mr); + auto cluster_size = cudf::detail::make_counting_transform_iterator( + 0, [group_num_clusters = group_num_clusters.begin(), num_groups] __device__(size_type index) { + return index == num_groups ? 0 : group_num_clusters[index]; + }); + thrust::exclusive_scan(rmm::exec_policy(stream), + cluster_size, + cluster_size + num_groups + 1, + group_cluster_offsets->mutable_view().begin(), + 0); + + // total # of clusters + offset_type total_clusters = + cudf::detail::get_value(group_cluster_offsets->view(), num_groups, stream); + + // fill in the actual cluster weight limits + rmm::device_uvector group_cluster_wl(total_clusters, stream); + generate_cluster_limits_kernel<<>>( + delta, + num_groups, + nearest_weight, + total_weight, + cumulative_weight, + group_cluster_wl.begin(), + group_num_clusters.begin(), + group_cluster_offsets->view().begin()); + + return {std::move(group_cluster_wl), + std::move(group_cluster_offsets), + static_cast(total_clusters)}; +} + +/** + * @brief Compute a column of tdigests. + * + * Assembles the output tdigest column based on the specified delta, a stream of + * input values (either scalar or centroids), and an assortment of per-group + * clustering information. + * + * This function is effectively just a reduce_by_key that performs a reduction + * from input values -> centroid clusters as defined by the the cluster weight + * boundaries. + * + * @param delta tdigest compression level + * @param values_begin Beginning of the range of input values. + * @param values_end End of the range of input values. + * @param cumulative_weight Functor which returns cumulative weight and group information for + * an absolute input value index. + * @param min_col Column containing the minimum value per group. + * @param max_col Column containing the maximum value per group. + * @param group_cluster_wl Cluster weight limits for each group. + * @param group_cluster_offsets R-value reference of offsets into the cluster weight limits. + * @param total_clusters Total number of clusters in all groups. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A tdigest column with 1 row per output tdigest. 
+ */ +template +std::unique_ptr compute_tdigests(int delta, + CentroidIter centroids_begin, + CentroidIter centroids_end, + CumulativeWeight group_cumulative_weight, + std::unique_ptr&& min_col, + std::unique_ptr&& max_col, + rmm::device_uvector const& group_cluster_wl, + std::unique_ptr&& group_cluster_offsets, + size_type total_clusters, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // the output for each group is column of data that represents the tdigest. since we want 1 row + // per group, each row will be a list the length of the tdigest for that group. so our output + // column is of the form: + // struct { + // centroids for the digest + // list { + // struct { + // double // mean + // double // weight + // } + // } + // double // min + // double // max + // } + // + // + if (total_clusters == 0) { return cudf::detail::tdigest::make_empty_tdigest_column(stream, mr); } + std::vector> inner_children; + // mean + inner_children.push_back(cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, total_clusters, mask_state::UNALLOCATED, stream, mr)); + // weight + inner_children.push_back(cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, total_clusters, mask_state::UNALLOCATED, stream, mr)); + // tdigest struct + auto tdigests = + cudf::make_structs_column(total_clusters, std::move(inner_children), 0, {}, stream, mr); + + // each input group represents an individual tdigest. within each tdigest, we want the keys + // to represent cluster indices (for example, if a tdigest had 100 clusters, the keys should fall + // into the range 0-99). But since we have multiple tdigests, we need to keep the keys unique + // between the groups, so we add our group start offset. + auto keys = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [delta, + group_cluster_wl = group_cluster_wl.data(), + group_cluster_offsets = group_cluster_offsets->view().begin(), + group_cumulative_weight] __device__(size_type value_index) -> size_type { + auto [group_index, relative_value_index, cumulative_weight] = + group_cumulative_weight(value_index); + + // compute start of cluster weight limits for this group + double const* weight_limits = group_cluster_wl + group_cluster_offsets[group_index]; + auto const num_clusters = + group_cluster_offsets[group_index + 1] - group_cluster_offsets[group_index]; + + // local cluster index + size_type const group_cluster_index = + min(num_clusters - 1, + static_cast( + thrust::lower_bound( + thrust::seq, weight_limits, weight_limits + num_clusters, cumulative_weight) - + weight_limits)); + + // add the cluster offset to generate a globally unique key + return group_cluster_index + group_cluster_offsets[group_index]; + }); + + // reduce the centroids down by key. 
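+  // each key produced above is a globally unique cluster id, so reduce_by_key folds all
+  // centroids that map to the same cluster into a single (mean, weight) pair via the weighted
+  // average in merge_centroids; the zip output writes the resulting means and weights directly
+  // into the children of the output struct column, and the per-centroid validity flag is discarded.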
+ cudf::mutable_column_view mean_col = + tdigests->child(cudf::detail::tdigest::mean_column_index).mutable_view(); + cudf::mutable_column_view weight_col = + tdigests->child(cudf::detail::tdigest::weight_column_index).mutable_view(); + auto output = thrust::make_zip_iterator(thrust::make_tuple( + mean_col.begin(), weight_col.begin(), thrust::make_discard_iterator())); + auto const num_values = std::distance(centroids_begin, centroids_end); + thrust::reduce_by_key(rmm::exec_policy(stream), + keys, + keys + num_values, // keys + centroids_begin, // values + thrust::make_discard_iterator(), // key output + output, // output + thrust::equal_to{}, // key equality check + merge_centroids{}); + + // create the list + auto const num_groups = group_cluster_offsets->size() - 1; + auto list = cudf::make_lists_column( + num_groups, std::move(group_cluster_offsets), std::move(tdigests), 0, {}); + + // create final tdigest column + std::vector> children; + children.push_back(std::move(list)); + children.push_back(std::move(min_col)); + children.push_back(std::move(max_col)); + return make_structs_column(num_groups, std::move(children), 0, {}, stream, mr); +} + +// retrieve total weight of scalar inputs by group index +struct scalar_total_weight { + size_type const* group_valid_counts; + __device__ double operator()(size_type group_index) { return group_valid_counts[group_index]; } +}; + +// return the min/max value of scalar inputs by group index +template +struct get_scalar_minmax { + column_device_view const col; + device_span group_offsets; + size_type const* group_valid_counts; + + __device__ thrust::tuple operator()(size_type group_index) + { + // note: .element() is taking care of fixed-point conversions for us. + return {static_cast(col.element(group_offsets[group_index])), + static_cast( + col.element(group_offsets[group_index] + (group_valid_counts[group_index] - 1)))}; + } +}; + +struct typed_group_tdigest { + template < + typename T, + typename std::enable_if_t() || cudf::is_fixed_point()>* = nullptr> + std::unique_ptr operator()(column_view const& col, + cudf::device_span group_offsets, + cudf::device_span group_labels, + cudf::device_span group_valid_counts, + size_type num_groups, + int delta, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + // first, generate cluster weight information for each input group + auto total_weight = cudf::detail::make_counting_transform_iterator( + 0, scalar_total_weight{group_valid_counts.begin()}); + auto [group_cluster_wl, group_cluster_offsets, total_clusters] = + generate_group_cluster_info(delta, + num_groups, + nearest_value_scalar_weights{}, + total_weight, + cumulative_scalar_weight{group_offsets, group_labels}, + stream, + mr); + + // device column view. 
handy because the .element() function + // automatically handles fixed-point conversions for us + auto d_col = cudf::column_device_view::create(col); + + // compute min and max columns + auto min_col = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); + auto max_col = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_groups, + thrust::make_zip_iterator(thrust::make_tuple(min_col->mutable_view().begin(), + max_col->mutable_view().begin())), + get_scalar_minmax{*d_col, group_offsets, group_valid_counts.begin()}); + + // for simple input values, the "centroids" all have a weight of 1. + auto scalar_to_centroid = + cudf::detail::make_counting_transform_iterator(0, make_centroid{*d_col}); + + // generate the final tdigest + return compute_tdigests(delta, + scalar_to_centroid, + scalar_to_centroid + col.size(), + cumulative_scalar_weight{group_offsets, group_labels}, + std::move(min_col), + std::move(max_col), + group_cluster_wl, + std::move(group_cluster_offsets), + total_clusters, + stream, + mr); + } + + template < + typename T, + typename std::enable_if_t() && !cudf::is_fixed_point()>* = nullptr> + std::unique_ptr operator()(column_view const& col, + cudf::device_span group_offsets, + cudf::device_span group_labels, + cudf::device_span group_valid_counts, + size_type num_groups, + int delta, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + CUDF_FAIL("Non-numeric type in group_tdigest"); + } +}; + +} // anonymous namespace + +std::unique_ptr group_tdigest(column_view const& col, + cudf::device_span group_offsets, + cudf::device_span group_labels, + cudf::device_span group_valid_counts, + size_type num_groups, + int max_centroids, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (col.size() == 0) { return cudf::detail::tdigest::make_empty_tdigest_column(stream, mr); } + + auto const delta = max_centroids; + return cudf::type_dispatcher(col.type(), + typed_group_tdigest{}, + col, + group_offsets, + group_labels, + group_valid_counts, + num_groups, + delta, + stream, + mr); +} + +std::unique_ptr group_merge_tdigest(column_view const& input, + cudf::device_span group_offsets, + cudf::device_span group_labels, + size_type num_groups, + int max_centroids, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + cudf::detail::tdigest::check_is_valid_tdigest_column(input); + + if (num_groups == 0 || input.size() == 0) { + return cudf::detail::tdigest::make_empty_tdigest_column(stream, mr); + } + + structs_column_view scv(input); + lists_column_view lcv(scv.child(cudf::detail::tdigest::centroid_column_index)); + // ideally, we would just call .parent().child() here because tdigests cannot be + // sliced. however, lists_column_view() hides that particular interface. However, + // for the same reason, get_sliced_child() should be just as cheap. + auto data = lcv.get_sliced_child(stream); + structs_column_view tdigest(data); + auto mean = tdigest.child(cudf::detail::tdigest::mean_column_index); + auto weight = tdigest.child(cudf::detail::tdigest::weight_column_index); + + // first step is to merge all the tdigests in each group. 
at the moment the only way to + // make this work is to retrieve the group sizes (via group_offsets) and the individual digest + // sizes (via input.offsets()) to the gpu and do the merges. The scale problem is that while the + // size of each group will likely be small (size of each group will typically map to # of batches + // the input data was chopped into for tdigest generation), the -number- of groups can be + // arbitrarily large. + // + // thrust::merge and thrust::merge_by_key don't provide what we need. What we would need is an + // algorithm like a super-merge that takes two layers of keys: one which identifies the outer + // grouping of tdigests, and one which identifies the inner groupings of the tdigests within the + // outer groups. + + // bring group offsets back to the host + std::vector h_outer_offsets(group_offsets.size()); + cudaMemcpyAsync(h_outer_offsets.data(), + group_offsets.data(), + sizeof(size_type) * group_offsets.size(), + cudaMemcpyDeviceToHost, + stream); + + // bring tdigest offsets back to the host + auto tdigest_offsets = lcv.offsets(); + std::vector h_inner_offsets(tdigest_offsets.size()); + cudaMemcpyAsync(h_inner_offsets.data(), + tdigest_offsets.begin(), + sizeof(size_type) * tdigest_offsets.size(), + cudaMemcpyDeviceToHost, + stream); + + stream.synchronize(); + + // extract all means and weights into a table + cudf::table_view tdigests_unsliced({mean, weight}); + + // generate the merged (but not yet compressed) tdigests for each group. + std::vector> tdigests; + tdigests.reserve(num_groups); + std::transform( + h_outer_offsets.begin(), + h_outer_offsets.end() - 1, + std::next(h_outer_offsets.begin()), + std::back_inserter(tdigests), + [&](auto tdigest_start, auto tdigest_end) { + // the range of tdigests in this group + auto const num_tdigests = tdigest_end - tdigest_start; + + // slice each tdigest from the input + std::vector unmerged_tdigests; + unmerged_tdigests.reserve(num_tdigests); + auto offset_iter = std::next(h_inner_offsets.begin(), tdigest_start); + std::transform(offset_iter, + offset_iter + num_tdigests, + std::next(offset_iter), + std::back_inserter(unmerged_tdigests), + [&](auto start, auto end) { + return cudf::detail::slice(tdigests_unsliced, {start, end}, stream); + }); + + // merge + return cudf::detail::merge(unmerged_tdigests, {0}, {order::ASCENDING}, {}, stream, mr); + }); + + // generate min and max values + auto min_col = scv.child(cudf::detail::tdigest::min_column_index); + auto merged_min_col = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); + thrust::reduce_by_key(rmm::exec_policy(stream), + group_labels.begin(), + group_labels.end(), + min_col.begin(), + thrust::make_discard_iterator(), + merged_min_col->mutable_view().begin(), + thrust::equal_to{}, // key equality check + thrust::minimum{}); + + auto max_col = scv.child(cudf::detail::tdigest::max_column_index); + auto merged_max_col = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); + thrust::reduce_by_key(rmm::exec_policy(stream), + group_labels.begin(), + group_labels.end(), + max_col.begin(), + thrust::make_discard_iterator(), + merged_max_col->mutable_view().begin(), + thrust::equal_to{}, // key equality check + thrust::maximum{}); + + // concatenate all the merged tdigests back into one table. 
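// A hedged worked example (hypothetical values, not part of this change) of how the
// offsets drive the per-centroid keys computed after the concatenation below: with
//   tdigest_offsets = {0, 3, 4, 7}   // three input tdigests with 3, 1 and 3 centroids
//   group_labels    = {0, 0, 1}      // first two tdigests belong to group 0, the last to group 1
// centroid index 5 falls in input tdigest 2 (upper_bound returns offsets[3] = 7 and the
// preceding offset is offsets[2] = 4), so it maps to group 1, and the inclusive_scan_by_key
// restarts its cumulative weight at that group boundary.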
+ std::vector<table_view> tdigest_views;
+ tdigest_views.reserve(num_groups);
+ std::transform(tdigests.begin(),
+ tdigests.end(),
+ std::back_inserter(tdigest_views),
+ [](std::unique_ptr<table>
const& t) { return t->view(); }); + auto merged = cudf::detail::concatenate(tdigest_views, stream, mr); + + // generate cumulative weights + auto merged_weights = merged->get_column(cudf::detail::tdigest::weight_column_index).view(); + auto cumulative_weights = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, merged_weights.size(), mask_state::UNALLOCATED); + auto keys = cudf::detail::make_counting_transform_iterator( + 0, + [group_labels = group_labels.begin(), + inner_offsets = tdigest_offsets.begin(), + num_inner_offsets = tdigest_offsets.size()] __device__(int index) { + // what -original- tdigest index this absolute index corresponds to + auto const iter = thrust::prev( + thrust::upper_bound(thrust::seq, inner_offsets, inner_offsets + num_inner_offsets, index)); + auto const tdigest_index = thrust::distance(inner_offsets, iter); + + // what group index the original tdigest belongs to + return group_labels[tdigest_index]; + }); + thrust::inclusive_scan_by_key(rmm::exec_policy(stream), + keys, + keys + cumulative_weights->size(), + merged_weights.begin(), + cumulative_weights->mutable_view().begin()); + + auto const delta = max_centroids; + + // generate cluster info + auto total_group_weight = cudf::detail::make_counting_transform_iterator( + 0, + [outer_offsets = group_offsets.data(), + inner_offsets = tdigest_offsets.begin(), + cumulative_weights = + cumulative_weights->view().begin()] __device__(size_type group_index) { + auto const last_weight_index = inner_offsets[outer_offsets[group_index + 1]] - 1; + return cumulative_weights[last_weight_index]; + }); + auto [group_cluster_wl, group_cluster_offsets, total_clusters] = generate_group_cluster_info( + delta, + num_groups, + nearest_value_centroid_weights{cumulative_weights->view().begin(), + group_offsets.data(), + tdigest_offsets.begin()}, + total_group_weight, + cumulative_centroid_weight{ + cumulative_weights->view().begin(), + group_labels, + group_offsets.data(), + {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, + stream, + mr); + + // input centroid values + auto centroids = cudf::detail::make_counting_transform_iterator( + 0, + make_weighted_centroid{ + merged->get_column(cudf::detail::tdigest::mean_column_index).view().begin(), + merged_weights.begin()}); + + // compute the tdigest + return compute_tdigests(delta, + centroids, + centroids + merged->num_rows(), + cumulative_centroid_weight{cumulative_weights->view().begin(), + group_labels, + group_offsets.data(), + {tdigest_offsets.begin(), + static_cast(tdigest_offsets.size())}}, + std::move(merged_min_col), + std::move(merged_max_col), + group_cluster_wl, + std::move(group_cluster_offsets), + total_clusters, + stream, + mr); +} + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu new file mode 100644 index 00000000000..9aea59a195b --- /dev/null +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace detail { +namespace tdigest { + +// https://developer.nvidia.com/blog/lerp-faster-cuda/ +template +__device__ inline T lerp(T v0, T v1, T t) +{ + return fma(t, v1, fma(-t, v0, v0)); +} + +struct centroid { + double mean; + double weight; +}; + +struct make_centroid { + double const* means; + double const* weights; + __device__ centroid operator()(size_type i) { return {means[i], weights[i]}; } +}; + +// kernel for computing percentiles on input tdigest (mean, weight) centroid data. +template +__global__ void compute_percentiles_kernel(device_span tdigest_offsets, + column_device_view percentiles, + CentroidIter centroids_, + double const* min_, + double const* max_, + double const* cumulative_weight_, + double* output) +{ + int const tid = threadIdx.x + blockIdx.x * blockDim.x; + + auto const num_tdigests = tdigest_offsets.size() - 1; + auto const tdigest_index = tid / percentiles.size(); + if (tdigest_index >= num_tdigests) { return; } + auto const pindex = tid % percentiles.size(); + + // size of the digest we're querying + auto const tdigest_size = tdigest_offsets[tdigest_index + 1] - tdigest_offsets[tdigest_index]; + // no work to do. values will be set to null + if (tdigest_size == 0 || !percentiles.is_valid(pindex)) { return; } + + output[tid] = [&]() { + double const percentage = percentiles.element(pindex); + double const* cumulative_weight = cumulative_weight_ + tdigest_offsets[tdigest_index]; + + // centroids for this particular tdigest + CentroidIter centroids = centroids_ + tdigest_offsets[tdigest_index]; + + // min and max for the digest + double const* min_val = min_ + tdigest_index; + double const* max_val = max_ + tdigest_index; + + double const total_weight = cumulative_weight[tdigest_size - 1]; + + // The following Arrow code serves as a basis for this computation + // https://github.com/apache/arrow/blob/master/cpp/src/arrow/util/tdigest.cc#L280 + double const weighted_q = percentage * total_weight; + if (weighted_q <= 1) { + return *min_val; + } else if (weighted_q >= total_weight - 1) { + return *max_val; + } + + // determine what centroid this weighted quantile falls within. + size_type const centroid_index = static_cast(thrust::distance( + cumulative_weight, + thrust::lower_bound( + thrust::seq, cumulative_weight, cumulative_weight + tdigest_size, weighted_q))); + centroid c = centroids[centroid_index]; + + // diff == how far from the "center" of the centroid we are, + // in unit weights. + // visually: + // + // centroid of weight 7 + // C <-- center of the centroid + // |-------| + // | | | + // X Y Z + // X has a diff of -2 (2 units to the left of the center of the centroid) + // Y has a diff of 0 (directly in the middle of the centroid) + // Z has a diff of 3 (3 units to the right of the center of the centroid) + double const diff = weighted_q + c.weight / 2 - cumulative_weight[centroid_index]; + + // if we're completely within a centroid of weight 1, just return that. + if (c.weight == 1 && std::abs(diff) < 0.5) { return c.mean; } + + // otherwise, interpolate between two centroids. 
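    // A hedged numeric example (hypothetical inputs): with two centroids
    // (mean 10, weight 4) and (mean 20, weight 6), cumulative weights {4, 10} and a
    // query for percentile 0.5, weighted_q = 5 lands in the second centroid;
    // diff = 5 + 6/2 - 10 = -2, so we look left: tip = 4/2 + 6/2 = 5,
    // t = (-2 + 5) / 5 = 0.6 and lerp(10, 20, 0.6) yields 16.0.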
+ + // get the two centroids we want to interpolate between + auto const look_left = diff < 0; + auto const [lhs, rhs] = [&]() { + if (look_left) { + // if we're at the first centroid, "left" of us is the min value + auto const first_centroid = centroid_index == 0; + auto const lhs = first_centroid ? centroid{*min_val, 0} : centroids[centroid_index - 1]; + auto const rhs = c; + return std::pair{lhs, rhs}; + } else { + // if we're at the last centroid, "right" of us is the max value + auto const last_centroid = (centroid_index == tdigest_size - 1); + auto const lhs = c; + auto const rhs = last_centroid ? centroid{*max_val, 0} : centroids[centroid_index + 1]; + return std::pair{lhs, rhs}; + } + }(); + + // compute interpolation value t + + // total interpolation range. the total range of "space" between the lhs and rhs centroids. + auto const tip = lhs.weight / 2 + rhs.weight / 2; + // if we're looking left, diff is negative, so shift it so that we are interpolating + // from lhs -> rhs. + auto const t = (look_left) ? (diff + tip) / tip : diff / tip; + + // interpolate + return lerp(lhs.mean, rhs.mean, t); + }(); +} + +/** + * @brief Calculate approximate percentiles on a provided tdigest column. + * + * Produces a LIST column where each row `i` represents output from querying the + * corresponding tdigest of from row `i` in `input`. The length of each output list + * is the number of percentiles specified in `percentiles` + * + * @param input tdigest input data. One tdigest per row. + * @param percentiles Desired percentiles in range [0, 1]. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device + * memory + * + * @returns Column of doubles containing requested percentile values. + */ +std::unique_ptr compute_approx_percentiles(structs_column_view const& input, + column_view const& percentiles, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + lists_column_view lcv(input.child(centroid_column_index)); + column_view min_col = input.child(min_column_index); + column_view max_col = input.child(max_column_index); + + // offsets, representing the size of each tdigest + auto offsets = lcv.offsets(); + + // extract means and weights + auto data = lcv.parent().child(lists_column_view::child_column_index); + structs_column_view tdigest(data); + auto mean = tdigest.child(mean_column_index); + auto weight = tdigest.child(weight_column_index); + + // compute summed weights + auto cumulative_weights = cudf::make_fixed_width_column(data_type{type_id::FLOAT64}, + mean.size(), + mask_state::UNALLOCATED, + stream, + rmm::mr::get_current_device_resource()); + auto keys = cudf::detail::make_counting_transform_iterator( + 0, + [offsets_begin = offsets.begin(), + offsets_end = offsets.end()] __device__(size_type i) { + return thrust::distance( + offsets_begin, + thrust::prev(thrust::upper_bound(thrust::seq, offsets_begin, offsets_end, i))); + }); + thrust::inclusive_scan_by_key(rmm::exec_policy(stream), + keys, + keys + weight.size(), + weight.begin(), + cumulative_weights->mutable_view().begin()); + + auto percentiles_cdv = column_device_view::create(percentiles); + + // leaf is a column of size input.size() * percentiles.size() + auto const num_output_values = input.size() * percentiles.size(); + + // null percentiles become null results. + auto [null_mask, null_count] = [&]() { + return percentiles.null_count() != 0 + ? 
cudf::detail::valid_if( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_output_values, + [percentiles = *percentiles_cdv] __device__(size_type i) { + return percentiles.is_valid(i % percentiles.size()); + }) + : std::pair{rmm::device_buffer{}, 0}; + }(); + + auto result = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, num_output_values, std::move(null_mask), null_count, stream, mr); + + auto centroids = cudf::detail::make_counting_transform_iterator( + 0, make_centroid{mean.begin(), weight.begin()}); + + constexpr size_type block_size = 256; + cudf::detail::grid_1d const grid(percentiles.size() * input.size(), block_size); + compute_percentiles_kernel<<>>( + {offsets.begin(), static_cast(offsets.size())}, + *percentiles_cdv, + centroids, + min_col.begin(), + max_col.begin(), + cumulative_weights->view().begin(), + result->mutable_view().begin()); + + return result; +} + +void check_is_valid_tdigest_column(column_view const& col) +{ + // sanity check that this is actually tdigest data + CUDF_EXPECTS(col.type().id() == type_id::STRUCT, "Encountered invalid tdigest column"); + CUDF_EXPECTS(col.size() > 0, "tdigest columns must have > 0 rows"); + CUDF_EXPECTS(col.offset() == 0, "Encountered a sliced tdigest column"); + CUDF_EXPECTS(col.nullable() == false, "Encountered nullable tdigest column"); + + structs_column_view scv(col); + CUDF_EXPECTS(scv.num_children() == 3, "Encountered invalid tdigest column"); + CUDF_EXPECTS(scv.child(min_column_index).type().id() == type_id::FLOAT64, + "Encountered invalid tdigest column"); + CUDF_EXPECTS(scv.child(max_column_index).type().id() == type_id::FLOAT64, + "Encountered invalid tdigest column"); + + lists_column_view lcv(scv.child(centroid_column_index)); + auto data = lcv.child(); + CUDF_EXPECTS(data.type().id() == type_id::STRUCT, "Encountered invalid tdigest column"); + CUDF_EXPECTS(data.num_children() == 2, + "Encountered tdigest column with an invalid number of children"); + auto mean = data.child(mean_column_index); + CUDF_EXPECTS(mean.type().id() == type_id::FLOAT64, "Encountered invalid tdigest mean column"); + auto weight = data.child(weight_column_index); + CUDF_EXPECTS(weight.type().id() == type_id::FLOAT64, "Encountered invalid tdigest weight column"); +} + +std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // mean/weight columns + std::vector> inner_children; + inner_children.push_back(make_empty_column(data_type(type_id::FLOAT64))); + inner_children.push_back(make_empty_column(data_type(type_id::FLOAT64))); + + auto offsets = cudf::make_fixed_width_column( + data_type(type_id::INT32), 2, mask_state::UNALLOCATED, stream, mr); + thrust::fill(rmm::exec_policy(stream), + offsets->mutable_view().begin(), + offsets->mutable_view().end(), + 0); + auto list = + make_lists_column(1, + std::move(offsets), + cudf::make_structs_column(0, std::move(inner_children), 0, {}, stream, mr), + 0, + {}); + + auto min_col = + cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr); + thrust::fill(rmm::exec_policy(stream), + min_col->mutable_view().begin(), + min_col->mutable_view().end(), + 0); + auto max_col = + cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr); + thrust::fill(rmm::exec_policy(stream), + max_col->mutable_view().begin(), + max_col->mutable_view().end(), + 0); + + std::vector> children; + children.push_back(std::move(list)); + 
children.push_back(std::move(min_col)); + children.push_back(std::move(max_col)); + + return make_structs_column(1, std::move(children), 0, {}, stream, mr); +} + +} // namespace tdigest. + +std::unique_ptr percentile_approx(structs_column_view const& input, + column_view const& percentiles, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + tdigest::check_is_valid_tdigest_column(input); + CUDF_EXPECTS(percentiles.type().id() == type_id::FLOAT64, + "percentile_approx expects float64 percentile inputs"); + + // output is a list column with each row containing percentiles.size() percentile values + auto offsets = cudf::make_fixed_width_column( + data_type{type_id::INT32}, input.size() + 1, mask_state::UNALLOCATED, stream, mr); + auto row_size_iter = thrust::make_constant_iterator(percentiles.size()); + thrust::exclusive_scan(rmm::exec_policy(stream), + row_size_iter, + row_size_iter + input.size() + 1, + offsets->mutable_view().begin()); + + if (percentiles.size() == 0) { + return cudf::make_lists_column( + input.size(), + std::move(offsets), + cudf::make_empty_column(data_type{type_id::FLOAT64}), + input.size(), + cudf::detail::create_null_mask( + input.size(), mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr)); + } + + // if any of the input digests are empty, nullify the corresponding output rows (values will be + // uninitialized) + auto [bitmask, null_count] = [stream, mr, input]() { + lists_column_view lcv(input.child(tdigest::centroid_column_index)); + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [offsets = lcv.offsets().begin()] __device__(size_type index) { + return offsets[index + 1] - offsets[index] == 0 ? 1 : 0; + }); + auto const null_count = thrust::reduce(rmm::exec_policy(stream), iter, iter + input.size(), 0); + if (null_count == 0) { + return std::pair{rmm::device_buffer{}, null_count}; + } + return cudf::detail::valid_if( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + input.size(), + [offsets = lcv.offsets().begin()] __device__(size_type index) { + return offsets[index + 1] - offsets[index] == 0 ? 0 : 1; + }, + stream, + mr); + }(); + + return cudf::make_lists_column( + input.size(), + std::move(offsets), + tdigest::compute_approx_percentiles(input, percentiles, stream, mr), + null_count, + std::move(bitmask), + stream, + mr); +} + +} // namespace detail + +std::unique_ptr percentile_approx(structs_column_view const& input, + column_view const& percentiles, + rmm::mr::device_memory_resource* mr) +{ + return percentile_approx(input, percentiles, rmm::cuda_stream_default, mr); +} + +} // namespace cudf diff --git a/cpp/src/sort/sort.cu b/cpp/src/sort/sort.cu index dc74a5f4ff1..42b57bdb47a 100644 --- a/cpp/src/sort/sort.cu +++ b/cpp/src/sort/sort.cu @@ -26,7 +26,7 @@ namespace cudf { namespace detail { -std::unique_ptr sorted_order(table_view input, +std::unique_ptr sorted_order(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, @@ -75,7 +75,7 @@ struct inplace_column_sort_fn { } }; -std::unique_ptr
sort(table_view input, +std::unique_ptr
sort(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, @@ -101,7 +101,7 @@ std::unique_ptr
sort(table_view input, } // namespace detail -std::unique_ptr sorted_order(table_view input, +std::unique_ptr sorted_order(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, rmm::mr::device_memory_resource* mr) @@ -110,7 +110,7 @@ std::unique_ptr sorted_order(table_view input, return detail::sorted_order(input, column_order, null_precedence, rmm::cuda_stream_default, mr); } -std::unique_ptr
sort(table_view input, +std::unique_ptr
sort(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/sort/stable_sort.cu b/cpp/src/sort/stable_sort.cu index 860e88ae76e..75335579de2 100644 --- a/cpp/src/sort/stable_sort.cu +++ b/cpp/src/sort/stable_sort.cu @@ -25,7 +25,7 @@ namespace cudf { namespace detail { -std::unique_ptr stable_sorted_order(table_view input, +std::unique_ptr stable_sorted_order(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, @@ -36,7 +36,7 @@ std::unique_ptr stable_sorted_order(table_view input, } // namespace detail -std::unique_ptr stable_sorted_order(table_view input, +std::unique_ptr stable_sorted_order(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, rmm::mr::device_memory_resource* mr) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 03f7967cee0..6d385ff969d 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -85,6 +85,7 @@ ConfigureTest(GROUPBY_TEST groupby/sum_of_squares_tests.cpp groupby/sum_scan_tests.cpp groupby/sum_tests.cpp + groupby/tdigest_tests.cu groupby/var_tests.cpp) ################################################################################################### @@ -123,6 +124,7 @@ ConfigureTest(HASH_MAP_TEST ################################################################################################### # - quantiles tests ------------------------------------------------------------------------------- ConfigureTest(QUANTILES_TEST + quantiles/percentile_approx_test.cu quantiles/quantile_test.cpp quantiles/quantiles_test.cpp) diff --git a/cpp/tests/groupby/groupby_test_util.hpp b/cpp/tests/groupby/groupby_test_util.hpp index 542205b5b51..b333d9dacba 100644 --- a/cpp/tests/groupby/groupby_test_util.hpp +++ b/cpp/tests/groupby/groupby_test_util.hpp @@ -27,6 +27,9 @@ #include #include #include +#include + +#include namespace cudf { namespace test { @@ -128,5 +131,57 @@ inline void test_single_scan(column_view const& keys, expect_vals, *result.second[0].results[0], debug_output_level::ALL_ERRORS); } +template +inline T frand() +{ + return static_cast(rand()) / static_cast(RAND_MAX); +} + +template +inline T rand_range(T min, T max) +{ + return min + static_cast(frand() * (max - min)); +} + +inline std::unique_ptr generate_typed_percentile_distribution( + std::vector const& buckets, + std::vector const& sizes, + data_type t, + bool sorted = false) +{ + srand(0); + + std::vector values; + size_t total_size = std::reduce(sizes.begin(), sizes.end(), 0); + values.reserve(total_size); + for (size_t idx = 0; idx < sizes.size(); idx++) { + double min = idx == 0 ? 0.0f : buckets[idx - 1]; + double max = buckets[idx]; + + for (int v_idx = 0; v_idx < sizes[idx]; v_idx++) { + values.push_back(rand_range(min, max)); + } + } + + if (sorted) { std::sort(values.begin(), values.end()); } + + cudf::test::fixed_width_column_wrapper src(values.begin(), values.end()); + return cudf::cast(src, t); +} + +// "standardized" means the parameters sent into generate_typed_percentile_distribution. the intent +// is to provide a standardized set of inputs for use with tdigest generation tests and +// percentile_approx tests. 
std::vector +// buckets{10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0}; std::vector +// sizes{50000, 50000, 50000, 50000, 50000, 100000, 100000, 100000, 100000, 100000}; +inline std::unique_ptr generate_standardized_percentile_distribution( + data_type t = data_type{type_id::FLOAT64}, bool sorted = false) +{ + std::vector buckets{10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0, 90.0f, 100.0f}; + std::vector b_sizes{ + 50000, 50000, 50000, 50000, 50000, 100000, 100000, 100000, 100000, 100000}; + return generate_typed_percentile_distribution(buckets, b_sizes, t, sorted); +} + } // namespace test } // namespace cudf diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu new file mode 100644 index 00000000000..818999867c1 --- /dev/null +++ b/cpp/tests/groupby/tdigest_tests.cu @@ -0,0 +1,584 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arrow/util/tdigest.h" + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +namespace cudf { +namespace test { + +using namespace cudf; + +typedef thrust::tuple expected_value; + +template +struct TDigestAllTypes : public cudf::test::BaseFixture { +}; +TYPED_TEST_CASE(TDigestAllTypes, cudf::test::NumericTypes); + +struct tdigest_gen { + template < + typename T, + typename std::enable_if_t() || cudf::is_fixed_point()>* = nullptr> + std::unique_ptr operator()(column_view const& keys, column_view const& values, int delta) + { + cudf::table_view t({keys}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({values, std::move(aggregations)}); + auto result = gb.aggregate(requests); + return std::move(result.second[0].results[0]); + } + + template < + typename T, + typename std::enable_if_t() && !cudf::is_fixed_point()>* = nullptr> + std::unique_ptr operator()(column_view const& keys, column_view const& values, int delta) + { + CUDF_FAIL("Invalid tdigest test type"); + } +}; + +void tdigest_sample_compare(column_view const& result, + std::vector const& h_expected) +{ + cudf::detail::tdigest::check_is_valid_tdigest_column(result); + cudf::structs_column_view scv(result); + cudf::lists_column_view lcv(scv.child(cudf::detail::tdigest::centroid_column_index)); + cudf::structs_column_view tdigests(lcv.child()); + column_view result_mean = tdigests.child(cudf::detail::tdigest::mean_column_index); + column_view result_weight = tdigests.child(cudf::detail::tdigest::weight_column_index); + + auto expected_mean = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, h_expected.size(), mask_state::UNALLOCATED); + auto expected_weight = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, h_expected.size(), mask_state::UNALLOCATED); + auto sampled_result_mean = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, h_expected.size(), 
mask_state::UNALLOCATED); + auto sampled_result_weight = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, h_expected.size(), mask_state::UNALLOCATED); + + rmm::device_vector expected(h_expected.begin(), h_expected.end()); + auto iter = thrust::make_counting_iterator(0); + thrust::for_each( + rmm::exec_policy(rmm::cuda_stream_default), + iter, + iter + expected.size(), + [expected = expected.data().get(), + expected_mean = expected_mean->mutable_view().begin(), + expected_weight = expected_weight->mutable_view().begin(), + result_mean = result_mean.begin(), + result_weight = result_weight.begin(), + sampled_result_mean = sampled_result_mean->mutable_view().begin(), + sampled_result_weight = + sampled_result_weight->mutable_view().begin()] __device__(size_type index) { + expected_mean[index] = thrust::get<1>(expected[index]); + expected_weight[index] = thrust::get<2>(expected[index]); + auto const src_index = thrust::get<0>(expected[index]); + sampled_result_mean[index] = result_mean[src_index]; + sampled_result_weight[index] = result_weight[src_index]; + }); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected_mean, *sampled_result_mean); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_weight, *sampled_result_weight); +} + +template +std::unique_ptr make_expected_tdigest(column_view const& mean, + column_view const& weight, + T min, + T max) +{ + std::vector> inner_children; + inner_children.push_back(std::make_unique(mean)); + inner_children.push_back(std::make_unique(weight)); + // tdigest struct + auto tdigests = cudf::make_structs_column(mean.size(), std::move(inner_children), 0, {}); + + std::vector h_offsets{0, mean.size()}; + auto offsets = + cudf::make_fixed_width_column(data_type{type_id::INT32}, 2, mask_state::UNALLOCATED); + cudaMemcpy(offsets->mutable_view().begin(), + h_offsets.data(), + sizeof(offset_type) * 2, + cudaMemcpyHostToDevice); + + auto list = cudf::make_lists_column(1, std::move(offsets), std::move(tdigests), 0, {}); + + auto min_col = + cudf::make_fixed_width_column(data_type{type_id::FLOAT64}, 1, mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + min_col->mutable_view().begin(), + min_col->mutable_view().end(), + static_cast(min)); + auto max_col = + cudf::make_fixed_width_column(data_type{type_id::FLOAT64}, 1, mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + max_col->mutable_view().begin(), + max_col->mutable_view().end(), + static_cast(max)); + + std::vector> children; + children.push_back(std::move(list)); + children.push_back(std::move(min_col)); + children.push_back(std::move(max_col)); + return make_structs_column(1, std::move(children), 0, {}); +} + +TYPED_TEST(TDigestAllTypes, Simple) +{ + using T = TypeParam; + + // create a tdigest that has far fewer values in it than the delta value. 
this should result + // in every value remaining uncompressed + cudf::test::fixed_width_column_wrapper values{126, 15, 1, 99, 67}; + cudf::test::fixed_width_column_wrapper keys{0, 0, 0, 0, 0}; + int const delta = 1000; + auto result = cudf::type_dispatcher( + static_cast(values).type(), tdigest_gen{}, keys, values, delta); + + cudf::test::fixed_width_column_wrapper raw_mean({1, 15, 67, 99, 126}); + cudf::test::fixed_width_column_wrapper weight{1, 1, 1, 1, 1}; + auto mean = cudf::cast(raw_mean, data_type{type_id::FLOAT64}); + double const min = 1; + double const max = 126; + auto expected = make_expected_tdigest(*mean, weight, static_cast(min), static_cast(max)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); +} + +TYPED_TEST(TDigestAllTypes, SimpleWithNulls) +{ + using T = TypeParam; + + // create a tdigest that has far fewer values in it than the delta value. this should result + // in every value remaining uncompressed + cudf::test::fixed_width_column_wrapper values{{122, 15, 1, 99, 67, 101, 100, 84, 44, 2}, + {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}}; + cudf::test::fixed_width_column_wrapper keys{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int const delta = 1000; + auto result = cudf::type_dispatcher( + static_cast(values).type(), tdigest_gen{}, keys, values, delta); + + cudf::test::fixed_width_column_wrapper raw_mean({1, 44, 67, 100, 122}); + cudf::test::fixed_width_column_wrapper weight{1, 1, 1, 1, 1}; + auto mean = cudf::cast(raw_mean, data_type{type_id::FLOAT64}); + double const min = 1; + double const max = 122; + auto expected = make_expected_tdigest(*mean, weight, static_cast(min), static_cast(max)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); +} + +TYPED_TEST(TDigestAllTypes, AllNull) +{ + using T = TypeParam; + + // create a tdigest that has far fewer values in it than the delta value. this should result + // in every value remaining uncompressed + cudf::test::fixed_width_column_wrapper values{{122, 15, 1, 99, 67, 101, 100, 84, 44, 2}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}; + cudf::test::fixed_width_column_wrapper keys{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int const delta = 1000; + auto result = cudf::type_dispatcher( + static_cast(values).type(), tdigest_gen{}, keys, values, delta); + + // NOTE: an empty tdigest column still has 1 row. + auto expected = cudf::detail::tdigest::make_empty_tdigest_column(); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); +} + +TYPED_TEST(TDigestAllTypes, LargeGroups) +{ + auto _values = generate_standardized_percentile_distribution(data_type{type_id::FLOAT64}); + int const delta = 1000; + + // generate a random set of keys + std::vector h_keys; + h_keys.reserve(_values->size()); + auto iter = thrust::make_counting_iterator(0); + std::transform(iter, iter + _values->size(), std::back_inserter(h_keys), [](int i) { + return static_cast(round(rand_range(0, 8))); + }); + cudf::test::fixed_width_column_wrapper _keys(h_keys.begin(), h_keys.end()); + + // group the input values together + cudf::table_view k({_keys}); + cudf::groupby::groupby setup_gb(k); + cudf::table_view v({*_values}); + auto groups = setup_gb.get_groups(v); + + // slice it all up so we have keys/columns for everything. 
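  // Hypothetical illustration (not part of the test): if get_groups() returned
  // offsets {0, 3, 7, 10}, the loop below produces three key/value slices covering
  // rows [0,3), [3,7) and [7,10); cudf::slice returns a vector of views, hence the [0].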
+ std::vector keys; + std::vector values; + for (size_t idx = 0; idx < groups.offsets.size() - 1; idx++) { + auto k = + cudf::slice(groups.keys->get_column(0), {groups.offsets[idx], groups.offsets[idx + 1]}); + keys.push_back(k[0]); + + auto v = + cudf::slice(groups.values->get_column(0), {groups.offsets[idx], groups.offsets[idx + 1]}); + values.push_back(v[0]); + } + + // generate a seperate tdigest for each group + std::vector> parts; + std::transform( + iter, iter + values.size(), std::back_inserter(parts), [&keys, &values, delta](int i) { + cudf::table_view t({keys[i]}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({values[i], std::move(aggregations)}); + auto result = gb.aggregate(requests); + return std::move(result.second[0].results[0]); + }); + std::vector part_views; + std::transform(parts.begin(), + parts.end(), + std::back_inserter(part_views), + [](std::unique_ptr const& col) { return col->view(); }); + auto merged_parts = cudf::concatenate(part_views); + + // generate a tdigest on the whole input set + cudf::table_view t({_keys}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({*_values, std::move(aggregations)}); + auto result = gb.aggregate(requests); + + // verify that they end up the same. + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[0].results[0], *merged_parts); +} + +struct TDigestTest : public cudf::test::BaseFixture { +}; + +TEST_F(TDigestTest, LargeInputDouble) +{ + // these tests are being done explicitly because of the way we have to precompute the correct + // answers. since the input values generated by the generate_distribution() function below are + // cast to specific types -before- being sent into the aggregation, I can't (safely) just use the + // expected values that you get when using doubles all the way through. so I have to pregenerate + // the correct answers for each type by hand. 
so, we'll choose a reasonable subset (double, + // decimal, int, bool) + + auto values = generate_standardized_percentile_distribution(data_type{type_id::FLOAT64}); + // all in the same group + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + keys->mutable_view().template begin(), + keys->mutable_view().template end(), + 0); + + // compare against a sample of known/expected values (which themselves were verified against the + // Arrow implementation) + + // delta 1000 + { + int const delta = 1000; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 0.00040692343794663995, 7}, + {10, 0.16234555627091204477, 153}, + {59, 5.12764811246045937310, 858}, + {250, 62.54581814492237157310, 2356}, + {368, 87.85834376680742252574, 1735}, + {409, 94.07685720279611985006, 1272}, + {491, 99.94197663121231300920, 130}, + {500, 99.99969880795092080916, 2}}; + + tdigest_sample_compare(*result, expected); + } + + // delta 100 + { + int const delta = 100; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 0.07265722021410986331, 739}, + {7, 8.19766194442652640362, 10693}, + {16, 36.82277869518204482802, 20276}, + {29, 72.95424834129075009059, 22623}, + {38, 90.61229683516096145013, 15581}, + {46, 99.07283498858802772702, 5142}, + {50, 99.99970905482754801596, 1}}; + + tdigest_sample_compare(*result, expected); + } + + // delta 10 + { + int const delta = 10; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 7.15508346777729631327, 71618}, + {1, 33.04971680740474226923, 187499}, + {2, 62.50566666553867634093, 231762}, + {3, 83.46216572053654658703, 187500}, + {4, 96.42204425201593664951, 71620}, + {5, 99.99970905482754801596, 1}}; + + tdigest_sample_compare(*result, expected); + } +} + +TEST_F(TDigestTest, LargeInputInt) +{ + // these tests are being done explicitly because of the way we have to precompute the correct + // answers. since the input values generated by the generate_distribution() function below are + // cast to specific types -before- being sent into the aggregation, I can't (safely) just use the + // expected values that you get when using doubles all the way through. so I have to pregenerate + // the correct answers for each type by hand. 
so, we'll choose a reasonable subset (double, + // decimal, int, bool) + + auto values = generate_standardized_percentile_distribution(data_type{type_id::INT32}); + // all in the same group + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + keys->mutable_view().template begin(), + keys->mutable_view().template end(), + 0); + + // compare against a sample of known/expected values (which themselves were verified against the + // Arrow implementation) + + // delta 1000 + { + int const delta = 1000; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 0, 7}, + {14, 0, 212}, + {26, 0.83247422680412408447, 388}, + {44, 2, 648}, + {45, 2.42598187311178170589, 662}, + {342, 82.75190258751908345403, 1971}, + {383, 90, 1577}, + {417, 94.88376068376066996279, 1170}, + {418, 95, 1157}, + {479, 99, 307}, + {500, 99, 2}}; + + tdigest_sample_compare(*result, expected); + } + + // delta 100 + { + int const delta = 100; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 0, 739}, + {7, 7.71486018890863167741, 10693}, + {16, 36.32491615703294485229, 20276}, + {29, 72.44392874508245938614, 22623}, + {38, 90.14209614273795523332, 15581}, + {46, 98.64041229093737683797, 5142}, + {50, 99, 1}}; + + tdigest_sample_compare(*result, expected); + } + + // delta 10 + { + int const delta = 10; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 6.66025300902007799664, 71618}, + {1, 32.54912826201739051157, 187499}, + {2, 62.00734805533262772315, 231762}, + {3, 82.96355733333332693746, 187500}, + {4, 95.91280368612116546956, 71620}, + {5, 99, 1}}; + + tdigest_sample_compare(*result, expected); + } +} + +TEST_F(TDigestTest, LargeInputDecimal) +{ + // these tests are being done explicitly because of the way we have to precompute the correct + // answers. since the input values generated by the generate_distribution() function below are + // cast to specific types -before- being sent into the aggregation, I can't (safely) just use the + // expected values that you get when using doubles all the way through. so I have to pregenerate + // the correct answers for each type by hand. 
so, we'll choose a reasonable subset (double, + // decimal, int, bool) + + auto values = generate_standardized_percentile_distribution(data_type{type_id::DECIMAL32, -4}); + // all in the same group + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + keys->mutable_view().template begin(), + keys->mutable_view().template end(), + 0); + + // compare against a sample of known/expected values (which themselves were verified against the + // Arrow implementation) + + // delta 1000 + { + int const delta = 1000; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 0.00035714285714285709, 7}, + {10, 0.16229738562091505782, 153}, + {59, 5.12759696969697031932, 858}, + {250, 62.54576854838715860296, 2356}, + {368, 87.85829446685879418055, 1735}, + {409, 94.07680636792450457051, 1272}, + {491, 99.94192461538463589932, 130}, + {500, 99.99965000000000259206, 2}}; + + tdigest_sample_compare(*result, expected); + } + + // delta 100 + { + int const delta = 100; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 0.07260811907983763525, 739}, + {7, 8.19761183016926864298, 10693}, + {16, 36.82272891595975750079, 20276}, + {29, 72.95419827167043536065, 22623}, + {38, 90.61224673640975879607, 15581}, + {46, 99.07278498638662256326, 5142}, + {50, 99.99970000000000425189, 1}}; + + tdigest_sample_compare(*result, expected); + } + + // delta 10 + { + int const delta = 10; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 7.15503361864335740705, 71618}, + {1, 33.04966679715625588187, 187499}, + {2, 62.50561666407782013266, 231762}, + {3, 83.46211575573336460820, 187500}, + {4, 96.42199425300195514410, 71620}, + {5, 99.99970000000000425189, 1}}; + + tdigest_sample_compare(*result, expected); + } +} + +struct TDigestMergeTest : public cudf::test::BaseFixture { +}; + +// Note: there is no need to test different types here as the internals of a tdigest are always +// the same regardless of input. 
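// An illustrative usage sketch (not exercised by these tests) of how the new pieces fit
// together from user code: build one tdigest per group, then query approximate
// percentiles from the result. Assumes <cudf/quantiles.hpp> is available in this
// translation unit; the function and variable names are hypothetical.
[[maybe_unused]] inline std::unique_ptr<cudf::column> example_tdigest_percentiles(
  cudf::column_view const& keys, cudf::column_view const& values)
{
  // group rows by key and produce one tdigest (at most 1000 centroids) per group
  cudf::table_view key_table({keys});
  cudf::groupby::groupby gb(key_table);
  std::vector<cudf::groupby::aggregation_request> requests;
  std::vector<std::unique_ptr<cudf::groupby_aggregation>> aggregations;
  aggregations.push_back(cudf::make_tdigest_aggregation<cudf::groupby_aggregation>(1000));
  requests.push_back({values, std::move(aggregations)});
  auto gb_result = gb.aggregate(requests);

  // query the quartiles of each group; the result is a LIST<double> column, one row per group
  cudf::test::fixed_width_column_wrapper<double> percentiles{0.25, 0.5, 0.75};
  cudf::structs_column_view tdigests(*gb_result.second[0].results[0]);
  return cudf::percentile_approx(tdigests, percentiles);
}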
+TEST_F(TDigestMergeTest, Simple) +{ + auto values = generate_standardized_percentile_distribution(data_type{type_id::FLOAT64}); + CUDF_EXPECTS(values->size() == 750000, "Unexpected distribution size"); + // all in the same group + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + keys->mutable_view().template begin(), + keys->mutable_view().template end(), + 0); + + auto split_values = cudf::split(*values, {250000, 500000}); + auto split_keys = cudf::split(*keys, {250000, 500000}); + + int const delta = 1000; + + // generate seperate digests + std::vector> parts; + auto iter = thrust::make_counting_iterator(0); + std::transform( + iter, + iter + split_values.size(), + std::back_inserter(parts), + [&split_keys, &split_values, delta](int i) { + cudf::table_view t({split_keys[i]}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({split_values[i], std::move(aggregations)}); + auto result = gb.aggregate(requests); + return std::move(result.second[0].results[0]); + }); + std::vector part_views; + std::transform(parts.begin(), + parts.end(), + std::back_inserter(part_views), + [](std::unique_ptr const& col) { return col->view(); }); + + // merge delta = 1000 + { + int const merge_delta = 1000; + + // merge them + auto merge_input = cudf::concatenate(part_views); + cudf::test::fixed_width_column_wrapper merge_keys{0, 0, 0}; + cudf::table_view key_table({merge_keys}); + cudf::groupby::groupby gb(key_table); + std::vector requests; + std::vector> aggregations; + aggregations.push_back( + cudf::make_merge_tdigest_aggregation(merge_delta)); + requests.push_back({*merge_input, std::move(aggregations)}); + auto result = gb.aggregate(requests); + + std::vector expected{{0, 0.00013945158577498588, 2}, + {10, 0.04804393446447510763, 50}, + {59, 1.68846964439246893797, 284}, + {250, 33.36323141295877547918, 1479}, + {368, 65.36307727957283475462, 2292}, + {409, 73.95399208218296394080, 1784}, + {490, 87.67566167909056673579, 1570}, + {491, 87.83119717763385381204, 1570}, + {500, 89.24891838334393412424, 1555}, + {578, 95.87182997389099625707, 583}, + {625, 98.20470345147104751504, 405}, + {700, 99.96818381983835877236, 56}, + {711, 99.99970905482754801596, 1}}; + + tdigest_sample_compare(*result.second[0].results[0], expected); + } +} + +} // namespace test +} // namespace cudf diff --git a/cpp/tests/quantiles/percentile_approx_test.cu b/cpp/tests/quantiles/percentile_approx_test.cu new file mode 100644 index 00000000000..39f7cc593d6 --- /dev/null +++ b/cpp/tests/quantiles/percentile_approx_test.cu @@ -0,0 +1,435 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include + +using namespace cudf; + +struct tdigest_gen { + template < + typename T, + typename std::enable_if_t() || cudf::is_fixed_point()>* = nullptr> + std::unique_ptr operator()(column_view const& keys, column_view const& values, int delta) + { + cudf::table_view t({keys}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({values, std::move(aggregations)}); + auto result = gb.aggregate(requests); + return std::move(result.second[0].results[0]); + } + + template < + typename T, + typename 
std::enable_if_t() && !cudf::is_fixed_point()>* = nullptr> + std::unique_ptr operator()(column_view const& keys, column_view const& values, int delta) + { + CUDF_FAIL("Invalid tdigest test type"); + } +}; + +std::unique_ptr arrow_percentile_approx(column_view const& _values, + int delta, + std::vector const& percentages) +{ + // sort the incoming values using the same settings that groupby does. + // this is a little weak because null_order::AFTER is hardcoded internally to groupby. + table_view t({_values}); + auto sorted_t = cudf::sort(t, {}, {null_order::AFTER}); + auto sorted_values = sorted_t->get_column(0).view(); + + std::vector h_values(sorted_values.size()); + cudaMemcpy(h_values.data(), + sorted_values.data(), + sizeof(double) * sorted_values.size(), + cudaMemcpyDeviceToHost); + std::vector h_validity(sorted_values.size()); + if (sorted_values.null_mask() != nullptr) { + auto validity = cudf::mask_to_bools(sorted_values.null_mask(), 0, sorted_values.size()); + cudaMemcpy(h_validity.data(), + (validity->view().data()), + sizeof(char) * sorted_values.size(), + cudaMemcpyDeviceToHost); + } + + // generate the tdigest + arrow::internal::TDigest atd(delta, sorted_values.size() * 2); + for (size_t idx = 0; idx < h_values.size(); idx++) { + if (sorted_values.null_mask() == nullptr || h_validity[idx]) { atd.Add(h_values[idx]); } + } + + // generate the percentiles and stuff them into a list column + std::vector h_result; + h_result.reserve(percentages.size()); + std::transform( + percentages.begin(), percentages.end(), std::back_inserter(h_result), [&atd](double p) { + return atd.Quantile(p); + }); + cudf::test::fixed_width_column_wrapper result(h_result.begin(), h_result.end()); + cudf::test::fixed_width_column_wrapper offsets{ + 0, static_cast(percentages.size())}; + return cudf::make_lists_column(1, offsets.release(), result.release(), 0, {}); +} + +struct percentile_approx_dispatch { + template < + typename T, + typename std::enable_if_t() || cudf::is_fixed_point()>* = nullptr> + std::unique_ptr operator()(column_view const& keys, + column_view const& values, + int delta, + std::vector const& percentages, + size_type ulps) + { + // arrow implementation. + auto expected = [&]() { + // we're explicitly casting back to doubles here but this is ok because that is + // exactly what happens inside of the cudf implementation as values are processed as well. so + // this should not affect results. 
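    // For instance (hypothetical value): a decimal32 input holding 12.5 becomes the double
    // 12.5 both in this cast and inside group_tdigest's centroid generation, so the Arrow
    // reference digest and the cudf digest are built from the same floating-point inputs.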
+ auto as_doubles = cudf::cast(values, data_type{type_id::FLOAT64}); + return arrow_percentile_approx(*as_doubles, delta, percentages); + }(); + + // gpu + cudf::table_view t({keys}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({values, std::move(aggregations)}); + auto gb_result = gb.aggregate(requests); + + cudf::test::fixed_width_column_wrapper g_percentages(percentages.begin(), + percentages.end()); + structs_column_view scv(*(gb_result.second[0].results[0])); + auto result = cudf::percentile_approx(scv, g_percentages); + + cudf::test::expect_columns_equivalent( + *expected, *result, cudf::test::debug_output_level::FIRST_ERROR, ulps); + + return result; + } + + template < + typename T, + typename std::enable_if_t() && !cudf::is_fixed_point()>* = nullptr> + std::unique_ptr operator()(column_view const& keys, + column_view const& values, + int delta, + std::vector const& percentages, + size_type ulps) + { + CUDF_FAIL("Invalid input type for percentile_approx test"); + } +}; + +void percentile_approx_test(column_view const& _keys, + column_view const& _values, + int delta, + std::vector const& percentages, + size_type ulps) +{ + // first pass: validate the actual percentages we get per group. + + // produce the groups + cudf::table_view k({_keys}); + cudf::groupby::groupby pass1_gb(k); + cudf::table_view v({_values}); + auto groups = pass1_gb.get_groups(v); + // slice it all up so we have keys/columns for everything. + std::vector keys; + std::vector values; + for (size_t idx = 0; idx < groups.offsets.size() - 1; idx++) { + auto k = + cudf::slice(groups.keys->get_column(0), {groups.offsets[idx], groups.offsets[idx + 1]}); + keys.push_back(k[0]); + + auto v = + cudf::slice(groups.values->get_column(0), {groups.offsets[idx], groups.offsets[idx + 1]}); + values.push_back(v[0]); + } + + std::vector> parts; + for (size_t idx = 0; idx < values.size(); idx++) { + // do any casting of the input + parts.push_back(cudf::type_dispatcher(values[idx].type(), + percentile_approx_dispatch{}, + keys[idx], + values[idx], + delta, + percentages, + ulps)); + } + std::vector part_views; + std::transform(parts.begin(), + parts.end(), + std::back_inserter(part_views), + [](std::unique_ptr const& c) { return c->view(); }); + auto expected = cudf::concatenate(part_views); + + // second pass. 
run the percentile_approx with all the keys in one pass and make sure we get the + // same results as the concatenated by-key results above + + cudf::groupby::groupby gb(k); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({_values, std::move(aggregations)}); + auto gb_result = gb.aggregate(requests); + + cudf::test::fixed_width_column_wrapper g_percentages(percentages.begin(), + percentages.end()); + structs_column_view scv(*(gb_result.second[0].results[0])); + auto result = cudf::percentile_approx(scv, g_percentages); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *result); +} + +void simple_test(data_type input_type, std::vector> params) +{ + auto values = cudf::test::generate_standardized_percentile_distribution(input_type); + // all in the same group + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + keys->mutable_view().template begin(), + keys->mutable_view().template end(), + 0); + + std::for_each(params.begin(), params.end(), [&](std::pair const& params) { + percentile_approx_test( + *keys, *values, params.first, {0.0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0}, params.second); + }); +} + +struct group_index { + __device__ int operator()(int i) { return i / 150000; } +}; + +void grouped_test(data_type input_type, std::vector> params) +{ + auto values = cudf::test::generate_standardized_percentile_distribution(input_type); + // all in the same group + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + auto i = thrust::make_counting_iterator(0); + thrust::transform(rmm::exec_policy(rmm::cuda_stream_default), + i, + i + values->size(), + keys->mutable_view().template begin(), + group_index{}); + + std::for_each(params.begin(), params.end(), [&](std::pair const& params) { + percentile_approx_test( + *keys, *values, params.first, {0.0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0}, params.second); + }); +} + +std::pair make_null_mask(column_view const& col) +{ + return cudf::detail::valid_if(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(col.size()), + [] __device__(size_type i) { return i % 2 == 0; }); +} + +void simple_with_nulls_test(data_type input_type, std::vector> params) +{ + auto values = cudf::test::generate_standardized_percentile_distribution(input_type); + // all in the same group + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + keys->mutable_view().template begin(), + keys->mutable_view().template end(), + 0); + + // add a null mask + auto mask = make_null_mask(*values); + values->set_null_mask(mask.first, mask.second); + + std::for_each(params.begin(), params.end(), [&](std::pair const& params) { + percentile_approx_test( + *keys, *values, params.first, {0.0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0}, params.second); + }); +} + +void grouped_with_nulls_test(data_type input_type, std::vector> params) +{ + auto values = cudf::test::generate_standardized_percentile_distribution(input_type); + // all in the same group + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + auto i = thrust::make_counting_iterator(0); + thrust::transform(rmm::exec_policy(rmm::cuda_stream_default), + i, + i + values->size(), + 
keys->mutable_view().template begin(), + group_index{}); + + // add a null mask + auto mask = make_null_mask(*values); + values->set_null_mask(mask.first, mask.second); + + std::for_each(params.begin(), params.end(), [&](std::pair const& params) { + percentile_approx_test( + *keys, *values, params.first, {0.0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0}, params.second); + }); +} + +template +data_type get_appropriate_type() +{ + if constexpr (cudf::is_fixed_point()) { return data_type{cudf::type_to_id(), -7}; } + return data_type{cudf::type_to_id()}; +} + +using PercentileApproxTypes = + cudf::test::Concat; + +template +struct PercentileApproxInputTypesTest : public cudf::test::BaseFixture { +}; +TYPED_TEST_CASE(PercentileApproxInputTypesTest, PercentileApproxTypes); + +TYPED_TEST(PercentileApproxInputTypesTest, Simple) +{ + using T = TypeParam; + auto const input_type = get_appropriate_type(); + + simple_test(input_type, + {{1000, cudf::test::default_ulp}, + {100, cudf::test::default_ulp * 4}, + {10, cudf::test::default_ulp * 11}}); +} + +TYPED_TEST(PercentileApproxInputTypesTest, Grouped) +{ + using T = TypeParam; + auto const input_type = get_appropriate_type(); + + grouped_test(input_type, + {{1000, cudf::test::default_ulp}, + {100, cudf::test::default_ulp * 2}, + {10, cudf::test::default_ulp * 10}}); +} + +TYPED_TEST(PercentileApproxInputTypesTest, SimpleWithNulls) +{ + using T = TypeParam; + auto const input_type = get_appropriate_type(); + + simple_with_nulls_test(input_type, + {{1000, cudf::test::default_ulp}, + {100, cudf::test::default_ulp * 2}, + {10, cudf::test::default_ulp * 11}}); +} + +TYPED_TEST(PercentileApproxInputTypesTest, GroupedWithNulls) +{ + using T = TypeParam; + auto const input_type = get_appropriate_type(); + + grouped_with_nulls_test(input_type, + {{1000, cudf::test::default_ulp}, + {100, cudf::test::default_ulp * 2}, + {10, cudf::test::default_ulp * 6}}); +} + +struct PercentileApproxTest : public cudf::test::BaseFixture { +}; + +TEST_F(PercentileApproxTest, EmptyInput) +{ + auto empty_ = cudf::detail::tdigest::make_empty_tdigest_column(); + cudf::test::fixed_width_column_wrapper percentiles{0.0, 0.25, 0.3}; + + std::vector input; + input.push_back(*empty_); + input.push_back(*empty_); + input.push_back(*empty_); + auto empty = cudf::concatenate(input); + + structs_column_view scv(*empty); + auto result = cudf::percentile_approx(scv, percentiles); + + cudf::test::fixed_width_column_wrapper offsets{0, 0, 0, 0}; + std::vector nulls{0, 0, 0}; + auto expected = + cudf::make_lists_column(3, + offsets.release(), + cudf::make_empty_column(data_type{type_id::FLOAT64}), + 3, + cudf::test::detail::make_null_mask(nulls.begin(), nulls.end())); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); +} + +TEST_F(PercentileApproxTest, EmptyPercentiles) +{ + auto const delta = 1000; + + cudf::test::fixed_width_column_wrapper values{0, 1, 2, 3, 4, 5}; + cudf::test::fixed_width_column_wrapper keys{0, 0, 0, 1, 1, 1}; + cudf::table_view t({keys}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({values, std::move(aggregations)}); + auto tdigest_column = gb.aggregate(requests); + + cudf::test::fixed_width_column_wrapper percentiles{}; + + structs_column_view scv(*tdigest_column.second[0].results[0]); + auto result = cudf::percentile_approx(scv, percentiles); + + cudf::test::fixed_width_column_wrapper offsets{0, 0, 0}; + auto expected = cudf::make_lists_column(2, + 
+                                          offsets.release(),
+                                          cudf::make_empty_column(data_type{type_id::FLOAT64}),
+                                          2,
+                                          cudf::detail::create_null_mask(2, mask_state::ALL_NULL));
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected);
+}
+
+TEST_F(PercentileApproxTest, NullPercentiles)
+{
+  auto const delta = 1000;
+
+  cudf::test::fixed_width_column_wrapper<double> values{1, 1, 2, 3, 4, 5, 6, 7, 8};
+  cudf::test::fixed_width_column_wrapper<int> keys{0, 0, 0, 0, 0, 1, 1, 1, 1};
+  cudf::table_view t({keys});
+  cudf::groupby::groupby gb(t);
+  std::vector<cudf::groupby::aggregation_request> requests;
+  std::vector<std::unique_ptr<cudf::groupby_aggregation>> aggregations;
+  aggregations.push_back(cudf::make_tdigest_aggregation<cudf::groupby_aggregation>(delta));
+  requests.push_back({values, std::move(aggregations)});
+  auto tdigest_column = gb.aggregate(requests);
+
+  structs_column_view scv(*tdigest_column.second[0].results[0]);
+
+  cudf::test::fixed_width_column_wrapper<double> npercentiles{{0.5, 0.5, 1.0, 1.0}, {0, 0, 1, 1}};
+  auto result = cudf::percentile_approx(scv, npercentiles);
+
+  std::vector<bool> valids{0, 0, 1, 1};
+  cudf::test::lists_column_wrapper<double> expected{{{99, 99, 4, 4}, valids.begin()},
+                                                    {{99, 99, 8, 8}, valids.begin()}};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected);
+}
\ No newline at end of file
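The tests above all exercise the same two-step flow: build one tdigest per group with the new TDIGEST groupby aggregation, then query the resulting structs column with percentile_approx, which returns a LIST column of FLOAT64 holding one entry per requested percentage for each group. A minimal, self-contained sketch of that flow follows for reference; the toy keys and values, the max_centroids value of 1000, the single 0.5 percentile and the helper name approx_median_per_group are illustrative only and are not part of this patch.

  #include <cudf/aggregation.hpp>
  #include <cudf/column/column.hpp>
  #include <cudf/groupby.hpp>
  #include <cudf/quantiles.hpp>
  #include <cudf/structs/structs_column_view.hpp>
  #include <cudf/table/table_view.hpp>
  #include <cudf_test/column_wrapper.hpp>

  #include <memory>
  #include <utility>
  #include <vector>

  // Illustrative helper (not part of this patch): one approximate median per key group.
  std::unique_ptr<cudf::column> approx_median_per_group()
  {
    // toy input: two groups of three values each
    cudf::test::fixed_width_column_wrapper<int> keys{0, 0, 0, 1, 1, 1};
    cudf::test::fixed_width_column_wrapper<double> values{1, 2, 3, 10, 20, 30};

    // step 1: build one tdigest per group via the new TDIGEST aggregation
    cudf::table_view key_table({keys});
    cudf::groupby::groupby gb(key_table);
    std::vector<cudf::groupby::aggregation_request> requests;
    std::vector<std::unique_ptr<cudf::groupby_aggregation>> aggregations;
    aggregations.push_back(
      cudf::make_tdigest_aggregation<cudf::groupby_aggregation>(1000 /*max_centroids*/));
    requests.push_back({values, std::move(aggregations)});
    auto gb_result = gb.aggregate(requests);

    // step 2: query the digests; the result is LIST<FLOAT64>, one row per group,
    // one element per requested percentile
    cudf::structs_column_view tdigests(*gb_result.second[0].results[0]);
    cudf::test::fixed_width_column_wrapper<double> percentiles{0.5};
    return cudf::percentile_approx(tdigests, percentiles);
  }
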
diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu
index f3002bc4b1a..0f10d6efe4a 100644
--- a/cpp/tests/utilities/column_utilities.cu
+++ b/cpp/tests/utilities/column_utilities.cu
@@ -323,7 +323,8 @@ class corresponding_rows_unequal {
   corresponding_rows_unequal(table_device_view d_lhs,
                              table_device_view d_rhs,
                              column_device_view lhs_row_indices_,
-                             column_device_view rhs_row_indices_)
+                             column_device_view rhs_row_indices_,
+                             size_type /*fp_ulps*/)
     : comp(d_lhs, d_rhs), lhs_row_indices(lhs_row_indices_), rhs_row_indices(rhs_row_indices_)
   {
   }
@@ -347,16 +348,20 @@ class corresponding_rows_not_equivalent {
   column_device_view lhs_row_indices;
   column_device_view rhs_row_indices;
 
+  size_type const fp_ulps;
+
  public:
   corresponding_rows_not_equivalent(table_device_view d_lhs,
                                     table_device_view d_rhs,
                                     column_device_view lhs_row_indices_,
-                                    column_device_view rhs_row_indices_)
+                                    column_device_view rhs_row_indices_,
+                                    size_type fp_ulps_)
     : d_lhs(d_lhs),
       d_rhs(d_rhs),
       comp(d_lhs, d_rhs),
       lhs_row_indices(lhs_row_indices_),
-      rhs_row_indices(rhs_row_indices_)
+      rhs_row_indices(rhs_row_indices_),
+      fp_ulps(fp_ulps_)
   {
     CUDF_EXPECTS(d_lhs.num_columns() == 1 and d_rhs.num_columns() == 1,
                  "Unsupported number of columns");
@@ -368,7 +373,8 @@ class corresponding_rows_not_equivalent {
     column_device_view const& lhs,
     column_device_view const& rhs,
     size_type lhs_index,
-    size_type rhs_index)
+    size_type rhs_index,
+    size_type fp_ulps)
   {
     if (lhs.is_valid(lhs_index) and rhs.is_valid(rhs_index)) {
       T const x = lhs.element<T>(lhs_index);
       T const y = rhs.element<T>(rhs_index);
 
       // Must handle inf and nan separately
       if (std::isinf(x) || std::isinf(y)) {
         return x != y;  // comparison of (inf==inf) returns true
       } else if (std::isnan(x) || std::isnan(y)) {
         return std::isnan(x) != std::isnan(y);  // comparison of (nan==nan) returns false
       } else {
-        constexpr int ulp = 4;  // ulp = unit of least precision, value taken from google test
         T const abs_x_minus_y = std::abs(x - y);
         return abs_x_minus_y >= std::numeric_limits<T>::min() &&
-               abs_x_minus_y > std::numeric_limits<T>::epsilon() * std::abs(x + y) * ulp;
+               abs_x_minus_y > std::numeric_limits<T>::epsilon() * std::abs(x + y) * fp_ulps;
       }
     } else {
       // if either is null, then the inequality was checked already
       return true;
     }
   }
@@ -409,8 +414,13 @@ class corresponding_rows_not_equivalent {
     if (not comp(lhs_index, rhs_index)) {
       auto lhs_col = this->d_lhs.column(0);
       auto rhs_col = this->d_rhs.column(0);
-      return type_dispatcher(
-        lhs_col.type(), typed_element_not_equivalent{}, lhs_col, rhs_col, lhs_index, rhs_index);
+      return type_dispatcher(lhs_col.type(),
+                             typed_element_not_equivalent{},
+                             lhs_col,
+                             rhs_col,
+                             lhs_index,
+                             rhs_index,
+                             fp_ulps);
     }
     return false;
   }
@@ -468,6 +478,7 @@ struct column_comparator_impl {
                   column_view const& lhs_row_indices,
                   column_view const& rhs_row_indices,
                   debug_output_level verbosity,
+                  size_type fp_ulps,
                   int depth)
   {
     auto d_lhs = cudf::table_device_view::create(table_view{{lhs}});
@@ -483,12 +494,12 @@ struct column_comparator_impl {
     auto differences = rmm::device_uvector<int>(
       lhs.size(), rmm::cuda_stream_default);  // worst case: everything different
     auto input_iter = thrust::make_counting_iterator(0);
-    auto diff_iter =
-      thrust::copy_if(rmm::exec_policy(),
-                      input_iter,
-                      input_iter + lhs_row_indices.size(),
-                      differences.begin(),
-                      ComparatorType(*d_lhs, *d_rhs, *d_lhs_row_indices, *d_rhs_row_indices));
+    auto diff_iter = thrust::copy_if(
+      rmm::exec_policy(),
+      input_iter,
+      input_iter + lhs_row_indices.size(),
+      differences.begin(),
+      ComparatorType(*d_lhs, *d_rhs, *d_lhs_row_indices, *d_rhs_row_indices, fp_ulps));
 
     differences.resize(thrust::distance(differences.begin(), diff_iter),
                        rmm::cuda_stream_default);  // shrink back down
@@ -519,6 +530,7 @@ struct column_comparator_impl {
                   column_view const& lhs_row_indices,
                   column_view const& rhs_row_indices,
                   debug_output_level verbosity,
+                  size_type fp_ulps,
                   int depth)
   {
     lists_column_view lhs_l(lhs);
@@ -638,6 +650,7 @@ struct column_comparator_impl {
                                  *lhs_child_indices,
                                  *rhs_child_indices,
                                  verbosity,
+                                 fp_ulps,
                                  depth + 1);
   }
 
@@ -652,6 +665,7 @@ struct column_comparator_impl {
                   column_view const& lhs_row_indices,
                   column_view const& rhs_row_indices,
                   debug_output_level verbosity,
+                  size_type fp_ulps,
                   int depth)
   {
     structs_column_view l_scv(lhs);
@@ -667,6 +681,7 @@ struct column_comparator_impl {
                                  lhs_row_indices,
                                  rhs_row_indices,
                                  verbosity,
+                                 fp_ulps,
                                  depth + 1)) {
         return false;
       }
@@ -683,6 +698,7 @@ struct column_comparator {
                   column_view const& lhs_row_indices,
                   column_view const& rhs_row_indices,
                   debug_output_level verbosity,
+                  size_type fp_ulps,
                   int depth = 0)
   {
     CUDF_EXPECTS(lhs_row_indices.size() == rhs_row_indices.size(),
@@ -701,7 +717,7 @@ struct column_comparator {
 
     // compare values
     column_comparator_impl<T, check_exact_equality> comparator{};
-    return comparator(lhs, rhs, lhs_row_indices, rhs_row_indices, verbosity, depth);
+    return comparator(lhs, rhs, lhs_row_indices, rhs_row_indices, verbosity, fp_ulps, depth);
   }
 };
 
@@ -750,8 +766,14 @@ bool expect_columns_equal(cudf::column_view const& lhs,
                           debug_output_level verbosity)
 {
   auto indices = generate_all_row_indices(lhs.size());
-  return cudf::type_dispatcher(
-    lhs.type(), column_comparator<true>{}, lhs, rhs, *indices, *indices, verbosity);
+  return cudf::type_dispatcher(lhs.type(),
+                               column_comparator<true>{},
+                               lhs,
+                               rhs,
+                               *indices,
+                               *indices,
+                               verbosity,
+                               cudf::test::default_ulp);
 }
 
 /**
@@ -759,11 +781,12 @@ bool expect_columns_equal(cudf::column_view const& lhs,
  */
 bool expect_columns_equivalent(cudf::column_view const& lhs,
                                cudf::column_view const& rhs,
-                               debug_output_level verbosity)
+                               debug_output_level verbosity,
+                               size_type fp_ulps)
 {
   auto indices = generate_all_row_indices(lhs.size());
   return cudf::type_dispatcher(
-    lhs.type(), column_comparator<false>{}, lhs, rhs, *indices, *indices, verbosity);
+    lhs.type(), column_comparator<false>{}, lhs, rhs, *indices, *indices, verbosity, fp_ulps);
 }
 
 /**