From c0951ba3ec7f13b31c36a484f4935f5a0d444619 Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Tue, 26 Oct 2021 22:54:15 +0530
Subject: [PATCH] add min_periods, ddof to groupby covariance, & correlation
aggregation (#9492)
Addresses part of https://github.com/rapidsai/cudf/issues/8691
Add min_periods and ddof parameters to libcudf groupby covariance and Pearson correlation (python needs this)
Authors:
- Karthikeyan (https://github.com/karthikeyann)
Approvers:
- Devavret Makkar (https://github.com/devavret)
- Jake Hemstad (https://github.com/jrhemstad)
URL: https://github.com/rapidsai/cudf/pull/9492
---
cpp/include/cudf/aggregation.hpp | 11 ++--
.../cudf/detail/aggregation/aggregation.hpp | 26 ++++++++--
cpp/src/aggregation/aggregation.cpp | 18 ++++---
cpp/src/groupby/sort/aggregate.cpp | 50 +++++++++++--------
cpp/src/groupby/sort/group_correlation.cu | 15 ++++--
cpp/src/groupby/sort/group_reductions.hpp | 4 ++
cpp/tests/groupby/correlation_tests.cpp | 46 +++++++++++++----
cpp/tests/groupby/covariance_tests.cpp | 47 +++++++++++++++++
8 files changed, 167 insertions(+), 50 deletions(-)
diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
index 6661f518639..374af536dc5 100644
--- a/cpp/include/cudf/aggregation.hpp
+++ b/cpp/include/cudf/aggregation.hpp
@@ -503,9 +503,12 @@ std::unique_ptr make_merge_m2_aggregation();
*
* Compute covariance between two columns.
* The input columns are child columns of a non-nullable struct columns.
+ * @param min_periods Minimum number of non-null observations required to produce a result.
+ * @param ddof Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N is
+ * the number of non-null observations.
*/
template
-std::unique_ptr make_covariance_aggregation();
+std::unique_ptr make_covariance_aggregation(size_type min_periods = 1, size_type ddof = 1);
/**
* @brief Factory to create a CORRELATION aggregation
@@ -513,10 +516,12 @@ std::unique_ptr make_covariance_aggregation();
* Compute correlation coefficient between two columns.
* The input columns are child columns of a non-nullable struct columns.
*
- * @param[in] type: correlation_type
+ * @param type correlation_type
+ * @param min_periods Minimum number of non-null observations required to produce a result.
*/
template
-std::unique_ptr make_correlation_aggregation(correlation_type type);
+std::unique_ptr make_correlation_aggregation(correlation_type type,
+ size_type min_periods = 1);
/**
* @brief Factory to create a TDIGEST aggregation
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index e12ed3f521e..69bde7f57fd 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -901,7 +901,14 @@ class merge_m2_aggregation final : public groupby_aggregation {
*/
class covariance_aggregation final : public groupby_aggregation {
public:
- explicit covariance_aggregation() : aggregation{COVARIANCE} {}
+ explicit covariance_aggregation(size_type min_periods, size_type ddof)
+ : aggregation{COVARIANCE}, _min_periods{min_periods}, _ddof(ddof)
+ {
+ }
+ size_type _min_periods;
+ size_type _ddof;
+
+ size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); }
std::unique_ptr clone() const override
{
@@ -913,6 +920,12 @@ class covariance_aggregation final : public groupby_aggregation {
return collector.visit(col_type, *this);
}
void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
+
+ protected:
+ size_t hash_impl() const
+ {
+ return std::hash{}(_min_periods) ^ std::hash{}(_ddof);
+ }
};
/**
@@ -920,8 +933,12 @@ class covariance_aggregation final : public groupby_aggregation {
*/
class correlation_aggregation final : public groupby_aggregation {
public:
- explicit correlation_aggregation(correlation_type type) : aggregation{CORRELATION}, _type{type} {}
+ explicit correlation_aggregation(correlation_type type, size_type min_periods)
+ : aggregation{CORRELATION}, _type{type}, _min_periods{min_periods}
+ {
+ }
correlation_type _type;
+ size_type _min_periods;
bool is_equal(aggregation const& _other) const override
{
@@ -944,7 +961,10 @@ class correlation_aggregation final : public groupby_aggregation {
void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
protected:
- size_t hash_impl() const { return std::hash{}(static_cast(_type)); }
+ size_t hash_impl() const
+ {
+ return std::hash{}(static_cast(_type)) ^ std::hash{}(_min_periods);
+ }
};
/**
diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp
index 3c6ab157d46..31bf9d65d56 100644
--- a/cpp/src/aggregation/aggregation.cpp
+++ b/cpp/src/aggregation/aggregation.cpp
@@ -713,23 +713,25 @@ template std::unique_ptr make_merge_m2_aggregation
-std::unique_ptr make_covariance_aggregation()
+std::unique_ptr make_covariance_aggregation(size_type min_periods, size_type ddof)
{
- return std::make_unique();
+ return std::make_unique(min_periods, ddof);
}
-template std::unique_ptr make_covariance_aggregation();
-template std::unique_ptr make_covariance_aggregation();
+template std::unique_ptr make_covariance_aggregation(
+ size_type min_periods, size_type ddof);
+template std::unique_ptr make_covariance_aggregation(
+ size_type min_periods, size_type ddof);
/// Factory to create a CORRELATION aggregation
template
-std::unique_ptr make_correlation_aggregation(correlation_type type)
+std::unique_ptr make_correlation_aggregation(correlation_type type, size_type min_periods)
{
- return std::make_unique(type);
+ return std::make_unique(type, min_periods);
}
template std::unique_ptr make_correlation_aggregation(
- correlation_type type);
+ correlation_type type, size_type min_periods);
template std::unique_ptr make_correlation_aggregation(
- correlation_type type);
+ correlation_type type, size_type min_periods);
template
std::unique_ptr make_tdigest_aggregation(int max_centroids)
diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp
index e471fccda07..83c6c1bca57 100644
--- a/cpp/src/groupby/sort/aggregate.cpp
+++ b/cpp/src/groupby/sort/aggregate.cpp
@@ -573,6 +573,7 @@ void aggregate_result_functor::operator()(aggregation c
CUDF_EXPECTS(values.num_children() == 2,
"Input to `groupby covariance` must be a structs column having 2 children columns.");
+ auto const& cov_agg = dynamic_cast(agg);
// Covariance only for valid values in both columns.
// in non-identical null mask cases, this prevents caching of the results - STD, MEAN, COUNT.
auto [_, values_child0, values_child1] =
@@ -596,6 +597,8 @@ void aggregate_result_functor::operator()(aggregation c
count,
mean0,
mean1,
+ cov_agg._min_periods,
+ cov_agg._ddof,
stream,
mr));
};
@@ -629,28 +632,33 @@ void aggregate_result_functor::operator()(aggregation
aggregate_result_functor(values_child0, helper, cache, stream, mr).operator()(*std_agg);
aggregate_result_functor(values_child1, helper, cache, stream, mr).operator()(*std_agg);
- auto const stddev0 = cache.get_result(values_child0, *std_agg);
- auto const stddev1 = cache.get_result(values_child1, *std_agg);
-
- auto mean_agg = make_mean_aggregation();
- auto const mean0 = cache.get_result(values_child0, *mean_agg);
- auto const mean1 = cache.get_result(values_child1, *mean_agg);
- auto count_agg = make_count_aggregation();
- auto const count = cache.get_result(values_child0, *count_agg);
-
// Compute covariance here to avoid repeated computation of mean & count
- auto cov_agg = make_covariance_aggregation();
- cache.add_result(values,
- *cov_agg,
- detail::group_covariance(get_grouped_values().child(0),
- get_grouped_values().child(1),
- helper.group_labels(stream),
- helper.num_groups(stream),
- count,
- mean0,
- mean1,
- stream,
- mr));
+ auto cov_agg = make_covariance_aggregation(corr_agg._min_periods);
+ if (not cache.has_result(values, *cov_agg)) {
+ auto mean_agg = make_mean_aggregation();
+ auto const mean0 = cache.get_result(values_child0, *mean_agg);
+ auto const mean1 = cache.get_result(values_child1, *mean_agg);
+ auto count_agg = make_count_aggregation();
+ auto const count = cache.get_result(values_child0, *count_agg);
+
+ auto const& cov_agg_obj = dynamic_cast(*cov_agg);
+ cache.add_result(values,
+ *cov_agg,
+ detail::group_covariance(get_grouped_values().child(0),
+ get_grouped_values().child(1),
+ helper.group_labels(stream),
+ helper.num_groups(stream),
+ count,
+ mean0,
+ mean1,
+ cov_agg_obj._min_periods,
+ cov_agg_obj._ddof,
+ stream,
+ mr));
+ }
+
+ auto const stddev0 = cache.get_result(values_child0, *std_agg);
+ auto const stddev1 = cache.get_result(values_child1, *std_agg);
auto const covariance = cache.get_result(values, *cov_agg);
cache.add_result(
values, agg, detail::group_correlation(covariance, stddev0, stddev1, stream, mr));
diff --git a/cpp/src/groupby/sort/group_correlation.cu b/cpp/src/groupby/sort/group_correlation.cu
index e43d0185e93..cdcf4311be7 100644
--- a/cpp/src/groupby/sort/group_correlation.cu
+++ b/cpp/src/groupby/sort/group_correlation.cu
@@ -113,6 +113,8 @@ std::unique_ptr group_covariance(column_view const& values_0,
column_view const& count,
column_view const& mean_0,
column_view const& mean_1,
+ size_type min_periods,
+ size_type ddof,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
@@ -140,8 +142,13 @@ std::unique_ptr group_covariance(column_view const& values_0,
auto d_values_0 = column_device_view::create(values_0, stream);
auto d_values_1 = column_device_view::create(values_1, stream);
- covariance_transform covariance_transform_op{
- *d_values_0, *d_values_1, mean0_ptr, mean1_ptr, count.data(), group_labels.begin()};
+ covariance_transform covariance_transform_op{*d_values_0,
+ *d_values_1,
+ mean0_ptr,
+ mean1_ptr,
+ count.data(),
+ group_labels.begin(),
+ ddof};
auto result = make_numeric_column(
data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr);
@@ -157,8 +164,8 @@ std::unique_ptr group_covariance(column_view const& values_0,
thrust::make_discard_iterator(),
d_result);
- auto is_null = [ddof = covariance_transform_op.ddof] __device__(size_type group_size) {
- return not(group_size == 0 or group_size - ddof <= 0);
+ auto is_null = [ddof, min_periods] __device__(size_type group_size) {
+ return not(group_size == 0 or group_size - ddof <= 0 or group_size < min_periods);
};
auto [new_nullmask, null_count] =
cudf::detail::valid_if(count.begin(), count.end(), is_null, stream, mr);
diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp
index 789a289a07e..75708c7b01c 100644
--- a/cpp/src/groupby/sort/group_reductions.hpp
+++ b/cpp/src/groupby/sort/group_reductions.hpp
@@ -451,6 +451,8 @@ std::unique_ptr group_merge_m2(column_view const& values,
* @param count The count of valid rows of the grouped values of both columns
* @param mean_0 The mean of the first grouped values column
* @param mean_1 The mean of the second grouped values column
+ * @param min_periods The minimum number of non-null rows required to consider the covariance
+ * @param ddof The delta degrees of freedom used in the calculation of the variance
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory
*/
@@ -461,6 +463,8 @@ std::unique_ptr group_covariance(column_view const& values_0,
column_view const& count,
column_view const& mean_0,
column_view const& mean_1,
+ size_type min_periods,
+ size_type ddof,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);
diff --git a/cpp/tests/groupby/correlation_tests.cpp b/cpp/tests/groupby/correlation_tests.cpp
index 90d230ef1eb..4aa4ef236f0 100644
--- a/cpp/tests/groupby/correlation_tests.cpp
+++ b/cpp/tests/groupby/correlation_tests.cpp
@@ -32,7 +32,8 @@ using namespace cudf::test::iterators;
namespace cudf {
namespace test {
-using structs = structs_column_wrapper;
+constexpr auto nan = std::numeric_limits::quiet_NaN();
+using structs = structs_column_wrapper;
template
struct groupby_correlation_test : public cudf::test::BaseFixture {
@@ -54,8 +55,7 @@ TYPED_TEST(groupby_correlation_test, basic)
auto vals = structs{{member_0, member_1}};
fixed_width_column_wrapper expect_keys{1, 2, 3};
- fixed_width_column_wrapper expect_vals{
- {1.0, 0.6, std::numeric_limits::quiet_NaN()}};
+ fixed_width_column_wrapper expect_vals{{1.0, 0.6, nan}};
auto agg =
cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON);
@@ -129,8 +129,7 @@ TYPED_TEST(groupby_correlation_test, null_keys_and_values)
auto vals = structs{{val0, val1}};
fixed_width_column_wrapper expect_keys({1, 2, 3, 4}, no_nulls());
- fixed_width_column_wrapper expect_vals(
- {1.0, 0.6, std::numeric_limits::quiet_NaN(), 0.}, {1, 1, 1, 0});
+ fixed_width_column_wrapper expect_vals({1.0, 0.6, nan, 0.}, {1, 1, 1, 0});
auto agg =
cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON);
@@ -153,8 +152,7 @@ TYPED_TEST(groupby_correlation_test, null_values_same)
auto vals = structs{{val0, val1}};
fixed_width_column_wrapper expect_keys({1, 2, 3, 4}, no_nulls());
- fixed_width_column_wrapper expect_vals(
- {1.0, 0.6, std::numeric_limits::quiet_NaN(), 0.}, {1, 1, 1, 0});
+ fixed_width_column_wrapper expect_vals({1.0, 0.6, nan, 0.}, {1, 1, 1, 0});
auto agg =
cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON);
@@ -181,14 +179,41 @@ TYPED_TEST(groupby_correlation_test, null_values_different)
auto vals = structs{{val0, val1}};
fixed_width_column_wrapper expect_keys({1, 2, 3, 4}, no_nulls());
- fixed_width_column_wrapper expect_vals({1.0, 0., std::numeric_limits::quiet_NaN(), 0.},
- {1, 1, 1, 0});
+ fixed_width_column_wrapper expect_vals({1.0, 0., nan, 0.}, {1, 1, 1, 0});
auto agg =
cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON);
test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES);
}
+TYPED_TEST(groupby_correlation_test, min_periods)
+{
+ using V = TypeParam;
+ using R = cudf::detail::target_type_t;
+
+ auto keys = fixed_width_column_wrapper{{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}};
+ auto member_0 = fixed_width_column_wrapper{{1, 1, 1, 2, 2, 3, 3, 1, 1, 4}};
+ auto member_1 = fixed_width_column_wrapper{{1, 1, 1, 2, 0, 3, 3, 1, 1, 2}};
+ auto vals = structs{{member_0, member_1}};
+
+ fixed_width_column_wrapper expect_keys{1, 2, 3};
+
+ fixed_width_column_wrapper expect_vals1{{1.0, 0.6, nan}};
+ auto agg1 =
+ cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON, 3);
+ test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg1), force_use_sort_impl::YES);
+
+ fixed_width_column_wrapper expect_vals2{{1.0, 0.6, nan}, {0, 1, 0}};
+ auto agg2 =
+ cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON, 4);
+ test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg2), force_use_sort_impl::YES);
+
+ fixed_width_column_wrapper expect_vals3{{1.0, 0.6, nan}, {0, 0, 0}};
+ auto agg3 =
+ cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON, 5);
+ test_single_agg(keys, vals, expect_keys, expect_vals3, std::move(agg3), force_use_sort_impl::YES);
+}
+
struct groupby_dictionary_correlation_test : public cudf::test::BaseFixture {
};
@@ -203,8 +228,7 @@ TEST_F(groupby_dictionary_correlation_test, basic)
auto vals = structs{{member_0, member_1}};
fixed_width_column_wrapper expect_keys{1, 2, 3};
- fixed_width_column_wrapper expect_vals{
- {1.0, 0.6, std::numeric_limits::quiet_NaN()}};
+ fixed_width_column_wrapper expect_vals{{1.0, 0.6, nan}};
auto agg =
cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON);
diff --git a/cpp/tests/groupby/covariance_tests.cpp b/cpp/tests/groupby/covariance_tests.cpp
index 039fce16222..3a4fbf92387 100644
--- a/cpp/tests/groupby/covariance_tests.cpp
+++ b/cpp/tests/groupby/covariance_tests.cpp
@@ -175,6 +175,53 @@ TYPED_TEST(groupby_covariance_test, null_values_different)
test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES);
}
+TYPED_TEST(groupby_covariance_test, min_periods)
+{
+ using V = TypeParam;
+ using R = cudf::detail::target_type_t;
+
+ auto keys = fixed_width_column_wrapper{{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}};
+ auto member_0 = fixed_width_column_wrapper{{1, 1, 1, 2, 2, 3, 3, 1, 1, 4}};
+ auto member_1 = fixed_width_column_wrapper{{1, 1, 1, 2, 0, 3, 3, 1, 1, 2}};
+ auto vals = structs{{member_0, member_1}};
+
+ fixed_width_column_wrapper expect_keys{1, 2, 3};
+
+ fixed_width_column_wrapper expect_vals1{{1.0, 1.0, 0.0}};
+ auto agg1 = cudf::make_covariance_aggregation(3);
+ test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg1), force_use_sort_impl::YES);
+
+ fixed_width_column_wrapper expect_vals2{{1.0, 1.0, 0.0}, {0, 1, 0}};
+ auto agg2 = cudf::make_covariance_aggregation(4);
+ test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg2), force_use_sort_impl::YES);
+
+ fixed_width_column_wrapper expect_vals3{{1.0, 1.0, 0.0}, {0, 0, 0}};
+ auto agg3 = cudf::make_covariance_aggregation(5);
+ test_single_agg(keys, vals, expect_keys, expect_vals3, std::move(agg3), force_use_sort_impl::YES);
+}
+
+TYPED_TEST(groupby_covariance_test, ddof)
+{
+ using V = TypeParam;
+ using R = cudf::detail::target_type_t;
+
+ auto keys = fixed_width_column_wrapper{{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}};
+ auto member_0 = fixed_width_column_wrapper{{1, 1, 1, 2, 2, 3, 3, 1, 1, 4}};
+ auto member_1 = fixed_width_column_wrapper{{1, 1, 1, 2, 0, 3, 3, 1, 1, 2}};
+ auto vals = structs{{member_0, member_1}};
+
+ fixed_width_column_wrapper expect_keys{1, 2, 3};
+
+ fixed_width_column_wrapper expect_vals1{{2.0, 1.5, 0.0}};
+ auto agg1 = cudf::make_covariance_aggregation(1, 2);
+ test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg1), force_use_sort_impl::YES);
+
+ auto const inf = std::numeric_limits::infinity();
+ fixed_width_column_wrapper expect_vals2{{inf, 3.0, 0.0}, {0, 1, 0}};
+ auto agg2 = cudf::make_covariance_aggregation(1, 3);
+ test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg2), force_use_sort_impl::YES);
+}
+
struct groupby_dictionary_covariance_test : public cudf::test::BaseFixture {
};