Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Compute whole column variance using numerically stable approach #16448

Merged
merged 14 commits into from
Oct 8, 2024
Prev Previous commit
Next Next commit
Adapt tests to compute host variance with a two-pass algorithm
We now need to compare against a tolerance, which was probably already
necessary before, except we were getting lucky.
wence- committed Aug 1, 2024

Verified

This commit was signed with the committer’s verified signature.
alexanderzobnin Alexander Zobnin
commit b1c6a5a4fdfee7976f80312aeee1c58b57a75997
130 changes: 53 additions & 77 deletions cpp/tests/reductions/reduction_tests.cpp
Original file line number Diff line number Diff line change
@@ -33,9 +33,12 @@
#include <cudf/types.hpp>
#include <cudf/wrappers/timestamps.hpp>

#include <thrust/copy.h>
wence- marked this conversation as resolved.
Show resolved Hide resolved
#include <thrust/iterator/counting_iterator.h>

#include <algorithm>
#include <iostream>
#include <iterator>
#include <vector>

using aggregation = cudf::aggregation;
@@ -769,6 +772,25 @@ TYPED_TEST(MultiStepReductionTest, Mean)
expected_value_nulls);
}

/**
 * @brief Host-side reference variance computed with a two-pass algorithm.
 *
 * The first pass computes the mean of the selected values; the second pass
 * accumulates the squared deviations from that mean. Centering the values
 * before squaring avoids the cancellation of the single-pass
 * `sum(x^2) - n * mean^2` formulation.
 *
 * @tparam T Element type; every value is converted to `double` before use.
 * @param v Input values.
 * @param ddof Delta degrees of freedom; the divisor is `valid_count - ddof`.
 * @param mask Optional per-element validity flags. An empty mask selects every
 *             element of `v`; otherwise only elements whose flag is `true`
 *             contribute. Flags beyond `v.size()` are ignored.
 * @return The variance of the selected values, or NaN when no value is
 *         selected or when `valid_count - ddof` is not positive.
 * @throws std::out_of_range if a non-empty `mask` has fewer entries than `v`.
 */
template <typename T>
double calc_var(std::vector<T> const& v, int ddof, std::vector<bool> const& mask = {})
{
  using size_type = typename std::vector<T>::size_type;

  // `at()` turns a too-short mask into an exception instead of an out-of-bounds read.
  auto const is_valid = [&mask](size_type idx) { return mask.empty() || mask.at(idx); };

  // Pass 1: sum and count of the selected values (no temporary copy of `v`).
  size_type valid_count = 0;
  double sum            = 0;
  for (size_type idx = 0; idx < v.size(); ++idx) {
    if (is_valid(idx)) {
      sum += v[idx];
      ++valid_count;
    }
  }

  // Subtract in floating point: `valid_count` is unsigned, so an integer
  // subtraction would wrap around whenever `ddof` exceeds the count.
  double const dof = static_cast<double>(valid_count) - ddof;
  if (valid_count == 0 || !(dof > 0)) { return std::nan(""); }

  // Pass 2: squared deviations from the mean.
  double const mean            = sum / valid_count;
  double sq_sum_of_differences = 0;
  for (size_type idx = 0; idx < v.size(); ++idx) {
    if (is_valid(idx)) {
      double const diff = v[idx] - mean;
      sq_sum_of_differences += diff * diff;
    }
  }
  return sq_sum_of_differences / dof;
}

// This test is disabled only for Debug builds because of a compiler error
// documented in cpp/src/reductions/std.cu and cpp/src/reductions/var.cu
#ifdef NDEBUG
@@ -781,25 +803,12 @@ TYPED_TEST(MultiStepReductionTest, DISABLED_var_std)
std::vector<int> int_values({-3, 2, 1, 0, 5, -3, -2, 28});
std::vector<bool> host_bools({true, true, false, true, true, true, false, true});

auto calc_var = [](std::vector<T>& v, cudf::size_type valid_count, int ddof) {
double mean = std::accumulate(v.begin(), v.end(), double{0});
mean /= valid_count;

double sum_of_sq = std::accumulate(
v.begin(), v.end(), double{0}, [](double acc, TypeParam i) { return acc + i * i; });

cudf::size_type div = valid_count - ddof;

double var = sum_of_sq / div - ((mean * mean) * valid_count) / div;
return var;
};

// test without nulls
std::vector<T> v = convert_values<T>(int_values);
cudf::test::fixed_width_column_wrapper<T> col(v.begin(), v.end());

auto const ddof = 1;
double var = calc_var(v, v.size(), ddof);
double var = calc_var(v, ddof);
double std = std::sqrt(var);
auto var_agg = cudf::make_variance_aggregation<reduce_aggregation>(ddof);
auto std_agg = cudf::make_std_aggregation<reduce_aggregation>(ddof);
@@ -815,23 +824,19 @@ TYPED_TEST(MultiStepReductionTest, DISABLED_var_std)

// test with nulls
cudf::test::fixed_width_column_wrapper<T> col_nulls = construct_null_column(v, host_bools);
cudf::size_type valid_count =
cudf::column_view(col_nulls).size() - cudf::column_view(col_nulls).null_count();
auto replaced_array = replace_nulls(v, host_bools, T{0});

double var_nulls = calc_var(replaced_array, valid_count, ddof);
double std_nulls = std::sqrt(var_nulls);
double var_nulls = calc_var(v, ddof, host_bools);
double std_nulls = std::sqrt(var_nulls);

EXPECT_EQ(this
->template reduction_test<double>(
col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64))
.first,
var_nulls);
EXPECT_EQ(this
->template reduction_test<double>(
col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64))
.first,
std_nulls);
EXPECT_DOUBLE_EQ(this
->template reduction_test<double>(
col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64))
.first,
var_nulls);
EXPECT_DOUBLE_EQ(this
->template reduction_test<double>(
col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64))
.first,
std_nulls);
}

// ----------------------------------------------------------------------------
@@ -1143,23 +1148,10 @@ TEST_P(ReductionParamTest, DISABLED_std_var)
std::vector<double> int_values({-3, 2, 1, 0, 5, -3, -2, 28});
std::vector<bool> host_bools({true, true, false, true, true, true, false, true});

auto calc_var = [ddof](std::vector<double>& v, cudf::size_type valid_count) {
double mean = std::accumulate(v.begin(), v.end(), double{0});
mean /= valid_count;

double sum_of_sq = std::accumulate(
v.begin(), v.end(), double{0}, [](double acc, double i) { return acc + i * i; });

cudf::size_type div = valid_count - ddof;

double var = sum_of_sq / div - ((mean * mean) * valid_count) / div;
return var;
};

// test without nulls
cudf::test::fixed_width_column_wrapper<double> col(int_values.begin(), int_values.end());

double var = calc_var(int_values, int_values.size());
double var = calc_var(int_values, ddof);
double std = std::sqrt(var);
auto var_agg = cudf::make_variance_aggregation<reduce_aggregation>(ddof);
auto std_agg = cudf::make_std_aggregation<reduce_aggregation>(ddof);
@@ -1176,23 +1168,19 @@ TEST_P(ReductionParamTest, DISABLED_std_var)
// test with nulls
cudf::test::fixed_width_column_wrapper<double> col_nulls =
construct_null_column(int_values, host_bools);
cudf::size_type valid_count =
cudf::column_view(col_nulls).size() - cudf::column_view(col_nulls).null_count();
auto replaced_array = replace_nulls<double>(int_values, host_bools, int{0});

double var_nulls = calc_var(replaced_array, valid_count);
double var_nulls = calc_var(int_values, ddof, host_bools);
double std_nulls = std::sqrt(var_nulls);

EXPECT_EQ(this
->template reduction_test<double>(
col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64))
.first,
var_nulls);
EXPECT_EQ(this
->template reduction_test<double>(
col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64))
.first,
std_nulls);
EXPECT_DOUBLE_EQ(this
->template reduction_test<double>(
col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64))
.first,
var_nulls);
EXPECT_DOUBLE_EQ(this
->template reduction_test<double>(
col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64))
.first,
std_nulls);
}

//-------------------------------------------------------------------
@@ -2475,21 +2463,11 @@ TYPED_TEST(DictionaryReductionTest, DISABLED_VarStd)
std::vector<T> v = convert_values<T>(int_values);
cudf::data_type output_type{cudf::type_to_id<double>()};

auto calc_var = [](std::vector<T> const& v, cudf::size_type valid_count, cudf::size_type ddof) {
double mean = std::accumulate(v.cbegin(), v.cend(), double{0});
mean /= valid_count;
double sum_of_sq = std::accumulate(
v.cbegin(), v.cend(), double{0}, [](double acc, TypeParam i) { return acc + i * i; });
auto const div = valid_count - ddof;
double var = sum_of_sq / div - ((mean * mean) * valid_count) / div;
return var;
};

// test without nulls
cudf::test::dictionary_column_wrapper<T> col(v.begin(), v.end());

cudf::size_type const ddof = 1;
double var = calc_var(v, v.size(), ddof);
double var = calc_var(v, ddof);
double std = std::sqrt(var);
auto var_agg = cudf::make_variance_aggregation<reduce_aggregation>(ddof);
auto std_agg = cudf::make_std_aggregation<reduce_aggregation>(ddof);
@@ -2501,15 +2479,13 @@ TYPED_TEST(DictionaryReductionTest, DISABLED_VarStd)
std::vector<bool> validity({true, true, false, true, true, true, false, true});
cudf::test::dictionary_column_wrapper<T> col_nulls(v.begin(), v.end(), validity.begin());

cudf::size_type const valid_count = std::count(validity.begin(), validity.end(), true);

double var_nulls = calc_var(replace_nulls(v, validity, T{0}), valid_count, ddof);
double var_nulls = calc_var(v, ddof, validity);
double std_nulls = std::sqrt(var_nulls);

EXPECT_EQ(this->template reduction_test<double>(col_nulls, *var_agg, output_type).first,
var_nulls);
EXPECT_EQ(this->template reduction_test<double>(col_nulls, *std_agg, output_type).first,
std_nulls);
EXPECT_DOUBLE_EQ(this->template reduction_test<double>(col_nulls, *var_agg, output_type).first,
var_nulls);
EXPECT_DOUBLE_EQ(this->template reduction_test<double>(col_nulls, *std_agg, output_type).first,
std_nulls);
wence- marked this conversation as resolved.
Show resolved Hide resolved
}

TYPED_TEST(DictionaryReductionTest, NthElement)