From ff2fb61fc2f519d79fc9d1e55bfe0aa7f6eb988a Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Wed, 14 Mar 2018 13:47:22 +0000 Subject: [PATCH] Guard against the case there are insufficient populated values to estimate variance in tests for periodic components (#13) This was triggering a boost exception to be thrown and spamming the logs. In all such cases, we have too little data to correctly determine if such a periodic component is present and so can correctly exit early. --- lib/maths/CPeriodicityHypothesisTests.cc | 160 ++++++++++++----------- 1 file changed, 83 insertions(+), 77 deletions(-) diff --git a/lib/maths/CPeriodicityHypothesisTests.cc b/lib/maths/CPeriodicityHypothesisTests.cc index 4d7543e41e..153263de6c 100644 --- a/lib/maths/CPeriodicityHypothesisTests.cc +++ b/lib/maths/CPeriodicityHypothesisTests.cc @@ -1292,7 +1292,7 @@ bool CPeriodicityHypothesisTests::testStatisticsFor(const TFloatMeanAccumulatorC return false; } - LOG_TRACE("populated = " << 100.0 * populated << "%"); + LOG_TRACE("populated = " << 100.0 * populated / static_cast(buckets.size()) << "%"); stats.s_Range = range.max() - range.min(); stats.s_B = populated; @@ -1466,65 +1466,68 @@ bool CPeriodicityHypothesisTests::testPeriod(const TTimeTimePr2Vec &windows, LOG_TRACE(" populated = " << b); double df1{B - b}; - double v1{varianceAtPercentile(residualVariance(trend, scale), df1, - 50.0 + CONFIDENCE_INTERVAL / 2.0)}; - LOG_TRACE(" variance = " << v1); - LOG_TRACE(" varianceThreshold = " << vt); - LOG_TRACE(" significance = " << CStatisticalTests::leftTailFTest(v1 / v0, df1, df0)); - - double Rt{stats.s_Rt * CTools::truncate(1.0 - 0.5 * (vt - v1) / vt, 0.9, 1.0)}; - if (v1 < vt && CStatisticalTests::leftTailFTest(v1 / v0, df1, df0) <= MAXIMUM_SIGNIFICANCE) - { - double R{CSignal::autocorrelation(period, values)}; - R = autocorrelationAtPercentile(R, B, 50.0 - CONFIDENCE_INTERVAL / 2.0); - LOG_TRACE(" autocorrelation = " << R); - LOG_TRACE(" autocorrelationThreshold = " << Rt); - if (R > Rt) + if (df1 > 0.0) + { + double v1{varianceAtPercentile(residualVariance(trend, scale), df1, + 50.0 + CONFIDENCE_INTERVAL / 2.0)}; + LOG_TRACE(" variance = " << v1); + LOG_TRACE(" varianceThreshold = " << vt); + LOG_TRACE(" significance = " << CStatisticalTests::leftTailFTest(v1 / v0, df1, df0)); + + double Rt{stats.s_Rt * CTools::truncate(1.0 - 0.5 * (vt - v1) / vt, 0.9, 1.0)}; + if (v1 < vt && CStatisticalTests::leftTailFTest(v1 / v0, df1, df0) <= MAXIMUM_SIGNIFICANCE) { - return true; + double R{CSignal::autocorrelation(period, values)}; + R = autocorrelationAtPercentile(R, B, 50.0 - CONFIDENCE_INTERVAL / 2.0); + LOG_TRACE(" autocorrelation = " << R); + LOG_TRACE(" autocorrelationThreshold = " << Rt); + if (R > Rt) + { + return true; + } } - } - // The amplitude test. + // The amplitude test. - double F1{1.0}; - if (v1 > 0.0) - { - try + double F1{1.0}; + if (v1 > 0.0) { - std::size_t n{static_cast( - std::ceil(Rt * static_cast(length(window) / period_)))}; - TMeanAccumulator level; - for (const auto &value : values) + try { - if (CBasicStatistics::count(value) > 0.0) + std::size_t n{static_cast( + std::ceil(Rt * static_cast(length(window) / period_)))}; + TMeanAccumulator level; + for (const auto &value : values) { - level.add(CBasicStatistics::mean(value)); + if (CBasicStatistics::count(value) > 0.0) + { + level.add(CBasicStatistics::mean(value)); + } } - } - TMinAmplitudeVec amplitudes(period, {n, CBasicStatistics::mean(level)}); - periodicTrend(values, window, m_BucketLength, amplitudes); - boost::math::normal normal(0.0, std::sqrt(v1)); - std::for_each(amplitudes.begin(), amplitudes.end(), - [&F1, &normal, at](CMinAmplitude &x) - { - if (x.amplitude() >= at) + TMinAmplitudeVec amplitudes(period, {n, CBasicStatistics::mean(level)}); + periodicTrend(values, window, m_BucketLength, amplitudes); + boost::math::normal normal(0.0, std::sqrt(v1)); + std::for_each(amplitudes.begin(), amplitudes.end(), + [&F1, &normal, at](CMinAmplitude &x) { - F1 = std::min(F1, x.significance(normal)); - } - }); + if (x.amplitude() >= at) + { + F1 = std::min(F1, x.significance(normal)); + } + }); + } + catch (const std::exception &e) + { + LOG_ERROR("Unable to compute significance of amplitude: " << e.what()); + } } - catch (const std::exception &e) + LOG_TRACE(" F(amplitude) = " << F1); + + if (1.0 - std::pow(1.0 - F1, b) <= MAXIMUM_SIGNIFICANCE) { - LOG_ERROR("Unable to compute significance of amplitude: " << e.what()); + return true; } } - LOG_TRACE(" F(amplitude) = " << F1); - - if (1.0 - std::pow(1.0 - F1, b) <= MAXIMUM_SIGNIFICANCE) - { - return true; - } return false; } @@ -1618,7 +1621,7 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec &partition { for (std::size_t i = 0u; i < 2; ++i) { - for (auto &&delta : deltas[i]) + for (auto &delta : deltas[i]) { delta = (delta + m_BucketLength) % windowLength; } @@ -1683,39 +1686,42 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec &partition } double df1{B - b}; - double variance{correction * minimum[0].first}; - double v1{varianceAtPercentile(variance, df1, 50.0 + CONFIDENCE_INTERVAL / 2.0)}; - LOG_TRACE(" variance = " << v1); - LOG_TRACE(" varianceThreshold = " << vt); - LOG_TRACE(" significance = " << CStatisticalTests::leftTailFTest(v1 / v0, df1, df0)); - - if (v1 <= vt && CStatisticalTests::leftTailFTest(v1 / v0, df1, df0) <= MAXIMUM_SIGNIFICANCE) + if (df1 > 0.0) { - double R{-1.0}; - double Rt{stats.s_Rt * CTools::truncate(1.0 - 0.5 * (vt - v1) / vt, 0.9, 1.0)}; + double variance{correction * minimum[0].first}; + double v1{varianceAtPercentile(variance, df1, 50.0 + CONFIDENCE_INTERVAL / 2.0)}; + LOG_TRACE(" variance = " << v1); + LOG_TRACE(" varianceThreshold = " << vt); + LOG_TRACE(" significance = " << CStatisticalTests::leftTailFTest(v1 / v0, df1, df0)); - startOfPartition = best[0].second; - windows[0] = calculateWindows(startOfPartition, windowLength, repeat, partition[0]); - windows[1] = calculateWindows(startOfPartition, windowLength, repeat, partition[1]); - for (const auto &windows_ : windows) + if (v1 <= vt && CStatisticalTests::leftTailFTest(v1 / v0, df1, df0) <= MAXIMUM_SIGNIFICANCE) { - TFloatMeanAccumulatorVec partitionValues; - project(values, windows_, m_BucketLength, partitionValues); - std::size_t windowLength_(length(windows_[0]) / m_BucketLength); - double BW{std::accumulate(partitionValues.begin(), partitionValues.end(), 0.0, - [](double n, const TFloatMeanAccumulator &value) - { return n + (CBasicStatistics::count(value) > 0.0 ? 1.0 : 0.0); })}; - R = std::max(R, autocorrelationAtPercentile(CSignal::autocorrelation( - windowLength_ + period, partitionValues), - BW, 50.0 - CONFIDENCE_INTERVAL / 2.0)); - LOG_TRACE(" autocorrelation = " << R); - LOG_TRACE(" autocorrelationThreshold = " << Rt); - } + double R{-1.0}; + double Rt{stats.s_Rt * CTools::truncate(1.0 - 0.5 * (vt - v1) / vt, 0.9, 1.0)}; - if (R > Rt) - { - stats.s_StartOfPartition = startOfPartition; - return true; + startOfPartition = best[0].second; + windows[0] = calculateWindows(startOfPartition, windowLength, repeat, partition[0]); + windows[1] = calculateWindows(startOfPartition, windowLength, repeat, partition[1]); + for (const auto &windows_ : windows) + { + TFloatMeanAccumulatorVec partitionValues; + project(values, windows_, m_BucketLength, partitionValues); + std::size_t windowLength_(length(windows_[0]) / m_BucketLength); + double BW{std::accumulate(partitionValues.begin(), partitionValues.end(), 0.0, + [](double n, const TFloatMeanAccumulator &value) + { return n + (CBasicStatistics::count(value) > 0.0 ? 1.0 : 0.0); })}; + R = std::max(R, autocorrelationAtPercentile(CSignal::autocorrelation( + windowLength_ + period, partitionValues), + BW, 50.0 - CONFIDENCE_INTERVAL / 2.0)); + LOG_TRACE(" autocorrelation = " << R); + LOG_TRACE(" autocorrelationThreshold = " << Rt); + } + + if (R > Rt) + { + stats.s_StartOfPartition = startOfPartition; + return true; + } } } return false;