diff --git a/include/maths/CNaiveBayes.h b/include/maths/CNaiveBayes.h new file mode 100644 index 0000000000..668fd7af2f --- /dev/null +++ b/include/maths/CNaiveBayes.h @@ -0,0 +1,237 @@ +/* + * ELASTICSEARCH CONFIDENTIAL + * + * Copyright (c) 2018 Elasticsearch BV. All Rights Reserved. + * + * Notice: this software, and all information contained + * therein, is the exclusive property of Elasticsearch BV + * and its licensors, if any, and is protected under applicable + * domestic and foreign law, and international treaties. + * + * Reproduction, republication or distribution without the + * express written consent of Elasticsearch BV is + * strictly prohibited. + */ + +#ifndef INCLUDED_ml_maths_CNaiveBayes_h +#define INCLUDED_ml_maths_CNaiveBayes_h + +#include + +#include + +#include + +#include +#include + +namespace ml +{ +namespace core +{ +class CStatePersistInserter; +class CStateRestoreTraverser; +} +namespace maths +{ +struct SDistributionRestoreParams; + +//! \brief The interface expected by CNaiveBayes for implementations +//! of the class conditional density functions. +class MATHS_EXPORT CNaiveBayesFeatureDensity +{ + public: + using TDouble1Vec = core::CSmallVector; + + public: + virtual ~CNaiveBayesFeatureDensity() = default; + + //! Create and return a clone. + //! + //! \note The caller owns this. + virtual CNaiveBayesFeatureDensity *clone() const = 0; + + //! Initialize by reading state from \p traverser. + virtual bool acceptRestoreTraverser(const SDistributionRestoreParams ¶ms, + core::CStateRestoreTraverser &traverser) = 0; + + //! Persist state by passing information to \p inserter. + virtual void acceptPersistInserter(core::CStatePersistInserter &inserter) const = 0; + + //! Add the value \p x. + virtual void add(const TDouble1Vec &x) = 0; + + //! Compute the log value of the density function at \p x. + virtual double logValue(const TDouble1Vec &x) const = 0; + + //! Age out old values density to account for \p time passing. + virtual void propagateForwardsByTime(double time) = 0; + + //! Debug the memory used by this object. + virtual void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const = 0; + + //! Get the static size of this object. + virtual std::size_t staticSize() const = 0; + + //! Get the memory used by this object. + virtual std::size_t memoryUsage() const = 0; + + //! Get a checksum for this object. + virtual uint64_t checksum(uint64_t seed) const = 0; +}; + +//! \brief An implementation of the class conditional density function +//! based on the CPrior hierarchy. +class MATHS_EXPORT CNaiveBayesFeatureDensityFromPrior final : public CNaiveBayesFeatureDensity +{ + public: + CNaiveBayesFeatureDensityFromPrior() = default; + CNaiveBayesFeatureDensityFromPrior(CPrior &prior); + + //! Create and return a clone. + //! + //! \note The caller owns this. + virtual CNaiveBayesFeatureDensityFromPrior *clone() const; + + //! Initialize by reading state from \p traverser. + virtual bool acceptRestoreTraverser(const SDistributionRestoreParams ¶ms, + core::CStateRestoreTraverser &traverser); + + //! Persist state by passing information to \p inserter. + virtual void acceptPersistInserter(core::CStatePersistInserter &inserter) const; + + //! Add the value \p x. + virtual void add(const TDouble1Vec &x); + + //! Compute the log value of the density function at \p x. + virtual double logValue(const TDouble1Vec &x) const; + + //! Age out old values density to account for \p time passing. + virtual void propagateForwardsByTime(double time); + + //! 
Debug the memory used by this object. + virtual void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const; + + //! Get the static size of this object. + virtual std::size_t staticSize() const; + + //! Get the memory used by this object. + virtual std::size_t memoryUsage() const; + + //! Get a checksum for this object. + virtual uint64_t checksum(uint64_t seed) const; + + private: + using TPriorPtr = boost::shared_ptr; + + private: + //! The density model. + TPriorPtr m_Prior; +}; + +//! \brief Implements a Naive Bayes classifier. +class MATHS_EXPORT CNaiveBayes +{ + public: + using TDoubleSizePr = std::pair; + using TDoubleSizePrVec = std::vector; + using TDouble1Vec = core::CSmallVector; + using TDouble1VecVec = std::vector; + + public: + explicit CNaiveBayes(const CNaiveBayesFeatureDensity &exemplar, + double decayRate = 0.0); + CNaiveBayes(const SDistributionRestoreParams ¶ms, + core::CStateRestoreTraverser &traverser); + + //! Persist state by passing information to \p inserter. + void acceptPersistInserter(core::CStatePersistInserter &inserter) const; + + //! This can be used to optionally seed the class counts + //! with \p counts. These are added on to data class counts + //! to compute the class posterior probabilities. + void initialClassCounts(const TDoubleSizePrVec &counts); + + //! Add a training data point comprising the pair \f$(x,l)\f$ + //! for feature vector \f$x\f$ and class label \f$l\f$. + //! + //! \param[in] label The class label for \p x. + //! \param[in] x The feature values. + //! \note \p x size should be equal to the number of features. + //! A feature is missing is indicated by passing an empty vector + //! for that feature. + void addTrainingDataPoint(std::size_t label, const TDouble1VecVec &x); + + //! Age out old values from the class conditional densities + //! to account for \p time passing. + void propagateForwardsByTime(double time); + + //! Get the top \p n class probabilities for \p features. + //! + //! \param[in] n The number of class probabilities to estimate. + //! \param[in] x The feature values. + //! \note \p x size should be equal to the number of features. + //! A feature is missing is indicated by passing an empty vector + //! for that feature. + TDoubleSizePrVec highestClassProbabilities(std::size_t n, + const TDouble1VecVec &x) const; + + //! Debug the memory used by this object. + void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const; + + //! Get the memory used by this object. + std::size_t memoryUsage() const; + + //! Get a checksum for this object. + uint64_t checksum(uint64_t seed = 0) const; + + private: + using TFeatureDensityPtr = boost::shared_ptr; + using TFeatureDensityPtrVec = std::vector; + + //! \brief The data associated with a class. + struct SClass + { + //! Initialize by reading state from \p traverser. + bool acceptRestoreTraverser(const SDistributionRestoreParams ¶ms, + core::CStateRestoreTraverser &traverser); + //! Persist state by passing information to \p inserter. + void acceptPersistInserter(core::CStatePersistInserter &inserter) const; + //! Debug the memory used by this object. + void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const; + //! Get the memory used by this object. + std::size_t memoryUsage() const; + //! Get a checksum for this object. + uint64_t checksum(uint64_t seed = 0) const; + + //! The number of examples in this class. + double s_Count = 0.0; + //! The feature conditional densities for this class. 
+ TFeatureDensityPtrVec s_ConditionalDensities; + }; + + using TSizeClassUMap = boost::unordered_map; + + private: + //! Initialize by reading state from \p traverser. + bool acceptRestoreTraverser(const SDistributionRestoreParams ¶ms, + core::CStateRestoreTraverser &traverser); + + //! Validate \p x. + bool validate(const TDouble1VecVec &x) const; + + private: + //! Controls the rate at which data are aged out. + double m_DecayRate; + + //! An exemplar for creating conditional densities. + TFeatureDensityPtr m_Exemplar; + + //! The class conditional density estimates and weights. + TSizeClassUMap m_ClassConditionalDensities; +}; + +} +} + +#endif // INCLUDED_ml_maths_CNaiveBayes_h diff --git a/include/maths/CTimeSeriesChangeDetector.h b/include/maths/CTimeSeriesChangeDetector.h new file mode 100644 index 0000000000..084094b6c2 --- /dev/null +++ b/include/maths/CTimeSeriesChangeDetector.h @@ -0,0 +1,385 @@ +/* + * ELASTICSEARCH CONFIDENTIAL + * + * Copyright (c) 2018 Elasticsearch BV. All Rights Reserved. + * + * Notice: this software, and all information contained + * therein, is the exclusive property of Elasticsearch BV + * and its licensors, if any, and is protected under applicable + * domestic and foreign law, and international treaties. + * + * Reproduction, republication or distribution without the + * express written consent of Elasticsearch BV is + * strictly prohibited. + */ + +#ifndef INCLUDED_ml_maths_CTimeSeriesChangeDetector_h +#define INCLUDED_ml_maths_CTimeSeriesChangeDetector_h + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +namespace ml +{ +namespace core +{ +class CStatePersistInserter; +class CStateRestoreTraverser; +} +namespace maths +{ +class CModelAddSamplesParams; +class CPrior; +class CTimeSeriesDecompositionInterface; +struct SDistributionRestoreParams; + +namespace time_series_change_detector_detail +{ +class CUnivariateTimeSeriesChangeModel; +} + +//! \brief A description of a time series change. +struct MATHS_EXPORT SChangeDescription +{ + using TDouble2Vec = core::CSmallVector; + + //! The types of change we can detect. + enum EDescription + { + E_LevelShift, + E_TimeShift + }; + + SChangeDescription(EDescription decription, double value); + + //! The type of change. + EDescription s_Description; + + //! The change value. + TDouble2Vec s_Value; +}; + +//! \brief Tests a variety of possible changes which might have +//! occurred in a time series and selects one if it provides a +//! good explanation of the recent behaviour. +class MATHS_EXPORT CUnivariateTimeSeriesChangeDetector +{ + public: + using TTimeDoublePr = std::pair; + using TTimeDoublePr1Vec = core::CSmallVector; + using TDouble4Vec = core::CSmallVector; + using TDouble4Vec1Vec = core::CSmallVector; + using TWeightStyleVec = maths_t::TWeightStyleVec; + using TOptionalChangeDescription = boost::optional; + using TPriorPtr = boost::shared_ptr; + + public: + CUnivariateTimeSeriesChangeDetector(const CTimeSeriesDecompositionInterface &trendModel, + const TPriorPtr &residualModel, + core_t::TTime minimumTimeToDetect, + core_t::TTime maximumTimeToDetect, + double minimumDeltaBicToDetect); + + //! Initialize by reading state from \p traverser. + bool acceptRestoreTraverser(const SDistributionRestoreParams ¶ms, + core::CStateRestoreTraverser &traverser); + + //! Persist state by passing information to \p inserter. + void acceptPersistInserter(core::CStatePersistInserter &inserter) const; + + //! Check if there has been a change and get a description + //! 
if there has been. + TOptionalChangeDescription change(); + + //! Add \p samples to the change detector. + void addSamples(maths_t::EDataType dataType, + const TWeightStyleVec &weightStyles, + const TTimeDoublePr1Vec &samples, + const TDouble4Vec1Vec &weights, + double propagationInterval = 1.0); + + //! Check if we should stop testing. + bool stopTesting() const; + + //! Debug the memory used by this object. + void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const; + + //! Get the memory used by this object. + std::size_t memoryUsage() const; + + //! Get a checksum for this object. + uint64_t checksum(uint64_t seed = 0) const; + + private: + using TUnivariateTimeSeriesChangeModel = + time_series_change_detector_detail::CUnivariateTimeSeriesChangeModel; + using TChangeModelPtr = boost::shared_ptr; + using TChangeModelPtr4Vec = core::CSmallVector; + using TMinMaxAccumulator = CBasicStatistics::CMinMax; + + private: + //! The minimum amount of time we need to observe before + //! selecting a change model. + core_t::TTime m_MinimumTimeToDetect; + + //! The maximum amount of time to try to detect a change. + core_t::TTime m_MaximumTimeToDetect; + + //! The minimum increase in BIC select a change model. + double m_MinimumDeltaBicToDetect; + + //! The start and end of the change model. + TMinMaxAccumulator m_TimeRange; + + //! The count of samples added to the change models. + std::size_t m_SampleCount; + + //! The current evidence of a change. + double m_CurrentEvidenceOfChange; + + //! The change models. + TChangeModelPtr4Vec m_ChangeModels; +}; + +namespace time_series_change_detector_detail +{ + +//! \brief Helper interface for change detection. Implementations of +//! this are used to model specific types of changes which can occur. +class MATHS_EXPORT CUnivariateTimeSeriesChangeModel : private core::CNonCopyable +{ + public: + using TTimeDoublePr = std::pair; + using TTimeDoublePr1Vec = core::CSmallVector; + using TDouble4Vec = core::CSmallVector; + using TDouble4Vec1Vec = core::CSmallVector; + using TWeightStyleVec = maths_t::TWeightStyleVec; + using TPriorPtr = boost::shared_ptr; + using TOptionalChangeDescription = boost::optional; + + public: + CUnivariateTimeSeriesChangeModel(const CTimeSeriesDecompositionInterface &trendModel); + virtual ~CUnivariateTimeSeriesChangeModel() = default; + + //! Initialize by reading state from \p traverser. + virtual bool acceptRestoreTraverser(const SDistributionRestoreParams ¶ms, + core::CStateRestoreTraverser &traverser) = 0; + + //! Persist state by passing information to \p inserter. + virtual void acceptPersistInserter(core::CStatePersistInserter &inserter) const = 0; + + //! The BIC of applying the change. + virtual double bic() const = 0; + + //! Get a description of the change. + virtual TOptionalChangeDescription change() const = 0; + + //! Update the change model with \p samples. + virtual void addSamples(std::size_t count, + maths_t::EDataType dataType, + const TWeightStyleVec &weightStyles, + const TTimeDoublePr1Vec &samples, + const TDouble4Vec1Vec &weights, + double propagationInterval = 1.0) = 0; + + //! Debug the memory used by this object. + virtual void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const = 0; + + //! Get the static size of this object. + virtual std::size_t staticSize() const = 0; + + //! Get the memory used by this object. + virtual std::size_t memoryUsage() const = 0; + + //! Get a checksum for this object. + virtual uint64_t checksum(uint64_t seed) const = 0; + + protected: + //! 
Get the log-likelihood. + double logLikelihood() const; + + //! Update the data log-likelihood with \p logLikelihood. + void addLogLikelihood(double logLikelihood); + + //! Get the time series trend model. + const CTimeSeriesDecompositionInterface &trendModel() const; + + private: + //! The likelihood of the data under this model. + double m_LogLikelihood; + + //! A model decomposing the time series trend. + const CTimeSeriesDecompositionInterface &m_TrendModel; +}; + +//! \brief Used to capture the likelihood of the data given no change. +class MATHS_EXPORT CUnivariateNoChangeModel final : public CUnivariateTimeSeriesChangeModel +{ + public: + CUnivariateNoChangeModel(const CTimeSeriesDecompositionInterface &trendModel, + const TPriorPtr &residualModel); + + //! Initialize by reading state from \p traverser. + virtual bool acceptRestoreTraverser(const SDistributionRestoreParams ¶ms, + core::CStateRestoreTraverser &traverser); + + //! Persist state by passing information to \p inserter. + virtual void acceptPersistInserter(core::CStatePersistInserter &inserter) const; + + //! Returns the no change BIC. + virtual double bic() const; + + //! Returns a null object. + virtual TOptionalChangeDescription change() const; + + //! Get the log likelihood of \p samples. + virtual void addSamples(std::size_t count, + maths_t::EDataType dataType, + const TWeightStyleVec &weightStyles, + const TTimeDoublePr1Vec &samples, + const TDouble4Vec1Vec &weights, + double propagationInterval = 1.0); + + //! Debug the memory used by this object. + virtual void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const; + + //! Get the static size of this object. + virtual std::size_t staticSize() const; + + //! Get the memory used by this object. + virtual std::size_t memoryUsage() const; + + //! Get a checksum for this object. + virtual uint64_t checksum(uint64_t seed) const; + + private: + //! A reference to the underlying prior. + TPriorPtr m_ResidualModel; +}; + +//! \brief Captures the likelihood of the data given an arbitrary +//! level shift. +class MATHS_EXPORT CUnivariateTimeSeriesLevelShiftModel final : public CUnivariateTimeSeriesChangeModel +{ + public: + CUnivariateTimeSeriesLevelShiftModel(const CTimeSeriesDecompositionInterface &trendModel, + const TPriorPtr &residualModel); + + //! Initialize by reading state from \p traverser. + virtual bool acceptRestoreTraverser(const SDistributionRestoreParams ¶ms, + core::CStateRestoreTraverser &traverser); + + //! Persist state by passing information to \p inserter. + virtual void acceptPersistInserter(core::CStatePersistInserter &inserter) const; + + //! The BIC of applying the level shift. + virtual double bic() const; + + //! Get a description of the level shift. + virtual TOptionalChangeDescription change() const; + + //! Update with \p samples. + virtual void addSamples(std::size_t count, + maths_t::EDataType dataType, + const TWeightStyleVec &weightStyles, + const TTimeDoublePr1Vec &samples, + const TDouble4Vec1Vec &weights, + double propagationInterval = 1.0); + + //! Debug the memory used by this object. + virtual void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const; + + //! Get the static size of this object. + virtual std::size_t staticSize() const; + + //! Get the memory used by this object. + virtual std::size_t memoryUsage() const; + + //! Get a checksum for this object. 
+ virtual uint64_t checksum(uint64_t seed) const; + + private: + using TDoubleVec = std::vector; + using TMeanAccumulator = CBasicStatistics::SSampleMean::TAccumulator; + + private: + //! The optimal shift. + TMeanAccumulator m_Shift; + + //! Get the number of samples. + double m_SampleCount; + + //! The prior for the time series' residual model subject + //! to the shift. + TPriorPtr m_ResidualModel; + + //! The initial residual model mode. + double m_ResidualModelMode; +}; + +//! \brief Captures the likelihood of the data given a specified +//! time shift. +class MATHS_EXPORT CUnivariateTimeSeriesTimeShiftModel final : public CUnivariateTimeSeriesChangeModel +{ + public: + CUnivariateTimeSeriesTimeShiftModel(const CTimeSeriesDecompositionInterface &trendModel, + const TPriorPtr &residualModel, + core_t::TTime shift); + + //! Initialize by reading state from \p traverser. + virtual bool acceptRestoreTraverser(const SDistributionRestoreParams ¶ms, + core::CStateRestoreTraverser &traverser); + + //! Persist state by passing information to \p inserter. + virtual void acceptPersistInserter(core::CStatePersistInserter &inserter) const; + + //! The BIC of applying the time shift. + virtual double bic() const; + + //! Get a description of the time shift. + virtual TOptionalChangeDescription change() const; + + //! Update with \p samples. + virtual void addSamples(std::size_t count, + maths_t::EDataType dataType, + const TWeightStyleVec &weightStyles, + const TTimeDoublePr1Vec &samples, + const TDouble4Vec1Vec &weights, + double propagationInterval = 1.0); + + //! Debug the memory used by this object. + virtual void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const; + + //! Get the static size of this object. + virtual std::size_t staticSize() const; + + //! Get the memory used by this object. + virtual std::size_t memoryUsage() const; + + //! Get a checksum for this object. + virtual uint64_t checksum(uint64_t seed) const; + + private: + //! The shift in time of the time series trend model. + core_t::TTime m_Shift; + + //! The prior for the time series' residual model subject + //! to the shift. + TPriorPtr m_ResidualModel; +}; + +} + +} +} + +#endif // INCLUDED_ml_maths_CTimeSeriesChangeDetector_h diff --git a/lib/maths/CNaiveBayes.cc b/lib/maths/CNaiveBayes.cc new file mode 100644 index 0000000000..dadabe6c72 --- /dev/null +++ b/lib/maths/CNaiveBayes.cc @@ -0,0 +1,364 @@ +/* + * ELASTICSEARCH CONFIDENTIAL + * + * Copyright (c) 2018 Elasticsearch BV. All Rights Reserved. + * + * Notice: this software, and all information contained + * therein, is the exclusive property of Elasticsearch BV + * and its licensors, if any, and is protected under applicable + * domestic and foreign law, and international treaties. + * + * Reproduction, republication or distribution without the + * express written consent of Elasticsearch BV is + * strictly prohibited. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace ml +{ +namespace maths +{ +namespace +{ +const std::string PRIOR_TAG{"a"}; +const std::string CLASS_LABEL_TAG{"b"}; +const std::string CLASS_MODEL_TAG{"c"}; +const std::string COUNT_TAG{"d"}; +const std::string CONDITIONAL_DENSITY_FROM_PRIOR_TAG{"e"}; +} + +CNaiveBayesFeatureDensityFromPrior::CNaiveBayesFeatureDensityFromPrior(CPrior &prior) : + m_Prior(prior.clone()) +{} + +void CNaiveBayesFeatureDensityFromPrior::add(const TDouble1Vec &x) +{ + m_Prior->addSamples(CConstantWeights::COUNT, x, CConstantWeights::SINGLE_UNIT); +} + +CNaiveBayesFeatureDensityFromPrior *CNaiveBayesFeatureDensityFromPrior::clone() const +{ + return new CNaiveBayesFeatureDensityFromPrior(*m_Prior); +} + +bool CNaiveBayesFeatureDensityFromPrior::acceptRestoreTraverser(const SDistributionRestoreParams ¶ms, + core::CStateRestoreTraverser &traverser) +{ + do + { + const std::string &name{traverser.name()}; + RESTORE(PRIOR_TAG, traverser.traverseSubLevel(boost::bind( + CPriorStateSerialiser(), + boost::cref(params), boost::ref(m_Prior), _1))); + } + while (traverser.next()); + return true; +} + +void CNaiveBayesFeatureDensityFromPrior::acceptPersistInserter(core::CStatePersistInserter &inserter) const +{ + inserter.insertLevel(PRIOR_TAG, boost::bind(CPriorStateSerialiser(), + boost::cref(*m_Prior), _1)); +} + +double CNaiveBayesFeatureDensityFromPrior::logValue(const TDouble1Vec &x) const +{ + double result; + if (m_Prior->jointLogMarginalLikelihood(CConstantWeights::COUNT, x, + CConstantWeights::SINGLE_UNIT, + result) != maths_t::E_FpNoErrors) + { + LOG_ERROR("Bad value density value for " << x); + return boost::numeric::bounds::lowest(); + } + return result; +} + +void CNaiveBayesFeatureDensityFromPrior::debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const +{ + return core::CMemoryDebug::dynamicSize("m_Prior", m_Prior, mem); +} + +std::size_t CNaiveBayesFeatureDensityFromPrior::staticSize() const +{ + return sizeof(*this); +} + +std::size_t CNaiveBayesFeatureDensityFromPrior::memoryUsage() const +{ + return core::CMemory::dynamicSize(m_Prior); +} + +void CNaiveBayesFeatureDensityFromPrior::propagateForwardsByTime(double time) +{ + m_Prior->propagateForwardsByTime(time); +} + +uint64_t CNaiveBayesFeatureDensityFromPrior::checksum(uint64_t seed) const +{ + return CChecksum::calculate(seed, m_Prior); +} + + +CNaiveBayes::CNaiveBayes(const CNaiveBayesFeatureDensity &exemplar, double decayRate) : + m_DecayRate{decayRate}, + m_Exemplar{exemplar.clone()}, + m_ClassConditionalDensities{2} +{} + +CNaiveBayes::CNaiveBayes(const SDistributionRestoreParams ¶ms, + core::CStateRestoreTraverser &traverser) : + m_DecayRate{params.s_DecayRate}, + m_ClassConditionalDensities{2} +{ + traverser.traverseSubLevel(boost::bind(&CNaiveBayes::acceptRestoreTraverser, + this, boost::cref(params), _1)); +} + +bool CNaiveBayes::acceptRestoreTraverser(const SDistributionRestoreParams ¶ms, + core::CStateRestoreTraverser &traverser) +{ + std::size_t label; + do + { + const std::string &name{traverser.name()}; + RESTORE_BUILT_IN(CLASS_LABEL_TAG, label) + RESTORE_SETUP_TEARDOWN(CLASS_MODEL_TAG, + SClass class_, + traverser.traverseSubLevel(boost::bind( + &SClass::acceptRestoreTraverser, + boost::ref(class_), boost::cref(params), _1)), + m_ClassConditionalDensities.emplace(label, class_)) + } + while (traverser.next()); + return true; +} + +void 
CNaiveBayes::acceptPersistInserter(core::CStatePersistInserter &inserter) const +{ + using TSizeClassUMapCItr = TSizeClassUMap::const_iterator; + using TSizeClassUMapCItrVec = std::vector; + TSizeClassUMapCItrVec classes; + classes.reserve(m_ClassConditionalDensities.size()); + for (auto i = m_ClassConditionalDensities.begin(); i != m_ClassConditionalDensities.end(); ++i) + { + classes.push_back(i); + } + std::sort(classes.begin(), classes.end(), core::CFunctional::SDereference()); + for (const auto &class_ : classes) + { + inserter.insertValue(CLASS_LABEL_TAG, class_->first); + inserter.insertLevel(CLASS_MODEL_TAG, boost::bind(&SClass::acceptPersistInserter, + boost::ref(class_->second), _1)); + } +} + +void CNaiveBayes::initialClassCounts(const TDoubleSizePrVec &counts) +{ + for (const auto &count : counts) + { + m_ClassConditionalDensities[count.second] = SClass{count.first, {}}; + } +} + +void CNaiveBayes::addTrainingDataPoint(std::size_t label, const TDouble1VecVec &x) +{ + if (!this->validate(x)) + { + return; + } + + auto &class_ = m_ClassConditionalDensities[label]; + + if (class_.s_ConditionalDensities.empty()) + { + class_.s_ConditionalDensities.reserve(x.size()); + std::generate_n(std::back_inserter(class_.s_ConditionalDensities), + x.size(), + [this]() { return TFeatureDensityPtr{m_Exemplar->clone()}; }); + } + + bool updateCount{false}; + for (std::size_t i = 0u; i < x.size(); ++i) + { + if (x[i].size() > 0) + { + class_.s_ConditionalDensities[i]->add(x[i]); + updateCount = true; + } + } + + if (updateCount) + { + class_.s_Count += 1.0; + } + else + { + LOG_TRACE("Ignoring empty feature vector"); + } +} + +void CNaiveBayes::propagateForwardsByTime(double time) +{ + double factor{std::exp(-m_DecayRate * time)}; + for (auto &class_ : m_ClassConditionalDensities) + { + class_.second.s_Count *= factor; + for (auto &density : class_.second.s_ConditionalDensities) + { + density->propagateForwardsByTime(time); + } + } +} + +CNaiveBayes::TDoubleSizePrVec +CNaiveBayes::highestClassProbabilities(std::size_t n, const TDouble1VecVec &x) const +{ + if (!this->validate(x)) + { + return {}; + } + if (m_ClassConditionalDensities.empty()) + { + LOG_ERROR("Trying to compute class probabilities without supplying training data"); + return {}; + } + + TDoubleSizePrVec p; + p.reserve(m_ClassConditionalDensities.size()); + + for (const auto &class_ : m_ClassConditionalDensities) + { + double f{CTools::fastLog(class_.second.s_Count)}; + for (std::size_t i = 0u; i < x.size(); ++i) + { + if (x[i].size() > 0) + { + f += class_.second.s_ConditionalDensities[i]->logValue(x[i]); + } + } + p.emplace_back(f, class_.first); + } + + double scale{std::max_element(p.begin(), p.end())->first}; + double Z{0.0}; + for (auto &pc : p) + { + pc.first = std::exp(pc.first - scale); + Z += pc.first; + } + for (auto &pc : p) + { + pc.first /= Z; + } + + n = std::min(n, p.size()); + std::sort(p.begin(), p.begin() + n, std::greater()); + + return TDoubleSizePrVec{p.begin(), p.begin() + n}; +} + +void CNaiveBayes::debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const +{ + core::CMemoryDebug::dynamicSize("m_Exemplar", m_Exemplar, mem); + core::CMemoryDebug::dynamicSize("m_ClassConditionalDensities", + m_ClassConditionalDensities, mem); +} + +std::size_t CNaiveBayes::memoryUsage() const +{ + return core::CMemory::dynamicSize(m_Exemplar) + + core::CMemory::dynamicSize(m_ClassConditionalDensities); +} + +uint64_t CNaiveBayes::checksum(uint64_t seed) const +{ + return CChecksum::calculate(seed, 
m_ClassConditionalDensities); +} + +bool CNaiveBayes::validate(const TDouble1VecVec &x) const +{ + auto class_ = m_ClassConditionalDensities.begin(); + if ( class_ != m_ClassConditionalDensities.end() + && class_->second.s_ConditionalDensities.size() > 0 + && class_->second.s_ConditionalDensities.size() != x.size()) + { + LOG_ERROR("Unexpected feature vector: " << core::CContainerPrinter::print(x)); + return false; + } + return true; +} + +bool CNaiveBayes::SClass::acceptRestoreTraverser(const SDistributionRestoreParams ¶ms, + core::CStateRestoreTraverser &traverser) +{ + do + { + const std::string &name{traverser.name()}; + RESTORE_BUILT_IN(COUNT_TAG, s_Count) + RESTORE_SETUP_TEARDOWN(CONDITIONAL_DENSITY_FROM_PRIOR_TAG, + CNaiveBayesFeatureDensityFromPrior tmp, + traverser.traverseSubLevel(boost::bind( + &CNaiveBayesFeatureDensityFromPrior::acceptRestoreTraverser, + boost::ref(tmp), boost::cref(params), _1)), + s_ConditionalDensities.emplace_back(tmp.clone())) + // Add other implementation's restore code here. + } + while (traverser.next()); + return true; +} + +void CNaiveBayes::SClass::acceptPersistInserter(core::CStatePersistInserter &inserter) const +{ + inserter.insertValue(COUNT_TAG, s_Count, core::CIEEE754::E_SinglePrecision); + for (const auto &density : s_ConditionalDensities) + { + if (dynamic_cast(density.get())) + { + inserter.insertLevel(CONDITIONAL_DENSITY_FROM_PRIOR_TAG, + boost::bind(&CNaiveBayesFeatureDensity::acceptPersistInserter, + density.get(), _1)); + continue; + } + // Add other implementation's persist code here. + } +} + +void CNaiveBayes::SClass::debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const +{ + core::CMemoryDebug::dynamicSize("s_ConditionalDensities", s_ConditionalDensities, mem); +} + +std::size_t CNaiveBayes::SClass::memoryUsage() const +{ + return core::CMemory::dynamicSize(s_ConditionalDensities); +} + +uint64_t CNaiveBayes::SClass::checksum(uint64_t seed) const +{ + seed = CChecksum::calculate(seed, s_Count); + return CChecksum::calculate(seed, s_ConditionalDensities); +} + +} +} diff --git a/lib/maths/CTimeSeriesChangeDetector.cc b/lib/maths/CTimeSeriesChangeDetector.cc new file mode 100644 index 0000000000..5d36e5893c --- /dev/null +++ b/lib/maths/CTimeSeriesChangeDetector.cc @@ -0,0 +1,518 @@ +/* + * ELASTICSEARCH CONFIDENTIAL + * + * Copyright (c) 2018 Elasticsearch BV. All Rights Reserved. + * + * Notice: this software, and all information contained + * therein, is the exclusive property of Elasticsearch BV + * and its licensors, if any, and is protected under applicable + * domestic and foreign law, and international treaties. + * + * Reproduction, republication or distribution without the + * express written consent of Elasticsearch BV is + * strictly prohibited. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace ml +{ +namespace maths +{ +using namespace time_series_change_detector_detail; + +namespace +{ +using TDouble1Vec = core::CSmallVector; +using TDouble4Vec = core::CSmallVector; +using TDouble4Vec1Vec = core::CSmallVector; +using TOptionalChangeDescription = CUnivariateTimeSeriesChangeDetector::TOptionalChangeDescription; + +const std::string SAMPLE_COUNT_TAG{"a"}; +const std::string MIN_TIME_TAG{"b"}; +const std::string MAX_TIME_TAG{"c"}; +const std::string CHANGE_MODEL_TAG{"d"}; +const std::string LOG_LIKELIHOOD_TAG{"e"}; +const std::string SHIFT_TAG{"f"}; +const std::string RESIDUAL_MODEL_TAG{"g"}; +} + +SChangeDescription::SChangeDescription(EDescription description, double value) : + s_Description{description}, s_Value{value} +{} + +CUnivariateTimeSeriesChangeDetector::CUnivariateTimeSeriesChangeDetector(const CTimeSeriesDecompositionInterface &trendModel, + const TPriorPtr &residualModel, + core_t::TTime minimumTimeToDetect, + core_t::TTime maximumTimeToDetect, + double minimumDeltaBicToDetect) : + m_MinimumTimeToDetect{minimumTimeToDetect}, + m_MaximumTimeToDetect{maximumTimeToDetect}, + m_MinimumDeltaBicToDetect{minimumDeltaBicToDetect}, + m_SampleCount{0}, + m_CurrentEvidenceOfChange{0.0}, + m_ChangeModels{boost::make_shared(trendModel, residualModel), + boost::make_shared(trendModel, residualModel), + boost::make_shared(trendModel, residualModel, -core::constants::HOUR), + boost::make_shared(trendModel, residualModel, +core::constants::HOUR)} +{} + +bool CUnivariateTimeSeriesChangeDetector::acceptRestoreTraverser(const SDistributionRestoreParams ¶ms, + core::CStateRestoreTraverser &traverser) +{ + auto model = m_ChangeModels.begin(); + do + { + const std::string name{traverser.name()}; + RESTORE_BUILT_IN(SAMPLE_COUNT_TAG, m_SampleCount) + RESTORE_SETUP_TEARDOWN(MIN_TIME_TAG, + core_t::TTime time, + core::CStringUtils::stringToType(traverser.value(), time), + m_TimeRange.add(time)) + RESTORE_SETUP_TEARDOWN(MAX_TIME_TAG, + core_t::TTime time, + core::CStringUtils::stringToType(traverser.value(), time), + m_TimeRange.add(time)) + RESTORE(CHANGE_MODEL_TAG, traverser.traverseSubLevel(boost::bind( + &CUnivariateTimeSeriesChangeModel::acceptRestoreTraverser, + (model++)->get(), boost::cref(params), _1))) + } + while (traverser.next()); + return true; +} + +void CUnivariateTimeSeriesChangeDetector::acceptPersistInserter(core::CStatePersistInserter &inserter) const +{ + inserter.insertValue(SAMPLE_COUNT_TAG, m_SampleCount); + inserter.insertValue(MIN_TIME_TAG, m_TimeRange.min()); + inserter.insertValue(MAX_TIME_TAG, m_TimeRange.max()); + for (const auto &model : m_ChangeModels) + { + inserter.insertLevel(CHANGE_MODEL_TAG, + boost::bind(&CUnivariateTimeSeriesChangeModel::acceptPersistInserter, + model.get(), _1)); + } +} + +TOptionalChangeDescription CUnivariateTimeSeriesChangeDetector::change() +{ + using TChangeModelPtr4VecCItr = TChangeModelPtr4Vec::const_iterator; + using TDoubleChangeModelPtr4VecCItrPr = std::pair; + using TMinAccumulator = CBasicStatistics::COrderStatisticsStack; + + if (m_TimeRange.range() > m_MinimumTimeToDetect) + { + double noChangeBic{m_ChangeModels[0]->bic()}; + TMinAccumulator candidates; + for (auto i = m_ChangeModels.begin() + 1; i != m_ChangeModels.end(); ++i) + { + candidates.add({(*i)->bic(), i}); + } + candidates.sort(); + + double 
evidences[]{noChangeBic - candidates[0].first, + noChangeBic - candidates[1].first}; + m_CurrentEvidenceOfChange = evidences[0]; + if ( evidences[0] > m_MinimumDeltaBicToDetect + && evidences[0] > evidences[1] + m_MinimumDeltaBicToDetect / 2.0) + { + return (*candidates[0].second)->change(); + } + } + return TOptionalChangeDescription(); +} + +bool CUnivariateTimeSeriesChangeDetector::stopTesting() const +{ + core_t::TTime range{m_TimeRange.range()}; + if (range > m_MinimumTimeToDetect) + { + double scale{0.5 + CTools::smoothHeaviside(2.0 * m_CurrentEvidenceOfChange + / m_MinimumDeltaBicToDetect, 0.2)}; + return static_cast(range) + > m_MinimumTimeToDetect + scale * static_cast( + m_MaximumTimeToDetect - m_MinimumTimeToDetect); + } + return false; +} +void CUnivariateTimeSeriesChangeDetector::addSamples(maths_t::EDataType dataType, + const TWeightStyleVec &weightStyles, + const TTimeDoublePr1Vec &samples, + const TDouble4Vec1Vec &weights, + double propagationInterval) +{ + for (const auto &sample : samples) + { + m_TimeRange.add(sample.first); + } + + ++m_SampleCount; + + for (auto &model : m_ChangeModels) + { + model->addSamples(m_SampleCount, dataType, + weightStyles, samples, weights, + propagationInterval); + } +} + +void CUnivariateTimeSeriesChangeDetector::debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const +{ + core::CMemoryDebug::dynamicSize("m_ChangeModels", m_ChangeModels, mem); +} + +std::size_t CUnivariateTimeSeriesChangeDetector::memoryUsage() const +{ + return core::CMemory::dynamicSize(m_ChangeModels); +} + +uint64_t CUnivariateTimeSeriesChangeDetector::checksum(uint64_t seed) const +{ + seed = CChecksum::calculate(seed, m_TimeRange); + seed = CChecksum::calculate(seed, m_SampleCount); + return CChecksum::calculate(seed, m_ChangeModels); +} + +namespace time_series_change_detector_detail +{ + +CUnivariateTimeSeriesChangeModel::CUnivariateTimeSeriesChangeModel(const CTimeSeriesDecompositionInterface &trendModel) : + m_LogLikelihood{0.0}, m_TrendModel{trendModel} +{} + +double CUnivariateTimeSeriesChangeModel::logLikelihood() const +{ + return m_LogLikelihood; +} + +void CUnivariateTimeSeriesChangeModel::addLogLikelihood(double logLikelihood) +{ + m_LogLikelihood += logLikelihood; +} + +const CTimeSeriesDecompositionInterface &CUnivariateTimeSeriesChangeModel::trendModel() const +{ + return m_TrendModel; +} + +CUnivariateNoChangeModel::CUnivariateNoChangeModel(const CTimeSeriesDecompositionInterface &trendModel, + const TPriorPtr &residualModel) : + CUnivariateTimeSeriesChangeModel{trendModel}, + m_ResidualModel{residualModel} +{} + +bool CUnivariateNoChangeModel::acceptRestoreTraverser(const SDistributionRestoreParams &/*params*/, + core::CStateRestoreTraverser &traverser) +{ + do + { + const std::string name{traverser.name()}; + RESTORE_SETUP_TEARDOWN(LOG_LIKELIHOOD_TAG, + double logLikelihood, + core::CStringUtils::stringToType(traverser.value(), logLikelihood), + this->addLogLikelihood(logLikelihood)) + } + while (traverser.next()); + return true; +} + +void CUnivariateNoChangeModel::acceptPersistInserter(core::CStatePersistInserter &inserter) const +{ + inserter.insertValue(LOG_LIKELIHOOD_TAG, this->logLikelihood()); +} + +double CUnivariateNoChangeModel::bic() const +{ + return -2.0 * this->logLikelihood(); +} + +TOptionalChangeDescription CUnivariateNoChangeModel::change() const +{ + return TOptionalChangeDescription(); +} + +void CUnivariateNoChangeModel::addSamples(std::size_t count, + maths_t::EDataType /*dataType*/, + const TWeightStyleVec &weightStyles, 
+ const TTimeDoublePr1Vec &samples_, + const TDouble4Vec1Vec &weights, + double /*propagationInterval*/) +{ + TDouble1Vec samples; + samples.reserve(samples_.size()); + for (const auto &sample : samples_) + { + samples.push_back(this->trendModel().detrend(sample.first, sample.second, 0.0)); + } + + // See CUnivariateTimeSeriesLevelShiftModel for an explanation + // of the delay updating the log-likelihood. + + double logLikelihood; + if (count >= 5 && m_ResidualModel->jointLogMarginalLikelihood( + weightStyles, samples, weights, + logLikelihood) == maths_t::E_FpNoErrors) + { + this->addLogLikelihood(logLikelihood); + } +} + +void CUnivariateNoChangeModel::debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr /*mem*/) const +{ +} + +std::size_t CUnivariateNoChangeModel::staticSize() const +{ + return sizeof(*this); +} + +std::size_t CUnivariateNoChangeModel::memoryUsage() const +{ + return 0; +} + +uint64_t CUnivariateNoChangeModel::checksum(uint64_t seed) const +{ + seed = CChecksum::calculate(seed, this->logLikelihood()); + seed = CChecksum::calculate(seed, this->trendModel()); + return CChecksum::calculate(seed, m_ResidualModel); +} + +CUnivariateTimeSeriesLevelShiftModel::CUnivariateTimeSeriesLevelShiftModel(const CTimeSeriesDecompositionInterface &trendModel, + const TPriorPtr &residualModel) : + CUnivariateTimeSeriesChangeModel{trendModel}, + m_SampleCount{0.0}, + m_ResidualModel{residualModel->clone()}, + m_ResidualModelMode{residualModel->marginalLikelihoodMode()} +{} + +bool CUnivariateTimeSeriesLevelShiftModel::acceptRestoreTraverser(const SDistributionRestoreParams ¶ms, + core::CStateRestoreTraverser &traverser) +{ + do + { + const std::string name{traverser.name()}; + RESTORE_SETUP_TEARDOWN(LOG_LIKELIHOOD_TAG, + double logLikelihood, + core::CStringUtils::stringToType(traverser.value(), logLikelihood), + this->addLogLikelihood(logLikelihood)) + RESTORE(SHIFT_TAG, m_Shift.fromDelimited(traverser.value())) + RESTORE_BUILT_IN(SAMPLE_COUNT_TAG, m_SampleCount) + RESTORE(RESIDUAL_MODEL_TAG, traverser.traverseSubLevel( + boost::bind(CPriorStateSerialiser(), + boost::cref(params), + boost::ref(m_ResidualModel), _1))) + } + while (traverser.next()); + return true; +} + +void CUnivariateTimeSeriesLevelShiftModel::acceptPersistInserter(core::CStatePersistInserter &inserter) const +{ + inserter.insertValue(LOG_LIKELIHOOD_TAG, this->logLikelihood()); + inserter.insertValue(SHIFT_TAG, m_Shift.toDelimited()); + inserter.insertValue(SAMPLE_COUNT_TAG, m_SampleCount); + inserter.insertLevel(RESIDUAL_MODEL_TAG, boost::bind(CPriorStateSerialiser(), + boost::cref(*m_ResidualModel), _1)); +} + +double CUnivariateTimeSeriesLevelShiftModel::bic() const +{ + return -2.0 * this->logLikelihood() + std::log(m_SampleCount); +} + +TOptionalChangeDescription CUnivariateTimeSeriesLevelShiftModel::change() const +{ + return SChangeDescription{SChangeDescription::E_LevelShift, CBasicStatistics::mean(m_Shift)}; +} + +void CUnivariateTimeSeriesLevelShiftModel::addSamples(std::size_t count, + maths_t::EDataType dataType, + const TWeightStyleVec &weightStyles, + const TTimeDoublePr1Vec &samples_, + const TDouble4Vec1Vec &weights, + double propagationInterval) +{ + TDouble1Vec samples; + samples.reserve(samples_.size()); + for (const auto &sample : samples_) + { + double x{this->trendModel().detrend(sample.first, sample.second, 0.0)}; + samples.push_back(x); + m_Shift.add(x - m_ResidualModelMode); + } + for (auto &sample : samples) + { + sample -= CBasicStatistics::mean(m_Shift); + } + for (const auto &weight : 
weights) + { + m_SampleCount += maths_t::count(weightStyles, weight); + } + + m_ResidualModel->dataType(dataType); + m_ResidualModel->addSamples(weightStyles, samples, weights); + m_ResidualModel->propagateForwardsByTime(propagationInterval); + + // We delay updating the log-likelihood because early on the + // level can change giving us a better apparent fit to the + // data than a fixed step. Five updates was found to be the + // minimum to get empirically similar sum log-likelihood if + // there is no shift in the data. + + double logLikelihood; + if (count >= 5 && m_ResidualModel->jointLogMarginalLikelihood( + weightStyles, samples, weights, + logLikelihood) == maths_t::E_FpNoErrors) + { + this->addLogLikelihood(logLikelihood); + } +} + +void CUnivariateTimeSeriesLevelShiftModel::debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const +{ + core::CMemoryDebug::dynamicSize("m_ResidualModel", m_ResidualModel, mem); +} + +std::size_t CUnivariateTimeSeriesLevelShiftModel::staticSize() const +{ + return sizeof(*this); +} + +std::size_t CUnivariateTimeSeriesLevelShiftModel::memoryUsage() const +{ + return core::CMemory::dynamicSize(m_ResidualModel); +} + +uint64_t CUnivariateTimeSeriesLevelShiftModel::checksum(uint64_t seed) const +{ + seed = CChecksum::calculate(seed, this->logLikelihood()); + seed = CChecksum::calculate(seed, this->trendModel()); + seed = CChecksum::calculate(seed, m_Shift); + seed = CChecksum::calculate(seed, m_SampleCount); + return CChecksum::calculate(seed, m_ResidualModel); +} + +CUnivariateTimeSeriesTimeShiftModel::CUnivariateTimeSeriesTimeShiftModel(const CTimeSeriesDecompositionInterface &trendModel, + const TPriorPtr &residualModel, + core_t::TTime shift) : + CUnivariateTimeSeriesChangeModel{trendModel}, + m_Shift{shift}, + m_ResidualModel{residualModel->clone()} +{} + +bool CUnivariateTimeSeriesTimeShiftModel::acceptRestoreTraverser(const SDistributionRestoreParams ¶ms, + core::CStateRestoreTraverser &traverser) +{ + do + { + const std::string name{traverser.name()}; + RESTORE_SETUP_TEARDOWN(LOG_LIKELIHOOD_TAG, + double logLikelihood, + core::CStringUtils::stringToType(traverser.value(), logLikelihood), + this->addLogLikelihood(logLikelihood)) + RESTORE(RESIDUAL_MODEL_TAG, traverser.traverseSubLevel( + boost::bind(CPriorStateSerialiser(), + boost::cref(params), + boost::ref(m_ResidualModel), _1))) + } + while (traverser.next()); + return true; +} + +void CUnivariateTimeSeriesTimeShiftModel::acceptPersistInserter(core::CStatePersistInserter &inserter) const +{ + inserter.insertValue(LOG_LIKELIHOOD_TAG, this->logLikelihood()); + inserter.insertLevel(RESIDUAL_MODEL_TAG, boost::bind(CPriorStateSerialiser(), + boost::cref(*m_ResidualModel), _1)); +} + +double CUnivariateTimeSeriesTimeShiftModel::bic() const +{ + return -2.0 * this->logLikelihood(); +} + +TOptionalChangeDescription CUnivariateTimeSeriesTimeShiftModel::change() const +{ + return SChangeDescription{SChangeDescription::E_TimeShift, static_cast(m_Shift)}; +} + +void CUnivariateTimeSeriesTimeShiftModel::addSamples(std::size_t count, + maths_t::EDataType dataType, + const TWeightStyleVec &weightStyles, + const TTimeDoublePr1Vec &samples_, + const TDouble4Vec1Vec &weights, + double propagationInterval) +{ + TDouble1Vec samples; + samples.reserve(samples_.size()); + for (const auto &sample : samples_) + { + samples.push_back(this->trendModel().detrend(sample.first + m_Shift, sample.second, 0.0)); + } + + m_ResidualModel->dataType(dataType); + m_ResidualModel->addSamples(weightStyles, samples, weights); 
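+    // Note: the candidate time shift has already been applied when detrending
+    // above (samples are detrended at sample.first + m_Shift), so if the series
+    // really has shifted in time by m_Shift these residuals should fit
+    // m_ResidualModel well, the accumulated log-likelihood grows and this
+    // model's BIC falls relative to the no-change model.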
+ m_ResidualModel->propagateForwardsByTime(propagationInterval); + + // See CUnivariateTimeSeriesLevelShiftModel for an explanation + // of the delay updating the log-likelihood. + + double logLikelihood; + if (count >= 5 && m_ResidualModel->jointLogMarginalLikelihood( + weightStyles, samples, weights, + logLikelihood) == maths_t::E_FpNoErrors) + { + this->addLogLikelihood(logLikelihood); + } +} + +void CUnivariateTimeSeriesTimeShiftModel::debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const +{ + core::CMemoryDebug::dynamicSize("m_ResidualModel", m_ResidualModel, mem); +} + +std::size_t CUnivariateTimeSeriesTimeShiftModel::staticSize() const +{ + return sizeof(*this); +} + +std::size_t CUnivariateTimeSeriesTimeShiftModel::memoryUsage() const +{ + return core::CMemory::dynamicSize(m_ResidualModel); +} + +uint64_t CUnivariateTimeSeriesTimeShiftModel::checksum(uint64_t seed) const +{ + seed = CChecksum::calculate(seed, this->logLikelihood()); + seed = CChecksum::calculate(seed, this->trendModel()); + seed = CChecksum::calculate(seed, m_Shift); + return CChecksum::calculate(seed, m_ResidualModel); +} + +} + +} +} diff --git a/lib/maths/CTimeSeriesModel.cc b/lib/maths/CTimeSeriesModel.cc index a1f9932835..18ceb9bd85 100644 --- a/lib/maths/CTimeSeriesModel.cc +++ b/lib/maths/CTimeSeriesModel.cc @@ -169,7 +169,6 @@ double computeWinsorisationWeight(const CMultivariatePrior &prior, } // Models - // Version 6.3 const std::string VERSION_6_3_TAG("6.3"); const std::string ID_6_3_TAG{"a"}; diff --git a/lib/maths/CTrendComponent.cc b/lib/maths/CTrendComponent.cc index 0e26b8ef77..f0daec60d9 100644 --- a/lib/maths/CTrendComponent.cc +++ b/lib/maths/CTrendComponent.cc @@ -342,7 +342,6 @@ void CTrendComponent::forecast(core_t::TTime startTime, endTime = startTime + CIntegerTools::ceil(endTime - startTime, step); - core_t::TTime steps{(endTime - startTime) / step}; result.resize(steps, TDouble3Vec(3)); @@ -364,6 +363,7 @@ void CTrendComponent::forecast(core_t::TTime startTime, + CBasicStatistics::variance(m_Models[i].s_ResidualMoments); LOG_TRACE("params = " << core::CContainerPrinter::print(models[i])); LOG_TRACE("covariances = " << modelCovariances[i].toDelimited()) + LOG_TRACE("variances = " << residualVariances[i]); } LOG_TRACE("long time variance = " << CBasicStatistics::variance(m_ValueMoments)); diff --git a/lib/maths/Makefile b/lib/maths/Makefile index fad5cde0ae..4a0407d50a 100644 --- a/lib/maths/Makefile +++ b/lib/maths/Makefile @@ -69,6 +69,7 @@ CMultivariateNormalConjugateFactory.cc \ CMultivariateOneOfNPrior.cc \ CMultivariateOneOfNPriorFactory.cc \ CMultivariatePrior.cc \ +CNaiveBayes.cc \ CNaturalBreaksClassifier.cc \ CNormalMeanPrecConjugate.cc \ COneOfNPrior.cc \ @@ -94,6 +95,7 @@ CSeasonalTime.cc \ CSignal.cc \ CSpline.cc \ CStatisticalTests.cc \ +CTimeSeriesChangeDetector.cc \ CTimeSeriesDecomposition.cc \ CTimeSeriesDecompositionDetail.cc \ CTimeSeriesDecompositionStateSerialiser.cc \ diff --git a/lib/maths/unittest/CNaiveBayesTest.cc b/lib/maths/unittest/CNaiveBayesTest.cc new file mode 100644 index 0000000000..39c4325ed5 --- /dev/null +++ b/lib/maths/unittest/CNaiveBayesTest.cc @@ -0,0 +1,383 @@ +/* + * ELASTICSEARCH CONFIDENTIAL + * + * Copyright (c) 2018 Elasticsearch BV. All Rights Reserved. + * + * Notice: this software, and all information contained + * therein, is the exclusive property of Elasticsearch BV + * and its licensors, if any, and is protected under applicable + * domestic and foreign law, and international treaties. 
+ * + * Reproduction, republication or distribution without the + * express written consent of Elasticsearch BV is + * strictly prohibited. + */ + +#include "CNaiveBayesTest.h" + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include + +using namespace ml; + +using TDoubleVec = std::vector; +using TDouble1Vec = core::CSmallVector; +using TDouble1VecVec = std::vector; +using TDoubleSizePr = std::pair; +using TDoubleSizePrVec = std::vector; +using TMeanAccumulator = maths::CBasicStatistics::SSampleMean::TAccumulator; +using TMeanVarAccumulator = maths::CBasicStatistics::SSampleMeanVar::TAccumulator; + +void CNaiveBayesTest::testClassification() +{ + LOG_DEBUG("+---------------------------------------+"); + LOG_DEBUG("| CNaiveBayesTest::testClassification |"); + LOG_DEBUG("+---------------------------------------+"); + + // We'll test classification using Gaussian naive Bayes. We + // test: + // - We get the probabilities we expect using if the underlying + // classes are consistent with the assumptions, + // - Test with missing data + + // We test two features with true density + // - x(1) ~ N(0,12) | C(1), + // - x(2) ~ N(10,16) | C(1), + // - x(1) ~ N(3,14) | C(2), + // - x(2) ~ N(-5,24) | C(2) + + test::CRandomNumbers rng; + + TDoubleVec trainingData[4]; + rng.generateNormalSamples( 0.0, 12.0, 100, trainingData[0]); + rng.generateNormalSamples(10.0, 16.0, 100, trainingData[1]); + rng.generateNormalSamples( 3.0, 14.0, 200, trainingData[2]); + rng.generateNormalSamples(-5.0, 24.0, 200, trainingData[3]); + + TMeanAccumulator meanMeanError; + + for (auto initialCount : {0.0, 100.0}) + { + maths::CNormalMeanPrecConjugate normal{ + maths::CNormalMeanPrecConjugate::nonInformativePrior(maths_t::E_ContinuousData)}; + maths::CNaiveBayes nb{maths::CNaiveBayesFeatureDensityFromPrior(normal)}; + + if (initialCount > 0) + { + nb.initialClassCounts({{initialCount, 1}, {initialCount, 2}}); + } + + for (std::size_t i = 0u; i < 100; ++i) + { + nb.addTrainingDataPoint(1, {{trainingData[0][i]}, {trainingData[1][i]}}); + } + for (std::size_t i = 0u; i < 200; ++i) + { + nb.addTrainingDataPoint(2, {{trainingData[2][i]}, {trainingData[3][i]}}); + } + + TMeanVarAccumulator moments[4]; + moments[0].add(trainingData[0]); + moments[1].add(trainingData[1]); + moments[2].add(trainingData[2]); + moments[3].add(trainingData[3]); + + // The training data sizes are 100 and 200 so we expect the + // class probabilities to be: + // - P(1) = (initialCount + 100) / (2*initialCount + 300) + // - P(2) = (initialCount + 200) / (2*initialCount + 300) + + TDoubleSizePrVec probabilities(nb.highestClassProbabilities(2, {{}, {}})); + + double P1{(initialCount + 100.0) / (2.0 * initialCount + 300.0)}; + double P2{(initialCount + 200.0) / (2.0 * initialCount + 300.0)}; + + CPPUNIT_ASSERT_EQUAL(std::size_t(2), probabilities.size()); + CPPUNIT_ASSERT_DOUBLES_EQUAL(P1, probabilities[1].first, 1e-5); + CPPUNIT_ASSERT_EQUAL(std::size_t(1), probabilities[1].second); + CPPUNIT_ASSERT_DOUBLES_EQUAL(P2, probabilities[0].first, 1e-5); + CPPUNIT_ASSERT_EQUAL(std::size_t(2), probabilities[0].second); + + // If we supply feature values we should approximately + // get these modulated by the product of the true density + // ratios for those feature values. 
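+        // Concretely, the posterior we expect below is
+        //   P(C(k) | x) proportional to P(C(k)) * N(x(1); m(k,1), v(k,1)) * N(x(2); m(k,2), v(k,2))
+        // normalised over the two classes, with the class means and variances
+        // taken from the empirical moments of the training data.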
+ + boost::math::normal class1[]{ + boost::math::normal{maths::CBasicStatistics::mean(moments[0]), + std::sqrt(maths::CBasicStatistics::variance(moments[0]))}, + boost::math::normal{maths::CBasicStatistics::mean(moments[1]), + std::sqrt(maths::CBasicStatistics::variance(moments[1]))}}; + boost::math::normal class2[]{ + boost::math::normal{maths::CBasicStatistics::mean(moments[2]), + std::sqrt(maths::CBasicStatistics::variance(moments[2]))}, + boost::math::normal{maths::CBasicStatistics::mean(moments[3]), + std::sqrt(maths::CBasicStatistics::variance(moments[3]))}}; + + TDoubleVec xtest; + rng.generateNormalSamples(0.0, 64.0, 40, xtest); + + TMeanAccumulator meanErrors[3]; + + for (std::size_t i = 0u; i < xtest.size(); i += 2) + { + auto test = [i](double p1, double p2, const TDoubleSizePrVec &p, TMeanAccumulator &meanError) + { + double Z{p1 + p2}; + p1 /= Z; + p2 /= Z; + double p1_{p[0].second == 1 ? p[0].first : p[1].first}; + double p2_{p[0].second == 1 ? p[1].first : p[0].first}; + + if (i % 10 == 0) + { + LOG_DEBUG(i << ") expected P(1) = " << p1 << ", P(2) = " << p2 + << " got P(1) = " << p1_ << ", P(2) = " << p2_); + } + + CPPUNIT_ASSERT_EQUAL(std::size_t(2), p.size()); + CPPUNIT_ASSERT_DOUBLES_EQUAL(p1, p1_, 0.03); + CPPUNIT_ASSERT_DOUBLES_EQUAL(p2, p2_, 0.03); + if (p1 > 0.001) + { + meanError.add(std::fabs((p1 - p1_) / p1)); + } + if (p2 > 0.001) + { + meanError.add(std::fabs((p2 - p2_) / p2)); + } + }; + + // Supply both feature values. + double p1{P1 * maths::CTools::safePdf(class1[0], xtest[i]) + * maths::CTools::safePdf(class1[1], xtest[i+1])}; + double p2{P2 * maths::CTools::safePdf(class2[0], xtest[i]) + * maths::CTools::safePdf(class2[1], xtest[i+1])}; + probabilities = nb.highestClassProbabilities(2, {{xtest[i]}, {xtest[i+1]}}); + test(p1, p2, probabilities, meanErrors[0]); + + // Miss out the first feature value. + p1 = P1 * maths::CTools::safePdf(class1[1], xtest[i+1]); + p2 = P2 * maths::CTools::safePdf(class2[1], xtest[i+1]); + probabilities = nb.highestClassProbabilities(2, {{}, {xtest[i+1]}}); + test(p1, p2, probabilities, meanErrors[1]); + + // Miss out the second feature value. + p1 = P1 * maths::CTools::safePdf(class1[0], xtest[i]); + p2 = P2 * maths::CTools::safePdf(class2[0], xtest[i]); + probabilities = nb.highestClassProbabilities(2, {{xtest[i]}, {}}); + test(p1, p2, probabilities, meanErrors[2]); + } + + for (std::size_t i = 0u; i < 3; ++i) + { + LOG_DEBUG("Mean relative error = " + << maths::CBasicStatistics::mean(meanErrors[i])); + CPPUNIT_ASSERT(maths::CBasicStatistics::mean(meanErrors[i]) < 0.05); + meanMeanError += meanErrors[i]; + } + } +} + +void CNaiveBayesTest::testPropagationByTime() +{ + LOG_DEBUG("+------------------------------------------+"); + LOG_DEBUG("| CNaiveBayesTest::testPropagationByTime |"); + LOG_DEBUG("+------------------------------------------+"); + + // Make feature distributions drift over time and verify that + // the classifier adapts. 
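+    // For reference: CNaiveBayes::propagateForwardsByTime(time) scales the class
+    // counts by exp(-decayRate * time) and ages the per-feature priors, so with
+    // the decay rate of 0.05 used below nb[0] gradually forgets the early data
+    // and tracks the drifting class means, whereas nb[1] is never aged and stays
+    // anchored to where the classes started.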
+ + test::CRandomNumbers rng; + + maths::CNormalMeanPrecConjugate normal{ + maths::CNormalMeanPrecConjugate::nonInformativePrior(maths_t::E_ContinuousData, 0.05)}; + maths::CNaiveBayes nb[]{ + maths::CNaiveBayes{maths::CNaiveBayesFeatureDensityFromPrior(normal), 0.05}, + maths::CNaiveBayes{maths::CNaiveBayesFeatureDensityFromPrior(normal), 0.05}}; + + TDoubleVec trainingData[4]; + for (std::size_t i = 0u; i < 1000; ++i) + { + double x{static_cast(i)}; + rng.generateNormalSamples( 0.02 * x - 14.0, 16.0, 1, trainingData[0]); + rng.generateNormalSamples( 0.02 * x - 14.0, 16.0, 1, trainingData[1]); + rng.generateNormalSamples(-0.02 * x + 14.0, 16.0, 1, trainingData[2]); + rng.generateNormalSamples(-0.02 * x + 14.0, 16.0, 1, trainingData[3]); + + nb[0].addTrainingDataPoint(1, {{trainingData[0][0]}, {trainingData[1][0]}}); + nb[0].addTrainingDataPoint(2, {{trainingData[2][0]}, {trainingData[3][0]}}); + nb[0].propagateForwardsByTime(1.0); + + nb[1].addTrainingDataPoint(1, {{trainingData[0][0]}, {trainingData[1][0]}}); + nb[1].addTrainingDataPoint(2, {{trainingData[2][0]}, {trainingData[3][0]}}); +} + + // Check that the value: + // - (-10,-10) gets assigned to class 2 + // - ( 10, 10) gets assigned to class 1 + // for the aged classifier and vice versa. + + { + TDoubleSizePrVec probabilities[]{ + nb[0].highestClassProbabilities(2, {{-10.0}, {-10.0}}), + nb[1].highestClassProbabilities(2, {{-10.0}, {-10.0}})}; + LOG_DEBUG("Aged class probabilities = " + << core::CContainerPrinter::print(probabilities[0])); + LOG_DEBUG("Class probabilities = " + << core::CContainerPrinter::print(probabilities[1])); + CPPUNIT_ASSERT_EQUAL(std::size_t(2), probabilities[0][0].second); + CPPUNIT_ASSERT(probabilities[0][0].first > 0.99); + CPPUNIT_ASSERT_EQUAL(std::size_t(1), probabilities[1][0].second); + CPPUNIT_ASSERT(probabilities[1][0].first > 0.95); + } + { + TDoubleSizePrVec probabilities[]{ + nb[0].highestClassProbabilities(2, {{10.0}, {10.0}}), + nb[1].highestClassProbabilities(2, {{10.0}, {10.0}})}; + LOG_DEBUG("Aged class probabilities = " + << core::CContainerPrinter::print(probabilities[0])); + LOG_DEBUG("Class probabilities = " + << core::CContainerPrinter::print(probabilities[1])); + CPPUNIT_ASSERT_EQUAL(std::size_t(1), probabilities[0][0].second); + CPPUNIT_ASSERT(probabilities[0][0].first > 0.99); + CPPUNIT_ASSERT_EQUAL(std::size_t(2), probabilities[1][0].second); + CPPUNIT_ASSERT(probabilities[1][0].first > 0.95); + } +} + +void CNaiveBayesTest::testMemoryUsage() +{ + LOG_DEBUG("+------------------------------------+"); + LOG_DEBUG("| CNaiveBayesTest::testMemoryUsage |"); + LOG_DEBUG("+------------------------------------+"); + + // Check invariants. 
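+    // Specifically: the total reported via debugMemoryUsage should agree with
+    // memoryUsage(), and core::CMemory::dynamicSize of a pointer to the
+    // classifier should equal memoryUsage() plus sizeof(maths::CNaiveBayes).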
+
+    using TMemoryUsagePtr = boost::scoped_ptr<core::CMemoryUsage>;
+    using TNaiveBayesPtr = boost::shared_ptr<maths::CNaiveBayes>;
+
+    test::CRandomNumbers rng;
+
+    TDoubleVec trainingData[4];
+    rng.generateNormalSamples( 0.0, 12.0, 100, trainingData[0]);
+    rng.generateNormalSamples(10.0, 16.0, 100, trainingData[1]);
+    rng.generateNormalSamples( 3.0, 14.0, 200, trainingData[2]);
+    rng.generateNormalSamples(-5.0, 24.0, 200, trainingData[3]);
+
+    maths::CNormalMeanPrecConjugate normal{
+            maths::CNormalMeanPrecConjugate::nonInformativePrior(maths_t::E_ContinuousData, 0.1)};
+    TNaiveBayesPtr nb{new maths::CNaiveBayes{maths::CNaiveBayesFeatureDensityFromPrior(normal), 0.1}};
+
+    for (std::size_t i = 0u; i < 100; ++i)
+    {
+        nb->addTrainingDataPoint(1, {{trainingData[0][i]}, {trainingData[1][i]}});
+    }
+    for (std::size_t i = 0u; i < 200; ++i)
+    {
+        nb->addTrainingDataPoint(2, {{trainingData[2][i]}, {trainingData[3][i]}});
+    }
+
+    std::size_t memoryUsage{nb->memoryUsage()};
+    TMemoryUsagePtr mem{new core::CMemoryUsage};
+    nb->debugMemoryUsage(mem.get());
+
+    LOG_DEBUG("Memory = " << memoryUsage);
+    CPPUNIT_ASSERT_EQUAL(memoryUsage, mem->usage());
+
+    LOG_DEBUG("Memory = " << core::CMemory::dynamicSize(nb));
+    CPPUNIT_ASSERT_EQUAL(memoryUsage + sizeof(maths::CNaiveBayes),
+                         core::CMemory::dynamicSize(nb));
+}
+
+void CNaiveBayesTest::testPersist()
+{
+    LOG_DEBUG("+--------------------------------+");
+    LOG_DEBUG("| CNaiveBayesTest::testPersist |");
+    LOG_DEBUG("+--------------------------------+");
+
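+    // Check that persist then restore is idempotent: the restored
+    // classifier should have the same checksum as the original and,
+    // when re-persisted, should reproduce the original XML exactly.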
+
+    test::CRandomNumbers rng;
+
+    TDoubleVec trainingData[4];
+    rng.generateNormalSamples( 0.0, 12.0, 100, trainingData[0]);
+    rng.generateNormalSamples(10.0, 16.0, 100, trainingData[1]);
+    rng.generateNormalSamples( 3.0, 14.0, 200, trainingData[2]);
+    rng.generateNormalSamples(-5.0, 24.0, 200, trainingData[3]);
+
+    maths::CNormalMeanPrecConjugate normal{
+            maths::CNormalMeanPrecConjugate::nonInformativePrior(maths_t::E_ContinuousData, 0.1)};
+    maths::CNaiveBayes origNb{maths::CNaiveBayesFeatureDensityFromPrior(normal), 0.1};
+
+    for (std::size_t i = 0u; i < 100; ++i)
+    {
+        origNb.addTrainingDataPoint(1, {{trainingData[0][i]}, {trainingData[1][i]}});
+    }
+    for (std::size_t i = 0u; i < 200; ++i)
+    {
+        origNb.addTrainingDataPoint(2, {{trainingData[2][i]}, {trainingData[3][i]}});
+    }
+
+    std::string origXml;
+    {
+        core::CRapidXmlStatePersistInserter inserter("root");
+        origNb.acceptPersistInserter(inserter);
+        inserter.toXml(origXml);
+    }
+
+    LOG_DEBUG("Naive Bayes XML representation:\n" << origXml);
+
+    core::CRapidXmlParser parser;
+    CPPUNIT_ASSERT(parser.parseStringIgnoreCdata(origXml));
+    core::CRapidXmlStateRestoreTraverser traverser(parser);
+
+    maths::SDistributionRestoreParams params{maths_t::E_ContinuousData, 0.1, 0.0, 0.0, 0.0};
+    maths::CNaiveBayes restoredNb{params, traverser};
+
+    CPPUNIT_ASSERT_EQUAL(origNb.checksum(), restoredNb.checksum());
+
+    std::string restoredXml;
+    {
+        core::CRapidXmlStatePersistInserter inserter("root");
+        restoredNb.acceptPersistInserter(inserter);
+        inserter.toXml(restoredXml);
+    }
+    CPPUNIT_ASSERT_EQUAL(origXml, restoredXml);
+}
+
+CppUnit::Test *CNaiveBayesTest::suite()
+{
+    CppUnit::TestSuite *suiteOfTests = new CppUnit::TestSuite("CNaiveBayesTest");
+
+    suiteOfTests->addTest( new CppUnit::TestCaller<CNaiveBayesTest>(
+                                   "CNaiveBayesTest::testClassification",
+                                   &CNaiveBayesTest::testClassification) );
+    suiteOfTests->addTest( new CppUnit::TestCaller<CNaiveBayesTest>(
+                                   "CNaiveBayesTest::testPropagationByTime",
+                                   &CNaiveBayesTest::testPropagationByTime) );
+    suiteOfTests->addTest( new CppUnit::TestCaller<CNaiveBayesTest>(
+                                   "CNaiveBayesTest::testMemoryUsage",
+                                   &CNaiveBayesTest::testMemoryUsage) );
+    suiteOfTests->addTest( new CppUnit::TestCaller<CNaiveBayesTest>(
+                                   "CNaiveBayesTest::testPersist",
+                                   &CNaiveBayesTest::testPersist) );
+
+    return suiteOfTests;
+}
diff --git a/lib/maths/unittest/CNaiveBayesTest.h b/lib/maths/unittest/CNaiveBayesTest.h
new file mode 100644
index 0000000000..5b16454041
--- /dev/null
+++ b/lib/maths/unittest/CNaiveBayesTest.h
@@ -0,0 +1,32 @@
+/*
+ * ELASTICSEARCH CONFIDENTIAL
+ *
+ * Copyright (c) 2018 Elasticsearch BV. All Rights Reserved.
+ *
+ * Notice: this software, and all information contained
+ * therein, is the exclusive property of Elasticsearch BV
+ * and its licensors, if any, and is protected under applicable
+ * domestic and foreign law, and international treaties.
+ *
+ * Reproduction, republication or distribution without the
+ * express written consent of Elasticsearch BV is
+ * strictly prohibited.
+ */
+
+#ifndef INCLUDED_CNaiveBayesTest_h
+#define INCLUDED_CNaiveBayesTest_h
+
+#include
+
+class CNaiveBayesTest : public CppUnit::TestFixture
+{
+    public:
+        void testClassification();
+        void testPropagationByTime();
+        void testMemoryUsage();
+        void testPersist();
+
+        static CppUnit::Test *suite();
+};
+
+#endif // INCLUDED_CNaiveBayesTest_h
diff --git a/lib/maths/unittest/CNaturalBreaksClassifierTest.h b/lib/maths/unittest/CNaturalBreaksClassifierTest.h
index a23a604074..fc5b78117f 100644
--- a/lib/maths/unittest/CNaturalBreaksClassifierTest.h
+++ b/lib/maths/unittest/CNaturalBreaksClassifierTest.h
@@ -18,7 +18,6 @@
 #include
 
 
-
 class CNaturalBreaksClassifierTest : public CppUnit::TestFixture
 {
     public:
diff --git a/lib/maths/unittest/CTimeSeriesChangeDetectorTest.cc b/lib/maths/unittest/CTimeSeriesChangeDetectorTest.cc
new file mode 100644
index 0000000000..0e5d0c5961
--- /dev/null
+++ b/lib/maths/unittest/CTimeSeriesChangeDetectorTest.cc
@@ -0,0 +1,361 @@
+/*
+ * ELASTICSEARCH CONFIDENTIAL
+ *
+ * Copyright (c) 2018 Elasticsearch BV. All Rights Reserved.
+ *
+ * Notice: this software, and all information contained
+ * therein, is the exclusive property of Elasticsearch BV
+ * and its licensors, if any, and is protected under applicable
+ * domestic and foreign law, and international treaties.
+ *
+ * Reproduction, republication or distribution without the
+ * express written consent of Elasticsearch BV is
+ * strictly prohibited.
+ */
+
+#include "CTimeSeriesChangeDetectorTest.h"
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "TestUtils.h"
+
+#include
+
+#include
+#include
+
+using namespace ml;
+
+namespace
+{
+
+using TDoubleVec = std::vector<double>;
+using TDouble2Vec = core::CSmallVector<double, 2>;
+using TPriorPtr = boost::shared_ptr<maths::CPrior>;
+using TPriorPtrVec = std::vector<TPriorPtr>;
+
+const core_t::TTime BUCKET_LENGTH{1800};
+const double DECAY_RATE{0.0002};
+
+TPriorPtr makeResidualModel()
+{
+    maths::CGammaRateConjugate gamma{
+            maths::CGammaRateConjugate::nonInformativePrior(maths_t::E_ContinuousData, 0.1, DECAY_RATE)};
+    maths::CLogNormalMeanPrecConjugate lognormal{
+            maths::CLogNormalMeanPrecConjugate::nonInformativePrior(maths_t::E_ContinuousData, 1.0, DECAY_RATE)};
+    maths::CNormalMeanPrecConjugate normal{
+            maths::CNormalMeanPrecConjugate::nonInformativePrior(maths_t::E_ContinuousData, DECAY_RATE)};
+
+    TPriorPtrVec mode;
+    mode.reserve(3u);
+    mode.emplace_back(gamma.clone());
+    mode.emplace_back(lognormal.clone());
+    mode.emplace_back(normal.clone());
+    maths::COneOfNPrior modePrior{mode, maths_t::E_ContinuousData, DECAY_RATE};
+    maths::CXMeansOnline1d clusterer{maths_t::E_ContinuousData,
+                                     maths::CAvailableModeDistributions::ALL,
+                                     maths_t::E_ClustersFractionWeight,
+                                     DECAY_RATE, 0.05, 12.0, 1.0};
+    maths::CMultimodalPrior multimodal{maths_t::E_ContinuousData, clusterer, modePrior, DECAY_RATE};
+
+    TPriorPtrVec models;
+    models.emplace_back(gamma.clone());
+    models.emplace_back(lognormal.clone());
+    models.emplace_back(normal.clone());
+    models.emplace_back(multimodal.clone());
+
+    return TPriorPtr{maths::COneOfNPrior{models, maths_t::E_ContinuousData, DECAY_RATE}.clone()};
+}
+
+}
+
+void CTimeSeriesChangeDetectorTest::testNoChange()
+{
+    LOG_DEBUG("+-----------------------------------------------+");
+    LOG_DEBUG("| CTimeSeriesChangeDetectorTest::testNoChange |");
+    LOG_DEBUG("+-----------------------------------------------+");
+
+    test::CRandomNumbers rng;
+
+    TDoubleVec variances{1.0, 10.0, 20.0, 30.0, 100.0, 1000.0};
+    TDoubleVec scales{0.1, 1.0, 2.0, 3.0, 5.0, 8.0};
+
+    TDoubleVec samples;
+    for (std::size_t t = 0u; t < 100; ++t)
+    {
+        if (t % 10 == 0)
+        {
+            LOG_DEBUG(t << "%");
+        }
+
+        switch (t % 3)
+        {
+        case 0: rng.generateNormalSamples(10.0, variances[(t/3) % variances.size()], 1000, samples); break;
+        case 1: rng.generateLogNormalSamples(1.0, scales[(t/3) % scales.size()], 1000, samples); break;
+        case 2: rng.generateGammaSamples(10.0, 10.0 * scales[(t/3) % scales.size()], 1000, samples); break;
+        }
+
+        maths::CTimeSeriesDecomposition trendModel(DECAY_RATE, BUCKET_LENGTH);
+        TPriorPtr residualModel(makeResidualModel());
+
+        auto addSampleToModel = [&trendModel, &residualModel](core_t::TTime time, double x)
+            {
+                trendModel.addPoint(time, x);
+                double detrended{trendModel.detrend(time, x, 0.0)};
+                residualModel->addSamples(maths::CConstantWeights::COUNT, {detrended}, {{1.0}});
+                residualModel->propagateForwardsByTime(1.0);
+            };
+
+        core_t::TTime time{0};
+        for (std::size_t i = 0u; i < 950; ++i)
+        {
+            addSampleToModel(time, samples[i]);
+            time += BUCKET_LENGTH;
+        }
+
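+        // Feed the detector the remaining buckets: the data are stationary,
+        // so it should never report a change before it decides to stop
+        // testing. (Reading the 6 hour and 24 hour constructor arguments as
+        // the minimum and maximum times the detector will test for a change,
+        // and 12.0 as the evidence it requires, is an assumption based on
+        // how these tests use the interface.)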
+        maths::CUnivariateTimeSeriesChangeDetector detector{trendModel, residualModel,
+                                                             6 * core::constants::HOUR,
+                                                             24 * core::constants::HOUR,
+                                                             12.0};
+        for (std::size_t i = 950u; i < samples.size(); ++i)
+        {
+            addSampleToModel(time, samples[i]);
+            detector.addSamples(maths_t::E_ContinuousData,
+                                maths::CConstantWeights::COUNT,
+                                {{time, samples[i]}},
+                                {{1.0}});
+            if (detector.stopTesting())
+            {
+                break;
+            }
+
+            CPPUNIT_ASSERT(!detector.change());
+
+            time += BUCKET_LENGTH;
+        }
+    }
+}
+
+void CTimeSeriesChangeDetectorTest::testLevelShift()
+{
+    LOG_DEBUG("+-------------------------------------------------+");
+    LOG_DEBUG("| CTimeSeriesChangeDetectorTest::testLevelShift |");
+    LOG_DEBUG("+-------------------------------------------------+");
+
+    TGeneratorVec trends{constant, ramp, smoothDaily, weekends, spikeyDaily};
+
+    this->testChange(trends,
+                     maths::SChangeDescription::E_LevelShift,
+                     [](TGenerator trend, core_t::TTime time)
+                     {
+                         return trend(time) + 0.5;
+                     }, 5.0, 15.0);
+}
+
+void CTimeSeriesChangeDetectorTest::testTimeShift()
+{
+    LOG_DEBUG("+------------------------------------------------+");
+    LOG_DEBUG("| CTimeSeriesChangeDetectorTest::testTimeShift |");
+    LOG_DEBUG("+------------------------------------------------+");
+
+    TGeneratorVec trends{smoothDaily, spikeyDaily};
+
+    this->testChange(trends,
+                     maths::SChangeDescription::E_TimeShift,
+                     [](TGenerator trend, core_t::TTime time)
+                     {
+                         return trend(time - core::constants::HOUR);
+                     }, -static_cast<double>(core::constants::HOUR), 24.0);
+
+    this->testChange(trends,
+                     maths::SChangeDescription::E_TimeShift,
+                     [](TGenerator trend, core_t::TTime time)
+                     {
+                         return trend(time + core::constants::HOUR);
+                     }, +static_cast<double>(core::constants::HOUR), 24.0);
+}
+
+void CTimeSeriesChangeDetectorTest::testPersist()
+{
+    LOG_DEBUG("+----------------------------------------------+");
+    LOG_DEBUG("| CTimeSeriesChangeDetectorTest::testPersist |");
+    LOG_DEBUG("+----------------------------------------------+");
+
+    test::CRandomNumbers rng;
+
+    TDoubleVec samples;
+    rng.generateNormalSamples(10.0, 10.0, 1000, samples);
+
+    maths::CTimeSeriesDecomposition trendModel(DECAY_RATE, BUCKET_LENGTH);
+    TPriorPtr residualModel(makeResidualModel());
+
+    auto addSampleToModel = [&trendModel, &residualModel](core_t::TTime time, double x)
+        {
+            trendModel.addPoint(time, x);
+            double detrended{trendModel.detrend(time, x, 0.0)};
+            residualModel->addSamples(maths::CConstantWeights::COUNT,
+                                      {detrended},
+                                      maths::CConstantWeights::SINGLE_UNIT);
+            residualModel->propagateForwardsByTime(1.0);
+        };
+
+    core_t::TTime time{0};
+    for (std::size_t i = 0u; i < 990; ++i)
+    {
+        addSampleToModel(time, samples[i]);
+        time += BUCKET_LENGTH;
+    }
+
+    maths::CUnivariateTimeSeriesChangeDetector origDetector{trendModel, residualModel,
+                                                            6 * core::constants::HOUR,
+                                                            24 * core::constants::HOUR,
+                                                            12.0};
+
+    maths::SDistributionRestoreParams params{maths_t::E_ContinuousData,
+                                             DECAY_RATE, 0.05, 12.0, 1.0};
+    for (std::size_t i = 990u; i < samples.size(); ++i)
+    {
+        addSampleToModel(time, samples[i]);
+        std::string origXml;
+        {
+            core::CRapidXmlStatePersistInserter inserter{"root"};
+            origDetector.acceptPersistInserter(inserter);
+            inserter.toXml(origXml);
+        }
+
+        maths::CUnivariateTimeSeriesChangeDetector restoredDetector{trendModel, residualModel,
+                                                                    6 * core::constants::HOUR,
+                                                                    24 * core::constants::HOUR,
+                                                                    12.0};
+        core::CRapidXmlParser parser;
+        CPPUNIT_ASSERT(parser.parseStringIgnoreCdata(origXml));
+        core::CRapidXmlStateRestoreTraverser traverser(parser);
+        traverser.traverseSubLevel(boost::bind(
+                &maths::CUnivariateTimeSeriesChangeDetector::acceptRestoreTraverser,
+                &restoredDetector, boost::cref(params), _1));
+
+        LOG_DEBUG("expected " << origDetector.checksum()
+                  << " got " << restoredDetector.checksum());
+        CPPUNIT_ASSERT_EQUAL(origDetector.checksum(), restoredDetector.checksum());
+
+        time += BUCKET_LENGTH;
+    }
+}
+
+CppUnit::Test *CTimeSeriesChangeDetectorTest::suite()
+{
+    CppUnit::TestSuite *suiteOfTests = new CppUnit::TestSuite("CTimeSeriesChangeDetectorTest");
+
+    suiteOfTests->addTest( new CppUnit::TestCaller<CTimeSeriesChangeDetectorTest>(
+                                   "CTimeSeriesChangeDetectorTest::testNoChange",
+                                   &CTimeSeriesChangeDetectorTest::testNoChange) );
+    suiteOfTests->addTest( new CppUnit::TestCaller<CTimeSeriesChangeDetectorTest>(
+                                   "CTimeSeriesChangeDetectorTest::testLevelShift",
+                                   &CTimeSeriesChangeDetectorTest::testLevelShift) );
+    suiteOfTests->addTest( new CppUnit::TestCaller<CTimeSeriesChangeDetectorTest>(
+                                   "CTimeSeriesChangeDetectorTest::testTimeShift",
+                                   &CTimeSeriesChangeDetectorTest::testTimeShift) );
+    suiteOfTests->addTest( new CppUnit::TestCaller<CTimeSeriesChangeDetectorTest>(
+                                   "CTimeSeriesChangeDetectorTest::testPersist",
+                                   &CTimeSeriesChangeDetectorTest::testPersist) );
+
+    return suiteOfTests;
+}
+
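+// Shared harness for the level and time shift tests: for each of 100 trials
+// warm the models up on 950 buckets of the unchanged trend, apply the change
+// and require the detector to report a change of the expected type and
+// approximate magnitude, with the mean number of buckets needed to detect
+// it below the supplied bound.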
+void CTimeSeriesChangeDetectorTest::testChange(const TGeneratorVec &trends,
+                                               maths::SChangeDescription::EDescription description,
+                                               TChange applyChange,
+                                               double expectedChange,
+                                               double expectedMeanBucketsToDetectChange)
+{
+    using TOptionalSize = boost::optional<std::size_t>;
+    using TMeanAccumulator = maths::CBasicStatistics::SSampleMean<double>::TAccumulator;
+
+    test::CRandomNumbers rng;
+
+    TMeanAccumulator meanBucketsToDetect;
+
+    TDoubleVec samples;
+    for (std::size_t t = 0u; t < 100; ++t)
+    {
+        if (t % 10 == 0)
+        {
+            LOG_DEBUG(t << "%");
+        }
+
+        rng.generateNormalSamples(0.0, 1.0, 1000, samples);
+
+        maths::CTimeSeriesDecomposition trendModel(DECAY_RATE, BUCKET_LENGTH);
+        TPriorPtr residualModel(makeResidualModel());
+
+        auto addSampleToModel = [&trendModel, &residualModel](core_t::TTime time, double x, double weight)
+            {
+                trendModel.addPoint(time, x, maths::CConstantWeights::COUNT, {weight});
+                double detrended{trendModel.detrend(time, x, 0.0)};
+                residualModel->addSamples(maths::CConstantWeights::COUNT, {detrended}, {{weight}});
+                residualModel->propagateForwardsByTime(1.0);
+            };
+
+        core_t::TTime time{0};
+        for (std::size_t i = 0u; i < 950; ++i)
+        {
+            double x{10.0 * trends[t % trends.size()](time) + samples[i]};
+            addSampleToModel(time, x, 1.0);
+            time += BUCKET_LENGTH;
+        }
+
+        maths::CUnivariateTimeSeriesChangeDetector detector{trendModel, residualModel,
+                                                            6 * core::constants::HOUR,
+                                                            24 * core::constants::HOUR,
+                                                            12.0};
+
+        TOptionalSize bucketsToDetect;
+        for (std::size_t i = 950u; i < samples.size(); ++i)
+        {
+            double x{10.0 * applyChange(trends[t % trends.size()], time) + samples[i]};
+
+            addSampleToModel(time, x, 0.5);
+            detector.addSamples(maths_t::E_ContinuousData,
+                                maths::CConstantWeights::COUNT,
+                                {{time, x}}, {{1.0}});
+
+            auto change = detector.change();
+            if (change)
+            {
+                if (!bucketsToDetect)
+                {
+                    bucketsToDetect.reset(i - 949);
+                }
+                CPPUNIT_ASSERT_EQUAL(description, change->s_Description);
+                CPPUNIT_ASSERT_DOUBLES_EQUAL(expectedChange,
+                                             change->s_Value[0],
+                                             0.5 * std::fabs(expectedChange));
+                break;
+            }
+            if (detector.stopTesting())
+            {
+                break;
+            }
+
+            time += BUCKET_LENGTH;
+        }
+        CPPUNIT_ASSERT(bucketsToDetect);
+        meanBucketsToDetect.add(static_cast<double>(*bucketsToDetect));
+    }
+
+    LOG_DEBUG("buckets to detect = " << maths::CBasicStatistics::mean(meanBucketsToDetect));
+    CPPUNIT_ASSERT(maths::CBasicStatistics::mean(meanBucketsToDetect) < expectedMeanBucketsToDetectChange);
+}
diff --git a/lib/maths/unittest/CTimeSeriesChangeDetectorTest.h b/lib/maths/unittest/CTimeSeriesChangeDetectorTest.h
new file mode 100644
index 0000000000..74f6c039b3
--- /dev/null
+++ b/lib/maths/unittest/CTimeSeriesChangeDetectorTest.h
@@ -0,0 +1,48 @@
+/*
+ * ELASTICSEARCH CONFIDENTIAL
+ *
+ * Copyright (c) 2018 Elasticsearch BV. All Rights Reserved.
+ *
+ * Notice: this software, and all information contained
+ * therein, is the exclusive property of Elasticsearch BV
+ * and its licensors, if any, and is protected under applicable
+ * domestic and foreign law, and international treaties.
+ *
+ * Reproduction, republication or distribution without the
+ * express written consent of Elasticsearch BV is
+ * strictly prohibited.
+ */
+
+#ifndef INCLUDED_CTimeSeriesChangeDetectorTest_h
+#define INCLUDED_CTimeSeriesChangeDetectorTest_h
+
+#include
+
+#include
+
+#include
+
+class CTimeSeriesChangeDetectorTest : public CppUnit::TestFixture
+{
+    public:
+        void testNoChange();
+        void testLevelShift();
+        void testTimeShift();
+        void testPersist();
+
+        static CppUnit::Test *suite();
+
+    private:
+        using TGenerator = std::function<double (ml::core_t::TTime)>;
+        using TGeneratorVec = std::vector<TGenerator>;
+        using TChange = std::function<double (TGenerator, ml::core_t::TTime)>;
+
+    private:
+        void testChange(const TGeneratorVec &trends,
+                        ml::maths::SChangeDescription::EDescription description,
+                        TChange applyChange,
+                        double expectedChange,
+                        double expectedMeanBucketsToDetectChange);
+};
+
+#endif // INCLUDED_CTimeSeriesChangeDetectorTest_h
diff --git a/lib/maths/unittest/Main.cc b/lib/maths/unittest/Main.cc
index 0a5347290b..800a6e5dc4 100644
--- a/lib/maths/unittest/Main.cc
+++ b/lib/maths/unittest/Main.cc
@@ -54,6 +54,7 @@
 #include "CMultivariateMultimodalPriorTest.h"
 #include "CMultivariateNormalConjugateTest.h"
 #include "CMultivariateOneOfNPriorTest.h"
+#include "CNaiveBayesTest.h"
 #include "CNaturalBreaksClassifierTest.h"
 #include "CNormalMeanPrecConjugateTest.h"
 #include "COneOfNPriorTest.h"
@@ -79,6 +80,7 @@
 #include "CSolversTest.h"
 #include "CSplineTest.h"
 #include "CStatisticalTestsTest.h"
+#include "CTimeSeriesChangeDetectorTest.h"
 #include "CTimeSeriesDecompositionTest.h"
 #include "CTimeSeriesModelTest.h"
 #include "CToolsTest.h"
@@ -131,6 +133,7 @@ int main(int argc, const char **argv)
     runner.addTest( CMultivariateMultimodalPriorTest::suite() );
     runner.addTest( CMultivariateNormalConjugateTest::suite() );
     runner.addTest( CMultivariateOneOfNPriorTest::suite() );
+    runner.addTest( CNaiveBayesTest::suite() );
     runner.addTest( CNaturalBreaksClassifierTest::suite() );
     runner.addTest( CNormalMeanPrecConjugateTest::suite() );
     runner.addTest( COneOfNPriorTest::suite() );
@@ -159,6 +162,7 @@ int main(int argc, const char **argv)
     runner.addTest( CTimeSeriesDecompositionTest::suite() );
     runner.addTest( CTimeSeriesModelTest::suite() );
     runner.addTest( CToolsTest::suite() );
+    runner.addTest( CTimeSeriesChangeDetectorTest::suite() );
     runner.addTest( CTrendComponentTest::suite() );
     runner.addTest( CTrendTestsTest::suite() );
     runner.addTest( CXMeansTest::suite() );
diff --git a/lib/maths/unittest/Makefile b/lib/maths/unittest/Makefile
index 73323a7a89..7ffe0af477 100644
--- a/lib/maths/unittest/Makefile
+++ b/lib/maths/unittest/Makefile
@@ -65,6 +65,7 @@ SRCS=\
 CMultivariateMultimodalPriorTest.cc \
 CMultivariateNormalConjugateTest.cc \
 CMultivariateOneOfNPriorTest.cc \
+CNaiveBayesTest.cc \
 CNaturalBreaksClassifierTest.cc \
 CNormalMeanPrecConjugateTest.cc \
 COneOfNPriorTest.cc \
@@ -90,6 +91,7 @@ SRCS=\
 CSolversTest.cc \
 CSplineTest.cc \
 CStatisticalTestsTest.cc \
+CTimeSeriesChangeDetectorTest.cc \
 CTimeSeriesDecompositionTest.cc \
 CTimeSeriesModelTest.cc \
 CToolsTest.cc \