Skip to content

Commit

Permalink
[ML] Add new categorization stats to model_size_stats (#992)
Browse files Browse the repository at this point in the history
This change adds support for the following new model_size_stats
fields:

- categorized_doc_count
- total_category_count
- frequent_category_count
- rare_category_count
- dead_category_count
- categorization_status

Backport of #989
  • Loading branch information
droberts195 authored Feb 10, 2020
1 parent 737f51f commit 371b519
Show file tree
Hide file tree
Showing 24 changed files with 393 additions and 199 deletions.
2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ progress, memory usage, etc. (See {ml-pull}906[#906].)

* Improve initialization of learn rate for better and more stable results in regression
and classification. (See {ml-pull}948[#948].)
* Add new model_size_stats fields to instrument categorization. (See {ml-pull}989[#989]
and {pull}51879[#51879], issue: {issue}50794[#50794].)

=== Bug Fixes

Expand Down
6 changes: 3 additions & 3 deletions include/api/CAnomalyJob.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,14 +120,14 @@ class API_EXPORT CAnomalyJob : public CDataProcessor {

struct SBackgroundPersistArgs {
SBackgroundPersistArgs(core_t::TTime time,
const model::CResourceMonitor::SResults& modelSizeStats,
const model::CResourceMonitor::SModelSizeStats& modelSizeStats,
const model::CInterimBucketCorrector& interimBucketCorrector,
const model::CHierarchicalResultsAggregator& aggregator,
core_t::TTime latestRecordTime,
core_t::TTime lastResultsTime);

core_t::TTime s_Time;
model::CResourceMonitor::SResults s_ModelSizeStats;
model::CResourceMonitor::SModelSizeStats s_ModelSizeStats;
model::CInterimBucketCorrector s_InterimBucketCorrector;
model::CHierarchicalResultsAggregator s_Aggregator;
std::string s_NormalizerState;
Expand Down Expand Up @@ -258,7 +258,7 @@ class API_EXPORT CAnomalyJob : public CDataProcessor {
bool persistCopiedState(const std::string& descriptionPrefix,
core_t::TTime time,
const TKeyCRefAnomalyDetectorPtrPrVec& detectors,
const model::CResourceMonitor::SResults& modelSizeStats,
const model::CResourceMonitor::SModelSizeStats& modelSizeStats,
const model::CInterimBucketCorrector& interimBucketCorrector,
const model::CHierarchicalResultsAggregator& aggregator,
const std::string& normalizerState,
Expand Down
2 changes: 1 addition & 1 deletion include/api/CJsonOutputWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ class API_EXPORT CJsonOutputWriter : public COutputHandler {

//! Report the current levels of resource usage, as given to us
//! from the CResourceMonitor via a callback
void reportMemoryUsage(const model::CResourceMonitor::SResults& results);
void reportMemoryUsage(const model::CResourceMonitor::SModelSizeStats& modelSizeStats);

//! Acknowledge a flush request by echoing back the flush ID
void acknowledgeFlush(const std::string& flushId, core_t::TTime lastFinalizedBucketEnd);
Expand Down
2 changes: 1 addition & 1 deletion include/api/CModelSizeStatsJsonWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class API_EXPORT CModelSizeStatsJsonWriter : private core::CNonInstantiatable {
public:
//! Writes the model size stats in the \p results in JSON format.
static void write(const std::string& jobId,
const model::CResourceMonitor::SResults& results,
const model::CResourceMonitor::SModelSizeStats& results,
core::CRapidJsonConcurrentLineWriter& writer);
};
}
Expand Down
2 changes: 1 addition & 1 deletion include/api/CModelSnapshotJsonWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class API_EXPORT CModelSnapshotJsonWriter {
std::string s_Description;
std::string s_SnapshotId;
size_t s_NumDocs;
model::CResourceMonitor::SResults s_ModelSizeStats;
model::CResourceMonitor::SModelSizeStats s_ModelSizeStats;
std::string s_NormalizerState;
core_t::TTime s_LatestRecordTime;
core_t::TTime s_LatestFinalResultTime;
Expand Down
6 changes: 3 additions & 3 deletions include/model/CAnomalyDetector.h
Original file line number Diff line number Diff line change
Expand Up @@ -270,9 +270,9 @@ class MODEL_EXPORT CAnomalyDetector : public CMonitoredResource {
//! Prune the model.
void prune(std::size_t maximumAge) override;

//! Update the overall model memory stats results with stats from this
//! anomaly detector.
void updateMemoryResults(CResourceMonitor::SResults& results) const override;
//! Update the overall model size stats with information from this anomaly
//! detector.
void updateModelSizeStats(CResourceMonitor::SModelSizeStats& modelSizeStats) const override;

//! Get end of the last complete bucket we've observed.
const core_t::TTime& lastBucketEndTime() const;
Expand Down
5 changes: 3 additions & 2 deletions include/model/CMonitoredResource.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,10 @@ class MODEL_EXPORT CMonitoredResource {
//! discarding the least recently seen entities it knows about.
virtual void prune(std::size_t maximumAge);

//! Update the overall model memory stats results with stats from this
//! Update the overall model size stats results with stats from this
//! monitored resource.
virtual void updateMemoryResults(CResourceMonitor::SResults& results) const = 0;
virtual void
updateModelSizeStats(CResourceMonitor::SModelSizeStats& modelSizeStats) const = 0;
};
}
}
Expand Down
33 changes: 20 additions & 13 deletions include/model/CResourceMonitor.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,23 +38,30 @@ class CMonitoredResource;
//! Assess memory used by models and decide on further memory allocations.
class MODEL_EXPORT CResourceMonitor {
public:
struct MODEL_EXPORT SResults {
std::size_t s_Usage;
std::size_t s_AdjustedUsage;
std::size_t s_ByFields;
std::size_t s_PartitionFields;
std::size_t s_OverFields;
std::size_t s_AllocationFailures;
model_t::EMemoryStatus s_MemoryStatus;
core_t::TTime s_BucketStartTime;
std::size_t s_BytesExceeded;
std::size_t s_BytesMemoryLimit;
//! \brief Snapshot of model size statistics handed to the memory usage
//! reporter callback and ultimately serialized as the model_size_stats
//! result document.
//!
//! All members are value-initialized so a default-constructed snapshot
//! is well-defined before any resource has been monitored.
struct MODEL_EXPORT SModelSizeStats {
    //! Memory usage in bytes (serialized as model_bytes).
    std::size_t s_Usage = 0;
    //! Usage after adjustment for reporting — exact scaling is applied by
    //! the resource monitor; see createMemoryUsageReport. TODO confirm.
    std::size_t s_AdjustedUsage = 0;
    //! Count of "by" fields (serialized as total_by_field_count).
    std::size_t s_ByFields = 0;
    //! Count of partition fields (serialized as total_partition_field_count).
    std::size_t s_PartitionFields = 0;
    //! Count of "over" fields (serialized as total_over_field_count).
    std::size_t s_OverFields = 0;
    //! Number of buckets where allocations were refused
    //! (serialized as bucket_allocation_failures_count).
    std::size_t s_AllocationFailures = 0;
    //! Whether the memory limit has been approached or exceeded.
    model_t::EMemoryStatus s_MemoryStatus = model_t::E_MemoryStatusOk;
    //! Start of the bucket this snapshot corresponds to.
    core_t::TTime s_BucketStartTime = 0;
    //! Bytes by which usage exceeded the limit (serialized as
    //! model_bytes_exceeded).
    std::size_t s_BytesExceeded = 0;
    //! Configured memory limit in bytes (serialized as
    //! model_bytes_memory_limit).
    std::size_t s_BytesMemoryLimit = 0;
    //! Number of messages that have been categorized
    //! (serialized as categorized_doc_count).
    std::size_t s_CategorizedMessages = 0;
    //! Total number of categories created
    //! (serialized as total_category_count).
    std::size_t s_TotalCategories = 0;
    //! Number of frequent match categories
    //! (serialized as frequent_category_count).
    std::size_t s_FrequentCategories = 0;
    //! Number of rare categories (serialized as rare_category_count).
    std::size_t s_RareCategories = 0;
    //! Number of dead categories (serialized as dead_category_count).
    std::size_t s_DeadCategories = 0;
    //! Whether categorization appears to be working well
    //! (serialized as categorization_status).
    model_t::ECategorizationStatus s_CategorizationStatus = model_t::E_CategorizationStatusOk;
};

public:
using TMonitoredResourcePtrSizeUMap =
boost::unordered_map<CMonitoredResource*, std::size_t>;
using TMemoryUsageReporterFunc = std::function<void(const CResourceMonitor::SResults&)>;
using TMemoryUsageReporterFunc =
std::function<void(const CResourceMonitor::SModelSizeStats&)>;
using TTimeSizeMap = std::map<core_t::TTime, std::size_t>;

//! The minimum time between prunes
Expand Down Expand Up @@ -109,7 +116,7 @@ class MODEL_EXPORT CResourceMonitor {
void sendMemoryUsageReport(core_t::TTime bucketStartTime);

//! Create a memory usage report
SResults createMemoryUsageReport(core_t::TTime bucketStartTime);
SModelSizeStats createMemoryUsageReport(core_t::TTime bucketStartTime);

//! We are being told that a class has failed to allocate memory
//! based on the resource limits, and we will report this to the
Expand Down
31 changes: 27 additions & 4 deletions include/model/CTokenListCategory.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#include <model/ImportExport.h>

#include <algorithm>
#include <map>
#include <string>
#include <utility>
Expand Down Expand Up @@ -94,16 +95,38 @@ class MODEL_EXPORT CTokenListCategory {
//! this category's common unique tokens?
std::size_t missingCommonTokenWeight(const TSizeSizeMap& uniqueTokenIds) const;

//! Is the weight of tokens in a given map that are missing from this
//! category's common unique tokens equal to zero? It is possible to test:
//! Is the weight of tokens in the provided container that are missing from
//! this category's common unique tokens equal to zero? It is possible to
//! test:
//! if (category.missingCommonTokenWeight(uniqueTokenIds) == 0)
//! instead of calling this method. However, this method is much faster
//! as it can return false as soon as a mismatch occurs.
bool isMissingCommonTokenWeightZero(const TSizeSizeMap& uniqueTokenIds) const;
//! \param uniqueTokenIds A container of pairs where the first element is
//! a token ID and the container is sorted into
//! ascending token ID order.
template<typename PAIR_CONTAINER>
bool isMissingCommonTokenWeightZero(const PAIR_CONTAINER& uniqueTokenIds) const {

    // Merge-style scan: both sequences are sorted by token ID, so each
    // common unique token is searched for from where the previous match
    // left off rather than from the beginning of the container.
    auto candidate = uniqueTokenIds.begin();
    const auto candidatesEnd = uniqueTokenIds.end();

    for (const auto& commonItem : m_CommonUniqueTokenIds) {
        // Skip candidates with smaller token IDs than the current
        // common token.
        while (candidate != candidatesEnd && candidate->first < commonItem.first) {
            ++candidate;
        }
        // The common token must be present with an identical weight,
        // otherwise some common token weight is missing.
        if (candidate == candidatesEnd || candidate->first != commonItem.first ||
            candidate->second != commonItem.second) {
            return false;
        }
        ++candidate;
    }

    return true;
}

//! Does the supplied token vector contain all our common tokens in the
//! same order as our base token vector?
bool containsCommonTokensInOrder(const TSizeSizePrVec& tokenIds) const;
bool containsCommonInOrderTokensInOrder(const TSizeSizePrVec& tokenIds) const;

//! \return Does the supplied token ID represent a common unique token?
bool isTokenCommon(std::size_t tokenId) const;
Expand Down
6 changes: 0 additions & 6 deletions include/model/CTokenListDataCategorizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,6 @@ class CTokenListDataCategorizer : public CTokenListDataCategorizerBase {
//! Get the static size of this object - used for virtual hierarchies
std::size_t staticSize() const override { return sizeof(*this); }

//! Currently the overall model memory stats do not contain any categorizer
//! stats fields.
void updateMemoryResults(CResourceMonitor::SResults& /*results*/) const override {
// NO-OP
}

protected:
//! Split the string into a list of tokens. The result of the
//! tokenisation is returned in \p tokenIds, \p tokenUniqueIds and
Expand Down
33 changes: 23 additions & 10 deletions include/model/CTokenListDataCategorizerBase.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
#include <model/CDataCategorizer.h>
#include <model/CTokenListCategory.h>
#include <model/ImportExport.h>
#include <model/ModelTypes.h>

#include <iosfwd>
#include <list>
#include <map>
#include <memory>
#include <string>
Expand Down Expand Up @@ -75,8 +75,9 @@ class MODEL_EXPORT CTokenListDataCategorizerBase : public CDataCategorizer {
//! second -> weighting
using TSizeSizePr = std::pair<std::size_t, std::size_t>;

//! Used for storing token ID sequences
//! Used for storing token ID sequences and categories with counts
using TSizeSizePrVec = std::vector<TSizeSizePr>;
using TSizeSizePrVecItr = TSizeSizePrVec::iterator;

//! Used for storing distinct token IDs
using TSizeSizeMap = std::map<std::size_t, std::size_t>;
Expand Down Expand Up @@ -157,6 +158,24 @@ class MODEL_EXPORT CTokenListDataCategorizerBase : public CDataCategorizer {
//! Get the memory used by this categorizer.
std::size_t memoryUsage() const override;

//! Update the model size stats with information from this categorizer.
void updateModelSizeStats(CResourceMonitor::SModelSizeStats& modelSizeStats) const override;

//! Categorization status is "warn" if:
//! - At least 100 messages have been categorized
//! and one of the following holds:
//! - There is only 1 category
//! - More than 90% of categories are rare
//! - The number of categories is greater than 50% of the number of categorized messages
//! - There are no frequent match categories
//! - More than 50% of categories are dead
static model_t::ECategorizationStatus
calculateCategorizationStatus(std::size_t categorizedMessages,
std::size_t totalCategories,
std::size_t frequentCategories,
std::size_t rareCategories,
std::size_t deadCategories);

protected:
//! Split the string into a list of tokens. The result of the
//! tokenisation is returned in \p tokenIds, \p tokenUniqueIds and
Expand All @@ -180,19 +199,13 @@ class MODEL_EXPORT CTokenListDataCategorizerBase : public CDataCategorizer {
const TSizeSizePrVec& right,
std::size_t rightWeight) const = 0;

//! Used to hold statistics about the categories we compute:
//! first -> count of matches
//! second -> category vector index
using TSizeSizePrList = std::list<TSizeSizePr>;
using TSizeSizePrListItr = TSizeSizePrList::iterator;

//! Add a match to an existing category
void addCategoryMatch(bool isDryRun,
const std::string& str,
std::size_t rawStringLen,
const TSizeSizePrVec& tokenIds,
const TSizeSizeMap& tokenUniqueIds,
TSizeSizePrListItr& iter);
TSizeSizePrVecItr& iter);

//! Given the total token weight in a vector and a threshold, what is
//! the minimum possible token weight in a different vector that could
Expand Down Expand Up @@ -304,7 +317,7 @@ class MODEL_EXPORT CTokenListDataCategorizerBase : public CDataCategorizer {

//! List of match count/index into category vector in descending order of
//! match count
TSizeSizePrList m_CategoriesByCount;
TSizeSizePrVec m_CategoriesByCount;

//! Used for looking up tokens to a unique ID
TTokenMIndex m_TokenIdLookup;
Expand Down
12 changes: 12 additions & 0 deletions include/model/ModelTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -802,6 +802,18 @@ enum EMemoryStatus {
MODEL_EXPORT
std::string print(EMemoryStatus memoryStatus);

//! An enumeration of the categorization status.
//! Jobs start in the "ok" state and move into the "warn" state when the
//! categorizer judges that categorization is not working well — for
//! example, too many distinct categories relative to the number of
//! categorized messages, or too few categories being seen frequently.
enum ECategorizationStatus {
    E_CategorizationStatusOk = 0,  //!< Categorization working as intended
    E_CategorizationStatusWarn = 1 //!< Categorization quality is suspect
};

//! Get a string description of \p categorizationStatus.
MODEL_EXPORT
std::string print(ECategorizationStatus categorizationStatus);

//! Styles of probability aggregation available:
//! -# AggregatePeople: the style used to aggregate results for distinct
//! values of the over and partition field.
Expand Down
4 changes: 2 additions & 2 deletions lib/api/CAnomalyJob.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1179,7 +1179,7 @@ bool CAnomalyJob::persistModelsState(const TKeyCRefAnomalyDetectorPtrPrVec& dete
bool CAnomalyJob::persistCopiedState(const std::string& descriptionPrefix,
core_t::TTime time,
const TKeyCRefAnomalyDetectorPtrPrVec& detectors,
const model::CResourceMonitor::SResults& modelSizeStats,
const model::CResourceMonitor::SModelSizeStats& modelSizeStats,
const model::CInterimBucketCorrector& interimBucketCorrector,
const model::CHierarchicalResultsAggregator& aggregator,
const std::string& normalizerState,
Expand Down Expand Up @@ -1563,7 +1563,7 @@ void CAnomalyJob::addRecord(const TAnomalyDetectorPtr detector,

CAnomalyJob::SBackgroundPersistArgs::SBackgroundPersistArgs(
core_t::TTime time,
const model::CResourceMonitor::SResults& modelSizeStats,
const model::CResourceMonitor::SModelSizeStats& modelSizeStats,
const model::CInterimBucketCorrector& interimBucketCorrector,
const model::CHierarchicalResultsAggregator& aggregator,
core_t::TTime latestRecordTime,
Expand Down
2 changes: 1 addition & 1 deletion lib/api/CJsonOutputWriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -857,7 +857,7 @@ void CJsonOutputWriter::popAllocator() {
m_Writer.popAllocator();
}

void CJsonOutputWriter::reportMemoryUsage(const model::CResourceMonitor::SResults& results) {
void CJsonOutputWriter::reportMemoryUsage(const model::CResourceMonitor::SModelSizeStats& results) {
m_Writer.StartObject();
CModelSizeStatsJsonWriter::write(m_JobId, results, m_Writer);
m_Writer.EndObject();
Expand Down
50 changes: 37 additions & 13 deletions lib/api/CModelSizeStatsJsonWriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,22 +13,28 @@ namespace api {
namespace {

// JSON field names
const std::string JOB_ID("job_id");
const std::string MODEL_SIZE_STATS("model_size_stats");
const std::string MODEL_BYTES("model_bytes");
const std::string MODEL_BYTES_EXCEEDED("model_bytes_exceeded");
const std::string MODEL_BYTES_MEMORY_LIMIT("model_bytes_memory_limit");
const std::string TOTAL_BY_FIELD_COUNT("total_by_field_count");
const std::string TOTAL_OVER_FIELD_COUNT("total_over_field_count");
const std::string TOTAL_PARTITION_FIELD_COUNT("total_partition_field_count");
const std::string BUCKET_ALLOCATION_FAILURES_COUNT("bucket_allocation_failures_count");
const std::string MEMORY_STATUS("memory_status");
const std::string TIMESTAMP("timestamp");
const std::string LOG_TIME("log_time");
const std::string JOB_ID{"job_id"};
const std::string MODEL_SIZE_STATS{"model_size_stats"};
const std::string MODEL_BYTES{"model_bytes"};
const std::string MODEL_BYTES_EXCEEDED{"model_bytes_exceeded"};
const std::string MODEL_BYTES_MEMORY_LIMIT{"model_bytes_memory_limit"};
const std::string TOTAL_BY_FIELD_COUNT{"total_by_field_count"};
const std::string TOTAL_OVER_FIELD_COUNT{"total_over_field_count"};
const std::string TOTAL_PARTITION_FIELD_COUNT{"total_partition_field_count"};
const std::string BUCKET_ALLOCATION_FAILURES_COUNT{"bucket_allocation_failures_count"};
const std::string MEMORY_STATUS{"memory_status"};
const std::string CATEGORIZED_DOC_COUNT{"categorized_doc_count"};
const std::string TOTAL_CATEGORY_COUNT{"total_category_count"};
const std::string FREQUENT_CATEGORY_COUNT{"frequent_category_count"};
const std::string RARE_CATEGORY_COUNT{"rare_category_count"};
const std::string DEAD_CATEGORY_COUNT{"dead_category_count"};
const std::string CATEGORIZATION_STATUS{"categorization_status"};
const std::string TIMESTAMP{"timestamp"};
const std::string LOG_TIME{"log_time"};
}

void CModelSizeStatsJsonWriter::write(const std::string& jobId,
const model::CResourceMonitor::SResults& results,
const model::CResourceMonitor::SModelSizeStats& results,
core::CRapidJsonConcurrentLineWriter& writer) {
writer.String(MODEL_SIZE_STATS);
writer.StartObject();
Expand Down Expand Up @@ -60,6 +66,24 @@ void CModelSizeStatsJsonWriter::write(const std::string& jobId,
writer.String(MEMORY_STATUS);
writer.String(print(results.s_MemoryStatus));

writer.String(CATEGORIZED_DOC_COUNT);
writer.Uint64(results.s_CategorizedMessages);

writer.String(TOTAL_CATEGORY_COUNT);
writer.Uint64(results.s_TotalCategories);

writer.String(FREQUENT_CATEGORY_COUNT);
writer.Uint64(results.s_FrequentCategories);

writer.String(RARE_CATEGORY_COUNT);
writer.Uint64(results.s_RareCategories);

writer.String(DEAD_CATEGORY_COUNT);
writer.Uint64(results.s_DeadCategories);

writer.String(CATEGORIZATION_STATUS);
writer.String(print(results.s_CategorizationStatus));

writer.String(TIMESTAMP);
writer.Time(results.s_BucketStartTime);

Expand Down
Loading

0 comments on commit 371b519

Please sign in to comment.