[ML] Add new categorization stats to model_size_stats #989

Merged (5 commits, Feb 7, 2020)
2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
@@ -46,6 +46,8 @@ progress, memory usage, etc. (See {ml-pull}906[#906].)

* Improve initialization of learn rate for better and more stable results in regression
and classification. (See {ml-pull}948[#948].)
* Add new model_size_stats fields to instrument categorization. (See {ml-pull}989[#989]
and {pull}51879[#51879], issue: {issue}50749[#50749].)

=== Bug Fixes

6 changes: 3 additions & 3 deletions include/api/CAnomalyJob.h
@@ -120,14 +120,14 @@ class API_EXPORT CAnomalyJob : public CDataProcessor {

struct SBackgroundPersistArgs {
SBackgroundPersistArgs(core_t::TTime time,
const model::CResourceMonitor::SResults& modelSizeStats,
const model::CResourceMonitor::SModelSizeStats& modelSizeStats,
const model::CInterimBucketCorrector& interimBucketCorrector,
const model::CHierarchicalResultsAggregator& aggregator,
core_t::TTime latestRecordTime,
core_t::TTime lastResultsTime);

core_t::TTime s_Time;
model::CResourceMonitor::SResults s_ModelSizeStats;
model::CResourceMonitor::SModelSizeStats s_ModelSizeStats;
model::CInterimBucketCorrector s_InterimBucketCorrector;
model::CHierarchicalResultsAggregator s_Aggregator;
std::string s_NormalizerState;
@@ -258,7 +258,7 @@ class API_EXPORT CAnomalyJob : public CDataProcessor {
bool persistCopiedState(const std::string& descriptionPrefix,
core_t::TTime time,
const TKeyCRefAnomalyDetectorPtrPrVec& detectors,
const model::CResourceMonitor::SResults& modelSizeStats,
const model::CResourceMonitor::SModelSizeStats& modelSizeStats,
const model::CInterimBucketCorrector& interimBucketCorrector,
const model::CHierarchicalResultsAggregator& aggregator,
const std::string& normalizerState,
2 changes: 1 addition & 1 deletion include/api/CJsonOutputWriter.h
@@ -232,7 +232,7 @@ class API_EXPORT CJsonOutputWriter : public COutputHandler {

//! Report the current levels of resource usage, as given to us
//! from the CResourceMonitor via a callback
void reportMemoryUsage(const model::CResourceMonitor::SResults& results);
void reportMemoryUsage(const model::CResourceMonitor::SModelSizeStats& modelSizeStats);

//! Acknowledge a flush request by echoing back the flush ID
void acknowledgeFlush(const std::string& flushId, core_t::TTime lastFinalizedBucketEnd);
2 changes: 1 addition & 1 deletion include/api/CModelSizeStatsJsonWriter.h
@@ -24,7 +24,7 @@ class API_EXPORT CModelSizeStatsJsonWriter : private core::CNonInstantiatable {
public:
//! Writes the model size stats in the \p results in JSON format.
static void write(const std::string& jobId,
const model::CResourceMonitor::SResults& results,
const model::CResourceMonitor::SModelSizeStats& results,
core::CRapidJsonConcurrentLineWriter& writer);
};
}
2 changes: 1 addition & 1 deletion include/api/CModelSnapshotJsonWriter.h
@@ -33,7 +33,7 @@ class API_EXPORT CModelSnapshotJsonWriter {
std::string s_Description;
std::string s_SnapshotId;
size_t s_NumDocs;
model::CResourceMonitor::SResults s_ModelSizeStats;
model::CResourceMonitor::SModelSizeStats s_ModelSizeStats;
std::string s_NormalizerState;
core_t::TTime s_LatestRecordTime;
core_t::TTime s_LatestFinalResultTime;
6 changes: 3 additions & 3 deletions include/model/CAnomalyDetector.h
@@ -270,9 +270,9 @@ class MODEL_EXPORT CAnomalyDetector : public CMonitoredResource {
//! Prune the model.
void prune(std::size_t maximumAge) override;

//! Update the overall model memory stats results with stats from this
//! anomaly detector.
void updateMemoryResults(CResourceMonitor::SResults& results) const override;
//! Update the overall model size stats with information from this anomaly
//! detector.
void updateModelSizeStats(CResourceMonitor::SModelSizeStats& modelSizeStats) const override;

//! Get end of the last complete bucket we've observed.
const core_t::TTime& lastBucketEndTime() const;
5 changes: 3 additions & 2 deletions include/model/CMonitoredResource.h
@@ -58,9 +58,10 @@ class MODEL_EXPORT CMonitoredResource {
//! discarding the least recently seen entities it knows about.
virtual void prune(std::size_t maximumAge);

//! Update the overall model memory stats results with stats from this
//! Update the overall model size stats results with stats from this
//! monitored resource.
virtual void updateMemoryResults(CResourceMonitor::SResults& results) const = 0;
virtual void
updateModelSizeStats(CResourceMonitor::SModelSizeStats& modelSizeStats) const = 0;
};
}
}
33 changes: 20 additions & 13 deletions include/model/CResourceMonitor.h
@@ -38,23 +38,30 @@ class CMonitoredResource;
//! Assess memory used by models and decide on further memory allocations.
class MODEL_EXPORT CResourceMonitor {
public:
struct MODEL_EXPORT SResults {
std::size_t s_Usage;
std::size_t s_AdjustedUsage;
std::size_t s_ByFields;
std::size_t s_PartitionFields;
std::size_t s_OverFields;
std::size_t s_AllocationFailures;
model_t::EMemoryStatus s_MemoryStatus;
core_t::TTime s_BucketStartTime;
std::size_t s_BytesExceeded;
std::size_t s_BytesMemoryLimit;
struct MODEL_EXPORT SModelSizeStats {
std::size_t s_Usage = 0;
std::size_t s_AdjustedUsage = 0;
std::size_t s_ByFields = 0;
std::size_t s_PartitionFields = 0;
std::size_t s_OverFields = 0;
std::size_t s_AllocationFailures = 0;
model_t::EMemoryStatus s_MemoryStatus = model_t::E_MemoryStatusOk;
core_t::TTime s_BucketStartTime = 0;
std::size_t s_BytesExceeded = 0;
std::size_t s_BytesMemoryLimit = 0;
std::size_t s_CategorizedMessages = 0;
std::size_t s_TotalCategories = 0;
std::size_t s_FrequentCategories = 0;
std::size_t s_RareCategories = 0;
std::size_t s_DeadCategories = 0;
model_t::ECategorizationStatus s_CategorizationStatus = model_t::E_CategorizationStatusOk;
};

public:
using TMonitoredResourcePtrSizeUMap =
boost::unordered_map<CMonitoredResource*, std::size_t>;
using TMemoryUsageReporterFunc = std::function<void(const CResourceMonitor::SResults&)>;
using TMemoryUsageReporterFunc =
std::function<void(const CResourceMonitor::SModelSizeStats&)>;
using TTimeSizeMap = std::map<core_t::TTime, std::size_t>;

//! The minimum time between prunes
@@ -109,7 +116,7 @@ class MODEL_EXPORT CResourceMonitor {
void sendMemoryUsageReport(core_t::TTime bucketStartTime);

//! Create a memory usage report
SResults createMemoryUsageReport(core_t::TTime bucketStartTime);
SModelSizeStats createMemoryUsageReport(core_t::TTime bucketStartTime);

//! We are being told that a class has failed to allocate memory
//! based on the resource limits, and we will report this to the
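To see how the renamed callback type is consumed, here is a minimal sketch of wiring a reporter to the monitor. The registration method name is an assumption (it does not appear in the hunk above); the callback signature and the field names come from the typedef and struct in this diff.

#include <model/CResourceMonitor.h>

#include <iostream>

void wireReporter(ml::model::CResourceMonitor& monitor) {
    // The callback now receives the full SModelSizeStats, including the
    // new categorization counters, each time a report is sent.
    monitor.memoryUsageReporter( // hypothetical registration method
        [](const ml::model::CResourceMonitor::SModelSizeStats& stats) {
            std::cout << "usage=" << stats.s_Usage
                      << " categorized_docs=" << stats.s_CategorizedMessages
                      << " categories=" << stats.s_TotalCategories
                      << " status=" << ml::model_t::print(stats.s_CategorizationStatus)
                      << '\n';
        });
}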
31 changes: 27 additions & 4 deletions include/model/CTokenListCategory.h
@@ -10,6 +10,7 @@

#include <model/ImportExport.h>

#include <algorithm>
#include <map>
#include <string>
#include <utility>
@@ -94,16 +95,38 @@ class MODEL_EXPORT CTokenListCategory {
//! this category's common unique tokens?
std::size_t missingCommonTokenWeight(const TSizeSizeMap& uniqueTokenIds) const;

//! Is the weight of tokens in a given map that are missing from this
//! category's common unique tokens equal to zero? It is possible to test:
//! Is the weight of tokens in the provided container that are missing from
//! this category's common unique tokens equal to zero? It is possible to
//! test:
//! if (category.missingCommonTokenWeight(uniqueTokenIds) == 0)
//! instead of calling this method. However, this method is much faster
//! as it can return false as soon as a mismatch occurs.
bool isMissingCommonTokenWeightZero(const TSizeSizeMap& uniqueTokenIds) const;
//! \param uniqueTokenIds A container of pairs where the first element is
//! a token ID and the container is sorted into
//! ascending token ID order.
template<typename PAIR_CONTAINER>
bool isMissingCommonTokenWeightZero(const PAIR_CONTAINER& uniqueTokenIds) const {

auto testIter = uniqueTokenIds.begin();
for (const auto& commonItem : m_CommonUniqueTokenIds) {
testIter = std::find_if(testIter, uniqueTokenIds.end(),
[&commonItem](const auto& testItem) {
return testItem.first >= commonItem.first;
});
if (testIter == uniqueTokenIds.end() ||
testIter->first != commonItem.first ||
testIter->second != commonItem.second) {
return false;
}
++testIter;
}

return true;
}

Review comment (Contributor):

It strikes me that this becomes somewhat cleaner using std::find_if, i.e.

while (commonIter != m_CommonUniqueTokenIds.end()) {
    testIter = std::find_if(testIter, uniqueTokenIds.end(),
                            [&commonIter](const auto& testItem) {
                                return testItem.first >= commonIter->first;
                            });
    if (testIter == uniqueTokenIds.end() || *testIter != *commonIter) {
        return false;
    }
    ++commonIter;
}

I guess this is just moving well tested code to a different place, so feel free to leave as is if you'd rather not change.

Reply (Contributor Author):

I more-or-less did this. The second part of the condition has to be testIter->first != commonIter->first || testIter->second != commonIter->second instead of simply *testIter != *commonIter, because in some cases one is a std::pair<const std::size_t, std::size_t> and the other is a std::pair<std::size_t, std::size_t>.

Reply (Contributor):

Ok, great. It's a shame the comparison doesn't work with const. That feels like an error in the std::pair implementation.
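As a usage sketch (the category instance, token IDs, and weights are invented for illustration): because the method is now a template over any sorted pair container, it accepts both the std::map built during tokenisation and a sorted vector of pairs.

// Both containers are sorted by token ID, as the parameter contract requires.
std::map<std::size_t, std::size_t> uniqueIdsFromMap{{3, 1}, {7, 2}};
std::vector<std::pair<std::size_t, std::size_t>> uniqueIdsFromVec{{3, 1}, {7, 2}};

// 'category' is a hypothetical CTokenListCategory instance.
bool fromMap = category.isMissingCommonTokenWeightZero(uniqueIdsFromMap);
bool fromVec = category.isMissingCommonTokenWeightZero(uniqueIdsFromVec);
// Each call returns true only if every common unique token of the category
// appears in the argument with an identical weight; it exits early on the
// first mismatch.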

//! Does the supplied token vector contain all our common tokens in the
//! same order as our base token vector?
bool containsCommonTokensInOrder(const TSizeSizePrVec& tokenIds) const;
bool containsCommonInOrderTokensInOrder(const TSizeSizePrVec& tokenIds) const;

//! \return Does the supplied token ID represent a common unique token?
bool isTokenCommon(std::size_t tokenId) const;
6 changes: 0 additions & 6 deletions include/model/CTokenListDataCategorizer.h
@@ -81,12 +81,6 @@ class CTokenListDataCategorizer : public CTokenListDataCategorizerBase {
//! Get the static size of this object - used for virtual hierarchies
std::size_t staticSize() const override { return sizeof(*this); }

//! Currently the overall model memory stats do not contain any categorizer
//! stats fields.
void updateMemoryResults(CResourceMonitor::SResults& /*results*/) const override {
// NO-OP
}

protected:
//! Split the string into a list of tokens. The result of the
//! tokenisation is returned in \p tokenIds, \p tokenUniqueIds and
33 changes: 23 additions & 10 deletions include/model/CTokenListDataCategorizerBase.h
@@ -12,9 +12,9 @@
#include <model/CDataCategorizer.h>
#include <model/CTokenListCategory.h>
#include <model/ImportExport.h>
#include <model/ModelTypes.h>

#include <iosfwd>
#include <list>
#include <map>
#include <memory>
#include <string>
@@ -75,8 +75,9 @@ class MODEL_EXPORT CTokenListDataCategorizerBase : public CDataCategorizer {
//! second -> weighting
using TSizeSizePr = std::pair<std::size_t, std::size_t>;

//! Used for storing token ID sequences
//! Used for storing token ID sequences and categories with counts
using TSizeSizePrVec = std::vector<TSizeSizePr>;
using TSizeSizePrVecItr = TSizeSizePrVec::iterator;

//! Used for storing distinct token IDs
using TSizeSizeMap = std::map<std::size_t, std::size_t>;
@@ -157,6 +158,24 @@ class MODEL_EXPORT CTokenListDataCategorizerBase : public CDataCategorizer {
//! Get the memory used by this categorizer.
std::size_t memoryUsage() const override;

//! Update the model size stats with information from this categorizer.
void updateModelSizeStats(CResourceMonitor::SModelSizeStats& modelSizeStats) const override;

//! Categorization status is "warn" if:
//! - At least 100 messages have been categorized
//! and one of the following holds:
//! - There is only 1 category
//! - More than 90% of categories are rare
//! - The number of categories is greater than 50% of the number of categorized messages
//! - There are no frequent match categories
//! - More than 50% of categories are dead
static model_t::ECategorizationStatus
calculateCategorizationStatus(std::size_t categorizedMessages,
std::size_t totalCategories,
std::size_t frequentCategories,
std::size_t rareCategories,
std::size_t deadCategories);
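Spelled out as code, the documented rules suggest an implementation along these lines. This is a sketch derived from the comment above; the exact inequalities and tie-breaking in the real implementation may differ.

model_t::ECategorizationStatus
CTokenListDataCategorizerBase::calculateCategorizationStatus(std::size_t categorizedMessages,
                                                             std::size_t totalCategories,
                                                             std::size_t frequentCategories,
                                                             std::size_t rareCategories,
                                                             std::size_t deadCategories) {
    // Fewer than 100 categorized messages is too little evidence to warn.
    if (categorizedMessages < 100) {
        return model_t::E_CategorizationStatusOk;
    }
    if (totalCategories == 1 ||                      // only 1 category
        10 * rareCategories > 9 * totalCategories || // more than 90% rare
        2 * totalCategories > categorizedMessages || // categories > 50% of messages
        frequentCategories == 0 ||                   // no frequent categories
        2 * deadCategories > totalCategories) {      // more than 50% dead
        return model_t::E_CategorizationStatusWarn;
    }
    return model_t::E_CategorizationStatusOk;
}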

protected:
//! Split the string into a list of tokens. The result of the
//! tokenisation is returned in \p tokenIds, \p tokenUniqueIds and
@@ -180,19 +199,13 @@ class MODEL_EXPORT CTokenListDataCategorizerBase : public CDataCategorizer {
const TSizeSizePrVec& right,
std::size_t rightWeight) const = 0;

//! Used to hold statistics about the categories we compute:
//! first -> count of matches
//! second -> category vector index
using TSizeSizePrList = std::list<TSizeSizePr>;
using TSizeSizePrListItr = TSizeSizePrList::iterator;

//! Add a match to an existing category
void addCategoryMatch(bool isDryRun,
const std::string& str,
std::size_t rawStringLen,
const TSizeSizePrVec& tokenIds,
const TSizeSizeMap& tokenUniqueIds,
TSizeSizePrListItr& iter);
TSizeSizePrVecItr& iter);

//! Given the total token weight in a vector and a threshold, what is
//! the minimum possible token weight in a different vector that could
@@ -304,7 +317,7 @@ class MODEL_EXPORT CTokenListDataCategorizerBase : public CDataCategorizer {

//! List of match count/index into category vector in descending order of
//! match count
TSizeSizePrList m_CategoriesByCount;
TSizeSizePrVec m_CategoriesByCount;

//! Used for looking up tokens to a unique ID
TTokenMIndex m_TokenIdLookup;
12 changes: 12 additions & 0 deletions include/model/ModelTypes.h
@@ -802,6 +802,18 @@ enum EMemoryStatus {
MODEL_EXPORT
std::string print(EMemoryStatus memoryStatus);

//! An enumeration of the categorization status, as computed by the token
//! list data categorizer. Categorization starts in the OK state and moves
//! into the "warn" state if too few of the categories being created are
//! seen frequently.
enum ECategorizationStatus {
E_CategorizationStatusOk = 0, //!< Categorization working as intended
E_CategorizationStatusWarn = 1 //!< Too many categories being created
};

//! Get a string description of \p categorizationStatus.
MODEL_EXPORT
std::string print(ECategorizationStatus categorizationStatus);
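A plausible implementation of this print overload, following the pattern of the existing print(EMemoryStatus); the exact strings are assumptions here, chosen to match the categorization_status values a consumer of the JSON output would expect.

std::string model_t::print(ECategorizationStatus categorizationStatus) {
    switch (categorizationStatus) {
    case E_CategorizationStatusOk:
        return "ok";
    case E_CategorizationStatusWarn:
        return "warn";
    }
    return "-"; // unreachable for valid enum values
}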

//! Styles of probability aggregation available:
//! -# AggregatePeople: the style used to aggregate results for distinct
//! values of the over and partition field.
4 changes: 2 additions & 2 deletions lib/api/CAnomalyJob.cc
@@ -1179,7 +1179,7 @@ bool CAnomalyJob::persistModelsState(const TKeyCRefAnomalyDetectorPtrPrVec& dete
bool CAnomalyJob::persistCopiedState(const std::string& descriptionPrefix,
core_t::TTime time,
const TKeyCRefAnomalyDetectorPtrPrVec& detectors,
const model::CResourceMonitor::SResults& modelSizeStats,
const model::CResourceMonitor::SModelSizeStats& modelSizeStats,
const model::CInterimBucketCorrector& interimBucketCorrector,
const model::CHierarchicalResultsAggregator& aggregator,
const std::string& normalizerState,
@@ -1563,7 +1563,7 @@ void CAnomalyJob::addRecord(const TAnomalyDetectorPtr detector,

CAnomalyJob::SBackgroundPersistArgs::SBackgroundPersistArgs(
core_t::TTime time,
const model::CResourceMonitor::SResults& modelSizeStats,
const model::CResourceMonitor::SModelSizeStats& modelSizeStats,
const model::CInterimBucketCorrector& interimBucketCorrector,
const model::CHierarchicalResultsAggregator& aggregator,
core_t::TTime latestRecordTime,
2 changes: 1 addition & 1 deletion lib/api/CJsonOutputWriter.cc
@@ -857,7 +857,7 @@ void CJsonOutputWriter::popAllocator() {
m_Writer.popAllocator();
}

void CJsonOutputWriter::reportMemoryUsage(const model::CResourceMonitor::SResults& results) {
void CJsonOutputWriter::reportMemoryUsage(const model::CResourceMonitor::SModelSizeStats& results) {
m_Writer.StartObject();
CModelSizeStatsJsonWriter::write(m_JobId, results, m_Writer);
m_Writer.EndObject();
50 changes: 37 additions & 13 deletions lib/api/CModelSizeStatsJsonWriter.cc
@@ -13,22 +13,28 @@ namespace api {
namespace {

// JSON field names
const std::string JOB_ID("job_id");
const std::string MODEL_SIZE_STATS("model_size_stats");
const std::string MODEL_BYTES("model_bytes");
const std::string MODEL_BYTES_EXCEEDED("model_bytes_exceeded");
const std::string MODEL_BYTES_MEMORY_LIMIT("model_bytes_memory_limit");
const std::string TOTAL_BY_FIELD_COUNT("total_by_field_count");
const std::string TOTAL_OVER_FIELD_COUNT("total_over_field_count");
const std::string TOTAL_PARTITION_FIELD_COUNT("total_partition_field_count");
const std::string BUCKET_ALLOCATION_FAILURES_COUNT("bucket_allocation_failures_count");
const std::string MEMORY_STATUS("memory_status");
const std::string TIMESTAMP("timestamp");
const std::string LOG_TIME("log_time");
const std::string JOB_ID{"job_id"};
const std::string MODEL_SIZE_STATS{"model_size_stats"};
const std::string MODEL_BYTES{"model_bytes"};
const std::string MODEL_BYTES_EXCEEDED{"model_bytes_exceeded"};
const std::string MODEL_BYTES_MEMORY_LIMIT{"model_bytes_memory_limit"};
const std::string TOTAL_BY_FIELD_COUNT{"total_by_field_count"};
const std::string TOTAL_OVER_FIELD_COUNT{"total_over_field_count"};
const std::string TOTAL_PARTITION_FIELD_COUNT{"total_partition_field_count"};
const std::string BUCKET_ALLOCATION_FAILURES_COUNT{"bucket_allocation_failures_count"};
const std::string MEMORY_STATUS{"memory_status"};
const std::string CATEGORIZED_DOC_COUNT{"categorized_doc_count"};
const std::string TOTAL_CATEGORY_COUNT{"total_category_count"};
const std::string FREQUENT_CATEGORY_COUNT{"frequent_category_count"};
const std::string RARE_CATEGORY_COUNT{"rare_category_count"};
const std::string DEAD_CATEGORY_COUNT{"dead_category_count"};
const std::string CATEGORIZATION_STATUS{"categorization_status"};
const std::string TIMESTAMP{"timestamp"};
const std::string LOG_TIME{"log_time"};
}

void CModelSizeStatsJsonWriter::write(const std::string& jobId,
const model::CResourceMonitor::SResults& results,
const model::CResourceMonitor::SModelSizeStats& results,
core::CRapidJsonConcurrentLineWriter& writer) {
writer.String(MODEL_SIZE_STATS);
writer.StartObject();
@@ -60,6 +66,24 @@ void CModelSizeStatsJsonWriter::write(const std::string& jobId,
writer.String(MEMORY_STATUS);
writer.String(print(results.s_MemoryStatus));

writer.String(CATEGORIZED_DOC_COUNT);
writer.Uint64(results.s_CategorizedMessages);

writer.String(TOTAL_CATEGORY_COUNT);
writer.Uint64(results.s_TotalCategories);

writer.String(FREQUENT_CATEGORY_COUNT);
writer.Uint64(results.s_FrequentCategories);

writer.String(RARE_CATEGORY_COUNT);
writer.Uint64(results.s_RareCategories);

writer.String(DEAD_CATEGORY_COUNT);
writer.Uint64(results.s_DeadCategories);

writer.String(CATEGORIZATION_STATUS);
writer.String(print(results.s_CategorizationStatus));

writer.String(TIMESTAMP);
writer.Time(results.s_BucketStartTime);
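Putting it together, a model_size_stats document emitted by this writer now looks roughly as follows. All values are invented for illustration; the diff is truncated above, and the log_time field plus the closing of the object follow the timestamp write.

{
  "model_size_stats": {
    "job_id": "my-job",
    "model_bytes": 159432,
    "model_bytes_exceeded": 0,
    "model_bytes_memory_limit": 1073741824,
    "total_by_field_count": 3,
    "total_over_field_count": 0,
    "total_partition_field_count": 2,
    "bucket_allocation_failures_count": 0,
    "memory_status": "ok",
    "categorized_doc_count": 1000,
    "total_category_count": 40,
    "frequent_category_count": 2,
    "rare_category_count": 35,
    "dead_category_count": 1,
    "categorization_status": "ok",
    "timestamp": 1580515200000,
    "log_time": 1580515260000
  }
}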
