Skip to content

Commit

Permalink
[ML] Add new categorization stats to model_size_stats (#51879)
Browse files Browse the repository at this point in the history
This change adds support for the following new model_size_stats
fields:

- categorized_doc_count
- total_category_count
- frequent_category_count
- rare_category_count
- dead_category_count
- categorization_status

Relates #50749
  • Loading branch information
droberts195 authored Feb 6, 2020
1 parent de4cf2b commit 72346b9
Show file tree
Hide file tree
Showing 11 changed files with 497 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
import java.util.Objects;

/**
* Provide access to the C++ model memory usage numbers for the Java process.
* Provide access to the C++ model size stats for the Java process.
*/
public class ModelSizeStats implements ToXContentObject {

Expand All @@ -54,6 +54,12 @@ public class ModelSizeStats implements ToXContentObject {
public static final ParseField TOTAL_PARTITION_FIELD_COUNT_FIELD = new ParseField("total_partition_field_count");
public static final ParseField BUCKET_ALLOCATION_FAILURES_COUNT_FIELD = new ParseField("bucket_allocation_failures_count");
public static final ParseField MEMORY_STATUS_FIELD = new ParseField("memory_status");
public static final ParseField CATEGORIZED_DOC_COUNT_FIELD = new ParseField("categorized_doc_count");
public static final ParseField TOTAL_CATEGORY_COUNT_FIELD = new ParseField("total_category_count");
public static final ParseField FREQUENT_CATEGORY_COUNT_FIELD = new ParseField("frequent_category_count");
public static final ParseField RARE_CATEGORY_COUNT_FIELD = new ParseField("rare_category_count");
public static final ParseField DEAD_CATEGORY_COUNT_FIELD = new ParseField("dead_category_count");
public static final ParseField CATEGORIZATION_STATUS_FIELD = new ParseField("categorization_status");
public static final ParseField LOG_TIME_FIELD = new ParseField("log_time");
public static final ParseField TIMESTAMP_FIELD = new ParseField("timestamp");

Expand All @@ -69,6 +75,14 @@ public class ModelSizeStats implements ToXContentObject {
PARSER.declareLong(Builder::setTotalByFieldCount, TOTAL_BY_FIELD_COUNT_FIELD);
PARSER.declareLong(Builder::setTotalOverFieldCount, TOTAL_OVER_FIELD_COUNT_FIELD);
PARSER.declareLong(Builder::setTotalPartitionFieldCount, TOTAL_PARTITION_FIELD_COUNT_FIELD);
PARSER.declareField(Builder::setMemoryStatus, p -> MemoryStatus.fromString(p.text()), MEMORY_STATUS_FIELD, ValueType.STRING);
PARSER.declareLong(Builder::setCategorizedDocCount, CATEGORIZED_DOC_COUNT_FIELD);
PARSER.declareLong(Builder::setTotalCategoryCount, TOTAL_CATEGORY_COUNT_FIELD);
PARSER.declareLong(Builder::setFrequentCategoryCount, FREQUENT_CATEGORY_COUNT_FIELD);
PARSER.declareLong(Builder::setRareCategoryCount, RARE_CATEGORY_COUNT_FIELD);
PARSER.declareLong(Builder::setDeadCategoryCount, DEAD_CATEGORY_COUNT_FIELD);
PARSER.declareField(Builder::setCategorizationStatus,
p -> CategorizationStatus.fromString(p.text()), CATEGORIZATION_STATUS_FIELD, ValueType.STRING);
PARSER.declareField(Builder::setLogTime,
(p) -> TimeUtil.parseTimeField(p, LOG_TIME_FIELD.getPreferredName()),
LOG_TIME_FIELD,
Expand All @@ -77,7 +91,6 @@ public class ModelSizeStats implements ToXContentObject {
(p) -> TimeUtil.parseTimeField(p, TIMESTAMP_FIELD.getPreferredName()),
TIMESTAMP_FIELD,
ValueType.VALUE);
PARSER.declareField(Builder::setMemoryStatus, p -> MemoryStatus.fromString(p.text()), MEMORY_STATUS_FIELD, ValueType.STRING);
}

/**
Expand All @@ -99,6 +112,23 @@ public String toString() {
}
}

/**
 * Categorization status reported for a job.
 * {@code OK} is the default; {@code WARN} signals that inappropriate
 * numbers of categories are being found.
 */
public enum CategorizationStatus {
    OK, WARN;

    /**
     * Resolves a status from its textual name, ignoring surrounding
     * whitespace and letter case.
     *
     * @param statusName the textual name, for example {@code "ok"} or {@code " WARN "}
     * @return the matching constant
     * @throws IllegalArgumentException if no constant matches the normalized name
     */
    public static CategorizationStatus fromString(String statusName) {
        final String normalized = statusName.trim().toUpperCase(Locale.ROOT);
        return CategorizationStatus.valueOf(normalized);
    }

    /**
     * @return the constant name in lower case
     */
    @Override
    public String toString() {
        final String constantName = name();
        return constantName.toLowerCase(Locale.ROOT);
    }
}

private final String jobId;
private final long modelBytes;
private final Long modelBytesExceeded;
Expand All @@ -108,12 +138,20 @@ public String toString() {
private final long totalPartitionFieldCount;
private final long bucketAllocationFailuresCount;
private final MemoryStatus memoryStatus;
private final long categorizedDocCount;
private final long totalCategoryCount;
private final long frequentCategoryCount;
private final long rareCategoryCount;
private final long deadCategoryCount;
private final CategorizationStatus categorizationStatus;
private final Date timestamp;
private final Date logTime;

private ModelSizeStats(String jobId, long modelBytes, Long modelBytesExceeded, Long modelBytesMemoryLimit, long totalByFieldCount,
long totalOverFieldCount, long totalPartitionFieldCount, long bucketAllocationFailuresCount,
MemoryStatus memoryStatus, Date timestamp, Date logTime) {
MemoryStatus memoryStatus, long categorizedDocCount, long totalCategoryCount, long frequentCategoryCount,
long rareCategoryCount, long deadCategoryCount, CategorizationStatus categorizationStatus,
Date timestamp, Date logTime) {
this.jobId = jobId;
this.modelBytes = modelBytes;
this.modelBytesExceeded = modelBytesExceeded;
Expand All @@ -123,6 +161,12 @@ private ModelSizeStats(String jobId, long modelBytes, Long modelBytesExceeded, L
this.totalPartitionFieldCount = totalPartitionFieldCount;
this.bucketAllocationFailuresCount = bucketAllocationFailuresCount;
this.memoryStatus = memoryStatus;
this.categorizedDocCount = categorizedDocCount;
this.totalCategoryCount = totalCategoryCount;
this.frequentCategoryCount = frequentCategoryCount;
this.rareCategoryCount = rareCategoryCount;
this.deadCategoryCount = deadCategoryCount;
this.categorizationStatus = categorizationStatus;
this.timestamp = timestamp;
this.logTime = logTime;
}
Expand All @@ -145,6 +189,12 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
builder.field(TOTAL_PARTITION_FIELD_COUNT_FIELD.getPreferredName(), totalPartitionFieldCount);
builder.field(BUCKET_ALLOCATION_FAILURES_COUNT_FIELD.getPreferredName(), bucketAllocationFailuresCount);
builder.field(MEMORY_STATUS_FIELD.getPreferredName(), memoryStatus);
builder.field(CATEGORIZED_DOC_COUNT_FIELD.getPreferredName(), categorizedDocCount);
builder.field(TOTAL_CATEGORY_COUNT_FIELD.getPreferredName(), totalCategoryCount);
builder.field(FREQUENT_CATEGORY_COUNT_FIELD.getPreferredName(), frequentCategoryCount);
builder.field(RARE_CATEGORY_COUNT_FIELD.getPreferredName(), rareCategoryCount);
builder.field(DEAD_CATEGORY_COUNT_FIELD.getPreferredName(), deadCategoryCount);
builder.field(CATEGORIZATION_STATUS_FIELD.getPreferredName(), categorizationStatus);
builder.timeField(LOG_TIME_FIELD.getPreferredName(), LOG_TIME_FIELD.getPreferredName() + "_string", logTime.getTime());
if (timestamp != null) {
builder.timeField(TIMESTAMP_FIELD.getPreferredName(), TIMESTAMP_FIELD.getPreferredName() + "_string", timestamp.getTime());
Expand Down Expand Up @@ -190,6 +240,30 @@ public MemoryStatus getMemoryStatus() {
return memoryStatus;
}

/**
 * @return the number of documents that have had a field categorized
 */
public long getCategorizedDocCount() {
    return this.categorizedDocCount;
}

/**
 * @return the number of categories created by categorization
 */
public long getTotalCategoryCount() {
    return this.totalCategoryCount;
}

/**
 * @return the number of categories that match more than 1% of categorized documents
 */
public long getFrequentCategoryCount() {
    return this.frequentCategoryCount;
}

/**
 * @return the number of categories that match just one categorized document
 */
public long getRareCategoryCount() {
    return this.rareCategoryCount;
}

/**
 * @return the number of dead categories, i.e. categories that will never
 *         be assigned again
 */
public long getDeadCategoryCount() {
    return this.deadCategoryCount;
}

/**
 * @return the status of categorization for this job
 */
public CategorizationStatus getCategorizationStatus() {
    return this.categorizationStatus;
}

/**
* The timestamp of the last processed record when this instance was created.
*
Expand All @@ -211,7 +285,8 @@ public Date getLogTime() {
@Override
public int hashCode() {
return Objects.hash(jobId, modelBytes, modelBytesExceeded, modelBytesMemoryLimit, totalByFieldCount, totalOverFieldCount,
totalPartitionFieldCount, this.bucketAllocationFailuresCount, memoryStatus, timestamp, logTime);
totalPartitionFieldCount, this.bucketAllocationFailuresCount, memoryStatus, categorizedDocCount, totalCategoryCount,
frequentCategoryCount, rareCategoryCount, deadCategoryCount, categorizationStatus, timestamp, logTime);
}

/**
Expand All @@ -233,7 +308,14 @@ public boolean equals(Object other) {
&& Objects.equals(this.modelBytesMemoryLimit, that.modelBytesMemoryLimit) && this.totalByFieldCount == that.totalByFieldCount
&& this.totalOverFieldCount == that.totalOverFieldCount && this.totalPartitionFieldCount == that.totalPartitionFieldCount
&& this.bucketAllocationFailuresCount == that.bucketAllocationFailuresCount
&& Objects.equals(this.memoryStatus, that.memoryStatus) && Objects.equals(this.timestamp, that.timestamp)
&& Objects.equals(this.memoryStatus, that.memoryStatus)
&& this.categorizedDocCount == that.categorizedDocCount
&& this.totalCategoryCount == that.totalCategoryCount
&& this.frequentCategoryCount == that.frequentCategoryCount
&& this.rareCategoryCount == that.rareCategoryCount
&& this.deadCategoryCount == that.deadCategoryCount
&& Objects.equals(this.categorizationStatus, that.categorizationStatus)
&& Objects.equals(this.timestamp, that.timestamp)
&& Objects.equals(this.logTime, that.logTime)
&& Objects.equals(this.jobId, that.jobId);
}
Expand All @@ -249,12 +331,19 @@ public static class Builder {
private long totalPartitionFieldCount;
private long bucketAllocationFailuresCount;
private MemoryStatus memoryStatus;
private long categorizedDocCount;
private long totalCategoryCount;
private long frequentCategoryCount;
private long rareCategoryCount;
private long deadCategoryCount;
private CategorizationStatus categorizationStatus;
private Date timestamp;
private Date logTime;

/**
 * Creates a builder for the given job. Both the memory status and the
 * categorization status start as {@code OK}, and the log time is set to
 * the moment of construction.
 *
 * @param jobId the ID of the job these stats belong to
 */
public Builder(String jobId) {
    this.jobId = jobId;
    this.memoryStatus = MemoryStatus.OK;
    this.categorizationStatus = CategorizationStatus.OK;
    this.logTime = new Date();
}

Expand All @@ -268,6 +357,12 @@ public Builder(ModelSizeStats modelSizeStats) {
this.totalPartitionFieldCount = modelSizeStats.totalPartitionFieldCount;
this.bucketAllocationFailuresCount = modelSizeStats.bucketAllocationFailuresCount;
this.memoryStatus = modelSizeStats.memoryStatus;
this.categorizedDocCount = modelSizeStats.categorizedDocCount;
this.totalCategoryCount = modelSizeStats.totalCategoryCount;
this.frequentCategoryCount = modelSizeStats.frequentCategoryCount;
this.rareCategoryCount = modelSizeStats.rareCategoryCount;
this.deadCategoryCount = modelSizeStats.deadCategoryCount;
this.categorizationStatus = modelSizeStats.categorizationStatus;
this.timestamp = modelSizeStats.timestamp;
this.logTime = modelSizeStats.logTime;
}
Expand Down Expand Up @@ -313,6 +408,37 @@ public Builder setMemoryStatus(MemoryStatus memoryStatus) {
return this;
}

/**
 * Sets the number of documents that have had a field categorized.
 *
 * @param categorizedDocCount the value to store; not validated here
 * @return this builder, to allow call chaining
 */
public Builder setCategorizedDocCount(long categorizedDocCount) {
this.categorizedDocCount = categorizedDocCount;
return this;
}

/**
 * Sets the number of categories created by categorization.
 *
 * @param totalCategoryCount the value to store; not validated here
 * @return this builder, to allow call chaining
 */
public Builder setTotalCategoryCount(long totalCategoryCount) {
this.totalCategoryCount = totalCategoryCount;
return this;
}

/**
 * Sets the number of categories that match more than 1% of categorized
 * documents.
 *
 * @param frequentCategoryCount the value to store; not validated here
 * @return this builder, to allow call chaining
 */
public Builder setFrequentCategoryCount(long frequentCategoryCount) {
this.frequentCategoryCount = frequentCategoryCount;
return this;
}

/**
 * Sets the number of categories that match just one categorized document.
 *
 * @param rareCategoryCount the value to store; not validated here
 * @return this builder, to allow call chaining
 */
public Builder setRareCategoryCount(long rareCategoryCount) {
this.rareCategoryCount = rareCategoryCount;
return this;
}

/**
 * Sets the number of dead categories, i.e. categories that will never be
 * assigned again.
 *
 * @param deadCategoryCount the value to store; not validated here
 * @return this builder, to allow call chaining
 */
public Builder setDeadCategoryCount(long deadCategoryCount) {
this.deadCategoryCount = deadCategoryCount;
return this;
}

/**
 * Sets the categorization status.
 *
 * @param categorizationStatus the status to store; must not be {@code null}
 * @return this builder, to allow call chaining
 * @throws NullPointerException if {@code categorizationStatus} is {@code null}
 */
public Builder setCategorizationStatus(CategorizationStatus categorizationStatus) {
    // requireNonNull hands back its argument, so the check and the assignment fold into one statement.
    this.categorizationStatus = Objects.requireNonNull(
        categorizationStatus, "[" + CATEGORIZATION_STATUS_FIELD.getPreferredName() + "] must not be null");
    return this;
}

public Builder setTimestamp(Date timestamp) {
this.timestamp = timestamp;
return this;
Expand All @@ -325,7 +451,8 @@ public Builder setLogTime(Date logTime) {

public ModelSizeStats build() {
return new ModelSizeStats(jobId, modelBytes, modelBytesExceeded, modelBytesMemoryLimit, totalByFieldCount, totalOverFieldCount,
totalPartitionFieldCount, bucketAllocationFailuresCount, memoryStatus, timestamp, logTime);
totalPartitionFieldCount, bucketAllocationFailuresCount, memoryStatus, categorizedDocCount, totalCategoryCount,
frequentCategoryCount, rareCategoryCount, deadCategoryCount, categorizationStatus, timestamp, logTime);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

import java.util.Date;

import static org.elasticsearch.client.ml.job.process.ModelSizeStats.CategorizationStatus;
import static org.elasticsearch.client.ml.job.process.ModelSizeStats.MemoryStatus;

public class ModelSizeStatsTests extends AbstractXContentTestCase<ModelSizeStats> {
Expand All @@ -38,6 +39,12 @@ public void testDefaultConstructor() {
assertEquals(0, stats.getTotalPartitionFieldCount());
assertEquals(0, stats.getBucketAllocationFailuresCount());
assertEquals(MemoryStatus.OK, stats.getMemoryStatus());
assertEquals(0, stats.getCategorizedDocCount());
assertEquals(0, stats.getTotalCategoryCount());
assertEquals(0, stats.getFrequentCategoryCount());
assertEquals(0, stats.getRareCategoryCount());
assertEquals(0, stats.getDeadCategoryCount());
assertEquals(CategorizationStatus.OK, stats.getCategorizationStatus());
}

public void testSetMemoryStatus_GivenNull() {
Expand Down Expand Up @@ -85,13 +92,31 @@ public static ModelSizeStats createRandomized() {
stats.setTotalPartitionFieldCount(randomNonNegativeLong());
}
if (randomBoolean()) {
stats.setLogTime(new Date(TimeValue.parseTimeValue(randomTimeValue(), "test").millis()));
stats.setMemoryStatus(randomFrom(MemoryStatus.values()));
}
if (randomBoolean()) {
stats.setTimestamp(new Date(TimeValue.parseTimeValue(randomTimeValue(), "test").millis()));
stats.setCategorizedDocCount(randomNonNegativeLong());
}
if (randomBoolean()) {
stats.setMemoryStatus(randomFrom(MemoryStatus.values()));
stats.setTotalCategoryCount(randomNonNegativeLong());
}
if (randomBoolean()) {
stats.setFrequentCategoryCount(randomNonNegativeLong());
}
if (randomBoolean()) {
stats.setRareCategoryCount(randomNonNegativeLong());
}
if (randomBoolean()) {
stats.setDeadCategoryCount(randomNonNegativeLong());
}
if (randomBoolean()) {
stats.setCategorizationStatus(randomFrom(CategorizationStatus.values()));
}
if (randomBoolean()) {
stats.setLogTime(new Date(TimeValue.parseTimeValue(randomTimeValue(), "test").millis()));
}
if (randomBoolean()) {
stats.setTimestamp(new Date(TimeValue.parseTimeValue(randomTimeValue(), "test").millis()));
}
return stats.build();
}
Expand Down
41 changes: 41 additions & 0 deletions docs/reference/ml/anomaly-detection/apis/get-job-stats.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,35 @@ model. It has the following properties:
processed due to insufficient model memory. This situation is also signified
by a `memory_status` property value of `hard_limit`.

`model_size_stats`.`categorized_doc_count`:::
(long) The number of documents that have had a field categorized.

`model_size_stats`.`categorization_status`:::
(string) The status of categorization for this job.
Contains one of the following values.
+
--
* `ok`: Categorization is performing acceptably well (or not being
used at all).
* `warn`: Categorization is detecting a distribution of categories
that suggests the input data is inappropriate for categorization.
Problems could be that there is only one category, more than 90% of
categories are rare, the number of categories is greater than 50% of
the number of categorized documents, there are no frequently
matched categories, or more than 50% of categories are dead.

--

`model_size_stats`.`dead_category_count`:::
(long) The number of categories created by categorization that will
never be assigned again because another category's definition
makes it a superset of the dead category. (Dead categories are a
side effect of the way categorization has no prior training.)

`model_size_stats`.`frequent_category_count`:::
(long) The number of categories that match more than 1% of categorized
documents.

`model_size_stats`.`job_id`:::
(string)
include::{docdir}/ml/ml-shared.asciidoc[tag=job-id-anomaly-detection]
Expand Down Expand Up @@ -226,13 +255,19 @@ this value indicates the latest size.
`model_size_stats`.`model_bytes_memory_limit`:::
(long) The upper limit for memory usage, checked on increasing values.

`model_size_stats`.`rare_category_count`:::
(long) The number of categories that match just one categorized document.

`model_size_stats`.`result_type`:::
(string) For internal use. The type of result.

`model_size_stats`.`total_by_field_count`:::
(long) The number of `by` field values that were analyzed by the models. This
value is cumulative for all detectors.

`model_size_stats`.`total_category_count`:::
(long) The number of categories created by categorization.

`model_size_stats`.`total_over_field_count`:::
(long) The number of `over` field values that were analyzed by the models. This
value is cumulative for all detectors.
Expand Down Expand Up @@ -371,6 +406,12 @@ The API returns the following results:
"total_partition_field_count" : 2,
"bucket_allocation_failures_count" : 0,
"memory_status" : "ok",
"categorized_doc_count" : 0,
"total_category_count" : 0,
"frequent_category_count" : 0,
"rare_category_count" : 0,
"dead_category_count" : 0,
"categorization_status" : "ok",
"log_time" : 1576017596000,
"timestamp" : 1580410800000
},
Expand Down
Loading

0 comments on commit 72346b9

Please sign in to comment.