Skip to content

Commit

Permalink
[ML] improve the autoscaling decider reason messages (#69227)
Browse files Browse the repository at this point in the history
It can be difficult to fully grok why a scaling decision was made.

This commit improves the messaging with additional information on two different no_scale decisions.
  • Loading branch information
benwtrent authored Feb 19, 2021
1 parent d88104c commit 623f547
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
Expand Down Expand Up @@ -222,13 +223,6 @@ private void resetScaleDownCoolDown() {
this.scaleDownDetected = NO_SCALE_DOWN_POSSIBLE;
}

/**
 * Decides whether a previously detected scale down has waited long enough to be acted upon.
 *
 * @param coolDown the delay whose {@code millis()} value must have passed since {@code scaleDownDetected}
 * @return {@code false} when {@code scaleDownDetected} still holds {@code NO_SCALE_DOWN_POSSIBLE};
 *         otherwise {@code true} once the difference between the current {@code timeSupplier} value and
 *         {@code scaleDownDetected} is at least {@code coolDown.millis()}
 */
private boolean canScaleDown(TimeValue coolDown) {
    final boolean detectionRecorded = this.scaleDownDetected != NO_SCALE_DOWN_POSSIBLE;
    // The short-circuit means the time supplier is only consulted when a detection has been recorded.
    return detectionRecorded && timeSupplier.get() - scaleDownDetected >= coolDown.millis();
}

/**
 * Reports whether no scale down detection is currently recorded.
 *
 * @return {@code true} when {@code scaleDownDetected} equals the {@code NO_SCALE_DOWN_POSSIBLE} sentinel,
 *         which is the value assigned by {@code resetScaleDownCoolDown()}
 */
private boolean newScaleDownCheck() {
return scaleDownDetected == NO_SCALE_DOWN_POSSIBLE;
}
Expand Down Expand Up @@ -325,8 +319,18 @@ public AutoscalingDeciderResult scale(Settings configuration, AutoscalingDecider
return noScaleResultOrRefresh(reasonBuilder, memoryTrackingStale, new AutoscalingDeciderResult(
context.currentCapacity(),
reasonBuilder
.setSimpleReason("Passing currently perceived capacity as there are analytics and anomaly jobs in the queue, " +
"but the number in the queue is less than the configured maximum allowed.")
.setSimpleReason(
String.format(
Locale.ROOT,
"Passing currently perceived capacity as there are [%d] analytics and [%d] anomaly jobs in the queue, "
+ "but the number in the queue is less than the configured maximum allowed. "
+ "[%d] for analytics and [%d] for anomaly jobs",
waitingAnalyticsJobs.size(),
waitingAnomalyJobs.size(),
NUM_ANALYTICS_JOBS_IN_QUEUE.get(configuration),
NUM_ANOMALY_JOBS_IN_QUEUE.get(configuration)
)
)
.build()));
}
if (mlMemoryTracker.isRecentlyRefreshed(memoryTrackingStale) == false) {
Expand Down Expand Up @@ -379,11 +383,13 @@ public AutoscalingDeciderResult scale(Settings configuration, AutoscalingDecider
checkForScaleDown(nodes, clusterState, largestJob, currentScale, reasonBuilder);

if (scaleDownDecision.isPresent()) {
final long now = timeSupplier.get();
if (newScaleDownCheck()) {
scaleDownDetected = timeSupplier.get();
scaleDownDetected = now;
}
TimeValue downScaleDelay = DOWN_SCALE_DELAY.get(configuration);
if (canScaleDown(downScaleDelay)) {
long msLeftToScale = downScaleDelay.millis() - (now - scaleDownDetected);
if (msLeftToScale <= 0) {
return scaleDownDecision.get();
}
logger.debug(() -> new ParameterizedMessage(
Expand All @@ -396,11 +402,15 @@ public AutoscalingDeciderResult scale(Settings configuration, AutoscalingDecider
context.currentCapacity(),
reasonBuilder
.setSimpleReason(
"Passing currently perceived capacity as configured down scale delay has not be satisfied; configured delay ["
+ downScaleDelay.millis()
+ "] last detected scale down event ["
+ scaleDownDetected
+ "]")
String.format(
Locale.ROOT,
"Passing currently perceived capacity as down scale delay has not be satisfied; configured delay [%s]"
+ "last detected scale down event [%s]. Will request scale down in approximately [%s]",
downScaleDelay.getStringRep(),
XContentElasticsearchExtension.DEFAULT_DATE_PRINTER.print(scaleDownDetected),
TimeValue.timeValueMillis(msLeftToScale).getStringRep()
)
)
.build());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -400,8 +400,10 @@ public void testScale_WithNoScaleUpButWaitingJobs() {
DeciderContext deciderContext = new DeciderContext(clusterState, autoscalingCapacity);

AutoscalingDeciderResult result = service.scale(settings, deciderContext);
assertThat(result.reason().summary(),
containsString("Passing currently perceived capacity as there are analytics and anomaly jobs in the queue"));
assertThat(
result.reason().summary(),
containsString("but the number in the queue is less than the configured maximum allowed")
);
assertThat(result.requiredCapacity(), equalTo(autoscalingCapacity));
}

Expand Down

0 comments on commit 623f547

Please sign in to comment.