Skip to content

Commit

Permalink
[8.4][ML] Validate trained model deployment queue_capacity limit (#89611
Browse files Browse the repository at this point in the history
)

When starting a trained model deployment, a queue is created.
If the queue_capacity is too large, it can lead to OOM and a node
crash.

This commit adds validation that the queue_capacity cannot be more
than 1M.

Closes #89555
  • Loading branch information
dimitris-athanasiou authored Aug 25, 2022
1 parent 85d40b6 commit 571d611
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 3 deletions.
6 changes: 6 additions & 0 deletions docs/changelog/89611.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 89611
summary: "[ML] Validate trained model deployment `queue_capacity` limit"
area: Machine Learning
type: bug
issues:
- 89555
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ Defaults to 1.
Controls how many inference requests are allowed in the queue at a time.
Every machine learning node in the cluster where the model can be allocated
has a queue of this size; when the number of requests exceeds the total value,
new requests are rejected with a 429 error. Defaults to 1024.
new requests are rejected with a 429 error. Defaults to 1024. Max allowed value is 1000000.

`threads_per_allocation`::
(Optional, integer)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ public static class Request extends MasterNodeRequest<Request> implements ToXCon
AllocationStatus.State.FULLY_ALLOCATED };

private static final int MAX_THREADS_PER_ALLOCATION = 32;
/**
* If the queue capacity is too large then we can OOM when we create the queue.
*/
private static final int MAX_QUEUE_CAPACITY = 1_000_000;

public static final ParseField MODEL_ID = new ParseField("model_id");
public static final ParseField TIMEOUT = new ParseField("timeout");
Expand Down Expand Up @@ -248,6 +252,9 @@ public ActionRequestValidationException validate() {
if (queueCapacity < 1) {
validationException.addValidationError("[" + QUEUE_CAPACITY + "] must be a positive integer");
}
if (queueCapacity > MAX_QUEUE_CAPACITY) {
validationException.addValidationError("[" + QUEUE_CAPACITY + "] must be less than " + MAX_QUEUE_CAPACITY);
}
return validationException.validationErrors().isEmpty() ? null : validationException;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,13 @@ public static Request createRandom() {
request.setWaitForState(randomFrom(AllocationStatus.State.values()));
}
if (randomBoolean()) {
request.setThreadsPerAllocation(randomIntBetween(1, 8));
request.setThreadsPerAllocation(randomFrom(1, 2, 4, 8, 16, 32));
}
if (randomBoolean()) {
request.setNumberOfAllocations(randomIntBetween(1, 8));
}
if (randomBoolean()) {
request.setQueueCapacity(randomIntBetween(1, 10000));
request.setQueueCapacity(randomIntBetween(1, 1000000));
}
return request;
}
Expand Down Expand Up @@ -150,6 +150,25 @@ public void testValidate_GivenQueueCapacityIsNegative() {
assertThat(e.getMessage(), containsString("[queue_capacity] must be a positive integer"));
}

/**
 * A request whose queue capacity is set to exactly 1,000,000 must pass
 * validation, i.e. {@code validate()} returns {@code null}.
 */
public void testValidate_GivenQueueCapacityIsAtLimit() {
    Request atLimit = createRandom();
    atLimit.setQueueCapacity(1_000_000);

    ActionRequestValidationException validationError = atLimit.validate();

    assertThat(validationError, is(nullValue()));
}

/**
 * A request whose queue capacity is set to 1,000,001 must fail validation
 * with an error message that names the {@code queue_capacity} field.
 */
public void testValidate_GivenQueueCapacityIsOverLimit() {
    Request overLimit = createRandom();
    overLimit.setQueueCapacity(1_000_001);

    ActionRequestValidationException validationError = overLimit.validate();

    // A non-null result means at least one validation error was recorded.
    assertThat(validationError, is(not(nullValue())));
    assertThat(validationError.getMessage(), containsString("[queue_capacity] must be less than 1000000"));
}

public void testDefaults() {
Request request = new Request(randomAlphaOfLength(10));
assertThat(request.getTimeout(), equalTo(TimeValue.timeValueSeconds(20)));
Expand Down

0 comments on commit 571d611

Please sign in to comment.