-
Notifications
You must be signed in to change notification settings - Fork 138
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add TTL to un-deploy model automatically #2365
Changes from all commits
8c6c83d
bdb425c
d96208e
16416ac
8f7f6a9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -36,7 +36,7 @@ public class MLDeployingSettingTests { | |
|
||
private MLDeploySetting deploySettingNull; | ||
|
||
private final String expectedInputStr = "{\"is_auto_deploy_enabled\":true}"; | ||
private final String expectedInputStr = "{\"is_auto_deploy_enabled\":true,\"model_ttl_minutes\":-1}"; | ||
|
||
@Rule | ||
public ExpectedException exceptionRule = ExpectedException.none(); | ||
|
@@ -66,7 +66,7 @@ public void testToXContent() throws Exception { | |
|
||
@Test | ||
public void testToXContentIncomplete() throws Exception { | ||
final String expectedIncompleteInputStr = "{}"; | ||
final String expectedIncompleteInputStr = "{\"model_ttl_minutes\":-1}"; | ||
|
||
String jsonStr = serializationWithToXContent(deploySettingNull); | ||
assertEquals(expectedIncompleteInputStr, jsonStr); | ||
|
@@ -109,12 +109,12 @@ public void parseWithIllegalArgumentInteger() throws Exception { | |
|
||
@Test | ||
public void parseWithIllegalField() throws Exception { | ||
final String expectedInputStrWithIllegalField = "{\"is_auto_deploy_enabled\":true," + | ||
final String expectedInputStrWithIllegalField = "{\"is_auto_deploy_enabled\":true," + "\"model_ttl_hours\":0," + | ||
"\"illegal_field\":\"This field need to be skipped.\"}"; | ||
|
||
testParseFromJsonString(expectedInputStrWithIllegalField, parsedInput -> { | ||
try { | ||
assertEquals(expectedInputStr, serializationWithToXContent(parsedInput)); | ||
assertEquals("{\"is_auto_deploy_enabled\":true,\"model_ttl_minutes\":-1}", serializationWithToXContent(parsedInput)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just the opposite, with model_ttl_minutes == 0, the model will be removed in the next cron job running cycle right away. By default, without any given value for this ttl in minutes, model_ttl_minutes == -1 and it will make the model never expire. |
||
} catch (IOException e) { | ||
throw new RuntimeException(e); | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -41,6 +41,8 @@ | |
import org.opensearch.ml.common.transport.sync.MLSyncUpInput; | ||
import org.opensearch.ml.common.transport.sync.MLSyncUpNodeResponse; | ||
import org.opensearch.ml.common.transport.sync.MLSyncUpNodesRequest; | ||
import org.opensearch.ml.common.transport.undeploy.MLUndeployModelAction; | ||
import org.opensearch.ml.common.transport.undeploy.MLUndeployModelNodesRequest; | ||
import org.opensearch.ml.engine.encryptor.Encryptor; | ||
import org.opensearch.ml.engine.indices.MLIndicesHandler; | ||
import org.opensearch.search.SearchHit; | ||
|
@@ -101,8 +103,17 @@ public void run() { | |
Map<String, Set<String>> runningDeployModelTasks = new HashMap<>(); | ||
// key is model id, value is set of worker node ids | ||
Map<String, Set<String>> deployingModels = new HashMap<>(); | ||
// key is expired model_id, value is set of worker node ids | ||
Map<String, Set<String>> expiredModelToNodes = new HashMap<>(); | ||
for (MLSyncUpNodeResponse response : responses) { | ||
String nodeId = response.getNode().getId(); | ||
String[] expiredModelIds = response.getExpiredModelIds(); | ||
if (expiredModelIds != null && expiredModelIds.length > 0) { | ||
Arrays | ||
.stream(expiredModelIds) | ||
.forEach(modelId -> { expiredModelToNodes.computeIfAbsent(modelId, it -> new HashSet<>()).add(nodeId); }); | ||
} | ||
|
||
String[] deployedModelIds = response.getDeployedModelIds(); | ||
if (deployedModelIds != null && deployedModelIds.length > 0) { | ||
for (String modelId : deployedModelIds) { | ||
|
@@ -126,6 +137,17 @@ public void run() { | |
} | ||
} | ||
} | ||
|
||
Set<String> modelsToUndeploy = new HashSet<>(); | ||
for (String modelId : expiredModelToNodes.keySet()) { | ||
if (modelWorkerNodes.containsKey(modelId) | ||
&& expiredModelToNodes.get(modelId).size() == modelWorkerNodes.get(modelId).size()) { | ||
// this model has expired in all the nodes | ||
modelWorkerNodes.remove(modelId); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Cron job use modelWorkerNodes to check if model has been deployed to any nodes (check line 349 of this class). This will cause sync up cron job set model as deploy_failed. |
||
modelsToUndeploy.add(modelId); | ||
} | ||
} | ||
|
||
for (Map.Entry<String, Set<String>> entry : modelWorkerNodes.entrySet()) { | ||
String modelId = entry.getKey(); | ||
log.debug("will sync model worker nodes for model: {}: {}", modelId, entry.getValue().toArray(new String[0])); | ||
|
@@ -154,6 +176,8 @@ public void run() { | |
log.error("Failed to sync model routing", ex); | ||
}) | ||
); | ||
// Undeploy expired models | ||
undeployExpiredModels(modelsToUndeploy, modelWorkerNodes); | ||
|
||
// refresh model status | ||
mlIndicesHandler | ||
|
@@ -163,6 +187,20 @@ public void run() { | |
}, e -> { log.error("Failed to sync model routing", e); })); | ||
} | ||
|
||
private void undeployExpiredModels(Set<String> expiredModels, Map<String, Set<String>> modelWorkerNodes) { | ||
expiredModels.forEach(modelId -> { | ||
String[] targetNodeIds = modelWorkerNodes.keySet().toArray(new String[0]); | ||
|
||
MLUndeployModelNodesRequest mlUndeployModelNodesRequest = new MLUndeployModelNodesRequest( | ||
targetNodeIds, | ||
new String[] { modelId } | ||
); | ||
client.execute(MLUndeployModelAction.INSTANCE, mlUndeployModelNodesRequest, ActionListener.wrap(r -> { | ||
log.debug("model {} is un_deployed", modelId); | ||
}, e -> { log.error("Failed to undeploy model {}", modelId, e); })); | ||
}); | ||
} | ||
|
||
@VisibleForTesting | ||
void initMLConfig() { | ||
if (mlConfigInited) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,8 @@ | |
|
||
import static org.opensearch.ml.settings.MLCommonsSettings.ML_COMMONS_MONITORING_REQUEST_COUNT; | ||
|
||
import java.time.Duration; | ||
import java.time.Instant; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Map; | ||
|
@@ -68,6 +70,7 @@ public synchronized void initModelState( | |
modelCache.setFunctionName(functionName); | ||
modelCache.setTargetWorkerNodes(targetWorkerNodes); | ||
modelCache.setDeployToAllNodes(deployToAllNodes); | ||
modelCache.setLastAccessTime(Instant.now()); | ||
modelCaches.put(modelId, modelCache); | ||
} | ||
|
||
|
@@ -87,6 +90,7 @@ public synchronized void initModelStateLocal( | |
modelCache.setFunctionName(functionName); | ||
modelCache.setTargetWorkerNodes(targetWorkerNodes); | ||
modelCache.setDeployToAllNodes(false); | ||
modelCache.setLastAccessTime(Instant.now()); | ||
modelCaches.put(modelId, modelCache); | ||
} | ||
|
||
|
@@ -341,6 +345,29 @@ public String[] getLocalDeployedModels() { | |
.toArray(new String[0]); | ||
} | ||
|
||
/** | ||
* Get expired models on node. | ||
* | ||
* @return array of expired model id | ||
*/ | ||
public String[] getExpiredModels() { | ||
return modelCaches.entrySet().stream().filter(entry -> { | ||
MLModel mlModel = entry.getValue().getCachedModelInfo(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Cached model info only set when predict request comes. Check https://github.com/opensearch-project/ml-commons/pull/1472/files For a deployed model, if no predict request comes, the cached model info is null. Then the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add this line |
||
if (mlModel.getDeploySetting() == null) { | ||
return false; // no TTL, never expire | ||
} | ||
Duration liveDuration = Duration.between(entry.getValue().getLastAccessTime(), Instant.now()); | ||
Long ttlInMinutes = mlModel.getDeploySetting().getModelTTLInMinutes(); | ||
if (ttlInMinutes < 0) { | ||
return false; | ||
} | ||
Duration ttl = Duration.ofMinutes(ttlInMinutes); | ||
boolean isModelExpired = liveDuration.getSeconds() >= ttl.getSeconds(); | ||
return isModelExpired | ||
&& (mlModel.getModelState() == MLModelState.DEPLOYED || mlModel.getModelState() == MLModelState.PARTIALLY_DEPLOYED); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we care the model status for model TTL ? For example, if we have some bug, and model stays on DEPLOY_FAILED, I think TTL could be a good way to remove model to avoid memory leak. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The Deploy_Failed may indicate some insights to the cluster or ml service. Auto undeploying the model won't fix the problem but only hide it. We shouldn't use this as a tool to clear the failure status? Otherwise, the model will likely be in Failed status when it's deployed again. Let's be conservative in the first release for this mechanism There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We have auto deploy, if model deploy stuck there, why shouldn't we use TTL to undeploy model to remove the unnecessary memory usage? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's doable. But is it hiding problems that may bite us more in the future? When the deploy stuck, why shouldn't we look into the reasons first and then un-deploy if it's the real solution? Also the auto deploy is only for remote model which takes little memory. So there might be other different reasons for the deployment failure like the one I showed in the meeting? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Make sense, you can add a todo and test more |
||
}).map(entry -> entry.getKey()).collect(Collectors.toList()).toArray(new String[0]); | ||
} | ||
|
||
/** | ||
* Check if model is running on node. | ||
* | ||
|
@@ -403,6 +430,16 @@ public void setTargetWorkerNodes(String modelId, List<String> targetWorkerNodes) | |
} | ||
} | ||
|
||
/** | ||
* Set the last access time to Instant.now() | ||
* | ||
* @param modelId model id | ||
*/ | ||
public void refreshLastAccessTime(String modelId) { | ||
MLModelCache modelCache = modelCaches.get(modelId); | ||
modelCache.setLastAccessTime(Instant.now()); | ||
} | ||
|
||
/** | ||
* Remove model. | ||
* | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why separate hours and minutes? Seems use minutes should be enough.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Initially only used Hours. But it may take too much time to test expiration in hours. Having both hours and minutes cover most cases. Most likely only hours will be used, e.g. 24 hours, 72 hours (6 days), etc.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not just use minutes ? 24 hours will be 24 * 60,
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It should be easier to set a longer time in hours? For 1 week of 7 days, it would be 10080 minutes, but this is not an obvious number to understand the time length.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It doesn't require both hours and minutes, only 1 of them will create a valid deploySetting. If both of them are null, default values are -1 and this model will never expire.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would suggest just use the minimal time unit for this, that could be easier. If you add hours, why not add days and months?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.