-
Notifications
You must be signed in to change notification settings - Fork 140
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
renaming metrics #1224
renaming metrics #1224
Changes from 10 commits
29d6398
1b703f9
8439f45
30b1761
1a057eb
7e3b4de
42f2c8c
04e2106
569d8fc
62b0e7b
0b61450
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -212,7 +212,7 @@ public MLModelManager( | |
public void registerModelMeta(MLRegisterModelMetaInput mlRegisterModelMetaInput, ActionListener<String> listener) { | ||
try { | ||
FunctionName functionName = mlRegisterModelMetaInput.getFunctionName(); | ||
mlStats.getStat(MLNodeLevelStat.ML_NODE_TOTAL_REQUEST_COUNT).increment(); | ||
mlStats.getStat(MLNodeLevelStat.ML_REQUEST_COUNT).increment(); | ||
mlStats.createCounterStatIfAbsent(functionName, REGISTER, ML_ACTION_REQUEST_COUNT).increment(); | ||
String modelGroupId = mlRegisterModelMetaInput.getModelGroupId(); | ||
if (Strings.isBlank(modelGroupId)) { | ||
|
@@ -322,9 +322,9 @@ public void registerMLModel(MLRegisterModelInput registerModelInput, MLTask mlTa | |
|
||
checkAndAddRunningTask(mlTask, maxRegisterTasksPerNode); | ||
try { | ||
mlStats.getStat(MLNodeLevelStat.ML_NODE_TOTAL_REQUEST_COUNT).increment(); | ||
mlStats.getStat(MLNodeLevelStat.ML_NODE_EXECUTING_TASK_COUNT).increment(); | ||
mlStats.getStat(MLNodeLevelStat.ML_REQUEST_COUNT).increment(); | ||
mlStats.createCounterStatIfAbsent(mlTask.getFunctionName(), REGISTER, ML_ACTION_REQUEST_COUNT).increment(); | ||
mlStats.getStat(MLNodeLevelStat.ML_EXECUTING_TASK_COUNT).increment(); | ||
|
||
String modelGroupId = registerModelInput.getModelGroupId(); | ||
GetRequest getModelGroupRequest = new GetRequest(ML_MODEL_GROUP_INDEX).id(modelGroupId); | ||
|
@@ -384,17 +384,14 @@ public void registerMLModel(MLRegisterModelInput registerModelInput, MLTask mlTa | |
} catch (Exception e) { | ||
handleException(registerModelInput.getFunctionName(), mlTask.getTaskId(), e); | ||
} finally { | ||
mlStats.getStat(MLNodeLevelStat.ML_NODE_EXECUTING_TASK_COUNT).increment(); | ||
mlStats.getStat(MLNodeLevelStat.ML_EXECUTING_TASK_COUNT).decrement(); | ||
} | ||
} | ||
|
||
private void indexRemoteModel(MLRegisterModelInput registerModelInput, MLTask mlTask, String modelVersion) { | ||
String taskId = mlTask.getTaskId(); | ||
FunctionName functionName = mlTask.getFunctionName(); | ||
try (ThreadContext.StoredContext context = client.threadPool().getThreadContext().stashContext()) { | ||
mlStats.getStat(MLNodeLevelStat.ML_NODE_TOTAL_REQUEST_COUNT).increment(); | ||
mlStats.createCounterStatIfAbsent(functionName, REGISTER, ML_ACTION_REQUEST_COUNT).increment(); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This line tracks the number of register requests at the function level. If we remove it, can we still track that? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, because we are already tracking this in the parent function. |
||
mlStats.getStat(MLNodeLevelStat.ML_NODE_EXECUTING_TASK_COUNT).increment(); | ||
|
||
String modelName = registerModelInput.getModelName(); | ||
String version = modelVersion == null ? registerModelInput.getVersion() : modelVersion; | ||
|
@@ -443,8 +440,6 @@ private void indexRemoteModel(MLRegisterModelInput registerModelInput, MLTask ml | |
} catch (Exception e) { | ||
logException("Failed to upload model", e, log); | ||
handleException(functionName, taskId, e); | ||
} finally { | ||
mlStats.getStat(MLNodeLevelStat.ML_NODE_EXECUTING_TASK_COUNT).increment(); | ||
} | ||
} | ||
|
||
|
@@ -462,9 +457,6 @@ private void registerModelFromUrl(MLRegisterModelInput registerModelInput, MLTas | |
String taskId = mlTask.getTaskId(); | ||
FunctionName functionName = mlTask.getFunctionName(); | ||
try (ThreadContext.StoredContext context = client.threadPool().getThreadContext().stashContext()) { | ||
mlStats.getStat(MLNodeLevelStat.ML_NODE_TOTAL_REQUEST_COUNT).increment(); | ||
mlStats.createCounterStatIfAbsent(functionName, REGISTER, ML_ACTION_REQUEST_COUNT).increment(); | ||
mlStats.getStat(MLNodeLevelStat.ML_NODE_EXECUTING_TASK_COUNT).increment(); | ||
String modelName = registerModelInput.getModelName(); | ||
String version = modelVersion == null ? registerModelInput.getVersion() : modelVersion; | ||
String modelGroupId = registerModelInput.getModelGroupId(); | ||
|
@@ -509,8 +501,6 @@ private void registerModelFromUrl(MLRegisterModelInput registerModelInput, MLTas | |
} catch (Exception e) { | ||
logException("Failed to register model", e, log); | ||
handleException(functionName, taskId, e); | ||
} finally { | ||
mlStats.getStat(MLNodeLevelStat.ML_NODE_EXECUTING_TASK_COUNT).increment(); | ||
} | ||
} | ||
|
||
|
@@ -693,7 +683,7 @@ private void handleException(FunctionName functionName, String taskId, Exception | |
&& !(e instanceof MLResourceNotFoundException) | ||
&& !(e instanceof IllegalArgumentException)) { | ||
mlStats.createCounterStatIfAbsent(functionName, REGISTER, MLActionLevelStat.ML_ACTION_FAILURE_COUNT).increment(); | ||
mlStats.getStat(MLNodeLevelStat.ML_NODE_TOTAL_FAILURE_COUNT).increment(); | ||
mlStats.getStat(MLNodeLevelStat.ML_FAILURE_COUNT).increment(); | ||
} | ||
Map<String, Object> updated = ImmutableMap.of(ERROR_FIELD, MLExceptionUtils.getRootCauseMessage(e), STATE_FIELD, FAILED); | ||
mlTaskManager.updateMLTask(taskId, updated, TIMEOUT_IN_MILLIS, true); | ||
|
@@ -718,7 +708,8 @@ public void deployModel( | |
ActionListener<String> listener | ||
) { | ||
mlStats.createCounterStatIfAbsent(functionName, ActionName.DEPLOY, ML_ACTION_REQUEST_COUNT).increment(); | ||
mlStats.getStat(MLNodeLevelStat.ML_NODE_TOTAL_REQUEST_COUNT).increment(); | ||
mlStats.getStat(MLNodeLevelStat.ML_EXECUTING_TASK_COUNT).increment(); | ||
mlStats.getStat(MLNodeLevelStat.ML_REQUEST_COUNT).increment(); | ||
List<String> workerNodes = mlTask.getWorkerNodes(); | ||
if (modelCacheHelper.isModelDeployed(modelId)) { | ||
if (workerNodes != null && workerNodes.size() > 0) { | ||
|
@@ -800,7 +791,7 @@ public void deployModel( | |
MLExecutable mlExecutable = mlEngine.deployExecute(mlModel, params); | ||
try { | ||
modelCacheHelper.setMLExecutor(modelId, mlExecutable); | ||
mlStats.getStat(MLNodeLevelStat.ML_NODE_TOTAL_MODEL_COUNT).increment(); | ||
mlStats.getStat(MLNodeLevelStat.ML_DEPLOYED_MODEL_COUNT).increment(); | ||
modelCacheHelper.setModelState(modelId, MLModelState.DEPLOYED); | ||
listener.onResponse("successful"); | ||
} catch (Exception e) { | ||
|
@@ -813,7 +804,7 @@ public void deployModel( | |
Predictable predictable = mlEngine.deploy(mlModel, params); | ||
try { | ||
modelCacheHelper.setPredictor(modelId, predictable); | ||
mlStats.getStat(MLNodeLevelStat.ML_NODE_TOTAL_MODEL_COUNT).increment(); | ||
mlStats.getStat(MLNodeLevelStat.ML_DEPLOYED_MODEL_COUNT).increment(); | ||
modelCacheHelper.setModelState(modelId, MLModelState.DEPLOYED); | ||
Long modelContentSizeInBytes = mlModel.getModelContentSizeInBytes(); | ||
long contentSize = modelContentSizeInBytes == null | ||
|
@@ -837,6 +828,8 @@ public void deployModel( | |
}))); | ||
} catch (Exception e) { | ||
handleDeployModelException(modelId, functionName, listener, e); | ||
} finally { | ||
mlStats.getStat(MLNodeLevelStat.ML_EXECUTING_TASK_COUNT).decrement(); | ||
} | ||
} | ||
|
||
|
@@ -846,7 +839,7 @@ private void handleDeployModelException(String modelId, FunctionName functionNam | |
&& !(e instanceof MLResourceNotFoundException) | ||
&& !(e instanceof IllegalArgumentException)) { | ||
mlStats.createCounterStatIfAbsent(functionName, ActionName.DEPLOY, MLActionLevelStat.ML_ACTION_FAILURE_COUNT).increment(); | ||
mlStats.getStat(MLNodeLevelStat.ML_NODE_TOTAL_FAILURE_COUNT).increment(); | ||
mlStats.getStat(MLNodeLevelStat.ML_FAILURE_COUNT).increment(); | ||
} | ||
removeModel(modelId); | ||
listener.onFailure(e); | ||
|
@@ -855,7 +848,7 @@ private void handleDeployModelException(String modelId, FunctionName functionNam | |
private void setupPredictable(String modelId, MLModel mlModel, Map<String, Object> params) { | ||
Predictable predictable = mlEngine.deploy(mlModel, params); | ||
modelCacheHelper.setPredictor(modelId, predictable); | ||
mlStats.getStat(MLNodeLevelStat.ML_NODE_TOTAL_MODEL_COUNT).increment(); | ||
mlStats.getStat(MLNodeLevelStat.ML_DEPLOYED_MODEL_COUNT).increment(); | ||
modelCacheHelper.setModelState(modelId, MLModelState.DEPLOYED); | ||
} | ||
|
||
|
@@ -1056,8 +1049,8 @@ public synchronized Map<String, String> undeployModel(String[] modelIds) { | |
for (String modelId : modelIds) { | ||
if (modelCacheHelper.isModelDeployed(modelId)) { | ||
modelUndeployStatus.put(modelId, UNDEPLOYED); | ||
mlStats.getStat(MLNodeLevelStat.ML_NODE_TOTAL_MODEL_COUNT).decrement(); | ||
mlStats.getStat(MLNodeLevelStat.ML_NODE_TOTAL_REQUEST_COUNT).increment(); | ||
mlStats.getStat(MLNodeLevelStat.ML_DEPLOYED_MODEL_COUNT).decrement(); | ||
mlStats.getStat(MLNodeLevelStat.ML_REQUEST_COUNT).increment(); | ||
mlStats | ||
.createCounterStatIfAbsent(getModelFunctionName(modelId), ActionName.UNDEPLOY, ML_ACTION_REQUEST_COUNT) | ||
.increment(); | ||
|
@@ -1070,7 +1063,7 @@ public synchronized Map<String, String> undeployModel(String[] modelIds) { | |
log.debug("undeploy all models {}", Arrays.toString(getLocalDeployedModels())); | ||
for (String modelId : getLocalDeployedModels()) { | ||
modelUndeployStatus.put(modelId, UNDEPLOYED); | ||
mlStats.getStat(MLNodeLevelStat.ML_NODE_TOTAL_MODEL_COUNT).decrement(); | ||
mlStats.getStat(MLNodeLevelStat.ML_DEPLOYED_MODEL_COUNT).decrement(); | ||
mlStats.createCounterStatIfAbsent(getModelFunctionName(modelId), ActionName.UNDEPLOY, ML_ACTION_REQUEST_COUNT).increment(); | ||
removeModel(modelId); | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,12 +10,13 @@ | |
* This enum represents node level stats. | ||
*/ | ||
public enum MLNodeLevelStat { | ||
ML_NODE_JVM_HEAP_USAGE, | ||
ML_NODE_EXECUTING_TASK_COUNT, | ||
ML_NODE_TOTAL_REQUEST_COUNT, | ||
ML_NODE_TOTAL_FAILURE_COUNT, | ||
ML_NODE_TOTAL_MODEL_COUNT, | ||
ML_NODE_TOTAL_CIRCUIT_BREAKER_TRIGGER_COUNT; | ||
ML_JVM_HEAP_USAGE, | ||
ML_EXECUTING_TASK_COUNT, // How many tasks are executing currently. If any task starts, then it will be 1, if the task finished then it | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
// will get back to 0. | ||
ML_REQUEST_COUNT, | ||
ML_FAILURE_COUNT, | ||
ML_DEPLOYED_MODEL_COUNT, | ||
ML_CIRCUIT_BREAKER_TRIGGER_COUNT; | ||
|
||
public static MLNodeLevelStat from(String value) { | ||
try { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why remove this line?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We are already counting this in the `registerMLModel` function in the `MLModelManager` class.