From b68034be4e0e7e0d9875b0331207fec7959ed1a6 Mon Sep 17 00:00:00 2001 From: Yaliang Wu Date: Fri, 30 Dec 2022 17:04:04 -0800 Subject: [PATCH 1/3] change task worker node to list; add target worker node to cache Signed-off-by: Yaliang Wu --- .../java/org/opensearch/ml/common/MLTask.java | 26 ++++++++++------- .../org/opensearch/ml/common/MLTaskTests.java | 5 ++-- .../transport/forward/MLForwardInputTest.java | 3 +- .../forward/MLForwardRequestTest.java | 3 +- .../transport/load/LoadModelInputTest.java | 3 +- .../load/LoadModelNodesRequestTest.java | 3 +- .../transport/task/MLTaskGetResponseTest.java | 7 +++-- .../action/load/TransportLoadModelAction.java | 5 ++-- .../upload/TransportUploadModelAction.java | 5 ++-- .../org/opensearch/ml/model/MLModelCache.java | 15 ++++++++++ .../ml/model/MLModelCacheHelper.java | 8 +++++- .../opensearch/ml/model/MLModelManager.java | 2 +- .../opensearch/ml/profile/MLModelProfile.java | 8 ++++++ .../ml/task/MLPredictTaskRunner.java | 4 ++- .../ml/task/MLTrainAndPredictTaskRunner.java | 4 ++- .../ml/task/MLTrainingTaskRunner.java | 4 ++- .../TransportLoadModelOnNodeActionTests.java | 2 +- .../profile/MLProfileNodeResponseTests.java | 3 +- .../profile/MLProfileResponseTests.java | 5 ++-- .../MLProfileTransportActionTests.java | 2 +- .../TransportUploadModelActionTests.java | 5 +++- .../ml/model/MLModelCacheHelperTests.java | 28 +++++++++++-------- .../ml/rest/RestMLProfileActionTests.java | 4 ++- 23 files changed, 107 insertions(+), 47 deletions(-) diff --git a/common/src/main/java/org/opensearch/ml/common/MLTask.java b/common/src/main/java/org/opensearch/ml/common/MLTask.java index 47de92c706..be5b7269de 100644 --- a/common/src/main/java/org/opensearch/ml/common/MLTask.java +++ b/common/src/main/java/org/opensearch/ml/common/MLTask.java @@ -20,6 +20,8 @@ import java.io.IOException; import java.time.Instant; +import java.util.ArrayList; +import java.util.List; import static org.opensearch.common.xcontent.XContentParserUtils.ensureExpectedToken; import static org.opensearch.ml.common.CommonValue.USER; @@ -54,7 +56,7 @@ public class MLTask implements ToXContentObject, Writeable { private Float progress; private final String outputIndex; @Setter - private String workerNode; + private List workerNodes; private final Instant createTime; private Instant lastUpdateTime; @Setter @@ -72,7 +74,7 @@ public MLTask( MLInputDataType inputType, Float progress, String outputIndex, - String workerNode, + List workerNodes, Instant createTime, Instant lastUpdateTime, String error, @@ -87,7 +89,7 @@ public MLTask( this.inputType = inputType; this.progress = progress; this.outputIndex = outputIndex; - this.workerNode = workerNode; + this.workerNodes = workerNodes; this.createTime = createTime; this.lastUpdateTime = lastUpdateTime; this.error = error; @@ -108,7 +110,7 @@ public MLTask(StreamInput input) throws IOException { } this.progress = input.readOptionalFloat(); this.outputIndex = input.readOptionalString(); - this.workerNode = input.readString(); + this.workerNodes = input.readStringList(); this.createTime = input.readInstant(); this.lastUpdateTime = input.readInstant(); this.error = input.readOptionalString(); @@ -135,7 +137,7 @@ public void writeTo(StreamOutput out) throws IOException { } out.writeOptionalFloat(progress); out.writeOptionalString(outputIndex); - out.writeString(workerNode); + out.writeStringCollection(workerNodes); out.writeInstant(createTime); out.writeInstant(lastUpdateTime); out.writeOptionalString(error); @@ -174,8 +176,8 @@ public XContentBuilder toXContent(XContentBuilder xContentBuilder, Params params if (outputIndex != null) { builder.field(OUTPUT_INDEX_FIELD, outputIndex); } - if (workerNode != null) { - builder.field(WORKER_NODE_FIELD, workerNode); + if (workerNodes != null) { + builder.field(WORKER_NODE_FIELD, workerNodes); } if (createTime != null) { builder.field(CREATE_TIME_FIELD, createTime.toEpochMilli()); @@ -207,7 +209,7 @@ public static MLTask parse(XContentParser parser) throws IOException { MLInputDataType inputType = null; Float progress = null; String outputIndex = null; - String workerNode = null; + List workerNodes = null; Instant createTime = null; Instant lastUpdateTime = null; String error = null; @@ -245,7 +247,11 @@ public static MLTask parse(XContentParser parser) throws IOException { outputIndex = parser.text(); break; case WORKER_NODE_FIELD: - workerNode = parser.text(); + workerNodes = new ArrayList<>(); + ensureExpectedToken(XContentParser.Token.START_ARRAY, parser.currentToken(), parser); + while (parser.nextToken() != XContentParser.Token.END_ARRAY) { + workerNodes.add(parser.text()); + } break; case CREATE_TIME_FIELD: createTime = Instant.ofEpochMilli(parser.longValue()); @@ -276,7 +282,7 @@ public static MLTask parse(XContentParser parser) throws IOException { .inputType(inputType) .progress(progress) .outputIndex(outputIndex) - .workerNode(workerNode) + .workerNodes(workerNodes) .createTime(createTime) .lastUpdateTime(lastUpdateTime) .error(error) diff --git a/common/src/test/java/org/opensearch/ml/common/MLTaskTests.java b/common/src/test/java/org/opensearch/ml/common/MLTaskTests.java index 5768acdd7a..66cadfcbf0 100644 --- a/common/src/test/java/org/opensearch/ml/common/MLTaskTests.java +++ b/common/src/test/java/org/opensearch/ml/common/MLTaskTests.java @@ -8,6 +8,7 @@ import java.io.IOException; import java.time.Instant; import java.time.temporal.ChronoUnit; +import java.util.Arrays; import org.junit.Assert; import org.junit.Before; @@ -32,7 +33,7 @@ public void setup() { .functionName(FunctionName.KMEANS) .state(MLTaskState.RUNNING) .inputType(MLInputDataType.DATA_FRAME) - .workerNode("node1") + .workerNodes(Arrays.asList("node1")) .progress(0.0f) .outputIndex("test_index") .error("test_error") @@ -57,7 +58,7 @@ public void toXContent() throws IOException { Assert.assertEquals( "{\"task_id\":\"dummy taskId\",\"model_id\":\"test_model_id\",\"task_type\":\"PREDICTION\"," + "\"function_name\":\"KMEANS\",\"state\":\"RUNNING\",\"input_type\":\"DATA_FRAME\",\"progress\":0.0," - + "\"output_index\":\"test_index\",\"worker_node\":\"node1\",\"create_time\":1641599940000," + + "\"output_index\":\"test_index\",\"worker_node\":[\"node1\"],\"create_time\":1641599940000," + "\"last_update_time\":1641600000000,\"error\":\"test_error\",\"is_async\":false}", taskContent ); diff --git a/common/src/test/java/org/opensearch/ml/common/transport/forward/MLForwardInputTest.java b/common/src/test/java/org/opensearch/ml/common/transport/forward/MLForwardInputTest.java index 11c287a521..d870265702 100644 --- a/common/src/test/java/org/opensearch/ml/common/transport/forward/MLForwardInputTest.java +++ b/common/src/test/java/org/opensearch/ml/common/transport/forward/MLForwardInputTest.java @@ -24,6 +24,7 @@ import java.io.IOException; import java.time.Instant; import java.time.temporal.ChronoUnit; +import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.function.Consumer; @@ -48,7 +49,7 @@ public void setUp() throws Exception { .functionName(functionName) .state(MLTaskState.RUNNING) .inputType(MLInputDataType.DATA_FRAME) - .workerNode("mlTaskNode1") + .workerNodes(Arrays.asList("mlTaskNode1")) .progress(0.0f) .outputIndex("test_index") .error("test_error") diff --git a/common/src/test/java/org/opensearch/ml/common/transport/forward/MLForwardRequestTest.java b/common/src/test/java/org/opensearch/ml/common/transport/forward/MLForwardRequestTest.java index 690e3ae274..f79900a1ca 100644 --- a/common/src/test/java/org/opensearch/ml/common/transport/forward/MLForwardRequestTest.java +++ b/common/src/test/java/org/opensearch/ml/common/transport/forward/MLForwardRequestTest.java @@ -27,6 +27,7 @@ import java.io.UncheckedIOException; import java.time.Instant; import java.time.temporal.ChronoUnit; +import java.util.Arrays; import java.util.Collections; import java.util.HashMap; @@ -52,7 +53,7 @@ public void setUp() throws Exception { .functionName(functionName) .state(MLTaskState.RUNNING) .inputType(MLInputDataType.DATA_FRAME) - .workerNode("mlTaskNode1") + .workerNodes(Arrays.asList("mlTaskNode1")) .progress(0.0f) .outputIndex("test_index") .error("test_error") diff --git a/common/src/test/java/org/opensearch/ml/common/transport/load/LoadModelInputTest.java b/common/src/test/java/org/opensearch/ml/common/transport/load/LoadModelInputTest.java index c70f3e5e1c..be626d5576 100644 --- a/common/src/test/java/org/opensearch/ml/common/transport/load/LoadModelInputTest.java +++ b/common/src/test/java/org/opensearch/ml/common/transport/load/LoadModelInputTest.java @@ -19,6 +19,7 @@ import java.io.IOException; import java.time.Instant; import java.time.temporal.ChronoUnit; +import java.util.Arrays; import java.util.Collections; import java.util.HashMap; @@ -44,7 +45,7 @@ public void setUp() throws Exception { .functionName(FunctionName.LINEAR_REGRESSION) .state(MLTaskState.RUNNING) .inputType(MLInputDataType.DATA_FRAME) - .workerNode("node1") + .workerNodes(Arrays.asList("node1")) .progress(0.0f) .outputIndex("test_index") .error("test_error") diff --git a/common/src/test/java/org/opensearch/ml/common/transport/load/LoadModelNodesRequestTest.java b/common/src/test/java/org/opensearch/ml/common/transport/load/LoadModelNodesRequestTest.java index 157852a691..ff224de006 100644 --- a/common/src/test/java/org/opensearch/ml/common/transport/load/LoadModelNodesRequestTest.java +++ b/common/src/test/java/org/opensearch/ml/common/transport/load/LoadModelNodesRequestTest.java @@ -20,6 +20,7 @@ import java.net.InetAddress; import java.time.Instant; import java.time.temporal.ChronoUnit; +import java.util.Arrays; import java.util.Collections; import static org.junit.Assert.*; @@ -70,7 +71,7 @@ public void setUp() throws Exception { .functionName(FunctionName.LINEAR_REGRESSION) .state(MLTaskState.RUNNING) .inputType(MLInputDataType.DATA_FRAME) - .workerNode("node1") + .workerNodes(Arrays.asList("node1")) .progress(0.0f) .outputIndex("test_index") .error("test_error") diff --git a/common/src/test/java/org/opensearch/ml/common/transport/task/MLTaskGetResponseTest.java b/common/src/test/java/org/opensearch/ml/common/transport/task/MLTaskGetResponseTest.java index 03458295b2..8dad80d860 100644 --- a/common/src/test/java/org/opensearch/ml/common/transport/task/MLTaskGetResponseTest.java +++ b/common/src/test/java/org/opensearch/ml/common/transport/task/MLTaskGetResponseTest.java @@ -17,6 +17,7 @@ import java.io.IOException; import java.time.Instant; +import java.util.Arrays; import static org.junit.Assert.*; import static org.junit.Assert.assertEquals; @@ -35,7 +36,7 @@ public void setUp() { .inputType(MLInputDataType.DATA_FRAME) .progress(1.3f) .outputIndex("some index") - .workerNode("some node") + .workerNodes(Arrays.asList("some node")) .createTime(Instant.ofEpochMilli(123)) .lastUpdateTime(Instant.ofEpochMilli(123)) .error("error") @@ -59,7 +60,7 @@ public void writeTo_Success() throws IOException { assertEquals(response.mlTask.getInputType(), parsedResponse.mlTask.getInputType()); assertEquals(response.mlTask.getProgress(), parsedResponse.mlTask.getProgress()); assertEquals(response.mlTask.getOutputIndex(), parsedResponse.mlTask.getOutputIndex()); - assertEquals(response.mlTask.getWorkerNode(), parsedResponse.mlTask.getWorkerNode()); + assertEquals(response.mlTask.getWorkerNodes(), parsedResponse.mlTask.getWorkerNodes()); assertEquals(response.mlTask.getCreateTime(), parsedResponse.mlTask.getCreateTime()); assertEquals(response.mlTask.getLastUpdateTime(), parsedResponse.mlTask.getLastUpdateTime()); assertEquals(response.mlTask.getError(), parsedResponse.mlTask.getError()); @@ -80,7 +81,7 @@ public void toXContentTest() throws IOException { "\"input_type\":\"DATA_FRAME\"," + "\"progress\":1.3," + "\"output_index\":\"some index\"," + - "\"worker_node\":\"some node\"," + + "\"worker_node\":[\"some node\"]," + "\"create_time\":123," + "\"last_update_time\":123," + "\"error\":\"error\"," + diff --git a/plugin/src/main/java/org/opensearch/ml/action/load/TransportLoadModelAction.java b/plugin/src/main/java/org/opensearch/ml/action/load/TransportLoadModelAction.java index 1549d0f34d..f9e5671bb8 100644 --- a/plugin/src/main/java/org/opensearch/ml/action/load/TransportLoadModelAction.java +++ b/plugin/src/main/java/org/opensearch/ml/action/load/TransportLoadModelAction.java @@ -137,8 +137,7 @@ protected void doExecute(Task task, ActionRequest request, ActionListener { String taskId = response.getId(); diff --git a/plugin/src/main/java/org/opensearch/ml/action/upload/TransportUploadModelAction.java b/plugin/src/main/java/org/opensearch/ml/action/upload/TransportUploadModelAction.java index b3d553d795..5dd5030ee8 100644 --- a/plugin/src/main/java/org/opensearch/ml/action/upload/TransportUploadModelAction.java +++ b/plugin/src/main/java/org/opensearch/ml/action/upload/TransportUploadModelAction.java @@ -52,6 +52,7 @@ import org.opensearch.threadpool.ThreadPool; import org.opensearch.transport.TransportService; +import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; @Log4j2 @@ -127,12 +128,12 @@ protected void doExecute(Task task, ActionRequest request, ActionListener { String nodeId = node.getId(); - mlTask.setWorkerNode(nodeId); + mlTask.setWorkerNodes(ImmutableList.of(nodeId)); mlTaskManager.createMLTask(mlTask, ActionListener.wrap(response -> { String taskId = response.getId(); diff --git a/plugin/src/main/java/org/opensearch/ml/model/MLModelCache.java b/plugin/src/main/java/org/opensearch/ml/model/MLModelCache.java index 874def5a01..9f2b9a9b84 100644 --- a/plugin/src/main/java/org/opensearch/ml/model/MLModelCache.java +++ b/plugin/src/main/java/org/opensearch/ml/model/MLModelCache.java @@ -6,6 +6,7 @@ package org.opensearch.ml.model; import java.util.DoubleSummaryStatistics; +import java.util.List; import java.util.Queue; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; @@ -29,16 +30,30 @@ public class MLModelCache { private @Setter(AccessLevel.PROTECTED) @Getter(AccessLevel.PROTECTED) MLModelState modelState; private @Setter(AccessLevel.PROTECTED) @Getter(AccessLevel.PROTECTED) FunctionName functionName; private @Setter(AccessLevel.PROTECTED) @Getter(AccessLevel.PROTECTED) Predictable predictor; + private @Getter(AccessLevel.PROTECTED) Set targetWorkerNodes; private final Set workerNodes; private final Queue modelInferenceDurationQueue; private final Queue predictRequestDurationQueue; public MLModelCache() { + targetWorkerNodes = ConcurrentHashMap.newKeySet(); workerNodes = ConcurrentHashMap.newKeySet(); modelInferenceDurationQueue = new ConcurrentLinkedQueue<>(); predictRequestDurationQueue = new ConcurrentLinkedQueue<>(); } + public void setTargetWorkerNodes(List targetWorkerNodes) { + if (targetWorkerNodes == null || targetWorkerNodes.size() == 0) { + throw new IllegalArgumentException("Null or empty target worker nodes"); + } + this.targetWorkerNodes.clear(); + this.targetWorkerNodes.addAll(targetWorkerNodes); + } + + public String[] getTargetWorkerNodes() { + return targetWorkerNodes.toArray(new String[0]); + } + public void removeWorkerNode(String nodeId) { workerNodes.remove(nodeId); } diff --git a/plugin/src/main/java/org/opensearch/ml/model/MLModelCacheHelper.java b/plugin/src/main/java/org/opensearch/ml/model/MLModelCacheHelper.java index 9c3ac29875..081328e55c 100644 --- a/plugin/src/main/java/org/opensearch/ml/model/MLModelCacheHelper.java +++ b/plugin/src/main/java/org/opensearch/ml/model/MLModelCacheHelper.java @@ -8,6 +8,7 @@ import static org.opensearch.ml.settings.MLCommonsSettings.ML_COMMONS_MONITORING_REQUEST_COUNT; import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; @@ -41,7 +42,7 @@ public MLModelCacheHelper(ClusterService clusterService, Settings settings) { * @param state model state * @param functionName function name */ - public synchronized void initModelState(String modelId, MLModelState state, FunctionName functionName) { + public synchronized void initModelState(String modelId, MLModelState state, FunctionName functionName, List targetWorkerNodes) { if (isModelRunningOnNode(modelId)) { throw new MLLimitExceededException("Duplicate load model task"); } @@ -49,6 +50,7 @@ public synchronized void initModelState(String modelId, MLModelState state, Func MLModelCache modelCache = new MLModelCache(); modelCache.setModelState(state); modelCache.setFunctionName(functionName); + modelCache.setTargetWorkerNodes(targetWorkerNodes); modelCaches.put(modelId, modelCache); } @@ -254,6 +256,10 @@ public MLModelProfile getModelProfile(String modelId) { if (modelCache.getPredictor() != null) { builder.predictor(modelCache.getPredictor().toString()); } + String[] targetWorkerNodes = modelCache.getTargetWorkerNodes(); + if (targetWorkerNodes.length > 0) { + builder.targetWorkerNodes(targetWorkerNodes); + } String[] workerNodes = modelCache.getWorkerNodes(); if (workerNodes.length > 0) { builder.workerNodes(workerNodes); diff --git a/plugin/src/main/java/org/opensearch/ml/model/MLModelManager.java b/plugin/src/main/java/org/opensearch/ml/model/MLModelManager.java index db963ea10a..5459a3cd4d 100644 --- a/plugin/src/main/java/org/opensearch/ml/model/MLModelManager.java +++ b/plugin/src/main/java/org/opensearch/ml/model/MLModelManager.java @@ -430,7 +430,7 @@ public void loadModel( listener.onFailure(new IllegalArgumentException("Exceed max model per node limit")); return; } - modelCacheHelper.initModelState(modelId, MLModelState.LOADING, functionName); + modelCacheHelper.initModelState(modelId, MLModelState.LOADING, functionName, mlTask.getWorkerNodes()); try (ThreadContext.StoredContext context = client.threadPool().getThreadContext().stashContext()) { checkAndAddRunningTask(mlTask, maxLoadTasksPerNode); this.getModel(modelId, threadedActionListener(LOAD_THREAD_POOL, ActionListener.wrap(mlModel -> { diff --git a/plugin/src/main/java/org/opensearch/ml/profile/MLModelProfile.java b/plugin/src/main/java/org/opensearch/ml/profile/MLModelProfile.java index dfdcfe919f..94a0502b7b 100644 --- a/plugin/src/main/java/org/opensearch/ml/profile/MLModelProfile.java +++ b/plugin/src/main/java/org/opensearch/ml/profile/MLModelProfile.java @@ -24,6 +24,7 @@ public class MLModelProfile implements ToXContentFragment, Writeable { private final MLModelState modelState; private final String predictor; + private final String[] targetWorkerNodes; private final String[] workerNodes; private final MLPredictRequestStats modelInferenceStats; private final MLPredictRequestStats predictRequestStats; @@ -32,12 +33,14 @@ public class MLModelProfile implements ToXContentFragment, Writeable { public MLModelProfile( MLModelState modelState, String predictor, + String[] targetWorkerNodes, String[] workerNodes, MLPredictRequestStats modelInferenceStats, MLPredictRequestStats predictRequestStats ) { this.modelState = modelState; this.predictor = predictor; + this.targetWorkerNodes = targetWorkerNodes; this.workerNodes = workerNodes; this.modelInferenceStats = modelInferenceStats; this.predictRequestStats = predictRequestStats; @@ -52,6 +55,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws if (predictor != null) { builder.field("predictor", predictor); } + if (targetWorkerNodes != null) { + builder.field("worker_nodes", targetWorkerNodes); + } if (workerNodes != null) { builder.field("worker_nodes", workerNodes); } @@ -72,6 +78,7 @@ public MLModelProfile(StreamInput in) throws IOException { this.modelState = null; } this.predictor = in.readOptionalString(); + this.targetWorkerNodes = in.readOptionalStringArray(); this.workerNodes = in.readOptionalStringArray(); if (in.readBoolean()) { this.modelInferenceStats = new MLPredictRequestStats(in); @@ -94,6 +101,7 @@ public void writeTo(StreamOutput out) throws IOException { out.writeBoolean(false); } out.writeOptionalString(predictor); + out.writeOptionalStringArray(targetWorkerNodes); out.writeOptionalStringArray(workerNodes); if (modelInferenceStats != null) { out.writeBoolean(true); diff --git a/plugin/src/main/java/org/opensearch/ml/task/MLPredictTaskRunner.java b/plugin/src/main/java/org/opensearch/ml/task/MLPredictTaskRunner.java index 701286a603..726401ef73 100644 --- a/plugin/src/main/java/org/opensearch/ml/task/MLPredictTaskRunner.java +++ b/plugin/src/main/java/org/opensearch/ml/task/MLPredictTaskRunner.java @@ -60,6 +60,8 @@ import org.opensearch.transport.TransportResponseHandler; import org.opensearch.transport.TransportService; +import com.google.common.collect.ImmutableList; + /** * MLPredictTaskRunner is responsible for running predict tasks. */ @@ -159,7 +161,7 @@ protected void executeTask(MLPredictionTaskRequest request, ActionListener argumentCaptor = ArgumentCaptor.forClass(Exception.class); verify(actionListener).onFailure(argumentCaptor.capture()); @@ -225,7 +227,8 @@ public void testTransportUploadModelActionDoExecuteWithCreateTaskException() { listener.onFailure(new Exception("Failed to create upload model task")); return null; }).when(mlTaskManager).createMLTask(any(), any()); - + when(node1.getId()).thenReturn("NodeId1"); + when(clusterService.localNode()).thenReturn(node1); transportUploadModelAction.doExecute(task, prepareRequest(), actionListener); ArgumentCaptor argumentCaptor = ArgumentCaptor.forClass(Exception.class); verify(actionListener).onFailure(argumentCaptor.capture()); diff --git a/plugin/src/test/java/org/opensearch/ml/model/MLModelCacheHelperTests.java b/plugin/src/test/java/org/opensearch/ml/model/MLModelCacheHelperTests.java index ec2d988077..5aa7b03e40 100644 --- a/plugin/src/test/java/org/opensearch/ml/model/MLModelCacheHelperTests.java +++ b/plugin/src/test/java/org/opensearch/ml/model/MLModelCacheHelperTests.java @@ -12,7 +12,9 @@ import static org.opensearch.ml.settings.MLCommonsSettings.ML_COMMONS_MONITORING_REQUEST_COUNT; import static org.opensearch.ml.utils.TestHelper.clusterSetting; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Set; @@ -47,6 +49,8 @@ public class MLModelCacheHelperTests extends OpenSearchTestCase { private TextEmbeddingModel predictor; private int maxMonitoringRequests; + private List targetWorkerNodes; + @Before public void setup() { MockitoAnnotations.openMocks(this); @@ -61,11 +65,13 @@ public void setup() { modelId = "model_id1"; nodeId = "node_id1"; predictor = spy(new TextEmbeddingModel()); + targetWorkerNodes = new ArrayList<>(); + targetWorkerNodes.add(nodeId); } public void testModelState() { assertFalse(cacheHelper.isModelLoaded(modelId)); - cacheHelper.initModelState(modelId, MLModelState.LOADING, FunctionName.TEXT_EMBEDDING); + cacheHelper.initModelState(modelId, MLModelState.LOADING, FunctionName.TEXT_EMBEDDING, targetWorkerNodes); assertFalse(cacheHelper.isModelLoaded(modelId)); cacheHelper.setModelState(modelId, MLModelState.LOADED); assertTrue(cacheHelper.isModelLoaded(modelId)); @@ -75,8 +81,8 @@ public void testModelState() { public void testModelState_DuplicateError() { expectedEx.expect(MLLimitExceededException.class); expectedEx.expectMessage("Duplicate load model task"); - cacheHelper.initModelState(modelId, MLModelState.LOADING, FunctionName.TEXT_EMBEDDING); - cacheHelper.initModelState(modelId, MLModelState.LOADING, FunctionName.TEXT_EMBEDDING); + cacheHelper.initModelState(modelId, MLModelState.LOADING, FunctionName.TEXT_EMBEDDING, targetWorkerNodes); + cacheHelper.initModelState(modelId, MLModelState.LOADING, FunctionName.TEXT_EMBEDDING, targetWorkerNodes); } public void testPredictor_NotFoundException() { @@ -86,7 +92,7 @@ public void testPredictor_NotFoundException() { } public void testPredictor() { - cacheHelper.initModelState(modelId, MLModelState.LOADING, FunctionName.TEXT_EMBEDDING); + cacheHelper.initModelState(modelId, MLModelState.LOADING, FunctionName.TEXT_EMBEDDING, targetWorkerNodes); assertNull(cacheHelper.getPredictor(modelId)); cacheHelper.setPredictor(modelId, predictor); assertEquals(predictor, cacheHelper.getPredictor(modelId)); @@ -94,7 +100,7 @@ public void testPredictor() { public void testGetAndRemoveModel() { assertFalse(cacheHelper.isModelRunningOnNode(modelId)); - cacheHelper.initModelState(modelId, MLModelState.LOADING, FunctionName.TEXT_EMBEDDING); + cacheHelper.initModelState(modelId, MLModelState.LOADING, FunctionName.TEXT_EMBEDDING, targetWorkerNodes); String[] loadedModels = cacheHelper.getLoadedModels(); assertEquals(0, loadedModels.length); @@ -110,7 +116,7 @@ public void testGetAndRemoveModel() { } public void testRemoveModel_WrongModelId() { - cacheHelper.initModelState(modelId, MLModelState.LOADING, FunctionName.TEXT_EMBEDDING); + cacheHelper.initModelState(modelId, MLModelState.LOADING, FunctionName.TEXT_EMBEDDING, targetWorkerNodes); cacheHelper.removeModel("wrong_model_id"); assertArrayEquals(new String[] { modelId }, cacheHelper.getAllModels()); } @@ -163,7 +169,7 @@ public void testRemoveWorkerNode_ModelState() { } public void testRemoveModel_Loaded() { - cacheHelper.initModelState(modelId, MLModelState.LOADING, FunctionName.TEXT_EMBEDDING); + cacheHelper.initModelState(modelId, MLModelState.LOADING, FunctionName.TEXT_EMBEDDING, targetWorkerNodes); cacheHelper.setModelState(modelId, MLModelState.LOADED); cacheHelper.setPredictor(modelId, predictor); cacheHelper.removeModel(modelId); @@ -179,7 +185,7 @@ public void testClearWorkerNodes_NullModelState() { } public void testClearWorkerNodes_ModelState() { - cacheHelper.initModelState(modelId, MLModelState.LOADED, FunctionName.TEXT_EMBEDDING); + cacheHelper.initModelState(modelId, MLModelState.LOADED, FunctionName.TEXT_EMBEDDING, targetWorkerNodes); cacheHelper.addWorkerNode(modelId, nodeId); cacheHelper.clearWorkerNodes(); assertArrayEquals(new String[] { modelId }, cacheHelper.getAllModels()); @@ -206,7 +212,7 @@ public void testSyncWorkerNodes_NullModelState() { public void testSyncWorkerNodes_ModelState() { String modelId2 = "model_id2"; - cacheHelper.initModelState(modelId2, MLModelState.LOADED, FunctionName.TEXT_EMBEDDING); + cacheHelper.initModelState(modelId2, MLModelState.LOADED, FunctionName.TEXT_EMBEDDING, targetWorkerNodes); cacheHelper.addWorkerNode(modelId, nodeId); cacheHelper.addWorkerNode(modelId2, nodeId); @@ -243,7 +249,7 @@ public void testGetModelProfile_WrongModelId() { } public void testGetModelProfile() { - cacheHelper.initModelState(modelId, MLModelState.LOADING, FunctionName.TEXT_EMBEDDING); + cacheHelper.initModelState(modelId, MLModelState.LOADING, FunctionName.TEXT_EMBEDDING, targetWorkerNodes); cacheHelper.setModelState(modelId, MLModelState.LOADED); cacheHelper.setPredictor(modelId, predictor); cacheHelper.addWorkerNode(modelId, nodeId); @@ -266,7 +272,7 @@ public void testGetModelProfile() { } public void testGetModelProfile_Loading() { - cacheHelper.initModelState(modelId, MLModelState.LOADING, FunctionName.TEXT_EMBEDDING); + cacheHelper.initModelState(modelId, MLModelState.LOADING, FunctionName.TEXT_EMBEDDING, targetWorkerNodes); MLModelProfile modelProfile = cacheHelper.getModelProfile(modelId); assertNotNull(modelProfile); assertEquals(MLModelState.LOADING, modelProfile.getModelState()); diff --git a/plugin/src/test/java/org/opensearch/ml/rest/RestMLProfileActionTests.java b/plugin/src/test/java/org/opensearch/ml/rest/RestMLProfileActionTests.java index dfae77e3b4..f78aa3ba4e 100644 --- a/plugin/src/test/java/org/opensearch/ml/rest/RestMLProfileActionTests.java +++ b/plugin/src/test/java/org/opensearch/ml/rest/RestMLProfileActionTests.java @@ -67,6 +67,8 @@ import org.opensearch.threadpool.TestThreadPool; import org.opensearch.threadpool.ThreadPool; +import com.google.common.collect.ImmutableList; + public class RestMLProfileActionTests extends OpenSearchTestCase { @Rule public ExpectedException thrown = ExpectedException.none(); @@ -115,7 +117,7 @@ public void setup() throws IOException { .inputType(MLInputDataType.DATA_FRAME) .progress(0.4f) .outputIndex("test_index") - .workerNode("test_node") + .workerNodes(ImmutableList.of("test_node")) .createTime(Instant.ofEpochMilli(123)) .lastUpdateTime(Instant.ofEpochMilli(123)) .error("error") From f5288110220f6112fa664df40bd40ccd91209db0 Mon Sep 17 00:00:00 2001 From: Yaliang Wu Date: Fri, 30 Dec 2022 17:19:00 -0800 Subject: [PATCH 2/3] fix target worker node field name Signed-off-by: Yaliang Wu --- .../src/main/java/org/opensearch/ml/profile/MLModelProfile.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugin/src/main/java/org/opensearch/ml/profile/MLModelProfile.java b/plugin/src/main/java/org/opensearch/ml/profile/MLModelProfile.java index 94a0502b7b..2b8b10644c 100644 --- a/plugin/src/main/java/org/opensearch/ml/profile/MLModelProfile.java +++ b/plugin/src/main/java/org/opensearch/ml/profile/MLModelProfile.java @@ -56,7 +56,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws builder.field("predictor", predictor); } if (targetWorkerNodes != null) { - builder.field("worker_nodes", targetWorkerNodes); + builder.field("target_worker_nodes", targetWorkerNodes); } if (workerNodes != null) { builder.field("worker_nodes", workerNodes); From deb1c226717f3d09a8c7fea17dd0aaf32ec6b6e6 Mon Sep 17 00:00:00 2001 From: Yaliang Wu Date: Fri, 30 Dec 2022 17:43:58 -0800 Subject: [PATCH 3/3] support work nodes string in old tasks Signed-off-by: Yaliang Wu --- .../main/java/org/opensearch/ml/common/MLTask.java | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/common/src/main/java/org/opensearch/ml/common/MLTask.java b/common/src/main/java/org/opensearch/ml/common/MLTask.java index be5b7269de..35be815980 100644 --- a/common/src/main/java/org/opensearch/ml/common/MLTask.java +++ b/common/src/main/java/org/opensearch/ml/common/MLTask.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.time.Instant; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import static org.opensearch.common.xcontent.XContentParserUtils.ensureExpectedToken; @@ -247,10 +248,14 @@ public static MLTask parse(XContentParser parser) throws IOException { outputIndex = parser.text(); break; case WORKER_NODE_FIELD: - workerNodes = new ArrayList<>(); - ensureExpectedToken(XContentParser.Token.START_ARRAY, parser.currentToken(), parser); - while (parser.nextToken() != XContentParser.Token.END_ARRAY) { - workerNodes.add(parser.text()); + if (XContentParser.Token.START_ARRAY == parser.currentToken()) { + workerNodes = new ArrayList<>(); + while (parser.nextToken() != XContentParser.Token.END_ARRAY) { + workerNodes.add(parser.text()); + } + } else { + String[] nodes = parser.text().split(","); + workerNodes = Arrays.asList(nodes); } break; case CREATE_TIME_FIELD: