use protostuff to serialize/deserialize RCF model

Signed-off-by: Yaliang Wu <[email protected]>
opensearch-project · Mar 25, 2022 · e4e4e9e · e4e4e9e
1 parent 36d8db6
commit e4e4e9e
Show file tree

Hide file tree

Showing 15 changed files with 322 additions and 59 deletions.
diff --git a/ml-algorithms/build.gradle b/ml-algorithms/build.gradle
@@ -23,6 +23,10 @@ dependencies {
     implementation group: 'commons-io', name: 'commons-io', version: '2.11.0'
     implementation files('lib/randomcutforest-parkservices-2.0.1.jar')
     implementation files('lib/randomcutforest-core-2.0.1.jar')
+    implementation group: 'io.protostuff', name: 'protostuff-core', version: '1.8.0'
+    implementation group: 'io.protostuff', name: 'protostuff-runtime', version: '1.8.0'
+    implementation group: 'io.protostuff', name: 'protostuff-api', version: '1.8.0'
+    implementation group: 'io.protostuff', name: 'protostuff-collectionschema', version: '1.8.0'
     testImplementation group: 'junit', name: 'junit', version: '4.12'
     testImplementation group: 'org.mockito', name: 'mockito-core', version: '4.4.0'
     testImplementation group: 'org.mockito', name: 'mockito-inline', version: '4.4.0'

diff --git a/ml-algorithms/lib/randomcutforest-core-2.0.1.jar b/ml-algorithms/lib/randomcutforest-core-2.0.1.jar
diff --git a/ml-algorithms/lib/randomcutforest-parkservices-2.0.1.jar b/ml-algorithms/lib/randomcutforest-parkservices-2.0.1.jar
diff --git a/...lgorithms/src/main/java/org/opensearch/ml/engine/algorithms/rcf/BatchRandomCutForest.java b/...lgorithms/src/main/java/org/opensearch/ml/engine/algorithms/rcf/BatchRandomCutForest.java
@@ -9,20 +9,19 @@
 import com.amazon.randomcutforest.state.RandomCutForestMapper;
 import com.amazon.randomcutforest.state.RandomCutForestState;
 import lombok.extern.log4j.Log4j2;
+import org.opensearch.ml.common.FunctionName;
+import org.opensearch.ml.common.Model;
 import org.opensearch.ml.common.dataframe.ColumnMeta;
 import org.opensearch.ml.common.dataframe.ColumnValue;
 import org.opensearch.ml.common.dataframe.DataFrame;
 import org.opensearch.ml.common.dataframe.DataFrameBuilder;
 import org.opensearch.ml.common.dataframe.Row;
-import org.opensearch.ml.common.input.parameter.rcf.BatchRCFParams;
-import org.opensearch.ml.common.FunctionName;
 import org.opensearch.ml.common.input.parameter.MLAlgoParams;
+import org.opensearch.ml.common.input.parameter.rcf.BatchRCFParams;
 import org.opensearch.ml.common.output.MLOutput;
 import org.opensearch.ml.common.output.MLPredictionOutput;
-import org.opensearch.ml.common.Model;
 import org.opensearch.ml.engine.TrainAndPredictable;
 import org.opensearch.ml.engine.annotation.Function;
-import org.opensearch.ml.engine.utils.ModelSerDeSer;
 
 import java.util.ArrayList;
 import java.util.HashMap;
@@ -68,7 +67,7 @@ public MLOutput predict(DataFrame dataFrame, Model model) {
         if (model == null) {
             throw new IllegalArgumentException("No model found for batch RCF prediction.");
         }
-        RandomCutForestState state = (RandomCutForestState) ModelSerDeSer.deserialize(model.getContent());
+        RandomCutForestState state = RCFModelSerDeSer.deserializeRCF(model.getContent());
         RandomCutForest forest = rcfMapper.toModel(state);
         List<Map<String, Object>> predictResult = process(dataFrame, forest, 0);
         return MLPredictionOutput.builder().predictionResult(DataFrameBuilder.load(predictResult)).build();
@@ -83,7 +82,7 @@ public Model train(DataFrame dataFrame) {
         model.setName(FunctionName.BATCH_RCF.name());
         model.setVersion(1);
         RandomCutForestState state = rcfMapper.toState(forest);
-        model.setContent(ModelSerDeSer.serialize(state));
+        model.setContent(RCFModelSerDeSer.serializeRCF(state));
         return model;
     }
 

diff --git a/...hms/src/main/java/org/opensearch/ml/engine/algorithms/rcf/FixedInTimeRandomCutForest.java b/...hms/src/main/java/org/opensearch/ml/engine/algorithms/rcf/FixedInTimeRandomCutForest.java
@@ -12,22 +12,21 @@
 import com.amazon.randomcutforest.parkservices.state.ThresholdedRandomCutForestMapper;
 import com.amazon.randomcutforest.parkservices.state.ThresholdedRandomCutForestState;
 import lombok.extern.log4j.Log4j2;
+import org.opensearch.ml.common.FunctionName;
+import org.opensearch.ml.common.Model;
 import org.opensearch.ml.common.dataframe.ColumnMeta;
 import org.opensearch.ml.common.dataframe.ColumnType;
 import org.opensearch.ml.common.dataframe.ColumnValue;
 import org.opensearch.ml.common.dataframe.DataFrame;
 import org.opensearch.ml.common.dataframe.DataFrameBuilder;
 import org.opensearch.ml.common.dataframe.Row;
 import org.opensearch.ml.common.exception.MLValidationException;
-import org.opensearch.ml.common.FunctionName;
 import org.opensearch.ml.common.input.parameter.MLAlgoParams;
+import org.opensearch.ml.common.input.parameter.rcf.FitRCFParams;
 import org.opensearch.ml.common.output.MLOutput;
 import org.opensearch.ml.common.output.MLPredictionOutput;
-import org.opensearch.ml.common.Model;
-import org.opensearch.ml.common.input.parameter.rcf.FitRCFParams;
 import org.opensearch.ml.engine.TrainAndPredictable;
 import org.opensearch.ml.engine.annotation.Function;
-import org.opensearch.ml.engine.utils.ModelSerDeSer;
 
 import java.text.DateFormat;
 import java.text.ParseException;
@@ -99,7 +98,7 @@ public MLOutput predict(DataFrame dataFrame, Model model) {
         if (model == null) {
             throw new IllegalArgumentException("No model found for FIT RCF prediction.");
         }
-        ThresholdedRandomCutForestState state = (ThresholdedRandomCutForestState) ModelSerDeSer.deserialize(model.getContent());
+        ThresholdedRandomCutForestState state = RCFModelSerDeSer.deserializeTRCF(model.getContent());
         ThresholdedRandomCutForest forest = trcfMapper.toModel(state);
         List<Map<String, Object>> predictResult = process(dataFrame, forest);
         return MLPredictionOutput.builder().predictionResult(DataFrameBuilder.load(predictResult)).build();
@@ -113,7 +112,7 @@ public Model train(DataFrame dataFrame) {
         model.setName(FunctionName.FIT_RCF.name());
         model.setVersion(1);
         ThresholdedRandomCutForestState state = trcfMapper.toState(forest);
-         model.setContent(ModelSerDeSer.serialize(state));
+        model.setContent(RCFModelSerDeSer.serializeTRCF(state));
         return model;
     }
 

diff --git a/ml-algorithms/src/main/java/org/opensearch/ml/engine/algorithms/rcf/RCFModelSerDeSer.java b/ml-algorithms/src/main/java/org/opensearch/ml/engine/algorithms/rcf/RCFModelSerDeSer.java
@@ -0,0 +1,59 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.ml.engine.algorithms.rcf;
+
+import com.amazon.randomcutforest.parkservices.state.ThresholdedRandomCutForestState;
+import com.amazon.randomcutforest.state.RandomCutForestState;
+import io.protostuff.LinkedBuffer;
+import io.protostuff.ProtostuffIOUtil;
+import io.protostuff.Schema;
+import io.protostuff.runtime.RuntimeSchema;
+import lombok.experimental.UtilityClass;
+
+import java.security.AccessController;
+import java.security.PrivilegedAction;
+
+/**
+ * Protostuff-based (de)serialization helpers for RCF model state objects.
+ *
+ * <p>Supports {@link RandomCutForestState} and {@link ThresholdedRandomCutForestState}. The
+ * runtime schema for each type is resolved once, in a static initializer, and both the schema
+ * lookup and every protostuff read/write are executed through
+ * {@link AccessController#doPrivileged(PrivilegedAction)}.
+ * NOTE(review): presumably the privileged blocks are needed for protostuff's reflective access
+ * under the plugin security policy -- confirm.
+ */
+@UtilityClass
+public class RCFModelSerDeSer {
+    /** Size, in bytes, requested from {@link LinkedBuffer#allocate(int)} on every serialize call. */
+    private static final int SERIALIZATION_BUFFER_BYTES = 512;
+
+    /** Runtime schema used for plain random cut forest state. */
+    private static final Schema<RandomCutForestState> RCF_SCHEMA =
+            privilegedSchema(RandomCutForestState.class);
+
+    /** Runtime schema used for thresholded random cut forest state. */
+    private static final Schema<ThresholdedRandomCutForestState> TRCF_SCHEMA =
+            privilegedSchema(ThresholdedRandomCutForestState.class);
+
+    /**
+     * Encodes a random cut forest state with its protostuff schema.
+     *
+     * @param model state to encode
+     * @return protostuff-encoded bytes
+     */
+    public static byte[] serializeRCF(RandomCutForestState model) {
+        return serialize(model, RCF_SCHEMA);
+    }
+
+    /**
+     * Encodes a thresholded random cut forest state with its protostuff schema.
+     *
+     * @param model state to encode
+     * @return protostuff-encoded bytes
+     */
+    public static byte[] serializeTRCF(ThresholdedRandomCutForestState model) {
+        return serialize(model, TRCF_SCHEMA);
+    }
+
+    /**
+     * Decodes bytes into a new random cut forest state.
+     *
+     * @param bytes protostuff-encoded state
+     * @return the populated state object
+     */
+    public static RandomCutForestState deserializeRCF(byte[] bytes) {
+        return deserialize(bytes, RCF_SCHEMA);
+    }
+
+    /**
+     * Decodes bytes into a new thresholded random cut forest state.
+     *
+     * @param bytes protostuff-encoded state
+     * @return the populated state object
+     */
+    public static ThresholdedRandomCutForestState deserializeTRCF(byte[] bytes) {
+        return deserialize(bytes, TRCF_SCHEMA);
+    }
+
+    /** Looks up the protostuff runtime schema for {@code type} inside a privileged block. */
+    private static <T> Schema<T> privilegedSchema(Class<T> type) {
+        PrivilegedAction<Schema<T>> lookup = () -> RuntimeSchema.getSchema(type);
+        return AccessController.doPrivileged(lookup);
+    }
+
+    /** Writes {@code model} to a byte array using {@code schema} and a freshly allocated buffer. */
+    private static <T> byte[] serialize(T model, Schema<T> schema) {
+        // The buffer is allocated per call, outside the privileged block, and never shared.
+        final LinkedBuffer scratch = LinkedBuffer.allocate(SERIALIZATION_BUFFER_BYTES);
+        PrivilegedAction<byte[]> encode = () -> ProtostuffIOUtil.toByteArray(model, schema, scratch);
+        return AccessController.doPrivileged(encode);
+    }
+
+    /** Creates an empty message from {@code schema} and merges {@code bytes} into it. */
+    private static <T> T deserialize(byte[] bytes, Schema<T> schema) {
+        // The empty message is created before entering the privileged block; only the merge runs inside it.
+        final T target = schema.newMessage();
+        PrivilegedAction<T> populate = () -> {
+            ProtostuffIOUtil.mergeFrom(bytes, target, schema);
+            return target;
+        };
+        return AccessController.doPrivileged(populate);
+    }
+}
diff --git a/ml-algorithms/src/main/java/org/opensearch/ml/engine/utils/ModelSerDeSer.java b/ml-algorithms/src/main/java/org/opensearch/ml/engine/utils/ModelSerDeSer.java
@@ -6,12 +6,12 @@
 package org.opensearch.ml.engine.utils;
 
 import lombok.experimental.UtilityClass;
+import org.apache.commons.io.serialization.ValidatingObjectInputStream;
 import org.opensearch.ml.engine.exceptions.ModelSerDeSerException;
 
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
-import java.io.ObjectInputStream;
 import java.io.ObjectOutputStream;
 
 @UtilityClass
@@ -41,8 +41,11 @@ public static byte[] serialize(Object model) {
     }
 
     public static Object deserialize(byte[] modelBin) {
-        try (ObjectInputStream objectInputStream = new ObjectInputStream(new ByteArrayInputStream(modelBin))) {
-            return objectInputStream.readObject();
+        try (ByteArrayInputStream inputStream = new ByteArrayInputStream(modelBin);
+             ValidatingObjectInputStream validatingObjectInputStream = new ValidatingObjectInputStream(inputStream)){
+            // Validate the model class type to avoid deserialization attack.
+            validatingObjectInputStream.accept(ACCEPT_CLASS_PATTERNS);
+            return validatingObjectInputStream.readObject();
         } catch (IOException | ClassNotFoundException e) {
             throw new ModelSerDeSerException("Failed to deserialize model.", e.getCause());
         }

diff --git a/ml-algorithms/src/test/java/org/opensearch/ml/engine/MLEngineTest.java b/ml-algorithms/src/test/java/org/opensearch/ml/engine/MLEngineTest.java
@@ -30,9 +30,9 @@
 import java.io.IOException;
 import java.util.Arrays;
 
-import static org.opensearch.ml.engine.helper.KMeansHelper.constructKMeansDataFrame;
 import static org.opensearch.ml.engine.helper.LinearRegressionHelper.constructLinearRegressionPredictionDataFrame;
 import static org.opensearch.ml.engine.helper.LinearRegressionHelper.constructLinearRegressionTrainDataFrame;
+import static org.opensearch.ml.engine.helper.MLTestHelper.constructTestDataFrame;
 
 public class MLEngineTest {
     @Rule
@@ -41,7 +41,7 @@ public class MLEngineTest {
     @Test
     public void predictKMeans() {
         Model model = trainKMeansModel();
-        DataFrame predictionDataFrame = constructKMeansDataFrame(10);
+        DataFrame predictionDataFrame = constructTestDataFrame(10);
         MLInputDataset inputDataset = DataFrameInputDataset.builder().dataFrame(predictionDataFrame).build();
         Input mlInput = MLInput.builder().algorithm(FunctionName.KMEANS).inputDataset(inputDataset).build();
         MLPredictionOutput output = (MLPredictionOutput)MLEngine.predict(mlInput, model);
@@ -106,7 +106,7 @@ public void train_EmptyDataFrame() {
         FunctionName algoName = FunctionName.LINEAR_REGRESSION;
         try (MockedStatic<MLEngineClassLoader> loader = Mockito.mockStatic(MLEngineClassLoader.class)) {
             loader.when(() -> MLEngineClassLoader.initInstance(algoName, null, MLAlgoParams.class)).thenReturn(null);
-            MLInputDataset inputDataset = DataFrameInputDataset.builder().dataFrame(constructKMeansDataFrame(0)).build();
+            MLInputDataset inputDataset = DataFrameInputDataset.builder().dataFrame(constructTestDataFrame(0)).build();
             MLEngine.train(MLInput.builder().algorithm(algoName).inputDataset(inputDataset).build());
         }
     }
@@ -118,7 +118,7 @@ public void train_UnsupportedAlgorithm() {
         FunctionName algoName = FunctionName.LINEAR_REGRESSION;
         try (MockedStatic<MLEngineClassLoader> loader = Mockito.mockStatic(MLEngineClassLoader.class)) {
             loader.when(() -> MLEngineClassLoader.initInstance(algoName, null, MLAlgoParams.class)).thenReturn(null);
-            MLInputDataset inputDataset = DataFrameInputDataset.builder().dataFrame(constructKMeansDataFrame(10)).build();
+            MLInputDataset inputDataset = DataFrameInputDataset.builder().dataFrame(constructTestDataFrame(10)).build();
             MLEngine.train(MLInput.builder().algorithm(algoName).inputDataset(inputDataset).build());
         }
     }
@@ -134,7 +134,7 @@ public void predictNullInput() {
     public void predictWithoutAlgoName() {
         exceptionRule.expect(IllegalArgumentException.class);
         exceptionRule.expectMessage("algorithm can't be null");
-        MLInputDataset inputDataset = DataFrameInputDataset.builder().dataFrame(constructKMeansDataFrame(10)).build();
+        MLInputDataset inputDataset = DataFrameInputDataset.builder().dataFrame(constructTestDataFrame(10)).build();
         Input mlInput = MLInput.builder().inputDataset(inputDataset).build();
         MLEngine.predict(mlInput, null);
     }
@@ -165,7 +165,7 @@ public void predictUnsupportedAlgorithm() {
     public void trainAndPredictWithKmeans() {
         int dataSize = 100;
         MLAlgoParams parameters = KMeansParams.builder().build();
-        DataFrame dataFrame = constructKMeansDataFrame(dataSize);
+        DataFrame dataFrame = constructTestDataFrame(dataSize);
         MLInputDataset inputData = new DataFrameInputDataset(dataFrame);
         Input input = new MLInput(FunctionName.KMEANS, parameters, inputData);
         MLPredictionOutput output = (MLPredictionOutput) MLEngine.trainAndPredict(input);
@@ -216,7 +216,7 @@ private Model trainKMeansModel() {
                 .iterations(10)
                 .distanceType(KMeansParams.DistanceType.EUCLIDEAN)
                 .build();
-        DataFrame trainDataFrame = constructKMeansDataFrame(100);
+        DataFrame trainDataFrame = constructTestDataFrame(100);
         MLInputDataset inputDataset = DataFrameInputDataset.builder().dataFrame(trainDataFrame).build();
         Input mlInput = MLInput.builder().algorithm(FunctionName.KMEANS).parameters(parameters).inputDataset(inputDataset).build();
         return MLEngine.train(mlInput);

diff --git a/ml-algorithms/src/test/java/org/opensearch/ml/engine/ModelSerDeSerTest.java b/ml-algorithms/src/test/java/org/opensearch/ml/engine/ModelSerDeSerTest.java
@@ -8,41 +8,40 @@
 import org.junit.Rule;
 import org.junit.Test;
 import org.junit.rules.ExpectedException;
-import org.opensearch.ml.common.input.parameter.clustering.KMeansParams;
 import org.opensearch.ml.common.Model;
+import org.opensearch.ml.common.input.parameter.clustering.KMeansParams;
+import org.opensearch.ml.common.input.parameter.regression.LinearRegressionParams;
 import org.opensearch.ml.engine.algorithms.clustering.KMeans;
-import org.opensearch.ml.engine.exceptions.ModelSerDeSerException;
+import org.opensearch.ml.engine.algorithms.regression.LinearRegression;
 import org.opensearch.ml.engine.utils.ModelSerDeSer;
 import org.tribuo.clustering.kmeans.KMeansModel;
+import org.tribuo.regression.sgd.linear.LinearSGDModel;
 
-import java.util.Arrays;
-
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-import static org.opensearch.ml.engine.helper.KMeansHelper.constructKMeansDataFrame;
+import static org.junit.Assert.assertNotNull;
+import static org.opensearch.ml.engine.helper.MLTestHelper.constructTestDataFrame;
 
 public class ModelSerDeSerTest {
     @Rule
     public ExpectedException thrown = ExpectedException.none();
 
-    private final Object dummyModel = new Object();
-
-    @Test
-    public void testModelSerDeSerBlocklModel() {
-        thrown.expect(ModelSerDeSerException.class);
-        byte[] modelBin = ModelSerDeSer.serialize(dummyModel);
-        Object model = ModelSerDeSer.deserialize(modelBin);
-        assertTrue(model.equals(dummyModel));
-    }
-
     @Test
     public void testModelSerDeSerKMeans() {
         KMeansParams params = KMeansParams.builder().build();
         KMeans kMeans = new KMeans(params);
-        Model model = kMeans.train(constructKMeansDataFrame(100));
+        Model model = kMeans.train(constructTestDataFrame(100));
+
+        KMeansModel deserializedModel = (KMeansModel) ModelSerDeSer.deserialize(model.getContent());
+        assertNotNull(deserializedModel);
+    }
 
-        KMeansModel kMeansModel = (KMeansModel) ModelSerDeSer.deserialize(model.getContent());
-        byte[] serializedModel = ModelSerDeSer.serialize(kMeansModel);
-        assertFalse(Arrays.equals(serializedModel, model.getContent()));
+    @Test
+    public void testModelSerDeSerLinearRegression() {
+        LinearRegressionParams params = LinearRegressionParams.builder().target("f2").build();
+        LinearRegression linearRegression = new LinearRegression(params);
+        Model model = linearRegression.train(constructTestDataFrame(100));
+
+        LinearSGDModel deserializedModel = (LinearSGDModel) ModelSerDeSer.deserialize(model.getContent());
+        assertNotNull(deserializedModel);
     }
-}
+
+}
diff --git a/ml-algorithms/src/test/java/org/opensearch/ml/engine/algorithms/clustering/KMeansTest.java b/ml-algorithms/src/test/java/org/opensearch/ml/engine/algorithms/clustering/KMeansTest.java
@@ -16,7 +16,7 @@
 import org.opensearch.ml.common.output.MLPredictionOutput;
 import org.opensearch.ml.common.Model;
 
-import static org.opensearch.ml.engine.helper.KMeansHelper.constructKMeansDataFrame;
+import static org.opensearch.ml.engine.helper.MLTestHelper.constructTestDataFrame;
 
 
 public class KMeansTest {
@@ -107,11 +107,11 @@ public void constructorWithNegtiveIterations() {
     }
 
     private void constructKMeansPredictionDataFrame() {
-        predictionDataFrame = constructKMeansDataFrame(predictionSize);
+        predictionDataFrame = constructTestDataFrame(predictionSize);
     }
 
     private void constructKMeansTrainDataFrame() {
-        trainDataFrame = constructKMeansDataFrame(trainSize);
+        trainDataFrame = constructTestDataFrame(trainSize);
     }
 
 }
diff --git a/...lgorithms/src/test/java/org/opensearch/ml/engine/algorithms/rcf/RCFModelSerDeSerTest.java b/...lgorithms/src/test/java/org/opensearch/ml/engine/algorithms/rcf/RCFModelSerDeSerTest.java
@@ -0,0 +1,61 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.ml.engine.algorithms.rcf;
+
+import com.amazon.randomcutforest.RandomCutForest;
+import com.amazon.randomcutforest.parkservices.ThresholdedRandomCutForest;
+import com.amazon.randomcutforest.parkservices.state.ThresholdedRandomCutForestMapper;
+import com.amazon.randomcutforest.parkservices.state.ThresholdedRandomCutForestState;
+import com.amazon.randomcutforest.state.RandomCutForestMapper;
+import com.amazon.randomcutforest.state.RandomCutForestState;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+import org.opensearch.ml.common.Model;
+import org.opensearch.ml.common.input.parameter.rcf.BatchRCFParams;
+import org.opensearch.ml.common.input.parameter.rcf.FitRCFParams;
+
+import java.util.Arrays;
+
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.opensearch.ml.engine.helper.MLTestHelper.TIME_FIELD;
+import static org.opensearch.ml.engine.helper.MLTestHelper.constructTestDataFrame;
+
+/**
+ * Round-trip tests for {@link RCFModelSerDeSer}: a trained model's bytes are decoded into a state
+ * object, mapped back to a forest, and re-encoded; the re-encoded bytes must match the originals.
+ */
+public class RCFModelSerDeSerTest {
+    @Rule
+    public ExpectedException thrown = ExpectedException.none();
+
+    private final RandomCutForestMapper rcfMapper = new RandomCutForestMapper();
+    private final ThresholdedRandomCutForestMapper trcfMapper = new ThresholdedRandomCutForestMapper();
+
+    /** Batch RCF: train on 500 rows, then decode and re-encode the model content. */
+    @Test
+    public void testModelSerDeSerBatchRCF() {
+        BatchRandomCutForest algorithm = new BatchRandomCutForest(BatchRCFParams.builder().build());
+        Model trained = algorithm.train(constructTestDataFrame(500));
+
+        RandomCutForestState restoredState = RCFModelSerDeSer.deserializeRCF(trained.getContent());
+        RandomCutForest rebuiltForest = rcfMapper.toModel(restoredState);
+        assertNotNull(rebuiltForest);
+
+        // Encoding the decoded state again must reproduce the trained model's bytes exactly.
+        byte[] roundTripBytes = RCFModelSerDeSer.serializeRCF(restoredState);
+        assertTrue(Arrays.equals(roundTripBytes, trained.getContent()));
+    }
+
+    /** Fixed-in-time RCF: same round trip, using a data frame that includes the time field. */
+    @Test
+    public void testModelSerDeSerFitRCF() {
+        FixedInTimeRandomCutForest algorithm =
+                new FixedInTimeRandomCutForest(FitRCFParams.builder().timeField(TIME_FIELD).build());
+        Model trained = algorithm.train(constructTestDataFrame(500, true));
+
+        ThresholdedRandomCutForestState restoredState = RCFModelSerDeSer.deserializeTRCF(trained.getContent());
+        ThresholdedRandomCutForest rebuiltForest = trcfMapper.toModel(restoredState);
+        assertNotNull(rebuiltForest);
+
+        // Encoding the decoded state again must reproduce the trained model's bytes exactly.
+        byte[] roundTripBytes = RCFModelSerDeSer.serializeTRCF(restoredState);
+        assertTrue(Arrays.equals(roundTripBytes, trained.getContent()));
+    }
+
+}