oracle · Craigacp · Oct 1, 2021 · Aug 20, 2021 · Aug 20, 2021 · Aug 23, 2021
diff --git a/Common/LibLinear/src/main/java/org/tribuo/common/liblinear/LibLinearModel.java b/Common/LibLinear/src/main/java/org/tribuo/common/liblinear/LibLinearModel.java
@@ -59,8 +59,11 @@ public abstract class LibLinearModel<T extends Output<T>> extends Model<T> {
 
     /**
      * The list of LibLinear models. Multiple models are used by multi-label and multidimensional regression outputs.
+     * <p>
+     * Not final to support deserialization reordering of multidimensional regression models which have an incorrect id mapping.
+     * Will be final again in some future version which doesn't maintain serialization compatibility with 4.X.
      */
-    protected final List<de.bwaldvogel.liblinear.Model> models;
+    protected List<de.bwaldvogel.liblinear.Model> models;
 
     /**
      * Constructs a LibLinear model from the supplied arguments.

diff --git a/Common/LibLinear/src/main/java/org/tribuo/common/liblinear/LibLinearTrainer.java b/Common/LibLinear/src/main/java/org/tribuo/common/liblinear/LibLinearTrainer.java
@@ -129,7 +129,7 @@ public LibLinearModel<T> train(Dataset<T> examples) {
     }
 
     @Override
-    public LibLinearModel<T> train(Dataset<T> examples, Map<String, Provenance> runProvenance) {
+    public synchronized LibLinearModel<T> train(Dataset<T> examples, Map<String, Provenance> runProvenance) {
         if (examples.getOutputInfo().getUnknownCount() > 0) {
             throw new IllegalArgumentException("The supplied Dataset contained unknown Outputs, and this Trainer is supervised.");
         }

diff --git a/Common/LibSVM/src/main/java/org/tribuo/common/libsvm/LibSVMModel.java b/Common/LibSVM/src/main/java/org/tribuo/common/libsvm/LibSVMModel.java
@@ -66,8 +66,11 @@ public abstract class LibSVMModel<T extends Output<T>> extends Model<T> implemen
 
     /**
      * The LibSVM models. Multiple models are used for multi-label or multidimensional regression outputs.
+     * <p>
+     * Not final to support deserialization reordering of multidimensional regression models which have an incorrect id mapping.
+     * Will be final again in some future version which doesn't maintain serialization compatibility with 4.X.
      */
-    protected final List<svm_model> models;
+    protected List<svm_model> models;
 
     /**
      * Constructs a LibSVMModel from the supplied arguments.

diff --git a/Common/Trees/src/main/java/org/tribuo/common/tree/AbstractCARTTrainer.java b/Common/Trees/src/main/java/org/tribuo/common/tree/AbstractCARTTrainer.java
@@ -30,9 +30,9 @@
 import org.tribuo.util.Util;
 
 import java.time.OffsetDateTime;
+import java.util.ArrayDeque;
 import java.util.Collections;
 import java.util.Deque;
-import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.SplittableRandom;
@@ -198,7 +198,7 @@ public TreeModel<T> train(Dataset<T> examples, Map<String, Provenance> runProven
                 minChildWeight, scaledMinImpurityDecrease);
 
         AbstractTrainingNode<T> root = mkTrainingNode(examples, leafDeterminer);
-        Deque<AbstractTrainingNode<T>> queue = new LinkedList<>();
+        Deque<AbstractTrainingNode<T>> queue = new ArrayDeque<>();
         queue.add(root);
 
         while (!queue.isEmpty()) {

diff --git a/Common/XGBoost/src/main/java/org/tribuo/common/xgboost/XGBoostModel.java b/Common/XGBoost/src/main/java/org/tribuo/common/xgboost/XGBoostModel.java
@@ -31,11 +31,13 @@
 import org.tribuo.Prediction;
 import org.tribuo.common.xgboost.XGBoostTrainer.DMatrixTuple;
 import org.tribuo.provenance.ModelProvenance;
+import org.tribuo.provenance.TrainerProvenance;
 
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.ObjectInputStream;
 import java.io.ObjectOutputStream;
+import java.lang.reflect.InvocationTargetException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
@@ -82,6 +84,9 @@ public final class XGBoostModel<T extends Output<T>> extends Model<T> {
 
     private final XGBoostOutputConverter<T> converter;
 
+    // Used to signal if the model has been rewritten to fix the issue with multidimensional XGBoost regression models in 4.0 and 4.1.0.
+    private boolean regression41MappingFix;
+
     /**
      * The XGBoost4J Boosters.
      */
@@ -93,6 +98,7 @@ public final class XGBoostModel<T extends Output<T>> extends Model<T> {
         super(name,description,featureIDMap,labelIDMap,converter.generatesProbabilities());
         this.converter = converter;
         this.models = models;
+        this.regression41MappingFix = true;
     }
 
     /**
@@ -198,39 +204,43 @@ public List<XGBoostFeatureImportance> getFeatureImportance() {
     public Map<String, List<Pair<String,Double>>> getTopFeatures(int n) {
         try {
             int maxFeatures = n < 0 ? featureIDMap.size() : n;
-            // Aggregate feature scores across all the models.
-            // This throws away model specific information which is useful in the case of regression,
-            // but it's very tricky to get the dimension name associated with the model.
-            Map<String, MutableDouble> outputMap = new HashMap<>();
-            for (Booster model : models) {
+            Map<String, List<Pair<String,Double>>> map = new HashMap<>();
+            for (int i = 0; i < models.size(); i++) {
+                Booster model = models.get(i);
+                Map<String, MutableDouble> outputMap = new HashMap<>();
                 Map<String, Integer> xgboostMap = model.getFeatureScore("");
                 for (Map.Entry<String,Integer> f : xgboostMap.entrySet()) {
                     int id = Integer.parseInt(f.getKey().substring(1));
                     String name = featureIDMap.get(id).getName();
                     MutableDouble curVal = outputMap.computeIfAbsent(name,(k)->new MutableDouble());
                     curVal.increment(f.getValue());
                 }
-            }
-            Comparator<Pair<String, Double>> comparator = Comparator.comparingDouble(p -> Math.abs(p.getB()));
-            PriorityQueue<Pair<String,Double>> q = new PriorityQueue<>(maxFeatures,comparator);
-            for (Map.Entry<String,MutableDouble> e : outputMap.entrySet()) {
-                Pair<String,Double> cur = new Pair<>(e.getKey(), e.getValue().doubleValue());
 
-                if (q.size() < maxFeatures) {
-                    q.offer(cur);
-                } else if (comparator.compare(cur,q.peek()) > 0) {
-                    q.poll();
-                    q.offer(cur);
+                Comparator<Pair<String, Double>> comparator = Comparator.comparingDouble(p -> Math.abs(p.getB()));
+                PriorityQueue<Pair<String,Double>> q = new PriorityQueue<>(maxFeatures,comparator);
+                for (Map.Entry<String,MutableDouble> e : outputMap.entrySet()) {
+                    Pair<String,Double> cur = new Pair<>(e.getKey(), e.getValue().doubleValue());
+
+                    if (q.size() < maxFeatures) {
+                        q.offer(cur);
+                    } else if (comparator.compare(cur,q.peek()) > 0) {
+                        q.poll();
+                        q.offer(cur);
+                    }
                 }
-            }
-            List<Pair<String,Double>> list = new ArrayList<>();
-            while(q.size() > 0) {
-                list.add(q.poll());
-            }
-            Collections.reverse(list);
+                List<Pair<String,Double>> list = new ArrayList<>();
+                while(q.size() > 0) {
+                    list.add(q.poll());
+                }
+                Collections.reverse(list);
 
-            Map<String, List<Pair<String,Double>>> map = new HashMap<>();
-            map.put(Model.ALL_OUTPUTS,list);
+                if (models.size() == 1) {
+                    map.put(Model.ALL_OUTPUTS, list);
+                } else {
+                    String dimensionName = outputIDInfo.getOutput(i).toString();
+                    map.put(dimensionName, list);
+                }
+            }
 
             return map;
         } catch (XGBoostError e) {
@@ -299,12 +309,34 @@ private void writeObject(ObjectOutputStream out) throws IOException {
     private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
         in.defaultReadObject();
         try {
-            models = new ArrayList<>();
+            this.models = new ArrayList<>();
             int numModels = in.readInt();
             for (int i = 0; i < numModels; i++) {
                 // Now read in the byte array and rebuild each Booster
                 byte[] serialisedBooster = (byte[]) in.readObject();
-                models.add(XGBoost.loadModel(serialisedBooster));
+                this.models.add(XGBoost.loadModel(serialisedBooster));
+            }
+            try {
+                Class<?> regressionClass = Class.forName("org.tribuo.regression.ImmutableRegressionInfo");
+                String tribuoVersion = (String) provenance.getTrainerProvenance().getInstanceValues().get(TrainerProvenance.TRIBUO_VERSION_STRING).getValue();
+                if (regressionClass.isInstance(outputIDInfo) && !regression41MappingFix &&
+                        (tribuoVersion.startsWith("4.0.0") || tribuoVersion.startsWith("4.0.1") || tribuoVersion.startsWith("4.0.2") || tribuoVersion.startsWith("4.1.0")
+                                // This is explicit to catch the test model which has a 4.1.1-SNAPSHOT Tribuo version.
+                                || tribuoVersion.equals("4.1.1-SNAPSHOT"))) {
+                    // rewrite the model stream
+                    regression41MappingFix = true;
+                    int[] mapping = (int[]) regressionClass.getDeclaredMethod("getIDtoNaturalOrderMapping").invoke(outputIDInfo);
+                    List<Booster> copy = new ArrayList<>(models);
+                    for (int i = 0; i < mapping.length; i++) {
+                        copy.set(i,models.get(mapping[i]));
+                    }
+                    this.models = copy;
+                }
+            } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException e) {
+                throw new RuntimeException("Failed to rewrite 4.1.0 or earlier regression model due to a reflection failure.",e);
+            } catch (ClassNotFoundException e) {
+                // Deliberately ignored: if the regression class cannot be loaded then this is not a regression model,
+                // as otherwise reading the input stream would already have thrown ClassNotFoundException.
             }
         } catch (XGBoostError e) {
             throw new IOException("Failed to deserialize the XGBoost model",e);

diff --git a/Core/src/main/java/org/tribuo/impl/IndexedArrayExample.java b/Core/src/main/java/org/tribuo/impl/IndexedArrayExample.java
@@ -40,6 +40,9 @@
  * <p>
  * Used in feature selection to provide log n lookups. May be used
  * elsewhere in the future as a performance optimisation.
+ * <p>
+ * Note: output id caching is only valid with single dimensional {@link Output}s like ClusterID, Event and Label.
+ * Other outputs may return -1 from {@link #getOutputID()}.
  */
 public class IndexedArrayExample<T extends Output<T>> extends ArrayExample<T> {
     private static final long serialVersionUID = 1L;

diff --git a/Interop/ONNX/src/main/java/org/tribuo/interop/onnx/RegressorTransformer.java b/Interop/ONNX/src/main/java/org/tribuo/interop/onnx/RegressorTransformer.java
@@ -57,6 +57,9 @@ public Regressor transformToOutput(List<OnnxValue> tensor, ImmutableOutputInfo<R
         } else if (predictions[0].length != outputIDInfo.size()) {
             throw new IllegalArgumentException("Supplied tensor has an incorrect number of dimensions, predictions[0].length = " + predictions[0].length + ", expected " + outputIDInfo.size());
         }
+        // Note this inserts in an ordering which is not necessarily the natural one,
+        // but the Regressor constructor sorts it to maintain the natural ordering.
+        // The names and the values still line up, so this code is valid.
         String[] names = new String[outputIDInfo.size()];
         double[] values = new double[outputIDInfo.size()];
         for (Pair<Integer,Regressor> p : outputIDInfo) {
@@ -116,6 +119,9 @@ public List<Regressor> transformToBatchOutput(List<OnnxValue> tensor, ImmutableO
         float[][] predictions = getBatchPredictions(tensor);
         List<Regressor> output = new ArrayList<>();
 
+        // Similar to transformToOutput, names and values are ordered by
+        // the id, not the natural ordering, but the Regressor constructor
+        // fixes that.
         String[] names = new String[outputIDInfo.size()];
         for (Pair<Integer,Regressor> p : outputIDInfo) {
             int id = p.getA();

diff --git a/Interop/Tensorflow/src/main/java/org/tribuo/interop/tensorflow/RegressorConverter.java b/Interop/Tensorflow/src/main/java/org/tribuo/interop/tensorflow/RegressorConverter.java
@@ -33,6 +33,7 @@
 import org.tribuo.Example;
 import org.tribuo.ImmutableOutputInfo;
 import org.tribuo.Prediction;
+import org.tribuo.regression.ImmutableRegressionInfo;
 import org.tribuo.regression.Regressor;
 import org.tensorflow.Tensor;
 
@@ -89,6 +90,9 @@ public Regressor convertToOutput(Tensor tensor, ImmutableOutputInfo<Regressor> o
         }
         String[] names = new String[outputIDInfo.size()];
         double[] values = new double[outputIDInfo.size()];
+        // Note this inserts in an ordering which is not necessarily the natural one,
+        // but the Regressor constructor sorts it to maintain the natural ordering.
+        // The names and the values still line up, so this code is valid.
         for (Pair<Integer,Regressor> p : outputIDInfo) {
             int id = p.getA();
             names[id] = p.getB().getNames()[0];
@@ -152,6 +156,9 @@ public List<Regressor> convertToBatchOutput(Tensor tensor, ImmutableOutputInfo<R
         List<Regressor> output = new ArrayList<>();
         int batchSize = (int) predictions.shape().asArray()[0];
 
+        // Similar to convertToOutput, names and values are ordered by
+        // the id, not the natural ordering, but the Regressor constructor
+        // fixes that.
         String[] names = new String[outputIDInfo.size()];
         for (Pair<Integer,Regressor> p : outputIDInfo) {
             int id = p.getA();
@@ -171,21 +178,28 @@ public List<Regressor> convertToBatchOutput(Tensor tensor, ImmutableOutputInfo<R
     @Override
     public Tensor convertToTensor(Regressor example, ImmutableOutputInfo<Regressor> outputIDInfo) {
         TFloat32 output = TFloat32.tensorOf(Shape.of(1,outputIDInfo.size()));
+        // We map through the id to natural order mapping as the regressor's values
+        // might not be stored in id order.
+        int[] ids = ((ImmutableRegressionInfo) outputIDInfo).getIDtoNaturalOrderMapping();
         double[] values = example.getValues();
-        for (int i = 0; i < values.length; i++) {
-            output.setFloat((float) values[i],i);
+        for (Pair<Integer,Regressor> p : outputIDInfo) {
+            int id = p.getA();
+            output.setFloat((float) values[ids[id]],0,id);
         }
         return output;
     }
 
     @Override
     public Tensor convertToTensor(List<Example<Regressor>> examples, ImmutableOutputInfo<Regressor> outputIDInfo) {
         TFloat32 output = TFloat32.tensorOf(Shape.of(examples.size(),outputIDInfo.size()));
+        // We map through the id to natural order mapping as the regressors' values
+        // might not be stored in id order.
+        int[] ids = ((ImmutableRegressionInfo) outputIDInfo).getIDtoNaturalOrderMapping();
         int i = 0;
         for (Example<Regressor> e : examples) {
             double[] values = e.getOutput().getValues();
             for (int j = 0; j < outputIDInfo.size(); j++) {
-                output.setFloat((float)values[j],i,j);
+                output.setFloat((float)values[ids[j]],i,j);
             }
             i++;
         }