Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes multidimensional regression #177

Merged
merged 25 commits into from
Oct 1, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
c18a3ed
Fixing RegressionInfo so the ids are assigned in lexicographic order.
Craigacp Aug 20, 2021
96ac9e9
Fixing the id assignment issue in LibLinearRegressionTrainer.
Craigacp Aug 20, 2021
6956b2f
Actually fix the issue in LibLinearRegressionTrainer.
Craigacp Aug 23, 2021
c7037ea
Fix a bug where LibLinearRegressionType reported itself as classifica…
Craigacp Aug 23, 2021
c6ca6b3
Fixing LibSVMRegressionTrainer.
Craigacp Aug 23, 2021
9f49a19
Fix XGBoost.
Craigacp Aug 23, 2021
f0f2a41
Fix a bug in standardized multidimensional LibSVM regressions.
Craigacp Aug 23, 2021
faa6b16
Working on tests for LibSVM regression.
Craigacp Sep 3, 2021
4ceff12
Adding mapping methods to the regression info.
Craigacp Sep 3, 2021
bd58560
Trying a fix for LibLinear.
Craigacp Sep 3, 2021
6d4f601
Fixing a concurrency and reproducibility issue in liblinear by making…
Craigacp Sep 4, 2021
ebec67f
Tidying up the liblinear tests.
Craigacp Sep 4, 2021
c38fded
Fixing TensorFlow.
Craigacp Sep 12, 2021
e370a95
Fixing regression trees.
Craigacp Sep 12, 2021
2a4b946
Updating XGBoost fix.
Craigacp Sep 12, 2021
82fb7b3
Fix for liblinear so models deserialize correctly.
Craigacp Sep 22, 2021
531cc57
Adding an example config file for CART regression trees.
Craigacp Sep 22, 2021
a722a32
Fixing LibSVM deserialization.
Craigacp Sep 23, 2021
644f3c7
Fixing ElasticNetCDTrainer as it also emitted corrupted multidimensio…
Craigacp Sep 23, 2021
048f0bd
Fixing trees.
Craigacp Sep 24, 2021
5027fab
Adding an id test to the regression ensembles.
Craigacp Sep 24, 2021
9fb8066
Fixing TensorFlow again.
Craigacp Sep 24, 2021
aa711e4
Fixing the regression SGD test so it is reproducible.
Craigacp Sep 24, 2021
52515d6
Fix XGBoost so it re-orders things on deserialization.
Craigacp Sep 24, 2021
45e9e2c
Fix for XGBoost and SLM so they don't re-order the dimensions twice w…
Craigacp Sep 24, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,11 @@ public abstract class LibLinearModel<T extends Output<T>> extends Model<T> {

/**
* The list of LibLinear models. Multiple models are used by multi-label and multidimensional regression outputs.
* <p>
* Not final to support deserialization reordering of multidimensional regression models which have an incorrect id mapping.
* Will be final again in some future version which doesn't maintain serialization compatibility with 4.X.
*/
protected final List<de.bwaldvogel.liblinear.Model> models;
protected List<de.bwaldvogel.liblinear.Model> models;

/**
* Constructs a LibLinear model from the supplied arguments.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ public LibLinearModel<T> train(Dataset<T> examples) {
}

@Override
public LibLinearModel<T> train(Dataset<T> examples, Map<String, Provenance> runProvenance) {
public synchronized LibLinearModel<T> train(Dataset<T> examples, Map<String, Provenance> runProvenance) {
if (examples.getOutputInfo().getUnknownCount() > 0) {
throw new IllegalArgumentException("The supplied Dataset contained unknown Outputs, and this Trainer is supervised.");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,11 @@ public abstract class LibSVMModel<T extends Output<T>> extends Model<T> implemen

/**
* The LibSVM models. Multiple models are used for multi-label or multidimensional regression outputs.
* <p>
* Not final to support deserialization reordering of multidimensional regression models which have an incorrect id mapping.
* Will be final again in some future version which doesn't maintain serialization compatibility with 4.X.
*/
protected final List<svm_model> models;
protected List<svm_model> models;

/**
* Constructs a LibSVMModel from the supplied arguments.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@
import org.tribuo.util.Util;

import java.time.OffsetDateTime;
import java.util.ArrayDeque;
import java.util.Collections;
import java.util.Deque;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.SplittableRandom;
Expand Down Expand Up @@ -198,7 +198,7 @@ public TreeModel<T> train(Dataset<T> examples, Map<String, Provenance> runProven
minChildWeight, scaledMinImpurityDecrease);

AbstractTrainingNode<T> root = mkTrainingNode(examples, leafDeterminer);
Deque<AbstractTrainingNode<T>> queue = new LinkedList<>();
Deque<AbstractTrainingNode<T>> queue = new ArrayDeque<>();
queue.add(root);

while (!queue.isEmpty()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,13 @@
import org.tribuo.Prediction;
import org.tribuo.common.xgboost.XGBoostTrainer.DMatrixTuple;
import org.tribuo.provenance.ModelProvenance;
import org.tribuo.provenance.TrainerProvenance;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
Expand Down Expand Up @@ -82,6 +84,9 @@ public final class XGBoostModel<T extends Output<T>> extends Model<T> {

private final XGBoostOutputConverter<T> converter;

// Used to signal if the model has been rewritten to fix the issue with multidimensional XGBoost regression models in 4.0 and 4.1.0.
private boolean regression41MappingFix;

/**
* The XGBoost4J Boosters.
*/
Expand All @@ -93,6 +98,7 @@ public final class XGBoostModel<T extends Output<T>> extends Model<T> {
super(name,description,featureIDMap,labelIDMap,converter.generatesProbabilities());
this.converter = converter;
this.models = models;
this.regression41MappingFix = true;
}

/**
Expand Down Expand Up @@ -198,39 +204,43 @@ public List<XGBoostFeatureImportance> getFeatureImportance() {
public Map<String, List<Pair<String,Double>>> getTopFeatures(int n) {
try {
int maxFeatures = n < 0 ? featureIDMap.size() : n;
// Aggregate feature scores across all the models.
// This throws away model specific information which is useful in the case of regression,
// but it's very tricky to get the dimension name associated with the model.
Map<String, MutableDouble> outputMap = new HashMap<>();
for (Booster model : models) {
Map<String, List<Pair<String,Double>>> map = new HashMap<>();
for (int i = 0; i < models.size(); i++) {
Booster model = models.get(i);
Map<String, MutableDouble> outputMap = new HashMap<>();
Map<String, Integer> xgboostMap = model.getFeatureScore("");
for (Map.Entry<String,Integer> f : xgboostMap.entrySet()) {
int id = Integer.parseInt(f.getKey().substring(1));
String name = featureIDMap.get(id).getName();
MutableDouble curVal = outputMap.computeIfAbsent(name,(k)->new MutableDouble());
curVal.increment(f.getValue());
}
}
Comparator<Pair<String, Double>> comparator = Comparator.comparingDouble(p -> Math.abs(p.getB()));
PriorityQueue<Pair<String,Double>> q = new PriorityQueue<>(maxFeatures,comparator);
for (Map.Entry<String,MutableDouble> e : outputMap.entrySet()) {
Pair<String,Double> cur = new Pair<>(e.getKey(), e.getValue().doubleValue());

if (q.size() < maxFeatures) {
q.offer(cur);
} else if (comparator.compare(cur,q.peek()) > 0) {
q.poll();
q.offer(cur);
Comparator<Pair<String, Double>> comparator = Comparator.comparingDouble(p -> Math.abs(p.getB()));
PriorityQueue<Pair<String,Double>> q = new PriorityQueue<>(maxFeatures,comparator);
for (Map.Entry<String,MutableDouble> e : outputMap.entrySet()) {
Pair<String,Double> cur = new Pair<>(e.getKey(), e.getValue().doubleValue());

if (q.size() < maxFeatures) {
q.offer(cur);
} else if (comparator.compare(cur,q.peek()) > 0) {
q.poll();
q.offer(cur);
}
}
}
List<Pair<String,Double>> list = new ArrayList<>();
while(q.size() > 0) {
list.add(q.poll());
}
Collections.reverse(list);
List<Pair<String,Double>> list = new ArrayList<>();
while(q.size() > 0) {
list.add(q.poll());
}
Collections.reverse(list);

Map<String, List<Pair<String,Double>>> map = new HashMap<>();
map.put(Model.ALL_OUTPUTS,list);
if (models.size() == 1) {
map.put(Model.ALL_OUTPUTS, list);
} else {
String dimensionName = outputIDInfo.getOutput(i).toString();
map.put(dimensionName, list);
}
}

return map;
} catch (XGBoostError e) {
Expand Down Expand Up @@ -299,12 +309,34 @@ private void writeObject(ObjectOutputStream out) throws IOException {
private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
in.defaultReadObject();
try {
models = new ArrayList<>();
this.models = new ArrayList<>();
int numModels = in.readInt();
for (int i = 0; i < numModels; i++) {
// Now read in the byte array and rebuild each Booster
byte[] serialisedBooster = (byte[]) in.readObject();
models.add(XGBoost.loadModel(serialisedBooster));
this.models.add(XGBoost.loadModel(serialisedBooster));
}
try {
Class<?> regressionClass = Class.forName("org.tribuo.regression.ImmutableRegressionInfo");
String tribuoVersion = (String) provenance.getTrainerProvenance().getInstanceValues().get(TrainerProvenance.TRIBUO_VERSION_STRING).getValue();
if (regressionClass.isInstance(outputIDInfo) && !regression41MappingFix &&
(tribuoVersion.startsWith("4.0.0") || tribuoVersion.startsWith("4.0.1") || tribuoVersion.startsWith("4.0.2") || tribuoVersion.startsWith("4.1.0")
// This is explicit to catch the test model which has a 4.1.1-SNAPSHOT Tribuo version.
|| tribuoVersion.equals("4.1.1-SNAPSHOT"))) {
// rewrite the model stream
regression41MappingFix = true;
int[] mapping = (int[]) regressionClass.getDeclaredMethod("getIDtoNaturalOrderMapping").invoke(outputIDInfo);
List<Booster> copy = new ArrayList<>(models);
for (int i = 0; i < mapping.length; i++) {
copy.set(i,models.get(mapping[i]));
}
this.models = copy;
}
} catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException e) {
throw new RuntimeException("Failed to rewrite 4.1.0 or earlier regression model due to a reflection failure.",e);
} catch (ClassNotFoundException e) {
// Ignored: reaching here means this isn't a regression model, as otherwise a ClassNotFoundException
// would already have been thrown while reading the input stream.
}
} catch (XGBoostError e) {
throw new IOException("Failed to deserialize the XGBoost model",e);
Expand Down
3 changes: 3 additions & 0 deletions Core/src/main/java/org/tribuo/impl/IndexedArrayExample.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@
* <p>
* Used in feature selection to provide log n lookups. May be used
* elsewhere in the future as a performance optimisation.
* <p>
* Note: output id caching is only valid with single dimensional {@link Output}s like ClusterID, Event and Label.
* Other outputs may return -1 from {@link #getOutputID()}.
*/
public class IndexedArrayExample<T extends Output<T>> extends ArrayExample<T> {
private static final long serialVersionUID = 1L;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ public Regressor transformToOutput(List<OnnxValue> tensor, ImmutableOutputInfo<R
} else if (predictions[0].length != outputIDInfo.size()) {
throw new IllegalArgumentException("Supplied tensor has an incorrect number of dimensions, predictions[0].length = " + predictions[0].length + ", expected " + outputIDInfo.size());
}
// Note this inserts in an ordering which is not necessarily the natural one,
// but the Regressor constructor sorts it to maintain the natural ordering.
// The names and the values still line up, so this code is valid.
String[] names = new String[outputIDInfo.size()];
double[] values = new double[outputIDInfo.size()];
for (Pair<Integer,Regressor> p : outputIDInfo) {
Expand Down Expand Up @@ -116,6 +119,9 @@ public List<Regressor> transformToBatchOutput(List<OnnxValue> tensor, ImmutableO
float[][] predictions = getBatchPredictions(tensor);
List<Regressor> output = new ArrayList<>();

// Similar to transformToOutput, names and values are ordered by
// the id, not the natural ordering, but the Regressor constructor
// fixes that.
String[] names = new String[outputIDInfo.size()];
for (Pair<Integer,Regressor> p : outputIDInfo) {
int id = p.getA();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import org.tribuo.Example;
import org.tribuo.ImmutableOutputInfo;
import org.tribuo.Prediction;
import org.tribuo.regression.ImmutableRegressionInfo;
import org.tribuo.regression.Regressor;
import org.tensorflow.Tensor;

Expand Down Expand Up @@ -89,6 +90,9 @@ public Regressor convertToOutput(Tensor tensor, ImmutableOutputInfo<Regressor> o
}
String[] names = new String[outputIDInfo.size()];
double[] values = new double[outputIDInfo.size()];
// Note this inserts in an ordering which is not necessarily the natural one,
// but the Regressor constructor sorts it to maintain the natural ordering.
// The names and the values still line up, so this code is valid.
for (Pair<Integer,Regressor> p : outputIDInfo) {
int id = p.getA();
names[id] = p.getB().getNames()[0];
Expand Down Expand Up @@ -152,6 +156,9 @@ public List<Regressor> convertToBatchOutput(Tensor tensor, ImmutableOutputInfo<R
List<Regressor> output = new ArrayList<>();
int batchSize = (int) predictions.shape().asArray()[0];

// Similar to convertToOutput, names and values are ordered by
// the id, not the natural ordering, but the Regressor constructor
// fixes that.
String[] names = new String[outputIDInfo.size()];
for (Pair<Integer,Regressor> p : outputIDInfo) {
int id = p.getA();
Expand All @@ -171,21 +178,28 @@ public List<Regressor> convertToBatchOutput(Tensor tensor, ImmutableOutputInfo<R
@Override
public Tensor convertToTensor(Regressor example, ImmutableOutputInfo<Regressor> outputIDInfo) {
TFloat32 output = TFloat32.tensorOf(Shape.of(1,outputIDInfo.size()));
// We map through the id to natural order mapping as the regressor's values
// might not be stored in id order.
int[] ids = ((ImmutableRegressionInfo) outputIDInfo).getIDtoNaturalOrderMapping();
double[] values = example.getValues();
for (int i = 0; i < values.length; i++) {
output.setFloat((float) values[i],i);
for (Pair<Integer,Regressor> p : outputIDInfo) {
int id = p.getA();
output.setFloat((float) values[ids[id]],0,id);
}
return output;
}

@Override
public Tensor convertToTensor(List<Example<Regressor>> examples, ImmutableOutputInfo<Regressor> outputIDInfo) {
TFloat32 output = TFloat32.tensorOf(Shape.of(examples.size(),outputIDInfo.size()));
// We map through the id to natural order mapping as each regressor's values
// might not be stored in id order.
int[] ids = ((ImmutableRegressionInfo) outputIDInfo).getIDtoNaturalOrderMapping();
int i = 0;
for (Example<Regressor> e : examples) {
double[] values = e.getOutput().getValues();
for (int j = 0; j < outputIDInfo.size(); j++) {
output.setFloat((float)values[j],i,j);
output.setFloat((float)values[ids[j]],i,j);
}
i++;
}
Expand Down
Loading