From 386cbfa469d63fdf1fecfc044dfc82df587eb98b Mon Sep 17 00:00:00 2001 From: Geoffrey Stewart Date: Tue, 15 Mar 2022 15:43:39 -0700 Subject: [PATCH 1/5] Improve the prediction technique by identifying points far away from exemplars as noise, rather than requiring points to be close to noise point exemplars. Signed-off-by: Geoffrey Stewart --- .../clustering/hdbscan/HdbscanModel.java | 45 ++++++--- .../clustering/hdbscan/HdbscanTrainer.java | 91 ++++++++++++++----- 2 files changed, 103 insertions(+), 33 deletions(-) diff --git a/Clustering/Hdbscan/src/main/java/org/tribuo/clustering/hdbscan/HdbscanModel.java b/Clustering/Hdbscan/src/main/java/org/tribuo/clustering/hdbscan/HdbscanModel.java index 711fb2f93..c90a15338 100644 --- a/Clustering/Hdbscan/src/main/java/org/tribuo/clustering/hdbscan/HdbscanModel.java +++ b/Clustering/Hdbscan/src/main/java/org/tribuo/clustering/hdbscan/HdbscanModel.java @@ -68,14 +68,17 @@ public final class HdbscanModel extends Model { private final List clusterExemplars; + private final double noisePointsOutlierScore; + HdbscanModel(String name, ModelProvenance description, ImmutableFeatureMap featureIDMap, ImmutableOutputInfo outputIDInfo, List clusterLabels, DenseVector outlierScoresVector, - List clusterExemplars, DistanceType distType) { + List clusterExemplars, DistanceType distType, double noisePointsOutlierScore) { super(name,description,featureIDMap,outputIDInfo,false); this.clusterLabels = clusterLabels; this.outlierScoresVector = outlierScoresVector; this.clusterExemplars = clusterExemplars; this.distType = distType; + this.noisePointsOutlierScore = noisePointsOutlierScore; } /** @@ -115,18 +118,38 @@ public Prediction predict(Example example) { if (vector.numActiveElements() == 0) { throw new IllegalArgumentException("No features found in Example " + example); } + double minDistance = Double.POSITIVE_INFINITY; - int clusterLabel = -1; - double clusterOutlierScore = 0.0; - for (HdbscanTrainer.ClusterExemplar clusterExemplar : clusterExemplars) { - double distance = DistanceType.getDistance(clusterExemplar.getFeatures(), vector, distType); - if (distance < minDistance) { - minDistance = distance; - clusterLabel = clusterExemplar.getLabel(); - clusterOutlierScore = clusterExemplar.getOutlierScore(); + int clusterLabel = HdbscanTrainer.OUTLIER_NOISE_CLUSTER_LABEL; + double outlierScore = 0.0; + if (Double.compare(noisePointsOutlierScore, 0) > 0) { // This will be true from models > 4.2 + boolean isNoisePoint = true; + for (HdbscanTrainer.ClusterExemplar clusterExemplar : clusterExemplars) { + double distance = DistanceType.getDistance(clusterExemplar.getFeatures(), vector, distType); + if (isNoisePoint && distance <= clusterExemplar.getMaxDistToEdge()) { + isNoisePoint = false; + } + if (distance < minDistance) { + minDistance = distance; + clusterLabel = clusterExemplar.getLabel(); + outlierScore = clusterExemplar.getOutlierScore(); + } + } + if (isNoisePoint) { + outlierScore = noisePointsOutlierScore; + } + } + else { + for (HdbscanTrainer.ClusterExemplar clusterExemplar : clusterExemplars) { + double distance = DistanceType.getDistance(clusterExemplar.getFeatures(), vector, distType); + if (distance < minDistance) { + minDistance = distance; + clusterLabel = clusterExemplar.getLabel(); + outlierScore = clusterExemplar.getOutlierScore(); + } } } - return new Prediction<>(new ClusterID(clusterLabel, clusterOutlierScore),vector.size(),example); + return new Prediction<>(new ClusterID(clusterLabel, outlierScore),vector.size(),example); } @Override @@ -145,7 +168,7 @@ protected HdbscanModel copy(String newName, ModelProvenance newProvenance) { List copyClusterLabels = Collections.unmodifiableList(clusterLabels); List copyExemplars = new ArrayList<>(clusterExemplars); return new HdbscanModel(newName, newProvenance, featureIDMap, outputIDInfo, copyClusterLabels, - copyOutlierScoresVector, copyExemplars, distType); + copyOutlierScoresVector, copyExemplars, distType, noisePointsOutlierScore); } private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException { diff --git a/Clustering/Hdbscan/src/main/java/org/tribuo/clustering/hdbscan/HdbscanTrainer.java b/Clustering/Hdbscan/src/main/java/org/tribuo/clustering/hdbscan/HdbscanTrainer.java index e23650ecc..f603bd83b 100644 --- a/Clustering/Hdbscan/src/main/java/org/tribuo/clustering/hdbscan/HdbscanTrainer.java +++ b/Clustering/Hdbscan/src/main/java/org/tribuo/clustering/hdbscan/HdbscanTrainer.java @@ -82,6 +82,8 @@ public final class HdbscanTrainer implements Trainer { static final int OUTLIER_NOISE_CLUSTER_LABEL = 0; + private static final double MAX_OUTLIER_SCORE = 0.9999; + /** * Available distance functions. * @deprecated @@ -241,7 +243,10 @@ public HdbscanModel train(Dataset examples, Map r ImmutableOutputInfo outputMap = new ImmutableClusteringInfo(counts); // Compute the cluster exemplars. - List clusterExemplars = computeExemplars(data, clusterAssignments); + List clusterExemplars = computeExemplars(data, clusterAssignments, distType); + + // Get the outlier score value for points that are predicted as noise points. + double noisePointsOutlierScore = getNoisePointsOutlierScore(clusterAssignments); logger.log(Level.INFO, "Hdbscan is done."); @@ -249,7 +254,7 @@ public HdbscanModel train(Dataset examples, Map r examples.getProvenance(), trainerProvenance, runProvenance); return new HdbscanModel("hdbscan-model", provenance, featureMap, outputMap, clusterLabels, outlierScoresVector, - clusterExemplars, distType); + clusterExemplars, distType, noisePointsOutlierScore); } @Override @@ -705,14 +710,15 @@ private static Map>> generateClusterAssignme } /** - * Compute the exemplars. These are representative points which are subsets of their clusters and noise points, and + * Compute the exemplars. These are representative points which are subsets of their clusters, and * will be used for prediction on unseen data points. * * @param data An array of {@link DenseVector} containing the data. * @param clusterAssignments A map of the cluster labels, and the points assigned to them. * @return A list of {@link ClusterExemplar}s which are used for predictions. */ - private static List computeExemplars(SGDVector[] data, Map>> clusterAssignments) { + private static List computeExemplars(SGDVector[] data, Map>> clusterAssignments, + DistanceType distType) { List clusterExemplars = new ArrayList<>(); // The formula to calculate the exemplar number. This calculates the number of exemplars to be used for this // configuration. The appropriate number of exemplars is important for prediction. At the time, this @@ -721,37 +727,69 @@ private static List computeExemplars(SGDVector[] data, Map>> e : clusterAssignments.entrySet()) { int clusterLabel = e.getKey(); - List> outlierScoreIndexList = clusterAssignments.get(clusterLabel); - - // Put the items into a TreeMap. This achieves the required sorting and removes duplicate outlier scores to - // provide the best samples - TreeMap outlierScoreIndexTree = new TreeMap<>(); - outlierScoreIndexList.forEach(p -> outlierScoreIndexTree.put(p.getA(), p.getB())); - int numExemplarsThisCluster = e.getValue().size() * numExemplars / data.length; - if (numExemplarsThisCluster > outlierScoreIndexTree.size()) { - numExemplarsThisCluster = outlierScoreIndexTree.size(); - } if (clusterLabel != OUTLIER_NOISE_CLUSTER_LABEL) { + List> outlierScoreIndexList = clusterAssignments.get(clusterLabel); + + // Put the items into a TreeMap. This achieves the required sorting and removes duplicate outlier scores to + // provide the best samples + TreeMap outlierScoreIndexTree = new TreeMap<>(); + outlierScoreIndexList.forEach(p -> outlierScoreIndexTree.put(p.getA(), p.getB())); + int numExemplarsThisCluster = e.getValue().size() * numExemplars / data.length; + if (numExemplarsThisCluster > outlierScoreIndexTree.size()) { + numExemplarsThisCluster = outlierScoreIndexTree.size(); + } + + List subsetClusterExemplars = new ArrayList<>(); + for (int i = 0; i < numExemplarsThisCluster; i++) { // Note that for non-outliers, the first node is polled from the tree, which has the lowest outlier // score out of the remaining points assigned this cluster. Entry entry = outlierScoreIndexTree.pollFirstEntry(); - clusterExemplars.add(new ClusterExemplar(clusterLabel, entry.getKey(), data[entry.getValue()])); + subsetClusterExemplars.add(new ClusterExemplar(clusterLabel, entry.getKey(), data[entry.getValue()])); } - } - else { - for (int i = 0; i < numExemplarsThisCluster; i++) { - // Note that for outliers the last node is polled from the tree, which has the highest outlier score - // out of the remaining points assigned this cluster. - Entry entry = outlierScoreIndexTree.pollLastEntry(); - clusterExemplars.add(new ClusterExemplar(clusterLabel, entry.getKey(), data[entry.getValue()])); + + // For each of the exemplars in this cluster, iterate the remaining nodes in the tree to find the maximum + // distance between the exemplar and the members of the cluster. The other exemplars don't need to be + // checked here since they won't be on the fringe of the cluster. + for (ClusterExemplar clusterExemplar : subsetClusterExemplars) { + double maxInnerDist = Double.NEGATIVE_INFINITY; + for (Entry entry : outlierScoreIndexTree.entrySet()) { + double distance = DistanceType.getDistance(clusterExemplar.getFeatures(), data[entry.getValue()], distType); + if (distance > maxInnerDist){ + maxInnerDist = distance; + } + } + clusterExemplar.setMaxDistToEdge(maxInnerDist); } + clusterExemplars.addAll(subsetClusterExemplars); } } return clusterExemplars; } + /** + * Determine the outlier score value for points that are predicted as noise points. + * + * @param clusterAssignments A map of the cluster labels, and the points assigned to them. + * @return An outlier score value for points predicted as noise points. + */ + private static double getNoisePointsOutlierScore(Map>> clusterAssignments) { + + List> outlierScoreIndexList = clusterAssignments.get(OUTLIER_NOISE_CLUSTER_LABEL); + if ((outlierScoreIndexList == null) || outlierScoreIndexList.isEmpty()) { + return MAX_OUTLIER_SCORE; + } + + double upperOutlierScoreBound = Double.NEGATIVE_INFINITY; + for (Pair outlierScoreIndex : outlierScoreIndexList) { + if (outlierScoreIndex.getA() > upperOutlierScoreBound) { + upperOutlierScoreBound = outlierScoreIndex.getA(); + } + } + return upperOutlierScoreBound; + } + @Override public String toString() { return "HdbscanTrainer(minClusterSize=" + minClusterSize + ",distanceType=" + distType + ",k=" + k + ",numThreads=" + numThreads + ")"; @@ -771,6 +809,7 @@ final static class ClusterExemplar implements Serializable { private final Integer label; private final Double outlierScore; private final SGDVector features; + private Double maxDistToEdge = Double.NEGATIVE_INFINITY; ClusterExemplar(Integer label, Double outlierScore, SGDVector features) { this.label = label; @@ -789,6 +828,14 @@ Double getOutlierScore() { SGDVector getFeatures() { return features; } + + void setMaxDistToEdge(Double maxDistToEdge) { + this.maxDistToEdge = maxDistToEdge; + } + + Double getMaxDistToEdge() { + return maxDistToEdge; + } } } From e7ec51a132d536c457af56b6ffe45a1c75251e94 Mon Sep 17 00:00:00 2001 From: Geoffrey Stewart Date: Wed, 23 Mar 2022 14:09:25 -0700 Subject: [PATCH 2/5] make some improvments to the PR including some new assertions for predictions made using outlier points --- .../clustering/hdbscan/HdbscanModel.java | 1 + .../clustering/hdbscan/HdbscanTrainer.java | 50 ++++++++++--------- .../clustering/hdbscan/TestHdbscan.java | 20 ++++++++ 3 files changed, 48 insertions(+), 23 deletions(-) diff --git a/Clustering/Hdbscan/src/main/java/org/tribuo/clustering/hdbscan/HdbscanModel.java b/Clustering/Hdbscan/src/main/java/org/tribuo/clustering/hdbscan/HdbscanModel.java index c90a15338..03558ffd3 100644 --- a/Clustering/Hdbscan/src/main/java/org/tribuo/clustering/hdbscan/HdbscanModel.java +++ b/Clustering/Hdbscan/src/main/java/org/tribuo/clustering/hdbscan/HdbscanModel.java @@ -136,6 +136,7 @@ public Prediction predict(Example example) { } } if (isNoisePoint) { + clusterLabel = HdbscanTrainer.OUTLIER_NOISE_CLUSTER_LABEL; outlierScore = noisePointsOutlierScore; } } diff --git a/Clustering/Hdbscan/src/main/java/org/tribuo/clustering/hdbscan/HdbscanTrainer.java b/Clustering/Hdbscan/src/main/java/org/tribuo/clustering/hdbscan/HdbscanTrainer.java index f603bd83b..e96cbbeeb 100644 --- a/Clustering/Hdbscan/src/main/java/org/tribuo/clustering/hdbscan/HdbscanTrainer.java +++ b/Clustering/Hdbscan/src/main/java/org/tribuo/clustering/hdbscan/HdbscanTrainer.java @@ -55,6 +55,8 @@ import java.util.TreeSet; import java.util.logging.Level; import java.util.logging.Logger; +import java.util.stream.IntStream; +import java.util.stream.Stream; /** * An HDBSCAN* trainer which generates a hierarchical, density-based clustering representation @@ -715,6 +717,7 @@ private static Map>> generateClusterAssignme * * @param data An array of {@link DenseVector} containing the data. * @param clusterAssignments A map of the cluster labels, and the points assigned to them. + * @param distType The distance metric to employ. * @return A list of {@link ClusterExemplar}s which are used for predictions. */ private static List computeExemplars(SGDVector[] data, Map>> clusterAssignments, @@ -740,29 +743,28 @@ private static List computeExemplars(SGDVector[] data, Map subsetClusterExemplars = new ArrayList<>(); - - for (int i = 0; i < numExemplarsThisCluster; i++) { - // Note that for non-outliers, the first node is polled from the tree, which has the lowest outlier - // score out of the remaining points assigned this cluster. - Entry entry = outlierScoreIndexTree.pollFirstEntry(); - subsetClusterExemplars.add(new ClusterExemplar(clusterLabel, entry.getKey(), data[entry.getValue()])); - } - - // For each of the exemplars in this cluster, iterate the remaining nodes in the tree to find the maximum - // distance between the exemplar and the members of the cluster. The other exemplars don't need to be - // checked here since they won't be on the fringe of the cluster. - for (ClusterExemplar clusterExemplar : subsetClusterExemplars) { + // First, get the entries that will be used for cluster exemplars. + // Note that for non-outliers, the first node is polled from the tree, which has the lowest outlier + // score out of the remaining points assigned this cluster. + List> partialClusterExemplars = new ArrayList<>(); + Stream intStream = IntStream.range(0, numExemplarsThisCluster).boxed(); + intStream.forEach((i) -> partialClusterExemplars.add(outlierScoreIndexTree.pollFirstEntry())); + + // For each of the partial exemplars in this cluster, iterate the remaining nodes in the tree to find + // the maximum distance between the exemplar and the members of the cluster. The other exemplars don't + // need to be checked here since they won't be on the fringe of the cluster. + for (Entry partialClusterExemplar : partialClusterExemplars) { + SGDVector features = data[partialClusterExemplar.getValue()]; double maxInnerDist = Double.NEGATIVE_INFINITY; for (Entry entry : outlierScoreIndexTree.entrySet()) { - double distance = DistanceType.getDistance(clusterExemplar.getFeatures(), data[entry.getValue()], distType); + double distance = DistanceType.getDistance(features, data[entry.getValue()], distType); if (distance > maxInnerDist){ maxInnerDist = distance; } } - clusterExemplar.setMaxDistToEdge(maxInnerDist); + clusterExemplars.add(new ClusterExemplar(clusterLabel, partialClusterExemplar.getKey(), features, + maxInnerDist)); } - clusterExemplars.addAll(subsetClusterExemplars); } } return clusterExemplars; @@ -809,12 +811,13 @@ final static class ClusterExemplar implements Serializable { private final Integer label; private final Double outlierScore; private final SGDVector features; - private Double maxDistToEdge = Double.NEGATIVE_INFINITY; + private final Double maxDistToEdge; - ClusterExemplar(Integer label, Double outlierScore, SGDVector features) { + ClusterExemplar(Integer label, Double outlierScore, SGDVector features, Double maxDistToEdge) { this.label = label; this.outlierScore = outlierScore; this.features = features; + this.maxDistToEdge = maxDistToEdge; } Integer getLabel() { @@ -829,12 +832,13 @@ SGDVector getFeatures() { return features; } - void setMaxDistToEdge(Double maxDistToEdge) { - this.maxDistToEdge = maxDistToEdge; - } - Double getMaxDistToEdge() { - return maxDistToEdge; + if (maxDistToEdge != null) { + return maxDistToEdge; + } + else { + return Double.NEGATIVE_INFINITY; + } } } diff --git a/Clustering/Hdbscan/src/test/java/org/tribuo/clustering/hdbscan/TestHdbscan.java b/Clustering/Hdbscan/src/test/java/org/tribuo/clustering/hdbscan/TestHdbscan.java index 1ef1abf77..d171c28b0 100644 --- a/Clustering/Hdbscan/src/test/java/org/tribuo/clustering/hdbscan/TestHdbscan.java +++ b/Clustering/Hdbscan/src/test/java/org/tribuo/clustering/hdbscan/TestHdbscan.java @@ -172,6 +172,26 @@ public void testEndToEndPredictWithCSVData() { assertArrayEquals(expectedLabelPredictions, actualLabelPredictions); assertArrayEquals(expectedOutlierScorePredictions, actualOutlierScorePredictions); + + CSVDataSource nextCsvTestSource = new CSVDataSource<>(Paths.get("src/test/resources/basic-gaussians-predict-with-outliers.csv"),rowProcessor,false); + Dataset nextTestSet = new MutableDataset<>(nextCsvTestSource); + + predictions = model.predict(nextTestSet); + + i = 0; + actualLabelPredictions = new int[nextTestSet.size()]; + actualOutlierScorePredictions = new double[nextTestSet.size()]; + for (Prediction pred : predictions) { + actualLabelPredictions[i] = pred.getOutput().getID(); + actualOutlierScorePredictions[i] = pred.getOutput().getScore(); + i++; + } + + int[] nextExpectedLabelPredictions = {5,0,3,0,4,0}; + double[] nextExpectedOutlierScorePredictions = {0.04384108680937504,0.837375806784261,0.04922915472735656,0.837375806784261,0.02915273635987492,0.837375806784261}; + + assertArrayEquals(nextExpectedLabelPredictions, actualLabelPredictions); + assertArrayEquals(nextExpectedOutlierScorePredictions, actualOutlierScorePredictions); } public static void runBasicTrainPredict(HdbscanTrainer trainer) { From 8555ae492be5e0d8793ff5ef52b829ad8e3d023b Mon Sep 17 00:00:00 2001 From: Geoffrey Stewart Date: Wed, 23 Mar 2022 14:10:24 -0700 Subject: [PATCH 3/5] add the new csv file for prediction test --- .../resources/basic-gaussians-predict-with-outliers.csv | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 Clustering/Hdbscan/src/test/resources/basic-gaussians-predict-with-outliers.csv diff --git a/Clustering/Hdbscan/src/test/resources/basic-gaussians-predict-with-outliers.csv b/Clustering/Hdbscan/src/test/resources/basic-gaussians-predict-with-outliers.csv new file mode 100644 index 000000000..32b0fade8 --- /dev/null +++ b/Clustering/Hdbscan/src/test/resources/basic-gaussians-predict-with-outliers.csv @@ -0,0 +1,7 @@ +Feature1,Feature2,Feature3 +-2.3302356259487063,3.9431416146381046,1.0315528543744679 +12.5,15.0,17.1 +0.41679363204429154,8.247732287302664,9.810651956897404 +-16.0,-13.3,14.4 +1.2947698963877157,-1.0272570581099394,1.6991984313559259 +-14.9,-13.9,-15.5 From 3a5e8ba24937670e2197523f5df1bbc8c880c5b1 Mon Sep 17 00:00:00 2001 From: Geoffrey Stewart Date: Wed, 23 Mar 2022 16:34:59 -0700 Subject: [PATCH 4/5] Add a test which deserializes a 4.2 model and asserts that the predictions are correct --- .../clustering/hdbscan/TestHdbscan.java | 50 ++++++++++++++++++ .../Hdbscan_minClSize7_L2_k7_nt1_v4.2.model | Bin 0 -> 38297 bytes 2 files changed, 50 insertions(+) create mode 100644 Clustering/Hdbscan/src/test/resources/Hdbscan_minClSize7_L2_k7_nt1_v4.2.model diff --git a/Clustering/Hdbscan/src/test/java/org/tribuo/clustering/hdbscan/TestHdbscan.java b/Clustering/Hdbscan/src/test/java/org/tribuo/clustering/hdbscan/TestHdbscan.java index d171c28b0..8474e548f 100644 --- a/Clustering/Hdbscan/src/test/java/org/tribuo/clustering/hdbscan/TestHdbscan.java +++ b/Clustering/Hdbscan/src/test/java/org/tribuo/clustering/hdbscan/TestHdbscan.java @@ -40,6 +40,9 @@ import org.tribuo.math.distance.DistanceType; import org.tribuo.test.Helpers; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.ObjectInputStream; import java.nio.file.Paths; import java.util.Arrays; import java.util.Collections; @@ -54,6 +57,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.fail; /** * Unit tests with small datasets for Hdbscan @@ -236,6 +240,52 @@ public void testBasicTrainPredict() { runBasicTrainPredict(t); } + @Test + public void deserializeHdbscanModelV42Test() { + String serializedModelFilename = "Hdbscan_minClSize7_L2_k7_nt1_v4.2.model"; + String serializedModelPath = this.getClass().getClassLoader().getResource(serializedModelFilename).getPath(); + + HdbscanModel model = null; + try (ObjectInputStream oin = new ObjectInputStream(new FileInputStream(serializedModelPath))) { + Object data = oin.readObject(); + model = (HdbscanModel) data; + if (!model.validate(ClusterID.class)) { + fail("This is not a Clustering model."); + } + } catch (IOException e) { + fail("There is a problem accessing the serialized model file " + serializedModelPath); + } catch (ClassNotFoundException e) { + fail("There is a problem deserializing the model file " + serializedModelPath); + } + + ClusteringFactory clusteringFactory = new ClusteringFactory(); + ResponseProcessor emptyResponseProcessor = new EmptyResponseProcessor<>(clusteringFactory); + Map regexMappingProcessors = new HashMap<>(); + regexMappingProcessors.put("Feature1", new DoubleFieldProcessor("Feature1")); + regexMappingProcessors.put("Feature2", new DoubleFieldProcessor("Feature2")); + regexMappingProcessors.put("Feature3", new DoubleFieldProcessor("Feature3")); + RowProcessor rowProcessor = new RowProcessor<>(emptyResponseProcessor,regexMappingProcessors); + CSVDataSource csvTestSource = new CSVDataSource<>(Paths.get("src/test/resources/basic-gaussians-predict.csv"),rowProcessor,false); + Dataset testSet = new MutableDataset<>(csvTestSource); + + List> predictions = model.predict(testSet); + + int i = 0; + int[] actualLabelPredictions = new int[testSet.size()]; + double[] actualOutlierScorePredictions = new double[testSet.size()]; + for (Prediction pred : predictions) { + actualLabelPredictions[i] = pred.getOutput().getID(); + actualOutlierScorePredictions[i] = pred.getOutput().getScore(); + i++; + } + + int[] expectedLabelPredictions = {5,3,5,5,3,5,4,4,5,3,3,3,3,4,4,5,4,5,5,4}; + double[] expectedOutlierScorePredictions = {0.04384108680937504,0.04922915472735656,4.6591582469379667E-4,0.025225544503289288,0.04922915472735656,0.0,0.044397942146806146,0.044397942146806146,0.025225544503289288,0.0,0.04922915472735656,0.0,0.0,0.044397942146806146,0.02395925569434121,0.003121298369468062,0.02915273635987492,0.03422951971100352,0.0,0.02915273635987492}; + + assertArrayEquals(expectedLabelPredictions, actualLabelPredictions); + assertArrayEquals(expectedOutlierScorePredictions, actualOutlierScorePredictions); + } + public static void runEvaluation(HdbscanTrainer trainer) { DataSource gaussianSource = new GaussianClusterDataSource(1000, 1L); TrainTestSplitter splitter = new TrainTestSplitter<>(gaussianSource, 0.7f, 2L); diff --git a/Clustering/Hdbscan/src/test/resources/Hdbscan_minClSize7_L2_k7_nt1_v4.2.model b/Clustering/Hdbscan/src/test/resources/Hdbscan_minClSize7_L2_k7_nt1_v4.2.model new file mode 100644 index 0000000000000000000000000000000000000000..a05ecf7fb78c8323d9f416a8498cc8b6ff6b5011 GIT binary patch literal 38297 zcmdRX30TbE_y4phDiX3KNtS4rByy&eEJ=lg_GzI_O-Z(F2?^Pzuovd+wcFc~6lF38qX64EENh2m88* z25Niwg@({+!M*|B+CHA{As%i4+H<;oF9`Ic`BA!l#Kb9*^C=1fX3KDze~_PBa0s2E zFn^U>n43;0-PccNzHbP9#(avrfNQ>+JIya-4P_HWY(7QK6A|44JZKK6LbV7I0U=-2GD}lEd;^{eNbR1-Ora6Z0`{mObb~;^PmR?(Ou$(x2WxzTAMw!wk!ZtlK*zI0z&2(sLZ z=0*<Dvf2ZqrC_-#k0Oz2)uOwb|fgA5liA~6L_?SkWXM4#{= z3hIBAZY|M66>y)6TT>)Md?RS{DN??kMD+)B>u-K-QM(T5hKAb#gP`6Q^XF@q4G%)f z2l5AuHh;iq&v6U!K_F7nRSzF4xfV)L#H}d(`~uxPt=#zCYE9`&_d%WP6X@p|9t2bZ zQf{pb`mKQeNzf_%1^V=;VQzk*G=$#W7(1GqUsppDk`xLW#bVYJabHhl^`LH66O5>} zF`J|XMZ(`L+=3$MPjd_4XME9TzmQOW`!%6%$R>m^x*LN1YMLL7?&i0|E!dYo9{9a0 zKtsa|HKUYAU}ykcFgAo;L6OelCl#pjrc?7P2W4b4f+Z)cZGDq7D9`lWPnSc~JF-pJ z=iBV#BjL(U5mc5hI*=m37($)6(D2>6&3&ly-j63#-1hmt z5XTt$Jz)%`POvgKO`mK)Rq$0wq&y1GQCP3lVQO@QTAAnY?X3JHgdrA=J|u`)-T&_% zA-k79=(Vi_=rnIyup;M7QrniDMnvOF5go;!jl?h@_-sP`-D$y5R}PPoJ;d50PNASt zihlU@8%$^#!B@=B4kNST{H#(qogyz-GPL~p%M~&}t6R6=l|`Ez=jP;s*R7NIBV#_L zk7uAiT5#x;?xWnEM(2-m^`2pSLqlCI7aFiSAaHGfc~{R1mqj8=)C0{2em(za%s#~M z<_~d6ehrmTg(iCh`fCRUyLtH0wEf)NL$v+<(DEJV=MhTpnnnw{)}Z-;0p5H(^uw>t zTngFNTK^KEt0M*t6_2-9bgbuh1mA{{U6>Rzvhln6e_U6&*axy)l1K^5tH^Sx?&T6U z!%?Ni;Z?vq(9aJoUSGsg27;I zA!sqw4QL#o^~M5iyZrr%=x~Y6?Ok-DRSZyC@~;x9ze*^=5~B6G4Mx1qYm4jx*owYr z8*vNv@afuy_?_BkKBcdRpIbHhA7;x0hWwG2r2F!B z+Yw#e%g^X6^zuU0Kqk^0P?iry?{XxthTdx?@HsFml9vzr%aCy73-k#$GITLCp#L`x z6&gJgdhWfynzUR0IZ>p2LkNGyiPFzE#GbY$l!hDvYl>_@sJ~^n8*;~JAzeAamWXow z5Up$PM{dkty{!-u7#i$B!}X+7#{WZSV?NX#!%%w$gm?u8BUcOc52n#6hX1m^`OYGi z>#dE&P#Z{Rp${|<`Fr5-ZcQgx_5{s_+I4z2q9+=k+8!Zc+UE94_#E~`W!3-4VfB1) z^n>QB#C(bpU$%?iJQ(?PsDk`qM5kE(%c=zh`}!m2F^m?XgH#h^5UWBpRq*olqoKKq z);XFdADTb^W}_(@;^U@2)sXKM(J7|?y)wRFH$`a0T<{MIe-P;{YWt$$qb+n{%mV|w ze7%vw>$%W<6&Sd%$SU@M|WE=8sQc2;JTXdFteH!=FE6 z{$q#zyPX299iPsp{GxxAHtSmMNq3X4|8GAz}McbrY`7= zg{F|7Cs|j16ZI%2L5OwvElcT?S-skfW`&^Hv&nhUy-oMpFXhfdpK(DUo069$m z^U&Ugq|hld2>HbP9a=CX`HIo#HUBrGS17G*=^sR2XV)WEZ$ns&KX`~3MAt!l9U8{n zmHzGWqTq&rRbAi&6B$$R`Umi3Y+FCypoC;a$OZTx4LIqvD>Q5Y0 z51{i#FgpB(qpc|jH3jZN)Ra^HU48!Wi1EUU>_?#qV!rbOD*Q3=XFpQN2ni(|HL_gy zWtdKpS`!+Gf?UBA_;*%xTbImz+=6>*QE~GgmjX4uX~)oP?;1*W$Y=FMUPZS#d6Lhu zi~q3)Et(;;AmlgjSAKrUaf&AYR{~nn`2AsvVtW50qpgsy)uYkf)-B;uifrvVMfe8l z@a53qkZ;2oq*#vFjC}*O`Q?&YNtXTPdB4#vj5hxuWE!6h1=r{lFP+69$gR-vMnToJ zZozaNzVT=`_t05LoZy8qPdrP6u}*>s^$cAHOBIXBT68l9vE^g|APy~I!jxlt-UFQ~k5qO((oo!pPc58@DB zaTgT7RWEmm)X9k}OqVtGJw;*h55WKZ;QiA%#W?3q1=9eRD%AGhN;37N(X)W>cqGSQ<^Um_AikPuI#!-_X$9 z!gA_VGgAXgQ&R)ODO0ByOt&Dq>Yp64duRd~Xiw4B)v~uSwY8tK(18#+ggQ(GJF__W z5~5CQPH3T9W;l3<6QiMk2>v1OTu)D1S6fdXP5uRL9;yrNRUHW=!C)Z9kBZT+=_zRZ z1XFao?T0}kr<%V`^P*mCFa-rC$lH)^;==RLUkHDUodLc9VSygRA(05|p=SPP_~VxRqoexL5Y3z)@gmmhKW2T`WF;@Nh#MO|6k7l3 zl&Sws70H__zLHmHp8wCJyH8<0(rvUoS}r!9Z*5^|YJ(;jx{VNQAol2X1~t{Jf47|A zF~<+i_taASs|q4S->rd&Bc?RpjuJXWY&FqM}w~li0^-bQAtYih_o{p!^wY>JlRMXhN=Q1MWqXui)=U zl%^p5V6uFrX7|&&Nq2AU9}$KAX?Fq5zfaM81;lX_^>)`Wf`3RhS^nn{>|bdL5+a2B z!`Zk?8UMs-B0P#}{&5gB@pAJEp=thkKok`6!$ex>zAu>G)sKG~siCR*XTkqXtM&iC zv|8`~nAK?1h@M)Bu_}6yH2Lcc_uuvka*c>{ZZ9WQ%|A|}CS3t-!C|u(Ro!+X>}Lx^ zG&DsA*Juwf2=b3e{*C(4J0@(@LgaN)}O9DRZmyX$UsYX z8edX$=Cg@L8;>mxN(lunzzNtkMT;7=IApLN>180HPTf(zO!_k0Ja5{!OY$*VUf};hyjV(_@GT^BCld&HPv1hoDZv zaGCH)72iQZKV3IbGQy!ir>JzlniD>7vY%}sc;18#oak?RMapKDl zSNx}4;>)eWgF=J(g|jZi48H8)gzn(Go^(j$hzDdHrD;k~$DEZ=Vcm+!?o5su@O+gz z!EU`E({+4U^Ud=@cK+p`1mDr)zk=xYUZl(7$Xk!*YOMYtiLOu3^?KK{5eTKRULfFI9bkZirn|Yp-yyE+O`g&c&`*AAeidR&=}Yn$jk! zYNLlv=m&eM+Ravlw8i$l!)>?d=lOkDWG~iz3w^Ds)#^)C{cgm1KVO!rcB{>yQE_(f za539+%_5jNy};C{S_IVS$R znyR|-(rM>`>#1Xx$&}mV%k>VoPg=IJqoJ3|=1_Jl?^N_JqmI#_>s_xtLLEzYPVioE zv3Ixxd0O9rO0KR5KKIEE+TYeqA^`rwxPy1;CZC)aQnEj1%YHX!(^X!a1XWIM%X{MijNakmqjLiej_ai>pktr~#i`IM`-^Q;{-dwUsFDgn zWtlc&y~CZVnkFTAEvFajcq0mn@7rC;t`GkFOf8g7m29o1Wli7PJ6u9uY49ds^#b?t z)9LB`T-IcNJaOdWq@DMvGBUPfug0(Y2e_iyCK-Oc!F`)xztsJJdv?cIxh3P&OsNvI z49gjZcl8c;;hz1y0TE|k_zOE>?9(b;pojNtCmtF-d?$};(}DKa4;+Ssc}I8rDyAMB>Q@09Vy7A4s}|53gV zdx6XTwJBt!QE50;sngZ;nqy;jUFb5VTHdAJt@Axlw!r0VFL3bjduqiBofhiA$(JmL z*4I)g(OIP~S@FHYMP`jGxo;cUOWR4z8Ll@+`v7&s8PzqdBLktT{~e0&FxlSW4oO~V z>vyy_I7z9iEu)x=sUyQaAM?DB1ob~Bj-@B}Fe`*@cW9_z#415+%9iu z-C8K=H)3hc)uFvxcOiJm6Sa-5T~|ls(-7$_z6aj*qIx3V-6QS~Cy^;-eu^LvWtf1= z2nqbr6AGphDFRfaOoTuv!pcRoVI71R&Jtoci**oXy?$UmTtS?|c({y65%v)&Odz0e zwkIk!1M^`WA~g~*jo`y#m|2AI&w52bF&<74EI2D-Gd5R*KuBO}PXJgs!AG!&h!HHf z_pvP)nJ5zhz0V^EyJJ%h`L6c)o!L<&=h6!C$(hhV{= zBC;3&69@pD6-g0hU|VpC;1kh{4Z@%z_F)16@XvVI77;NliwW3L5m3yG@i0rT00aS3 z30WeG6=6Kghf`Pwks=6~Dw4uf%!~oBBBBg`h!6-AEQV898}2ep#VHXyOu)z(3Nwq8 z;Ve!OQ1}DO5-Ea!nTawYi_35dGZQHhD3L57hV}L&fswH+QHDRTL6}7Zv?pW{W~>OO zdY!A7g+L+T31*xUVIi{rqKp9Oi5Z(Kf`Tm&NfB8Lh1-BA!&!{{&nc`I%ivF$;#mR3ar}xCkGSB~b7OA%;JC(t*W9QbZP8 zK*-`U%q&8{%0-w(vbaVf5(FRdfxC=g!M2D%VTqnnxcUT@fWpX_iVeaj1T)SODa?mc zA}mA}>mafswZt{TRBQ__BT^#So~*(w1WK>kFqM!HsRFh@#6A&bYysxOWkk&}6xJb< z!V*MEM2t|0KYCI`PzeJ5z&d)8CCacO5f+R`un?d`nFy#z7NcNhf`$0NhGS|^sed7X z4I%(Ukcl$mPVddB$f|+!pJy1orm!q-T>^#3Vlko& z>m{;;EM~z_*o>aaumncwi4Ut3Nr~tc!Nc{y1VTbYtS6P2rKgk#9HPz)&Ipy)ujNiCEARm4Lz@1PlJaRqd$^!{d73GD3nt#=zcWU+D) zK8!3vz-HhSfr2G4vWTom7E2INBH5ncF$-2nlwkrEBT^!KB2+AfQE-!B=AOzhRV0O( zF+9P72{}ss)`9j z3LB171VO}Rf{H(|Eg~t5f>S-=VdWSUOJGpUN0ea}B88DfBrr3E$50pr1K={8!YDnZ z2wD6gB8DY;Vj-9@0jIDSks>|_D$e3mPbvv!5qOM(sTc*Pdd0(H1PkWF1d(Azfa2Lh zWU(wEMwHPNEzq_4WIK{?Nw(%xm;b*s=mR%ikaqa-d`SA zTed#mclZJDmVex)PLF|7y`S*IwjHXbUI`q%xf7C8ro8h|KLcsUbIiuPGlKHnYrR7r zCqW@KZBE|aBB-uSv$GxM2i5lN5?_5Bp<=dEled{6@Xif8y~B+M9F0;HCE5#M-0&;T zXgmXrhgN6qp(zQ!CIK%CI%GfG*$yvUTuoXq1y&YHLD)r}4YCKY!2y+_z!Fq6VA**4c+;W0neGDNxea+HG8IaiH%YI(`v9uO zC>wfiwt))nuL0?O`at2A^-T?%lHjpYXjZ6WDm0ur=A7l&2_+@o>8YCSP}w?q(}Z`K zBre0|>-MCl(NObs#kfffAK;~py?c@E4~-88jo3Os5x6;%&*@&>4crN7Lv;d}a4TuT z(nAY{;uK)LJIR$WoD01BM?=1;D+2R|Z2VxU{!soQwBcCL72q}B(E1u^D--}u&DZ<) z;v=ElFUV<^&352k*HwC)Jpx$A-dr=#i~!F0nM31D#DRBu)9mAKjzD3+`Z>o`j7a&a zWBY#;KMV)1s%u)snpwbFsLEMiS^@=@;@Jm^CXxMBllU>2k){uv-!iPi)ra8a?roQh z1|^X=4TrzV*$(*vT<6_$_ppw`>)~RNF9K#l(KQVlGpS~BoH1|RUfeooE-5GM*U}KJF$1#X+qP7P;&Oj(bSalona2USy%SXV4|!4y`v!cyj~Fjjo*Dad87w z?mo1DB_0OZqaFsb6I!6;dV|8&1@lS#ve9Y-zwC0x`5W|MFi;X|AKBbVH_L%)<&84C zHmL!dIwUM)_cbW2OYzur(aQgG zxx^3DAH7a7kI;ZO4%16Fw5366LjLU6yHlXzjzenOg?eDG+wZd7#)jl?P<9S=vWbI| zCkkc@Jg-1CHzUP-7HO-hRa=AK;8@Oi=m~2!eXSQ&$yc zK&A|^#Nzx;(B8p|%Q-reKcN16o3mrhcuOcduAFh>ohwvd39HNAEDoGH*UR%b+t7IW zs8ASj5-K&HBxotT0%laXSNnzjP|z4S()daNdS!kw)Hx>;SYDaITAIrw+8wSJcrQt5$ z2gCrk*hTe=tr=W5Ge2S_xgQvhZfoxA!-OKH_z~&Veoz=1oxZIa;RP*gKH`=MMJLlX z71{5HhLJOe(Vxh{^Ps_-awK?A+NRdIt3exH_@4db@oobMzR4tPPto6G`!fQnrnEg& z0mdrxhC^<@pn|GAAk#wz>g(C-(m!4o%0ZE&#%|5>iBQ}Y;XF8&0YzIun#-#IcGApc z*O#_IiSp#+w!QO#A>p$5;i3(|ap*j!Ua`*&!2fWc_9Mk|_wYMa!J^*Z&-&x7`A3&w$mi+1^N1&wsnP#FI61J*Bv zqOd39N&9Pbk1B3qB|_csvEe^2#zIB&>+py^8=$__;OvE^-$=gN?FyaS9u`2!V*3Ou zNCD1I37EOP0m^^A2=82>0z8SZhApnXP`UJ+`8Byz5Pa7G9#7D6KRsOzN)YNrezqU#2wXsN@D>-eeRaTv0=bnQnD~ixfLqaj!K&j*bN1@Dsp}*MhMp*;2zez zxy?8Lm=yyn`)v4t+U>GqdS?YRZ);k-Kf4uJpW}xwn<)+~r~J@vP^-r5Gi6)Bn~fv*8ZFSQH&P!;%B_>@N{d!_}anRqohj!y{z9nY%pS z^;eLAh8L{)$(`@eI!!zVI8lZ?GaPJr6{ob$EX6qgm2V zpmLC^c2r~xG^M@uGcai&_o-TU<57*%S3!ekzXq?LIl!DOHpalG42m~@WADg+0EK}$ zed}5bp?L4)4ZG69fw$#r;>R&QP-!#t@|k;%P-Xq==O<5hC^ULhR9mqfifPV6?Mz~! zI?t-~-M(MIdN4HFIb8?povpS|L*jswPoJNrp8(9_y)g>zE6_esYWQQE0=zc++F9Wq z4s64TgYU0$gV&oiuLs@Mgj%c32QBwJ2gau#l_P9afwj$5$;o&=@CyCwqDwNNK6q71 zt?p8&wzEEaD#{Sp7LSIwc=re9fq|oC+Ll16uDd*xq(h0RRn6=HvA|uJ9ICl=0aR`? zIm}E8BiomEd1&MGt!+>#|JeW1st&Yo?pf?~eJKdOqXlJ7zhO!g2YB~9s%%Q)$^I-m z6Z=S3D+gYg%8vP-p9g4*zEn$Nd+9>an8oXBbU9G0>#(m6%Lv}M9Gg;S^PC*7c}}d+m77o8%>s}lP`>d9`X2&;)%);|VV9}g@)t?v!B zQ^#LB!JGrUdE)Ofre;9#>Fb_LsJr3i_brY*`F5z?7ZAWfGng~&Kp)%o17trmej7aO z$E#o{*Q;CURFDHDzV|(EwxvSj0*yW&XokR=U>8|FZ`oH3 zjrZI+Yc!Og@cr%eQ4eFFR#iuCOr8SSzV)%GyG>$Dp=q>}@zCuQcs+wza%@--RJi_( z3QTeb=7->8=LVHQ&Hb&?Yo}a+vgqWTYZgNDC}ew-&$JBxuA@z^pDfE5F%{=vcsgxg z-Nuw4sFr;dE;}(8N>pA@a0t~)(;9FJ{CLRDNwyLa;0OW6O?>%Ibz zhmw%H*KZdLgNhePSF_s=LGitT;(eT|fW79Hf5*4=z|t-n)LOO*n4510y2u{_PTGR{ zoqHQey9#$M+kLrK4XUrkQl$rJL-F^!rNa-s0G_P@Z=rWOJWHS3K0JR8G`_wtX6~L4 zav!MroVsv@4g=UzZx_CQJ`iek1IMTwv4gs%YsF*FhCpf1`whQm2LZS2%JrgvzEF1k z?YUg*c;I?V9e0h0hWg<(D{angfb#V#950n7K=CvEiCg^@pyH*!m*JXA&=j*^y>W&( z)O`9L%UL%VID76Ef7$ySnvS3T;MB1VswNk7+_=yIycw@sW;eYP)<^goDu!Q4s(gM# z_S(sb`Yv$jzr)NZl8 z@s#Qg-0J=-bd5ZrHZ^7H$iO6MUS%{V@XB%0-tv#-^Ul=K;Z@>22C~_91(&1twpYt=A{>)j_w}(?LGynl<)d@cqszwz__?L zHV?|0O_aZ#41?QWzElm7Y=pw+Vol#2>Y-FVZPZI=cc|Jf&vyTC4R|hv`U`Z+NV`}| z&P@v|T}0;D9dQFe?;GUkDt}WvJs5Zo&ShU-^$IHT6D<6$j|aA*k&7XlOU4BX#CENE zI%pD9g=f#C-oFHI*t?Hrv1USH#1{K!rVeO6YeyYAH4$o$H_b^?oeR%rn)ImO$}S^{v`Nt)MPy+61qg;ZXVHQj_&DBPhGm>SLG0Laj$-c7xJA8t1SRtxCQ*0bhr~(QJ>BQ#gsww z810r+_aZ1$tlM?=>=WRaH}1o zkA)}e=k}dzC<*mv1`Rd3wGv894(8lmgZ%g0vBMY6v;yWYcS+aaBk*YO!U>JT=}>Et zOdIP|2D!H%8aLXZ^SA7lp%qgcph4I2c#OXTR5<0A7ujb)Ra)D&EgIoaQ9bmM@m&X~ zZ@&8^V-517-nNRLtH6GHWsToVqlZ*-eASHH@kQSB1+Zhr+s$&H0EIpeMpz8p4{Whw zYhB|}P-8q>X7P>}Al%>UU(GhwXi$WTJ6_NsM#;Emobjs2TuLK!1z_r0+ZkbTwZN8+*+DDW9-x7kgB zl;P48o&Dp(py-jh`$v`MQ2BkAi$w5DsEcEqj6wTNzFpbbyidx|Flx)J z<#%#`@!@#cIn(2Ky&uZCp+2t}s?J98(D}oE?^1R!*>24=E19*-G+-UMB+)VN5Hz+(&R#B8g#L;|w$;WlEl{4h zG$?ZA1*q8f^`oIh2kJN75q;+&`JGK=&reuGMb1sBqyz(CIdoi|UmXCIDFyZ#iH%S* zYVnubMea~{qi7viy8@e0GeyTP2uhx=ajNZWjN%44x>nPYpp5FhX=Q~rISx57L*Mib z2!q;_CYCkH5l}LF?OBQ?+Fznvwsf3&0cCfNT-f)(2?XE1CC?Fp^_|3FQP)0dpLPqF zwM@N%cP*eg^zr>^vSq+9e{OrRcn(w^w4C*3zdF>dy&g1X4cgBQW?f)x3Wn!SIeUCI zBLAR0(tFL2Gf?RK*;(Oq2C!r^`Wp^2CgTc~voZ?5EDeDAS2{aqqH|fTnbWylC&rWe zCnIChR=4F_Nqg!}9!pQBL__hEi?bIlL+6yw*&`!f%?0*ud!03(jmU9Rw0PIa+4nS| zq1tmo=;;_J3-$6}_%s$?E~;BC(Ju*|4`!PVJ{kwFN-frIT(tt~cAjZYbw}&l!YLz` z`KLj{f=zmT-gBU)!SCc9rX$qvzDB>xj)l^>0grEv3r72hPg%8$5-=?Emye3t5B0U` z6EEhjCjHd1ulGXxt`8>1cWt`*_u)%if%%I0o4Y6%`5l~Bd#x|PgXZXPkE=OQ5nZKv zwYCu|kGhr)ShNC~##lv$Zf*q5NeP|IjQ7Bu?mHf)dI{S>_?z@Ud9!W5Wux<1MN9P- z_A?YG4%fe@6QTfx4VIH!)@%oMagMaA{ud~v%bb>~vx0)-F}pHZs=(P;;Qy@u5GdI9 zd*0Y(IZ(TEp6|qYAK>N4e$rD9YzJONmR#;nTi_me?>TAPBcWc=-^lN?Zu#={JCqnj zYwTNmjckXPG0PeKWwM}N_U5GZ2N}R%TjfnKcZRA^Jr5h?AG4Ddt6ddShMK&g17qUM z;8|Qz`H{GZLVRFO+xOG<5c2!By{c^f?nuUah4E$fwA}JMwj)$qNv})sjfFaMNpAgB z4&?8toNAES4&}znX2)#PCeLL>UlY`!q=pTBg!8To5R+Ba=) zL-SNH9_!Bg7pBc`95w=hfDxJr1U;yvzx_gT3OHdG-vUAE^bk2LA*I;;` z2Q|%!8)xhFA@2do%I03nQJDpmZpp9uXm&u+q(NIE?)HN+j`z09gHU~Lm~l=WNB;1v zj$cJjXQ8X*bw zm2a>o&l~wKl@6?;%0P`=T}KEl40zX$^qmm(2p+a|4i>LzflBxI*J5uY$$7+%x;Q3d z^KWRFJW1@fuNSaWj}4DP>yF?)g4D~IGAh$M8Lcyt2`&bKqsac_?s;-2fO-;Y7bwQ3 zYoT#{ot<-)iQ+nf_OF#_4{(n`^Y--TW{uVgsOtNuveAJ_;?`~*F*`~@4mit#o;6%h zCCgb?%)T&_JkhzVW$x8=t-#*@!P|XS3DmDJ`1-*{2E`F}812lAhiWH3ug*v-v>y&t zn|LG&3i@%%tv#H8>9KECivK1uzR@r|-F?i+7N`~1SGxWB2|8!(x_C+IBpJ70@2q;* zdDaLjEI*H~>mPykuZiDg-^n23@a*!{KYY;WRb0o+5tjLL3;h88-kfL;mnivI@I$No)K6XM~=v2)+7qx*oco!60D}dsSPln3v>IZk6 zBiEnOX#t+sg+X7v4#SJ&oGXdZED-$RDso+J$h*-rGmZ{rEYB&GKK-EB?fd$hQZu1= z^ro=(^UA;!f4jk&tq)xD1GY=#yr3lcMn7XCbY7Rp8&$ew15}G|zjXACJ85^_hCy~& z9d^*PbgS%m<9xDQ7_ZD9*Lwe6br2-iO>4Z`gwC;D{v&X=e%jNLV1~|thbCv%rNXs2 zeRq`2q@XzB;vmY67Bv6$tW)h%ppM3Tp^%Tx0aokvvI9_D!)&s-{qs`bPKl0Rt?f$c zs~u83SEJ1UN-U3fgv??=)nNBgGiM!vD!r(Jh2`^6KldHI{mlqyDD+*qbj24aZ%j?1 zeYpgASFI(3?pZ+P&ygbwb^YLV9e06Nd^{9Z$-2W-I+QnQpXY?RLi6m&o`%zqzaq6( zBgSnm)Vi!KJ*u)FYV(ey&bnv|Y~Jni#2eL66Oqw2O+pU1A&FthmhZ^*B`+@jtJa5i zz?yz<;ZU($czyetJ&K@@CToudb3BeQK%6#H9OiCzc>!; zR=Q(*=0tc=UL_v#5ydf-{4Li=mO{0@HB~Qggotr$No~-M^mMp>cKHVNO4H6SS|$pHZ%Jgu+g*!v^kh z@G4+P{KFp%C?7#dp~u=mEj>SqyFeGpwH7Dmgw6#{>Ale%{`HXac;bfK{w2U!`F!%- zF$3Y=vdb=+6VZL)=+H6i=cPbp|25|$-@8N69(n0kmAm0?mCiidhpABU_1Na8r%}92 zM#AIi;V>vppPoKsaTJub)_CVE)P#~%<+?R%&H%^C(CU+_9aJB_?EB`SB$Ue>oc2zG z3K!pJ&zRcb3MKIo&Mb-wa6eELLfpck^neHJ*El*n_F5csey%Gp+K{$ zXx$9QMzY!j|Xm@ zu0h75Dr$lm{t7ce@b`0|G;99o#Klk0z1R-VFXk*@opkcjeU=K90n%z0#_B@xSk9L# zG2@|%;`zQ}9|aoMil34wLg&NqWlO|aXdOSHHNs<-64aGlb-ViD4G8{*8?aps5`P;r z$a%ur=-v3GUXmOi)iS zMB*?#ZST+0Lg(A;qhm*nMt<}snKwlv(Y=)A`N>ST4q*M1y&)cq`GxVL@_C;azcbM} z=fr^+^ObRMYyTU%r7!Bgqf-52FQWL5ATG*rP43%dQU<&w=Z5!PBnK}8rY$jB-vX?D zg((vjyoQFnxOn>fDBy9mR>8j8@SaL?_!FB9PoWt}fpnwdL5 z&CX)|mUmakxJR|6>F3r<$AS5MPM$jQZ%dVZPu(()MD5Tv{{Y-oT453WCG(7pc5_!HZ$;>TvM(thazkbWn3CPb7#K(m9S)3oo>Kw##p?3HCuvzH* zRjPF?!?#Qu-CrGF;i)DL6*HUi&TT>ayhF#KRw{~T6{x43zoFGI8J*an`r?If2&81bWG^IHj_4fnSJdQTA<(WcYJ1L1zn z_35*7V|zWh|K0XFxpe(#6u($^K`iuH7L;tOH8Ogg2F1hd9n@yjLs>5EhI4WjO&GIv?ceSDO@upoFvpbEUV)wus@-7+Z zoKur{_Yk`GT)M#E+`H{?f060?*+7M|_i5i#zPXZqJ}X@L>WXikP+I9Ry#meq(qPZu zUhOete-|%VU$xQrCKOJeymR8Pn{cnv+;$$b0?IWvAK0&D0F}Nwe)V-!A^8P=DiJD& z4U~I3vp-ZuwGD^~+E4o1%(qV-j#}CYjqkJfx@vGCzl^qHS^f%OPI)Posjm+u6$OLm zt2;xj&bG)Kla-+~GU36-AblusJDW56_h+ctbL--;>~!+{$jG*NKK4p9slQ@i-7s1`bEE+D^NV{-g|a@I+R(j ziHYFZ!b9IW##uWbq<3;bU|j_;RhGn@*oxlgz_771wmSpMvM}mZpJu3>RyuT7!zmjw1f`^|N_pP=ehakN?^x-S&GcVf6Nef`VH0IEM7SyHm+ z1u4hU(+OL6c_jJnNqD}mTc$Xke#;ElOtIZ@(#wE#Vf&2>;rbx>>wmy2op?S>@sMzy zka$(UiWRQUm4nK0dU8vzzJZdgRX5MwL+@9nQDvU*LUE9mFAw`Hl7yP29G-@23lv(N zF#GD&0u{{1Lq>n@1h!Jr*LB<|U}bb}%uiN>Dv8aUP1BD<(S11G=!o8(bmVzOx2HfO zZ&KDQyD_9*VceFz{56LLR!|||&t=DBCwOjpz<%G)C-8jNxL}?0eo$k!?#-g*!N9GT zUb&#s394s=(O%UIgW>@iE^y9=TrcYLR{M!v^#SI!qh}tMHj{cvk{Tjg?oWcEL3`LW z$33BlnpgLsU@mYCQoJ>St^qS`_zsUaN9hwGfI#)nOYKqF! ztQII(F0<;!VdSqkwO1vrc??wvxkYBGRH&0FgVBypNWA9>$J8(8mO$}xp3^)O2dLTi zQGaDB9a!7jgVNFaK;gZ#F#an%PgO>EYyH}=1la8RSH~Vfe(ACs6(bVSJ14W3I(d1j zQ1PMF;@~nHU=G_gWgw3VMN2|dcWq_@clc`Su{K)J6s~LK5?esN*I}+-I$HYLc1X6U z9WC~h3Y<8pzPCRnKy~bPTOK1Fs-a_s<5=W3Bn059YK(YZNP z=E&7qP!;=3}bD|C<<^p2QbHUL__5(JC4hE8(?@W z&RiB)0QDW5OAk+=ILVD?aRu^Ba{XdmUbJm`brrBe)wNaMeTJ$tIl7e(jo`(U%O;7b z@5uQtj9XTC>1cn~*#v?=Rtc52zI~q?vjPgI4A^d0{{R?;DJPRhG9hc1fEcJZ17#mO_4+FYH9?g3t* zI4v3vWt4>QWLNTjO6Ye!o3ERzZe$CLNo(yK@+r`qo;vNLmKnUh`R&|uFIT9MdeZ*m z7>eijUq!iVt_u(I4_>E@L2+)e<2S8$p>8zZi5n46qm|oYHOdGoCw#A9oW%Q9LqVeIm|_YX>iRRVz}DOD z+*0fV_0fY`Vqc6S-&wpkw)omfwG3eYVBQ@pj^bJBD{0Ex(0#RGzwyz@$j|CG_r)%K z2`F_*cS~OMgp5bS^+P^X2Op0R$!Hc?a3{(CF{w$eqxEQ%tCVfa@Y@k?YU!0j>pDQS@U~htx#;4 zKhkB|NivQpyk8fNixRI&3kQo#sIu)peb$c$z?A!PX5#nxz`2NJWE8#NvZF6JqZ4IBZLjuI1nmP7z=`-U&l zGbN$8XtA91I&>~9O!hCe%Ov#*-@jC*=+M|U=-qsEPTBFBD87EHX2`lz!{8Trulw=hT)vU{pZC(L;2PYf0cc}axoc;zfTM5NlC8>uqIvHhD#lhpmDh)v9H(&PXPv0o!hHm_ zPZnuD7DMl|R_K>KZyXQ2h>7h354V%&-9qQf(HaY90dq0i@oX@PN1c%Hv_4Qk>Z@}- zc{%=E5!8x3vsqeV2lY>Hw(YR87W$V^8T#mf*GH2pgC~Q8Mt9-mX9+oK~4P^`IBDA zpSbeaQvIqcdbjy)Kv=Ok@bn+Aq$FiQar~zhhdTR_=Lq3_UG1{4w)nYde95?l@;0SF zt!$*hCUHDYYF}MEsSDnD75SZ&kKLw3 z^*c%0TamHo!{q0|&92D)?>3bmp8f;F#V+@f2 z!Jn{(eC?3-3+ zG^LE|s07UR>#KZGe4Tq&@nv!NNnk2`t1J7=0d`HvgQss_LjB!VtL+^XD2}vO{r%W8 zz&#Z=L&c*3O0LMB&~lCu{yz`sJi>@D-Z%%Xue_@bs>lzQdU#k*?ff#S_c zHmdVhGSK^lmy?%Ur$XM#R+TS{=uqeWcJ8t5Z-MbGVv+fL949R}xM#y7G|%geUa6ga zsSed;<8u?09H43Lse$PWJK@dJ5yK`r(#iUjNDZiozn~B0JI?<8GBgP)H_>TM? z!FXrCSUcfJ)|6*EG1p{gZDXqScNhjBr zdV{zfS|tj={m9wnII96z=XNeXzFrjs|FHnME*GaJrmsvzl&vY*eBjM7b$JqK9+VfS`Y3{kv)S=1nJPhi%6PxV%@fFfmBbxw5& zxqkA#g6ApwdeZ-^Y>(~S6Z{T1oYal7mj@#|>>8cTSCRgb;D6RYcy&ki|4@fQ$47pL z*CT)Oa#P>Ib9rQcm9NseENhIuPc~J3g3e-9bWfaPboT&yPgY*P|Li_}RA25Tm%(P} zUAL{>;_21sJ^PLN3C$?}thsT#>4qP|_r4@vz3u36Qz!q1^7G2d1~yK>U7KpPMD6>6n zvHEx_u>F%y+HHyjp3mE_Wm&GkRq^hyj{6Ap=}L!HHykJ5@7CGKdG=2~0x@0xbq&;% zsd)~5sY%wW?DK>^PKNIAQjM#%IuQ9^51y4DZAb4(1>YMJejltX@|fwzJ+>fRH_NA9 zU7K8l{O7d_xwl$|LH+i&CC`=(Li^e8@~67vp;*%x{g=7ueUEDD=0$&*kMR7gA=D@t1~a-|u{SI+uKBojKv+o3}qH=$&<0tU)x| z|Ku!8&do&UO4Z@pFOF^oM(fUR=O2_n)uII*3fldlBtoU(X%c$hzur(kL9QO^9{JhK zn{f)NFC=o!CtIQKMND|TY27yv#toQFw+pvZ3dp?hJspP{H6^0Y6ey&uT7L(v3oPT^ z@t-d}f@i&Z=r;!^f85ODMjVZOu~&LgV{P+rrK$C@H5^ z*h`#*28q#cTGr%1rN)=NT9Xvuq2e{;YerPy9lfKr;Ne>Gdk2}`ci9^=RH4Ax@rhZE z9Xxm2XEXP_1+u3ue%1eK>fD23isCrFl9cfnvn$Wk*wr>PHAzY0CsO21S@MWPLTEkO z(28g(WFo|EvN4rO4@7$Dfs&NnDeZ2%+il-Al}D18ru_ccz5IF4+;eBnne#j6+~4=} zonIsm)O$NqMOo&!Z;sG!x{Uk?on3o6Io?zs1r&_C*Va$}Y^oxJ{x)x$gb4KI4QhG$ z*-lW}(4KMvdGvz=740frUMO?`c`s8|?pRFCm!2e9 z=1mqLnI#H}x%~_jb>p)ZTPJ`xZ?n$v_W=-v?;$KBt#O>gYcsjX5r=YL+Qrp_R90DL zwWJJ0nH8aR>;Byj+74PuxC@b+GU^3N>7Vc8fWQV;TT?JAwXc?cRTj|JR+D*~OYQ_%gj2io);nb@5KL%{Bud8H&M ze!TKdbj7%DCp2%C4uG0b9@^J|JeYr6xot)s{+HU8c-R+1d*G3kM`z>yYUD{;?o{!M z?xQZ1Fu^mJ!2Esb4YPuo)u2kOiK%Y1GI2=FOH7%#e79|%BclgsBx>#yguYN8P9Q}A3;-Hd3h)&2-ICC zCa*}w` Date: Fri, 25 Mar 2022 10:13:51 -0700 Subject: [PATCH 5/5] Update the comments regarding the cluster exemplars --- .../org/tribuo/clustering/hdbscan/HdbscanTrainer.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Clustering/Hdbscan/src/main/java/org/tribuo/clustering/hdbscan/HdbscanTrainer.java b/Clustering/Hdbscan/src/main/java/org/tribuo/clustering/hdbscan/HdbscanTrainer.java index e96cbbeeb..6c0992c84 100644 --- a/Clustering/Hdbscan/src/main/java/org/tribuo/clustering/hdbscan/HdbscanTrainer.java +++ b/Clustering/Hdbscan/src/main/java/org/tribuo/clustering/hdbscan/HdbscanTrainer.java @@ -734,8 +734,8 @@ private static List computeExemplars(SGDVector[] data, Map> outlierScoreIndexList = clusterAssignments.get(clusterLabel); - // Put the items into a TreeMap. This achieves the required sorting and removes duplicate outlier scores to - // provide the best samples + // Put the items into a TreeMap. This achieves the required sorting and removes duplicate outlier scores + // to provide the best samples. TreeMap outlierScoreIndexTree = new TreeMap<>(); outlierScoreIndexList.forEach(p -> outlierScoreIndexTree.put(p.getA(), p.getB())); int numExemplarsThisCluster = e.getValue().size() * numExemplars / data.length; @@ -744,8 +744,8 @@ private static List computeExemplars(SGDVector[] data, Map> partialClusterExemplars = new ArrayList<>(); Stream intStream = IntStream.range(0, numExemplarsThisCluster).boxed(); intStream.forEach((i) -> partialClusterExemplars.add(outlierScoreIndexTree.pollFirstEntry()));