Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve the prediction technique #222

Merged
merged 5 commits into from
Mar 25, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,17 @@ public final class HdbscanModel extends Model<ClusterID> {

private final List<HdbscanTrainer.ClusterExemplar> clusterExemplars;

private final double noisePointsOutlierScore;

HdbscanModel(String name, ModelProvenance description, ImmutableFeatureMap featureIDMap,
ImmutableOutputInfo<ClusterID> outputIDInfo, List<Integer> clusterLabels, DenseVector outlierScoresVector,
List<HdbscanTrainer.ClusterExemplar> clusterExemplars, DistanceType distType) {
List<HdbscanTrainer.ClusterExemplar> clusterExemplars, DistanceType distType, double noisePointsOutlierScore) {
super(name,description,featureIDMap,outputIDInfo,false);
this.clusterLabels = clusterLabels;
this.outlierScoresVector = outlierScoresVector;
this.clusterExemplars = clusterExemplars;
this.distType = distType;
this.noisePointsOutlierScore = noisePointsOutlierScore;
}

/**
Expand Down Expand Up @@ -115,18 +118,38 @@ public Prediction<ClusterID> predict(Example<ClusterID> example) {
if (vector.numActiveElements() == 0) {
throw new IllegalArgumentException("No features found in Example " + example);
}

double minDistance = Double.POSITIVE_INFINITY;
int clusterLabel = -1;
double clusterOutlierScore = 0.0;
for (HdbscanTrainer.ClusterExemplar clusterExemplar : clusterExemplars) {
double distance = DistanceType.getDistance(clusterExemplar.getFeatures(), vector, distType);
if (distance < minDistance) {
minDistance = distance;
clusterLabel = clusterExemplar.getLabel();
clusterOutlierScore = clusterExemplar.getOutlierScore();
int clusterLabel = HdbscanTrainer.OUTLIER_NOISE_CLUSTER_LABEL;
double outlierScore = 0.0;
if (Double.compare(noisePointsOutlierScore, 0) > 0) { // This will be true from models > 4.2
boolean isNoisePoint = true;
for (HdbscanTrainer.ClusterExemplar clusterExemplar : clusterExemplars) {
double distance = DistanceType.getDistance(clusterExemplar.getFeatures(), vector, distType);
if (isNoisePoint && distance <= clusterExemplar.getMaxDistToEdge()) {
isNoisePoint = false;
}
if (distance < minDistance) {
minDistance = distance;
clusterLabel = clusterExemplar.getLabel();
outlierScore = clusterExemplar.getOutlierScore();
}
}
if (isNoisePoint) {
outlierScore = noisePointsOutlierScore;
Craigacp marked this conversation as resolved.
Show resolved Hide resolved
}
}
else {
for (HdbscanTrainer.ClusterExemplar clusterExemplar : clusterExemplars) {
double distance = DistanceType.getDistance(clusterExemplar.getFeatures(), vector, distType);
if (distance < minDistance) {
minDistance = distance;
clusterLabel = clusterExemplar.getLabel();
outlierScore = clusterExemplar.getOutlierScore();
}
}
}
return new Prediction<>(new ClusterID(clusterLabel, clusterOutlierScore),vector.size(),example);
return new Prediction<>(new ClusterID(clusterLabel, outlierScore),vector.size(),example);
}

@Override
Expand All @@ -145,7 +168,7 @@ protected HdbscanModel copy(String newName, ModelProvenance newProvenance) {
List<Integer> copyClusterLabels = Collections.unmodifiableList(clusterLabels);
List<HdbscanTrainer.ClusterExemplar> copyExemplars = new ArrayList<>(clusterExemplars);
return new HdbscanModel(newName, newProvenance, featureIDMap, outputIDInfo, copyClusterLabels,
copyOutlierScoresVector, copyExemplars, distType);
copyOutlierScoresVector, copyExemplars, distType, noisePointsOutlierScore);
}

private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ public final class HdbscanTrainer implements Trainer<ClusterID> {

static final int OUTLIER_NOISE_CLUSTER_LABEL = 0;

private static final double MAX_OUTLIER_SCORE = 0.9999;

/**
* Available distance functions.
* @deprecated
Expand Down Expand Up @@ -241,15 +243,18 @@ public HdbscanModel train(Dataset<ClusterID> examples, Map<String, Provenance> r
ImmutableOutputInfo<ClusterID> outputMap = new ImmutableClusteringInfo(counts);

// Compute the cluster exemplars.
List<ClusterExemplar> clusterExemplars = computeExemplars(data, clusterAssignments);
List<ClusterExemplar> clusterExemplars = computeExemplars(data, clusterAssignments, distType);

// Get the outlier score value for points that are predicted as noise points.
double noisePointsOutlierScore = getNoisePointsOutlierScore(clusterAssignments);

logger.log(Level.INFO, "Hdbscan is done.");

ModelProvenance provenance = new ModelProvenance(HdbscanModel.class.getName(), OffsetDateTime.now(),
examples.getProvenance(), trainerProvenance, runProvenance);

return new HdbscanModel("hdbscan-model", provenance, featureMap, outputMap, clusterLabels, outlierScoresVector,
clusterExemplars, distType);
clusterExemplars, distType, noisePointsOutlierScore);
}

@Override
Expand Down Expand Up @@ -705,14 +710,15 @@ private static Map<Integer, List<Pair<Double, Integer>>> generateClusterAssignme
}

/**
* Compute the exemplars. These are representative points which are subsets of their clusters and noise points, and
* Compute the exemplars. These are representative points which are subsets of their clusters, and
* will be used for prediction on unseen data points.
*
* @param data An array of {@link DenseVector} containing the data.
* @param clusterAssignments A map of the cluster labels, and the points assigned to them.
* @return A list of {@link ClusterExemplar}s which are used for predictions.
*/
private static List<ClusterExemplar> computeExemplars(SGDVector[] data, Map<Integer, List<Pair<Double, Integer>>> clusterAssignments) {
private static List<ClusterExemplar> computeExemplars(SGDVector[] data, Map<Integer, List<Pair<Double, Integer>>> clusterAssignments,
DistanceType distType) {
geoffreydstewart marked this conversation as resolved.
Show resolved Hide resolved
List<ClusterExemplar> clusterExemplars = new ArrayList<>();
// The formula to calculate the exemplar number. This calculates the number of exemplars to be used for this
// configuration. The appropriate number of exemplars is important for prediction. At the time, this
Expand All @@ -721,37 +727,69 @@ private static List<ClusterExemplar> computeExemplars(SGDVector[] data, Map<Inte

for (Entry<Integer, List<Pair<Double, Integer>>> e : clusterAssignments.entrySet()) {
int clusterLabel = e.getKey();
List<Pair<Double, Integer>> outlierScoreIndexList = clusterAssignments.get(clusterLabel);

// Put the items into a TreeMap. This achieves the required sorting and removes duplicate outlier scores to
// provide the best samples
TreeMap<Double, Integer> outlierScoreIndexTree = new TreeMap<>();
outlierScoreIndexList.forEach(p -> outlierScoreIndexTree.put(p.getA(), p.getB()));
int numExemplarsThisCluster = e.getValue().size() * numExemplars / data.length;
if (numExemplarsThisCluster > outlierScoreIndexTree.size()) {
numExemplarsThisCluster = outlierScoreIndexTree.size();
}

if (clusterLabel != OUTLIER_NOISE_CLUSTER_LABEL) {
List<Pair<Double, Integer>> outlierScoreIndexList = clusterAssignments.get(clusterLabel);

// Put the items into a TreeMap. This achieves the required sorting and removes duplicate outlier scores to
// provide the best samples
TreeMap<Double, Integer> outlierScoreIndexTree = new TreeMap<>();
outlierScoreIndexList.forEach(p -> outlierScoreIndexTree.put(p.getA(), p.getB()));
int numExemplarsThisCluster = e.getValue().size() * numExemplars / data.length;
if (numExemplarsThisCluster > outlierScoreIndexTree.size()) {
numExemplarsThisCluster = outlierScoreIndexTree.size();
}

List<ClusterExemplar> subsetClusterExemplars = new ArrayList<>();

for (int i = 0; i < numExemplarsThisCluster; i++) {
// Note that for non-outliers, the first node is polled from the tree, which has the lowest outlier
// score out of the remaining points assigned this cluster.
Entry<Double, Integer> entry = outlierScoreIndexTree.pollFirstEntry();
clusterExemplars.add(new ClusterExemplar(clusterLabel, entry.getKey(), data[entry.getValue()]));
subsetClusterExemplars.add(new ClusterExemplar(clusterLabel, entry.getKey(), data[entry.getValue()]));
}
}
else {
for (int i = 0; i < numExemplarsThisCluster; i++) {
// Note that for outliers the last node is polled from the tree, which has the highest outlier score
// out of the remaining points assigned this cluster.
Entry<Double, Integer> entry = outlierScoreIndexTree.pollLastEntry();
clusterExemplars.add(new ClusterExemplar(clusterLabel, entry.getKey(), data[entry.getValue()]));

// For each of the exemplars in this cluster, iterate the remaining nodes in the tree to find the maximum
// distance between the exemplar and the members of the cluster. The other exemplars don't need to be
// checked here since they won't be on the fringe of the cluster.
for (ClusterExemplar clusterExemplar : subsetClusterExemplars) {
double maxInnerDist = Double.NEGATIVE_INFINITY;
for (Entry<Double, Integer> entry : outlierScoreIndexTree.entrySet()) {
double distance = DistanceType.getDistance(clusterExemplar.getFeatures(), data[entry.getValue()], distType);
if (distance > maxInnerDist){
maxInnerDist = distance;
}
}
clusterExemplar.setMaxDistToEdge(maxInnerDist);
}
clusterExemplars.addAll(subsetClusterExemplars);
}
}
return clusterExemplars;
}

/**
 * Selects the outlier score which is reported for points that are predicted as noise points.
 * <p>
 * When the noise cluster has members, the result is the largest outlier score found amongst them.
 * When there is no noise cluster entry, or it is empty, {@code MAX_OUTLIER_SCORE} is returned instead.
 *
 * @param clusterAssignments A map of the cluster labels, and the points assigned to them.
 * @return An outlier score value for points predicted as noise points.
 */
private static double getNoisePointsOutlierScore(Map<Integer, List<Pair<Double, Integer>>> clusterAssignments) {
    final List<Pair<Double, Integer>> noisePoints = clusterAssignments.get(OUTLIER_NOISE_CLUSTER_LABEL);
    final boolean hasNoisePoints = (noisePoints != null) && !noisePoints.isEmpty();
    if (!hasNoisePoints) {
        // No noise points were observed, so fall back to the fixed upper bound.
        return MAX_OUTLIER_SCORE;
    }

    // The first element of each pair is the outlier score; track the largest one seen.
    // A strict greater-than comparison is used (rather than Math.max) so that a NaN score is ignored.
    double largestScore = Double.NEGATIVE_INFINITY;
    for (Pair<Double, Integer> noisePoint : noisePoints) {
        final double score = noisePoint.getA();
        largestScore = (score > largestScore) ? score : largestScore;
    }
    return largestScore;
}

@Override
public String toString() {
return "HdbscanTrainer(minClusterSize=" + minClusterSize + ",distanceType=" + distType + ",k=" + k + ",numThreads=" + numThreads + ")";
Expand All @@ -771,6 +809,7 @@ final static class ClusterExemplar implements Serializable {
private final Integer label;
private final Double outlierScore;
private final SGDVector features;
private Double maxDistToEdge = Double.NEGATIVE_INFINITY;
Craigacp marked this conversation as resolved.
Show resolved Hide resolved

ClusterExemplar(Integer label, Double outlierScore, SGDVector features) {
this.label = label;
Expand All @@ -789,6 +828,14 @@ Double getOutlierScore() {
/**
* Gets the feature vector held by this exemplar.
* @return The exemplar's features.
*/
SGDVector getFeatures() {
return features;
}

/**
* Sets the maximum distance from this exemplar to the edge of its cluster.
* <p>
* During training this is set to the largest distance between the exemplar and the
* remaining (non-exemplar) points assigned to the same cluster.
* @param maxDistToEdge The maximum distance value to store.
*/
void setMaxDistToEdge(Double maxDistToEdge) {
this.maxDistToEdge = maxDistToEdge;
}

/**
* Gets the maximum distance from this exemplar to the edge of its cluster.
* <p>
* The value is {@link Double#NEGATIVE_INFINITY} until it has been set. At prediction time a point
* whose distance to the exemplar is less than or equal to this value is not treated as a noise point.
* @return The stored maximum distance value.
*/
Double getMaxDistToEdge() {
return maxDistToEdge;
}
}

}