Skip to content

Commit

Permalink
CMGOS parallelisation fix, LDCOFE small cluster fix, CMGOS input changed
Browse files Browse the repository at this point in the history
  • Loading branch information
JohannGebhardt committed Sep 26, 2013
1 parent 9dcf17e commit 43477e4
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 60 deletions.
2 changes: 1 addition & 1 deletion build.properties
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
extension.version=2
extension.revision=1
extension.update=001
extension.update=002
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,10 @@ public class CMGOSEvaluator implements Evaluator {

private RandomGenerator generator;
/**
* Number of points to define a &quot;small cluster&quot;.
* Small clusters are removed.
* Percentage determining if a cluster is large or small.
*
*/
private int minimumInstancesForCluster;
private double percentage;
/**
* Lambda for regularization method
*/
Expand All @@ -106,7 +106,7 @@ public class CMGOSEvaluator implements Evaluator {
* h
*/
private int h;
private int subsetPoints;
private int numberOfSubsets;
private int fastMCDPoints;
private int initIteration;
CovarianceMatrix[] CovariancematrixPerCluster;
Expand Down Expand Up @@ -141,7 +141,7 @@ public class CMGOSEvaluator implements Evaluator {
* @param fastMCDPoints
* @param numberOfSubsets number of subsets used in fastMCD
*/
public CMGOSEvaluator(DistanceMeasure measure, double[][] points, int[] belongsToCluster, double[][] centroids, int[] clusterSize, int threads, int removeRuns, double probability, int cov_sampling, RandomGenerator generator, int pointCountSmall, double lamda, int cov, int h, int subsetPoints, int fastMCDPoints, int initIteration) {
public CMGOSEvaluator(DistanceMeasure measure, double[][] points, int[] belongsToCluster, double[][] centroids, int[] clusterSize, int threads, int removeRuns, double probability, int cov_sampling, RandomGenerator generator, double percentage, double lamda, int cov, int h, int numberOfSubsets, int fastMCDPoints, int initIteration) {

this.measure = measure;
this.points = points;
Expand All @@ -153,11 +153,11 @@ public CMGOSEvaluator(DistanceMeasure measure, double[][] points, int[] belongsT
this.probability = probability;
this.cov_sampling = cov_sampling;
this.generator = generator;
this.minimumInstancesForCluster = pointCountSmall;
this.percentage = percentage;
this.regularizedLambda = lamda;
this.red = cov;
this.h = h;
this.subsetPoints = subsetPoints;
this.numberOfSubsets = numberOfSubsets;
this.fastMCDPoints = fastMCDPoints;
this.initIteration = initIteration;
}
Expand Down Expand Up @@ -213,18 +213,9 @@ private boolean[] reassignPoints(boolean[] removed_cluster, double limit) {
public double[] evaluate() throws OperatorException {
// remove small clusters
boolean[] removed_cluster = new boolean[this.centroids.length];
if (this.minimumInstancesForCluster != -2) {
double limit = 0.0;
// use formula (rule of thumb)
if (minimumInstancesForCluster == -1) {
limit = ((1 - this.probability) * this.points.length) / (this.clusterSize.length);
}
// use user-input
else {
limit = minimumInstancesForCluster;
}
removed_cluster = this.reassignPoints(removed_cluster, limit);
}
double limit = percentage * points.length/centroids.length;
removed_cluster = this.reassignPoints(removed_cluster, limit);


int TotalNumberOfPoints = points.length;
int NumberOfCluster = this.centroids.length;
Expand Down Expand Up @@ -389,7 +380,7 @@ public double[] evaluate() throws OperatorException {
id++;
}
if (!thereisone) {
throw new OperatorException("No cluster left. This is a problem. Try not to remove small clusters or reduce number");
throw new OperatorException("No cluster left. This is a problem. Try not to remove small clusters or reduce number of clusters.");
}
S = new double[CovariancematrixPerCluster[id].getCovMat().length][CovariancematrixPerCluster[id].getCovMat()[0].length];
for (int ClusterId = 0; ClusterId < NumberOfCluster; ClusterId++) {
Expand Down Expand Up @@ -656,8 +647,8 @@ public HashMap<Double, LinkedList<CovarianceMatrix>> getMap() {
return this.retMap;
}

public void run() {
for (int id = (this.id * anz); id < ((this.id * anz) + anz); id++) {
public void run() {
for (int id = (this.id * anz); id <= ((this.id * anz) + anz); id++) {
if (map2.containsKey(id)) {
HashMap<Double, LinkedList<CovarianceMatrix>> map = map2.get(id);
for (double d : map.keySet()) {
Expand All @@ -676,7 +667,7 @@ public void run() {

// construct up to five disjoint random subsets of size nsub according
// to Section 3.3 (say, five subsets of size nsub = 300);
double anz_subset = Math.floor(data.length / this.subsetPoints);
double anz_subset = this.numberOfSubsets;
double anz_points = Math.floor(data.length / anz_subset);
boolean[] taken = new boolean[data.length];
int merge_id = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ public double[] evaluate() {
distances[i] = measure.calculateDistance(
centroids[clusterIndex], points[i]);
summationDistances[clusterIndex] += distances[i];
} else {
} else {
// It is a small cluster
double MinDistance = Double.MAX_VALUE;

Expand All @@ -151,8 +151,14 @@ public double[] evaluate() {
summationDistances[i] /= clusterSize[i];

for (int i = 0; i < n; i++) {
result[i] = distances[i]
/ summationDistances[belongsToLargeCluster[i]];
if(summationDistances[belongsToLargeCluster[i]]== 0.0) {
result[i] = 0;
}
else {
result[i] = distances[i]
/ summationDistances[belongsToLargeCluster[i]];
}

}

return result;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,8 @@ public class CMGOSAnomalyDetectionOperator extends AbstractClusteringAnomalyDete
/**
*
**/
public static final String PARAMETER_POINTS_SUBSET = "instances in a subset";
public static final String PARAMETER_POINTS_SUBSET_DESCRIPTION = "Defines the number of instances in a (random) subset used in fastMCD. Friedmann recommends to have at most 5 subsets, which menas this number should not be lower then (number of examples / 5).";
public static final String PARAMETER_NUMBER_OF_SUBSETS = "number of subsets";
public static final String PARAMETER_POINTS_SUBSET_DESCRIPTION = "Defines the number of subsets used in fastMCD. Friedmann recommends to have at most 5 subsets.";
/**
*
**/
Expand Down Expand Up @@ -125,18 +125,10 @@ public class CMGOSAnomalyDetectionOperator extends AbstractClusteringAnomalyDete
public static final String PARAMETER_PARALLELIZE_EVALUATION_PROCESS = "parallelize evaluation process";
public static final String PARAMETER_PARALLELIZE_EVALUATION_PROCESS_DESCRIPTION = "Specifies that evaluation process should be performed in parallel";
/**
* Boolean to remove small cluster
*/
public static final String PARAMETER_REMOVE_CLUSTER = "remove small clusters";
public static final String PARAMETER_REMOVE_CLUSTER_DESCRIPTION = "Too small clusters can negatively influence the result. If ticked, a rule of thumb is applied to remove clusters with less than ((1 - 'normal probability') * nbr of datapoints) / nbr of clusters. These points will be assigned to the nearest large cluster";
/**
* Number of points to define a &quot;small cluster&quot;.
*/
public static final String PARAMETER_NUMBER_POINTS_SMALL_CLUSTER = "minimum";
public static final String PARAMETER_NUMBER_POINTS_SMALL_CLUSTER_DESCRIPTION = "Minimum of instances in a cluster. Smaller clusters will be removed and instances assigned to the next large cluster.";

public static final String PARAMETER_NUMBER_POINTS_MAN = "minimum number of instances";
public static final String PARAMETER_NUMBER_POINTS_MAN_DESCRIPTION = "Set the minimum number of points for a cluster manually.";
* Parameter name for gamma: &quot;ratio between the maximum size of small
* clusters and the average cluster size&quot;. Small clusters are removed.
**/
public static String PARAMETER_GAMMA = "gamma";


public CMGOSAnomalyDetectionOperator(OperatorDescription description) {
Expand All @@ -151,13 +143,7 @@ public double[] doWork(ExampleSet exampleSet, Attributes attributes, double[][]
if (getParameterAsBoolean(PARAMETER_PARALLELIZE_EVALUATION_PROCESS))
parallel = getParameterAsInt(PARAMETER_NUMBER_OF_THREADS);

int rem_cluster_anz = -2;
if (getParameterAsBoolean(PARAMETER_REMOVE_CLUSTER))
if (getParameterAsBoolean(PARAMETER_NUMBER_POINTS_MAN))
rem_cluster_anz = getParameterAsInt(PARAMETER_NUMBER_POINTS_SMALL_CLUSTER);
else
rem_cluster_anz = -1;

double percentage = getParameterAsDouble(PARAMETER_GAMMA);

int[] belongsToCluster = getBelongsToCluster();
double[][] centroids = getCentriods();
Expand All @@ -169,7 +155,7 @@ public double[] doWork(ExampleSet exampleSet, Attributes attributes, double[][]
variancePoints = getParameterAsInt(PARAMETER_NUMBER_COVARIANCE_POINTS+"_");

RandomGenerator generator = RandomGenerator.getRandomGenerator(this);
CMGOSEvaluator evaluator = new CMGOSEvaluator(measure, points, belongsToCluster, centroids, clusterSize, parallel, getParameterAsInt(PARAMETER_NUMBER_OF_REMOVE), getParameterAsDouble(PARAMETER_OUTLIER_PROBABILITY), variancePoints, generator, rem_cluster_anz, getParameterAsDouble(PARAMETER_LAMBDA), getParameterAsInt(PARAMETER_COVARIANCE), getParameterAsInt(PARAMETER_H), getParameterAsInt(PARAMETER_POINTS_SUBSET), getParameterAsInt(PARAMETER_FMCD), getParameterAsInt(PARAMETER_RUN));
CMGOSEvaluator evaluator = new CMGOSEvaluator(measure, points, belongsToCluster, centroids, clusterSize, parallel, getParameterAsInt(PARAMETER_NUMBER_OF_REMOVE), getParameterAsDouble(PARAMETER_OUTLIER_PROBABILITY), variancePoints, generator, percentage, getParameterAsDouble(PARAMETER_LAMBDA), getParameterAsInt(PARAMETER_COVARIANCE), getParameterAsInt(PARAMETER_H), getParameterAsInt(PARAMETER_NUMBER_OF_SUBSETS), getParameterAsInt(PARAMETER_FMCD), getParameterAsInt(PARAMETER_RUN));

double[] e = evaluator.evaluate();
return e;
Expand All @@ -180,26 +166,21 @@ public List<ParameterType> getParameterTypes() {
List<ParameterType> types = new LinkedList<ParameterType>();

types.add(new ParameterTypeDouble(PARAMETER_OUTLIER_PROBABILITY, PARAMETER_OUTLIER_PROBABILITY_DESCRIPTION, 0, 1.0, 0.975, false));
types.add(new ParameterTypeDouble(PARAMETER_GAMMA,"Ratio between the maximum size of small clusters and the average cluster size. Small" +
"clusters are removed.",
0, 1, 0.1));

types.add(new ParameterTypeBoolean(PARAMETER_REMOVE_CLUSTER, PARAMETER_REMOVE_CLUSTER_DESCRIPTION, false, false));
ParameterTypeBoolean type3 = (new ParameterTypeBoolean(PARAMETER_NUMBER_POINTS_MAN, PARAMETER_NUMBER_POINTS_MAN_DESCRIPTION, false, false));
type3.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_REMOVE_CLUSTER, true, true));
types.add(type3);

ParameterTypeInt type = (new ParameterTypeInt(PARAMETER_NUMBER_POINTS_SMALL_CLUSTER, PARAMETER_NUMBER_POINTS_SMALL_CLUSTER_DESCRIPTION, 0, Integer.MAX_VALUE, 0, false));
type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_NUMBER_POINTS_MAN, true, true));
types.add(type);

types.add(new ParameterTypeCategory(PARAMETER_COVARIANCE, PARAMETER_COVARIANCE_DESCRIPTION, COV, 0, false));

ParameterTypeInt type2 = new ParameterTypeInt(PARAMETER_NUMBER_OF_REMOVE, PARAMETER_NUMBER_OF_REMOVE_DESCRIPTION, 0, Integer.MAX_VALUE, 1, false);
type2.registerDependencyCondition(new EqualTypeCondition(getParameterHandler(), PARAMETER_COVARIANCE, COV, false, METHOD_COV_REDUCTION, METHOD_COV_REGULARIZE));
types.add(type2);

type3 = (new ParameterTypeBoolean(PARAMETER_LIMIT_COVARIANCE_POINTS, PARAMETER_LIMIT_COVARIANCE_POINTS_DESCRIPTION, false, false));
ParameterTypeBoolean type3 = (new ParameterTypeBoolean(PARAMETER_LIMIT_COVARIANCE_POINTS, PARAMETER_LIMIT_COVARIANCE_POINTS_DESCRIPTION, false, false));
type3.registerDependencyCondition(new EqualTypeCondition(getParameterHandler(), PARAMETER_COVARIANCE, COV, false, 1));
types.add(type3);
type = (new ParameterTypeInt(PARAMETER_NUMBER_COVARIANCE_POINTS, PARAMETER_NUMBER_COVARIANCE_POINTS_DESCRIPTION, 1, Integer.MAX_VALUE, 1000, false));
ParameterTypeInt type = (new ParameterTypeInt(PARAMETER_NUMBER_COVARIANCE_POINTS, PARAMETER_NUMBER_COVARIANCE_POINTS_DESCRIPTION, 1, Integer.MAX_VALUE, 1000, false));
type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_LIMIT_COVARIANCE_POINTS, true, true));
types.add(type);
type3 = (new ParameterTypeBoolean(PARAMETER_LIMIT_COVARIANCE_POINTS+"_", PARAMETER_LIMIT_COVARIANCE_POINTS_DESCRIPTION, false, false));
Expand All @@ -217,7 +198,7 @@ public List<ParameterType> getParameterTypes() {
type = (new ParameterTypeInt(PARAMETER_FMCD, PARAMETER_FMCD_DESCRIPTION, 0, Integer.MAX_VALUE, 600, false));
type.registerDependencyCondition(new EqualTypeCondition(getParameterHandler(), PARAMETER_COVARIANCE, COV, false, METHOD_COV_MCD));
types.add(type);
type = (new ParameterTypeInt(PARAMETER_POINTS_SUBSET, PARAMETER_POINTS_SUBSET_DESCRIPTION, 0, Integer.MAX_VALUE, 300, false));
type = (new ParameterTypeInt(PARAMETER_NUMBER_OF_SUBSETS, PARAMETER_POINTS_SUBSET_DESCRIPTION, 0, Integer.MAX_VALUE, 5, false));
type.registerDependencyCondition(new EqualTypeCondition(getParameterHandler(), PARAMETER_COVARIANCE, COV, false, METHOD_COV_MCD));
types.add(type);

Expand Down

0 comments on commit 43477e4

Please sign in to comment.