diff --git a/build.properties b/build.properties index c7cba6e..41aa0f6 100644 --- a/build.properties +++ b/build.properties @@ -1,3 +1,3 @@ extension.version=2 extension.revision=1 -extension.update=000 +extension.update=001 diff --git a/resources/com/rapidminer/resources/i18n/OperatorsDocAnomalyDetection.xml b/resources/com/rapidminer/resources/i18n/OperatorsDocAnomalyDetection.xml index 3ed37de..457fd24 100644 --- a/resources/com/rapidminer/resources/i18n/OperatorsDocAnomalyDetection.xml +++ b/resources/com/rapidminer/resources/i18n/OperatorsDocAnomalyDetection.xml @@ -289,11 +289,15 @@ column in the Example Set. There are two modes, one with a static and one with a dynamic bandwidth. In the static mode every bin has the same binwidth equally distributed over the value range. In the - dynamic mode the bindwidth can vary, but you can specify a minimum - number of examples contained in a bin. The default values for either - the number of bins or the minimum number of examples per bin is the - square root of the number of total examples (column properties set - to -1). To compute the outlier + dynamic mode the binwidth can vary, but you can specify a minimum + number of examples contained in a bin. The parameter number of bins sets + the total number of bins used for either mode. The binwidth / minimum number + of values per bin is then calculated automatically. + In the dynamic mode it is possible that there are fewer bins than specified if + some bins contain more than the minimum number of values. + The default value for + the number of bins is the square root of the number of total examples + (number of bins set to -1). To compute the outlier score, the histograms are normalized to one in height first. Then, the score is inverted, so that anomalies have a high score and normal examples a low score. 
It is also possible to apply a diff --git a/src/de/dfki/madm/anomalydetection/evaluator/statistical_based/HistogramEvaluator.java b/src/de/dfki/madm/anomalydetection/evaluator/statistical_based/HistogramEvaluator.java index c882961..47bae52 100644 --- a/src/de/dfki/madm/anomalydetection/evaluator/statistical_based/HistogramEvaluator.java +++ b/src/de/dfki/madm/anomalydetection/evaluator/statistical_based/HistogramEvaluator.java @@ -43,7 +43,8 @@ public class HistogramEvaluator { public HistogramEvaluator(Operator logger) { this.logger = logger; } - + static int asdf = 0; + private ArrayList[] histogram; @SuppressWarnings("unchecked") public ExampleSet evaluate (ExampleSet exampleSet, boolean log_scale, boolean ranked, HashMap bin_info_help, HashMap mode_help) { @@ -87,7 +88,7 @@ public ExampleSet evaluate (ExampleSet exampleSet, boolean log_scale, boolean ra // initialize histogram, one histogram for every dimension // list of bins for every histogram - ArrayList[] histogram = new ArrayList[number_of_features]; + histogram = new ArrayList[number_of_features]; for(int i = 0; i < number_of_features; i++) { histogram[i] = new ArrayList(); } @@ -118,13 +119,27 @@ public ExampleSet evaluate (ExampleSet exampleSet, boolean log_scale, boolean ra // create histograms for (int i = 0; i < number_of_features; i++) { + int last = 0; + double bin_start = data[0][i]; if(mode[i].equals("dynamic binwidth")){ // For nominal values every value gets its own bin. 
Rapidminer handels nominal values as intergers => binwidth 1 if (nominal[i]) { - createDynamicHistogram(histogram,data,0,1,i,true); + while(last 1) { + length = length - histogram[i].get(histogram[i].size()-1).get_quantity(); + binwidth = binwidth -1; + values_per_bin = (int) Math.floor(length/binwidth); + } + } } } else { @@ -132,7 +147,10 @@ public ExampleSet evaluate (ExampleSet exampleSet, boolean log_scale, boolean ra if(nominal[i] || binwidth == 0) { binwidth = 1.0; } - createStaticHistogram(histogram,data,0,binwidth,i,data[0][i]); + while(last[] histogram_array, double[][] data, int first, int n, int feature, boolean nominal) { + + public static int createDynamicHistogram(ArrayList[] histogram_array, double[][] data, int first, int n, int feature, boolean nominal) { + int last = first; int end = 0; // create new bin @@ -327,9 +347,12 @@ else if (histogram_array[feature].size() == 0) { /* * if end of that file isn't reached start over with the last unused value as first value */ + return last+1; + /*System.out.println(asdf); if(last < data.length-1) { + asdf++; createDynamicHistogram(histogram_array,data,last+1,n,feature,nominal); - } + }*/ } /** Create histogram with static binWidth * @param histogram_array @@ -339,7 +362,7 @@ else if (histogram_array[feature].size() == 0) { * @param feature * @param binStart */ - public static void createStaticHistogram(ArrayList[] histogram_array, double[][] data, int first, double binWidth, int feature, double binStart){ + public static int createStaticHistogram(ArrayList[] histogram_array, double[][] data, int first, double binWidth, int feature, double binStart){ HistogramBin bin = new HistogramBin(binStart,binStart+binWidth,0,0); int last = first-1; for(int i = first; i < data.length&&data[i][feature] <= bin.get_range_to(); i++) { @@ -347,9 +370,10 @@ public static void createStaticHistogram(ArrayList[] histogram_arr last = i; } histogram_array[feature].add(bin); - if(last < data.length - 1) { + return last+1; + 
/*if(last < data.length - 1) { createStaticHistogram(histogram_array,data,last+1,binWidth,feature,binStart+binWidth); - } + }*/ } /** Sort the rows of an multidimensional array independently. diff --git a/src/de/dfki/madm/anomalydetection/operator/statistical_based/HistogramOperator.java b/src/de/dfki/madm/anomalydetection/operator/statistical_based/HistogramOperator.java index ee17503..2f71950 100644 --- a/src/de/dfki/madm/anomalydetection/operator/statistical_based/HistogramOperator.java +++ b/src/de/dfki/madm/anomalydetection/operator/statistical_based/HistogramOperator.java @@ -76,7 +76,7 @@ public class HistogramOperator extends Operator { private static final String PARAMETER_PROPERTIES_LIST = "histogram properties"; private static String[] CONDITION_NAMES = new String[] { "all", "single"}; private static final String PARAMETER_FILTER_TYPE = "parameter mode"; - private static final String PARAMETER_BIN_INFO ="bin_info"; + private static final String PARAMETER_BIN_INFO ="number of bins"; private static final String PARAMETER_MODE="select mode"; private static final String PARAMETER_COLUMN_PROPERTIES = "column properties"; private static final String PARAMETER_ATTRIBUTE_NAME = "attribute name"; @@ -221,10 +221,13 @@ public List getParameterTypes() { String[] mode = new String[2]; mode[0] = "fixed binwidth"; mode[1] = "dynamic binwidth"; - ParameterTypeString type_int= new ParameterTypeString(PARAMETER_BIN_INFO,"Specifies how many bins or how many values per bins are used. Set to -1 for default value (sqrt(N)).","-1"); + ParameterTypeString type_int= new ParameterTypeString(PARAMETER_BIN_INFO,"Specifies the number of bins. " + + "When using static binwidth the binwidth is set to (range of values)/(number of bins)."+ + "When using dynamic binwidth the minimum number of bins is set to (number of examples)/(number of bins)." + + "In this case it is possible that there are less bins than specified if some bins contain more than the minimum number of values. 
Set to -1 for default value (sqrt(N)).","-1"); ParameterTypeStringCategory type_category = new ParameterTypeStringCategory(PARAMETER_MODE,"Select dynamic or fixed binwidth mode",mode,"fixed binwidth"); type_category.setEditable(false); - ParameterTypeList typeList = new ParameterTypeList(PARAMETER_PROPERTIES_LIST, "properties for every column - select mode and number of bins/number of values per bin for every column (set binwidth to -1 for default value or to nominal for categorical data)", + ParameterTypeList typeList = new ParameterTypeList(PARAMETER_PROPERTIES_LIST, "properties for every column - select mode and number of bins for every column (set binwidth to -1 for default value or to nominal for categorical data)", new ParameterTypeAttribute(PARAMETER_ATTRIBUTE_NAME, "The index of the column whose properties should be changed.",getExampleSetInputPort()), new ParameterTypeTupel(PARAMETER_COLUMN_PROPERTIES, "properties", type_category,