SMT - KDD 2016
Francois Petitjean committed Jun 6, 2016
1 parent 5bb46f1 commit 0757de3
src/explorer/ChordalysisModellingSMT.java (190 additions, 0 deletions)
@@ -0,0 +1,190 @@
/*******************************************************************************
* Copyright (C) 2016 Francois Petitjean
*
* This file is part of Chordalysis.
*
* Chordalysis is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* Chordalysis is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Chordalysis. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package explorer;

import java.io.IOException;
import java.util.ArrayList;

import lattice.Lattice;
import model.DecomposableModel;
import model.GraphAction;
import model.ScoredGraphAction;
import stats.EntropyComputer;
import stats.MyPriorityQueue;
import stats.scorer.GraphActionScorer;
import stats.scorer.GraphActionScorerPValue;
import weka.core.Instances;
import weka.core.converters.ArffLoader.ArffReader;

/**
 * This class searches for a statistically significant decomposable model to
 * explain a dataset, using Prioritized Chordalysis. It uses the Stepwise
 * Multiple Testing correction, accepted for publication at KDD 2016.
 * See paper "A multiple test correction for streams and cascades of statistical hypothesis tests," KDD 2016
 * See paper "Scaling log-linear analysis to high-dimensional data," ICDM 2013
 * See paper "Scaling log-linear analysis to datasets with thousands of variables," SDM 2015
 * @see http://www.francois-petitjean.com/Research/
 */
public class ChordalysisModellingSMT {

    int nbInstances;
    double pValueThreshold;
    DecomposableModel bestModel;
    EntropyComputer entropyComputer;
    protected Lattice lattice;
    Instances dataset;
    ArrayList<GraphAction> operationsPerformed;
    MyPriorityQueue pq;
    GraphActionScorer scorer;

    boolean hasMissingValues = true;

    public void setHasMissingValues(boolean hasMissingValues) {
        this.hasMissingValues = hasMissingValues;
    }

    int maxNSteps = Integer.MAX_VALUE;

    /**
     * Caps the number of edges that can be added to the model.
     */
    public void setMaxNSteps(int nSteps) {
        this.maxNSteps = nSteps;
    }

    /**
     * Default constructor
     *
     * @param pValueThreshold
     *            significance level of the statistical tests (commonly 0.05)
     */
    public ChordalysisModellingSMT(double pValueThreshold) {
        this.pValueThreshold = pValueThreshold;
        this.operationsPerformed = new ArrayList<GraphAction>();
    }

    /**
     * Launches the modelling.
     *
     * @param dataset
     *            the dataset on which the analysis is performed
     */
    public void buildModel(Instances dataset) {
        buildModelNoExplore(dataset);
        this.explore();
    }

    public int getNbInstances() {
        return nbInstances;
    }

    public void buildModelNoExplore(Instances dataset) {
        this.nbInstances = dataset.numInstances();
        this.dataset = dataset;
        int[] variables = new int[dataset.numAttributes()];
        int[] nbValuesForAttribute = new int[variables.length];
        for (int i = 0; i < variables.length; i++) {
            variables[i] = i;
            if (hasMissingValues) {
                // reserve an extra value per attribute to encode "missing"
                nbValuesForAttribute[i] = dataset.attribute(i).numValues() + 1;
            } else {
                nbValuesForAttribute[i] = dataset.attribute(i).numValues();
            }
        }
        this.lattice = new Lattice(dataset, hasMissingValues);
        this.entropyComputer = new EntropyComputer(dataset.numInstances(), this.lattice);
        this.scorer = new GraphActionScorerPValue(nbInstances, entropyComputer);
        this.bestModel = new DecomposableModel(variables, nbValuesForAttribute);
        this.pq = new MyPriorityQueue(variables.length, bestModel, scorer);
        // start from the empty graph: every pairwise edge is a candidate
        for (int i = 0; i < variables.length; i++) {
            for (int j = i + 1; j < variables.length; j++) {
                pq.enableEdge(i, j);
            }
        }
    }

    /**
     * Launches the modelling, streaming the instances through an ArffReader.
     *
     * @param dataset
     *            the structure (header) of the dataset on which the analysis is performed
     * @param loader
     *            the ArffReader from which the instances are read
     * @throws IOException
     *             if the instances cannot be read from the loader
     */
    public void buildModel(Instances dataset, ArffReader loader) throws IOException {
        buildModelNoExplore(dataset, loader);
        this.explore();
    }

    public void buildModelNoExplore(Instances dataset, ArffReader loader) throws IOException {
        this.dataset = dataset;
        int[] variables = new int[dataset.numAttributes()];
        int[] nbValuesForAttribute = new int[variables.length];
        for (int i = 0; i < variables.length; i++) {
            variables[i] = i;
            nbValuesForAttribute[i] = dataset.attribute(i).numValues();
        }
        this.lattice = new Lattice(dataset, loader);
        this.nbInstances = this.lattice.getNbInstances();

        this.entropyComputer = new EntropyComputer(nbInstances, this.lattice);
        this.scorer = new GraphActionScorerPValue(nbInstances, entropyComputer);
        this.bestModel = new DecomposableModel(variables, nbValuesForAttribute);
        this.pq = new MyPriorityQueue(variables.length, bestModel, scorer);
        // start from the empty graph: every pairwise edge is a candidate
        for (int i = 0; i < variables.length; i++) {
            for (int j = i + 1; j < variables.length; j++) {
                pq.enableEdge(i, j);
            }
        }
    }

    /**
     * @return the DecomposableModel that has been built
     */
    public DecomposableModel getModel() {
        return bestModel;
    }

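    /*
     * Illustration of the budget arithmetic in explore() below (the numbers
     * are made up): with pValueThreshold = 0.05 and 10 candidate edges in the
     * queue, the first corrected threshold is 0.05 / 10 = 0.005. If the best
     * edge has p-value 0.001, it is accepted, 0.001 * 10 = 0.01 of the budget
     * is spent, and the remaining 0.04 is spread over the tests of the next
     * step.
     */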
    public void explore() {
        pq.processStoredModifications();
        double remainingBudget = pValueThreshold;
        int step = 0;
        while (!pq.isEmpty() && step < maxNSteps) {
            int nTests = pq.size();

            // Stepwise Multiple Testing: spread the remaining budget evenly
            // over the tests that can be performed at this step
            double correctedPValueThreshold = remainingBudget / nTests;

            ScoredGraphAction todo = pq.poll();

            if (todo.getScore() > correctedPValueThreshold) {
                // the best candidate is not significant, so no other one can be
                break;
            }
            // deduct the fraction of the budget consumed by this step
            double usedBudget = todo.getScore() * nTests;
            remainingBudget -= usedBudget;
            operationsPerformed.add(todo);
            bestModel.performAction(todo, bestModel, pq);
            step++;
        }
    }

    public Lattice getLattice() {
        return lattice;
    }

}
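
For reference, a minimal usage sketch of the class added by this commit (not part of the commit itself; the class name Example and the file name data.arff are illustrative). It runs the SMT modelling at the 0.05 significance level, either in memory or streaming the instances through Weka's ArffReader, then retrieves the resulting model:

import java.io.BufferedReader;
import java.io.FileReader;

import explorer.ChordalysisModellingSMT;
import model.DecomposableModel;
import weka.core.Instances;
import weka.core.converters.ArffLoader.ArffReader;

public class Example {
    public static void main(String[] args) throws Exception {
        // in-memory variant: load the whole ARFF file into Instances
        Instances data = new Instances(new BufferedReader(new FileReader("data.arff")));
        ChordalysisModellingSMT modeller = new ChordalysisModellingSMT(0.05);
        modeller.buildModel(data);
        DecomposableModel model = modeller.getModel();

        // streaming variant: read only the header up front and let the
        // lattice pull the instances from the loader
        ArffReader loader = new ArffReader(new BufferedReader(new FileReader("data.arff")), 1000);
        ChordalysisModellingSMT streamingModeller = new ChordalysisModellingSMT(0.05);
        streamingModeller.buildModel(loader.getStructure(), loader);
        DecomposableModel streamedModel = streamingModeller.getModel();
    }
}

In the streaming variant, the two-argument buildModel overload hands the loader to the Lattice, which reads the instances from it rather than from an in-memory dataset.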
