Evaluation completed and UI implemented. Commenting still to do

fionamac89 · Mar 27, 2014 · e71a432 · e71a432
1 parent 023ce51
commit e71a432
Show file tree

Hide file tree

Showing 16 changed files with 509 additions and 653 deletions.
diff --git a/.classpath b/.classpath
@@ -2,10 +2,10 @@
 <classpath>
 	<classpathentry kind="src" path="src"/>
 	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
-	<classpathentry kind="lib" path="/Users/Fiona/Documents/workspace/Project/lib/mysql-connector-java-5.1.28-bin.jar"/>
-	<classpathentry kind="lib" path="/Users/Fiona/Documents/workspace/Project/lib/json-simple-1.1.1.jar"/>
-	<classpathentry kind="lib" path="/Users/Fiona/Documents/workspace/Project/lib/lucene-analyzers-common-4.6.1.jar"/>
-	<classpathentry kind="lib" path="/Users/Fiona/Documents/workspace/Project/lib/lucene-core-4.6.1.jar"/>
 	<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
+	<classpathentry kind="lib" path="lib/json-simple-1.1.1.jar"/>
+	<classpathentry kind="lib" path="lib/lucene-analyzers-common-4.6.1.jar"/>
+	<classpathentry kind="lib" path="lib/lucene-core-4.6.1.jar"/>
+	<classpathentry kind="lib" path="lib/mysql-connector-java-5.1.28-bin.jar"/>
 	<classpathentry kind="output" path="bin"/>
 </classpath>
diff --git a/stopwords.txt → lib/stopwords.txt b/stopwords.txt → lib/stopwords.txt
diff --git a/src/classifier/Classifier.java b/src/classifier/Classifier.java
@@ -1,12 +1,21 @@
 package classifier;
-
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
 import com.datumbox.opensource.classifiers.NaiveBayes;
 import com.datumbox.opensource.dataobjects.NaiveBayesKnowledgeBase;
-
+/**
+ * The purpose of this class is to interface with the Datumbox Naive Bayes classification packages
+ * and to make the necessary functionality available to the main system that has been developed.
+ * 
+ * This class allows for the creation of a 'dataset' in the format used by the Datumbox classifier,
+ * creation and training of the classifier as well as performing classification and the ability to 
+ * return the classified data to the main system for entry into the database.
+ * 
+ * @author Fiona MacIsaac
+ *
+ */
 public class Classifier {
 
 	private NaiveBayes nb = null;
@@ -16,75 +25,91 @@ public class Classifier {
 
 	public Classifier() {
 		nb = new NaiveBayes();
-		trainingDataset = new HashMap<>();
-		classified = new HashMap<>();
+		trainingDataset = new HashMap<String, String[]>();
+		classified = new HashMap<Integer, Integer>();
 	}
 
-	//Method to take genre words and convert them into string array.
+	/**
+	 * Taken from com.datumbox.opensource.features TextTokenizer.java
+	 * 
+	 * Converts the given list of strings into a string array for use by
+	 * the classifier.
+	 * 
+	 * @param words the list of strings to convert
+	 * @return an array holding the elements of {@code words} in list order
+	 */
 	public String[] readLines(List<String> words) {
 		return words.toArray(new String[words.size()]);
 	}
-
-
-	/*
-	 * Process
+
+	/**
+	 * Adds the appropriate string array of words to the dataset map
+	 * with the respective genre as the key.
+	 * 
+	 * @param genre
+	 * @param words
 	 */
-
-	//Create Datasets & Load examples into memory
-		/*
-		 * -Get list of genres
-		 * -For each genre, get all words from thesaurus and convert these to string array
-		 * -Put all this info into map<String, String[]>
-		 */
-
 	public void addToDataset(String genre, String[] words) {
 		trainingDataset.put(genre, words);
 	}
 
-
-	//Train classifier (feature selection?)
-		/* NaiveBayes nb = new NaiveBayes();
-         * nb.setChisquareCriticalValue(6.63); //May need to alter NB class to remove this feature selection being used for now
-         * nb.train(trainingExamples);
-         */
+	/**
+	 * Used to train the classifier using the dataset.
+	 */
 	public void trainClassifier() {
 		nb.train(trainingDataset);
 	}
-	//Get the knowledge base for the trained classifier
-		/*
-		 * NaiveBayesKnowledgeBase knowledgeBase = nb.getKnowledgeBase();
-		 */
+
+	/**
+	 * Stores the knowledge base that the classifier created during training.
+	 */
 	public void setKnowledgeBase() {
 		knowledgeBase = nb.getKnowledgeBase();
 	}
-	//Reset classifier and training set
-		/*
-		 * nb = null;
-		 * trainingExamples = null;
-		 */
+
+	/**
+	 * Discards the classifier instance and the training set. Call
+	 * prepClassifier() afterwards so the classifier can be used on test data.
+	 */
 	public void resetClassifier() {
 		 nb = null;
 		 trainingDataset = null;
 	}
 
-
-	//Use the classifier by giving it the trained knowledge base
-		/*
-		 * nb = new NaiveBayes(knowledgeBase);
-		 * String output = nb.predict(String)
-		 */
+	/**
+	 * Sets up the classifier with the trained knowledge base.
+	 */
 	public void prepClassifier() {
 		nb = new NaiveBayes(knowledgeBase);
 	}
 
+	/**
+	 * Predicts the genre of the given film overview.
+	 * 
+	 * @param overview the overview text to classify
+	 * @return the prediction returned by the classifier for the overview
+	 */
 	public String classifyData(String overview) {
 		return nb.predict(overview);	
 	}
 
+	/**
+	 * For each film overview that has been classified, save the filmid
+	 * and genreid in a map so the data can be entered into the relevant
+	 * table in the database.
+	 * 
+	 * @param filmid the id of the film whose overview was classified
+	 * @param genreid the id of the genre assigned to that film
+	 */
 	public void setClassified(int filmid, int genreid) {
 		classified.put(filmid, genreid);
 	}
 
+	/**
+	 * Returns the classified data.
+	 * 
+	 * @return the map of film id to genre id populated by setClassified
+	 */
 	public Map<Integer, Integer> getClassifiedData() {
 		return classified;
 	}

diff --git a/src/com/datumbox/opensource/classifiers/NaiveBayes.java b/src/com/datumbox/opensource/classifiers/NaiveBayes.java
@@ -35,7 +35,7 @@
  * @see <a href="http://blog.datumbox.com/developing-a-naive-bayes-text-classifier-in-java/">http://blog.datumbox.com/developing-a-naive-bayes-text-classifier-in-java/</a>
  */
 public class NaiveBayes {
-    private double chisquareCriticalValue = 10.83; //equivalent to pvalue 0.001. It is used by feature selection algorithm
+    //private double chisquareCriticalValue = 10.83; //equivalent to pvalue 0.001. It is used by feature selection algorithm
 
     private NaiveBayesKnowledgeBase knowledgeBase;
 
@@ -64,24 +64,6 @@ public NaiveBayesKnowledgeBase getKnowledgeBase() {
         return knowledgeBase;
     }
 
-    /**
-     * Gets the chisquareCriticalValue paramter.
-     * 
-     * @return 
-     */
-    public double getChisquareCriticalValue() {
-        return chisquareCriticalValue;
-    }
-
-    /**
-     * Sets the chisquareCriticalValue parameter.
-     * 
-     * @param chisquareCriticalValue 
-     */
-    public void setChisquareCriticalValue(double chisquareCriticalValue) {
-        this.chisquareCriticalValue = chisquareCriticalValue;
-    }
-
     /**
      * Preprocesses the original dataset and converts it to a List of Documents.
      * 
@@ -133,20 +115,6 @@ private FeatureStats selectFeatures(List<Document> dataset) {
         //the FeatureStats object contains statistics about all the features found in the documents
         FeatureStats stats = featureExtractor.extractFeatureStats(dataset); //extract the stats of the dataset
 
-        //we pass this information to the feature selection algorithm and we get a list with the selected features
-       /*Map<String, Double> selectedFeatures = featureExtractor.chisquare(stats, chisquareCriticalValue);
-        
-        //clip from the stats all the features that are not selected
-        Iterator<Map.Entry<String, Map<String, Integer>>> it = stats.featureCategoryJointCount.entrySet().iterator();
-        while(it.hasNext()) {
-            String feature = it.next().getKey();
-        
-            if(selectedFeatures.containsKey(feature)==false) {
-                //if the feature is not in the selectedFeatures list remove it
-                it.remove();
-            }
-        }*/
-
         return stats;
     }
 

diff --git a/src/com/datumbox/opensource/examples/NaiveBayesExample.java b/src/com/datumbox/opensource/examples/NaiveBayesExample.java
diff --git a/src/com/datumbox/opensource/features/FeatureExtraction.java b/src/com/datumbox/opensource/features/FeatureExtraction.java
@@ -83,60 +83,5 @@ public FeatureStats extractFeatureStats(List<Document> dataset) {
 
         return stats;
     }
-
-    /**
-     * Perform feature selection by using the chisquare non-parametrical 
-     * statistical test.
-     * 
-     * @param stats
-     * @param criticalLevel
-     * @return 
-     */
-  /*  public Map<String, Double> chisquare(FeatureStats stats, double criticalLevel) {
-        Map<String, Double> selectedFeatures = new HashMap<>();
-        
-        String feature;
-        String category;
-        Map<String, Integer> categoryList;
-        
-        int N1dot, N0dot, N00, N01, N10, N11;
-        double chisquareScore;
-        Double previousScore;
-        for(Map.Entry<String, Map<String, Integer>> entry1 : stats.featureCategoryJointCount.entrySet()) {
-            feature = entry1.getKey();
-            categoryList = entry1.getValue();
-            
-            //calculate the N1. (number of documents that have the feature)
-            N1dot = 0;
-            for(Integer count : categoryList.values()) {
-                N1dot+=count;
-            }
-            
-            //also the N0. (number of documents that DONT have the feature)
-            N0dot = stats.n - N1dot;
-            
-            for(Map.Entry<String, Integer> entry2 : categoryList.entrySet()) {
-                category = entry2.getKey();
-                N11 = entry2.getValue(); //N11 is the number of documents that have the feature and belong on the specific category
-                N01 = stats.categoryCounts.get(category)-N11; //N01 is the total number of documents that do not have the particular feature BUT they belong to the specific category
-                
-                N00 = N0dot - N01; //N00 counts the number of documents that don't have the feature and don't belong to the specific category
-                N10 = N1dot - N11; //N10 counts the number of documents that have the feature and don't belong to the specific category
-                
-                //calculate the chisquare score based on the above statistics
-                chisquareScore = stats.n*Math.pow(N11*N00-N10*N01, 2)/((N11+N01)*(N11+N10)*(N10+N00)*(N01+N00));
-                
-                //if the score is larger than the critical value then add it in the list
-                if(chisquareScore>=criticalLevel) {
-                    previousScore = selectedFeatures.get(feature);
-                    if(previousScore==null || chisquareScore>previousScore) {
-                        selectedFeatures.put(feature, chisquareScore);
-                    }
-                }
-            }
-        }
-        
-        return selectedFeatures;
-    }*/
 } 
-
+