Skip to content

Commit

Permalink
Evaluation completed and UI implemented. Commenting still to do
Browse files Browse the repository at this point in the history
  • Loading branch information
Fiona MacIsaac committed Mar 27, 2014
1 parent 023ce51 commit e71a432
Show file tree
Hide file tree
Showing 16 changed files with 509 additions and 653 deletions.
8 changes: 4 additions & 4 deletions .classpath
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
<classpath>
<classpathentry kind="src" path="src"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry kind="lib" path="/Users/Fiona/Documents/workspace/Project/lib/mysql-connector-java-5.1.28-bin.jar"/>
<classpathentry kind="lib" path="/Users/Fiona/Documents/workspace/Project/lib/json-simple-1.1.1.jar"/>
<classpathentry kind="lib" path="/Users/Fiona/Documents/workspace/Project/lib/lucene-analyzers-common-4.6.1.jar"/>
<classpathentry kind="lib" path="/Users/Fiona/Documents/workspace/Project/lib/lucene-core-4.6.1.jar"/>
<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
<classpathentry kind="lib" path="lib/json-simple-1.1.1.jar"/>
<classpathentry kind="lib" path="lib/lucene-analyzers-common-4.6.1.jar"/>
<classpathentry kind="lib" path="lib/lucene-core-4.6.1.jar"/>
<classpathentry kind="lib" path="lib/mysql-connector-java-5.1.28-bin.jar"/>
<classpathentry kind="output" path="bin"/>
</classpath>
File renamed without changes.
101 changes: 63 additions & 38 deletions src/classifier/Classifier.java
Original file line number Diff line number Diff line change
@@ -1,12 +1,21 @@
package classifier;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.datumbox.opensource.classifiers.NaiveBayes;
import com.datumbox.opensource.dataobjects.NaiveBayesKnowledgeBase;

/**
* The purpose of this class is to interface with the Datumbox Naive Bayes classification packages
* and to make the necessary functionality available to the main system that has been developed.
*
* This class allows for the creation of a 'dataset' in the format used by the Datumbox classifier,
* creation and training of the classifier as well as performing classification and the ability to
* return the classified data to the main system for entry into the database.
*
* @author Fiona MacIsaac
*
*/
public class Classifier {

private NaiveBayes nb = null;
Expand All @@ -16,75 +25,91 @@ public class Classifier {

/**
 * Creates the classifier wrapper with a new {@code NaiveBayes} instance and
 * empty maps for the training dataset and the classification results.
 */
public Classifier() {
    nb = new NaiveBayes();
    // Each map is allocated exactly once; the earlier duplicate allocations
    // were immediately overwritten and therefore dead.
    trainingDataset = new HashMap<String, String[]>();
    classified = new HashMap<Integer, Integer>();
}

//Method to take genre words and convert them into string array.
/**
 * Copies a list of words into a newly allocated string array.
 *
 * Per the original author's note, this is taken from
 * com.datumbox.opensource.features TextTokenizer.java.
 *
 * @param words the words to copy; a {@code null} list causes a NullPointerException
 * @return an array containing every element of {@code words}, in list order
 */
public String[] readLines(List<String> words) {
    final String[] asArray = new String[words.size()];
    return words.toArray(asArray);
}


/*
* Process

/**
* Adds the appropriate string array of words to the dataset map
* with the respective genre as the key.
*
* @param genre
* @param words
*/

//Create Datasets & Load examples into memory
/*
* -Get list of genres
* -For each genre, get all words from thesaurus and convert these to string array
* -Put all this info into map<String, String[]>
*/

/**
 * Stores the words for one genre in the training dataset, keyed by genre.
 * An existing entry for the same genre is replaced.
 *
 * @param genre the genre used as the map key
 * @param words the words associated with that genre
 */
public void addToDataset(String genre, String[] words) {
    this.trainingDataset.put(genre, words);
}


//Train classifier (feature selection?)
/* NaiveBayes nb = new NaiveBayes();
* nb.setChisquareCriticalValue(6.63); //May need to alter NB class to remove this feature selection being used for now
* nb.train(trainingExamples);
*/
/**
 * Trains the wrapped Naive Bayes classifier on everything currently held
 * in the training dataset.
 */
public void trainClassifier() {
    this.nb.train(this.trainingDataset);
}
//Get the knowledge base for the trained classifier
/*
* NaiveBayesKnowledgeBase knowledgeBase = nb.getKnowledgeBase();
*/

/**
 * Fetches the knowledge base from the wrapped classifier and keeps a
 * reference to it in this object, so it survives {@code resetClassifier()}.
 */
public void setKnowledgeBase() {
    this.knowledgeBase = this.nb.getKnowledgeBase();
}
//Reset classifier and training set
/*
* nb = null;
* trainingExamples = null;
*/

/**
 * Drops the references to the training dataset and to the classifier.
 *
 * After this call the classifier field is {@code null}, so
 * {@code prepClassifier()} has to be invoked before {@code classifyData}
 * can be used again.
 */
public void resetClassifier() {
    this.trainingDataset = null;
    this.nb = null;
}


//Use the classifier by giving it the trained knowledge base
/*
* nb = new NaiveBayes(knowledgeBase);
* String output = nb.predict(String)
*/
/**
 * Replaces the wrapped classifier with a new {@code NaiveBayes} built from
 * the stored knowledge base.
 */
public void prepClassifier() {
    this.nb = new NaiveBayes(this.knowledgeBase);
}

/**
 * Asks the wrapped classifier to predict a genre for a film overview.
 *
 * @param overview the overview text to classify
 * @return the value returned by the classifier's {@code predict} call
 */
public String classifyData(String overview) {
    final String predictedGenre = nb.predict(overview);
    return predictedGenre;
}

/**
 * Records the genre assigned to a film so the pair can later be written
 * to the database. A later call with the same film id overwrites the
 * earlier genre.
 *
 * @param filmid  id of the classified film (map key)
 * @param genreid id of the genre assigned to that film (map value)
 */
public void setClassified(int filmid, int genreid) {
    final Integer film = Integer.valueOf(filmid);
    final Integer genre = Integer.valueOf(genreid);
    this.classified.put(film, genre);
}

/**
 * Returns the film-id to genre-id pairs recorded so far.
 *
 * @return the internal map itself, not a copy; changes made by the caller
 *         are visible to this object
 */
public Map<Integer, Integer> getClassifiedData() {
    return this.classified;
}
Expand Down
34 changes: 1 addition & 33 deletions src/com/datumbox/opensource/classifiers/NaiveBayes.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
* @see <a href="http://blog.datumbox.com/developing-a-naive-bayes-text-classifier-in-java/">http://blog.datumbox.com/developing-a-naive-bayes-text-classifier-in-java/</a>
*/
public class NaiveBayes {
private double chisquareCriticalValue = 10.83; //equivalent to pvalue 0.001. It is used by feature selection algorithm
//private double chisquareCriticalValue = 10.83; //equivalent to pvalue 0.001. It is used by feature selection algorithm

private NaiveBayesKnowledgeBase knowledgeBase;

Expand Down Expand Up @@ -64,24 +64,6 @@ public NaiveBayesKnowledgeBase getKnowledgeBase() {
return knowledgeBase;
}

/**
 * Gets the chisquareCriticalValue parameter.
 *
 * @return the current chi-square critical value
 */
public double getChisquareCriticalValue() {
    return this.chisquareCriticalValue;
}

/**
 * Sets the chisquareCriticalValue parameter.
 *
 * @param chisquareCriticalValue the new chi-square critical value
 */
public void setChisquareCriticalValue(final double chisquareCriticalValue) {
    this.chisquareCriticalValue = chisquareCriticalValue;
}

/**
* Preprocesses the original dataset and converts it to a List of Documents.
*
Expand Down Expand Up @@ -133,20 +115,6 @@ private FeatureStats selectFeatures(List<Document> dataset) {
//the FeatureStats object contains statistics about all the features found in the documents
FeatureStats stats = featureExtractor.extractFeatureStats(dataset); //extract the stats of the dataset

//we pass this information to the feature selection algorithm and we get a list with the selected features
/*Map<String, Double> selectedFeatures = featureExtractor.chisquare(stats, chisquareCriticalValue);
//clip from the stats all the features that are not selected
Iterator<Map.Entry<String, Map<String, Integer>>> it = stats.featureCategoryJointCount.entrySet().iterator();
while(it.hasNext()) {
String feature = it.next().getKey();
if(selectedFeatures.containsKey(feature)==false) {
//if the feature is not in the selectedFeatures list remove it
it.remove();
}
}*/

return stats;
}

Expand Down
109 changes: 0 additions & 109 deletions src/com/datumbox/opensource/examples/NaiveBayesExample.java

This file was deleted.

57 changes: 1 addition & 56 deletions src/com/datumbox/opensource/features/FeatureExtraction.java
Original file line number Diff line number Diff line change
Expand Up @@ -83,60 +83,5 @@ public FeatureStats extractFeatureStats(List<Document> dataset) {

return stats;
}

/**
* Perform feature selection by using the chi-square non-parametric
* statistical test.
*
* @param stats
* @param criticalLevel
* @return
*/
/* public Map<String, Double> chisquare(FeatureStats stats, double criticalLevel) {
Map<String, Double> selectedFeatures = new HashMap<>();
String feature;
String category;
Map<String, Integer> categoryList;
int N1dot, N0dot, N00, N01, N10, N11;
double chisquareScore;
Double previousScore;
for(Map.Entry<String, Map<String, Integer>> entry1 : stats.featureCategoryJointCount.entrySet()) {
feature = entry1.getKey();
categoryList = entry1.getValue();
//calculate the N1. (number of documents that have the feature)
N1dot = 0;
for(Integer count : categoryList.values()) {
N1dot+=count;
}
//also the N0. (number of documents that DONT have the feature)
N0dot = stats.n - N1dot;
for(Map.Entry<String, Integer> entry2 : categoryList.entrySet()) {
category = entry2.getKey();
N11 = entry2.getValue(); //N11 is the number of documents that have the feature and belong on the specific category
N01 = stats.categoryCounts.get(category)-N11; //N01 is the total number of documents that do not have the particular feature BUT they belong to the specific category
N00 = N0dot - N01; //N00 counts the number of documents that don't have the feature and don't belong to the specific category
N10 = N1dot - N11; //N10 counts the number of documents that have the feature and don't belong to the specific category
//calculate the chisquare score based on the above statistics
chisquareScore = stats.n*Math.pow(N11*N00-N10*N01, 2)/((N11+N01)*(N11+N10)*(N10+N00)*(N01+N00));
//if the score is larger than the critical value then add it in the list
if(chisquareScore>=criticalLevel) {
previousScore = selectedFeatures.get(feature);
if(previousScore==null || chisquareScore>previousScore) {
selectedFeatures.put(feature, chisquareScore);
}
}
}
}
return selectedFeatures;
}*/
}
Loading

0 comments on commit e71a432

Please sign in to comment.