diff --git a/DSAVLTree.h b/DSAVLTree.h index 6e243c4..80daa56 100644 --- a/DSAVLTree.h +++ b/DSAVLTree.h @@ -109,9 +109,6 @@ template class DSAVLTree { private: Node* root; -// Node* left; -// Node* right; -// int height;//height of entire tree int count;//total number of nodes bool contains(Node* n, T& val);//TODO test private functions? @@ -126,11 +123,9 @@ class DSAVLTree { DSAVLTree& operator= (const DSAVLTree& copy); Node& copyHelper(Node*& node); -// int getHeight(Node* node);//get height from any starting node - void insert(T& x);//TODO doesn't accept an int + void insert(T& x); bool contains(T& val) { return contains(root, val); } T& find(Node* node, T& val);//given a value, find the matching object in the tree -// Node*& find(Node* node, T& val);//iterative void balanceTree(Node*& node);//balance the tree using right/left rotate void rightRotate(Node*& k1); @@ -148,9 +143,6 @@ template DSAVLTree::DSAVLTree() { root = nullptr; -// left = nullptr; -// right = nullptr; -// height = 0; count = 0; } @@ -220,7 +212,6 @@ void DSAVLTree::insert(T& x)//public template bool DSAVLTree::contains(Node* n, T& val) { -// std::cout << "contains function" << std::endl; if (n == nullptr) return false; else if (n->getData() == val) @@ -234,7 +225,6 @@ bool DSAVLTree::contains(Node* n, T& val) template void DSAVLTree::insert(Node*& n, T& val)//private { -// std::cout << "insert function" << std::endl; if (n == nullptr){//tree is empty or at the end of a leaf n = new Node(val);//make new node to insert } @@ -258,27 +248,6 @@ T& DSAVLTree::find(Node* node, T& val)//TODO add an edge case for when the else return find(node->getRight(), val); } -//Node*& DSAVLTree::find(Node* node, T& val)//iterative TODO seg fault -//{ -// Node* empty; -// std::cout << "find function" << std::endl; -// if (node == nullptr) -// return empty; -// while (node != nullptr) -// { -// std::cout << "while loop" << std::endl; -// if (node->getData() == val){ -// std::cout << "Found node" << std::endl; -// return node; -// } -// else if (val < node->getData()) -// node = node->getLeft(); -// else -// node = node->getRight(); -// } -// std::cout << "not found" << std::endl; -// return empty; -//} template void DSAVLTree::balanceTree(Node*& node) diff --git a/DocParser.cpp b/DocParser.cpp index e16ac8c..6b8de6a 100644 --- a/DocParser.cpp +++ b/DocParser.cpp @@ -6,7 +6,6 @@ DocParser::DocParser() {} void DocParser::parse(const string& filename, StopWord& stop) { -// cout << "NEW DOC: " << filename << endl; numDocs++; //TODO write to persistence file @@ -67,27 +66,22 @@ void DocParser::parse(const string& filename, StopWord& stop) { { space = text.find(" "); Word curr(text.substr(0, space)); -// cout << "current: " << curr.getStr() << endl; curr.toLower();//remove caps if (stop.isStopWord(curr.getStr())){ -// cout << "stop word found" << endl; text = text.substr(space + 1);//cut off curr word space = text.find(" "); continue;//don't add to tree } curr.removePunc();//remove punctuation curr.stemming(); -// cout << "current: " << curr.getStr() << endl; - //put unique words into the avl tree + //put unique words into the avl tree if (!words.contains(curr)){//if the word is not already in the tree/new unique word curr.incrFreq(currDoc); words.insert(curr); -// cout << "inserted " << curr.getStr() << endl; } else{ words.find(words.getRoot(), curr).incrFreq(currDoc);//index document on object in tree -// curr.incrFreq(currDoc);//indexes a temporary variable, not the actual Word object in the tree } text = text.substr(space + 1);//cut off curr word @@ -111,11 +105,6 @@ void DocParser::persistenceIndex()//read in persistence file to index words } -void DocParser::order(Word& w)//for top 25 most frequent words -{ -// top.insert(pair (w, w.getDocs().size())); // ERROR -} - DSAVLTree& DocParser::getWordTree() { return words; } DSAVLTree& DocParser::getOrgTree() { return orgs; } DSAVLTree& DocParser::getPersonTree() { return people; } diff --git a/DocParser.h b/DocParser.h index 06a4660..e1b0703 100644 --- a/DocParser.h +++ b/DocParser.h @@ -25,14 +25,12 @@ class DocParser { DSAVLTree words; DSAVLTree orgs; DSAVLTree people; - map top; int numDocs = 0; public: DocParser(); void parse(const string& filename, StopWord& stop);//parse the documents for unique words void getFiles(const string& directory, StopWord& stop);//returns filenames for traversal through directory - void order(Word& w);//order top 25 more frequent words void persistenceIndex();//read in persistence file to index words DSAVLTree& getWordTree(); diff --git a/Document.cpp b/Document.cpp index b4055c3..0fcb1c8 100644 --- a/Document.cpp +++ b/Document.cpp @@ -20,6 +20,8 @@ Document& Document::operator= (const Document& copy) date = copy.date; filePath = copy.filePath; uuid = copy.uuid; + + return *this; } bool Document::operator==(const Document& d) diff --git a/QueryProcessor.cpp b/QueryProcessor.cpp index 5b51dfa..1abb05a 100644 --- a/QueryProcessor.cpp +++ b/QueryProcessor.cpp @@ -9,14 +9,12 @@ QueryProcessor::QueryProcessor(){} void QueryProcessor::parseQuery(string& q, DSAVLTree& words, DSAVLTree& orgs, DSAVLTree& people, StopWord& stop)//parse query { this->query = q; -// std::cout << "NEW QUERY: " << query << std::endl; int space; while (space != -1) { space = query.find(" ");//for first word Word curr(query.substr(0, space)); curr.toLower(); - cout << curr.getStr() << endl; if (specialStopCheck(stop, curr.getStr())) { query = query.substr(space + 1);//cut off word @@ -25,15 +23,12 @@ void QueryProcessor::parseQuery(string& q, DSAVLTree& words, DSAVLTree wordList = parseAndOr(); for (int i = 0; i < wordList.size(); i++){ - cout << wordList.at(i).getStr() << endl; intersection(wordList.at(i), words); } } else{ -// std::cout << "union" << std::endl; setUnion(parseAndOr(), words); } } @@ -68,20 +63,16 @@ void QueryProcessor::parseQuery(string& q, DSAVLTree& words, DSAVLTreegetData().getDocs()); } else if (curr.getStr() == "person"){ query = query.substr(space + 1);//cut off operator KEEP -// cout << "query: " << query << endl; Word person(findPersonOrg()); - cout << "person " << person.getStr() << std::endl; if (people.contains(person)){ addPersonOrg(people.find(people.getRoot(), person).getDocs());//index has to include only those that have this person queryWords.push_back(people.find(people.getRoot(), person)); } else cout << person.getStr() << " is not found." << endl; - //addPersonOrg(people.find(people.getRoot(), person)->getData().getDocs()); } else{//just a term Word term(curr); @@ -92,7 +83,6 @@ void QueryProcessor::parseQuery(string& q, DSAVLTree& words, DSAVLTreegetData().getDocs()); query = query.substr(space + 1); } space = query.find(" ");//to check if youve reached the end of the query @@ -108,12 +98,10 @@ vector QueryProcessor::parseAndOr() int space = query.find(" "); query = query.substr(space + 1);//cut off operator -// cout << "query: " << query << endl; while (check) { Word word; space = query.find(" "); -// cout << space << endl; if (space != -1){//not at the end of the line if (query.substr(0, space) != "AND" && query.substr(0, space) != "OR" && query.substr(0, space) != "NOT" && query.substr(0, space) != "PERSON" && query.substr(0, space) != "ORG") {//if its not a key word @@ -184,7 +172,6 @@ void QueryProcessor::setUnion(vector a, DSAVLTree& tree)//OR keyword cout << a.at(i).getStr() << " is not found." << endl; continue; } -// vector temp = tree.find(tree.getRoot(), a.at(i))->getData().getDocs(); for (int d = 0; d < temp.size(); d++)//for every doc in the Word objects index { vector::iterator it = find(finalIndex.begin(), finalIndex.end(), temp.at(d)); @@ -200,7 +187,6 @@ void QueryProcessor::intersection(Word& word, DSAVLTree& tree)//AND keywor vector finalList; if (finalIndex.size() == 0)//first word { - cout << "if statement" << endl; if (tree.contains(word)) { queryWords.push_back(tree.find(tree.getRoot(), word)); @@ -220,17 +206,13 @@ void QueryProcessor::intersection(Word& word, DSAVLTree& tree)//AND keywor } else cout << word.getStr() << " not found" << endl; -// cout << "else statment" << endl; for (int i = 0; i < temp.size(); i++) { -// cout << "inner for loop running" << endl; vector::iterator it = find(finalIndex.begin(), finalIndex.end(), temp.at(i)); if (it != finalIndex.end()){//doc of a exists in final, only add docs that are in word.getDocs and final finalList.push_back(*it);//add the docs that contain the word from the finalIndex -// cout << "added to final list" << endl; } } -// cout << "Done" << endl; finalIndex.clear(); finalIndex = finalList;//should only contain docs that contain the word } @@ -238,13 +220,11 @@ void QueryProcessor::intersection(Word& word, DSAVLTree& tree)//AND keywor void QueryProcessor::addTerm(vector& a) { -// cout << "size: " << a.size() << endl; for (int i = 0; i < a.size(); i++) { vector::iterator finalIt = find(finalIndex.begin(), finalIndex.end(), a.at(i)); if (finalIt == finalIndex.end()){//if the doc is NOT in the final index, add it finalIndex.push_back(a.at(i)); -// cout << "added new " << endl; } } } @@ -261,24 +241,16 @@ void QueryProcessor::complement(vector& a)//delete set a from finalInd void QueryProcessor::addPersonOrg(vector& a)//remove any docs from final that don't include the person/org { - cout << "Add person/org" << endl; //if finalIndex already has values, remove any docs that don't contain person/org //else: query only has person/org keywords, just add the files that contain the person/org vector personList; if (finalIndex.size() > 0) { - cout << "person if" << endl; for (int i = 0; i < a.size(); i++)//used to be finalIndex.size(); {//get person/org document index list from doc parser (a), each file in finalIndex should be in the person/orgs index - cout << "person for" << endl; -// vector::iterator it = find(a.begin(), a.end(), finalIndex.at(i)); vector::iterator it = find(finalIndex.begin(), finalIndex.end(), a.at(i)); if (it != finalIndex.end()){//doc of person/org list exists in final index, keep -// cout << "does not exist in doc list" << endl; personList.push_back(*it); - cout << "added " << it->getPath() << endl; -// finalIndex.erase(finalIndex.begin() + i);//remove the file that doens't contain person/org -// i--;//account for file lost } } finalIndex.clear(); @@ -292,7 +264,6 @@ void QueryProcessor::addPersonOrg(vector& a)//remove any docs from fin finalIndex.push_back(a.at(i));//add files with person/org } } - cout << "person done" << endl; } void QueryProcessor::rankIndex() diff --git a/QueryProcessor.h b/QueryProcessor.h index 81dafbe..803954b 100644 --- a/QueryProcessor.h +++ b/QueryProcessor.h @@ -23,18 +23,20 @@ class QueryProcessor { public: QueryProcessor(); + void parseQuery(string& query, DSAVLTree& words, DSAVLTree& orgs, DSAVLTree& people, StopWord& stop);//parse query vector parseAndOr(); Word findPersonOrg();//get the full name of the person being searched for (accounts for those w/first+last -// void intersection(vector& a, vector&b);//AND keyword + void setUnion(vector a, DSAVLTree& tree);//OR keyword void intersection(Word& word, DSAVLTree & tree);//AND keyword -// void intersection(vector a, DSAVLTree& tree);//AND keyword void addTerm(vector& a);//add a single term's docs to the final void complement(vector& a);//set subtraction void addPersonOrg(vector& a);//remove any docs that don't include the given person or org + void rankIndex();//rank the documents in the final index by relevancy/frecuency bool specialStopCheck(StopWord& stop, string& word); + void clearFinal();//resets all vectors for next query vector& getFinal(); vector& getBest();//returns the top 15 ranked documents diff --git a/StopWord.cpp b/StopWord.cpp index f991ee4..35afd24 100644 --- a/StopWord.cpp +++ b/StopWord.cpp @@ -8,8 +8,8 @@ StopWord::StopWord() { ifstream stop;//make the stop words AVL tree stop.open("stopWords.txt"); -// if (stop.is_open()) -// cout << "OPened stops" << endl; + if (stop.is_open()) + cout << "OPened stops" << endl; string curr; while (getline(stop, curr))//make an avl tree of stop words { diff --git a/UserInterface.cpp b/UserInterface.cpp index 38daef9..441a55d 100644 --- a/UserInterface.cpp +++ b/UserInterface.cpp @@ -38,7 +38,6 @@ void UserInterface::run(const string& file) cout << "Search: " << endl; string query; getline(cin, query); -// cout << "query: " << query << endl; start = std::chrono::high_resolution_clock::now(); process.parseQuery(query, docReader.getWordTree(), docReader.getOrgTree(), docReader.getPersonTree(), stops); end = std::chrono::high_resolution_clock::now(); @@ -46,7 +45,6 @@ void UserInterface::run(const string& file) cout << std::fixed << "Query Execution Time: " << time_in_seconds.count() << endl; displayResults(); -// cout << "displayed" << endl; choice = -1; while (choice != "0"){ @@ -90,7 +88,6 @@ void UserInterface::clearIndex() docReader.getWordTree().deleteTree(docReader.getWordTree().getRoot()); docReader.getOrgTree().deleteTree(docReader.getOrgTree().getRoot()); docReader.getPersonTree().deleteTree(docReader.getPersonTree().getRoot()); - //TODO also erase contents of persistence file? } void UserInterface::parseDocs(const string& direct) diff --git a/Word.cpp b/Word.cpp index 95354d7..0e39c8a 100644 --- a/Word.cpp +++ b/Word.cpp @@ -8,19 +8,12 @@ Word::Word() { str = ""; } + Word::Word(string word) { str = word; } -//Word& Word::operator=(const Word& w) -//{ -// str = w.str; -// docs = w.docs; -// frequency = w.frequency; -// total = w.total; -//} - bool Word::operator<(const Word& w) { if (str < w.str) @@ -32,11 +25,6 @@ bool Word::operator==(const Word& w) { return str == w.str;//if theyre the same string, theyre the same word } -// -//void Word::sort() -//{ -// //TODO -//} void Word::toLower() { diff --git a/Word.h b/Word.h index e35f28c..5e59739 100644 --- a/Word.h +++ b/Word.h @@ -24,11 +24,9 @@ class Word { public: Word(); Word(string word); -// Word& operator=(const Word& w);//SEG FAULT bool operator<(const Word& w); bool operator==(const Word& w); -// void sort();//sorts documents by index (doc with largest freq. goes first) void toLower(); void removePunc(); void stemming();//using porter2 stemming library diff --git a/main.cpp b/main.cpp index 03324f2..03578a7 100644 --- a/main.cpp +++ b/main.cpp @@ -12,23 +12,9 @@ using namespace std; int main(int argc, char** argv) { - //TODO LIST: Rank indexes, make persistence file (figure out how to erase contents of file), figure out publication info, try to combine fine/contains - //add a check for if the word is not found - std::chrono::time_point start, end; + //mnt/c/users/18476/c++/searchData 2648 files + //TODO LIST: Fix ranking articles, make persistence file (figure out how to erase contents of file) -// start = std::chrono::high_resolution_clock::now(); UserInterface parse; - parse.run(argv[2]); -// parse.parseDocs(argv[2]);//absolute path -// string query = "investors"; -// StopWord word; -// parse.getQueryProcessor().parseQuery(query, parse.getDocParser().getWordTree(), parse.getDocParser().getOrgTree(), parse.getDocParser().getPersonTree(), word); -// end = std::chrono::high_resolution_clock::now(); -// //calculate the duration between start and end and print to the terminal -// std::chrono::duration time_in_seconds = end - start; -// std::cout << std::fixed << "Duration: " << time_in_seconds.count() << std::endl; - -// Word w(argv[1]);//search term -// w.stemming(); -// parse.getDocParser().getOrgTree().inOrder(parse.getDocParser().getOrgTree().getRoot()); + parse.run(argv[1]); }