diff --git a/DocParser.cpp b/DocParser.cpp index 4aed6ad..486259f 100644 --- a/DocParser.cpp +++ b/DocParser.cpp @@ -8,7 +8,6 @@ DocParser::DocParser() {} void DocParser::parse(const string& filename, StopWord& stop) { numDocs++; - //TODO write to persistence file //parse main text rapidjson::Document doc; diff --git a/QueryProcessor.cpp b/QueryProcessor.cpp index 1a2a70b..2f43286 100644 --- a/QueryProcessor.cpp +++ b/QueryProcessor.cpp @@ -22,17 +22,15 @@ void QueryProcessor::parseQuery(string& q, DSAVLTree& words, DSAVLTree wordList = parseAndOr(); if (curr.getStr() == "and"){//perform corresponding set operation - vector wordList = parseAndOr(); for (int i = 0; i < wordList.size(); i++){ intersection(wordList.at(i), words); } } else{ - setUnion(parseAndOr(), words); -// for (int i = 0; i < queryWords.size(); i++){ -// cout << queryWords.at(i).getStr() << endl; -// } + for (int i = 0; i < wordList.size(); i++) + setUnion(wordList.at(i), words); } } else if (curr.getStr() == "not"){ @@ -51,7 +49,6 @@ void QueryProcessor::parseQuery(string& q, DSAVLTree& words, DSAVLTree a, DSAVLTree& tree)//OR keyword +void QueryProcessor::setUnion(Word& word, DSAVLTree &tree) { - for (int i = 0; i < a.size(); i++)//for every word object in the OR operator + vector temp; + if (tree.contains(word)){ + queryWords.push_back(tree.find(tree.getRoot(), word)); + temp = tree.find(tree.getRoot(), word).getDocs(); + } + else + cout << word.getStr() << " not found" << endl; + + for (int i = 0; i < temp.size(); i++) { - vector temp; - if (tree.contains(a.at(i))){ - temp = tree.find(tree.getRoot(), a.at(i)).getDocs(); -// cout << "a.at(i): " << a.at(i).getStr() << endl; - queryWords.push_back(tree.find(tree.getRoot(), a.at(i))); -// cout << queryWords.at(queryWords.size()-1).getDocs().size(); - } - else{ - cout << a.at(i).getStr() << " is not found." << endl; - continue; - } - for (int d = 0; d < temp.size(); d++)//for every doc in the Word objects index - { - vector::iterator it = find(finalIndex.begin(), finalIndex.end(), temp.at(d)); - if (it == finalIndex.end()){//if the doc is NOT in the final index, add it - finalIndex.push_back(temp.at(d)); - } + vector::iterator it = find(finalIndex.begin(), finalIndex.end(), temp.at(i)); + if (it == finalIndex.end()){//doc of a exists in final, only add docs that are in word.getDocs and final + finalIndex.push_back(temp.at(i));//add the docs that contain the word from the finalIndex } } } @@ -274,53 +265,42 @@ void QueryProcessor::addPersonOrg(vector& a)//remove any docs from fin void QueryProcessor::rankIndex() { cout << "Rank index" << endl; -// cout << "finalIndex size " << finalIndex.size() << endl; -// cout << "query words size: " << queryWords.size() << endl; + cout << "finalIndex size " << finalIndex.size() << endl; + cout << "query words size: " << queryWords.size() << endl; vector freqs; //corresponding total freqs for each doc in finalIndex for (int queryIndex = 0; queryIndex < finalIndex.size(); queryIndex++)//for every doc in final index { int sum = 0; for (int i = 0; i < queryWords.size(); i++)//for every word in query { + //if its an OR query, not every word will be in every file of finalIndex //get the each words frequency in the current doc and add them all together - sum += queryWords.at(i).getDocFreq(finalIndex.at(queryIndex));//add total freq of each word for this doc -// cout << queryWords.at(i).getDocFreq(finalIndex.at(queryIndex)) << " " << finalIndex.at(queryIndex).getPath() << endl; - }//doc not found + vector::iterator it = find(queryWords.at(i).getDocs().begin(), queryWords.at(i).getDocs().end(), finalIndex.at(queryIndex)); + if (it != queryWords.at(i).getDocs().end())//this word is present in the doc + sum += queryWords.at(i).getDocFreq(finalIndex.at(queryIndex));//add total freq of each word for this doc + } freqs.push_back(sum); - cout << "sum: " << sum << endl; } cout << "done with for loop" << endl; //result: total frequency for each doc -// cout << "Frequency" << endl; -// for (int i = 0; i < freqs.size(); i++) -// cout << freqs.at(i) << " " << finalIndex.at(i).getPath() << endl; - - //get the top 15 docs with the highest freq -// for (int n = 0; n < 15; n++){ -//// if (n > freqs.size() || freqs.size() == 0)//less that 15 docs in the finalIndex -//// break; -// int highest = freqs.at(0); -// int index = 0; -// if (n > freqs.size())//less that 15 docs in the finalIndex -// break; -// for (int i = 1; i < freqs.size(); i++)//find the next highest freq -// { -// if (freqs.at(i) > highest){//get highest freq -// highest = freqs.at(i); -// index = i; -// } -// } -// best.push_back(finalIndex.at(index));//get the corresponding doc for that freq -//// cout << "next higheset frequency: " << freqs.at(index) << endl; -// freqs.erase(freqs.begin() + index); -// finalIndex.erase(finalIndex.begin() + index); -// } - -// cout << "Best 15: " << endl; -// for (int i = best.size()-1; i >= 0; i--) -// cout << best.at(i).getPath() << endl; + for (int n = 0; n < 15; n++){ + if (n > freqs.size() || freqs.size() == 0)//less that 15 docs in the finalIndex + break; + int highest = freqs.at(0); + int index = 0; + for (int i = 1; i < freqs.size(); i++)//find the next highest freq + { + if (freqs.at(i) > highest){//get highest freq + highest = freqs.at(i); + index = i; + } + } + best.push_back(finalIndex.at(index));//get the corresponding doc for that freq + freqs.erase(freqs.begin() + index); + finalIndex.erase(finalIndex.begin() + index); + } } bool QueryProcessor::specialStopCheck(StopWord& stop, string& word) diff --git a/QueryProcessor.h b/QueryProcessor.h index 803954b..dd87c60 100644 --- a/QueryProcessor.h +++ b/QueryProcessor.h @@ -28,8 +28,8 @@ class QueryProcessor { vector parseAndOr(); Word findPersonOrg();//get the full name of the person being searched for (accounts for those w/first+last - void setUnion(vector a, DSAVLTree& tree);//OR keyword - void intersection(Word& word, DSAVLTree & tree);//AND keyword + void setUnion(Word& word, DSAVLTree& tree); + void intersection(Word& word, DSAVLTree& tree);//AND keyword void addTerm(vector& a);//add a single term's docs to the final void complement(vector& a);//set subtraction void addPersonOrg(vector& a);//remove any docs that don't include the given person or org diff --git a/UserInterface.cpp b/UserInterface.cpp index a6d1f9f..4b06e82 100644 --- a/UserInterface.cpp +++ b/UserInterface.cpp @@ -13,28 +13,37 @@ void UserInterface::run(const string& file) bool go = true; string choice; - while (choice != "1" && choice != "2"){ - cout << "Enter 1 to parse files or 2 to use persistence file: " << endl; - cin.clear(); - cin >> choice; - if (choice == "1"){ - cout << "parsing..." << endl; - start = std::chrono::high_resolution_clock::now(); - docReader.getFiles(file, stops); - end = std::chrono::high_resolution_clock::now(); - std::chrono::duration time_in_seconds = end - start; - cout << std::fixed << "Parsing Time: " << time_in_seconds.count() << endl; - cout << "done!" << endl; - } - else if (choice == "2"){ - cout << "parsing..." << endl; - docReader.persistenceIndex();//TODO - cout << "done!" << endl; - } - else{ - cout << "Incorrect input." << endl; - } - } +// while (choice != "1" && choice != "2"){ +// cout << "Enter 1 to parse files or 2 to use persistence file: " << endl; +// cin.clear(); +// cin >> choice; +// if (choice == "1"){ +// cout << "parsing..." << endl; +// start = std::chrono::high_resolution_clock::now(); +// docReader.getFiles(file, stops); +// end = std::chrono::high_resolution_clock::now(); +// std::chrono::duration time_in_seconds = end - start; +// cout << std::fixed << "Parsing Time: " << time_in_seconds.count() << endl; +// cout << "done!" << endl; +// } +// else if (choice == "2"){ +// cout << "parsing..." << endl; +// docReader.persistenceIndex();//TODO +// cout << "done!" << endl; +// } +// else{ +// cout << "Incorrect input." << endl; +// } +// } + + //Parse documents + cout << "parsing..." << endl; + start = std::chrono::high_resolution_clock::now(); + docReader.getFiles(file, stops); + end = std::chrono::high_resolution_clock::now(); + std::chrono::duration time_in_seconds = end - start; + cout << std::fixed << "Parsing Time: " << time_in_seconds.count() << endl; + cout << "done!" << endl; while (go) { @@ -56,10 +65,13 @@ void UserInterface::run(const string& file) cin >> choice; if (choice == "0")//exit break; -// else if (choice == "1" || choice == "2" || choice == "3" || choice == "4" || choice == "5" || choice == "6" || choice == "7" || choice == "8" || choice == "9" || choice == "10" || choice == "11" || choice == "12" || choice == "13" || choice == "14" || choice == "15"){ - else if (stoi(choice) > 0 && stoi(choice) <= process.getBest().size()){ - showText(process.getBest().at(stoi(choice) - 1)); - cout << endl; + else if (choice == "1" || choice == "2" || choice == "3" || choice == "4" || choice == "5" || choice == "6" || choice == "7" || choice == "8" || choice == "9" || choice == "10" || choice == "11" || choice == "12" || choice == "13" || choice == "14" || choice == "15"){ + if (stoi(choice) > 0 && stoi(choice) <= process.getBest().size()){ + showText(process.getBest().at(stoi(choice) - 1)); + cout << endl; + } + else + cout << "Incorrect input." << endl; } else cout << "Incorrect input." << endl; @@ -95,13 +107,6 @@ void UserInterface::clearIndex() docReader.getPersonTree().deleteTree(docReader.getPersonTree().getRoot()); } -void UserInterface::parseDocs(const string& direct) -{ - std::cout << "parsing documents..." << std::endl; - docReader.getFiles(direct, stops); - std::cout << "done parsing!" << std::endl; -} - void UserInterface::displayResults()//with ranking { if (process.getBest().size() == 0) @@ -117,21 +122,6 @@ void UserInterface::displayResults()//with ranking } } -//void UserInterface::displayResults()//without ranking -//{ -// if (process.getFinal().size() == 0) -// cout << "No results found" << endl; -// -// for (int i = 0; i < process.getFinal().size(); i++) -// { -// if (i == 15) -// break; -// cout << i + 1 << ") "; -// cout << "Title: " << process.getFinal().at(i).getTitle() << ", " << process.getFinal().at(i).getPub() << ", Date: " << process.getFinal().at(i).getDate() << endl; -// cout << "Path: " << process.getFinal().at(i).getPath() << endl; -// } -//} - void UserInterface::showText(Document& d) { rapidjson::Document doc; @@ -174,20 +164,12 @@ void UserInterface::topWordsHelper(Node* n) } } -void UserInterface::getTopWords() +void UserInterface::getTopWords()//go through tree and get the frequency of each word (in order) { - cout << "Get top words" << endl; topWordsHelper(docReader.getWordTree().getRoot()); vector top;//top 25 words -// vector freqs; //corresponding total freqs for each word -// vector all;//save words too? corresponding to freqs - //go through tree and get the frequency of each word (in order) //result: total frequency for each doc -// cout << "Frequency" << endl; -// for (int i = 0; i < freqs.size(); i++) -// cout << freqs.at(i) << " " << finalIndex.at(i).getPath() << endl; - //get the top 25 most frequent words cout << "Top 25 Most Frequent Words: " << endl; for (int n = 0; n < 25; n++){ @@ -202,7 +184,6 @@ void UserInterface::getTopWords() index = i; } } -// top.push_back(all.at(index));//get the corresponding doc for that freq cout << all.at(index) << ": " << frequency.at(index) << endl; frequency.erase(frequency.begin() + index); all.erase(all.begin() + index); diff --git a/UserInterface.h b/UserInterface.h index 0942127..774c130 100644 --- a/UserInterface.h +++ b/UserInterface.h @@ -32,7 +32,6 @@ class UserInterface { void run(const string& file);//run search engine void clearIndex();//delete every tree - void parseDocs(const string& direct);//parse all documents void displayResults(); void showText(Document& d); void stats(); diff --git a/Word.cpp b/Word.cpp index 0e39c8a..1f57338 100644 --- a/Word.cpp +++ b/Word.cpp @@ -81,14 +81,17 @@ int Word::getDocFreq(Document& doc) { for (int i = 0; i < docs.size(); i++) { - if (docs.at(i) == doc) + if (docs.at(i) == doc){ return frequency.at(i);//return corresponding freq. for given doc + } } cout << "Doc not found." << endl; } vector& Word::getFrequency() { return frequency; } + int Word::getTotal() { return total; } + std::ostream& operator<< (std::ostream& out, const Word& w) { out << w.str; diff --git a/stopWords.txt b/stopWords.txt index ffd6135..4bfcdf4 100644 --- a/stopWords.txt +++ b/stopWords.txt @@ -1,3 +1,4 @@ +a able about above