From c5af51a244d880e01f730ff05c3c693a12ea37f0 Mon Sep 17 00:00:00 2001 From: karinashin Date: Sun, 1 May 2022 13:51:19 -0500 Subject: [PATCH] fixed AND/OR + PERSON/ORG queries, added top 25 ranking, fixed stop words --- DocParser.cpp | 1 + QueryProcessor.cpp | 34 +++++++++++++++------------------- StopWord.cpp | 2 ++ UserInterface.cpp | 37 ++++++++++++++++++++++++------------- UserInterface.h | 5 +++++ Word.cpp | 2 ++ 6 files changed, 49 insertions(+), 32 deletions(-) diff --git a/DocParser.cpp b/DocParser.cpp index a60e3cd..e16ac8c 100644 --- a/DocParser.cpp +++ b/DocParser.cpp @@ -70,6 +70,7 @@ void DocParser::parse(const string& filename, StopWord& stop) { // cout << "current: " << curr.getStr() << endl; curr.toLower();//remove caps if (stop.isStopWord(curr.getStr())){ +// cout << "stop word found" << endl; text = text.substr(space + 1);//cut off curr word space = text.find(" "); continue;//don't add to tree diff --git a/QueryProcessor.cpp b/QueryProcessor.cpp index 1f034b1..5b51dfa 100644 --- a/QueryProcessor.cpp +++ b/QueryProcessor.cpp @@ -18,7 +18,10 @@ void QueryProcessor::parseQuery(string& q, DSAVLTree& words, DSAVLTree& words, DSAVLTreegetData().getDocs()); } else if (curr.getStr() == "org"){ query = query.substr(space + 1);//cut off operator @@ -95,10 +97,6 @@ void QueryProcessor::parseQuery(string& q, DSAVLTree& words, DSAVLTree QueryProcessor::parseAndOr() word = query.substr(0, space); word.stemming(); args.push_back(word); -// queryWords.push_back(word);//for ranking query = query.substr(space + 1); } else @@ -134,7 +131,6 @@ vector QueryProcessor::parseAndOr() query = "";//empty query word.stemming(); args.push_back(word); -// queryWords.push_back(word);//for ranking break; } } @@ -170,9 +166,7 @@ Word QueryProcessor::findPersonOrg()//operator is already removed from query space = query.find(" "); count++; } -// cout << "Name: " << name << endl; Word person(name); -// queryWords.push_back(person);//for ranking return person; } @@ -181,7 +175,6 @@ void QueryProcessor::setUnion(vector a, DSAVLTree& tree)//OR keyword { for (int i = 0; i < a.size(); i++)//for every word object in the OR operator { -// cout << a.at(i).getStr() << endl; vector temp; if (tree.contains(a.at(i))){ temp = tree.find(tree.getRoot(), a.at(i)).getDocs(); @@ -210,6 +203,7 @@ void QueryProcessor::intersection(Word& word, DSAVLTree& tree)//AND keywor cout << "if statement" << endl; if (tree.contains(word)) { + queryWords.push_back(tree.find(tree.getRoot(), word)); vector temp = tree.find(tree.getRoot(), word).getDocs(); for (int i = 0; i < temp.size(); i++) finalIndex.push_back(temp.at(i)); @@ -220,21 +214,23 @@ void QueryProcessor::intersection(Word& word, DSAVLTree& tree)//AND keywor else { vector temp; - if (tree.contains(word)) + if (tree.contains(word)){ + queryWords.push_back(tree.find(tree.getRoot(), word)); temp = tree.find(tree.getRoot(), word).getDocs(); + } else cout << word.getStr() << " not found" << endl; - cout << "else statment" << endl; +// cout << "else statment" << endl; for (int i = 0; i < temp.size(); i++) { - cout << "inner for loop running" << endl; +// cout << "inner for loop running" << endl; vector::iterator it = find(finalIndex.begin(), finalIndex.end(), temp.at(i)); if (it != finalIndex.end()){//doc of a exists in final, only add docs that are in word.getDocs and final finalList.push_back(*it);//add the docs that contain the word from the finalIndex - cout << "added to final list" << endl; +// cout << "added to final list" << endl; } } - cout << "Done" << endl; +// cout << "Done" << endl; finalIndex.clear(); finalIndex = finalList;//should only contain docs that contain the word } @@ -264,8 +260,8 @@ void QueryProcessor::complement(vector& a)//delete set a from finalInd } void QueryProcessor::addPersonOrg(vector& a)//remove any docs from final that don't include the person/org -{//doesn't work with AND EX: AND investor stock PERSON cramer - cout << "Add person/org" << endl;//TODO 21883 is being printed with random chars +{ + cout << "Add person/org" << endl; //if finalIndex already has values, remove any docs that don't contain person/org //else: query only has person/org keywords, just add the files that contain the person/org vector personList; @@ -277,7 +273,7 @@ void QueryProcessor::addPersonOrg(vector& a)//remove any docs from fin cout << "person for" << endl; // vector::iterator it = find(a.begin(), a.end(), finalIndex.at(i)); vector::iterator it = find(finalIndex.begin(), finalIndex.end(), a.at(i)); - if (it != a.end()){//doc of person/org list exists in final index, keep + if (it != finalIndex.end()){//doc of person/org list exists in final index, keep // cout << "does not exist in doc list" << endl; personList.push_back(*it); cout << "added " << it->getPath() << endl; @@ -350,7 +346,7 @@ void QueryProcessor::rankIndex() bool QueryProcessor::specialStopCheck(StopWord& stop, string& word) { - if (word == "AND" || word == "OR" || word == "NOT")//exclude key words + if (word == "and" || word == "or" || word == "not")//exclude key words return false; else if (stop.isStopWord(word)) return true; diff --git a/StopWord.cpp b/StopWord.cpp index 912cd85..f991ee4 100644 --- a/StopWord.cpp +++ b/StopWord.cpp @@ -8,6 +8,8 @@ StopWord::StopWord() { ifstream stop;//make the stop words AVL tree stop.open("stopWords.txt"); +// if (stop.is_open()) +// cout << "OPened stops" << endl; string curr; while (getline(stop, curr))//make an avl tree of stop words { diff --git a/UserInterface.cpp b/UserInterface.cpp index 7f306a8..38daef9 100644 --- a/UserInterface.cpp +++ b/UserInterface.cpp @@ -162,12 +162,23 @@ void UserInterface::stats() getTopWords(); } +void UserInterface::topWordsHelper(Node* n) +{ + if (n != nullptr){ + topWordsHelper(n->getLeft()); + frequency.push_back(n->getData().getTotal()); + all.push_back(n->getData()); + topWordsHelper(n->getRight()); + } +} + void UserInterface::getTopWords() { cout << "Get top words" << endl; + topWordsHelper(docReader.getWordTree().getRoot()); vector top;//top 25 words - vector freqs; //corresponding total freqs for each word - vector all;//save words too? corresponding to freqs +// vector freqs; //corresponding total freqs for each word +// vector all;//save words too? corresponding to freqs //go through tree and get the frequency of each word (in order) //result: total frequency for each doc @@ -175,23 +186,23 @@ void UserInterface::getTopWords() // for (int i = 0; i < freqs.size(); i++) // cout << freqs.at(i) << " " << finalIndex.at(i).getPath() << endl; - - //get the top 15 docs with the highest freq - for (int n = 0; n < 15; n++){ - int highest = freqs.at(0); + //get the top 25 most frequent words + cout << "Top 25 Most Frequent Words: " << endl; + for (int n = 0; n < 25; n++){ + int highest = frequency.at(0); int index = 0; - if (n > freqs.size())//less that 15 docs in the finalIndex + if (n > frequency.size())//less than 25 total words break; - for (int i = 1; i < freqs.size(); i++)//find the next highest freq + for (int i = 1; i < frequency.size(); i++)//find the next highest freq { - if (freqs.at(i) > highest){//get highest freq - highest = freqs.at(i); + if (frequency.at(i) > highest){//get highest freq + highest = frequency.at(i); index = i; } } - top.push_back(all.at(index));//get the corresponding doc for that freq - cout << "next higheset frequency: " << freqs.at(index) << endl; - freqs.erase(freqs.begin() + index); +// top.push_back(all.at(index));//get the corresponding doc for that freq + cout << all.at(index) << ": " << frequency.at(index) << endl; + frequency.erase(frequency.begin() + index); all.erase(all.begin() + index); } } diff --git a/UserInterface.h b/UserInterface.h index b4c1c29..e3d7501 100644 --- a/UserInterface.h +++ b/UserInterface.h @@ -23,6 +23,10 @@ class UserInterface { QueryProcessor process; StopWord stops; + //for ranking + vector frequency;//frequencies of words + vector all;//vector of all words + public: UserInterface(); void run(const string& file);//run search engine @@ -31,6 +35,7 @@ class UserInterface { void displayResults(); void showText(Document& d); void stats(); + void topWordsHelper(Node* n);//populates frequency and word vectors void getTopWords();//prints out top 25 most frequent words DocParser& getDocParser();//used to access word/org/person avl trees QueryProcessor& getQueryProcessor(); diff --git a/Word.cpp b/Word.cpp index 3c923d4..95354d7 100644 --- a/Word.cpp +++ b/Word.cpp @@ -79,12 +79,14 @@ void Word::incrFreq(Document& doc) for (int i = 0; i < docs.size(); i++){ if (docs.at(i) == doc){//found doc frequency.at(i)++;//increment corresponding freq for existing doc + total++; return; } } //if function didn't return, no doc was found docs.push_back(doc);//add new doc to word's index frequency.push_back(1);//frequency that corresponds to the current doc + total++; } int Word::getDocFreq(Document& doc)