Skip to content

Commit

Permalink
fixed OR ranking display, cleaned up comments
Browse files Browse the repository at this point in the history
  • Loading branch information
karinashin committed May 2, 2022
1 parent f7f15f1 commit f43a1f5
Show file tree
Hide file tree
Showing 7 changed files with 85 additions and 122 deletions.
1 change: 0 additions & 1 deletion DocParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ DocParser::DocParser() {}
void DocParser::parse(const string& filename, StopWord& stop) {
numDocs++;

//TODO write to persistence file
//parse main text
rapidjson::Document doc;

Expand Down
98 changes: 39 additions & 59 deletions QueryProcessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,15 @@ void QueryProcessor::parseQuery(string& q, DSAVLTree<Word>& words, DSAVLTree<Wor
}

if (curr.getStr() == "and" || curr.getStr() == "or"){
vector<Word> wordList = parseAndOr();
if (curr.getStr() == "and"){//perform corresponding set operation
vector<Word> wordList = parseAndOr();
for (int i = 0; i < wordList.size(); i++){
intersection(wordList.at(i), words);
}
}
else{
setUnion(parseAndOr(), words);
// for (int i = 0; i < queryWords.size(); i++){
// cout << queryWords.at(i).getStr() << endl;
// }
for (int i = 0; i < wordList.size(); i++)
setUnion(wordList.at(i), words);
}
}
else if (curr.getStr() == "not"){
Expand All @@ -51,7 +49,6 @@ void QueryProcessor::parseQuery(string& q, DSAVLTree<Word>& words, DSAVLTree<Wor

if (words.contains(word1)){
complement(words.find(words.getRoot(), word1).getDocs());
// queryWords.push_back(words.find(words.getRoot(), word1));//won't have any frequency in docs since its "NOT"
}
else
cout << word1.getStr() << " is not found." << endl;
Expand Down Expand Up @@ -162,27 +159,21 @@ Word QueryProcessor::findPersonOrg()//operator is already removed from query
return person;
}

void QueryProcessor::setUnion(vector<Word> a, DSAVLTree<Word>& tree)//OR keyword
void QueryProcessor::setUnion(Word& word, DSAVLTree<Word> &tree)
{
for (int i = 0; i < a.size(); i++)//for every word object in the OR operator
vector<Document> temp;
if (tree.contains(word)){
queryWords.push_back(tree.find(tree.getRoot(), word));
temp = tree.find(tree.getRoot(), word).getDocs();
}
else
cout << word.getStr() << " not found" << endl;

for (int i = 0; i < temp.size(); i++)
{
vector<Document> temp;
if (tree.contains(a.at(i))){
temp = tree.find(tree.getRoot(), a.at(i)).getDocs();
// cout << "a.at(i): " << a.at(i).getStr() << endl;
queryWords.push_back(tree.find(tree.getRoot(), a.at(i)));
// cout << queryWords.at(queryWords.size()-1).getDocs().size();
}
else{
cout << a.at(i).getStr() << " is not found." << endl;
continue;
}
for (int d = 0; d < temp.size(); d++)//for every doc in the Word objects index
{
vector<Document>::iterator it = find(finalIndex.begin(), finalIndex.end(), temp.at(d));
if (it == finalIndex.end()){//if the doc is NOT in the final index, add it
finalIndex.push_back(temp.at(d));
}
vector<Document>::iterator it = find(finalIndex.begin(), finalIndex.end(), temp.at(i));
if (it == finalIndex.end()){//doc is NOT in the final index yet, so add it (union keeps every doc that contains the word)
finalIndex.push_back(temp.at(i));//append this doc from the word's doc list to finalIndex
}
}
}
Expand Down Expand Up @@ -274,53 +265,42 @@ void QueryProcessor::addPersonOrg(vector<Document>& a)//remove any docs from fin
void QueryProcessor::rankIndex()
{
cout << "Rank index" << endl;
// cout << "finalIndex size " << finalIndex.size() << endl;
// cout << "query words size: " << queryWords.size() << endl;
cout << "finalIndex size " << finalIndex.size() << endl;
cout << "query words size: " << queryWords.size() << endl;
vector<int> freqs; //corresponding total freqs for each doc in finalIndex
for (int queryIndex = 0; queryIndex < finalIndex.size(); queryIndex++)//for every doc in final index
{
int sum = 0;
for (int i = 0; i < queryWords.size(); i++)//for every word in query
{
//if its an OR query, not every word will be in every file of finalIndex
//get the each words frequency in the current doc and add them all together
sum += queryWords.at(i).getDocFreq(finalIndex.at(queryIndex));//add total freq of each word for this doc
// cout << queryWords.at(i).getDocFreq(finalIndex.at(queryIndex)) << " " << finalIndex.at(queryIndex).getPath() << endl;
}//doc not found
vector<Document>::iterator it = find(queryWords.at(i).getDocs().begin(), queryWords.at(i).getDocs().end(), finalIndex.at(queryIndex));
if (it != queryWords.at(i).getDocs().end())//this word is present in the doc
sum += queryWords.at(i).getDocFreq(finalIndex.at(queryIndex));//add total freq of each word for this doc
}
freqs.push_back(sum);
cout << "sum: " << sum << endl;
}
cout << "done with for loop" << endl;
//result: total frequency for each doc

// cout << "Frequency" << endl;
// for (int i = 0; i < freqs.size(); i++)
// cout << freqs.at(i) << " " << finalIndex.at(i).getPath() << endl;


//get the top 15 docs with the highest freq
// for (int n = 0; n < 15; n++){
//// if (n > freqs.size() || freqs.size() == 0)//less that 15 docs in the finalIndex
//// break;
// int highest = freqs.at(0);
// int index = 0;
// if (n > freqs.size())//less that 15 docs in the finalIndex
// break;
// for (int i = 1; i < freqs.size(); i++)//find the next highest freq
// {
// if (freqs.at(i) > highest){//get highest freq
// highest = freqs.at(i);
// index = i;
// }
// }
// best.push_back(finalIndex.at(index));//get the corresponding doc for that freq
//// cout << "next higheset frequency: " << freqs.at(index) << endl;
// freqs.erase(freqs.begin() + index);
// finalIndex.erase(finalIndex.begin() + index);
// }

// cout << "Best 15: " << endl;
// for (int i = best.size()-1; i >= 0; i--)
// cout << best.at(i).getPath() << endl;
for (int n = 0; n < 15; n++){
if (n > freqs.size() || freqs.size() == 0)//fewer than 15 docs in the finalIndex
break;
int highest = freqs.at(0);
int index = 0;
for (int i = 1; i < freqs.size(); i++)//find the next highest freq
{
if (freqs.at(i) > highest){//get highest freq
highest = freqs.at(i);
index = i;
}
}
best.push_back(finalIndex.at(index));//get the corresponding doc for that freq
freqs.erase(freqs.begin() + index);
finalIndex.erase(finalIndex.begin() + index);
}
}

bool QueryProcessor::specialStopCheck(StopWord& stop, string& word)
Expand Down
4 changes: 2 additions & 2 deletions QueryProcessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ class QueryProcessor {
vector<Word> parseAndOr();
Word findPersonOrg();//get the full name of the person being searched for (accounts for those w/ first + last names)

void setUnion(vector<Word> a, DSAVLTree<Word>& tree);//OR keyword
void intersection(Word& word, DSAVLTree<Word> & tree);//AND keyword
void setUnion(Word& word, DSAVLTree<Word>& tree);
void intersection(Word& word, DSAVLTree<Word>& tree);//AND keyword
void addTerm(vector<Document>& a);//add a single term's docs to the final
void complement(vector<Document>& a);//set subtraction
void addPersonOrg(vector<Document>& a);//remove any docs that don't include the given person or org
Expand Down
97 changes: 39 additions & 58 deletions UserInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,28 +13,37 @@ void UserInterface::run(const string& file)
bool go = true;

string choice;
while (choice != "1" && choice != "2"){
cout << "Enter 1 to parse files or 2 to use persistence file: " << endl;
cin.clear();
cin >> choice;
if (choice == "1"){
cout << "parsing..." << endl;
start = std::chrono::high_resolution_clock::now();
docReader.getFiles(file, stops);
end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> time_in_seconds = end - start;
cout << std::fixed << "Parsing Time: " << time_in_seconds.count() << endl;
cout << "done!" << endl;
}
else if (choice == "2"){
cout << "parsing..." << endl;
docReader.persistenceIndex();//TODO
cout << "done!" << endl;
}
else{
cout << "Incorrect input." << endl;
}
}
// while (choice != "1" && choice != "2"){
// cout << "Enter 1 to parse files or 2 to use persistence file: " << endl;
// cin.clear();
// cin >> choice;
// if (choice == "1"){
// cout << "parsing..." << endl;
// start = std::chrono::high_resolution_clock::now();
// docReader.getFiles(file, stops);
// end = std::chrono::high_resolution_clock::now();
// std::chrono::duration<double> time_in_seconds = end - start;
// cout << std::fixed << "Parsing Time: " << time_in_seconds.count() << endl;
// cout << "done!" << endl;
// }
// else if (choice == "2"){
// cout << "parsing..." << endl;
// docReader.persistenceIndex();//TODO
// cout << "done!" << endl;
// }
// else{
// cout << "Incorrect input." << endl;
// }
// }

//Parse documents
cout << "parsing..." << endl;
start = std::chrono::high_resolution_clock::now();
docReader.getFiles(file, stops);
end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> time_in_seconds = end - start;
cout << std::fixed << "Parsing Time: " << time_in_seconds.count() << endl;
cout << "done!" << endl;

while (go)
{
Expand All @@ -56,10 +65,13 @@ void UserInterface::run(const string& file)
cin >> choice;
if (choice == "0")//exit
break;
// else if (choice == "1" || choice == "2" || choice == "3" || choice == "4" || choice == "5" || choice == "6" || choice == "7" || choice == "8" || choice == "9" || choice == "10" || choice == "11" || choice == "12" || choice == "13" || choice == "14" || choice == "15"){
else if (stoi(choice) > 0 && stoi(choice) <= process.getBest().size()){
showText(process.getBest().at(stoi(choice) - 1));
cout << endl;
else if (choice == "1" || choice == "2" || choice == "3" || choice == "4" || choice == "5" || choice == "6" || choice == "7" || choice == "8" || choice == "9" || choice == "10" || choice == "11" || choice == "12" || choice == "13" || choice == "14" || choice == "15"){
if (stoi(choice) > 0 && stoi(choice) <= process.getBest().size()){
showText(process.getBest().at(stoi(choice) - 1));
cout << endl;
}
else
cout << "Incorrect input." << endl;
}
else
cout << "Incorrect input." << endl;
Expand Down Expand Up @@ -95,13 +107,6 @@ void UserInterface::clearIndex()
docReader.getPersonTree().deleteTree(docReader.getPersonTree().getRoot());
}

void UserInterface::parseDocs(const string& direct)
{
std::cout << "parsing documents..." << std::endl;
docReader.getFiles(direct, stops);
std::cout << "done parsing!" << std::endl;
}

void UserInterface::displayResults()//with ranking
{
if (process.getBest().size() == 0)
Expand All @@ -117,21 +122,6 @@ void UserInterface::displayResults()//with ranking
}
}

//void UserInterface::displayResults()//without ranking
//{
// if (process.getFinal().size() == 0)
// cout << "No results found" << endl;
//
// for (int i = 0; i < process.getFinal().size(); i++)
// {
// if (i == 15)
// break;
// cout << i + 1 << ") ";
// cout << "Title: " << process.getFinal().at(i).getTitle() << ", " << process.getFinal().at(i).getPub() << ", Date: " << process.getFinal().at(i).getDate() << endl;
// cout << "Path: " << process.getFinal().at(i).getPath() << endl;
// }
//}

void UserInterface::showText(Document& d)
{
rapidjson::Document doc;
Expand Down Expand Up @@ -174,20 +164,12 @@ void UserInterface::topWordsHelper(Node<Word>* n)
}
}

void UserInterface::getTopWords()
void UserInterface::getTopWords()//go through tree and get the frequency of each word (in order)
{
cout << "Get top words" << endl;
topWordsHelper(docReader.getWordTree().getRoot());
vector<Word> top;//top 25 words
// vector<int> freqs; //corresponding total freqs for each word
// vector<Word> all;//save words too? corresponding to freqs
//go through tree and get the frequency of each word (in order)
//result: total frequency for each doc

// cout << "Frequency" << endl;
// for (int i = 0; i < freqs.size(); i++)
// cout << freqs.at(i) << " " << finalIndex.at(i).getPath() << endl;

//get the top 25 most frequent words
cout << "Top 25 Most Frequent Words: " << endl;
for (int n = 0; n < 25; n++){
Expand All @@ -202,7 +184,6 @@ void UserInterface::getTopWords()
index = i;
}
}
// top.push_back(all.at(index));//get the corresponding doc for that freq
cout << all.at(index) << ": " << frequency.at(index) << endl;
frequency.erase(frequency.begin() + index);
all.erase(all.begin() + index);
Expand Down
1 change: 0 additions & 1 deletion UserInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ class UserInterface {

void run(const string& file);//run search engine
void clearIndex();//delete every tree
void parseDocs(const string& direct);//parse all documents
void displayResults();
void showText(Document& d);
void stats();
Expand Down
5 changes: 4 additions & 1 deletion Word.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,14 +81,17 @@ int Word::getDocFreq(Document& doc)
{
for (int i = 0; i < docs.size(); i++)
{
if (docs.at(i) == doc)
if (docs.at(i) == doc){
return frequency.at(i);//return corresponding freq. for given doc
}
}
cout << "Doc not found." << endl;
}

vector<int>& Word::getFrequency() { return frequency; }

int Word::getTotal() { return total; }

std::ostream& operator<< (std::ostream& out, const Word& w)
{
out << w.str;
Expand Down
1 change: 1 addition & 0 deletions stopWords.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
a
able
about
above
Expand Down

0 comments on commit f43a1f5

Please sign in to comment.