diff --git a/DocParser.cpp b/DocParser.cpp index 6b8de6a..4aed6ad 100644 --- a/DocParser.cpp +++ b/DocParser.cpp @@ -75,6 +75,12 @@ void DocParser::parse(const string& filename, StopWord& stop) { curr.removePunc();//remove punctuation curr.stemming(); + if (curr.getStr().empty()){//don't insert an empty string + text = text.substr(space + 1); + space = text.find(" "); + continue; + } + //put unique words into the avl tree if (!words.contains(curr)){//if the word is not already in the tree/new unique word curr.incrFreq(currDoc); diff --git a/QueryProcessor.cpp b/QueryProcessor.cpp index 1abb05a..57c80c7 100644 --- a/QueryProcessor.cpp +++ b/QueryProcessor.cpp @@ -88,7 +88,7 @@ void QueryProcessor::parseQuery(string& q, DSAVLTree& words, DSAVLTree QueryProcessor::parseAndOr() @@ -268,6 +268,7 @@ void QueryProcessor::addPersonOrg(vector& a)//remove any docs from fin void QueryProcessor::rankIndex() { + //TODO something with NOT operator doesn't work with ranking system cout << "Rank index" << endl; // cout << "finalIndex size " << finalIndex.size() << endl; // cout << "query words size: " << queryWords.size() << endl; @@ -279,20 +280,22 @@ void QueryProcessor::rankIndex() { //get the each words frequency in the current doc and add them all together sum += queryWords.at(i).getDocFreq(finalIndex.at(queryIndex));//add total freq of each word for this doc - cout << queryWords.at(i).getDocFreq(finalIndex.at(queryIndex)) << " " << finalIndex.at(queryIndex).getPath() << endl; +// cout << queryWords.at(i).getDocFreq(finalIndex.at(queryIndex)) << " " << finalIndex.at(queryIndex).getPath() << endl; } freqs.push_back(sum); // cout << "sum: " << sum << endl; } //result: total frequency for each doc - cout << "Frequency" << endl; - for (int i = 0; i < freqs.size(); i++) - cout << freqs.at(i) << " " << finalIndex.at(i).getPath() << endl; +// cout << "Frequency" << endl; +// for (int i = 0; i < freqs.size(); i++) +// cout << freqs.at(i) << " " << finalIndex.at(i).getPath() << endl; //get the top 15 docs with the highest freq for (int n = 0; n < 15; n++){ +// if (n > freqs.size() || freqs.size() == 0)//less that 15 docs in the finalIndex +// break; int highest = freqs.at(0); int index = 0; if (n > freqs.size())//less that 15 docs in the finalIndex @@ -305,7 +308,7 @@ void QueryProcessor::rankIndex() } } best.push_back(finalIndex.at(index));//get the corresponding doc for that freq - cout << "next higheset frequency: " << freqs.at(index) << endl; +// cout << "next higheset frequency: " << freqs.at(index) << endl; freqs.erase(freqs.begin() + index); finalIndex.erase(finalIndex.begin() + index); } diff --git a/UserInterface.cpp b/UserInterface.cpp index 441a55d..9be212a 100644 --- a/UserInterface.cpp +++ b/UserInterface.cpp @@ -1,5 +1,5 @@ // -// Created by 18476 on 4/23/2022. +// Created by Karina Shin on 4/23/2022. // #include "UserInterface.h" @@ -19,7 +19,11 @@ void UserInterface::run(const string& file) cin >> choice; if (choice == "1"){ cout << "parsing..." << endl; + start = std::chrono::high_resolution_clock::now(); docReader.getFiles(file, stops); + end = std::chrono::high_resolution_clock::now(); + std::chrono::duration time_in_seconds = end - start; + cout << std::fixed << "Parsing Time: " << time_in_seconds.count() << endl; cout << "done!" << endl; } else if (choice == "2"){ @@ -97,36 +101,36 @@ void UserInterface::parseDocs(const string& direct) std::cout << "done parsing!" << std::endl; } -//void UserInterface::displayResults()//with ranking -//{ -// if (process.getBest().size() == 0) -// cout << "No results found" << endl; -// -// for (int i = 0; i < process.getBest().size(); i++) -// { -// if (i == 15) -// break; -// cout << i + 1 << ") "; -// cout << "Title: " << process.getBest().at(i).getTitle() << ", " << process.getBest().at(i).getPub() << ", Date: " << process.getBest().at(i).getDate() << endl; -// cout << "Path: " << process.getBest().at(i).getPath() << endl; -// } -//} - -void UserInterface::displayResults()//without ranking +void UserInterface::displayResults()//with ranking { - if (process.getFinal().size() == 0) + if (process.getBest().size() == 0) cout << "No results found" << endl; - for (int i = 0; i < process.getFinal().size(); i++) + for (int i = 0; i < process.getBest().size(); i++) { if (i == 15) break; cout << i + 1 << ") "; - cout << "Title: " << process.getFinal().at(i).getTitle() << ", " << process.getFinal().at(i).getPub() << ", Date: " << process.getFinal().at(i).getDate() << endl; - cout << "Path: " << process.getFinal().at(i).getPath() << endl; + cout << "Title: " << process.getBest().at(i).getTitle() << ", " << process.getBest().at(i).getPub() << ", Date: " << process.getBest().at(i).getDate() << endl; + cout << "Path: " << process.getBest().at(i).getPath() << endl; } } +//void UserInterface::displayResults()//without ranking +//{ +// if (process.getFinal().size() == 0) +// cout << "No results found" << endl; +// +// for (int i = 0; i < process.getFinal().size(); i++) +// { +// if (i == 15) +// break; +// cout << i + 1 << ") "; +// cout << "Title: " << process.getFinal().at(i).getTitle() << ", " << process.getFinal().at(i).getPub() << ", Date: " << process.getFinal().at(i).getDate() << endl; +// cout << "Path: " << process.getFinal().at(i).getPath() << endl; +// } +//} + void UserInterface::showText(Document& d) { rapidjson::Document doc;