Skip to content

Commit

Permalink
fixed AND/OR + PERSON/ORG queries, added top 25 ranking, fixed stop w…
Browse files Browse the repository at this point in the history
…ords
  • Loading branch information
karinashin committed May 1, 2022
1 parent 9c989c0 commit c5af51a
Show file tree
Hide file tree
Showing 6 changed files with 49 additions and 32 deletions.
1 change: 1 addition & 0 deletions DocParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ void DocParser::parse(const string& filename, StopWord& stop) {
// cout << "current: " << curr.getStr() << endl;
curr.toLower();//remove caps
if (stop.isStopWord(curr.getStr())){
// cout << "stop word found" << endl;
text = text.substr(space + 1);//cut off curr word
space = text.find(" ");
continue;//don't add to tree
Expand Down
34 changes: 15 additions & 19 deletions QueryProcessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@ void QueryProcessor::parseQuery(string& q, DSAVLTree<Word>& words, DSAVLTree<Wor
curr.toLower();
cout << curr.getStr() << endl;
if (specialStopCheck(stop, curr.getStr()))
{
query = query.substr(space + 1);//cut off word
continue;//skip stop words
}

if (curr.getStr() == "and" || curr.getStr() == "or"){
if (curr.getStr() == "and"){//perform corresponding set operation
Expand Down Expand Up @@ -54,7 +57,6 @@ void QueryProcessor::parseQuery(string& q, DSAVLTree<Word>& words, DSAVLTree<Wor
}
else
cout << word1.getStr() << " is not found." << endl;
// complement(words.find(words.getRoot(), word1)->getData().getDocs());
}
else if (curr.getStr() == "org"){
query = query.substr(space + 1);//cut off operator
Expand Down Expand Up @@ -95,10 +97,6 @@ void QueryProcessor::parseQuery(string& q, DSAVLTree<Word>& words, DSAVLTree<Wor
}
space = query.find(" ");//to check if youve reached the end of the query
}
// cout << "Query words: " << endl;
// for (int i = 0; i < queryWords.size(); i++)
// cout << queryWords.at(i) << endl;
// cout << "done parsing query" << endl;

// rankIndex();//TODO
}
Expand All @@ -122,7 +120,6 @@ vector<Word> QueryProcessor::parseAndOr()
word = query.substr(0, space);
word.stemming();
args.push_back(word);
// queryWords.push_back(word);//for ranking
query = query.substr(space + 1);
}
else
Expand All @@ -134,7 +131,6 @@ vector<Word> QueryProcessor::parseAndOr()
query = "";//empty query
word.stemming();
args.push_back(word);
// queryWords.push_back(word);//for ranking
break;
}
}
Expand Down Expand Up @@ -170,9 +166,7 @@ Word QueryProcessor::findPersonOrg()//operator is already removed from query
space = query.find(" ");
count++;
}
// cout << "Name: " << name << endl;
Word person(name);
// queryWords.push_back(person);//for ranking

return person;
}
Expand All @@ -181,7 +175,6 @@ void QueryProcessor::setUnion(vector<Word> a, DSAVLTree<Word>& tree)//OR keyword
{
for (int i = 0; i < a.size(); i++)//for every word object in the OR operator
{
// cout << a.at(i).getStr() << endl;
vector<Document> temp;
if (tree.contains(a.at(i))){
temp = tree.find(tree.getRoot(), a.at(i)).getDocs();
Expand Down Expand Up @@ -210,6 +203,7 @@ void QueryProcessor::intersection(Word& word, DSAVLTree<Word>& tree)//AND keywor
cout << "if statement" << endl;
if (tree.contains(word))
{
queryWords.push_back(tree.find(tree.getRoot(), word));
vector<Document> temp = tree.find(tree.getRoot(), word).getDocs();
for (int i = 0; i < temp.size(); i++)
finalIndex.push_back(temp.at(i));
Expand All @@ -220,21 +214,23 @@ void QueryProcessor::intersection(Word& word, DSAVLTree<Word>& tree)//AND keywor
else
{
vector<Document> temp;
if (tree.contains(word))
if (tree.contains(word)){
queryWords.push_back(tree.find(tree.getRoot(), word));
temp = tree.find(tree.getRoot(), word).getDocs();
}
else
cout << word.getStr() << " not found" << endl;
cout << "else statment" << endl;
// cout << "else statment" << endl;
for (int i = 0; i < temp.size(); i++)
{
cout << "inner for loop running" << endl;
// cout << "inner for loop running" << endl;
vector<Document>::iterator it = find(finalIndex.begin(), finalIndex.end(), temp.at(i));
if (it != finalIndex.end()){//doc of a exists in final, only add docs that are in word.getDocs and final
finalList.push_back(*it);//add the docs that contain the word from the finalIndex
cout << "added to final list" << endl;
// cout << "added to final list" << endl;
}
}
cout << "Done" << endl;
// cout << "Done" << endl;
finalIndex.clear();
finalIndex = finalList;//should only contain docs that contain the word
}
Expand Down Expand Up @@ -264,8 +260,8 @@ void QueryProcessor::complement(vector<Document>& a)//delete set a from finalInd
}

void QueryProcessor::addPersonOrg(vector<Document>& a)//remove any docs from final that don't include the person/org
{//doesn't work with AND EX: AND investor stock PERSON cramer
cout << "Add person/org" << endl;//TODO 21883 is being printed with random chars
{
cout << "Add person/org" << endl;
//if finalIndex already has values, remove any docs that don't contain person/org
//else: query only has person/org keywords, just add the files that contain the person/org
vector<Document> personList;
Expand All @@ -277,7 +273,7 @@ void QueryProcessor::addPersonOrg(vector<Document>& a)//remove any docs from fin
cout << "person for" << endl;
// vector<Document>::iterator it = find(a.begin(), a.end(), finalIndex.at(i));
vector<Document>::iterator it = find(finalIndex.begin(), finalIndex.end(), a.at(i));
if (it != a.end()){//doc of person/org list exists in final index, keep
if (it != finalIndex.end()){//doc of person/org list exists in final index, keep
// cout << "does not exist in doc list" << endl;
personList.push_back(*it);
cout << "added " << it->getPath() << endl;
Expand Down Expand Up @@ -350,7 +346,7 @@ void QueryProcessor::rankIndex()

bool QueryProcessor::specialStopCheck(StopWord& stop, string& word)
{
if (word == "AND" || word == "OR" || word == "NOT")//exclude key words
if (word == "and" || word == "or" || word == "not")//exclude key words
return false;
else if (stop.isStopWord(word))
return true;
Expand Down
2 changes: 2 additions & 0 deletions StopWord.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ StopWord::StopWord()
{
ifstream stop;//make the stop words AVL tree
stop.open("stopWords.txt");
// if (stop.is_open())
// cout << "OPened stops" << endl;
string curr;
while (getline(stop, curr))//make an avl tree of stop words
{
Expand Down
37 changes: 24 additions & 13 deletions UserInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,36 +162,47 @@ void UserInterface::stats()
getTopWords();
}

//Walks the subtree rooted at n in order (left subtree, node, right subtree).
//For each node visited, the value of getTotal() is appended to `frequency`
//and the node's Word is appended to `all`, so both vectors share indices.
void UserInterface::topWordsHelper(Node<Word>* n)
{
    if (n == nullptr)
        return;//empty subtree: nothing to record

    topWordsHelper(n->getLeft());

    //record this node: frequency.at(k) always describes all.at(k)
    frequency.push_back(n->getData().getTotal());
    all.push_back(n->getData());

    topWordsHelper(n->getRight());
}

//Prints the 25 most frequent words in the word tree, highest first, one per
//line as "<word>: <total frequency>". If the tree holds fewer than 25 words,
//every word is printed. Ties go to the word met first in the in-order walk.
//
//Side effects: rebuilds the `frequency` and `all` member vectors from the
//word tree, then erases each printed entry from both of them.
void UserInterface::getTopWords()
{
    cout << "Get top words" << endl;

    //topWordsHelper only appends, so leftovers from a previous call would be
    //counted again; start from empty vectors every time
    frequency.clear();
    all.clear();
    topWordsHelper(docReader.getWordTree().getRoot());

    const size_t topCount = 25;//how many words to report

    //get the top 25 most frequent words
    cout << "Top 25 Most Frequent Words: " << endl;
    //stop early once every word has been printed; checking emptiness before
    //touching the vector avoids at(0) throwing on an empty/exhausted list
    for (size_t n = 0; n < topCount && !frequency.empty(); n++){
        size_t index = 0;//position of the highest remaining frequency
        for (size_t i = 1; i < frequency.size(); i++)//find the next highest freq
        {
            if (frequency.at(i) > frequency.at(index))//strict > keeps the first of equal freqs
                index = i;
        }
        cout << all.at(index) << ": " << frequency.at(index) << endl;
        //remove the reported entry from both vectors so they stay aligned
        //and the next pass finds the next highest frequency
        frequency.erase(frequency.begin() + index);
        all.erase(all.begin() + index);
    }
}
Expand Down
5 changes: 5 additions & 0 deletions UserInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ class UserInterface {
QueryProcessor process;
StopWord stops;

//for ranking
vector<int> frequency;//frequencies of words
vector<Word> all;//vector of all words

public:
UserInterface();
void run(const string& file);//run search engine
Expand All @@ -31,6 +35,7 @@ class UserInterface {
void displayResults();
void showText(Document& d);
void stats();
void topWordsHelper(Node<Word>* n);//populates frequency and word vectors
void getTopWords();//prints out top 25 most frequent words
DocParser& getDocParser();//used to access word/org/person avl trees
QueryProcessor& getQueryProcessor();
Expand Down
2 changes: 2 additions & 0 deletions Word.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,12 +79,14 @@ void Word::incrFreq(Document& doc)
for (int i = 0; i < docs.size(); i++){
if (docs.at(i) == doc){//found doc
frequency.at(i)++;//increment corresponding freq for existing doc
total++;
return;
}
}
//if function didn't return, no doc was found
docs.push_back(doc);//add new doc to word's index
frequency.push_back(1);//frequency that corresponds to the current doc
total++;
}

int Word::getDocFreq(Document& doc)
Expand Down

0 comments on commit c5af51a

Please sign in to comment.