Skip to content

Commit

Permalink
using avl tree for stop words, moved to parse class
Browse files Browse the repository at this point in the history
  • Loading branch information
karinashin committed Apr 18, 2022
1 parent b65f767 commit 36e834a
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 23 deletions.
20 changes: 19 additions & 1 deletion DocParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,17 @@
//

#include "DocParser.h"
// Builds the stop-word AVL tree once, at construction time, from the file
// "stopWords.txt" (one word per line, resolved relative to the working
// directory). If the file cannot be opened, getline fails immediately and the
// tree is simply left empty.
DocParser::DocParser()
{
    ifstream stop;//make the stop words AVL tree
    stop.open("stopWords.txt");
    string curr;
    while (getline(stop, curr))//make an avl tree of stop words
    {
        // getline already removes '\n'. Only a CRLF-terminated line still
        // carries a trailing '\r', so strip exactly that. Unconditionally
        // dropping the last character would corrupt words in LF-terminated
        // files and on a final line that has no line terminator.
        if (!curr.empty() && curr.back() == '\r')
        {
            curr.pop_back();
        }
        stops.insert(curr);
    }
}

void DocParser::parse(const string& filename) {
// cout << "NEW DOC: " << filename << endl;
Expand Down Expand Up @@ -39,7 +50,7 @@ void DocParser::parse(const string& filename) {
// cout << "text: " << text << endl;
Word curr(text.substr(0, space));
curr.toLower();//remove caps
if (curr.isStopWord()){
if (isStopWord(curr.getStr())){
text = text.substr(space + 1);//cut off curr word
space = text.find(" ");
continue;//don't add to tree
Expand All @@ -48,6 +59,8 @@ void DocParser::parse(const string& filename) {
curr.stemming();
// cout << "current: " << curr.getStr() << endl;
//put unique words into the avl tree
// Node<Word>& found = words.find(words.getRoot(), curr);//ref to word in tree

if (!words.contains(curr)){//if the word is not already in the tree/new unique word
curr.incrFreq(currDoc);//TODO combine contains and find
words.insert(curr);
Expand Down Expand Up @@ -80,6 +93,11 @@ vector<Document>& DocParser::findIndex(Word& obj)
return words.find(words.getRoot(), obj).getDocs();
}

// Reports whether str is present in the stop-word tree (stops).
bool DocParser::isStopWord(string& str)
{
    // Membership in the tree is the entire test.
    const bool inStopTree = stops.contains(str);
    return inStopTree;
}

// Returns a reference to the document list held by the given word.
vector<Document>& DocParser::findWordIndex(Word& w)
{
    return w.getDocs();
}
// Returns a reference to the document list held by the given organization entry.
vector<Document>& DocParser::findOrgIndex(Word& org)
{
    return org.getDocs();
}
// Returns a reference to the document list held by the given person entry.
vector<Document>& DocParser::findPersonIndex(Word& p)
{
    return p.getDocs();
}
Expand Down
3 changes: 3 additions & 0 deletions DocParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,14 @@ class DocParser {
DSAVLTree<Word> words;
DSAVLTree<Word> orgs;
DSAVLTree<Word> people;
DSAVLTree<string> stops;

public:
DocParser();
void parse(const string& filename);//parse the documents and create 3 AVLTrees
void getFiles(const string& directory);//returns filenames for traversal through directory
vector<Document>& findIndex(Word& obj);//return the document index of a given Word object
bool isStopWord(string& str);

vector<Document>& findWordIndex(Word& w);
vector<Document>& findOrgIndex(Word& org);
Expand Down
29 changes: 14 additions & 15 deletions Word.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,28 +47,27 @@ void Word::removePunc()
str = buffer;
}

// Returns true when this word's text (str) matches a line of "stopWords.txt".
// The file is opened and scanned from the top on every call. If it cannot be
// opened, getline fails immediately and the result is false.
bool Word::isStopWord()
{
    ifstream stop;
    stop.open("stopWords.txt");
    string curr;
    while (getline(stop, curr))//go through entire list of stop words
    {
        // getline already removes '\n'. Strip a trailing '\r' only when it is
        // actually there (CRLF files). Always dropping the last character
        // would truncate real words in LF-terminated files and on a final
        // line that has no line terminator.
        if (!curr.empty() && curr.back() == '\r')
        {
            curr.pop_back();
        }
        if (curr == str){//the current string is a stop word
            return true;
        }
    }
    return false;
}
//bool Word::isStopWord()
//{
// DSAVLTree<string> stops;
// ifstream stop;
// stop.open("stopWords.txt");
// string curr;
// while (getline(stop, curr))//make an avl tree of stop words
// {
// string s = curr.substr(0, curr.length()-1);//cut off end char
// stops.insert(s);
// }
// return stops.contains(str);//if str is in the avl tree, its a stop word
//}

// Runs the Porter2 stemmer library over this word's text in place:
// trim first, then stem.
void Word::stemming()
{
    //TODO error undefined reference (reported on the trim call)
    Porter2Stemmer::trim(str);

    Porter2Stemmer::stem(str);
}

// Returns a copy of this word's text.
string Word::getStr()
{
    return str;
}
// Returns a mutable reference to this word's text (no copy is made).
string& Word::getStr()
{
    return str;
}
// Returns a mutable reference to the list of documents this word appears in.
vector<Document>& Word::getDocs()
{
    return docs;
}
void Word::printDocs()
{
Expand Down
6 changes: 4 additions & 2 deletions Word.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,15 @@
#include <iostream>
#include "Document.h"
#include "porter2_stemmer.h"
#include "DSAVLTree.h"

using namespace std;
class Word {
private:
string str;
vector<Document> docs;//documents word appears in
vector<int> frequency;//frequency for each doc word appears in
// DSAVLTree<string> stops;//stop words
int total = 0;

public:
Expand All @@ -28,10 +30,10 @@ class Word {
bool operator==(const Word& w);
void toLower();
void removePunc();
bool isStopWord();
// bool isStopWord();
void stemming();//using porter2 stemming library

string getStr();
string& getStr();
vector<Document>& getDocs();
void printDocs();
void incrFreq(Document& doc);//given a doc, incrememnt its corresponding freq.
Expand Down
11 changes: 6 additions & 5 deletions main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,13 @@ int main(int argc, char** argv) {
// cout << parse.getWordTree().getCount();
Word w("investors");
w.stemming();
if (parse.getWordTree().contains(w)){
cout << "true" << endl;
parse.getWordTree().find(parse.getWordTree().getRoot(), w).printDocs();
}
//data folder took 1:45 4/17
// if (parse.getWordTree().contains(w)){
// cout << "true" << endl;
// parse.getWordTree().find(parse.getWordTree().getRoot(), w).printDocs();
// }
parse.getWordTree().find(parse.getWordTree().getRoot(), w).printDocs();

//data folder took 1:45 4/17
// DSAVLTree<Word> tree;
// Word a("a");
// Word z("z");
Expand Down

0 comments on commit 36e834a

Please sign in to comment.