From 36e834ac13f26e99b83ed0dc756687d56c3dac79 Mon Sep 17 00:00:00 2001 From: karinashin Date: Mon, 18 Apr 2022 11:44:00 -0500 Subject: [PATCH] using avl tree for stop words, moved to parse class --- DocParser.cpp | 20 +++++++++++++++++++- DocParser.h | 3 +++ Word.cpp | 29 ++++++++++++++--------------- Word.h | 6 ++++-- main.cpp | 11 ++++++----- 5 files changed, 46 insertions(+), 23 deletions(-) diff --git a/DocParser.cpp b/DocParser.cpp index dce8bcc..033c81b 100644 --- a/DocParser.cpp +++ b/DocParser.cpp @@ -3,6 +3,17 @@ // #include "DocParser.h" +DocParser::DocParser() +{ + ifstream stop;//make the stop words AVL tree + stop.open("stopWords.txt"); + string curr; + while (getline(stop, curr))//make an avl tree of stop words + { + string s = curr.substr(0, curr.length()-1);//cut off end char + stops.insert(s); + } +} void DocParser::parse(const string& filename) { // cout << "NEW DOC: " << filename << endl; @@ -39,7 +50,7 @@ void DocParser::parse(const string& filename) { // cout << "text: " << text << endl; Word curr(text.substr(0, space)); curr.toLower();//remove caps - if (curr.isStopWord()){ + if (isStopWord(curr.getStr())){ text = text.substr(space + 1);//cut off curr word space = text.find(" "); continue;//don't add to tree @@ -48,6 +59,8 @@ void DocParser::parse(const string& filename) { curr.stemming(); // cout << "current: " << curr.getStr() << endl; //put unique words into the avl tree +// Node& found = words.find(words.getRoot(), curr);//ref to word in tree + if (!words.contains(curr)){//if the word is not already in the tree/new unique word curr.incrFreq(currDoc);//TODO combine contains and find words.insert(curr); @@ -80,6 +93,11 @@ vector& DocParser::findIndex(Word& obj) return words.find(words.getRoot(), obj).getDocs(); } +bool DocParser::isStopWord(string& str) +{ + return stops.contains(str);//if str is in the avl tree, its a stop word +} + vector& DocParser::findWordIndex(Word& w) { return w.getDocs(); } vector& DocParser::findOrgIndex(Word& org) { return org.getDocs(); } vector& DocParser::findPersonIndex(Word& p) {return p.getDocs(); } diff --git a/DocParser.h b/DocParser.h index 4c50ecd..4188dbd 100644 --- a/DocParser.h +++ b/DocParser.h @@ -23,11 +23,14 @@ class DocParser { DSAVLTree words; DSAVLTree orgs; DSAVLTree people; + DSAVLTree stops; public: + DocParser(); void parse(const string& filename);//parse the documents and create 3 AVLTrees void getFiles(const string& directory);//returns filenames for traversal through directory vector& findIndex(Word& obj);//return the document index of a given Word object + bool isStopWord(string& str); vector& findWordIndex(Word& w); vector& findOrgIndex(Word& org); diff --git a/Word.cpp b/Word.cpp index d227050..5470429 100644 --- a/Word.cpp +++ b/Word.cpp @@ -47,20 +47,19 @@ void Word::removePunc() str = buffer; } -bool Word::isStopWord() -{ - ifstream stop; - stop.open("stopWords.txt"); - string curr; - while (getline(stop, curr))//go through entire list of stop words - { - string s = curr.substr(0, curr.length()-1);//cut off end char - if (s == str){//the current string is a stop word - return true; - } - } - return false; -} +//bool Word::isStopWord() +//{ +// DSAVLTree stops; +// ifstream stop; +// stop.open("stopWords.txt"); +// string curr; +// while (getline(stop, curr))//make an avl tree of stop words +// { +// string s = curr.substr(0, curr.length()-1);//cut off end char +// stops.insert(s); +// } +// return stops.contains(str);//if str is in the avl tree, its a stop word +//} void Word::stemming() { @@ -68,7 +67,7 @@ void Word::stemming() Porter2Stemmer::stem(str); } -string Word::getStr() { return str; } +string& Word::getStr() { return str; } vector& Word::getDocs() { return docs; } void Word::printDocs() { diff --git a/Word.h b/Word.h index 3f43311..05c187f 100644 --- a/Word.h +++ b/Word.h @@ -11,6 +11,7 @@ #include #include "Document.h" #include "porter2_stemmer.h" +#include "DSAVLTree.h" using namespace std; class Word { @@ -18,6 +19,7 @@ class Word { string str; vector docs;//documents word appears in vector frequency;//frequency for each doc word appears in +// DSAVLTree stops;//stop words int total = 0; public: @@ -28,10 +30,10 @@ class Word { bool operator==(const Word& w); void toLower(); void removePunc(); - bool isStopWord(); +// bool isStopWord(); void stemming();//using porter2 stemming library - string getStr(); + string& getStr(); vector& getDocs(); void printDocs(); void incrFreq(Document& doc);//given a doc, incrememnt its corresponding freq. diff --git a/main.cpp b/main.cpp index 2c0d3c6..3a49e08 100644 --- a/main.cpp +++ b/main.cpp @@ -16,12 +16,13 @@ int main(int argc, char** argv) { // cout << parse.getWordTree().getCount(); Word w("investors"); w.stemming(); - if (parse.getWordTree().contains(w)){ - cout << "true" << endl; - parse.getWordTree().find(parse.getWordTree().getRoot(), w).printDocs(); - } - //data folder took 1:45 4/17 +// if (parse.getWordTree().contains(w)){ +// cout << "true" << endl; +// parse.getWordTree().find(parse.getWordTree().getRoot(), w).printDocs(); +// } + parse.getWordTree().find(parse.getWordTree().getRoot(), w).printDocs(); + //data folder took 1:45 4/17 // DSAVLTree tree; // Word a("a"); // Word z("z");