Skip to content

Commit

Permalink
made stop word class so I can access stop words in all classes
Browse files Browse the repository at this point in the history
  • Loading branch information
karinashin committed Apr 24, 2022
1 parent 8505f98 commit c62fe75
Show file tree
Hide file tree
Showing 10 changed files with 74 additions and 55 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ project(22s_final_proj)
set(CMAKE_CXX_STANDARD 17)

#add_executable(22s_final_proj main.cpp catch_setup.cpp DSAVLTree.h DocParser.cpp DocParser.h Word.cpp Word.h Document.cpp Document.h catchTests.cpp)
add_executable(22s_final_proj main.cpp DSAVLTree.h DocParser.cpp DocParser.h Word.cpp Word.h Document.cpp Document.h porter2_stemmer.cpp porter2_stemmer.h QueryProcessor.cpp QueryProcessor.h UserInterface.cpp UserInterface.h)
add_executable(22s_final_proj main.cpp DSAVLTree.h DocParser.cpp DocParser.h Word.cpp Word.h Document.cpp Document.h porter2_stemmer.cpp porter2_stemmer.h QueryProcessor.cpp QueryProcessor.h UserInterface.cpp UserInterface.h StopWord.cpp StopWord.h)
30 changes: 5 additions & 25 deletions DocParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,9 @@
//

#include "DocParser.h"
DocParser::DocParser()
{
ifstream stop;//make the stop words AVL tree
stop.open("stopWords.txt");
string curr;
while (getline(stop, curr))//make an avl tree of stop words
{
string s = curr.substr(0, curr.length()-1);//cut off end char
stops.insert(s);
}
}

DocParser::DocParser(DSAVLTree<string>& stopWords)
{
//TODO
}
DocParser::DocParser() {}

void DocParser::parse(const string& filename) {
void DocParser::parse(const string& filename, StopWord& stop) {
cout << "NEW DOC: " << filename << endl;
//TODO parse for org and person, put unique ones into avl tree
//parse main text
Expand Down Expand Up @@ -58,7 +43,7 @@ void DocParser::parse(const string& filename) {
Word curr(text.substr(0, space));
// cout << "current: " << curr.getStr() << endl;
curr.toLower();//remove caps
if (isStopWord(curr.getStr())){
if (stop.isStopWord(curr.getStr())){
text = text.substr(space + 1);//cut off curr word
space = text.find(" ");
continue;//don't add to tree
Expand Down Expand Up @@ -89,13 +74,13 @@ void DocParser::parse(const string& filename) {
}
}

void DocParser::getFiles(const string& directory)
void DocParser::getFiles(const string& directory, StopWord& stop)
{
for (const auto & entry : fs::recursive_directory_iterator(directory)){
if (entry.is_regular_file()) {
if (entry.path().extension().string() == ".json") {
string filename = entry.path().c_str();
parse(filename);
parse(filename, stop);
}
}
}
Expand All @@ -107,11 +92,6 @@ vector<Document>& DocParser::findIndex(Word& obj)
// return words.find(words.getRoot(), obj)->getData().getDocs();
}

bool DocParser::isStopWord(string& str)
{
return stops.contains(str);//if str is in the avl tree, its a stop word
}

// Returns the document list stored on the given word entry.
vector<Document>& DocParser::findWordIndex(Word& w)
{
    return w.getDocs();
}
// Returns the document list stored on the given organization entry.
vector<Document>& DocParser::findOrgIndex(Word& org)
{
    return org.getDocs();
}
// Returns the document list stored on the given person entry.
vector<Document>& DocParser::findPersonIndex(Word& p)
{
    return p.getDocs();
}
Expand Down
9 changes: 3 additions & 6 deletions DocParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "DSAVLTree.h"
#include "Word.h"
#include "include/rapidjson/document.h"
#include "StopWord.h"

using namespace std;
namespace fs = std::filesystem;
Expand All @@ -23,16 +24,12 @@ class DocParser {
DSAVLTree<Word> words;
DSAVLTree<Word> orgs;
DSAVLTree<Word> people;
DSAVLTree<string> stops;

public:
DocParser();
DocParser(DSAVLTree<string>& stopWords);
void parse(const string& filename);//parse the documents and create 3 AVLTrees
// void parse(DSAVLTree<Word>& words, DSAVLTree<Word> orgs, DSAVLTree<Word> people);
void getFiles(const string& directory);//returns filenames for traversal through directory
void parse(const string& filename, StopWord& stop);//parse the documents and create 3 AVLTrees
void getFiles(const string& directory, StopWord& stop);//returns filenames for traversal through directory
vector<Document>& findIndex(Word& obj);//return the document index of a given Word object
bool isStopWord(string& str);

vector<Document>& findWordIndex(Word& w);
vector<Document>& findOrgIndex(Word& org);
Expand Down
5 changes: 3 additions & 2 deletions QueryProcessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,16 @@ QueryProcessor::QueryProcessor(DSAVLTree<Word>& w, DSAVLTree<Word>& o, DSAVLTree
// people = p;
}

void QueryProcessor::parseQuery(string& query)//parse query
void QueryProcessor::parseQuery(string& query, StopWord& stop)//parse query
{
std::cout << "NEW QUERY: " << query << std::endl;
int space;
while (space != -1)
{
space = query.find(" ");
string curr = query.substr(0, space);
//TODO remove stop words and stem
if (stop.isStopWord(curr))
continue;//skip stop words

if (curr == "AND" || curr == "OR"){//2 arg operators
query = query.substr(space + 1);//cut off operator
Expand Down
3 changes: 2 additions & 1 deletion QueryProcessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "Document.h"
#include "DSAVLTree.h"
#include "Word.h"
#include "StopWord.h"

class QueryProcessor {
private:
Expand All @@ -23,7 +24,7 @@ class QueryProcessor {
public:
QueryProcessor();
QueryProcessor(DSAVLTree<Word>& w, DSAVLTree<Word>& o, DSAVLTree<Word>& p);
void parseQuery(string& query);//parse query
void parseQuery(string& query, StopWord& stop);//parse query
void setUnion(vector<Document>& a, vector<Document>& b);//OR keyword
void intersection(vector<Document>& a, vector<Document>&b);//AND keyword
void complement(vector<Document>& a);//set subtraction
Expand Down
21 changes: 21 additions & 0 deletions StopWord.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
//
// Created by Karina Shin on 4/23/2022.
//

#include "StopWord.h"

/**
 * Builds the stop-word tree by reading "stopWords.txt" (one entry per line,
 * path relative to the process's working directory).
 *
 * Each line is inserted into `stops` after removing a single trailing
 * carriage return, if one is present. If the file cannot be opened, a
 * warning is written to stderr and the tree is left empty.
 */
StopWord::StopWord()
{
    ifstream stop("stopWords.txt");
    if (!stop.is_open()) {
        // Without this check every isStopWord() call would quietly return
        // false and stop words would end up in the index unnoticed.
        cerr << "warning: could not open stopWords.txt; no stop words loaded" << endl;
        return;
    }

    string curr;
    while (getline(stop, curr))//make an avl tree of stop words
    {
        // getline already removes '\n'. Only a CRLF-terminated line still
        // carries a '\r'; dropping the last character unconditionally would
        // truncate real words in LF files and on an unterminated final line.
        if (!curr.empty() && curr.back() == '\r')
            curr.pop_back();
        stops.insert(curr);
    }
}
// Reports whether str is present in the stop-word tree.
bool StopWord::isStopWord(string& str)
{
    const bool found = stops.contains(str);
    return found;
}
24 changes: 24 additions & 0 deletions StopWord.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
//
// Created by Karina Shin on 4/23/2022.
//

#ifndef INC_22S_FINAL_PROJ_STOPWORD_H
#define INC_22S_FINAL_PROJ_STOPWORD_H

#include <iostream>
#include <fstream>
#include <string>
#include "DSAVLTree.h"

using namespace std;

// Owns the stop-word lookup tree and exposes a membership query on it.
class StopWord {
public:
    // Declared here, defined in StopWord.cpp.
    StopWord();

    // True when str is stored in the stop-word tree.
    bool isStopWord(string& str);

private:
    // AVL tree holding every stop word as a plain string.
    DSAVLTree<string> stops;
};


#endif //INC_22S_FINAL_PROJ_STOPWORD_H
22 changes: 7 additions & 15 deletions UserInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,7 @@

#include "UserInterface.h"

UserInterface::UserInterface()
{
ifstream stop;//make the stop words AVL tree
stop.open("stopWords.txt");
string curr;
while (getline(stop, curr))//make an avl tree of stop words
{
string s = curr.substr(0, curr.length()-1);//cut off end char
stops.insert(s);
}
}
UserInterface::UserInterface() {}

void UserInterface::clearIndex()
{
Expand All @@ -24,10 +14,10 @@ void UserInterface::clearIndex()
//TODO also erase contents of persistence file?
}

void UserInterface::parseDocs(string& direct)
void UserInterface::parseDocs(const string& direct)
{
std::cout << "parsing documents..." << std::endl;
docReader.parse(direct);
docReader.getFiles(direct, stops);
std::cout << "done parsing!" << std::endl;
}

Expand All @@ -38,10 +28,12 @@ void UserInterface::persistentIndex()

void UserInterface::enterQuery(string& query)
{
process.parseQuery(query);
process.parseQuery(query, stops);
}

void UserInterface::stats()
{

}
}

// Gives callers access to the DocParser member owned by this interface.
DocParser& UserInterface::getDocParser()
{
    return docReader;
}
6 changes: 4 additions & 2 deletions UserInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,24 +10,26 @@
#include "DSAVLTree.h"
#include "Word.h"
#include "Document.h"
#include "StopWord.h"

class UserInterface {
private:
DocParser docReader;
QueryProcessor process;
StopWord stops;

DSAVLTree<Word> words;
DSAVLTree<Word> orgs;
DSAVLTree<Word> persons;
DSAVLTree<string> stops;

public:
UserInterface();
void clearIndex();//delete every tree
void parseDocs(string& direct);//parse all documents
void parseDocs(const string& direct);//parse all documents
void persistentIndex();//read in persistence file to index words
void enterQuery(string& query);
void stats();
DocParser& getDocParser();//used to access word/org/person avl trees
};


Expand Down
7 changes: 4 additions & 3 deletions main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,21 @@
#include "DSAVLTree.h"
#include "Word.h"
#include "QueryProcessor.h"
#include "UserInterface.h"

using namespace std;
int main(int argc, char** argv) {
//TODO find out where to put stop words list so that it can be used in all classes
//just have the avl tree indexes in the user interface class and then pass it everywhere else?
//can't have multiple avl trees in each class (doc parser and query), should just have one set

DocParser parse;
UserInterface parse;
cout << "parsing..." << endl;
parse.getFiles(argv[2]);//absolute path
parse.parseDocs(argv[2]);//absolute path
cout << "done!" << endl;
Word w(argv[1]);//search term
w.stemming();
parse.getWordTree().find(parse.getWordTree().getRoot(), w).printDocs();
parse.getDocParser().getWordTree().find(parse.getDocParser().getWordTree().getRoot(), w).printDocs();
// parse.getWordTree().find(parse.getWordTree().getRoot(), w)->getData().printDocs();

// vector<int> finalIndex;
Expand Down

0 comments on commit c62fe75

Please sign in to comment.