forked from SMUCSE2341/22s-final-project-karinashin
-
Notifications
You must be signed in to change notification settings - Fork 0
/
DocParser.cpp
117 lines (96 loc) · 3.48 KB
/
DocParser.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
//
// Created by Karina Shin on 4/9/2022.
//
#include "DocParser.h"
// Default constructor: performs no work of its own; every member is
// default-initialized.
// NOTE(review): parse() increments numDocs, so confirm the header gives
// numDocs an initial value (e.g. an in-class initializer).
DocParser::DocParser() = default;
void DocParser::parse(const string& filename, StopWord& stop) {
numDocs++;
//parse main text
rapidjson::Document doc;
//for every file in the folder
ifstream stream;
stream.open(filename);
string wholeFile;
string temp;
while (getline(stream, temp))
wholeFile += temp;
stream.close();
doc.Parse(wholeFile.c_str());
if (!doc.IsObject()) cout << "somethings wrong" << endl;
//make Document object for current file
string title = doc["title"].GetString();
string date = doc["thread"]["published"].GetString();
string id = doc["uuid"].GetString();
string pub = doc["thread"]["site"].GetString();//TODO wheres the pub? this is the site name
Document currDoc(title, pub, date, filename, id);//make doc object for this file
//checks
doc["entities"].IsObject();
doc["entities"].IsArray();
for (auto &v : doc["entities"]["persons"].GetArray()) {//parse for person
Word person(v["name"].GetString());
if (!people.contains(person)){//if the word is not already in the tree/new unique word
person.incrFreq(currDoc);
people.insert(person);
}
else{
people.find(people.getRoot(), person).incrFreq(currDoc);//index document on object in tree
}
}
for (auto &v : doc["entities"]["organizations"].GetArray()) {//parse for orgs
Word o(v["name"].GetString());
if (!orgs.contains(o)){//if the word is not already in the tree/new unique word
o.incrFreq(currDoc);
orgs.insert(o);
}
else{
orgs.find(orgs.getRoot(), o).incrFreq(currDoc);//index document on object in tree
}
}
string text = doc["text"].GetString();
int space;
while (space != -1)//WORDS
{
space = text.find(" ");
Word curr(text.substr(0, space));
curr.toLower();//remove caps
curr.removePunc();//remove punctuation
curr.stemming();
if (stop.isStopWord(curr.getStr())){
text = text.substr(space + 1);//cut off curr word
space = text.find(" ");
continue;//don't add to tree
}
if (curr.getStr().empty()){//don't insert an empty string
text = text.substr(space + 1);
space = text.find(" ");
continue;
}
//put unique words into the avl tree
if (!words.contains(curr)){//if the word is not already in the tree/new unique word
curr.incrFreq(currDoc);
words.insert(curr);
}
else{
words.find(words.getRoot(), curr).incrFreq(currDoc);//index document on object in tree
}
text = text.substr(space + 1);//cut off curr word
}
}
/**
 * Recursively walks `directory` and parses every regular file whose extension
 * is ".json".
 *
 * @param directory root folder to scan
 * @param stop      stop-word list forwarded to parse() for each file
 */
void DocParser::getFiles(const string& directory, StopWord& stop)
{
    for (const auto& entry : fs::recursive_directory_iterator(directory)) {
        if (!entry.is_regular_file() || entry.path().extension() != ".json")
            continue;
        // path::string() always yields a narrow std::string, whereas
        // path::c_str() returns const wchar_t* on Windows and would not
        // convert to string there.
        parse(entry.path().string(), stop);
    }
}
// Placeholder for rebuilding the word index from a persistence file.
// The body is currently empty, so calling this function does nothing.
void DocParser::persistenceIndex()//read in persistence file to index words
{
}
// Returns a mutable reference to the `words` tree, which parse() fills with
// the tokens of each document's text.
DSAVLTree<Word>& DocParser::getWordTree()
{
    return this->words;
}
// Returns a mutable reference to the `orgs` tree, which parse() fills with
// organization names.
DSAVLTree<Word>& DocParser::getOrgTree()
{
    return this->orgs;
}
// Returns a mutable reference to the `people` tree, which parse() fills with
// person names.
DSAVLTree<Word>& DocParser::getPersonTree()
{
    return this->people;
}
int DocParser::getNumDocs() { return numDocs; }