Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feedback #1

Open
wants to merge 31 commits into
base: feedback
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
bdeac3c
Setting up GitHub Classroom Feedback
github-classroom[bot] Apr 6, 2022
7a5873e
Begun AVL Tree Implementation
Apr 11, 2022
2cb5914
AVL Rotation functions have been implemented, still need to test a lo…
Apr 12, 2022
b5b679b
Have tested RR, RL, LR, and LL rotations in main, trying to figure ou…
Apr 12, 2022
fc4effe
Added json folder to project
Apr 12, 2022
b0056d2
Fixed rapidjson header
Apr 12, 2022
de2244c
Imported stop words, parser can read all input datafile names recursi…
Apr 13, 2022
20f7331
Debugging Porter stemming errors
Apr 15, 2022
bf628c2
Debugging undefined reference error for stemmer
Apr 18, 2022
cf907d0
Can now access all persons, organizations, and the text of a given JS…
Apr 18, 2022
90156f4
Can access people, orgs, and text for all JSON files within a path
Apr 18, 2022
d556f50
Implemented functionality to separate text into words, and trimming/s…
Apr 18, 2022
6fa36e4
Can stem words and add to avl tree, working on algo to add all documents
Apr 19, 2022
93d68cb
Can run in Debug mode fine, but getting segfault on release mode
Apr 19, 2022
cfba30b
Although slow, can successfully check all documents to see if it cont…
Apr 19, 2022
a17775f
Taking 10min+ for 6000 files, trying to increase efficiency
Apr 19, 2022
2758bd3
Refactored code by splitting responsibilities between Parser class an…
Apr 20, 2022
52f5d23
Began implementation of QueryProcessor
Apr 20, 2022
0abfa2a
Can now store search components into respective vectors (Ands, ors, n…
Apr 20, 2022
3125bb3
Developing set logic (intersections and unions)
Apr 25, 2022
4ed6d89
Added ostream operator for term and inorder traversal for AVL tree to…
Apr 26, 2022
db5a6dd
Am able to create persistence file with numbered words with all respe…
Apr 26, 2022
6800ca2
Can now efficiently read from persistence index (saves a TON of time)
Apr 27, 2022
1423ce7
Have created the menu framework
Apr 27, 2022
8325a08
Can now independently populate each persistence index (terms, orgs, p…
Apr 28, 2022
2969363
Intersections and Unions seem to be working, will test more tomorrow
Apr 28, 2022
b7a21d6
All the operators appear to be working, placed persistence index into…
Apr 29, 2022
043df67
After running for ~50 hrs in generating persistence index for the ter…
May 2, 2022
13f3f89
Recloned repo and pasted changes from first repo
May 3, 2022
5f0ec5e
Probable final push
May 3, 2022
94a684a
Actual final push
May 3, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
data/
cmake-build-debug/
cmake-build-release/
persistence_index/
.idea/
8 changes: 5 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# NOTE(review): the paired lines below (two cmake_minimum_required, two CMAKE_CXX_STANDARD,
# two add_executable for the same target) look like the removed and added sides of a diff
# shown together - confirm that only one of each exists in the real file.
cmake_minimum_required(VERSION 3.20)
cmake_minimum_required(VERSION 3.16)
project(22s_final_proj)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD 17)

add_executable(22s_final_proj main.cpp catch_setup.cpp)
# NOTE(review): configure_file is documented as configure_file(<input> <output> ...); only one
# path precedes NEWLINE_STYLE here - confirm this call does what was intended.
configure_file(file_cleanup/stop_words.txt NEWLINE_STYLE UNIX)

# Search-engine executable: AVL tree, Term, parser/index/query/engine classes, the Porter2
# stemmer sources and DSDoc.
# NOTE(review): DSDoc.cpp and DSDoc.h are each listed twice at the end of the source list.
add_executable(22s_final_proj main.cpp data_structures/AVLTree.h Term.cpp Term.h engine_architecture/Parser.cpp engine_architecture/Parser.h file_cleanup/stop_words.txt file_cleanup/porter2_stemmer-master/porter2_stemmer.h file_cleanup/porter2_stemmer-master/porter2_stemmer.cpp file_cleanup/porter2_stemmer-master/util/hash.h file_cleanup/porter2_stemmer-master/util/string_view.h engine_architecture/IndexHandler.cpp engine_architecture/IndexHandler.h engine_architecture/QueryProcessor.cpp engine_architecture/QueryProcessor.h engine_architecture/SearchEngine.cpp engine_architecture/SearchEngine.h DSDoc.cpp DSDoc.h DSDoc.cpp DSDoc.h)
23 changes: 23 additions & 0 deletions Catch_Tests/AVL_tests.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#include "../catch.hpp"
#include "../data_structures/AVLTree.h"

// Catch test case covering the AVL tree node and tree types from AVLTree.h.
TEST_CASE("AVL Tree Implementation") {

// A node built with element 5 and two null children must report exactly those values.
SECTION("AVL Node") {
// presumably the arguments are (element, left, right, height) - verify against AVLTree.h
AVLNode<int> node (5, nullptr, nullptr, 0);
CHECK(node.left == nullptr);
CHECK(node.right == nullptr);
CHECK(node.element == 5);
}

// Inserts 3, 2, 1, 4, 6 into an empty tree.
// NOTE(review): this section contains no CHECK/REQUIRE, so it only verifies that the
// inserts run without crashing - presumably meant to exercise a rebalancing case; confirm.
SECTION("Case 1 functionality") {
AVLTree<int> tree;
tree.insert(3);
tree.insert(2);
tree.insert(1);
tree.insert(4);
tree.insert(6);

}

}
112 changes: 112 additions & 0 deletions DSDoc.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
//
// Created by wnahl on 5/2/2022.
//

#include "DSDoc.h"

using namespace std;

/**
 * Loads the JSON document stored at `filepath`, parses it, and scores it
 * against the given search terms.
 *
 * @param filepath path of the JSON file to load
 * @param terms    search terms used to compute the relevancy score
 *
 * If the file cannot be read, is not valid JSON, or has no string "text"
 * member, the relevancy is left at 0.0 instead of indexing into an invalid
 * DOM (which asserts in debug builds and is undefined behaviour otherwise).
 */
DSDoc::DSDoc(std::string filepath, std::vector<std::string> terms)
    : filepath(filepath), terms(terms), relevancy(0.0) {
    // Slurp the file line by line; an unopened stream simply yields no lines.
    std::string whole_file;
    std::string cur_line;
    std::ifstream in(this->filepath);
    while (std::getline(in, cur_line)) {
        whole_file += cur_line;
        whole_file += '\n';
    }
    // The stream is closed by its destructor.

    doc.Parse(whole_file.c_str());

    // Only score documents whose DOM actually carries a text body.
    const bool has_text = !doc.HasParseError() && doc.IsObject()
                          && doc.HasMember("text") && doc["text"].IsString();
    if (has_text) {
        relevancy = get_relevancy();
    }
}

/**
 * Copy constructor.
 *
 * rapidjson::Document is not copy-constructible, so the DOM is deep-copied
 * with CopyFrom() into this object's own allocator. This avoids re-reading
 * and re-parsing the file from disk on every copy (e.g. while sorting
 * results) and keeps the copy consistent with `other` even if the file has
 * changed or disappeared since it was first loaded.
 */
DSDoc::DSDoc(const DSDoc& other)
    : filepath(other.filepath), terms(other.terms), relevancy(other.relevancy) {
    doc.CopyFrom(other.doc, doc.GetAllocator());
}

/**
 * Copy assignment.
 *
 * Deep-copies the other document's DOM in memory instead of re-reading the
 * file from disk, and is a no-op on self-assignment.
 *
 * @return reference to this object
 */
DSDoc& DSDoc::operator= (const DSDoc& other) {
    if (this == &other) {
        return *this;
    }

    // Build the copy in a fresh document and swap it in, so the memory held
    // by the previous DOM is released when `fresh` goes out of scope.
    rapidjson::Document fresh;
    fresh.CopyFrom(other.doc, fresh.GetAllocator());
    doc.Swap(fresh);

    filepath = other.filepath;
    terms = other.terms;
    relevancy = other.relevancy;
    return *this;
}

std::string DSDoc::get_title() {
string title = doc["title"].GetString();
return title;
}

std::string DSDoc::get_text() {
string text = doc["text"].GetString();
return text;
}

std::string DSDoc::get_publication() {
string publication = doc["thread"]["site"].GetString();
return publication;

}

std::string DSDoc::get_url() {
string url = doc["url"].GetString();
return url;
}

std::string DSDoc::get_date_published() {
string date = doc["published"].GetString();
return date;
}

double DSDoc::get_relevancy() {
double relevancy = 0.0;
double correct_words = 0;
int total_words = 0;
string text = doc["text"].GetString();

string word = "";
for (int i = 0; i < text.size(); i++) {
if (text[i] == ' ') {
//CHECK IF WORD IS STOP WORD?

for (int j = 0; j < terms.size(); j++) {
if (word == terms[j])
correct_words+= 1;
}
total_words++;
word = "";
} else {
word += text[i];
}
}

relevancy = correct_words/total_words;

return relevancy;
}

// Orders documents by ascending relevancy score.
bool DSDoc::operator<(const DSDoc &other) const {
    return relevancy < other.relevancy;
}


34 changes: 34 additions & 0 deletions DSDoc.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#ifndef INC_22S_FINAL_PROJ_DSDOC_H
#define INC_22S_FINAL_PROJ_DSDOC_H

#include <string>
#include <vector>
#include <fstream>
#include "rapidjson/rapidjson.h"
#include "rapidjson/document.h"

// Wraps one JSON document on disk together with the search terms it was
// matched against and the relevancy score computed from them.
class DSDoc {
public:
// Loads and parses the JSON file at `filepath` and computes its relevancy
// against `terms`.
DSDoc(std::string filepath, std::vector<std::string> terms);

// Copy operations are user-defined; both produce an object with the same
// path, terms and relevancy and an equivalent parsed document.
DSDoc(const DSDoc& other);
DSDoc& operator= (const DSDoc& other);

// Accessors for string fields of the parsed JSON ("title", "text",
// "thread"/"site", "url", "published").
std::string get_title();
std::string get_text();
std::string get_publication();
std::string get_url();
std::string get_date_published();
// Ratio of search-term matches to words in the document text.
double get_relevancy();

// Compares by relevancy score only.
bool operator<(const DSDoc& other) const;

private:
std::string filepath;            // path the document was loaded from
std::vector<std::string> terms;  // search terms used for scoring
rapidjson::Document doc;         // parsed JSON DOM
double relevancy;                // score computed at construction time
};


#endif //INC_22S_FINAL_PROJ_DSDOC_H
53 changes: 53 additions & 0 deletions Term.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@


using namespace std;

#include "Term.h"

// Creates a Term for the given word; its document list starts out empty.
Term::Term(std::string term) : term(term) {}

// Appends a document identifier to this term's document list.
// No duplicate check is done here; see contains().
void Term::add_document(string current_doc_number) {
    this->documents_vector.emplace_back(current_doc_number);
}

// Two Terms are equal when their words are equal; document lists are ignored.
bool Term::operator==(const Term &T) {
    return this->term.compare(T.term) == 0;
}

// < operator is necessary for the AVL trees to balance correctly
bool Term::operator<(const Term &T) {
    // Lexicographic comparison of the words only.
    return this->term.compare(T.term) < 0;
}

// Reports whether docID is already recorded for this term, so callers can
// avoid storing the same document twice.
bool Term::contains(string& docID) {
    for (const string& known_id : documents_vector) {
        if (known_id == docID) {
            return true;
        }
    }
    return false;
}

// Gives mutable access to the list of document identifiers for this term.
vector<string>& Term::get_IDs() {
    return this->documents_vector;
}

std::string& Term::get_term() {
return term;
}

// Writes the term as one line: the word, then each document identifier,
// every item followed by a single space, ending with a flushed newline.
ostream& operator<<(ostream& out, Term& t) {
    out << t.get_term() << " ";
    for (string& doc_id : t.get_IDs()) {
        out << doc_id << " ";
    }
    return out << endl;
}




32 changes: 32 additions & 0 deletions Term.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#ifndef INC_22S_FINAL_PROJ_TERM_H
#define INC_22S_FINAL_PROJ_TERM_H

#include <vector>
#include <string>
#include <iostream>

/**
 * One entry of the index: a word plus the identifiers of all documents in
 * which that word is found. The AVL trees consist of Terms.
 *
 * All standard-library names are fully qualified so this header compiles on
 * its own, without requiring the including file to have written
 * `using namespace std;` beforehand.
 */
class Term {
public:
    Term(std::string term); //The AVL trees consist of "Terms"

    // Records a document identifier for this term.
    void add_document(std::string current_doc_number);

    // Comparisons look at the word only.
    bool operator==(const Term& T);
    bool operator<(const Term& T);

    // True when docID has already been recorded for this term.
    bool contains(std::string& docID);
    // Mutable access to the recorded document identifiers.
    std::vector<std::string>& get_IDs();
    // Mutable access to the word itself.
    std::string& get_term();

    // Streams the word followed by its document identifiers.
    friend std::ostream& operator<<(std::ostream& out, Term& t);

private:
    //Each term has a string which represents a word
    //and the vector represents all of the documents
    //where this word is found
    std::vector<std::string> documents_vector;
    std::string term;
};


#endif //INC_22S_FINAL_PROJ_TERM_H
Binary file added cmake-build-relwithdebinfo/22s_final_proj
Binary file not shown.
Loading