top 10 function, separate functionality
syeboah10 committed Nov 15, 2023
1 parent 14cff3c commit 4bd65c3
Showing 3 changed files with 70 additions and 62 deletions.
85 changes: 24 additions & 61 deletions course-clusters/clusters.py
@@ -1,56 +1,14 @@
import string
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from load_course_info import loadCourseDesc, getSubjects

subjects_url = "https://classes.cornell.edu/api/2.0/config/subjects.json?roster=FA23"
subjects = getSubjects(subjects_url)

urls = [f"https://classes.cornell.edu/api/2.0/search/classes.json?roster=FA23&subject={sub}" for sub in subjects]

def fetchDescriptions(urls):
descriptions = []
for x in urls:
descriptions += loadCourseDesc(x)
print(len(descriptions))
return descriptions


course_descriptions = fetchDescriptions(urls)
def preprocess_text(text):
""" Tokenization and preprocessing with custom stopwords"""
custom_stopwords = ["of", "the", "in", "and", "on", "an", "a", "to"]
strong_words = ["technology","calculus","business", "Artificial Intelligence", "First-Year Writing", "computer","python","java","economics","US","writing","biology","chemistry", "physics", "engineering","ancient"
"programming", "algorithms", "data structures","art","software","anthropology" "databases","fiction","mathematics", "history","civilization"]

translator = str.maketrans("", "", string.punctuation)
text = text.lower()
text = text.translate(translator)
tokens = text.split()
tokens = [token for token in tokens if token not in custom_stopwords]
    joined = " ".join(tokens)
    for word in strong_words:
        # lowercase and strip punctuation so capitalized or multi-word terms can match
        target = word.lower().translate(translator)
        if target in tokens or (" " in target and target in joined):
            tokens += target.split() * 10  # up-weight strong terms

return " ".join(tokens)

def removeStopwords():
"Removes stopwords for all the descriptions "
preprocessed = []
for desc in course_descriptions:
if desc:
preprocessed.append(preprocess_text(desc))
return preprocessed

preprocessed_descriptions = removeStopwords()
from sklearn.metrics.pairwise import cosine_similarity
from preproccess import preprocessed_descriptions
from load_course_info import course_descriptions

tagged_data = [TaggedDocument(words=desc.split(), tags=[str(i)]) for i, desc in enumerate(preprocessed_descriptions)]

model = Doc2Vec(vector_size=20, window=2, min_count=1, workers=4, epochs=100)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
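
# A minimal sketch of querying the trained model (an illustrative aside): model.dv
# keeps one vector per tag, so the nearest neighbours of the first description can
# be listed directly by its tag.
# for tag, score in model.dv.most_similar('0', topn=5):
#     print(tag, round(score, 3))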



def createClusters():
""" Creates the clusters for assigned documents based on the Doc2Vec similarity if cluster is full then we start a new cluster.
Also puts into clusters by most similar cluster.
@@ -60,8 +18,7 @@ def createClusters():

for i, desc in enumerate(preprocessed_descriptions):
most_similar_cluster_id = None
        max_similarity = 0
        for cluster_id, cluster_courses in inner_clusters.items():
            similarity = model.dv.similarity(i, cluster_id)
            if similarity > max_similarity and len(cluster_courses) <= 50:
@@ -77,11 +34,6 @@ def createClusters():
return inner_clusters
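
    # The cluster-assignment step is elided in this view; a plausible reading of
    # the docstring (an assumption, not necessarily the commit's exact code):
    #
    #   if most_similar_cluster_id is not None:
    #       inner_clusters[most_similar_cluster_id].append(desc)
    #   else:
    #       inner_clusters[i] = [desc]  # open a new cluster for this course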


def merge_single_item_clusters(clusters):
""" for all small clusters, merge them with their most similar cluster"""
singles = [cluster_id for cluster_id, courses in clusters.items() if len(courses) < 50]
@@ -102,15 +54,26 @@ def merge_single_item_clusters(clusters):

return clusters
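
    # The merge loop is elided in this view; a plausible reading (an assumption,
    # not necessarily the commit's exact code): each small cluster's courses move
    # into the most Doc2Vec-similar other cluster, then the small key is dropped.
    #
    #   for single in singles:
    #       best = max((c for c in clusters if c != single),
    #                  key=lambda c: model.dv.similarity(single, c))
    #       clusters[best].extend(clusters[single])
    #       del clusters[single]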


def get_top_similar_courses(clusters, model):
"""Gets the top 10 most similar courses for each course in a cluster"""
res = {}
for cluster_id, cluster in clusters.items():
for course_id, outcourse in enumerate(cluster):
            if outcourse is None:
continue
max_10 = []
            vec_outcourse = model.infer_vector(outcourse.split())  # infer_vector expects a token list
for inner, innercourse in enumerate(cluster):
if innercourse is not None and inner != course_id:
                    vec_innercourse = model.infer_vector(innercourse.split())
val = cosine_similarity([vec_outcourse],[vec_innercourse])[0][0]
max_10.append((val, innercourse))
            max_10.sort(key=lambda pair: pair[0], reverse=True)
            res[outcourse] = [course for _, course in max_10[:10]]  # keep the ten most similar
return res


clusters = createClusters()
merged_clusters = merge_single_item_clusters(clusters)
top_similar_courses = get_top_similar_courses(merged_clusters, model)

# write the merged clusters to disk, skipping empty slots
with open('output.txt', 'w') as file:
    for cluster_id, courses in merged_clusters.items():
        file.write(f"Cluster {cluster_id + 1}:\n")
        for course in courses:
            if course:
                file.write(course + '\n')
        file.write("\n")
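
# Illustrative peek at the result shape (assumes at least one course was clustered):
# each description maps to its ten nearest neighbours within its cluster.
# example = next(iter(top_similar_courses))
# print(example[:60], "->", [c[:40] for c in top_similar_courses[example]])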
19 changes: 18 additions & 1 deletion course-clusters/load_course_info.py
@@ -1,5 +1,9 @@
import requests

url = "https://classes.cornell.edu/api/2.0/search/classes.json?roster=FA14&subject=CS"
subjects_url = "https://classes.cornell.edu/api/2.0/config/subjects.json?roster=FA23"


def loadCourseDesc(url):
lis = []
res = requests.get(url)
@@ -8,9 +12,22 @@ def loadCourseDesc(url):
lis.append(x['description'])
return lis


def getSubjects(url):
res = requests.get(url)
print(res.status_code)
data = res.json()
return [x['value'] for x in data['data']['subjects']]
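
# Quick sanity sketch (illustrative; the exact codes depend on the roster):
# print(getSubjects(subjects_url)[:5])  # e.g. ['AAS', 'AEM', 'AEP', ...]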


subjects = getSubjects(subjects_url)
urls = [f"https://classes.cornell.edu/api/2.0/search/classes.json?roster=FA23&subject={sub}" for sub in subjects]

def fetchDescriptions(urls):
descriptions = []
for x in urls:
descriptions += loadCourseDesc(x)
print(len(descriptions))
return descriptions

course_descriptions = fetchDescriptions(urls)

28 changes: 28 additions & 0 deletions course-clusters/preproccess.py
@@ -0,0 +1,28 @@
import string
from load_course_info import course_descriptions

def preprocess_text(text):
""" Tokenization and preprocessing with custom stopwords"""
custom_stopwords = ["of", "the", "in", "and", "on", "an", "a", "to"]
strong_words = ["technology","calculus","buisness", "Artificial Intelligence", "First-Year Writing", "computer","python","java","economics","US","writing","biology","chemistry", "physics", "engineering","ancient"
"programming", "algorithms", "data structures","art","software","anthropology" "databases","fiction","mathematics", "history","civilization"]

translator = str.maketrans("", "", string.punctuation)
text = text.lower()
text = text.translate(translator)
tokens = text.split()
tokens = [token for token in tokens if token not in custom_stopwords]
    joined = " ".join(tokens)
    for word in strong_words:
        # lowercase and strip punctuation so capitalized or multi-word terms can match
        target = word.lower().translate(translator)
        if target in tokens or (" " in target and target in joined):
            tokens += target.split() * 10  # up-weight strong terms
return " ".join(tokens)

def removeStopwords():
"Removes stopwords for all the descriptions "
preprocessed = []
for desc in course_descriptions:
if desc:
preprocessed.append(preprocess_text(desc))
return preprocessed

preprocessed_descriptions = removeStopwords()
