top 10 function, separate functionality
syeboah10 committed Nov 15, 2023
1 parent 14cff3c commit 4bd65c3
Showing 3 changed files with 70 additions and 62 deletions.
85 changes: 24 additions & 61 deletions course-clusters/clusters.py
@@ -1,56 +1,14 @@
import string
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from load_course_info import loadCourseDesc, getSubjects

subjects_url = "https://classes.cornell.edu/api/2.0/config/subjects.json?roster=FA23"
subjects = getSubjects(subjects_url)

urls = [f"https://classes.cornell.edu/api/2.0/search/classes.json?roster=FA23&subject={sub}" for sub in subjects]

def fetchDescriptions(urls):
descriptions = []
for x in urls:
descriptions += loadCourseDesc(x)
print(len(descriptions))
return descriptions


course_descriptions = fetchDescriptions(urls)
def preprocess_text(text):
""" Tokenization and preprocessing with custom stopwords"""
custom_stopwords = ["of", "the", "in", "and", "on", "an", "a", "to"]
strong_words = ["technology","calculus","business", "Artificial Intelligence", "First-Year Writing", "computer","python","java","economics","US","writing","biology","chemistry", "physics", "engineering","ancient"
"programming", "algorithms", "data structures","art","software","anthropology" "databases","fiction","mathematics", "history","civilization"]

translator = str.maketrans("", "", string.punctuation)
text = text.lower()
text = text.translate(translator)
tokens = text.split()
tokens = [token for token in tokens if token not in custom_stopwords]
    joined = " ".join(tokens)
    for word in strong_words:
        # lowercase and strip punctuation so capitalized or multi-word terms can match
        target = word.lower().translate(translator)
        if target in tokens or (" " in target and target in joined):
            tokens += target.split() * 10  # up-weight strong terms

return " ".join(tokens)

def removeStopwords():
"Removes stopwords for all the descriptions "
preprocessed = []
for desc in course_descriptions:
if desc:
preprocessed.append(preprocess_text(desc))
return preprocessed

preprocessed_descriptions = removeStopwords()
from sklearn.metrics.pairwise import cosine_similarity
from preproccess import preprocessed_descriptions
from load_course_info import course_descriptions

tagged_data = [TaggedDocument(words=desc.split(), tags=[str(i)]) for i, desc in enumerate(preprocessed_descriptions)]

model = Doc2Vec(vector_size=20, window=2, min_count=1, workers=4, epochs=100)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
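
# A minimal sketch of querying the trained model (an illustrative aside): model.dv
# keeps one vector per tag, so the nearest neighbours of the first description can
# be listed directly by its tag.
# for tag, score in model.dv.most_similar('0', topn=5):
#     print(tag, round(score, 3))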



def createClusters():
""" Creates the clusters for assigned documents based on the Doc2Vec similarity if cluster is full then we start a new cluster.
Also puts into clusters by most similar cluster.
@@ -60,8 +18,7 @@ def createClusters():

for i, desc in enumerate(preprocessed_descriptions):
most_similar_cluster_id = None
        max_similarity = 0
        for cluster_id, cluster_courses in inner_clusters.items():
            similarity = model.dv.similarity(i, cluster_id)
            if similarity > max_similarity and len(cluster_courses) <= 50:
@@ -77,11 +34,6 @@ def createClusters():
return inner_clusters
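
    # The cluster-assignment step is elided in this view; a plausible reading of
    # the docstring (an assumption, not necessarily the commit's exact code):
    #
    #   if most_similar_cluster_id is not None:
    #       inner_clusters[most_similar_cluster_id].append(desc)
    #   else:
    #       inner_clusters[i] = [desc]  # open a new cluster for this course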


def merge_single_item_clusters(clusters):
""" for all small clusters, merge them with their most similar cluster"""
singles = [cluster_id for cluster_id, courses in clusters.items() if len(courses) < 50]
@@ -102,15 +54,26 @@ def merge_single_item_clusters(clusters):

return clusters
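
    # The merge loop is elided in this view; a plausible reading (an assumption,
    # not necessarily the commit's exact code): each small cluster's courses move
    # into the most Doc2Vec-similar other cluster, then the small key is dropped.
    #
    #   for single in singles:
    #       best = max((c for c in clusters if c != single),
    #                  key=lambda c: model.dv.similarity(single, c))
    #       clusters[best].extend(clusters[single])
    #       del clusters[single]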


def get_top_similar_courses(clusters, model):
"""Gets the top 10 most similar courses for each course in a cluster"""
res = {}
for cluster_id, cluster in clusters.items():
for course_id, outcourse in enumerate(cluster):
            if outcourse is None:
continue
max_10 = []
            vec_outcourse = model.infer_vector(outcourse.split())  # infer_vector expects a token list
for inner, innercourse in enumerate(cluster):
if innercourse is not None and inner != course_id:
                    vec_innercourse = model.infer_vector(innercourse.split())
val = cosine_similarity([vec_outcourse],[vec_innercourse])[0][0]
max_10.append((val, innercourse))
            max_10.sort(key=lambda pair: pair[0], reverse=True)
            res[outcourse] = [course for _, course in max_10[:10]]  # keep the ten most similar
return res


clusters = createClusters()
merged_clusters = merge_single_item_clusters(clusters)
top_similar_courses = get_top_similar_courses(merged_clusters, model)

# write the merged clusters to disk, skipping empty slots
with open('output.txt', 'w') as file:
    for cluster_id, courses in merged_clusters.items():
        file.write(f"Cluster {cluster_id + 1}:\n")
        for course in courses:
            if course:
                file.write(course + '\n')
        file.write("\n")
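
# Illustrative peek at the result shape (assumes at least one course was clustered):
# each description maps to its ten nearest neighbours within its cluster.
# example = next(iter(top_similar_courses))
# print(example[:60], "->", [c[:40] for c in top_similar_courses[example]])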
19 changes: 18 additions & 1 deletion course-clusters/load_course_info.py
@@ -1,5 +1,9 @@
import requests

url = "https://classes.cornell.edu/api/2.0/search/classes.json?roster=FA14&subject=CS"
subjects_url = "https://classes.cornell.edu/api/2.0/config/subjects.json?roster=FA23"


def loadCourseDesc(url):
lis = []
res = requests.get(url)
@@ -8,9 +12,22 @@ def loadCourseDesc(url):
lis.append(x['description'])
return lis


def getSubjects(url):
res = requests.get(url)
print(res.status_code)
data = res.json()
return [x['value'] for x in data['data']['subjects']]
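
# Quick sanity sketch (illustrative; the exact codes depend on the roster):
# print(getSubjects(subjects_url)[:5])  # e.g. ['AAS', 'AEM', 'AEP', ...]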


subjects = getSubjects(subjects_url)
urls = [f"https://classes.cornell.edu/api/2.0/search/classes.json?roster=FA23&subject={sub}" for sub in subjects]

def fetchDescriptions(urls):
descriptions = []
for x in urls:
descriptions += loadCourseDesc(x)
print(len(descriptions))
return descriptions

course_descriptions = fetchDescriptions(urls)

28 changes: 28 additions & 0 deletions course-clusters/preproccess.py
@@ -0,0 +1,28 @@
import string
from load_course_info import course_descriptions

def preprocess_text(text):
""" Tokenization and preprocessing with custom stopwords"""
custom_stopwords = ["of", "the", "in", "and", "on", "an", "a", "to"]
strong_words = ["technology","calculus","buisness", "Artificial Intelligence", "First-Year Writing", "computer","python","java","economics","US","writing","biology","chemistry", "physics", "engineering","ancient"
"programming", "algorithms", "data structures","art","software","anthropology" "databases","fiction","mathematics", "history","civilization"]

translator = str.maketrans("", "", string.punctuation)
text = text.lower()
text = text.translate(translator)
tokens = text.split()
tokens = [token for token in tokens if token not in custom_stopwords]
    joined = " ".join(tokens)
    for word in strong_words:
        # lowercase and strip punctuation so capitalized or multi-word terms can match
        target = word.lower().translate(translator)
        if target in tokens or (" " in target and target in joined):
            tokens += target.split() * 10  # up-weight strong terms
return " ".join(tokens)

def removeStopwords():
"Removes stopwords for all the descriptions "
preprocessed = []
for desc in course_descriptions:
if desc:
preprocessed.append(preprocess_text(desc))
return preprocessed

preprocessed_descriptions = removeStopwords()
