-
Notifications
You must be signed in to change notification settings - Fork 3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Similar Courses Algorithm #470
Changes from 5 commits
d27f941
e2939c5
3159abe
5d285eb
6963acc
51c42cb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
/**
 * Applies simple suffix-stripping rules to reduce a plural word to its base form.
 *
 * Rules are tried in order and the first one that matches wins:
 *   1. "...sses" -> "...ss"                          (classes -> class)
 *   2. "...ies"  -> "...y"                           (studies -> study)
 *   3. "...es" not preceded by a vowel -> drop "es"  (boxes -> box)
 *   4. "...s" not preceded by s, x or z -> drop "s"  (cats -> cat)
 *
 * @param word a single word; callers are expected to lower-case it first
 * @returns the stemmed word, or the word unchanged when no rule applies
 */
const stemWord = (word: string): string => {
  if (word.endsWith('sses')) {
    return word.slice(0, -2);
  }
  if (word.endsWith('ies')) {
    return `${word.slice(0, -3)}y`;
  }
  if (word.endsWith('es') && !/[aeiou]es$/.test(word)) {
    return word.slice(0, -2);
  }
  // The exclusion has to inspect the letter *before* the trailing "s".
  // Testing the last letter itself (always "s" here) made this rule
  // unreachable, so ordinary plurals such as "cats" were never stemmed.
  if (word.length > 1 && word.endsWith('s') && !/[sxz]s$/.test(word)) {
    return word.slice(0, -1);
  }
  return word;
};
|
||
/**
 * Preprocesses a course description for the similarity algorithm: splits it
 * into sentences, lower-cases every word, stems it with `stemWord`, and drops
 * all punctuation.
 *
 * Review notes carried over from the pull request:
 *  - A reviewer reported strange text breaks between words in the output. The
 *    previous sentence pattern consumed the first capital letter of the next
 *    sentence and discarded the final sentence; the split below avoids both.
 *  - Suggested follow-up (not implemented here): also remove filler words
 *    such as "and", "the", "to", "for", "with", "this" and pronouns.
 *
 * @param description A course description that needs to be preprocessed
 * @returns The processed description: words of a sentence joined by single
 *          spaces, sentences joined by ". "
 */
export const preprocess = (description: string): string => {
  // A sentence boundary is terminal punctuation, whitespace, then a capital
  // letter. The capital sits in a lookahead so it stays with its own sentence;
  // the punctuation mark may be consumed because only \w+ tokens are kept.
  const sentences = description.split(/[.!?]\s+(?=[A-Z])/);

  const processedSentences = sentences.map((sentence) => {
    const words = sentence.match(/\b\w+\b/g) || [];
    // Tokens contain only word characters, so no further punctuation
    // stripping is needed after stemming.
    return words.map((word) => stemWord(word.toLowerCase())).join(' ');
  });

  return processedSentences.join('. ');
};
|
||
/**
 * Calculates the inverse document frequency for the given terms.
 *
 * The score used here is `1 / (df + 1)`, where `df` is the number of
 * documents in `words` that contain the term. The `+ 1` keeps the score
 * finite for terms that appear in no document.
 *
 * @param terms list of terms in the course description (duplicates allowed)
 * @param words list of all course descriptions as word arrays
 * @returns a dictionary with terms as keys and their IDF scores as values
 */
export const idf = (
  terms: readonly string[],
  words: ReadonlyArray<readonly string[]>,
): Record<string, number> => {
  const scores: Record<string, number> = {};
  // A description repeats many words; score each distinct term only once
  // instead of rescanning every document per repetition.
  for (const term of new Set(terms)) {
    const documentFrequency = words.filter((doc) => doc.includes(term)).length;
    scores[term] = 1 / (documentFrequency + 1);
  }
  return scores;
};
|
||
/**
 * Calculates the TF-IDF vector for the given terms.
 *
 * Each distinct term's raw occurrence count in `terms` is multiplied by its
 * IDF weight. Terms that have no numeric weight in `idf` (or when `idf` is
 * not supplied) use a weight of 1.
 *
 * The `idf` argument is only read; it is never modified.
 *
 * Review suggestion carried over from the pull request (not applied here):
 * normalise the scores by document length so that descriptions of different
 * lengths are weighted comparably.
 *
 * @param terms list of terms in the course description
 * @param idf inverse document frequency (IDF) for the terms
 * @returns a dictionary with terms as keys and their TF-IDF scores as values
 */
export const tfidf = (
  terms: readonly string[],
  idf?: Record<string, number> | null,
): Record<string, number> => {
  const scores: Record<string, number> = {};

  // Term frequency. Use an own-property check: a plain truthiness test also
  // sees inherited members, so a term like "constructor" would become NaN.
  for (const term of terms) {
    const seen = Object.prototype.hasOwnProperty.call(scores, term);
    scores[term] = (seen ? scores[term] : 0) + 1;
  }

  for (const term of Object.keys(scores)) {
    const weight = idf != null ? idf[term] : undefined;
    // Anything that is not a number (missing entry, inherited function)
    // falls back to the neutral weight 1.
    scores[term] *= typeof weight === 'number' ? weight : 1;
  }

  return scores;
};
|
||
/**
 * Computes the dot product between two sparse vectors stored as
 * term -> weight dictionaries.
 *
 * Only keys that `a` owns and for which `b` holds a non-zero number
 * contribute to the sum.
 *
 * @param a first vector
 * @param b second vector
 * @returns the sum of `a[key] * b[key]` over the shared keys
 */
const dot = (a: Record<string, number>, b: Record<string, number>): number => {
  let sum = 0;
  for (const key of Object.keys(a)) {
    const other = b[key];
    // Require an actual number: a bare truthiness check also accepts
    // inherited members (e.g. b["constructor"]) and turns the sum into NaN.
    if (typeof other === 'number' && other) {
      sum += a[key] * other;
    }
  }
  return sum;
};
|
||
/**
 * Computes the magnitude of a vector: the square root of the vector's
 * dot product with itself.
 */
const norm = (vec) => Math.sqrt(dot(vec, vec));
|
||
/**
 * Calculates the cosine similarity of two frequency word vectors.
 *
 * When either vector has zero magnitude (for example an empty description)
 * the similarity is defined as 0 instead of dividing by zero and returning
 * NaN; this guard was requested in review of the pull request.
 *
 * @param vecA frequency word vector corresponding to the first course description
 * @param vecB frequency word vector corresponding to the second course description
 * @returns a number representing the similarity between the two descriptions
 */
export const cosineSimilarity = (
  vecA: Record<string, number>,
  vecB: Record<string, number>,
): number => {
  const denominator = norm(vecA) * norm(vecB);
  if (denominator === 0) {
    return 0;
  }
  return dot(vecA, vecB) / denominator;
};
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
import express from 'express'; | ||
|
||
import { CourseIdRequestType, CourseInfoRequestType } from './course.type'; | ||
import { getCourseByInfo, getReviewsCrossListOR } from './course.controller'; | ||
import { CourseIdRequestType, CourseInfoRequestType, CourseDescriptionRequestType } from './course.type'; | ||
import { getCourseByInfo, getReviewsCrossListOR, getProcessedDescription, getSimilarity } from './course.controller'; | ||
|
||
import { getCourseById } from '../utils'; | ||
|
||
|
@@ -69,3 +69,24 @@ courseRouter.post('/get-reviews', async (req, res) => { | |
.json({ error: `Internal Server Error: ${err.message}` }); | ||
} | ||
}); | ||
|
||
/** Reachable at POST /api/courses/getPreDesc
 * @body description: a course description
 * Gets the processed description to use for the similarity algorithm
 * Currently used for testing
 *
 * Review notes carried over from the pull request: errors should be handled
 * at the router level (done below), and reviewers asked for a more
 * descriptive route name (not changed here, to keep the endpoint stable).
 */
courseRouter.post('/getPreDesc', async (req, res) => {
  try {
    const { description }: CourseDescriptionRequestType = req.body;
    // NOTE(review): if getProcessedDescription ever returns a promise it must
    // be awaited for the catch below to see its rejection - confirm.
    const processed = getProcessedDescription(description);
    return res.status(200).json({ result: processed });
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    return res
      .status(500)
      .json({ error: `Internal Server Error: ${message}` });
  }
});
|
||
/** Reachable at POST /api/courses/getSimilarity
 * @body courseId: a course's id field
 * Gets the array of the top 5 similar courses for the course with id = courseId
 *
 * NOTE(review): the handler does not read courseId from the request body yet;
 * getSimilarity is called without arguments. A reviewer left a comment on
 * this handler in the pull request - confirm the intended input before
 * wiring courseId through.
 */
courseRouter.post('/getSimilarity', async (req, res) => {
  try {
    // const { courseId }: CourseIdRequestType = req.body;
    const similarity = getSimilarity();
    return res.status(200).json({ result: similarity });
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    return res
      .status(500)
      .json({ error: `Internal Server Error: ${message}` });
  }
});
Unchanged files with check annotations Beta
import ReviewModal from './ReviewModal' | ||
enum PageStatus { | ||
Check warning on line 29 in client/src/modules/Course/Components/Course.tsx
|
||
Loading, | ||
Check warning on line 30 in client/src/modules/Course/Components/Course.tsx
|
||
Success, | ||
Error, | ||
} | ||
export const Course = () => { | ||
* Fetches current course info and reviews and updates UI state | ||
*/ | ||
useEffect(() => { | ||
async function updateCurrentClass(number: number, subject: string) { | ||
Check warning on line 68 in client/src/modules/Course/Components/Course.tsx
|
||
try { | ||
const response = await axios.post(`/api/courses/get-by-info`, { | ||
number, | ||
courseId: courseId, | ||
}) | ||
clearSessionReview() | ||
if (response.status === 200) { | ||
toast.success( | ||
'Thanks for reviewing! New reviews are updated every 24 hours.' | ||
toast.error('An error occurred, please try again.') | ||
} | ||
} catch (e) { | ||
clearSessionReview() | ||
toast.error('An error occurred, please try again.') | ||
} | ||
} |
} | ||
useEffect(() => { | ||
const signIn = (redirectFrom: string) => { | ||
Session.setPersistent({ redirectFrom: redirectFrom }) | ||
history.push('/login') | ||
} |
import React, { useEffect, useState } from 'react' | ||
import { Redirect, useParams } from 'react-router-dom' | ||
Check warning on line 2 in client/src/modules/Admin/Components/Admin.tsx
|
||
import axios from 'axios' | ||
}; | ||
const [updatingField, setUpdatingField] = useState<string>(""); | ||
const [addSemester, setAddSemester] = useState('') | ||
Check warning on line 39 in client/src/modules/Admin/Components/Admin.tsx
|
||
const [isAdminModalOpen, setIsAdminModalOpen] = useState<boolean>(false) | ||
const { isLoggedIn, token, isAuthenticating } = useAuthMandatoryLogin('admin') | ||
* If this is the user's second click, call addAllCourses above to initialize | ||
* the local database | ||
*/ | ||
function renderInitButton(doubleClick: boolean) { | ||
// Offer button to edit database | ||
if (doubleClick) { | ||
return ( | ||
} | ||
function renderAdmin(token: string) { | ||
return ( | ||
<div className={styles.adminWrapper}> | ||
<div className="headInfo"> |
getCourse() | ||
function renderButtons(review: any) { | ||
const reported = review.reported | ||
if (reported === 1) { | ||
return ( |
useEffect(() => { | ||
async function getAdmins() { | ||
const response = await axios.post('/api/admin/users/get', {token: token}) | ||
const admins = response.data.result | ||
if (response.status === 200) { | ||
setAdmins(admins) | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are these test cases (the short ones that are just the course name) meant to be tested for similarity against the longer descriptions?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These are just courses without descriptions on the course roster API, so I used the course title as a filler for now.