Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve Similarity Results #490

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion server/scripts/populate-courses.ts
Original file line number Diff line number Diff line change
Expand Up @@ -610,6 +610,7 @@ export const addCourseDescription = async (course): Promise<boolean> => {
*/
export const addAllSimilarityData = async (): Promise<boolean> => {
try {
console.log("adding similarity data");
const courses = await Classes.find().exec();
if (courses) {
for (const course of courses) {
Expand Down Expand Up @@ -637,7 +638,21 @@ const addSimilarityData = async (courses, course): Promise<boolean> => {
const similarities = [];
const tfidf = await RecommendationMetadata.findOne({ _id: courseId }).exec();
for (const c of courses) {
if (c._id !== courseId && !c.crossList.includes(courseId) && c.classRating !== null && c.classRating !== 0) {
let crossList = false;
for (const crosslist of c.crossList) {
if (similarities.some(sim => sim._id === crosslist)) {
crossList = true;
break;
}
}
if (
c._id !== courseId &&
!c.crossList.includes(courseId) &&
c.classRating &&
c.classRating !== null &&
c.classRating !== 0 &&
!crossList
) {
const compTfidf = await RecommendationMetadata.findOne({ _id: c._id }).exec();
const cos = cosineSimilarity(tfidf.tfidfVector, compTfidf.tfidfVector);
if (cos < 1) {
Expand Down
6 changes: 5 additions & 1 deletion server/scripts/populate-recdata.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import { preprocess, idf, tfidf } from '../src/course/course.recalgo';
*/
export const addAllProcessedDescriptions = async (): Promise<boolean> => {
try {
console.log("adding processed descriptions");
const courses = await Classes.find().exec();
if (courses) {
for (const course of courses) {
Expand All @@ -35,7 +36,6 @@ const addProcessedDescription = async (course): Promise<boolean> => {
const subject = course.classSub;
const num = course.classNum;
try {
console.log(`${subject} ${num}: ${processed}`)
const rec = await RecommendationMetadata.findOne({ _id: courseId });
if (rec) {
await RecommendationMetadata.updateOne(
Expand All @@ -58,6 +58,7 @@ const addProcessedDescription = async (course): Promise<boolean> => {
throw new Error();
}
}
console.log(`${subject} ${num}`);
return true;
} catch (err) {
console.log(`Error in adding processed description for ${subject} ${num}: ${err}`);
Expand All @@ -71,10 +72,12 @@ const addProcessedDescription = async (course): Promise<boolean> => {
*/
export const addIdfVector = async (): Promise<boolean> => {
try {
console.log("adding idf vector");
const metadata = await RecommendationMetadata.find().exec();
const descriptions = metadata.map(course => course.processedDescription.split(' '));
const allTerms = [...new Set(descriptions.flat())];
const idfValues = idf(allTerms, descriptions);
await GlobalMetadata.deleteMany({});
const res = await new GlobalMetadata({
idfVector: idfValues
}).save();
Expand All @@ -97,6 +100,7 @@ export const addIdfVector = async (): Promise<boolean> => {
*/
export const addAllTfIdfVectors = async (): Promise<boolean> => {
try {
console.log("adding tfidf vectors");
const courses = await RecommendationMetadata.find().exec();
const global = await GlobalMetadata.findOne().exec();
const idfVector = global.idfVector;
Expand Down
28 changes: 15 additions & 13 deletions server/src/course/course.recalgo.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,30 +14,32 @@ const stemWord = (word) => {
return word;
}

/**
 * Normalizes a single sentence into a space-separated string of stemmed,
 * lowercase words with filler words removed.
 * @param sentence The raw sentence to clean
 * @param fillerWords Words to drop; compared against each word after it has
 * been lowercased and passed through stemWord
 * @returns The cleaned words joined by single spaces, or an empty string when
 * the sentence contains no word characters
 */
const cleanWords = (sentence: string, fillerWords: string[]): string =>
  // match() returns null when nothing matches; fall back to an empty list so
  // the function always yields a string instead of undefined.
  (sentence.match(/\b\w+\b/g) ?? [])
    .map(word => stemWord(word.toLowerCase()))
    // Drop filler words and anything stemWord reduced to a falsy value.
    .filter(word => Boolean(word) && !fillerWords.includes(word))
    .join(' ');

/**
* Preprocesses the description to remove plurals, filler words, and unnecessary punctuation
* @param description A course description that needs to be preprocessed
* @returns The processed description for a course
*/
export const preprocess = (description: string) => {
const sentences = description.match(/[^.!?]*[.!?]\s+[A-Z]/g) || [description];
const fillerWords = ["and", "the", "to", "for", "with"];
const sentences = description.match(/[^.!?]+[.!?]*/g) || [description];

const processedText = sentences.map(sentence => {
const words = sentence.match(/\b\w+\b/g) || [];
const cleanedWords = words.map(word => {
let singularWord = stemWord(word.toLowerCase());
fillerWords.forEach(filler => {
const regex = new RegExp(`\\b${filler}\\b`, 'g');
singularWord = singularWord.replace(regex, '');
});
return singularWord.replace(/[^\w\s]/g, '');
});
return cleanedWords.join(' ');
const cleaned = cleanWords(sentence, fillerWords);
return cleaned;
});
return processedText.join('. ');
}

return processedText.join('. ').trim();
};

/**
* Calculates the inverse document frequency for the given terms
Expand Down
Loading