From 55fd241e2798c741f67195638bd71b3fcba3f17e Mon Sep 17 00:00:00 2001
From: Jacqueline <jacqueline.cai2004@gmail.com>
Date: Fri, 13 Dec 2024 03:21:10 -0500
Subject: [PATCH 1/3] Improve similarity results

---
 server/scripts/populate-courses.ts  | 17 ++++++++++++++++-
 server/scripts/populate-recdata.ts  |  6 +++++-
 server/src/course/course.recalgo.ts | 28 +++++++++++++++-------------
 3 files changed, 36 insertions(+), 15 deletions(-)
diff --git a/server/scripts/populate-courses.ts b/server/scripts/populate-courses.ts
index ce84073d..376c32c5 100644
--- a/server/scripts/populate-courses.ts
+++ b/server/scripts/populate-courses.ts
@@ -610,6 +610,7 @@ export const addCourseDescription = async (course): Promise<boolean> => {
  */
 export const addAllSimilarityData = async (): Promise<boolean> => {
   try {
+    console.log("adding similarity data");
     const courses = await Classes.find().exec();
     if (courses) {
       for (const course of courses) {
@@ -637,7 +638,21 @@ const addSimilarityData = async (courses, course): Promise<boolean> => {
     const similarities = [];
     const tfidf = await RecommendationMetadata.findOne({ _id: courseId }).exec();
     for (const c of courses) {
-      if (c._id !== courseId && !c.crossList.includes(courseId) && c.classRating !== null && c.classRating !== 0) {
+      let crossList = false;
+      for (const crosslist of c.crossList) {
+        if (similarities.some(sim => sim._id === crosslist)) {
+          crossList = true;
+          break;
+        }
+      }
+      if (
+        c._id !== courseId &&
+        !c.crossList.includes(courseId) &&
+        c.classRating &&
+        c.classRating !== null &&
+        c.classRating !== 0 &&
+        !crossList
+      ) {
         const compTfidf = await RecommendationMetadata.findOne({ _id: c._id }).exec();
         const cos = cosineSimilarity(tfidf.tfidfVector, compTfidf.tfidfVector);
         if (cos < 1) {
diff --git a/server/scripts/populate-recdata.ts b/server/scripts/populate-recdata.ts
index 73b2d420..a43403b2 100644
--- a/server/scripts/populate-recdata.ts
+++ b/server/scripts/populate-recdata.ts
@@ -9,6 +9,7 @@ import { preprocess, idf, tfidf } from '../src/course/course.recalgo';
  */
 export const addAllProcessedDescriptions = async (): Promise<boolean> => {
   try {
+    console.log("adding processed descriptions");
     const courses = await Classes.find().exec();
     if (courses) {
       for (const course of courses) {
@@ -35,7 +36,6 @@ const addProcessedDescription = async (course): Promise<boolean> => {
   const subject = course.classSub;
   const num = course.classNum;
   try {
-    console.log(`${subject} ${num}: ${processed}`)
     const rec = await RecommendationMetadata.findOne({ _id: courseId });
     if (rec) {
       await RecommendationMetadata.updateOne(
@@ -58,6 +58,7 @@ const addProcessedDescription = async (course): Promise<boolean> => {
         throw new Error();
       }
     }
+    console.log(`${subject} ${num}`);
     return true;
   } catch (err) {
     console.log(`Error in adding processed description for ${subject} ${num}: ${err}`);
@@ -71,10 +72,12 @@ const addProcessedDescription = async (course): Promise<boolean> => {
  */
 export const addIdfVector = async (): Promise<boolean> => {
   try {
+    console.log("adding idf vector");
     const metadata = await RecommendationMetadata.find().exec();
     const descriptions = metadata.map(course => course.processedDescription.split(' '));
     const allTerms = [...new Set(descriptions.flat())];
     const idfValues = idf(allTerms, descriptions);
+    await GlobalMetadata.deleteMany({});
     const res = await new GlobalMetadata({
       idfVector: idfValues
     }).save();
@@ -97,6 +100,7 @@ export const addIdfVector = async (): Promise<boolean> => {
  */
 export const addAllTfIdfVectors = async (): Promise<boolean> => {
   try {
+    console.log("adding tfidf vectors");
     const courses = await RecommendationMetadata.find().exec();
     const global = await GlobalMetadata.findOne().exec();
     const idfVector = global.idfVector;
diff --git a/server/src/course/course.recalgo.ts b/server/src/course/course.recalgo.ts
index d0b56064..aaa91fbb 100644
--- a/server/src/course/course.recalgo.ts
+++ b/server/src/course/course.recalgo.ts
@@ -14,30 +14,32 @@ const stemWord = (word) => {
   return word;
 }
 
+/** Lowercases and stems each word of `sentence`, drops `fillerWords`, joins with spaces. */
+const cleanWords = (sentence: string, fillerWords: string[]) =>
+  // `match` yields null when the sentence has no word characters; fall back to
+  // an empty list so the result is always a string ('' here), never undefined.
+  (sentence.match(/\b\w+\b/g) ?? [])
+    .map(word => stemWord(word.toLowerCase()))
+    // Discard empty stems and any stem that is a filler word.
+    .filter(stem => Boolean(stem) && !fillerWords.includes(stem))
+    .join(' ');
+
 /**
  * Preprocesses the description to remove pluralities and unnecessary punctuation
  * @param description A course description that needs to be preprocessed
  * @returns The processed description for a course
  */
 export const preprocess = (description: string) => {
-  const sentences = description.match(/[^.!?]*[.!?]\s+[A-Z]/g) || [description];
   const fillerWords = ["and", "the", "to", "for", "with"];
+  const sentences = description.match(/[^.!?]+[.!?]*/g) || [description];
 
   const processedText = sentences.map(sentence => {
-    const words = sentence.match(/\b\w+\b/g) || [];
-    const cleanedWords = words.map(word => {
-      let singularWord = stemWord(word.toLowerCase());
-      fillerWords.forEach(filler => {
-        const regex = new RegExp(`\\b${filler}\\b`, 'g');
-        singularWord = singularWord.replace(regex, '');
-      });
-      return singularWord.replace(/[^\w\s]/g, '');
-    });
-    return cleanedWords.join(' ');
+    const cleaned = cleanWords(sentence, fillerWords);
+    return cleaned;
   });
-  return processedText.join('. ');
-}
 
+  return processedText.join('. ').trim();
+};
 
 /**
  * Calculates the inverse document frequency for the given terms

From 8a30fe6022fe6a66fb83dee8852601396b115855 Mon Sep 17 00:00:00 2001
From: Jacqueline <jacqueline.cai2004@gmail.com>
Date: Fri, 13 Dec 2024 21:35:32 -0500
Subject: [PATCH 2/3] Update filler words

---
 server/src/course/course.recalgo.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/src/course/course.recalgo.ts b/server/src/course/course.recalgo.ts
index aaa91fbb..5c31cd77 100644
--- a/server/src/course/course.recalgo.ts
+++ b/server/src/course/course.recalgo.ts
@@ -30,7 +30,7 @@ const cleanWords = (sentence: string, fillerWords: string[]) =>
  * @returns The processed description for a course
  */
 export const preprocess = (description: string) => {
-  const fillerWords = ["and", "the", "to", "for", "with"];
+  const fillerWords = ["and", "the", "to", "for", "with", "it", "you", "not", "but", "have", "been", "of", "all", "in", "your", "their", "do", "this", "a", "is", "be"];
   const sentences = description.match(/[^.!?]+[.!?]*/g) || [description];
 
   const processedText = sentences.map(sentence => {

From 7f0ed008a15217e9f67213ada7f7c4ca2669400d Mon Sep 17 00:00:00 2001
From: Jacqueline <jacqueline.cai2004@gmail.com>
Date: Mon, 16 Dec 2024 08:07:44 -0500
Subject: [PATCH 3/3] Add filler word

---
 server/src/course/course.recalgo.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/src/course/course.recalgo.ts b/server/src/course/course.recalgo.ts
index 5c31cd77..36dca585 100644
--- a/server/src/course/course.recalgo.ts
+++ b/server/src/course/course.recalgo.ts
@@ -30,7 +30,7 @@ const cleanWords = (sentence: string, fillerWords: string[]) =>
  * @returns The processed description for a course
  */
 export const preprocess = (description: string) => {
-  const fillerWords = ["and", "the", "to", "for", "with", "it", "you", "not", "but", "have", "been", "of", "all", "in", "your", "their", "do", "this", "a", "is", "be"];
+  const fillerWords = ["and", "the", "to", "for", "with", "it", "you", "not", "but", "have", "been", "of", "all", "in", "your", "their", "do", "this", "a", "is", "be", "will"];
   const sentences = description.match(/[^.!?]+[.!?]*/g) || [description];
 
   const processedText = sentences.map(sentence => {