Improve similarity results

cornell-dti · Dec 13, 2024 · 55fd241 · 55fd241
1 parent bd9efbe
commit 55fd241
Show file tree

Hide file tree

Showing 3 changed files with 36 additions and 15 deletions.
diff --git a/server/scripts/populate-courses.ts b/server/scripts/populate-courses.ts
@@ -610,6 +610,7 @@ export const addCourseDescription = async (course): Promise<boolean> => {
  */
 export const addAllSimilarityData = async (): Promise<boolean> => {
   try {
+    console.log("adding similarity data");
     const courses = await Classes.find().exec();
     if (courses) {
       for (const course of courses) {
@@ -637,7 +638,21 @@ const addSimilarityData = async (courses, course): Promise<boolean> => {
     const similarities = [];
     const tfidf = await RecommendationMetadata.findOne({ _id: courseId }).exec();
     for (const c of courses) {
-      if (c._id !== courseId && !c.crossList.includes(courseId) && c.classRating !== null && c.classRating !== 0) {
+      let crossList = false;
+      for (const crosslist of c.crossList) {
+        if (similarities.some(sim => sim._id === crosslist)) {
+          crossList = true;
+          break;
+        }
+      }
+      if (
+        c._id !== courseId &&
+        !c.crossList.includes(courseId) &&
+        c.classRating &&
+        c.classRating !== null &&
+        c.classRating !== 0 &&
+        !crossList
+      ) {
         const compTfidf = await RecommendationMetadata.findOne({ _id: c._id }).exec();
         const cos = cosineSimilarity(tfidf.tfidfVector, compTfidf.tfidfVector);
         if (cos < 1) {

diff --git a/server/scripts/populate-recdata.ts b/server/scripts/populate-recdata.ts
@@ -9,6 +9,7 @@ import { preprocess, idf, tfidf } from '../src/course/course.recalgo';
  */
 export const addAllProcessedDescriptions = async (): Promise<boolean> => {
   try {
+    console.log("adding processed descriptions");
     const courses = await Classes.find().exec();
     if (courses) {
       for (const course of courses) {
@@ -35,7 +36,6 @@ const addProcessedDescription = async (course): Promise<boolean> => {
   const subject = course.classSub;
   const num = course.classNum;
   try {
-    console.log(`${subject} ${num}: ${processed}`)
     const rec = await RecommendationMetadata.findOne({ _id: courseId });
     if (rec) {
       await RecommendationMetadata.updateOne(
@@ -58,6 +58,7 @@ const addProcessedDescription = async (course): Promise<boolean> => {
         throw new Error();
       }
     }
+    console.log(`${subject} ${num}`);
     return true;
   } catch (err) {
     console.log(`Error in adding processed description for ${subject} ${num}: ${err}`);
@@ -71,10 +72,12 @@ const addProcessedDescription = async (course): Promise<boolean> => {
  */
 export const addIdfVector = async (): Promise<boolean> => {
   try {
+    console.log("adding idf vector");
     const metadata = await RecommendationMetadata.find().exec();
     const descriptions = metadata.map(course => course.processedDescription.split(' '));
     const allTerms = [...new Set(descriptions.flat())];
     const idfValues = idf(allTerms, descriptions);
+    await GlobalMetadata.deleteMany({});
     const res = await new GlobalMetadata({
       idfVector: idfValues
     }).save();
@@ -97,6 +100,7 @@ export const addIdfVector = async (): Promise<boolean> => {
  */
 export const addAllTfIdfVectors = async (): Promise<boolean> => {
   try {
+    console.log("adding tfidf vectors");
     const courses = await RecommendationMetadata.find().exec();
     const global = await GlobalMetadata.findOne().exec();
     const idfVector = global.idfVector;

diff --git a/server/src/course/course.recalgo.ts b/server/src/course/course.recalgo.ts
@@ -14,30 +14,32 @@ const stemWord = (word) => {
   return word;
 }
 
+const cleanWords = (sentence: string, fillerWords: string[]) => // Normalizes one sentence into space-separated tokens; evaluates to undefined when the sentence has no word characters.
+  sentence
+    .match(/\b\w+\b/g) // runs of word characters; null when none are found
+    ?.map(word => { // optional chain: the rest is skipped when match returned null
+      let singularWord = stemWord(word.toLowerCase()); // lowercase first, then stem
+      return fillerWords.includes(singularWord) ? '' : singularWord; // filler words are compared as whole tokens, after stemming
+    })
+    .filter(Boolean) // drop the empty strings left in place of filler words
+    .join(' ');
+
 /**
  * Preprocesses the description to remove pluralities and unnecessary punctuation
  * @param description A course description that needs to be preprocessed
  * @returns The processed description for a course
  */
 export const preprocess = (description: string) => {
-  const sentences = description.match(/[^.!?]*[.!?]\s+[A-Z]/g) || [description];
   const fillerWords = ["and", "the", "to", "for", "with"];
+  const sentences = description.match(/[^.!?]+[.!?]*/g) || [description]; // each chunk is a run of non-terminator characters plus any trailing terminators; falls back to the whole description when nothing matches
 
   const processedText = sentences.map(sentence => {
-    const words = sentence.match(/\b\w+\b/g) || [];
-    const cleanedWords = words.map(word => {
-      let singularWord = stemWord(word.toLowerCase());
-      fillerWords.forEach(filler => {
-        const regex = new RegExp(`\\b${filler}\\b`, 'g');
-        singularWord = singularWord.replace(regex, '');
-      });
-      return singularWord.replace(/[^\w\s]/g, '');
-    });
-    return cleanedWords.join(' ');
+    const cleaned = cleanWords(sentence, fillerWords); // undefined for a chunk that contains no word characters
+    return cleaned;
   });
-  return processedText.join('. ');
-}
 
+  return processedText.join('. ').trim(); // Array.prototype.join renders undefined entries as empty strings; trim removes leading/trailing whitespace from the joined text
+};
 
 /**
  * Calculates the inverse document frequency for the given terms