Skip to content

Commit

Permalink
Improve similarity results
Browse files Browse the repository at this point in the history
  • Loading branch information
jacquelinecai committed Dec 13, 2024
1 parent bd9efbe commit 55fd241
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 15 deletions.
17 changes: 16 additions & 1 deletion server/scripts/populate-courses.ts
Original file line number Diff line number Diff line change
Expand Up @@ -610,6 +610,7 @@ export const addCourseDescription = async (course): Promise<boolean> => {
*/
export const addAllSimilarityData = async (): Promise<boolean> => {
try {
console.log("adding similarity data");
const courses = await Classes.find().exec();
if (courses) {
for (const course of courses) {
Expand Down Expand Up @@ -637,7 +638,21 @@ const addSimilarityData = async (courses, course): Promise<boolean> => {
const similarities = [];
const tfidf = await RecommendationMetadata.findOne({ _id: courseId }).exec();
for (const c of courses) {
if (c._id !== courseId && !c.crossList.includes(courseId) && c.classRating !== null && c.classRating !== 0) {
let crossList = false;
for (const crosslist of c.crossList) {
if (similarities.some(sim => sim._id === crosslist)) {
crossList = true;
break;
}
}
if (
c._id !== courseId &&
!c.crossList.includes(courseId) &&
c.classRating &&
c.classRating !== null &&
c.classRating !== 0 &&
!crossList
) {
const compTfidf = await RecommendationMetadata.findOne({ _id: c._id }).exec();
const cos = cosineSimilarity(tfidf.tfidfVector, compTfidf.tfidfVector);
if (cos < 1) {
Expand Down
6 changes: 5 additions & 1 deletion server/scripts/populate-recdata.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import { preprocess, idf, tfidf } from '../src/course/course.recalgo';
*/
export const addAllProcessedDescriptions = async (): Promise<boolean> => {
try {
console.log("adding processed descriptions");
const courses = await Classes.find().exec();
if (courses) {
for (const course of courses) {
Expand All @@ -35,7 +36,6 @@ const addProcessedDescription = async (course): Promise<boolean> => {
const subject = course.classSub;
const num = course.classNum;
try {
console.log(`${subject} ${num}: ${processed}`)
const rec = await RecommendationMetadata.findOne({ _id: courseId });
if (rec) {
await RecommendationMetadata.updateOne(
Expand All @@ -58,6 +58,7 @@ const addProcessedDescription = async (course): Promise<boolean> => {
throw new Error();
}
}
console.log(`${subject} ${num}`);
return true;
} catch (err) {
console.log(`Error in adding processed description for ${subject} ${num}: ${err}`);
Expand All @@ -71,10 +72,12 @@ const addProcessedDescription = async (course): Promise<boolean> => {
*/
export const addIdfVector = async (): Promise<boolean> => {
try {
console.log("adding idf vector");
const metadata = await RecommendationMetadata.find().exec();
const descriptions = metadata.map(course => course.processedDescription.split(' '));
const allTerms = [...new Set(descriptions.flat())];
const idfValues = idf(allTerms, descriptions);
await GlobalMetadata.deleteMany({});
const res = await new GlobalMetadata({
idfVector: idfValues
}).save();
Expand All @@ -97,6 +100,7 @@ export const addIdfVector = async (): Promise<boolean> => {
*/
export const addAllTfIdfVectors = async (): Promise<boolean> => {
try {
console.log("adding tfidf vectors");
const courses = await RecommendationMetadata.find().exec();
const global = await GlobalMetadata.findOne().exec();
const idfVector = global.idfVector;
Expand Down
28 changes: 15 additions & 13 deletions server/src/course/course.recalgo.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,30 +14,32 @@ const stemWord = (word) => {
return word;
}

/**
 * Normalizes one sentence into a space-separated string of stemmed keywords.
 *
 * Every word (a run of `\w` characters) is lowercased and passed through
 * `stemWord`. Words whose stem is empty or appears in `fillerWords` are dropped.
 *
 * @param sentence Raw sentence text; anything that is not a word character is ignored
 * @param fillerWords Stems to exclude from the result (compared after stemming)
 * @returns The remaining stems joined by single spaces, or '' when the sentence has no words
 */
const cleanWords = (sentence: string, fillerWords: string[]): string => {
  // `match` yields null when the sentence has no word characters; fall back to
  // an empty list so the function always returns a string, never undefined.
  const words = sentence.match(/\b\w+\b/g) ?? [];
  return words
    .map(word => stemWord(word.toLowerCase()))
    .filter(stem => Boolean(stem) && !fillerWords.includes(stem))
    .join(' ');
};

/**
* Preprocesses the description to remove plural forms, filler words, and unnecessary punctuation
* @param description A course description that needs to be preprocessed
* @returns The processed description for a course
*/
export const preprocess = (description: string) => {
const sentences = description.match(/[^.!?]*[.!?]\s+[A-Z]/g) || [description];
const fillerWords = ["and", "the", "to", "for", "with"];
const sentences = description.match(/[^.!?]+[.!?]*/g) || [description];

const processedText = sentences.map(sentence => {
const words = sentence.match(/\b\w+\b/g) || [];
const cleanedWords = words.map(word => {
let singularWord = stemWord(word.toLowerCase());
fillerWords.forEach(filler => {
const regex = new RegExp(`\\b${filler}\\b`, 'g');
singularWord = singularWord.replace(regex, '');
});
return singularWord.replace(/[^\w\s]/g, '');
});
return cleanedWords.join(' ');
const cleaned = cleanWords(sentence, fillerWords);
return cleaned;
});
return processedText.join('. ');
}

return processedText.join('. ').trim();
};

/**
* Calculates the inverse document frequency for the given terms
Expand Down

0 comments on commit 55fd241

Please sign in to comment.