Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: refactor to use cheerio instead of Puppeteer #149

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
22 changes: 0 additions & 22 deletions scraper/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,33 +10,11 @@ RUN yarn build
FROM node:18.19.0-alpine as runner
ENV NODE_ENV production

# Installs latest Chromium (92) package.
RUN apk add --no-cache \
chromium \
nss \
freetype \
harfbuzz \
ca-certificates \
ttf-freefont

# Tell Puppeteer to skip installing Chrome. We'll be using the installed package.
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \
PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser

WORKDIR /app

COPY package.json yarn.lock ./
RUN yarn install --frozen-lockfile

# Add user so we don't need --no-sandbox.
RUN addgroup -S pptruser && adduser -S -G pptruser pptruser \
&& mkdir -p /home/pptruser/Downloads /app \
&& chown -R pptruser:pptruser /home/pptruser \
&& chown -R pptruser:pptruser /app

# Run everything after as non-privileged user.
USER pptruser

COPY --from=builder /app/dist ./dist

CMD yarn scrape
6 changes: 3 additions & 3 deletions scraper/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@
"@sentry/node": "7.92.0",
"@sentry/tracing": "7.92.0",
"axios": "0.27.2",
"lodash": "4.17.21",
"puppeteer": "14.4.1"
"cheerio": "^1.0.0-rc.12",
"lodash": "4.17.21"
},
"devDependencies": {
"@types/lodash": "4.14.202",
"@types/puppeteer": "5.4.7",
"@types/node": "^20.2.5",
"prettier": "2.8.8",
"typescript": "4.9.5"
}
Expand Down
26 changes: 16 additions & 10 deletions scraper/src/page-scraper/PageScraper.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import puppeteer from "puppeteer";
import { load, CheerioAPI } from 'cheerio';

import {
Chunk,
Expand All @@ -16,15 +16,16 @@ import { parsePage } from "./ParsePage";
import { parseNotes } from "./page-helpers/GetNotes";
import { getClassesByTerm } from "./page-helpers/GetClassesByTerm";
import { getCourseWarningsFromClassWarnings } from "./page-helpers/GetCourseWarnings";
import axios from 'axios';

/**
* Breaks the page down into relevant chunks from which data can be extracted
* @param { puppeteer.Page } page: page to be broken down into chunks
* @param { CheerioAPI } $: Cheerio API loaded with page to be broken down into chunks
* @returns { Promise<PageData[]> }: Extracted data as a course info chunk and list of class chunks to be parsed
*/
const getChunks = async (page: puppeteer.Page): Promise<PageData[]> => {
const tableSelector: string = '[class="formBody"][colspan="3"]';
return await page.$$eval(tableSelector, parsePage);
const getChunks = async ($: CheerioAPI): Promise<PageData[]> => {
  // Select every course table cell on the loaded page and hand the raw
  // elements to the parser, which classifies them into PageData chunks.
  const courseCells = $('td.formBody[colspan="3"]').get();
  return parsePage(courseCells);
};

interface GetCourseInfoAndNotesParams {
Expand Down Expand Up @@ -63,10 +64,14 @@ type ScrapePageReturn = Promise<ScrapePageReturnSync>;
/**
* Function scrapes all the course data on the given page
* Returns an array of courses on the page
* @param { puppeteer.Page } page Page to be scraped
* @param { string } url url to be scraped
* @returns { ScrapePageReturn }: All the data on the current page, along with all the courseWarnings found on that page
*/
const scrapePage = async (page: puppeteer.Page): ScrapePageReturn => {
const scrapePage = async (url: string): ScrapePageReturn => {
// Load the page
const response = await axios.get(url);
const $ = load(response.data);

const coursesData: TimetableData = {
Summer: [],
T1: [],
Expand All @@ -77,7 +82,7 @@ const scrapePage = async (page: puppeteer.Page): ScrapePageReturn => {
Other: [],
};
const courseWarnings: CourseWarning[] = [];
const pageChunks: PageData[] = await getChunks(page);
const pageChunks: PageData[] = await getChunks($);

for (const course of pageChunks) {
if (!course) {
Expand All @@ -87,10 +92,10 @@ const scrapePage = async (page: puppeteer.Page): ScrapePageReturn => {
let courseHead: CourseHead;
try {
// Get course code and name, that is not a chunk
courseHead = await getCourseHeadChunk(page);
courseHead = await getCourseHeadChunk($);
const parsedData = getCourseInfoAndNotes({
courseInfo: course.courseInfo,
url: page.url(),
url,
});
const { courseInfo } = parsedData;
const notes = parseNotes(parsedData.notes);
Expand Down Expand Up @@ -134,6 +139,7 @@ const scrapePage = async (page: puppeteer.Page): ScrapePageReturn => {
throw new Error(err);
}
}

return { coursesData, courseWarnings };
};

Expand Down
112 changes: 59 additions & 53 deletions scraper/src/page-scraper/ParsePage.ts
Original file line number Diff line number Diff line change
@@ -1,25 +1,24 @@
import { load, Element, SelectorType } from 'cheerio'
import { Chunk, ClassChunk, PageData } from "../scraper-helpers/interfaces";

/**
* Parses the tables on the page, extracts the courses on the page as chunks
* @param { HTMLElement[] } elements: List of table elements on the page that need to be parsed
* @param { Element[] } elements: List of table elements on the page that need to be parsed
* @returns { PageData[] }: List of course chunks, classified as a pageData object
*/
const parsePage = (elements: HTMLElement[]): PageData[] => {
const parsePage = (elements: Element[]): PageData[] => {
/**
* Extracts the tables on the page containing course data
* @param { HTMLElement[] } courseTables: List of all the tables on the page
* @returns { HTMLElement[][] }: List of elements that contain data about a course, group together so each list only contains chunks relevant to one course
* @param { Element[] } courseTables: List of all the tables on the page
* @returns { Element[][] }: List of elements that contain data about a course, grouped together so that each list only contains chunks relevant to one course
*/
const getCourseElements = (courseTables: HTMLElement[]): HTMLElement[][] => {
const elementList: HTMLElement[][] = [];
const tableTagName: string = "TABLE";
const getCourseElements = (courseTables: Element[]): Element[][] => {
const elementList: Element[][] = [];

for (const course of courseTables) {
// Get every td which has more than 1 table
const subtables = [...course.children].filter(
(element: HTMLElement): element is HTMLElement => element.tagName === tableTagName,
);
const $ = load(course);
const subtables = $('table').get();
if (subtables.length > 1) {
elementList.push(subtables);
}
Expand All @@ -33,10 +32,11 @@ const parsePage = (elements: HTMLElement[]): PageData[] => {
* Relies on the fact that notes follow "Class Notes" header
* @param subtable: Table tag equivalent to a class chunk
*/
const getClassNotes = (subtable: HTMLElement): string[] => {
const notes = [
...subtable.querySelectorAll<HTMLElement>('td.label[colspan="5"], font[color="red"]'),
].map((note) => note.innerText);
const getClassNotes = (subtable: Element): string[] => {
const $ = load(subtable);
const notes = $('td.label[colspan="5"], font[color="red"]')
.map((_, element) => $(element).text().replace(/\s+/g, ' ').trim())
.get();
const noteStartIndex = notes.indexOf("Class Notes");
let noteCount = 0;
let classNotes: string[] = [];
Expand All @@ -49,14 +49,14 @@ const parsePage = (elements: HTMLElement[]): PageData[] => {
};

interface GetClassTablesParams {
subtables: NodeListOf<HTMLElement>;
dataClassSelector: string;
subtables: Element[];
dataClassSelector: SelectorType;
}

/**
* Extracts all the classChunks from the page
* @param { NodeListOf<HTMLElement> } subtables: List of table elements that contain one class chunk each
* @param { string } dataClassSelector: selector to extract elements with the data class
* @param { Element[] } subtables: List of table elements that contain one class chunk each
* @param { SelectorType } dataClassSelector: selector to extract elements with the data class
* @returns { ClassChunk[] }: List of class chunks that were extracted
*/
const getClassTables = ({ subtables, dataClassSelector }: GetClassTablesParams): ClassChunk[] => {
Expand All @@ -66,9 +66,10 @@ const parsePage = (elements: HTMLElement[]): PageData[] => {
const tablelist: ClassChunk[] = [];
for (const subtable of subtables) {
// classNotes.push(getClassNotes(subtable))
const data = [...subtable.querySelectorAll<HTMLElement>(dataClassSelector)].map(
(element: HTMLElement) => element.innerText,
);
const $ = load(subtable);
const data = $(dataClassSelector)
.map((_, element) => $(element).text().replace(/\s+/g, ' ').trim())
.get();
const notes = getClassNotes(subtable);
tablelist.push({
data: data.slice(0, data.length - notes.length),
Expand All @@ -79,55 +80,58 @@ const parsePage = (elements: HTMLElement[]): PageData[] => {
};

interface GetCourseInfoChunkParams {
courseInfoElement: HTMLElement;
dataClassSelector: string;
courseInfoElement: Element;
dataClassSelector: SelectorType;
}

/**
* Extracts course info chunk from the page
* @param { HTMLElement } courseInfoElement: The dom element that contains the courseInfo chunk
* @param { string } dataClassSelector: selector to extract dom elements with the data class
* @param { Element } courseInfoElement: The dom element that contains the courseInfo chunk
* @param { SelectorType } dataClassSelector: selector to extract dom elements with the data class
* @returns { Chunk }: Extracted courseInfo chunk
*/
const getCourseInfoChunk = ({
courseInfoElement,
dataClassSelector,
}: GetCourseInfoChunkParams): Chunk =>
}: GetCourseInfoChunkParams): Chunk => {
// This should be the course info table. --> get data elements
({
data: [...courseInfoElement.querySelectorAll<HTMLElement>(dataClassSelector)].map(
(element) => element.innerText,
),
});
const $ = load(courseInfoElement);
return {
data: $(dataClassSelector)
.map((_, element) => $(element).text().replace(/\s+/g, ' ').trim())
.get()
};
}

/**
* Checks if the element contains a class chunk or not
* @param { HTMLElement } element: Chunk to be checked
* @param { Element } element: Chunk to be checked
* @returns { boolean }: true, if the element contains a class chunk, otherwise false
*/
const hasClassChunk = (element: HTMLElement): boolean => {
const hasClassChunk = (element: Element): boolean => {
// If the table has any element with id top, then it is the classes table.
const classQuery: string = 'a[href="#top"]';
const classlist: HTMLElement = element.querySelector(classQuery);
return classlist !== null;
const $ = load(element);
const classQuery: SelectorType = 'a[href="#top"]';
return $(classQuery).length !== 0;
};

/**
* Checks if the element has a note and no useful data
* @param { HTMLElement } element: The dom element to be checked
* @param { Element } element: The dom element to be checked
* @returns { boolean }: true, if the element has a note dom element, false otherwise
*/
const isNoteElement = (element: HTMLElement): boolean => {
const noteClassSelector: string = ".note";
const note: HTMLElement = element.querySelector(noteClassSelector);
return note !== null;
const isNoteElement = (element: Element): boolean => {
const $ = load(element);
const noteClassSelector: SelectorType = ".note";
return $(noteClassSelector).length !== 0;
};

/**
* Checks if the subtables indicate that the parent might contain a course info chunk
* @param { NodeListOf<HTMLElement> } subtables: The subtables that might be part of the table element that contains a courseInfoChunk
* @param { Element[] } subtables: The subtables that might be part of the table element that contains a courseInfoChunk
* @returns { boolean }: true, if the parent contains a course info chunk, false otherwise
*/
const hasCourseInfoChunk = (subtables: NodeListOf<HTMLElement>): boolean =>
const hasCourseInfoChunk = (subtables: Element[]): boolean => {
  // A parent table that carries a course-info chunk holds exactly three subtables.
  return subtables.length === 3;
};

interface ExtractChunksReturn {
Expand All @@ -137,13 +141,14 @@ const parsePage = (elements: HTMLElement[]): PageData[] => {

/**
* Extracts the course info and class chunks (if present) from the element
* @param { HTMLElement } element: Dom element to extract chunks from
* @param { Element } element: Dom element to extract chunks from
* @returns { ExtractChunksReturn }: The extracted course info and class chunks, if found
*/
const extractChunks = (element: HTMLElement): ExtractChunksReturn => {
const dataClassSelector: string = ".data";
const pathToInnerTable: string = ":scope > tbody > tr > td > table";
const subtables: NodeListOf<HTMLElement> = element.querySelectorAll(pathToInnerTable);
const extractChunks = (element: Element): ExtractChunksReturn => {
const $ = load(element);
const dataClassSelector: SelectorType = ".data";
const pathToInnerTable: SelectorType = ":root > tbody > tr > td > table";
const subtables: Element[] = $(pathToInnerTable).get();

if (hasClassChunk(element)) {
return {
Expand All @@ -168,21 +173,22 @@ const parsePage = (elements: HTMLElement[]): PageData[] => {

/**
* Extracts chunks from list of elements relating to a single course
* @param { HTMLElement[]} elements: list of elements relating to a single course
* @param { Element[] } elements: List of elements relating to a single course
* @returns { PageData }: extracted courseInfo and courseClasses chunks, formatted as a PageData object
*/
const parseCourse = (elements: HTMLElement[]): PageData => {
const dataClassSelector: string = ".data";
const parseCourse = (elements: Element[]): PageData => {
const dataClassSelector: SelectorType = ".data";

const courseClasses: ClassChunk[] = [];
let courseInfo: Chunk;
for (const element of elements) {
// If there are any data fields inside the chunk, then categorize it
const data: HTMLElement = element.querySelector(dataClassSelector);
const $ = load(element);
const data: Element = $(dataClassSelector).get(0);
if (data) {
const { classChunks, courseInfoChunk } = extractChunks(element);

if (courseInfoChunk) {
if (courseInfoChunk && !courseInfo) {
courseInfo = courseInfoChunk;
}

Expand Down
22 changes: 10 additions & 12 deletions scraper/src/page-scraper/chunk-scraper/CourseHeader.ts
Original file line number Diff line number Diff line change
@@ -1,28 +1,26 @@
import { Page } from "puppeteer";
import { CheerioAPI } from 'cheerio';

import { CourseHead } from "../../scraper-helpers/interfaces";
import { transformHtmlSpecials } from "./class-scraper/class-helpers/TransformHtmlSpecials";

/**
* Extracts the course header information, and splits it into the course code and name
* @param { puppeteer.Page } page: Page that is to be evaluated
* @param { CheerioAPI } $: Cheerio API loaded with page that is to be evaluated
* @returns { RegExpExecArray }: Runs the regex to extract data on the extracted data and returns the array
*/
const extractCourseHeadFromPage = async (page: Page): Promise<RegExpExecArray> =>
await page.evaluate(() => {
// Get the course code and course name
const courseHeader = document.getElementsByClassName("classSearchMinorHeading")[0].innerHTML;
const headerRegex = /(^[A-Z]{4}[0-9]{4})(.*)/;
return headerRegex.exec(courseHeader);
});
const extractCourseHeadFromPage = async ($: CheerioAPI): Promise<RegExpExecArray | null> => {
  // Get the course code and course name from the first minor-heading element.
  const courseHeader = $(".classSearchMinorHeading").first().text();
  // Group 1: course code (four letters + four digits); group 2: the rest (course name).
  const headerRegex = /(^[A-Z]{4}[0-9]{4})(.*)/;
  // exec() returns null when the heading does not match; the caller
  // (getCourseHeadChunk) already validates for that case, so surface it in the type.
  return headerRegex.exec(courseHeader);
};

/**
* Gets the data from the title of the course (course code, name)
* @param { puppeteer.Page } page: page which displays the data to scrape
* @param { CheerioAPI } $: Cheerio API loaded with page which displays the data to scrape
* @returns { Promise<CourseHead> }: Data about the title of the course: The course code and the course name
*/
const getCourseHeadChunk = async (page: Page): Promise<CourseHead> => {
const data = await extractCourseHeadFromPage(page);
const getCourseHeadChunk = async ($: CheerioAPI): Promise<CourseHead> => {
const data = await extractCourseHeadFromPage($);
// There must be at least 3 elements in courseHead
if (!(data && data.length > 2)) {
throw new Error(`Malformed course header: ${data}`);
Expand Down
2 changes: 1 addition & 1 deletion scraper/src/page-scraper/chunk-scraper/CourseInfo.ts
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ const parseCourseInfoChunk = ({
const notes: string[] = [];
// Find all the terms the course runs in
while (index < data.length) {
if (isTerm(data[index])) {
if (isTerm(data[index]) && !termsOffered.includes(data[index])) {
termsOffered.push(data[index]);
}

Expand Down
9 changes: 0 additions & 9 deletions scraper/src/page-scraper/page-helpers/GetHrefs.ts

This file was deleted.

Loading