Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: refactor to use cheerio instead of Puppeteer #149

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
22 changes: 0 additions & 22 deletions scraper/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,33 +10,11 @@ RUN yarn build
FROM node:18.19.0-alpine as runner
ENV NODE_ENV production

# Installs latest Chromium (92) package.
RUN apk add --no-cache \
chromium \
nss \
freetype \
harfbuzz \
ca-certificates \
ttf-freefont

# Tell Puppeteer to skip installing Chrome. We'll be using the installed package.
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \
PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser

WORKDIR /app

COPY package.json yarn.lock ./
RUN yarn install --frozen-lockfile

# Add user so we don't need --no-sandbox.
RUN addgroup -S pptruser && adduser -S -G pptruser pptruser \
&& mkdir -p /home/pptruser/Downloads /app \
&& chown -R pptruser:pptruser /home/pptruser \
&& chown -R pptruser:pptruser /app

# Run everything after as non-privileged user.
USER pptruser

COPY --from=builder /app/dist ./dist

CMD yarn scrape
6 changes: 3 additions & 3 deletions scraper/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@
"@sentry/node": "7.92.0",
"@sentry/tracing": "7.92.0",
"axios": "0.27.2",
"lodash": "4.17.21",
"puppeteer": "14.4.1"
"cheerio": "^1.0.0-rc.12",
"lodash": "4.17.21"
},
"devDependencies": {
"@types/lodash": "4.14.202",
"@types/puppeteer": "5.4.7",
"@types/node": "^20.2.5",
"prettier": "2.8.8",
"typescript": "4.9.5"
}
Expand Down
26 changes: 16 additions & 10 deletions scraper/src/page-scraper/PageScraper.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import puppeteer from "puppeteer";
import { load, CheerioAPI } from 'cheerio';

import {
Chunk,
Expand All @@ -16,15 +16,16 @@ import { parsePage } from "./ParsePage";
import { parseNotes } from "./page-helpers/GetNotes";
import { getClassesByTerm } from "./page-helpers/GetClassesByTerm";
import { getCourseWarningsFromClassWarnings } from "./page-helpers/GetCourseWarnings";
import axios from 'axios';

/**
* Breaks the page down into relevant chunks from which data can be extracted
* @param { puppeteer.Page } page: page to be broken down into chunks
* @param { CheerioAPI } $: Cheerio API loaded with page to be broken down into chunks
* @returns { Promise<PageData[]> }: Extracted data as a course info chunk and list of class chunks to be parsed
*/
const getChunks = async (page: puppeteer.Page): Promise<PageData[]> => {
const tableSelector: string = '[class="formBody"][colspan="3"]';
return await page.$$eval(tableSelector, parsePage);
const getChunks = async ($: CheerioAPI): Promise<PageData[]> => {
  // Select every course table cell on the loaded page and hand the raw
  // elements to the parser, which classifies them into PageData chunks.
  const courseCells = $('td.formBody[colspan="3"]').get();
  return parsePage(courseCells);
};

interface GetCourseInfoAndNotesParams {
Expand Down Expand Up @@ -63,10 +64,14 @@ type ScrapePageReturn = Promise<ScrapePageReturnSync>;
/**
* Function scrapes all the course data on the given page
* Returns an array of courses on the page
* @param { puppeteer.Page } page Page to be scraped
* @param { string } url url to be scraped
* @returns { ScrapePageReturn }: All the data on the current page, along with all the courseWarnings found on that page
*/
const scrapePage = async (page: puppeteer.Page): ScrapePageReturn => {
const scrapePage = async (url: string): ScrapePageReturn => {
// Load the page
const response = await axios.get(url);
const $ = load(response.data);

const coursesData: TimetableData = {
Summer: [],
T1: [],
Expand All @@ -77,7 +82,7 @@ const scrapePage = async (page: puppeteer.Page): ScrapePageReturn => {
Other: [],
};
const courseWarnings: CourseWarning[] = [];
const pageChunks: PageData[] = await getChunks(page);
const pageChunks: PageData[] = await getChunks($);

for (const course of pageChunks) {
if (!course) {
Expand All @@ -87,10 +92,10 @@ const scrapePage = async (page: puppeteer.Page): ScrapePageReturn => {
let courseHead: CourseHead;
try {
// Get course code and name, that is not a chunk
courseHead = await getCourseHeadChunk(page);
courseHead = await getCourseHeadChunk($);
const parsedData = getCourseInfoAndNotes({
courseInfo: course.courseInfo,
url: page.url(),
url,
});
const { courseInfo } = parsedData;
const notes = parseNotes(parsedData.notes);
Expand Down Expand Up @@ -134,6 +139,7 @@ const scrapePage = async (page: puppeteer.Page): ScrapePageReturn => {
throw new Error(err);
}
}

return { coursesData, courseWarnings };
};

Expand Down
112 changes: 59 additions & 53 deletions scraper/src/page-scraper/ParsePage.ts
Original file line number Diff line number Diff line change
@@ -1,25 +1,24 @@
import { load, Element, SelectorType } from 'cheerio'
import { Chunk, ClassChunk, PageData } from "../scraper-helpers/interfaces";

/**
* Parses the tables on the page, extracts the courses on the page as chunks
* @param { HTMLElement[] } elements: List of table elements on the page that need to be parsed
* @param { Element[] } elements: List of table elements on the page that need to be parsed
* @returns { PageData[] }: List of course chunks, classified as a pageData object
*/
const parsePage = (elements: HTMLElement[]): PageData[] => {
const parsePage = (elements: Element[]): PageData[] => {
/**
* Extracts the tables on the page containing course data
* @param { HTMLElement[] } courseTables: List of all the tables on the page
* @returns { HTMLElement[][] }: List of elements that contain data about a course, group together so each list only contains chunks relevant to one course
* @param { Element[] } courseTables: List of all the tables on the page
* @returns { Element[][] }: List of elements that contain data about a course, grouped together so that each list only contains chunks relevant to one course
*/
const getCourseElements = (courseTables: HTMLElement[]): HTMLElement[][] => {
const elementList: HTMLElement[][] = [];
const tableTagName: string = "TABLE";
const getCourseElements = (courseTables: Element[]): Element[][] => {
const elementList: Element[][] = [];

for (const course of courseTables) {
// Get every td which has more than 1 table
const subtables = [...course.children].filter(
(element: HTMLElement): element is HTMLElement => element.tagName === tableTagName,
);
const $ = load(course);
const subtables = $('table').get();
if (subtables.length > 1) {
elementList.push(subtables);
}
Expand All @@ -33,10 +32,11 @@ const parsePage = (elements: HTMLElement[]): PageData[] => {
* Relies on the fact that notes follow "Class Notes" header
* @param subtable: Table tag equivalent to a class chunk
*/
const getClassNotes = (subtable: HTMLElement): string[] => {
const notes = [
...subtable.querySelectorAll<HTMLElement>('td.label[colspan="5"], font[color="red"]'),
].map((note) => note.innerText);
const getClassNotes = (subtable: Element): string[] => {
const $ = load(subtable);
const notes = $('td.label[colspan="5"], font[color="red"]')
.map((_, element) => $(element).text().replace(/\s+/g, ' ').trim())
.get();
const noteStartIndex = notes.indexOf("Class Notes");
let noteCount = 0;
let classNotes: string[] = [];
Expand All @@ -49,14 +49,14 @@ const parsePage = (elements: HTMLElement[]): PageData[] => {
};

interface GetClassTablesParams {
subtables: NodeListOf<HTMLElement>;
dataClassSelector: string;
subtables: Element[];
dataClassSelector: SelectorType;
}

/**
* Extracts all the classChunks from the page
* @param { NodeListOf<HTMLElement> } subtables: List of table elements that contain one class chunk each
* @param { string } dataClassSelector: selector to extract elements with the data class
* @param { Element[] } subtables: List of table elements that contain one class chunk each
* @param { SelectorType } dataClassSelector: selector to extract elements with the data class
* @returns { ClassChunk[] }: List of class chunks that were extracted
*/
const getClassTables = ({ subtables, dataClassSelector }: GetClassTablesParams): ClassChunk[] => {
Expand All @@ -66,9 +66,10 @@ const parsePage = (elements: HTMLElement[]): PageData[] => {
const tablelist: ClassChunk[] = [];
for (const subtable of subtables) {
// classNotes.push(getClassNotes(subtable))
const data = [...subtable.querySelectorAll<HTMLElement>(dataClassSelector)].map(
(element: HTMLElement) => element.innerText,
);
const $ = load(subtable);
const data = $(dataClassSelector)
.map((_, element) => $(element).text().replace(/\s+/g, ' ').trim())
.get();
const notes = getClassNotes(subtable);
tablelist.push({
data: data.slice(0, data.length - notes.length),
Expand All @@ -79,55 +80,58 @@ const parsePage = (elements: HTMLElement[]): PageData[] => {
};

interface GetCourseInfoChunkParams {
courseInfoElement: HTMLElement;
dataClassSelector: string;
courseInfoElement: Element;
dataClassSelector: SelectorType;
}

/**
* Extracts course info chunk from the page
* @param { HTMLElement } courseInfoElement: The dom element that contains the courseInfo chunk
* @param { string } dataClassSelector: selector to extract dom elements with the data class
* @param { Element } courseInfoElement: The dom element that contains the courseInfo chunk
* @param { SelectorType } dataClassSelector: selector to extract dom elements with the data class
* @returns { Chunk }: Extracted courseInfo chunk
*/
const getCourseInfoChunk = ({
courseInfoElement,
dataClassSelector,
}: GetCourseInfoChunkParams): Chunk =>
}: GetCourseInfoChunkParams): Chunk => {
// This should be the course info table. --> get data elements
({
data: [...courseInfoElement.querySelectorAll<HTMLElement>(dataClassSelector)].map(
(element) => element.innerText,
),
});
const $ = load(courseInfoElement);
return {
data: $(dataClassSelector)
.map((_, element) => $(element).text().replace(/\s+/g, ' ').trim())
.get()
};
}

/**
* Checks if the element contains a class chunk or not
* @param { HTMLElement } element: Chunk to be checked
* @param { Element } element: Chunk to be checked
* @returns { boolean }: true, if the element contains a class chunk, otherwise false
*/
const hasClassChunk = (element: HTMLElement): boolean => {
const hasClassChunk = (element: Element): boolean => {
// If the table has any element with id top, then it is the classes table.
const classQuery: string = 'a[href="#top"]';
const classlist: HTMLElement = element.querySelector(classQuery);
return classlist !== null;
const $ = load(element);
const classQuery: SelectorType = 'a[href="#top"]';
return $(classQuery).length !== 0;
};

/**
* Checks if the element has a note and no useful data
* @param { HTMLElement } element: The dom element to be checked
* @param { Element } element: The dom element to be checked
* @returns { boolean }: true, if the element has a note dom element, false otherwise
*/
const isNoteElement = (element: HTMLElement): boolean => {
const noteClassSelector: string = ".note";
const note: HTMLElement = element.querySelector(noteClassSelector);
return note !== null;
const isNoteElement = (element: Element): boolean => {
const $ = load(element);
const noteClassSelector: SelectorType = ".note";
return $(noteClassSelector).length !== 0;
};

/**
* Checks if the subtables indicate that the parent might contain a course info chunk
* @param { NodeListOf<HTMLElement> } subtables: The subtables that might be part of the table element that contains a courseInfoChunk
* @param { Element[] } subtables: The subtables that might be part of the table element that contains a courseInfoChunk
* @returns { boolean }: true, if the parent contains a course info chunk, false otherwise
*/
const hasCourseInfoChunk = (subtables: NodeListOf<HTMLElement>): boolean =>
const hasCourseInfoChunk = (subtables: Element[]): boolean => {
  // A parent table that carries a course-info chunk holds exactly three subtables.
  return subtables.length === 3;
};

interface ExtractChunksReturn {
Expand All @@ -137,13 +141,14 @@ const parsePage = (elements: HTMLElement[]): PageData[] => {

/**
* Extracts the course info and class chunks (if present) from the element
* @param { HTMLElement } element: Dom element to extract chunks from
* @param { Element } element: Dom element to extract chunks from
* @returns { ExtractChunksReturn }: The extracted course info and class chunks, if found
*/
const extractChunks = (element: HTMLElement): ExtractChunksReturn => {
const dataClassSelector: string = ".data";
const pathToInnerTable: string = ":scope > tbody > tr > td > table";
const subtables: NodeListOf<HTMLElement> = element.querySelectorAll(pathToInnerTable);
const extractChunks = (element: Element): ExtractChunksReturn => {
const $ = load(element);
const dataClassSelector: SelectorType = ".data";
const pathToInnerTable: SelectorType = ":root > tbody > tr > td > table";
const subtables: Element[] = $(pathToInnerTable).get();

if (hasClassChunk(element)) {
return {
Expand All @@ -168,21 +173,22 @@ const parsePage = (elements: HTMLElement[]): PageData[] => {

/**
* Extracts chunks from list of elements relating to a single course
* @param { HTMLElement[]} elements: list of elements relating to a single course
* @param { Element[] } elements: List of elements relating to a single course
* @returns { PageData }: extracted courseInfo and courseClasses chunks, formatted as a PageData object
*/
const parseCourse = (elements: HTMLElement[]): PageData => {
const dataClassSelector: string = ".data";
const parseCourse = (elements: Element[]): PageData => {
const dataClassSelector: SelectorType = ".data";

const courseClasses: ClassChunk[] = [];
let courseInfo: Chunk;
for (const element of elements) {
// If there are any data fields inside the chunk, then categorize it
const data: HTMLElement = element.querySelector(dataClassSelector);
const $ = load(element);
const data: Element = $(dataClassSelector).get(0);
if (data) {
const { classChunks, courseInfoChunk } = extractChunks(element);

if (courseInfoChunk) {
if (courseInfoChunk && !courseInfo) {
courseInfo = courseInfoChunk;
}

Expand Down
22 changes: 10 additions & 12 deletions scraper/src/page-scraper/chunk-scraper/CourseHeader.ts
Original file line number Diff line number Diff line change
@@ -1,28 +1,26 @@
import { Page } from "puppeteer";
import { CheerioAPI } from 'cheerio';

import { CourseHead } from "../../scraper-helpers/interfaces";
import { transformHtmlSpecials } from "./class-scraper/class-helpers/TransformHtmlSpecials";

/**
* Extracts the course header information, and splits it into the course code and name
* @param { puppeteer.Page } page: Page that is to be evaluated
* @param { CheerioAPI } $: Cheerio API loaded with page that is to be evaluated
* @returns { RegExpExecArray }: Runs the regex to extract data on the extracted data and returns the array
*/
const extractCourseHeadFromPage = async (page: Page): Promise<RegExpExecArray> =>
await page.evaluate(() => {
// Get the course code and course name
const courseHeader = document.getElementsByClassName("classSearchMinorHeading")[0].innerHTML;
const headerRegex = /(^[A-Z]{4}[0-9]{4})(.*)/;
return headerRegex.exec(courseHeader);
});
const extractCourseHeadFromPage = async ($: CheerioAPI): Promise<RegExpExecArray | null> => {
  // Get the course code and course name from the first minor-heading element.
  const courseHeader = $(".classSearchMinorHeading").first().text();
  // Group 1: course code (four letters + four digits); group 2: the rest (course name).
  const headerRegex = /(^[A-Z]{4}[0-9]{4})(.*)/;
  // exec() returns null when the heading does not match; the caller
  // (getCourseHeadChunk) already validates for that case, so surface it in the type.
  return headerRegex.exec(courseHeader);
};

/**
* Gets the data from the title of the course (course code, name)
* @param { puppeteer.Page } page: page which displays the data to scrape
* @param { CheerioAPI } $: Cheerio API loaded with page which displays the data to scrape
* @returns { Promise<CourseHead> }: Data about the title of the course: The course code and the course name
*/
const getCourseHeadChunk = async (page: Page): Promise<CourseHead> => {
const data = await extractCourseHeadFromPage(page);
const getCourseHeadChunk = async ($: CheerioAPI): Promise<CourseHead> => {
const data = await extractCourseHeadFromPage($);
// There must be at least 3 elements in courseHead
if (!(data && data.length > 2)) {
throw new Error(`Malformed course header: ${data}`);
Expand Down
2 changes: 1 addition & 1 deletion scraper/src/page-scraper/chunk-scraper/CourseInfo.ts
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ const parseCourseInfoChunk = ({
const notes: string[] = [];
// Find all the terms the course runs in
while (index < data.length) {
if (isTerm(data[index])) {
if (isTerm(data[index]) && !termsOffered.includes(data[index])) {
termsOffered.push(data[index]);
}

Expand Down
9 changes: 0 additions & 9 deletions scraper/src/page-scraper/page-helpers/GetHrefs.ts

This file was deleted.

Loading