add more data in scraping script

UBBGDSC · Apr 15, 2024 · d7ea5bf · d7ea5bf
1 parent de2fe35
commit d7ea5bf
Show file tree

Hide file tree

Showing 6 changed files with 42 additions and 15 deletions.
diff --git a/code/demo_scraping.js b/code/demo_scraping.js
@@ -1,14 +1,16 @@
 import puppeteer from "puppeteer";
+import fs from "fs";
 
 const URL =
   "https://www.olx.ro/imobiliare/apartamente-garsoniere-de-inchiriat/cluj-napoca/?currency=EUR";
 
-// the values of the selectors are just placeholders, we will update them live during the workshop
 const SELECTORS = {
   ANNOUNCEMENT: ".css-qfzx1y",
   TITLE: "h6",
   SURFACE: ".css-643j0o",
   PRICE: ".css-tyui9s",
+  IMAGE: "img",
+  URL: ".css-1apmciz .css-z3gu2d",
   NEXT_PAGE: `a[data-testid="pagination-forward"]`,
 };
 
@@ -23,7 +25,7 @@ async function run() {
   const data = [];
 
   while (true) {
-    console.log("Scraping page " + page.url());
+    console.log("Scrapping page " + page.url());
 
     await page.waitForSelector(SELECTORS.ANNOUNCEMENT);
     const announcements = await page.$$(SELECTORS.ANNOUNCEMENT);
@@ -33,39 +35,52 @@ async function run() {
 
       const title = await announcement.$eval(
         SELECTORS.TITLE,
-        (el) => el.textContent,
+        (el) => el.textContent
       );
       const price = await announcement.$eval(
         SELECTORS.PRICE,
-        (el) => el.textContent,
+        (el) => el.textContent
       );
       try {
         surface = await announcement.$eval(
           SELECTORS.SURFACE,
-          (el) => el.textContent,
+          (el) => el.textContent
         );
       } catch (e) {
         // do nothing
       }
+      const imagePath = await announcement.$eval(
+        SELECTORS.IMAGE,
+        (el) => el.src
+      );
+
+      const url = await announcement.$eval(SELECTORS.URL, (el) => el.href);
 
       data.push({
         title: title,
         price: price,
         surface: surface,
+        imagePath: imagePath,
+        url: url,
       });
     }
     try {
       const nextPageURL = await page.$eval(
         SELECTORS.NEXT_PAGE,
-        (el) => el.href,
+        (el) => el.href
       );
       await page.goto(nextPageURL);
     } catch (e) {
-      console.log(e);
+      console.log("No more pages to scrap");
       break;
     }
   }
 
+  fs.writeFile("data.json", JSON.stringify(data), (err) => {
+    if (err) throw err;
+    console.log("File Saved");
+  });
+
   console.log("closing browser");
 
   await browser.close();

diff --git a/code/exercise_scraping.js b/code/exercise_scraping.js
@@ -11,6 +11,7 @@ const SELECTORS = {
   DESCRIPTION: ".EventRectangle-styles-description-sDn7J",
   IMAGE: ".EventRectangle-styles-picture-SPjDJ",
   TAGS: ".MuiChip-label",
+  URL: ".EventRectangle-styles-viewDetails-PsfIW",
   LOAD_MORE: ".loadMoreContainer a",
 };
 
@@ -41,31 +42,34 @@ async function run() {
 
     const location = await event.$eval(
       SELECTORS.LOCATION,
-      (el) => el.innerText,
+      (el) => el.innerText
     );
 
     const description = await event.$eval(
       SELECTORS.DESCRIPTION,
-      (el) => el.innerText,
+      (el) => el.innerText
     );
 
     const image = await event.$eval(SELECTORS.IMAGE, (el) => el.src);
 
     const tags = await event.$$eval(SELECTORS.TAGS, (tags) =>
-      tags.map((tag) => tag.innerText),
+      tags.map((tag) => tag.innerText)
     );
 
+    const url = await event.$eval(SELECTORS.URL, (el) => el.href);
+
     data.push({
       title: title,
       date: date,
       location: location,
       description: description,
       image: image,
       tags: tags,
+      url: url,
     });
   }
 
-  fs.writeFile("gdsc_events.json", JSON.stringify(data), (err) => {
+  fs.writeFile("data.json", JSON.stringify(data), (err) => {
     if (err) throw err;
     console.log("File saved");
   });

diff --git a/src/exercise-scraping.md b/src/exercise-scraping.md
@@ -4,7 +4,7 @@ Congrats! You have successfully completed the web scraping demo. Now it's time t
 
 ## Objective
 
-You are the biggest GDSC fan, and you want to attend as many workshops as possible. You are able to go anywhere in the world, so the location is not a problem. For this, you want to scrape the GDSC events from the [GDSC website](https://gdsc.community.dev/events/#/list). For each event, you want to extract the title, date, location, description, image link, and the tags associated with the event.
+You are the biggest GDSC fan, and you want to attend as many workshops as possible. You are able to go anywhere in the world, so the location is not a problem. For this, you want to scrape the GDSC events from the [GDSC website](https://gdsc.community.dev/events/#/list). For each event, you want to extract the title, date, location, description, image link, the tags associated with the event, and the URL.
 
 <img src="./images/upcoming_events.png" alt="Go Away" width="900"/>
 

diff --git a/src/web-scraping.md b/src/web-scraping.md
@@ -8,7 +8,7 @@ To begin with, we'll analyze the website, and identify the structure of the page
 
 <img src="./images/olx.png" alt="OLX" width="1000"/>
 
-We can see that the website has a list of announcements for apartments in Cluj-Napoca. Each announcement has a title, a price, a location (always Cluj Napoca), and a surface area. For each unique aspect of the announcement, we will try to identify the corresponding selector.
+We can see that the website has a list of announcements for apartments in Cluj-Napoca. Each announcement has a title, a price, a location (always Cluj Napoca), a surface area, an image, and a URL. For each unique aspect of the announcement, we will try to identify the corresponding selector.
 
 ```javascript
 import puppeteer from "puppeteer";
@@ -22,6 +22,8 @@ const SELECTORS = {
   TITLE: ".offer-title",
   PRICE: ".price",
   SURFACE: ".surface",
+  URL: ".url",
+  IMAGE: ".image",
 };
 ```
 
@@ -78,10 +80,16 @@ for (const announcement of announcements) {
     // do nothing
   }
 
+  const imagePath = await announcement.$eval(SELECTORS.IMAGE, (el) => el.src);
+
+  const url = await announcement.$eval(SELECTORS.URL, (el) => el.href);
+
   data.push({
     title: title,
     price: price,
     surface: surface,
+    image: imagePath,
+    url: url,
   });
 }
 ```
@@ -100,6 +108,8 @@ const SELECTORS = {
   TITLE: ".offer-title",
   PRICE: ".price",
   SURFACE: ".surface",
+  URL: ".url",
+  IMAGE: ".image",
   NEXT_PAGE: ".pager .next a",
 };
 ```

diff --git a/src/web-scraping/constants.md b/src/web-scraping/constants.md
diff --git a/src/web-scraping/launching-browser.md b/src/web-scraping/launching-browser.md