From d7ea5bff882fea418849c338087e5e9d7703d299 Mon Sep 17 00:00:00 2001
From: Mircea Maierean
Date: Mon, 15 Apr 2024 17:35:30 +0300
Subject: [PATCH] add more data in scraping script

---
 code/demo_scraping.js                 | 29 ++++++++++++++++++++-------
 code/exercise_scraping.js             | 12 ++++++++----
 src/exercise-scraping.md              |  2 +-
 src/web-scraping.md                   | 12 +++++++++++-
 src/web-scraping/constants.md         |  1 -
 src/web-scraping/launching-browser.md |  1 -
 6 files changed, 42 insertions(+), 15 deletions(-)
 delete mode 100644 src/web-scraping/constants.md
 delete mode 100644 src/web-scraping/launching-browser.md

diff --git a/code/demo_scraping.js b/code/demo_scraping.js
index 753107f..d10cf30 100644
--- a/code/demo_scraping.js
+++ b/code/demo_scraping.js
@@ -1,14 +1,16 @@
 import puppeteer from "puppeteer";
+import fs from "fs";
 
 const URL =
   "https://www.olx.ro/imobiliare/apartamente-garsoniere-de-inchiriat/cluj-napoca/?currency=EUR";
 
-// the values of the selectors are just placeholders, we will update them live during the workshop
 const SELECTORS = {
   ANNOUNCEMENT: ".css-qfzx1y",
   TITLE: "h6",
   SURFACE: ".css-643j0o",
   PRICE: ".css-tyui9s",
+  IMAGE: "img",
+  URL: ".css-1apmciz .css-z3gu2d",
   NEXT_PAGE: `a[data-testid="pagination-forward"]`,
 };
 
@@ -23,7 +25,7 @@ async function run() {
   const data = [];
 
   while (true) {
-    console.log("Scraping page " + page.url());
+    console.log("Scraping page " + page.url());
     await page.waitForSelector(SELECTORS.ANNOUNCEMENT);
 
     const announcements = await page.$$(SELECTORS.ANNOUNCEMENT);
@@ -33,39 +35,52 @@ async function run() {
       const title = await announcement.$eval(
         SELECTORS.TITLE,
-        (el) => el.textContent,
+        (el) => el.textContent
       );
 
       const price = await announcement.$eval(
         SELECTORS.PRICE,
-        (el) => el.textContent,
+        (el) => el.textContent
       );
 
       try {
         surface = await announcement.$eval(
           SELECTORS.SURFACE,
-          (el) => el.textContent,
+          (el) => el.textContent
         );
       } catch (e) {
         // do nothing
       }
+
+      const imagePath = await announcement.$eval(
+        SELECTORS.IMAGE,
+        (el) => el.src
+      );
+      const url = await announcement.$eval(SELECTORS.URL, (el) => el.href);
       data.push({
         title: title,
         price: price,
         surface: surface,
+        imagePath: imagePath,
+        url: url,
       });
     }
 
     try {
       const nextPageURL = await page.$eval(
         SELECTORS.NEXT_PAGE,
-        (el) => el.href,
+        (el) => el.href
       );
       await page.goto(nextPageURL);
     } catch (e) {
-      console.log(e);
+      console.log("No more pages to scrape");
       break;
     }
   }
 
+  fs.writeFile("data.json", JSON.stringify(data), (err) => {
+    if (err) throw err;
+    console.log("File saved");
+  });
+
   console.log("closing browser");
   await browser.close();
diff --git a/code/exercise_scraping.js b/code/exercise_scraping.js
index 7944fe3..e2385ac 100644
--- a/code/exercise_scraping.js
+++ b/code/exercise_scraping.js
@@ -11,6 +11,7 @@ const SELECTORS = {
   DESCRIPTION: ".EventRectangle-styles-description-sDn7J",
   IMAGE: ".EventRectangle-styles-picture-SPjDJ",
   TAGS: ".MuiChip-label",
+  URL: ".EventRectangle-styles-viewDetails-PsfIW",
   LOAD_MORE: ".loadMoreContainer a",
 };
 
@@ -41,20 +42,22 @@ async function run() {
 
     const location = await event.$eval(
       SELECTORS.LOCATION,
-      (el) => el.innerText,
+      (el) => el.innerText
     );
 
     const description = await event.$eval(
       SELECTORS.DESCRIPTION,
-      (el) => el.innerText,
+      (el) => el.innerText
     );
 
     const image = await event.$eval(SELECTORS.IMAGE, (el) => el.src);
 
     const tags = await event.$$eval(SELECTORS.TAGS, (tags) =>
-      tags.map((tag) => tag.innerText),
+      tags.map((tag) => tag.innerText)
     );
 
+    const url = await event.$eval(SELECTORS.URL, (el) => el.href);
+
     data.push({
       title: title,
       date: date,
@@ -62,10 +65,11 @@ async function run() {
       description: description,
       image: image,
       tags: tags,
+      url: url,
     });
   }
 
-  fs.writeFile("gdsc_events.json", JSON.stringify(data), (err) => {
+  fs.writeFile("data.json", JSON.stringify(data), (err) => {
     if (err) throw err;
     console.log("File saved");
   });
diff --git a/src/exercise-scraping.md b/src/exercise-scraping.md
index dc534bb..b3eac5d 100644
--- a/src/exercise-scraping.md
+++ b/src/exercise-scraping.md
@@ -4,7 +4,7 @@ Congrats! You have successfully completed the web scraping demo. Now it's time t
 
 ## Objective
 
-You are the biggest GDSC fan, and you want to attend as many workshops as possible. You are able to go anywhere in the world, so the location is not a problem. For this, you want to scrape the GDSC events from the [GDSC website](https://gdsc.community.dev/events/#/list). For each event, you want to extract the title, date, location, description, image link, and the tags associated with the event.
+You are the biggest GDSC fan, and you want to attend as many workshops as possible. You are able to go anywhere in the world, so the location is not a problem. For this, you want to scrape the GDSC events from the [GDSC website](https://gdsc.community.dev/events/#/list). For each event, you want to extract the title, date, location, description, image link, the tags associated with the event, and the URL.
 
diff --git a/src/web-scraping.md b/src/web-scraping.md
index 9672fd5..9b83ae1 100644
--- a/src/web-scraping.md
+++ b/src/web-scraping.md
@@ -8,7 +8,7 @@ To begin with, we'll analyze the website, and identify the structure of the page
 
 OLX
 
-We can see that the website has a list of announcements for apartments in Cluj-Napoca. Each announcement has a title, a price, a location (always Cluj Napoca), and a surface area. For each unique aspect of the announcement, we will try to identify the corresponding selector.
+We can see that the website has a list of announcements for apartments in Cluj-Napoca. Each announcement has a title, a price, a location (always Cluj-Napoca), a surface area, an image, and a URL. For each unique aspect of the announcement, we will try to identify the corresponding selector.
 
 ```javascript
 import puppeteer from "puppeteer";
@@ -22,6 +22,8 @@ const SELECTORS = {
   TITLE: ".offer-title",
   PRICE: ".price",
   SURFACE: ".surface",
+  URL: ".url",
+  IMAGE: ".image",
 };
 ```
 
@@ -78,10 +80,16 @@ for (const announcement of announcements) {
     // do nothing
   }
 
+  const imagePath = await announcement.$eval(SELECTORS.IMAGE, (el) => el.src);
+
+  const url = await announcement.$eval(SELECTORS.URL, (el) => el.href);
+
   data.push({
     title: title,
     price: price,
     surface: surface,
+    image: imagePath,
+    url: url,
   });
 }
 ```
@@ -100,6 +108,8 @@ const SELECTORS = {
   TITLE: ".offer-title",
   PRICE: ".price",
   SURFACE: ".surface",
+  URL: ".url",
+  IMAGE: ".image",
   NEXT_PAGE: ".pager .next a",
 };
 ```
diff --git a/src/web-scraping/constants.md b/src/web-scraping/constants.md
deleted file mode 100644
index 27db1a3..0000000
--- a/src/web-scraping/constants.md
+++ /dev/null
@@ -1 +0,0 @@
-# Setting up constants
diff --git a/src/web-scraping/launching-browser.md b/src/web-scraping/launching-browser.md
deleted file mode 100644
index 48b6808..0000000
--- a/src/web-scraping/launching-browser.md
+++ /dev/null
@@ -1 +0,0 @@
-# Launching the browser
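The extra data this patch collects comes down to two more `$eval` calls per announcement: one reading the image `src` and one reading the link `href`. The sketch below isolates that pattern on its own, assuming the same OLX listing page and the generated CSS class selectors taken from the diff (`.css-qfzx1y`, `.css-tyui9s`, `.css-1apmciz .css-z3gu2d`); those class names are tied to the site's current markup and may stop matching at any time. The `extractField` helper is a hypothetical wrapper added here so a non-matching selector yields `null` instead of throwing.

```javascript
import puppeteer from "puppeteer";

const URL =
  "https://www.olx.ro/imobiliare/apartamente-garsoniere-de-inchiriat/cluj-napoca/?currency=EUR";

// Selectors copied from the patch above; they are generated class names and
// are only assumed to still match the live page.
const SELECTORS = {
  ANNOUNCEMENT: ".css-qfzx1y",
  TITLE: "h6",
  PRICE: ".css-tyui9s",
  IMAGE: "img",
  URL: ".css-1apmciz .css-z3gu2d",
};

// Hypothetical helper: read one field from an announcement card, returning
// null when the selector does not match instead of throwing.
async function extractField(announcement, selector, getter) {
  try {
    return await announcement.$eval(selector, getter);
  } catch (e) {
    return null;
  }
}

async function run() {
  const browser = await puppeteer.launch({ headless: true });
  const page = await browser.newPage();
  await page.goto(URL);
  await page.waitForSelector(SELECTORS.ANNOUNCEMENT);

  const announcements = await page.$$(SELECTORS.ANNOUNCEMENT);
  const data = [];

  for (const announcement of announcements) {
    data.push({
      title: await extractField(announcement, SELECTORS.TITLE, (el) => el.textContent),
      price: await extractField(announcement, SELECTORS.PRICE, (el) => el.textContent),
      imagePath: await extractField(announcement, SELECTORS.IMAGE, (el) => el.src),
      url: await extractField(announcement, SELECTORS.URL, (el) => el.href),
    });
  }

  console.log(data);
  await browser.close();
}

run();
```

Compared with the patched script, which only guards the surface field with try/catch, wrapping every per-field read keeps a single renamed class from aborting the whole page.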
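The patch persists the results with the callback form of `fs.writeFile` before closing the browser. A minimal alternative sketch, assuming Node's built-in `fs/promises` module, awaits the write so that later steps only run once `data.json` is fully on disk and so write errors surface as rejections; the `saveAndVerify` name and the sample record are illustrative, not part of the workshop code.

```javascript
import { writeFile, readFile } from "fs/promises";

async function saveAndVerify(data) {
  // Await the write so anything that follows (closing the browser, logging)
  // only runs after data.json is fully written.
  await writeFile("data.json", JSON.stringify(data, null, 2));

  // Read the file back to show the shape that later steps would consume.
  const saved = JSON.parse(await readFile("data.json", "utf8"));
  console.log(`Saved ${saved.length} announcements to data.json`);
}

// Hypothetical usage with a single hard-coded announcement.
saveAndVerify([
  {
    title: "Apartament 2 camere",
    price: "450 €",
    surface: "54 m²",
    imagePath: null,
    url: null,
  },
]).catch((err) => {
  console.error(err);
});
```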