Skip to content

Commit

Permalink
add more data in scraping script
Browse files Browse the repository at this point in the history
  • Loading branch information
mirceamaierean committed Apr 15, 2024
1 parent de2fe35 commit d7ea5bf
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 15 deletions.
29 changes: 22 additions & 7 deletions code/demo_scraping.js
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
import puppeteer from "puppeteer";
import fs from "fs";

/**
 * Start page for the demo scraper: the OLX listing of apartments/studios for
 * rent in Cluj-Napoca, requested with the `currency=EUR` query parameter.
 */
const URL =
  "https://www.olx.ro/imobiliare/apartamente-garsoniere-de-inchiriat/cluj-napoca/?currency=EUR";

/**
 * CSS selectors used by the scraper, keyed by the piece of data they locate.
 *
 * NOTE: the values below are only placeholders; they are updated live during
 * the workshop.
 */
const SELECTORS = {
  // Matches every announcement on the page; the other per-field selectors
  // are evaluated relative to each matched element.
  ANNOUNCEMENT: ".css-qfzx1y",
  // Element whose text content is the announcement title.
  TITLE: "h6",
  // Element whose text content is the surface; lookup failures are tolerated.
  SURFACE: ".css-643j0o",
  // Element whose text content is the price.
  PRICE: ".css-tyui9s",
  // Element whose `src` is stored as the image path.
  IMAGE: "img",
  // Element whose `href` is stored as the announcement URL.
  URL: ".css-1apmciz .css-z3gu2d",
  // Pagination link whose `href` is followed to reach the next page.
  NEXT_PAGE: 'a[data-testid="pagination-forward"]',
};

Expand All @@ -23,7 +25,7 @@ async function run() {
const data = [];

while (true) {
console.log("Scraping page " + page.url());
console.log("Scrapping page " + page.url());

await page.waitForSelector(SELECTORS.ANNOUNCEMENT);
const announcements = await page.$$(SELECTORS.ANNOUNCEMENT);
Expand All @@ -33,39 +35,52 @@ async function run() {

const title = await announcement.$eval(
SELECTORS.TITLE,
(el) => el.textContent,
(el) => el.textContent
);
const price = await announcement.$eval(
SELECTORS.PRICE,
(el) => el.textContent,
(el) => el.textContent
);
try {
surface = await announcement.$eval(
SELECTORS.SURFACE,
(el) => el.textContent,
(el) => el.textContent
);
} catch (e) {
// do nothing
}
const imagePath = await announcement.$eval(
SELECTORS.IMAGE,
(el) => el.src
);

const url = await announcement.$eval(SELECTORS.URL, (el) => el.href);

data.push({
title: title,
price: price,
surface: surface,
imagePath: imagePath,
url: url,
});
}
try {
const nextPageURL = await page.$eval(
SELECTORS.NEXT_PAGE,
(el) => el.href,
(el) => el.href
);
await page.goto(nextPageURL);
} catch (e) {
console.log(e);
console.log("No more pages to scrap");
break;
}
}

fs.writeFile("data.json", JSON.stringify(data), (err) => {
if (err) throw err;
console.log("File Saved");
});

console.log("closing browser");

await browser.close();
Expand Down
12 changes: 8 additions & 4 deletions code/exercise_scraping.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ const SELECTORS = {
DESCRIPTION: ".EventRectangle-styles-description-sDn7J",
IMAGE: ".EventRectangle-styles-picture-SPjDJ",
TAGS: ".MuiChip-label",
URL: ".EventRectangle-styles-viewDetails-PsfIW",
LOAD_MORE: ".loadMoreContainer a",
};

Expand Down Expand Up @@ -41,31 +42,34 @@ async function run() {

const location = await event.$eval(
SELECTORS.LOCATION,
(el) => el.innerText,
(el) => el.innerText
);

const description = await event.$eval(
SELECTORS.DESCRIPTION,
(el) => el.innerText,
(el) => el.innerText
);

const image = await event.$eval(SELECTORS.IMAGE, (el) => el.src);

const tags = await event.$$eval(SELECTORS.TAGS, (tags) =>
tags.map((tag) => tag.innerText),
tags.map((tag) => tag.innerText)
);

const url = await event.$eval(SELECTORS.URL, (el) => el.href);

data.push({
title: title,
date: date,
location: location,
description: description,
image: image,
tags: tags,
url: url,
});
}

fs.writeFile("gdsc_events.json", JSON.stringify(data), (err) => {
fs.writeFile("data.json", JSON.stringify(data), (err) => {
if (err) throw err;
console.log("File saved");
});
Expand Down
2 changes: 1 addition & 1 deletion src/exercise-scraping.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ Congrats! You have successfully completed the web scraping demo. Now it's time t

## Objective

You are the biggest GDSC fan, and you want to attend as many workshops as possible. You are able to go anywhere in the world, so the location is not a problem. For this, you want to scrape the GDSC events from the [GDSC website](https://gdsc.community.dev/events/#/list). For each event, you want to extract the title, date, location, description, image link, and the tags associated with the event.
You are the biggest GDSC fan, and you want to attend as many workshops as possible. You are able to go anywhere in the world, so the location is not a problem. For this, you want to scrape the GDSC events from the [GDSC website](https://gdsc.community.dev/events/#/list). For each event, you want to extract the title, date, location, description, image link, the tags associated with the event, and the URL.

<img src="./images/upcoming_events.png" alt="Go Away" width="900"/>

Expand Down
12 changes: 11 additions & 1 deletion src/web-scraping.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ To begin with, we'll analyze the website, and identify the structure of the page

<img src="./images/olx.png" alt="OLX" width="1000"/>

We can see that the website has a list of announcements for apartments in Cluj-Napoca. Each announcement has a title, a price, a location (always Cluj Napoca), and a surface area. For each unique aspect of the announcement, we will try to identify the corresponding selector.
We can see that the website has a list of announcements for apartments in Cluj-Napoca. Each announcement has a title, a price, a location (always Cluj-Napoca), a surface area, an image, and a URL. For each unique aspect of the announcement, we will try to identify the corresponding selector.

```javascript
import puppeteer from "puppeteer";
Expand All @@ -22,6 +22,8 @@ const SELECTORS = {
TITLE: ".offer-title",
PRICE: ".price",
SURFACE: ".surface",
URL: ".url",
IMAGE: ".image",
};
```

Expand Down Expand Up @@ -78,10 +80,16 @@ for (const announcement of announcements) {
// do nothing
}

const imagePath = await announcement.$eval(SELECTORS.IMAGE, (el) => el.src);

const url = await announcement.$eval(SELECTORS.URL, (el) => el.href);

data.push({
title: title,
price: price,
surface: surface,
image: imagePath,
url: url,
});
}
```
Expand All @@ -100,6 +108,8 @@ const SELECTORS = {
TITLE: ".offer-title",
PRICE: ".price",
SURFACE: ".surface",
URL: ".url",
IMAGE: ".image",
NEXT_PAGE: ".pager .next a",
};
```
Expand Down
1 change: 0 additions & 1 deletion src/web-scraping/constants.md

This file was deleted.

1 change: 0 additions & 1 deletion src/web-scraping/launching-browser.md

This file was deleted.

0 comments on commit d7ea5bf

Please sign in to comment.