-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrapper.py
36 lines (30 loc) · 1.22 KB
/
scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import requests as _requests
import bs4 as _bs4
import constants as _constants
def _create_url(tag: str = "love") -> str:
return f"https://www.goodreads.com/quotes/tag/{tag}"
def get_page(url: str, timeout: float = 10.0) -> _bs4.BeautifulSoup:
    """Fetch *url* and return its parsed HTML.

    Args:
        url: Page to download.
        timeout: Seconds to wait for the HTTP response before giving up.
            Added so a stalled connection cannot hang the scraper forever;
            defaults keep existing callers unchanged.

    Returns:
        A BeautifulSoup tree built from the response body with the
        ``html.parser`` backend.

    Raises:
        Exception: If the server responds with any status other than 200.
    """
    # A browser-like User-Agent — Goodreads may reject requests without one.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    page = _requests.get(url, headers=headers, timeout=timeout)
    if page.status_code != 200:
        raise Exception(f"Failed to fetch page: {url}, Status Code: {page.status_code}")
    soup = _bs4.BeautifulSoup(page.content, "html.parser")
    return soup
def _extract_quote_and_author(quote):
quote_text=quote.contents[0].text.strip()
author=quote.find(class_="authorOrTitle").text.strip()
return quote_text, author
def scrape_quotes():
    """Scrape quotes for every tag in ``_constants.TAGS``.

    For each tag, downloads the Goodreads tag page and extracts every
    ``.quoteText`` element into a dict with ``quote``, ``author`` and
    ``tag`` keys.

    Returns:
        list[dict]: One record per quote, across all configured tags.
    """
    collection = []  # was list(); literal is the idiomatic form
    for tag in _constants.TAGS:
        soup = get_page(_create_url(tag))
        for quote in soup.find_all(class_="quoteText"):
            quote_text, author = _extract_quote_and_author(quote)
            # Plain dict literal — dict({...}) just copied the literal needlessly.
            collection.append({"quote": quote_text, "author": author, "tag": tag})
    return collection
if __name__ == "__main__":
    # Guard the network-heavy scrape so importing this module has no side effects.
    print(scrape_quotes())