-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
70 lines (47 loc) · 2.21 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#External imports
from bs4 import BeautifulSoup
import requests
import time
import datetime
import smtplib
#UI External Imports
import pyfiglet
#Internal imports
from exceptions import *
#Typing imports
from typing import Tuple
class Scraper:
"""Scraper class for scraping the product information off an Amazon product page"""
def get_content(self, response: requests.Response) -> BeautifulSoup:
"""Parses and returns a beautified version of the given web page's HTML content"""
soup = BeautifulSoup(response.content, 'html.parser')
page = BeautifulSoup(soup.prettify(), 'html.parser')
return page
def extract_info(self, content: BeautifulSoup) -> Tuple[str, float, datetime.date]:
"""Extracts the product title and price of the product from the HTML"""
title = content.find(id="productTitle").get_text().strip()
whole_price = content.find(class_ = "a-price-whole").get_text()
whole_price = whole_price.split('.')[0].strip()
fraction_price = content.find(class_ = "a-price-fraction").get_text().strip()
final_price = str(whole_price) + '.' + str(fraction_price)
final_price = float(final_price)
today = datetime.date.today()
return (title, final_price, today)
def scrape(self, URL: str) -> Tuple[str, float, datetime.date]:
"""Takes a URL of the product page and returns information about the product"""
try:
page : requests.Response = requests.get(URL)
except:
raise WebsiteNotFound()
content : BeautifulSoup = self.get_content(page) #Extracting the HTML in a 'prettified' format
return self.extract_info(content) #Extracting and returning product info from the HTML
if __name__ == "__main__":
ascii_art = pyfiglet.figlet_format("Amazon Web Scraper", font="slant")
print(ascii_art)
URL = input("Enter URL of the product you want data on: ")
URL = URL.strip()
scraper = Scraper()
name, price, date = scraper.scrape(URL)
print(f"Product name: {name}")
print(f"Product price: {price}")
print(f"Date registered/viewed: {date}")