-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrichScrapper.py
121 lines (96 loc) · 3.84 KB
/
richScrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Web scraper for the Forbes real-time billionaires list: https://www.forbes.com/real-time-billionaires
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd
import time
import winsound
def scroll_until_find_class(driver, class_name, pause=2):
    """Scroll the page down until an element with ``class_name`` shows up.

    The window is scrolled to the bottom of the document, then the function
    sleeps for ``pause`` seconds and looks for the element.  This repeats
    until the element is found or the document height stops growing.

    Args:
        driver: Selenium WebDriver whose current page is scrolled.
        class_name: CSS class name to look for (matched via ``By.CLASS_NAME``).
        pause: Seconds to sleep after each scroll so lazily loaded content
            has time to appear.  Defaults to the original 2 seconds.

    Returns:
        True if at least one matching element was found, False if the bottom
        of the page was reached first.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to the bottom of the document.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Give the page time to load whatever the scroll triggered.
        time.sleep(pause)
        # find_elements returns a (possibly empty) list, so truthiness is enough.
        if driver.find_elements(By.CLASS_NAME, class_name):
            print(f"Found the element with class {class_name}")
            return True
        # An unchanged height means the scroll loaded nothing new.
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            print("Reached the bottom of the page but did not find the element")
            return False
        last_height = new_height
def scrollToEnd(driver):
    """Scroll the ``scrolly-table`` element to its bottom until it stops growing.

    The element is looked up once by class name.  Its ``scrollTop`` is then
    repeatedly set to its ``scrollHeight``, with a 2 second sleep after each
    jump, until two consecutive ``scrollHeight`` readings are equal.

    Args:
        driver: Selenium WebDriver with the target page already loaded.
    """
    read_height_js = "return arguments[0].scrollHeight"
    jump_to_bottom_js = "arguments[0].scrollTop = arguments[0].scrollHeight"

    table = driver.find_element(By.CLASS_NAME, "scrolly-table")
    previous_height = driver.execute_script(read_height_js, table)

    reached_end = False
    while not reached_end:
        # Jump to the current bottom of the table's own scroll area.
        driver.execute_script(jump_to_bottom_js, table)
        # Let any newly requested rows render before measuring again.
        time.sleep(2)
        current_height = driver.execute_script(read_height_js, table)
        # No growth since the last pass means there is nothing left to load.
        reached_end = current_height == previous_height
        previous_height = current_height
# Chrome headless: to run without a visible browser window, build the driver
# from these options instead of the plain webdriver.Chrome() call below.
# chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument("--headless")
# chrome_options.add_argument("--disable-gpu")
# chrome_options.add_argument("--no-sandbox")
# chrome_options.add_argument("--disable-dev-shm-usage")
# driver = webdriver.Chrome(options=chrome_options)

# Initialize the WebDriver
driver = webdriver.Chrome()
try:
    # Load the webpage
    driver.get('https://www.forbes.com/real-time-billionaires')
    # This does not pause here: it sets the timeout that later element
    # lookups are allowed to keep retrying for before giving up.
    driver.implicitly_wait(10)
    print("Page loaded")

    scroll_until_find_class(driver, "scrolly-table")
    print("Found the table")

    # Scroll to the end of the table to load all content
    scrollToEnd(driver)
    print("Scrolled to the end of the table")

    # Get the page source after scrolling
    content = driver.page_source
finally:
    # Always close the browser, even when loading or scrolling fails, so no
    # Chrome process is left behind.
    driver.quit()

soup = BeautifulSoup(content, features="html.parser")


def _cell_text(row, cell_class, inner_tag):
    """Return the stripped text of ``inner_tag`` inside the row's ``cell_class`` cell.

    Returns None when the row has no such ``td`` or the cell has no such tag.
    """
    cell = row.find('td', attrs={'class': cell_class})
    if cell is None:
        return None
    inner = cell.find(inner_tag)
    if inner is None:
        return None
    return inner.text.strip()


# Lists to hold the scraped data
names = []
ranks = []
netWorths = []
sources = []

# Read the table one row at a time so the four columns stay aligned: a cell
# missing from one row becomes an empty value in that row instead of shifting
# every later value of that column up by one.
for row in soup.find_all('tr'):
    name = _cell_text(row, 'name', 'a')
    if name is None:
        # Not a data row (no name cell with a link), e.g. a header row.
        continue
    names.append(name)
    ranks.append(_cell_text(row, 'rank', 'span'))
    netWorths.append(_cell_text(row, 'Net Worth', 'span'))
    sources.append(_cell_text(row, 'source', 'span'))

# Audible "scrape finished" notification.  It is best-effort: a playback
# failure must not stop the scraped data from being saved below.
try:
    winsound.PlaySound('C:/Users/munaa/Downloads/mixkit-confirmation-tone-2867.wav', winsound.SND_FILENAME)
except RuntimeError as err:
    print(f"Could not play the notification sound: {err}")

# Create a DataFrame from the scraped data
df = pd.DataFrame({
    'Rank': ranks,
    'Name': names,
    'Net Worth': netWorths,
    'Source': sources
})

# Save the DataFrame to a CSV file
df.to_csv("Rich_List.csv", index=False, encoding='utf-8')

# df.info() prints its summary itself and returns None, so it is not wrapped
# in print() (which would emit an extra "None" line).
df.info()