# main.py

import re
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
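
# Scrapes an OLX.ro category page: first collects every listing link across the
# paginated results, then visits each listing with Selenium to gather tags, location,
# title, type, description, author, phone number, "about us" text and image URLs,
# and finally writes everything to a CSV file.
# Requires: selenium, webdriver-manager, beautifulsoup4, requests, pandas, and a local
# Chrome installation. The CSS class names used below were copied from OLX.ro's markup
# at the time of writing and may need updating if the site changes.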


def scrap_link(at):
    # Collect the links of every listing found on the paginated category page.
    links = []
    page = "?page="
    # Read the number of result pages from the last pagination link; default to 1.
    try:
        r = requests.get(at + page + "1")
        soup = BeautifulSoup(r.content, 'html.parser')
        last_page_href = soup.find_all("a", "block br3 brc8 large tdnone lheight24")[-1]["href"]
        nb_pages = int(re.match(r'.*?([0-9]+)$', last_page_href).group(1))
    except Exception:
        nb_pages = 1
    for j in range(1, nb_pages + 1):
        try:
            r = requests.get(at + page + str(j))
            soup = BeautifulSoup(r.content, 'html.parser')
            for i in soup.find_all("h3", "lheight22 margintop5"):
                links.append(i.find("a")["href"])
            print("Scraped page " + str(j))
        except Exception:
            print("Break at: " + str(j))
            break
    # Remove duplicates while preserving order
    links = list(dict.fromkeys(links))
    return links
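
# Example (the category URL below is the same one used in main()):
#   scrap_link("https://www.olx.ro/servicii-afaceri-colaborari/contabilitate-traduceri/iasi_39939/")
# returns a de-duplicated list of listing URLs gathered from every result page.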


def open_browser(at):
    # Start Chrome (Selenium 4 style: the driver path goes through a Service object),
    # open the page and accept the cookie banner.
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    driver.get(at)
    driver.implicitly_wait(15)
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
    )
    element.click()
    return driver


def go_to(driver, at):
    driver.get(at)


def show_number(driver):
    # Check for the "show phone number" button and click it if it is present.
    try:
        element = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CLASS_NAME, "css-1af0qjn-BaseStyles"))
        )
        element.click()
        return True
    except Exception:
        return False


def scrap_data(links):
    # Open a Chrome browser lazily on the first link and reuse it afterwards.
    driver = None
    rows = []
    first_run = True
    # Scrape every listing
    for i in links:
        try:
            if first_run:
                driver = open_browser(i)
                first_run = False
            else:
                go_to(driver, i)
            driver.implicitly_wait(15)
            number_shown = show_number(driver)
            sleep(1)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            # Tags
            tags = soup.find_all("li", "css-7dfllt")[2].text
            # Location
            tmp = soup.find("div", "css-1nrl4q4")
            loc = tmp.find("p", "css-7xdcwc-Text eu5v0x0").text + " " + tmp.find("p", "css-xl6fe0-Text eu5v0x0").text
            # Title
            title = soup.find("h1", "css-r9zjja-Text eu5v0x0").text
            # Type
            listing_type = soup.find("p", "css-xl6fe0-Text eu5v0x0").text
            # Description
            desc = soup.find("div", "css-g5mtbi-Text").text
            # Author
            auth = soup.find("h2", "css-u8mbra-Text eu5v0x0").text
            # Phone number; if it is still masked, restart the browser and retry once
            tel = ""
            if number_shown:
                tel = soup.find("div", "css-r8u9sk").text
                if tel == "xxx xxx xxxArata":
                    driver.quit()
                    driver = open_browser(i)
                    driver.implicitly_wait(15)
                    number_shown = show_number(driver)
                    sleep(1)
                    soup = BeautifulSoup(driver.page_source, 'html.parser')
                    tel = soup.find("div", "css-r8u9sk").text
            # Image URLs from the photo carousel
            img = ""
            try:
                for j in soup.find("div", "swiper-container swiper-container-initialized swiper-container-horizontal").find_all("img"):
                    try:
                        img += j["src"] + "\n"
                    except KeyError:
                        img += j["data-src"] + "\n"
            except Exception:
                pass
            # "About us" section, when present
            try:
                ab = soup.find("div", "css-1oj9129-Text").text
            except Exception:
                ab = ""
            rows.append({"Link": i, "Tags": tags, "Loc": loc, "Title": title,
                         "Type": listing_type, "Desc": desc, "Author": auth,
                         "Tel": tel, "About us": ab, "Images": img})
            print("Scraped " + title + " from " + loc + " by " + auth)
        # Show if a problem occurred
        except Exception as e:
            print(e)
            print("Problem: " + str(i) + "\n")
    if driver is not None:
        driver.quit()
    df = pd.DataFrame(rows, columns=["Link", "Tags", "Loc", "Title", "Type",
                                     "Desc", "Author", "Tel", "About us", "Images"])
    # Flag promoted listings, then strip the marker from the Link column
    df["Promoted"] = df["Link"].str.contains("promoted")
    df["Link"] = df["Link"].str.replace(";promoted", "", regex=False)
    return df


def save_file(df, at):
    # Save the CSV; the file is named after the last path segment of the URL
    # (this assumes the category URL ends with a trailing slash).
    df.to_csv(at.split("/")[-2] + ".csv", index=False)


def main():
    at = "https://www.olx.ro/servicii-afaceri-colaborari/contabilitate-traduceri/iasi_39939/"
    links = scrap_link(at)
    df = scrap_data(links)
    save_file(df, at)


if __name__ == "__main__":
    main()
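
# Usage sketch (assumes Chrome plus the dependencies listed at the top are installed):
#   python main.py
# With the hard-coded category URL above, the output is written to iasi_39939.csv.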