company_scraper.py
from bs4 import BeautifulSoup
import requests
import pprint

user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
# Request headers sent with every page fetch; the Cookie and Cf-Ray values are
# session-specific and will likely need to be refreshed before the script is reused.
HEADERS = {
    "User-Agent": user_agent,
    "Content-Encoding": "br",
    "Cf-Ray": "865efb10dc77f3a5-BOM",
    "Cookie": "ajs_anonymous_id=63153436-7aaf-4b38-ba88-43580c577f8d; _wellfound=aa63168ca1d3975fa811dd3d4d9d71e5; _gcl_au=1.1.1858267607.1710694766; _hjSession_1444722=eyJpZCI6ImZmMzVjNGI0LTEzOGEtNDVmMy1hMDE4LWViNGJiMjZiZDM4MyIsImMiOjE3MTA2OTQ3NjU1NTcsInMiOjEsInIiOjAsInNiIjowLCJzciI6MCwic2UiOjAsImZzIjoxfQ==; _fbp=fb.1.1710694765588.9572783; _hjSessionUser_1444722=eyJpZCI6ImJjYjBhYjNkLWE1YzgtNTA5ZC05NGNmLWE1Y2I4MGQ4NjJjMCIsImNyZWF0ZWQiOjE3MTA2OTQ3NjU1NTcsImV4aXN0aW5nIjp0cnVlfQ==; _ga=GA1.1.661819419.1710694766; datadome=g_PKAobcHT6hxUQ91f7M0Rgb5v76GTj8s26TQ3AUr0C61YPjD40KiZr7fMC4Ra~B_tCqLAKNBKcZl04Q97d6wHKxJBE4iSo_3eLlFm8ADvQB8cygqD_EpeGR9DP2Ffiy; _ga_705F94181H=GS1.1.1710698960.2.1.1710698997.23.0.0",
}
def fetch_jobs(location, role, page):
    """Scrape one Wellfound results page and return {company: [{'title', 'link'}, ...]}."""
    # The role parameter is accepted for future filtering but is not used in the URL yet.
    if page == 0:
        URL = f"https://wellfound.com/location/{location}"
    else:
        URL = f"https://wellfound.com/location/{location}?page={page}"
    print(URL)
    doc = requests.get(URL, headers=HEADERS)
    if not doc.ok:
        print("Error fetching URL")
        return None
    soup = BeautifulSoup(doc.content, "html.parser")
    jobs_dict = {}
    job_listings = soup.find_all("div", class_="styles_result__rPRNG")
    # Each result card holds the company name plus a linked job title.
    for job in job_listings:
        name_tag = job.find("h4", class_="styles_name__rSxBl")
        job_anchor = job.find("a", class_="styles_jobTitle___jT4l")
        if name_tag is None or job_anchor is None:
            continue  # skip cards that don't match the expected layout
        company_name = name_tag.text
        job_title = job_anchor.text
        upd_job_link = f"https://wellfound.com{job_anchor['href']}"
        # Map each company name to the list of its job titles and links.
        if company_name in jobs_dict:
            jobs_dict[company_name].append({"title": job_title, "link": upd_job_link})
        else:
            jobs_dict[company_name] = [{"title": job_title, "link": upd_job_link}]
    pprint.pprint(jobs_dict)
    return jobs_dict
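
# A minimal export sketch: one way the per-page dicts returned by fetch_jobs() could be
# flattened and written to CSV with pandas. The function name, the all_jobs argument
# shape, and the default filename are illustrative assumptions, not part of the scraper.
def export_jobs_to_csv(all_jobs, path="wellfound_jobs.csv"):
    import pandas as pd  # imported locally so the sketch stays self-contained

    # Flatten {company: [{'title': ..., 'link': ...}, ...]} into one row per posting.
    rows = [
        {"company": company, "title": posting["title"], "link": posting["link"]}
        for company, postings in all_jobs.items()
        for posting in postings
    ]
    pd.DataFrame(rows).to_csv(path, index=False)
    print(f"Wrote {len(rows)} postings to {path}")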
role=""
location = input("Enter location: ")
max_pages = 40
i = 0
good_page_count = 0
error_count = []
while i < max_pages:
if(max_pages > 250):
break
if(len(error_count) > 10):
break
jobs = fetch_jobs(location, role, i)
i += 1
good_page_count += 1
if not jobs:
good_page_count -= 1
max_pages += 1
error_count.append(i)
else:
error_count = []
print(f"Page {i} done")
print(f"Good page count: {good_page_count}")