-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathsce_scraper.py
97 lines (77 loc) · 3.66 KB
/
sce_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from bs4 import BeautifulSoup
import json
from selenium import webdriver
import time
class ProjectNode:
    """Lightweight record for one scraped project card."""
    # TODO: Figure out how to add languages/apis used from the images

    def __init__(self, project_name, project_type, project_desc, github_link):
        # Stored under the attribute names the JSON output expects.
        self.title = project_name
        self.project_type = project_type
        self.description = project_desc
        self.link = github_link

    def to_string(self):
        """Return a multi-line, human-readable summary of the project."""
        parts = (
            f"Title: {self.title}",
            f"Type: {self.project_type}",
            f"Description: {self.description}",
            f"GitHub Link: {self.link}",
        )
        return "\n".join(parts)

    def __str__(self):
        """Serialize the node's attributes as a JSON object string."""
        return json.dumps(self.__dict__)
class Scraper:
    """Scrapes JS-rendered pages from the SCE site using Selenium + bs4.

    ``scrape`` must be called first so ``self.doc`` holds the parsed page;
    the ``extract_*`` methods then read from ``self.doc``.
    """

    def __init__(self):
        # Per-field accumulators filled by extract_project_info().
        self.project_titles = []
        self.project_types = []
        self.project_descs = []
        self.links = []

    def scrape(self, url, wait_seconds=20):
        """Render *url* in Firefox and parse the resulting DOM into self.doc.

        "Some elements are dynamically generated by scripts and won't appear
        on your bs4. You'll need to use a different package like requests-html
        or selenium that can render these elements before parsing them."
        https://stackoverflow.com/questions/59090591/beautifulsoup-how-to-show-the-inside-of-a-div-that-wont-show

        Args:
            url: Page to load.
            wait_seconds: Fixed wait for client-side rendering to finish
                (default 20, matching the original behavior).
        """
        print("Scraping...")
        browser = webdriver.Firefox()
        try:
            browser.get(url)
            time.sleep(wait_seconds)
            html = browser.page_source
        finally:
            # Fix: the original never closed the browser, leaking one
            # Firefox process per scrape() call.
            browser.quit()
        self.doc = BeautifulSoup(html, "html.parser")

    def extract_about_info(self) -> str:
        """Return the Discord link found on the about page.

        NOTE(review): raises AttributeError if the expected <h3>/<a> is
        missing (e.g. the page layout changed) — same exception the
        original code surfaced.
        """
        print("Extracting about info...")
        html_about_desc = self.doc.find('h3',
                                        class_='text-xl text-center')
        discord_link = html_about_desc.find('a').get('href')
        return f"Discord Link: {discord_link}"

    def extract_project_info(self) -> list[ProjectNode]:
        """Collect project cards from the projects page as ProjectNodes.

        Returns one node per (title, type, description, link) tuple;
        zip() truncates to the shortest list if the page is uneven.
        """
        print("Extracting project info...")
        # Fix: reset accumulators so calling this twice (e.g. after a
        # re-scrape) does not duplicate or mismatch entries.
        self.project_titles.clear()
        self.project_types.clear()
        self.project_descs.clear()
        self.links.clear()

        html_project_titles = self.doc.find_all('h2',
                                                class_='mb-2 text-2xl font-bold tracking-tight text-gray-900 dark:text-white')
        for titles in html_project_titles:
            self.project_titles.append(titles.find('a').get_text())

        html_project_types = self.doc.find_all('span',
                                               class_='bg-primary-100 text-primary-800 text-xs font-medium inline-flex items-center px-2.5 py-0.5 rounded dark:bg-primary-200 dark:text-primary-800')
        for project_type in html_project_types:
            self.project_types.append(project_type.get_text())

        html_project_descs = self.doc.find_all('p', class_='mb-5 font-light text-gray-500 dark:text-gray-400')
        for project_desc in html_project_descs:
            self.project_descs.append(project_desc.get_text())

        html_links = self.doc.find_all('a',
                                       class_='inline-flex items-center font-medium text-primary-600 dark:text-primary-500 hover:underline')
        for link in html_links:
            self.links.append(link.get('href'))

        nodes = []
        for title, proj_type, desc, link in zip(self.project_titles, self.project_types, self.project_descs, self.links):
            nodes.append(
                ProjectNode(
                    project_name=title,
                    project_type=proj_type,
                    project_desc=desc,
                    github_link=link,
                )
            )
        return nodes
def main():
    """Scrape the SCE projects and about pages and dump JSON output files."""
    scraper = Scraper()

    # Projects page: print each project and dump the full list as JSON.
    scraper.scrape('https://sce.sjsu.edu/projects')
    nodes = scraper.extract_project_info()
    for node in nodes:
        print(node.to_string())
    with open("proj_output.json", "w") as f:
        json.dump([node.__dict__ for node in nodes], f, indent=4)

    # About page: dump the Discord-link string.
    scraper.scrape('https://sce.sjsu.edu/about')
    d = scraper.extract_about_info()
    with open("about_output.json", "w") as f:
        json.dump(d, f)


# Fix: guard the script entry point so importing this module no longer
# launches Firefox and hits the network as a side effect.
if __name__ == "__main__":
    main()