-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathsessionize.py
101 lines (75 loc) · 2.83 KB
/
sessionize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import dateparser
import requests
from bs4 import BeautifulSoup
import twitter_utils
def get(url, timeout=30):
    """Download ``url`` and return its body parsed as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the server, forwarded to
            ``requests.get``. Requests applies no timeout by default, so
            without this a stalled server would block the scraper forever.

    Returns:
        A ``BeautifulSoup`` tree of the response text, built with the
        ``html.parser`` backend.
    """
    res = requests.get(url, timeout=timeout)
    return BeautifulSoup(res.text, 'html.parser')
def find_navy_section(root, label):
    """Locate the ``<h2>`` of the section whose '.text-navy' label matches.

    Each '.text-navy' element under ``root`` is inspected; the first one whose
    last child is a string starting with ``label`` (after stripping
    whitespace) selects its section. The section is the nearest ancestor
    carrying a class that contains 'col-'.

    Args:
        root: Parsed document (or any node) supporting ``select``.
        label: Prefix the label text must start with; matching is
            case-sensitive.

    Returns:
        The result of ``find('h2')`` on the matching section, or ``None`` when
        no label matches.
    """
    def is_column(tag):
        # A section wrapper is any tag with a class containing 'col-'.
        return tag.has_attr('class') and 'col-' in ' '.join(tag['class'])

    for elm in root.select('.text-navy'):
        children = elm.contents
        if not children:
            # Empty element: there is no label text to compare.
            continue
        last = children[-1]
        if not isinstance(last, str):
            # Last child is a nested tag, not text; it cannot be a label.
            continue
        if not last.strip().startswith(label):
            continue
        column = elm.find_parent(is_column)
        if column is None:
            # Label sits outside any column wrapper; keep looking.
            continue
        return column.find('h2')
    return None
def _parse_cfs_datetime(elm):
    """Combine a CfS section's date heading with the time from its label."""
    # The first 13 characters of the '.text-navy' label are dropped;
    # presumably they are the 'CfS closes at' / 'CfS opens at ' prefix,
    # leaving only the time of day.
    time_text = elm.parent.select('.text-navy')[0].string[13:]
    return dateparser.parse(f'{elm.string} {time_text}')


def parse_event(url):
    """Scrape one Sessionize call-for-speakers page into a dict.

    Args:
        url: Address of the page to scrape.

    Returns:
        ``None`` when the page contains a 'Speaker Profile' span, has no
        usable ``<title>``, or has a title that contains 'Log in' or lacks
        '@ Sessionize.com'. Otherwise a dict with 'Conference Name',
        'CFP URL' and 'CFP End Date', plus whichever of 'Location',
        'Conference URL', 'Conference Start Date', 'Conference End Date' and
        'CFP Start Date' the page provides.

    Raises:
        ValueError: If the page has no '.js-closedate' element or no
            'CfS closes at' section.
    """
    root = get(url)
    if root.find('span', string='Speaker Profile'):
        return None

    # A page with no (or an empty) title cannot be a Sessionize event page.
    title_tag = root.find('title')
    title = title_tag.string if title_tag is not None else None
    if not title:
        return None
    if 'Log in' in title:
        return None
    if '@ Sessionize.com' not in title:
        return None

    data = {
        'Conference Name': root.select('.ibox-title h4')[0].string,
        'CFP URL': url,
    }

    elm = find_navy_section(root, 'location')
    if elm:
        data['Location'] = elm.select('.block')[-1].string

    elm = find_navy_section(root, 'website')
    if elm:
        data['Conference URL'] = elm.find('a')['href']

    # Single-day events expose one 'event date'; it is used for both ends.
    elm = find_navy_section(root, 'event date')
    if elm:
        data['Conference Start Date'] = data['Conference End Date'] = dateparser.parse(elm.string).date()

    elm = find_navy_section(root, 'event starts')
    if elm:
        data['Conference Start Date'] = dateparser.parse(elm.string).date()

    elm = find_navy_section(root, 'event ends')
    if elm:
        data['Conference End Date'] = dateparser.parse(elm.string).date()

    # Find the UTC version of the CFP end date.
    # Check the result list before indexing so a missing element raises the
    # descriptive ValueError instead of a bare IndexError.
    close_elms = root.select('.js-closedate')
    if not close_elms:
        raise ValueError(f'js-closedate not found in {url}')
    utc_cfp_end_date = dateparser.parse(close_elms[0]['data-date']).replace(tzinfo=None)
    data['CFP End Date'] = utc_cfp_end_date

    elm = find_navy_section(root, 'CfS closes at')
    if not elm:
        raise ValueError(f'CfS closes at not found in {url}')
    # The displayed close time minus the 'data-date' close time gives the
    # offset between the times shown on the page and the 'data-date' value.
    utc_offset = _parse_cfs_datetime(elm) - utc_cfp_end_date

    elm = find_navy_section(root, 'CfS opens at')
    if elm:
        # Apply the same offset to the displayed opening time.
        data['CFP Start Date'] = (_parse_cfs_datetime(elm) - utc_offset).date()

    return data
def find_events():
    """Yield an event dict for each distinct Sessionize URL found on Twitter.

    URLs come from ``twitter_utils.search_for_url('sessionize.com')``. Each is
    normalised, then skipped if already processed or if it contains '/api/'.
    The remaining URLs go to ``parse_event``; results of ``None`` are dropped.
    """
    visited = set()
    for raw_url in twitter_utils.search_for_url('sessionize.com'):
        # Normalise: drop the query string, lowercase, trim trailing slashes.
        normalized = raw_url.split('?')[0].lower().rstrip('/')
        if normalized in visited or '/api/' in normalized:
            continue
        event = parse_event(normalized)
        # Remember the URL whether or not it yielded an event.
        visited.add(normalized)
        if event is not None:
            yield event
def scrape():
    """Yield every event produced by ``find_events``."""
    yield from find_events()
if __name__ == '__main__':
    # Run as a script: print each scraped event dict on its own line.
    for event in find_events():
        print(event)