scrape.py
import csv
import glob
import json
import logging
import os
import time
from collections import namedtuple

import pandas as pd
import requests

logging.basicConfig(filename='scrape.log', level=logging.DEBUG)

def get_json(year, page):
    """
    Return the json data for the given year and results page.
    """
    url = "http://www.bmw-berlin-marathon.com/files/addons/scc_events_data/ajax.results.php"
    params = {'t': 'BM_{}'.format(year), 'ci': 'MAL', 'page': str(page)}
    response = requests.get(url, params=params)
    logging.debug('fetched year {0} page {1}'.format(year, page))
    response.raise_for_status()
    time.sleep(1)  # rate-limit: pause briefly between requests
    return response.json()
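
# A sketch of the json payload shape this script assumes. The field names are
# inferred from how get_metadata and get_rows use them below; the live API is
# not documented here, so treat this as an assumption, not a spec:
#
#     {"page": "1",
#      "total": "80",        # number of pages
#      "records": "39234",   # number of result rows
#      "rows": [{"cell": [id, place, bib, ..., clock_time]}, ...]}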

Meta = namedtuple('Meta', ('page', 'n_pages', 'n_rows'))


def get_metadata(j):
    """
    Return metadata about the records for the given year.
    """
    page = int(j.get('page', 1))
    n_pages = int(j.get('total', 1))
    n_rows = int(j.get('records', 0))
    return Meta(page, n_pages, n_rows)
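
# With illustrative (hypothetical) values:
#
#     get_metadata({'page': '3', 'total': '80', 'records': '39234'})
#     # -> Meta(page=3, n_pages=80, n_rows=39234)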

Row = namedtuple('Row', (
    'id',
    'place',
    'bib',
    'surname',
    'forename',
    'team',
    'nationality',
    'yob',
    'sex',
    'age_class',
    'age_class_place',
    'net_time',
    'clock_time',
))


def get_rows(j):
    """
    Return a generator of rows.
    """
    for entry in j.get('rows', []):
        cell = entry.get('cell', [])
        yield Row(*cell)


def get_data(j):
    return get_metadata(j), get_rows(j)


def get_jsons(year):
    """
    Return a generator of all json pages for the given year.
    """
    j = get_json(year, 1)
    yield j
    meta = get_metadata(j)
    for page in range(2, meta.n_pages + 1):
        try:
            j = get_json(year, page)
        except Exception as e:
            logging.exception(e)
            # Log the failed page so it can be re-fetched later.
            logging.critical('retry:{0}:{1}'.format(year, page))
        else:
            yield j


def make_csv(json_glob, csv_filename, mode='w'):
    """
    Combine the saved json pages matching json_glob into one csv file.
    """
    # Get the filenames
    fns = glob.glob(json_glob)
    appending = 'a' in mode
    with open(csv_filename, mode) as cf:
        writer = csv.writer(cf)
        # Only write headers if not appending to existing
        if not appending:
            writer.writerow(('year',) + Row._fields)
        # Write all data to csv
        for fn in fns:
            year = os.path.basename(fn).split('-')[0]
            with open(fn) as f:
                j = json.load(f)
            for row in get_rows(j):
                writer.writerow((year,) + row)
    # Sort the data so that order is consistent
    df = pd.read_csv(csv_filename).sort_values(['year', 'id']).reset_index(drop=True)
    if appending:
        kwargs = dict()
    else:
        kwargs = dict(index=True, index_label='idx')
    df.to_csv(csv_filename, **kwargs)
    return df
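
# Example usage (hypothetical filenames): build a csv from saved pages, then
# append another year's pages without rewriting the header row:
#
#     make_csv('data/2015-*.json', 'results.csv')
#     make_csv('data/2016-*.json', 'results.csv', mode='a')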


def main(years, directory='data'):
    """
    Get json data from the Berlin marathon API and save each page to its own
    json file; make_csv then combines the saved files into csv format.
    """
    os.makedirs(directory, exist_ok=True)  # ensure the output directory exists
    for year in years:
        for j in get_jsons(year):
            try:
                meta, rows = get_data(j)
                jname = os.path.join(directory,
                                     '{0}-{1}.json'.format(year, meta.page))
                with open(jname, 'w') as f:
                    json.dump(j, f)
                logging.debug('Success: year {0} page {1}'.format(year,
                                                                  meta.page))
            except Exception as e:
                logging.exception(e)


if __name__ == '__main__':
    years = range(2005, 2017)
    main(years, directory='data')
    make_csv('data/*.json', 'berlin_marathon_times_dirty.csv')
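
# The resulting csv can then be loaded for analysis; for example (column
# names come from Row._fields plus the 'year' column written above):
#
#     df = pd.read_csv('berlin_marathon_times_dirty.csv')
#     df.groupby('year')['net_time'].count()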