forked from dkamm/coinmarketcap-scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.py
130 lines (99 loc) · 4.52 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import argparse
import concurrent.futures
import datetime
import bs4
import numpy as np
import pandas as pd
import requests
import tqdm
def parse_all_response(resp):
    """Parse the coinmarketcap 'all coins' listing page into a DataFrame.

    resp is a requests.Response for the /all/views/all/ page. Returns a
    DataFrame with a 'slug' column followed by one column per table header.
    """
    soup = bs4.BeautifulSoup(resp.text, 'lxml')
    table = soup.find('table')
    # Column names come from each <th>'s id (minus the 'th-' prefix) because
    # those ids are already nicely formatted; the '#' column has no id, so
    # 'th-#' is used as the default before stripping.
    header_cells = table.thead.find_all('th')
    columns = ['slug'] + [cell.get('id', 'th-#')[3:] for cell in header_cells]

    def cell_value(cell):
        """Extract one value from a <td>, preferring numeric data attributes."""
        # Some columns (e.g. price) nest their value inside an inner <a>.
        inner = cell.find('a') or cell
        # Numeric columns duplicate their value in these attributes, which
        # avoids having to strip '$' and ',' from the display text.
        for attr in ['data-usd', 'data-supply']:
            raw = inner.get(attr)
            if raw:
                try:
                    return np.float64(raw)
                except ValueError:
                    return np.nan
        return inner.text

    rows = []
    for row in table.tbody.find_all('tr'):
        # Row ids look like 'id-<slug>'; drop the 'id-' prefix.
        slug = row.get('id')[3:]
        rows.append([slug] + [cell_value(td) for td in row.find_all('td')])
    # NOTE: 'index' carries the same information as '#'; 'name' ends up equal
    # to 'symbol' because its first <a> is the currency symbol; 'slug' is
    # basically the same information as 'name'.
    return pd.DataFrame(columns=columns, data=rows)
def parse_historical_coin_response(resp):
    """Parse one coin's historical-data page into a date-indexed DataFrame.

    resp is a requests.Response for a /currencies/<slug>/historical-data/
    page. Returns None when the page has no historical-data section or no
    rows for the requested period.
    """
    soup = bs4.BeautifulSoup(resp.text, 'lxml')
    section = soup.find(id='historical-data')
    if not section:
        return
    table = section.find('table')
    # The site appends '*' to some column headers; normalize to bare
    # lowercase names with no spaces.
    columns = [th.text.lower().replace(' ', '').rstrip('*')
               for th in table.thead.find_all('th')]

    def cell_value(td):
        """Extract one value from a <td>, preferring the numeric attribute."""
        # Numeric columns duplicate their value in this attribute.
        raw = td.get('data-format-value')
        if raw:
            try:
                return np.float64(raw)
            except ValueError:
                return np.nan
        return td.text

    rows = []
    for row in table.tbody.find_all('tr'):
        # An empty period renders a single placeholder row instead of data.
        if row.td.text == 'No data was found for the selected time period.':
            return
        rows.append([cell_value(td) for td in row.find_all('td')])
    frame = pd.DataFrame(columns=columns, data=rows)
    frame['date'] = pd.to_datetime(frame.date)
    return frame.set_index('date')
def all_url():
    """Return the URL of the coinmarketcap page that lists every coin."""
    return 'https://coinmarketcap.com/all/views/all/'
def historical_coin_url(slug, start, end):
    """Build the historical-data URL for one coin.

    slug is the coinmarketcap identifier; start and end are date-like
    objects supporting strftime, rendered as YYYYMMDD query parameters.
    """
    template = 'https://coinmarketcap.com/currencies/{slug}/historical-data/?start={start}&end={end}'
    return template.format(slug=slug,
                           start=start.strftime('%Y%m%d'),
                           end=end.strftime('%Y%m%d'))
def markets_url(slug):
    """Return the markets-section URL for one coin.

    Currently unused; kept for possible future use.
    """
    template = 'https://coinmarketcap.com/currencies/{slug}/#markets'
    return template.format(slug=slug)
def str_to_date(s):
    """Parse a 'YYYY-MM-DD' string into a datetime (used as an argparse type)."""
    parsed = datetime.datetime.strptime(s, '%Y-%m-%d')
    return parsed
def main():
    """Download and parse historical data for all (or selected) coins.

    Scrapes the coinmarketcap listing page to find coin slugs, fetches each
    coin's historical-data page concurrently, parses the pages in a process
    pool, labels each frame with its slug/symbol, and writes the combined
    result as CSV to --outfile.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--outfile', type=str)
    parser.add_argument('--start', type=str_to_date, default=datetime.date(2013, 4, 28))
    parser.add_argument('--end', type=str_to_date, default=datetime.date.today())
    parser.add_argument('--symbols', type=str, nargs='*')
    args = parser.parse_args()

    all_df = parse_all_response(requests.get(all_url()))
    slugs = all_df.slug.values
    if args.symbols:
        slugs = all_df.loc[all_df.symbol.isin(args.symbols)].slug.values
    # Rows keep their listing order, so symbols stays aligned with slugs.
    symbols = all_df.loc[all_df.slug.isin(slugs)].symbol.values

    urls = [historical_coin_url(x, args.start, args.end) for x in slugs]
    with concurrent.futures.ThreadPoolExecutor() as executor:
        responses = list(tqdm.tqdm(executor.map(requests.get, urls),
                                   desc='downloading historical coin pages',
                                   total=len(urls)))
    with concurrent.futures.ProcessPoolExecutor() as executor:
        historical_coin_dfs = list(tqdm.tqdm(
            executor.map(parse_historical_coin_response, responses),
            desc='parsing historical coin pages',
            total=len(responses)))

    # BUG FIX: the original filtered None parses out of historical_coin_dfs
    # *before* zipping with slugs/symbols, so after the first coin with no
    # data every subsequent frame was labeled with the wrong slug/symbol.
    # Keep all three sequences aligned and drop empty parses pairwise.
    labeled = []
    for slug, symbol, historical_coin_df in zip(slugs, symbols, historical_coin_dfs):
        if historical_coin_df is None:
            continue
        historical_coin_df['slug'] = slug
        historical_coin_df['symbol'] = symbol
        labeled.append(historical_coin_df)
    pd.concat(labeled).to_csv(args.outfile)


if __name__ == '__main__':
    main()