-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpfr.py
executable file
·254 lines (217 loc) · 8.07 KB
/
pfr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
import pandas as pd
from pyquery import PyQuery as pq
import requests
from bs4 import BeautifulSoup
import unicodedata
def get_pfr_id_and_data(idx, td, pfr_id_idx):
    """
    Read one table cell, attaching the player id for the id-bearing cell.

    Parameters
    ----------
    idx : int
        Position of ``td`` among the cells of the row being scraped.
    td : tag element
        The cell to read. It must provide ``text_content()`` and ``attrib``.
    pfr_id_idx : int
        Position of the cell within the row that carries the player id.

    Returns
    -------
    str or tuple
        The cell text when ``idx`` differs from ``pfr_id_idx``. Otherwise a
        ``(text, player_id)`` tuple, where ``player_id`` is the value of the
        cell's ``data-append-csv`` attribute (``None`` when it is absent).
    """
    cell_text = td.text_content()
    if idx != pfr_id_idx:
        return cell_text
    # The id-bearing cell holds the player name as text and the id as an
    # attribute, so both are handed back together.
    return cell_text, td.attrib.get("data-append-csv")
def get_college_stats_link(idx, td, col_stat_idx):
    """
    Help get the college stats link as you scrape the data.

    Parameters
    ----------
    idx : int
        The index of the td elements we are enumerating when scraping through
        a row.
    td : tag element
        The td element that contains the college stat link.
    col_stat_idx : int
        The index of the td element with the table row (tr) that contains the
        college stats link.

    Returns
    -------
    str or None
        ``None`` when ``idx`` is not ``col_stat_idx``. Otherwise the ``href``
        of the first ``a`` element found in ``td``, or an empty string when
        the cell holds no such element.
    """
    if idx != col_stat_idx:
        # Not the college stats cell: nothing to extract.
        return None
    try:
        return td.find('a').get('href')
    except AttributeError:
        # find() came back with None (no anchor in the cell), so calling
        # .get() on it fails; report the missing link as an empty string.
        # Only this failure is caught so unrelated errors are not hidden.
        return ''
def get_pfr_id_and_college_stats(idx, td, pfr_id_idx, col_stat_idx):
    """
    Read one table cell, dispatching on which special column it belongs to.

    Parameters
    ----------
    idx : int
        Position of ``td`` among the cells of the row being scraped.
    td : tag element
        The cell to read.
    pfr_id_idx : int
        Position of the cell within the row that carries the player id.
    col_stat_idx : int
        Position of the cell within the row that carries the college stats
        link.

    Returns
    -------
    object
        Whatever ``get_pfr_id_and_data`` returns for the player id cell,
        whatever ``get_college_stats_link`` returns for the college stats
        cell, and the plain cell text for every other cell. The player id
        check runs first, so it wins if both indexes are equal.
    """
    # Player id cell takes precedence over the college stats cell.
    if idx == pfr_id_idx:
        return get_pfr_id_and_data(idx, td, pfr_id_idx)
    if idx == col_stat_idx:
        return get_college_stats_link(idx, td, col_stat_idx)
    # Any other cell contributes only its text.
    return td.text_content()
def create_pq(url, timeout=30):
    """
    Create PyQuery object used for scraping pro-football-reference.

    Parameters
    ----------
    url : str
        Url used to create the PyQuery object.
    timeout : float or tuple, optional
        Timeout in seconds forwarded to ``requests.get``. Defaults to 30.

    Returns
    -------
    pq_obj : PyQuery object
        PyQuery object built from the downloaded page, used for scraping
        data.
    """
    # Without a timeout a stalled connection would block the scrape forever.
    response = requests.get(url, timeout=timeout)
    # Drop the HTML comment delimiters so that any markup sitting inside a
    # comment becomes part of the parsed document.
    # NOTE(review): presumably needed because some tables on the site are
    # delivered inside comments -- confirm against a live page.
    html = response.text.replace('<!--', '').replace('-->', '')
    return pq(html)
def get_combine_table(url, row_css_selector, col_css_selector,
                      pfr_id_idx=None, col_stat_idx=None):
    """
    Scrape the combine data table and return it as a DataFrame.

    Parameters
    ----------
    url : str
        Page to download and parse.
    row_css_selector : str
        CSS selector matching the table rows to read.
    col_css_selector : str
        CSS selector matching the header cells; their text becomes the
        column names.
    pfr_id_idx : int, optional
        Position of the cell within each row that carries the player id.
    col_stat_idx : int, optional
        Position of the cell within each row that carries the college stats
        link.

    Returns
    -------
    df : pd.DataFrame
        One row per selected table row. When both indexes are ``None`` every
        cell is plain text; otherwise each cell is produced by
        ``get_pfr_id_and_college_stats``.
    """
    doc = create_pq(url)
    text_only = pfr_id_idx is None and col_stat_idx is None

    records = []
    for tr in doc(row_css_selector):
        # NOTE(review): the comparison against "" presumably keeps only rows
        # that carry no attributes at all -- confirm against lxml's attrib
        # comparison semantics.
        if not (tr.attrib == ""):
            continue
        if text_only:
            cells = [cell.text_content() for cell in tr.iterchildren()]
        else:
            cells = [
                get_pfr_id_and_college_stats(pos, cell, pfr_id_idx,
                                             col_stat_idx)
                for pos, cell in enumerate(tr.iterchildren())
            ]
        records.append(cells)

    column_names = [th.text_content() for th in doc(col_css_selector)]
    return pd.DataFrame(data=records, columns=column_names)
def get_table(url, row_css_selector, col_css_selector,
              pfr_id_idx=None):
    """
    Scrape the data table and return it as a DataFrame.

    Parameters
    ----------
    url : str
        Page to download and parse.
    row_css_selector : str
        CSS selector matching the table rows to read.
    col_css_selector : str
        CSS selector matching the header cells; their text becomes the
        column names.
    pfr_id_idx : int, optional
        Position of the cell within each row that carries the player id.
        When ``None`` every cell is read as plain text.

    Returns
    -------
    df : pd.DataFrame
        One row per selected table row.
    """
    doc = create_pq(url)

    def read_row(tr):
        """Turn one table row into a list of cell values."""
        if pfr_id_idx is None:
            return [cell.text_content() for cell in tr.iterchildren()]
        return [get_pfr_id_and_data(pos, cell, pfr_id_idx)
                for pos, cell in enumerate(tr.iterchildren())]

    # NOTE(review): the comparison against "" presumably keeps only rows that
    # carry no attributes at all -- confirm against lxml's attrib comparison
    # semantics.
    records = [read_row(tr) for tr in doc(row_css_selector)
               if tr.attrib == ""]
    column_names = [th.text_content() for th in doc(col_css_selector)]
    return pd.DataFrame(data=records, columns=column_names)
def get_pfr_player_ids_and_info(url, timeout=30):
    """
    Scrape the player ids from a pro-football-reference player directory.

    The function returns the raw text and link scraped from
    pro-football-reference and the cleaned up columns which contain
    the player names, pfr id, position, and years played (from and to).

    Note that this function parses the page with BeautifulSoup rather than
    PyQuery.

    Parameters
    ----------
    url : str
        Player directory page.
    timeout : float or tuple, optional
        Timeout in seconds forwarded to ``requests.get``. Defaults to 30.

    Returns
    -------
    df : pd.DataFrame
        A DataFrame containing pfr player ids and additional information,
        with columns ``Player``, ``Pfr_ID``, ``Pos``, ``From``, ``To``,
        ``Link`` and ``Text``.
    """
    # Without a timeout a stalled connection would block the scrape forever.
    html = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(html, "lxml")
    players = soup.select("#div_players p")
    data = [[player.find("a").attrs["href"], player.get_text()]
            for player in players]
    df = pd.DataFrame(data, columns=["Link", "Text"])

    # Clean up some of the data to be returned. The patterns are raw strings
    # so the regex escapes are not read as (invalid) string escapes.
    # The id is the last path segment of the link, up to the final dot.
    df["Pfr_ID"] = df.Link.str.extract(r"/.*/.*/(.*)\.", expand=False)
    # The capture group keeps the parenthesised part as its own column, so
    # the split yields name / " (pos) " / years.
    df[["Player", "Pos", "Years"]] = df.Text.str.split(r"( \(.*\) )",
                                                       expand=True)
    # regex=True is spelled out because pandas 2.0 changed the default of
    # str.replace to literal matching, which would leave the parentheses in.
    df["Pos"] = df.Pos.str.replace(r"[()]", "", regex=True).str.strip()
    df[["From", "To"]] = df.Years.str.split("-", expand=True).astype(int)

    # No need to keep Years now that From and To exist.
    reordered_cols = ["Player", "Pfr_ID", "Pos", "From", "To", "Link", "Text"]
    return df[reordered_cols]
def get_college_info(url):
    """
    Scrape college information from a player's bio.

    Looks up the bio paragraph that mentions "College" and reports, for each
    of its child elements after the first, the element text together with
    its ``href`` attribute.

    Parameters
    ----------
    url : str
        The player url to scrape data from.

    Returns
    -------
    info_list : list or None
        A list of ``(text, href)`` tuples, one per child element after the
        first; ``href`` is ``None`` for children without that attribute.
        ``None`` is returned when the page has no matching paragraph.
    """
    matches = create_pq(url)("#meta > div > p:contains(College)")
    if len(matches) == 0:
        # No college paragraph on this page.
        return None

    info_list = []
    # The first child is skipped.
    # NOTE(review): presumably it is the "College" label -- confirm.
    for child in matches[0].getchildren()[1:]:
        info_list.append((child.text_content(), child.attrib.get("href")))
    return info_list
def get_birth_info(url):
    """
    Scrape birthday and location of a player.

    Looks up the bio paragraph that mentions "Born" and collects the cleaned
    text of each of its child elements after the first.

    Parameters
    ----------
    url : str
        The player url to scrape data from.

    Returns
    -------
    info_list : list or None
        The NFKD-normalized, whitespace-stripped text of every child element
        after the first, or ``None`` when the page has no matching
        paragraph.
    """
    matches = create_pq(url)("#meta > div > p:contains(Born)")
    if len(matches) == 0:
        # No birth paragraph on this page.
        return None

    # The first child is skipped.
    # NOTE(review): presumably it is the "Born" label -- confirm.
    children = matches[0].getchildren()[1:]
    # Normalize first, then strip surrounding whitespace.
    return [unicodedata.normalize("NFKD", child.text_content()).strip()
            for child in children]
def get_row_data(pq_obj, row_css_selector):
    """
    Extract the text of every cell in the selected rows.

    Parameters
    ----------
    pq_obj : callable
        Object that, called with a CSS selector, yields the matching row
        elements (the other functions in this file pass a PyQuery object).
    row_css_selector : str
        CSS selector matching the rows to read.

    Returns
    -------
    data : list of list
        One inner list per row, holding the text of each child element of
        that row in document order.
    """
    matrix = []
    for tr in pq_obj(row_css_selector):
        row_values = []
        for cell in tr.iterchildren():
            row_values.append(cell.text_content())
        matrix.append(row_values)
    return matrix