-
Notifications
You must be signed in to change notification settings - Fork 1
/
tournamentCrawler.py
75 lines (57 loc) · 2.07 KB
/
tournamentCrawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 13 17:12:51 2014
@author: Gaspard, Thomas, Arnaud
"""
import re
from utils import *
# Ordered list of record field names.
# NOTE(review): the code that consumes this list is not part of this block;
# confirm with its callers before renaming or reordering entries.
tournaments_fields = [
    'Year',
    'RoundNumber',
    'WinnerScores',
    'LoserScores',
    'TieBreakScores',
    'Tournament',
    'TournamentPrize',
    'Surface',
    'Draw',
    'Country',
    'IDTournament',
    'TournamentStart',
    'TournamentEnd',
    'Retirement',
    'Timestamp',
    # Single-letter keys -- presumably crawler query parameters; TODO confirm.
    't',
    'y',
    'r',
    'p',
]
def getTournamentHTML(e, y):
    """Fetch the Event-Draws page for event ``e`` and year ``y``.

    Both values are converted with ``str()`` and placed in the ``e`` and ``y``
    query parameters of the atpworldtour.com Event-Draws URL. The URL is
    passed to ``getHTML`` and whatever that helper returns is returned
    unchanged (presumably the page text -- confirm against ``utils``).
    """
    base = 'http://www.atpworldtour.com/Share/Event-Draws.aspx?e='
    url = ''.join((base, str(e), '&y=', str(y)))
    return getHTML(url)
def _first_match(pattern, content, field):
    """Return the first ``re.findall`` result of ``pattern`` in ``content``.

    Raises IndexError -- the same type a bare ``[0]`` lookup on an empty
    result list raises -- with a message naming the missing ``field``.
    """
    hits = re.findall(pattern, content)
    if not hits:
        raise IndexError("no %r field found in tournament page" % field)
    return hits[0]


def parseTournamentInfos(content, infos):
    """Extract tournament details from an Event-Draws page.

    Parameters:
        content: text of the page to search.
        infos: dict of already-known values; it is copied, never mutated.

    Returns:
        A copy of ``infos`` updated with the keys 'Tournament',
        'TournamentPrize' (commas removed), 'Surface', 'Draw', 'Country',
        'TournamentStart' and 'TournamentEnd'. All values are the raw
        strings captured from the page.

    Raises:
        IndexError: if any of the fields cannot be found in ``content``.
    """
    # Raw strings: the patterns used to contain "\/", which is not a valid
    # string escape and triggers warnings on recent Python versions.
    draw = _first_match(r"Draw: </span>([0-9]+)</p>", content, 'Draw')
    # Non-greedy captures stop at the first closing </p>, so several <p>
    # elements on one line no longer bleed into the captured value.
    surface = _first_match(r"Surface: </span>(.*?)</p>", content, 'Surface')
    prizeMoney = _first_match(
        r"Prize Money: </span>(.*?)</p>", content, 'TournamentPrize')
    prizeMoney = prizeMoney.replace(",", "")

    # The title is rendered either as a link or as a plain <span>.
    tournament = re.findall(
        r'<a class="tournamentTitle".*>(.*)</a></h3>', content)
    if not tournament:
        tournament = re.findall(
            r'<span class="tournamentTitle"><strong>(.*)</strong></span></h3>',
            content)

    # Subtitle layout: "<country> - <start>-<end>".
    country, tournamentStart, tournamentEnd = _first_match(
        r'<p class="tournamentSubTitle">(.*) - (.*)-(.*)</p>',
        content, 'tournamentSubTitle')

    if not tournament:
        raise IndexError("no 'Tournament' title found in tournament page")

    res = infos.copy()
    res.update({
        'Tournament': tournament[0],
        'TournamentPrize': prizeMoney,
        'Surface': surface,
        'Draw': draw,
        'Country': country,
        'TournamentStart': tournamentStart,
        'TournamentEnd': tournamentEnd,
    })
    return res
def getTournamentInfos(e, y, infos):
    """Download and parse the page for event ``e`` in year ``y``.

    The page is fetched with ``getTournamentHTML`` and parsed with
    ``parseTournamentInfos`` using ``infos`` as the base mapping. The result
    then gets the keys 'e' and 'y' set to the given arguments and is
    returned.
    """
    page = getTournamentHTML(e, y)
    details = parseTournamentInfos(page, infos)
    details.update({'e': e, 'y': y})
    return details
def getAllTournamentInfos(dico):
    """Fetch one tournament page; return its player matches and details.

    ``dico`` must provide the keys 'e' and 'y', which select the page to
    download. Returns a 2-tuple:

    * the list of strings captured between ``players/`` and ``.asp`` in the
      page (greedy, per line);
    * the dict produced by ``parseTournamentInfos`` with ``dico`` as base.
    """
    page = getTournamentHTML(dico['e'], dico['y'])
    # Raw string: same pattern text as before, without invalid str escapes.
    player_refs = re.findall(r'players\/(.*)\.asp', page)
    details = parseTournamentInfos(page, dico)
    return (player_refs, details)