#' ---
#' title: "Scrap FC Zurich Spectator Data"
#' author: "Lucas Jamar"
#' ---
#+ setup, include=FALSE
knitr::opts_chunk$set(eval = FALSE)
#' Load libraries
library(pbapply)
library(data.table)
library(stringr)
library(xml2)
library(rvest)
library(parsedate)
#' Read each season's schedule page, which lists that season's matches
seasons <- data.table(start = 2016:2020)
seasons[, end := start - floor((start + 1)/100)*100 + 1]
seasons[, season := paste0(start, "-", end)]
seasons[, link := paste0("https://www.fcz.ch/de/profis/spielplan/?season=", 1:5, "#schedule")]
matches <- data.table()
urls <- data.table()
for(link in seasons$link) {
  page <- xml2::read_html(link)
  # Read the schedule table and keep only its first five columns
  # (kick-off, league, match number, pairing, result - see setnames() below)
  match <- rvest::html_table(page)
  match <- data.table::as.data.table(match)
  match <- match[, 1:5]
  match[, link := link]
  matches <- data.table::rbindlist(list(matches, match), fill = TRUE)
  # Collect every hyperlink on the page; the match report links are filtered out later
  url <- rvest::html_nodes(page, "a")
  url <- rvest::html_attr(url, "href")
  url <- data.table::as.data.table(url)
  urls <- data.table::rbindlist(list(urls, url))
}
setnames(matches, c("match_start", "league", "match_number", "match", "results", "link"))
matches[seasons, on = .(link), season := i.season]
matches[, link := NULL]
#' Parse teams, goals, and kickoff dates out of the scraped columns
matches[, home_team := stringr::str_split(match, " – ", simplify = TRUE)[, 1]]
matches[, away_team := stringr::str_split(match, " – ", simplify = TRUE)[, 2]]
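# For example (assumed pairing format), match = "FC Zürich – FC Basel"
# splits into home_team = "FC Zürich" and away_team = "FC Basel"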
matches[, home_goals := stringr::str_extract_all(results, "\\d+", simplify = TRUE)[, 1]]
matches[, away_goals := stringr::str_extract_all(results, "\\d+", simplify = TRUE)[, 2]]
matches[, home_goals_first_half := stringr::str_extract_all(results, "\\d+", simplify = TRUE)[, 3]]
matches[, away_goals_first_half := stringr::str_extract_all(results, "\\d+", simplify = TRUE)[, 4]]
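# Assuming results are formatted like "2:1 (1:0)", the four extracted numbers map to
# home_goals = 2, away_goals = 1, home_goals_first_half = 1, away_goals_first_half = 0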
goals_columns <- c("home_goals",
"away_goals",
"home_goals_first_half",
"away_goals_first_half")
matches[, (goals_columns) := lapply(.SD, as.numeric), .SDcols = goals_columns]
matches[, match_start := parsedate::parse_date(match_start)]
matches[, date := as.Date(match_start)]
#' Get the links to the individual match report pages
setnames(urls, "url", "link")
urls <- urls[link %like% "/de/profis/news/20"]
urls[, link := paste0("https://www.fcz.ch", link)]
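# Only news items whose path contains /de/profis/news/20 (presumably articles
# dated 20xx) are kept, and the relative paths are turned into absolute URLs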
get_match_info <- function(link) {
  # Download a match report page and return all of its text content
  page <- xml2::read_html(link)
  text <- rvest::html_text(page)
  return(text)
}
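# e.g. get_match_info(urls$link[1]) would return the raw text of the first report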
#' Download the text of every match report page (pblapply shows a progress bar)
urls[, text := pblapply(link, get_match_info)]
urls[, index := .I]
urls[, title := stringr::str_split(text, " - ", simplify = TRUE)[, 1]]
urls[, text := stringr::str_squish(text)]
#' The title appears twice in the page text, so keep only the text after its
#' second occurrence (fixed() treats the title as a literal string, not a regex)
urls[, text := stringr::str_split(text, stringr::fixed(title), simplify = TRUE)[, 3]]
#' The match date appears within the first 20 words of the remaining text
urls[, date := stringr::word(text, 1, 20, sep = " ")]
urls[, date := parsedate::parse_date(date)]
urls[, date := as.Date(date)]
#' Keep only the text before the last occurrence of "Zuschauer"
urls[, text := stringr::str_split(text, "Zuschauer")]
urls[, text := tail(head(unlist(text), -1), 1), by = index]
urls[, text := unlist(text)]
#' The last number before that point is the attendance ("Zuschauer") figure
urls[, spectators := stringr::str_extract_all(text, "\\d+")]
urls[, spectators := data.table::last(unlist(spectators)), by = index]
urls[, spectators := as.numeric(spectators)]
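# e.g. assuming the report contains a sentence like "... 10000 Zuschauer ...",
# the last number before the final "Zuschauer" (10000) is taken as the attendance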
urls[, c("index", "text") := NULL]
#' Merge the scraped spectator info onto the matches by match date
matches[urls, on = .(date), names(urls) := mget(names(urls))]
lubridate::tz(matches$match_start) <- "Europe/Berlin"
#' Write to file
fwrite(matches, "raw_data/fc_zurich.csv")