# Web scraping: extracting Trustpilot reviews for www.amazon.com with rvest
# Loading required libraries:
# General-purpose data wrangling
#install.packages("tidyverse")
library(tidyverse)
# Parsing of HTML/XML files
#install.packages("rvest")
library(rvest)
# String manipulation
#install.packages("stringr")
library(stringr)
# Verbose regular expressions
#install.packages("rebus")
library(rebus)
# Eases DateTime manipulation
#install.packages("lubridate")
library(lubridate)
#url <- "https://boqspecialist.com.au"
#boqs_site_xml <- read_html(url)
url_2 <- 'http://www.trustpilot.com/review/www.amazon.com'
test_site_xml <- read_html(url_2)
get_last_page <- function(html){
  pages_data <- html %>%
    # The '.' indicates the class
    html_nodes('.pagination-page') %>%
    # Extract the raw text as a list
    html_text()
  # The second-to-last of the pagination buttons holds the last page number
  pages_data[(length(pages_data) - 1)] %>%
    # Take the raw string
    unname() %>%
    # Convert to a number
    as.numeric()
}
first_page <- read_html(url_2)
latest_page_number <- get_last_page(first_page)
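# A small sketch of how the page count could be used: build the URL of every
# review page. This assumes Trustpilot paginates via a '?page=' query
# parameter, which is not shown elsewhere in this script.
list_of_pages <- str_c(url_2, '?page=', 1:latest_page_number)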
get_reviews <- function(html){
  html %>%
    # The relevant tag
    html_nodes('.review-info__body__text') %>%
    html_text() %>%
    # Trim additional white space
    str_trim() %>%
    # Convert the list into a vector
    unlist()
}
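# Quick check of the extractor above (note: the '.review-info__body__text'
# class must still exist on the live page for this to return anything).
head(get_reviews(first_page))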
get_reviewer_names <- function(html){
  html %>%
    html_nodes('.user-review-name-link') %>%
    html_text() %>%
    str_trim() %>%
    unlist()
}
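# Same sanity check for the reviewer names; an empty result usually means the
# '.user-review-name-link' selector no longer matches the live page.
head(get_reviewer_names(first_page))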
get_review_dates <- function(html){
  status <- html %>%
    html_nodes('time') %>%
    # The status information is this time a tag attribute
    html_attrs() %>%
    # Extract the second element
    map(2) %>%
    unlist()
  dates <- html %>%
    html_nodes('time') %>%
    html_attrs() %>%
    map(1) %>%
    # Parse the string into a datetime object with lubridate
    ymd_hms() %>%
    unlist()
  # Combine the status and the date information to filter one via the other
  return_dates <- tibble(status = status, dates = dates) %>%
    # Only these are actual reviews
    filter(status == 'ndate') %>%
    # Select and convert to vector
    pull(dates) %>%
    # Convert DateTimes to POSIX objects
    as.POSIXct(origin = '1970-01-01 00:00:00')
  # The lengths still occasionally do not line up; in that case the dates are
  # arbitrarily cropped to fit. This can cause small imperfections, but
  # reviews on one page are generally close in time.
  length_reviews <- length(get_reviews(html))
  return_reviews <- if (length(return_dates) > length_reviews){
    return_dates[1:length_reviews]
  } else {
    return_dates
  }
  return_reviews
}
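# Example call: the returned POSIXct vector is cropped (if necessary) to the
# number of reviews found on the same page.
head(get_review_dates(first_page))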
get_star_rating <- function(html){
  # The pattern you look for: the first digit after 'count-'
  pattern <- 'count-' %R% capture(DIGIT)
  ratings <- html %>%
    html_nodes('.rating.summary-rating') %>%
    html_attrs() %>%
    # Apply the pattern match to all attributes
    map(str_match, pattern = pattern) %>%
    # str_match()[1] is the fully matched string; the second entry
    # is the part you extract with the capture in your pattern
    map(2) %>%
    unlist()
  # Leave out the first instance, as it is not part of a review
  ratings[2:length(ratings)]
}
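# Example call; the ratings come back as single characters ('1'-'5'), so
# convert them with as.numeric() before computing with them.
as.numeric(get_star_rating(first_page))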
# The original loop here referenced objects (review_dates, date_time) that are
# never created above; a minimal, runnable reading of its intent is to collect
# the review dates from the first page into a data frame and inspect one entry.
review_dates <- get_review_dates(first_page)
date_time <- data.frame(date_time = review_dates)
print(date_time[3, ])
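# Putting the extractors together: a sketch (not part of the original script)
# that collects one page of reviews into a single tibble. Column names are
# illustrative, and the vector lengths are assumed to line up as handled in
# get_review_dates() above.
get_data_table <- function(html, company_name){
  tibble(reviewer = get_reviewer_names(html),
         date = get_review_dates(html),
         rating = as.numeric(get_star_rating(html)),
         review = get_reviews(html),
         company = company_name)
}
amazon_reviews <- get_data_table(first_page, 'Amazon')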