# forked from jhofman/self_balancing_bikes
# clean_data.R
library(data.table)
library(reshape)
library(plyr)
library(dplyr)
library(fossil)
# Convert date-time text into POSIXct values.
#   s:      values to parse; passed through as.character() first, so a
#           factor is parsed by its labels rather than its integer codes
#   format: strptime-style layout of s (default "%Y-%m-%d %H:%M:%S")
# No tz argument is given to as.POSIXct(), so its default time zone applies.
parse_datetime <- function(s, format="%Y-%m-%d %H:%M:%S") {
  text <- as.character(s)
  as.POSIXct(text, format=format)
}
########################################
# load and clean trip data from download_trips.sh
########################################
# load each month of the trip data into one big data frame
csvs <- Sys.glob('data/*-tripdata.csv')
# Read every matching file, then bind all of them in a single rbind() call.
# Calling rbind() inside a loop re-copies the accumulated data frame on every
# iteration, which gets slower with each additional file.
# Fields equal to the two characters \N are read as NA.
# If no file matches, trips stays an empty data frame.
trips <- data.frame()
if (length(csvs) > 0) {
  trips <- do.call(rbind,
    lapply(csvs, read.table, header=TRUE, sep=',', na.strings='\\N'))
}
# convert the start and stop time strings into datetimes
trips$starttime <- with(trips, parse_datetime(starttime))
trips$stoptime <- with(trips, parse_datetime(stoptime))
# add a ymd column: the trip's start date with the time of day dropped
# (the start time is formatted as a date string and parsed back)
trips$ymd <- with(trips,
  parse_datetime(strftime(starttime, format="%Y-%m-%d"), "%Y-%m-%d"))
# relabel the gender codes 0/1/2 as Unknown/Male/Female
trips$gender <- with(trips,
  revalue(as.factor(gender), c("0"="Unknown", "1"="Male", "2"="Female")))
# store the data frame so it can be loaded directly in the future
save(trips, file='data/trips.RData')
########################################
# station capacity: read the CSV into a data.table
########################################
stationcap <- data.table(read.csv(file = "data/station_cap.csv"))
# rename station_capacity to station.capacity; station.name keeps its name
setnames(stationcap,
  old = c("station.name", "station_capacity"),
  new = c("station.name", "station.capacity"))
########################################
# load and clean station availability data
########################################
# the gzipped TSV is read without a header row, so column names are given here
availability <- read.delim(file = gzfile("data/availability.tsv.gz"), header = FALSE,
  col.names = c("station.name", "timestamp", "interval", "bikes.available"))
availability <- mutate(availability,
  # NOTE(review): presumably interval holds epoch seconds -- confirm against the data
  interval = as.POSIXct(interval, origin = "1970-01-01"),
  # hour:minute of the interval; used later as a join key against trips
  rounded.interval = strftime(interval, format = "%H:%M"),
  ymd = strftime(interval, format = "%Y-%m-%d"),
  # %u is the weekday number (1 = Monday ... 7 = Sunday); convert it to an
  # integer so the comparison is numeric rather than string-based
  is.weekday = as.integer(strftime(ymd, format = "%u")) < 6L
)
availability <- data.table(availability)
########################################
# add columns to trips data
########################################
trips <- mutate(trips,
  # hour and minute of the trip start, as numbers
  starthour = as.numeric(strftime(starttime, format = "%H")),
  startminutes = as.numeric(strftime(starttime, format = "%M")),
  # start time floored to the 15-minute mark, formatted as "HH:MM"
  rounded.starttime = as.factor(sprintf('%02d:%02d', starthour, round_any(startminutes, 15, f=floor))),
  # same three columns for the trip end
  stophour = as.numeric(strftime(stoptime, format = "%H")),
  stopminutes = as.numeric(strftime(stoptime, format = "%M")),
  rounded.stoptime = as.factor(sprintf('%02d:%02d', stophour, round_any(stopminutes, 15, f=floor))),
  # calendar day of start and stop as "YYYY-MM-DD" strings
  startymd = strftime(starttime, format = "%Y-%m-%d"),
  stopymd = strftime(stoptime, format = "%Y-%m-%d"),
  # distance between start and end station coordinates, from fossil::deg.dist
  distance = deg.dist(start.station.longitude, start.station.latitude, end.station.longitude, end.station.latitude),
  # %u is the weekday number (1 = Monday ... 7 = Sunday); convert it to an
  # integer so the comparison is numeric rather than string-based
  is.weekday = as.integer(strftime(starttime, format = "%u")) < 6L
)
# ymd is dropped here; startymd/stopymd above carry the dates from now on
trips$ymd <- NULL
trips <- data.table(trips)
# merge start station capacity into trips
setkey(stationcap, station.name)
setkey(trips, start.station.name)
# for every row in trips, look up the matching key in stationcap;
# nomatch=NA keeps trips whose start station has no capacity row
trips <- stationcap[trips, nomatch=NA]
# the join column carries stationcap's name; restore the trips name
setnames(trips, "station.name", "start.station.name")
# merge station availability into trips, matching on start station,
# start date and the 15-minute start slot
setkey(availability, station.name, ymd, rounded.interval)
setkey(trips, start.station.name, startymd, rounded.starttime)
trips <- availability[trips, nomatch = NA]
setnames(trips, c("station.name", "ymd", "rounded.interval"), c("start.station.name","startymd","rounded.starttime"))
trips$interval <- NULL
trips$timestamp <- NULL
# Both tables have an is.weekday column. After the join, availability's copy
# keeps the plain name and the trips copy is renamed by data.table:
# "i.is.weekday" in current releases; "is.weekday.1" is the name the original
# script expected. Accept either, so the script does not depend on the
# installed data.table version.
trips_weekday <- intersect(c("i.is.weekday", "is.weekday.1"), names(trips))
stopifnot(length(trips_weekday) == 1)
# drop availability's copy and give the trips copy its original name back
trips$is.weekday <- NULL
setnames(trips, trips_weekday, "is.weekday")
###############################################################
# Finds bikes that were moved outside of a recorded trip      #
# (e.g. transport by Citibike workers, theft, repairs): the   #
# bike's next trip starts at a different station than the one #
# where its previous trip ended. This section builds the      #
# teleportations table with one row per such move.            #
###############################################################
# Sort trips by start time so that, within each bike, lead() returns the
# values of that bike's next trip.
setkey(trips, starttime)
# For every trip, pair where/when it ended with where/when the same bike
# started its next trip.
teleportations <- trips[ , list(
  time.lastknown = stoptime,
  station.disappeared = end.station.name,
  station.reappeared = lead(start.station.name),
  time.reappeared = lead(starttime),
  station.disappeared.latitude = end.station.latitude,
  station.disappeared.longitude = end.station.longitude,
  station.reappeared.latitude = lead(start.station.latitude),
  station.reappeared.longitude = lead(start.station.longitude),
  is.weekday,
  startymd,
  stopymd,
  rounded.time.lastknown = rounded.stoptime,
  rounded.time.reappeared = lead(rounded.starttime),
  hour.lastknown = stophour,
  hour.reappeared = lead(starthour)
  ),
  # Keep only rows where the two stations differ. A bike's last trip has no
  # next trip, so the comparison is NA and the row is dropped. The names are
  # compared as character because `!=` on two factors fails when their level
  # sets differ.
  by = bikeid ][as.character(station.disappeared) != as.character(station.reappeared)]
####################################################
# Finds, for every start station, the closest end  #
# stations (rank of distance <= 3) among the       #
# station pairs that occur in the trip data, and   #
# writes them to stationprox.csv                   #
####################################################
# One row per (start, end) station pair with different names; the distance is
# taken from the first trip of the pair. The names are compared as character
# because `!=` on two factors fails when their level sets differ.
stationprox <- trips[ as.character(start.station.name) != as.character(end.station.name) ,
  list(d = distance[1]),
  by=c("start.station.name", "end.station.name")]
# Rank the end stations of each start station by distance and keep ranks up
# to 3. rank() averages ties, so tied distances can change how many rows stay.
stationprox <- stationprox[ ,
  list(end.station.name, d, rank = rank(d)),
  by=start.station.name][rank <= 3]
# keep only the station pair and its distance, under neutral column names
stationprox <- stationprox[ , list(
  station.1 = start.station.name,
  station.2 = end.station.name,
  d)]
write.csv(stationprox, file="stationprox.csv", row.names = FALSE)
########################################
# write every cleaned table into a single RData file
########################################
save(list = c("trips", "teleportations", "stationcap", "availability", "stationprox"),
  file = "data/clean_citibike.RData")