-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathOpenDoorScience_Loops.r
315 lines (275 loc) · 10.9 KB
/
OpenDoorScience_Loops.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
# Preamble ----------------------------------------------------------------
# Loops are are used when we want a command or set of commands repeated
# many times. The most basic kinds of loops in R are "for" and "while"
# loops, and we will focus on their use here. Note that in general these
# loops are not usually the most efficient way of completing a task,
# especially with large datasets. We will also look at a couple useful
# alternatives to loops.
# Libraries ---------------------------------------------------------------
library(tidyverse)
library(ggthemes)
library(sf)
library(lubridate)
library(naturecounts)
library(mgcv)
library(DHARMa)
# Import data -------------------------------------------------------------
dat = nc_data_dl(username = "insert_your_username",
collections = "BBS50-CAN",
info = "science hour",
region = list(statprov = "SK"),
years = c(2011, 2021))
# Data prep ---------------------------------------------------------------
# species list
sp_code = search_species()
# modify the code list so that the crows are aggregated
sp_code = sp_code %>% select(species_id, english_name, scientific_name)
# join species names to detections
dat_j = dat %>%
left_join(sp_code) %>%
# remove columns with all NA
select(where(~!all(is.na(.x))))
# zero fill
dat_f = dat_j %>%
naturecounts::format_zero_fill(extra_species = c("english_name", "scientific_name"),
extra_event = c("SiteCode", "RouteIdentifier",
"latitude", "longitude",
"survey_year", "survey_month",
"survey_day", "TimeCollected",
"CollectorNumber"))
# For loops ---------------------------------------------------------------
# For loops execute a command a set number of times as defined by the user.
# Here's a super basic example:
for (i in 1:5) {
# i is an index variable that will be assigned the values after the "in"
# operator. In the first iteration i = 1, in the second i = 2, etc.
print(i)
# in this case the only command we are executing is to print i
}
# i does not have to start at 1
for (i in 5:10) {
print(i)
}
for (i in -1:4) {
print(i)
}
# it does not even have to be a list of numbers
sp = unique(dat_f$english)
for (i in sp) {
print(i)
}
# the same principle can apply to rows of a dataframe, or elements of a list
# or vector
# when NOT to use a loop:
# on rows of a very large dataframe
# for example, let's concatenate a date column from survey year, month and day
for (i in 1:nrow(dat_f)) {
dat_f$date[i] = make_date(dat_f$survey_year[i], dat_f$survey_month[i], dat_f$survey_day[i])
}
# runs forever
# there are much better vectorized ways of doing this that we already know: dplyr
dat_f = dat_f %>%
mutate(date = make_date(survey_year, survey_month, survey_day))
# more realistic example of where loops might be useful
# What if we wanted to produce an exploratory trend plots for each of a
# select list of species?
# code from my original tidyverse intro
ex = dat_f %>%
filter(english_name %in% c("Baird's Sparrow", "Chestnut-collared Longspur", "Sprague's Pipit")) %>%
group_by(RouteIdentifier, survey_year, english_name) %>%
summarise(TotalCount = sum(ObservationCount)) %>%
group_by(survey_year, english_name) %>%
summarise(mu_count = sum(TotalCount) / n())
# plot
f_trend = ggplot(ex, aes(x = survey_year, y = mu_count, col = english_name)) +
ggthemes::theme_tufte() +
geom_line() +
ylab("Average count per route") +
theme(legend.title = element_blank(),
axis.title.x = element_blank())
f_trend
# but what if we wanted a separate plot for each species without going
# through the trouble of coding each one?
sp = c("Baird's Sparrow", "Chestnut-collared Longspur", "Sprague's Pipit")
for (i in sp) {
f_trend = ggplot(ex %>% filter(english_name == i),
aes(x = survey_year, y = mu_count)) +
ggthemes::theme_tufte() +
geom_line() +
ylab("Average count per route") +
ggtitle(i) +
theme(legend.title = element_blank(),
axis.title.x = element_blank())
print(f_trend)
}
# and of course we could save all of those plots too
for (i in sp) {
f_trend = ggplot(ex %>% filter(english_name == i),
aes(x = survey_year, y = mu_count)) +
ggthemes::theme_tufte() +
geom_line() +
ylab("Average count per route") +
ggtitle(i) +
theme(legend.title = element_blank(),
axis.title.x = element_blank())
ggsave(paste0("figures/bbs_trend_", i, ".pdf"),
f_trend,
height = 4,
width = 6)
}
# another useful application of a for loop is when you have to import/export many
# files at once.
# For example, what if we wanted to write individual dataframes for our three
# grassland SAR as csv
# first, split our dataframe
dat_ls = dat_f %>%
filter(english_name %in% sp) %>%
group_by(english_name) %>%
group_split()
# loop over each element of the list of dataframes and write to csv
for (i in 1:length(dat_ls)) {
write_csv(dat_ls[[i]], paste0("data/bbs/bbs_", sp[i], ".csv"))
# when we refer to an element of a list, we use double square brackets
# when we refer to a row of a dataframe or an element of a vector, we use
# single square brackets
}
# and of course we can do the opposite and import many files from the same
# directory
fnames = list.files(path = "data/bbs",
pattern = ".csv$", recursive = F, full.names = T)
# empty list for the new files
dat_ls = list()
for (i in 1:length(fnames)) {
dat_ls[[i]] = read_csv(fnames[i])
}
dat_ex = bind_rows(dat_ls)
# you could use the same technique to import shapefiles, rasters, or any
# other data type
# what about running models for multiple species from one dataset?
# summarise the dataset at the route level
dat_r = dat_f %>%
filter(english_name %in% c("Baird's Sparrow", "Chestnut-collared Longspur", "Sprague's Pipit")) %>%
group_by(RouteIdentifier, survey_year, english_name) %>%
summarise(TotalCount = sum(ObservationCount)) %>%
group_by(english_name) %>%
group_split()
# empty list for the models
mod_ls = list()
for (i in 1:length(dat_r)) {
mod_ls[[i]] = gam(TotalCount ~ s(survey_year, k = 5), data = dat_r[[i]], family = poisson())
print(summary(mod_ls[[i]]))
plot(mod_ls[[i]], shade = T)
}
# While loops -------------------------------------------------------------
# While loops are slightly different in that they will execute a command
# until a condition is met. They are used in cases where the number of
# iterations is unknown. While loops usually have fewer alternatives to
# their use than for loops. I don't use them very often, but they are good
# to be familiar with.
# this is a silly example that does the same thing as the basic for loops above
i = 0
while (i <= 5) {
# as long as i is less than or equal to 5, the loop will continue to run
print(i)
i = i + 1
# add one to i, or the loop will run forever
}
# randomly draw detections until there's 10 of each grassland SAR
i = 0
# i is the number of detections of each species
while (any(i != 10)) {
dat_s = dat_f %>%
filter(english_name %in% sp,
ObservationCount > 0) %>%
slice_sample(n = 30)
i = dat_s %>%
group_by(english_name) %>%
summarise(n = n()) %>%
pull(n)
}
# works, but more complicated than it has to be
dat_s = dat_f %>%
filter(english_name %in% sp,
ObservationCount > 0) %>%
group_by(english_name) %>%
slice_sample(n = 10)
# group_by followed by slice does the same thing
# but there are other cases where while is difficult to replace. One
# example that was very useful to me was when designing a new monitoring
# program for the Canadian Prairies. I wanted to draw spatially-balanced
# random grids over a large area, but also wanted them to be balanced by
# land cover. I used while to repeat the draw until the grids were a good
# representation of the landscape. If you're interested in that code, shoot
# me an email.
# For loop alternatives ---------------------------------------------------
# The apply family of functions are basically wrappers around loops. Many
# of the above examples of iterating over lists could be accomplished with
# lapply, which is usually how I do it. Let's redo a couple examples.
# species trend plots
ex_ls = ex %>%
group_by(english_name) %>%
group_split()
lapply(ex_ls, function(x) {
ggplot(x, aes(x = survey_year, y = mu_count)) +
ggthemes::theme_tufte() +
geom_line() +
ylab("Average count per route") +
ggtitle(first(x$english_name)) +
theme(legend.title = element_blank(),
axis.title.x = element_blank())
}
)
# bulk read files
fnames = list.files(path = "data/bbs", pattern = ".csv$", recursive = F, full.names = T)
dat_ex = lapply(fnames, read_csv)
dat_ex = bind_rows(dat_ls)
# the purrr library has some neat tools that can also replace loops
# Here's an example of a workflow for modelling a list of species
mod_res = dat_f %>%
# using our list of three SAR
filter(english_name %in% sp) %>%
group_by(RouteIdentifier, survey_year, english_name) %>%
# calcalate total count for each route
summarise(TotalCount = sum(ObservationCount)) %>%
group_by(english_name) %>%
# nest the data frame (dataframe for each species within a larger dataframe)
nest() %>%
# apply model to each nested dataframe using map()
mutate(mod = map(.x = data, .f = function(x) {
gam(TotalCount ~ s(survey_year, k = 5), data = x, family = poisson())
}))
# scaled model residuals from the DHARMa package
mod_res = mod_res %>%
mutate(scaled_resids = map(mod, simulateResiduals, plot = F, re.form = NULL))
# let's look at one type of diagnostic plot - for the dispersion test
lapply(mod_res$scaled_resids, testDispersion)
# all three species show overdispersion - much more digging to be done here
# model predictions
mod_res = mod_res %>%
mutate(fit = map(mod, predict,
newdata = data.frame(survey_year = seq(2011, 2021, 1)),
type = "link", se.fit = T),
fit = map(fit, function(x) {
data.frame(t(bind_rows(x))) %>%
rename(fit = 1, se = 2)
}),
pred = map(.x = data, .f = function(x) {
data.frame(survey_year = seq(min(x$survey_year), max(x$survey_year), 1))
}))
# unnest with just the species name, prediction data, and predictions
mp = mod_res %>%
select(english_name, pred, fit) %>%
unnest() %>%
mutate(ci2.5 = exp(fit - 1.96 * se),
ci97.5 = exp(fit + 1.96 * se),
fit = exp(fit))
# make a nice plot
f_density_mp = ggplot(mp, aes(x = survey_year, y = fit, col = english_name, fill = english_name)) +
theme_tufte() +
geom_line() +
geom_ribbon(aes(ymin = ci2.5, ymax = ci97.5), alpha = 0.25, col = NA) +
ylim(0, 6) +
ylab("Predicted count / route") +
theme(legend.title = element_blank(),
axis.title.x = element_blank())
f_density_mp