-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathOpenDoorScience_tidyverse_intro.r
255 lines (190 loc) · 7.74 KB
/
OpenDoorScience_tidyverse_intro.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
# Into the tidyverse ------------------------------------------------------
# Preamble ----------------------------------------------------------------
# This script provides a brief tutorial on various data wrangling and
# visualization tools available in the tidyverse.
# Load packages -----------------------------------------------------------
library(tidyverse)
library(naturecounts)
# Import data -------------------------------------------------------------
nc_permissions(username = "sample")
dat = nc_data_dl(username = "sample",
collections = "BBS50-CAN",
info = "science hour",
region = list(statprov = "SK"),
years = c(2011, 2021))
# species list
sp_code = search_species()
# modify the code list so that the crows are aggregated
sp_code = sp_code %>%
select(species_id, english_name, scientific_name)
# join species names to detections
dat_j = dat %>%
left_join(sp_code) %>%
# remove columns with all NA
select(where(~!all(is.na(.x))))
# Let's zero-fill the data -----------------------------------------------
dat_f = dat_j %>%
naturecounts::format_zero_fill(extra_species = c("english_name",
"scientific_name"),
extra_event = c("SiteCode",
"RouteIdentifier",
"latitude", "longitude",
"survey_year",
"survey_month",
"survey_day",
"TimeCollected",
"CollectorNumber"))
# Basic data summaries ----------------------------------------------------
# what columns exist in the data?
dat_f %>%
str()
# how many records?
dat_f %>%
nrow()
# how many species?
dat_f %>%
distinct(english_name) %>%
nrow()
# which species?
sp_list = dat_f %>%
distinct(english_name)
# Filter the data ---------------------------------------------------------
# very common is to filter by species
ex = dat_f %>%
filter(english_name == "Chestnut-collared Longspur")
# or by year
ex = dat_f %>%
filter(survey_year == 2015)
# easy to combine filters
ex = dat_f %>%
filter(survey_year == 2015,
english_name == "Chestnut-collared Longspur")
# Selecting columns -------------------------------------------------------
# perhaps we only need a few of the available columns
ex = dat_f %>%
select(SamplingEventIdentifier, english_name, ObservationCount)
# Summarizing data --------------------------------------------------------
# maybe we don't care about the stop-level data and want to summarize at
# route level
ex = dat_f %>%
group_by(RouteIdentifier, survey_year, english_name) %>%
summarise(TotalCount = sum(ObservationCount),
MaxCount = max(ObservationCount))
# Creating/modifying a column ----------------------------------------------
# perhaps we want to put the date information in a single column
ex = dat_f %>%
mutate(survey_date = paste(survey_year, survey_month, survey_day, sep = "-"))
# recall there were two different species codes for American Crow
# we probably want to combine them for analysis
ex = dat_f %>%
mutate(english_name = if_else(species_id %in% c(13900, 38407),
"American Crow",
english_name),
scientific_name = if_else(species_id %in% c(13900, 38407),
"Corvus brachyrhynchos",
scientific_name))
# check that it worked
ex %>%
distinct(english_name) %>%
arrange(english_name)
# good
dat_f %>%
mutate(ObservationCount = as.numeric(ObservationCount))
# Strings! ----------------------------------------------------------------
# many fancy things can be done with character strings, and it tends to get
# very specific to your application, so google/stackexchange is your friend
# let's try a basic application though
# what if we want to filter all the warblers?
# we could supply all the warbler names or codes to the filter function
# but that would be a lot of species to type out
# e.g., dat_f %>%
# filter(english_name %in% c("Black-and-white Warbler,
# "Chestnut-sided Warbler, ....))
# or we could just tell it to search for the string "Warbler"
ex = dat_f %>%
filter(str_detect(english_name, "Warbler"))
# did it work?
ex %>%
distinct(english_name)
# yep
# we also might want to extract a string from a column
# for example, SiteCode contains the stop number at the end of the string
# let's make a new column for stop number
# one way to do this is to take the last two digits of the string
ex = dat_f %>%
mutate(stop = str_sub(SiteCode, start = -2))
# but the problem is stops can have <2 digits
ex %>%
distinct(stop)
# we can do this by taking only the numeric characters in the last two digits
ex = dat_f %>%
mutate(stop = str_sub(SiteCode, start = -2),
stop = str_extract(stop, "[[:digit:]]+"))
# let's make sure there are no letters attached to the stop number
ex %>%
distinct(stop)
# good
# Long vs wide format -----------------------------------------------------
# the data we are working with are in the long format, which is usually how
# you want it for analysis, but let's say you wanted to summarize the data
# in wide format for a table
# let's make a table where each row is a year, each column is a species,
# and each cell contains the count per route
ex = dat_f %>%
# first we summarize the data at the route level
group_by(RouteIdentifier, survey_year, english_name) %>%
summarise(TotalCount = sum(ObservationCount, na.rm = T)) %>%
# now summarize by year
group_by(survey_year, english_name) %>%
summarise(mu_count = sum(TotalCount) / length(unique(RouteIdentifier))) %>%
# now we spread the data
pivot_wider(names_from = "english_name",
values_from = "mu_count")
# what if we want to go back to long format?
ex2 = ex %>%
pivot_longer(cols = c(2:230),
names_to = "english_name",
values_to = "mu_count")
# Let's try a more realistic workflow --------------------------------------
# we want to make a plot showing the trend in average count per route before
# we start our hypothetical analysis
# how many routes have data in each year?
dat_f %>%
group_by(survey_year) %>%
summarise(n_route = length(unique(RouteIdentifier)))
ex = dat_f %>%
# let's filter to a few key grassland SAR
filter(english_name %in% c("Baird's Sparrow", "Chestnut-collared Longspur",
"Sprague's Pipit")) %>%
# summarize counts by route
group_by(RouteIdentifier, survey_year, english_name) %>%
summarise(TotalCount = sum(ObservationCount)) %>%
# summarize for each year
group_by(survey_year, english_name) %>%
summarise(mu_count = sum(TotalCount) / n())
# Plotting ----------------------------------------------------------------
# we're just scratching the surface here
f_trend = ggplot(ex,
aes(x = survey_year,
y = mu_count,
col = english_name)) +
geom_line()
f_trend
# let's make it prettier
f_trend = ggplot(ex,
aes(x = survey_year,
y = mu_count,
col = english_name)) +
# the ggthemes package has some nifty formatting for plots
ggthemes::theme_tufte() +
geom_line() +
# new y axis title
ylab("Average count per route") +
# remove legend and x axis title
theme(legend.title = element_blank(),
axis.title.x = element_blank())
f_trend
ggsave("figures/bbs_trend.pdf",
f_trend,
height = 4,
width = 6)