-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata-wrangling-solutions.R
174 lines (110 loc) · 4.66 KB
/
data-wrangling-solutions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# Load Packages -----------------------------------------------------------
# Load the tidyverse package
library(tidyverse)
# Import Data -------------------------------------------------------------

# Setup steps performed outside of this script:
#   1. Download the data from https://rfor.us/penguins
#   2. Copy the downloaded file into the RStudio project
#   3. Create a new R script file to hold the import code below
# Read penguins.csv (relative to the working directory) into `penguins`,
# the data frame used by every exercise that follows.
penguins <- read_csv(file = "penguins.csv")
# select() ----------------------------------------------------------------

# Keep just the sex column
penguins |>
  select(sex)

# Keep two columns: island and sex
penguins |>
  select(island, sex)

# Helper function: keep every column whose name contains the letter s
penguins |>
  select(contains("s"))

# Helper function: keep every column whose name starts with the letter b
penguins |>
  select(starts_with("b"))

# Range syntax: keep the columns from island through year
# (i.e. from island to the end of the data frame)
penguins |>
  select(island:year)

# Dropping syntax (-): keep the same columns as above (island to the end),
# this time by removing species
penguins |>
  select(-species)

# Drop the whole range of columns from bill_length_mm through body_mass_g
penguins |>
  select(-(bill_length_mm:body_mass_g))
# mutate() ----------------------------------------------------------------

# 1. Create a new variable with a constant value:
#    observation_station is set to "Palmer" on every row
penguins |>
  mutate(observation_station = "Palmer")

# 2. Create a new variable based on other variables:
#    body_mass_lbs is body_mass_g divided by 453.6 (approx. grams per pound)
penguins |>
  mutate(body_mass_lbs = body_mass_g / 453.6)

# 3. Change an existing variable:
#    bill_length_mm is replaced in the result by its value plus 1
#    (penguins itself is not reassigned, so the original data is untouched)
penguins |>
  mutate(bill_length_mm = bill_length_mm + 1)
# filter() ----------------------------------------------------------------

# Keep only the female penguins
penguins |>
  filter(sex == "female")

# Keep only the penguins that are NOT on Torgersen island
penguins |>
  filter(island != "Torgersen")

# Keep penguins on Torgersen island or Biscoe island,
# combining two conditions with the or logical operator (|)
penguins |>
  filter(island == "Torgersen" | island == "Biscoe")

# Same Torgersen-or-Biscoe filter as above, rewritten with the %in% operator
penguins |>
  filter(island %in% c("Torgersen", "Biscoe"))

# Comparison operator: keep penguins whose flipper length is
# greater than or equal to 193 millimeters
penguins |>
  filter(flipper_length_mm >= 193)

# Drop rows with missing data in flipper_length_mm, two ways.
# First with !is.na():
penguins |>
  filter(!is.na(flipper_length_mm))

# Second with drop_na():
penguins |>
  drop_na(flipper_length_mm)
# summarize() -------------------------------------------------------------

# Weight of the heaviest penguin.
# na.rm = TRUE drops the NAs before taking the maximum.
penguins |>
  summarize(
    max_body_mass = max(body_mass_g, na.rm = TRUE)
  )

# Maximum and minimum penguin weight in the dataset, in a single summary row.
penguins |>
  summarize(
    max_body_mass = max(body_mass_g, na.rm = TRUE),
    min_body_mass = min(body_mass_g, na.rm = TRUE)
  )
# group_by() and summarize() ----------------------------------------------

# Calculate the weight of the heaviest penguin on each island.
# na.rm = TRUE ignores missing body masses.
penguins |>
  group_by(island) |>
  summarize(max_body_mass = max(body_mass_g, na.rm = TRUE))

# Calculate the weight of the heaviest penguin on each island for each year.
# With two grouping variables, summarize() by default only peels off the last
# one, leaving the result grouped by island (and printing a message about it).
# .groups = "drop" returns a plain, ungrouped tibble instead.
penguins |>
  group_by(island, year) |>
  summarize(
    max_body_mass = max(body_mass_g, na.rm = TRUE),
    .groups = "drop"
  )
# arrange() ---------------------------------------------------------------

# Use arrange() to display the penguins data frame in order by body mass
# (ascending, which is arrange()'s default)
penguins |>
  arrange(body_mass_g)

# Now display the penguins data in descending order by body mass,
# by wrapping the variable in desc()
penguins |>
  arrange(desc(body_mass_g))
# Build one pipeline that:
#   1. Keeps only the penguins on Biscoe island
#   2. Drops rows with NA in the body_mass_g or sex variables
#   3. Calculates the average body mass by sex
#   4. Displays the result in descending order by average body mass
# Because step 2 already removes missing body masses, mean() needs no na.rm.
penguins |>
  filter(island == "Biscoe") |>
  drop_na(body_mass_g, sex) |>
  group_by(sex) |>
  summarize(avg_body_mass = mean(body_mass_g)) |>
  arrange(desc(avg_body_mass))
# Create a new data frame -------------------------------------------------

# Same pipeline as the previous exercise (Biscoe penguins, rows with NA in
# body_mass_g or sex dropped, average body mass by sex, sorted descending),
# but the result is stored in penguin_body_mass_by_sex instead of printed.
penguin_body_mass_by_sex <-
  penguins |>
  filter(island == "Biscoe") |>
  drop_na(body_mass_g, sex) |>
  group_by(sex) |>
  summarize(avg_body_mass = mean(body_mass_g)) |>
  arrange(desc(avg_body_mass))