exampleStatsAnalyses.R
# load up any libraries you might need - you can also select them from the Packages window!
library(tidyr)
library(dplyr)    # for group_by() and the %>% pipe used throughout
library(rstatix)
library(ggpubr)
library(datasets)
library(emmeans)
# some basic R commands
variable <- 1 # use the <- to assign
variable1 <- variable + 1
data(ChickWeight, package = "datasets")
head(ChickWeight, 20)
chickdata.1 <- subset(ChickWeight, Chick == "1")
chickdata.1$Chick <- NULL
# base plot() draws straight to the graphics device, so there's no need to assign its result
plot(chickdata.1$Time, chickdata.1$weight,
     ylab = "Weight", xlab = "Time (Days)")
# what about linear modeling?
scatter.smooth(chickdata.1$Time, chickdata.1$weight,
               ylab = "Weight", xlab = "Time (Days)")
lin <- lm(formula = weight ~ Time, data = chickdata.1)
print(lin)
summary(lin)
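# a quick visual check (a minimal sketch, not part of the original workflow): overlay
# the fitted regression line on the scatter plot with abline()
plot(chickdata.1$Time, chickdata.1$weight,
     ylab = "Weight", xlab = "Time (Days)")
abline(lin, col = "blue")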
#####################
# trying out some correlation (with data not about chicks)
# use the training.csv data
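# (assumption: training.csv lives in your working directory and has columns bais.v and
#  bais.c - adjust the path and names to wherever your data actually is)
training <- read.csv("training.csv")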
# check if there is correlation between the subscales: Pearson correlation
corplot <- ggscatter(training, x = "bais.v", y = "bais.c",
                     add = "reg.line", conf.int = TRUE,
                     cor.coef = TRUE, cor.method = "pearson",
                     xlab = "BAIS V", ylab = "BAIS C")
corplot # print the plot
# we can't just stop here - we need to make sure our data actually fits the test assumptions
# is the data normally distributed?: Shapiro-Wilk Test
shapiro.test(training$bais.v)
shapiro.test(training$bais.c)
# confirm visually with QQ plots - these plot the sample quantiles against the quantiles of a normal distribution
ggqqplot(training$bais.v, ylab = "BAIS V")
ggqqplot(training$bais.c, ylab = "BAIS C")
# if they are NOT normal - we need to use a different test (here, ranked tests such as Spearman or Kendall)
# let's check out the correlation:
res <- cor.test(training$bais.v, training$bais.c,
method = "pearson")
res
# t = 4.319
# df = 14
# p = 0.0007
# correlation coefficient = 0.76
# t(14) = 4.319, p < 0.05
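# if the Shapiro-Wilk tests above had come back significant (i.e. non-normal data), a
# rank-based correlation would be the safer choice - a minimal sketch:
res.spearman <- cor.test(training$bais.v, training$bais.c,
                         method = "spearman")
res.spearman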
# maybe we then want to take an averaged BAIS score for future tests - add this to the dataset
training$bais.agg <- rowMeans(training[c('bais.v', 'bais.c')])
######################
# the t-test and friends
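# (assumption: this section needs a data frame called pitcherror with columns Group and
#  AvgABSError; the file name below is only a placeholder - point it at your own data)
pitcherror <- read.csv("pitcherror.csv")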
# the t-test is used to compare the means of two groups
# one-sample: compare your data to a known standard or theoretical mean
# independent samples: compare two separate groups
# paired samples: compare data from one group at two different time points (the observations are linked/paired)
# (minimal sketches of the one-sample and paired forms follow just below; only the independent-samples test is run on the real data)
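# minimal sketches of the other two flavours, using a tiny made-up data frame purely for
# illustration (only the independent-samples test further below uses the real data):
toy <- data.frame(score = c(5, 7, 6, 8, 9, 6),
                  time = rep(c("pre", "post"), each = 3))
toy %>% t_test(score ~ 1, mu = 7)           # one-sample: compare against a known mean of 7
toy %>% t_test(score ~ time, paired = TRUE) # paired: pre vs post, paired by row order within group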
# let R know about our groups
pitcherror %>%
group_by(Group) %>%
get_summary_stats(AvgABSError, type = "mean_sd")
# what does our data look like?
boxplot_t <- ggboxplot(
pitcherror, x = "Group", y = "AvgABSError",
ylab = "Error (Cents)", xlab = "Group", add = "jitter"
)
boxplot_t
# need to check our assumptions before we go on:
# 1) check for outliers in the data
pitcherror %>%
group_by(Group) %>%
identify_outliers(AvgABSError)
# 2) check normal distribution of the data - Shapiro-Wilk test by groups
pitcherror %>%
group_by(Group) %>%
shapiro_test(AvgABSError)
ggqqplot(pitcherror, x = "AvgABSError", facet.by = "Group")
# 3) check the variance is equal
pitcherror %>% levene_test(AvgABSError ~ Group) # it's not, so we use Welch's t-test
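# note: rstatix::t_test() defaults to var.equal = FALSE, i.e. Welch's t-test, so nothing
# extra is needed below; had Levene's test NOT been significant, the pooled (Student's)
# version would be the classic choice - shown here only for comparison:
pitcherror %>% t_test(AvgABSError ~ Group, var.equal = TRUE)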
# ok cool, so let's actually do the t-test
welch_t <- pitcherror %>%
t_test(AvgABSError ~ Group) %>%
add_significance()
welch_t
# how strong is this effect?
pitcherror %>% cohens_d(AvgABSError ~ Group, var.equal = FALSE)
welch_t <- welch_t %>% add_xy_position(x = "Group")
boxplot_t +
stat_pvalue_manual(welch_t, tip.length = 0) +
labs(subtitle = get_test_label(welch_t, detailed = TRUE))
# t(17.94) = -5.24, p < 0.0001
# what if my data doesn't meet the parametric assumptions?
# Wilcoxon rank-sum test (aka Mann-Whitney U) - a non-parametric alternative; it makes no normality assumption but is generally more conservative
wilcox <- pitcherror %>%
wilcox_test(AvgABSError ~ Group) %>%
add_significance()
wilcox
pitcherror %>% wilcox_effsize(AvgABSError ~ Group)
#####################
# what if I like groups and so I have more than two?
# ANOVA (analysis of variance)
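# (assumption: as before, this section needs a data frame called ratings with columns
#  Rating, Instr, and Gender; the file name below is only a placeholder)
ratings <- read.csv("ratings.csv")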
ratings %>%
group_by(Gender, Instr) %>%
get_summary_stats(Rating, type = "mean_sd")
# focal variable is Instr, moderator variable is Gender
boxplot_a <- ggboxplot(
ratings, x = "Instr", y = "Rating",
color = "Gender", palette = "jco"
)
boxplot_a
# check for outliers - what do we do with them?
ratings %>%
group_by(Instr, Gender) %>%
identify_outliers(Rating)
# try removing the two outlier rows identified above (rows 12 and 35) to see how the results change
ratings.alt <- ratings[-c(12, 35), ]
ratings.alt %>%
group_by(Instr, Gender) %>%
identify_outliers(Rating)
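# an alternative to hard-coding row numbers (a sketch): drop the rows that
# identify_outliers() flags as extreme - switch the filter to is.outlier if you also
# want to drop ordinary outliers; note this removes any duplicate rows matching an outlier
extreme_rows <- ratings %>%
  group_by(Instr, Gender) %>%
  identify_outliers(Rating) %>%
  filter(is.extreme)
ratings.alt2 <- ratings %>% anti_join(extreme_rows, by = colnames(ratings))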
# build a linear model (ANOVA is basically linear modeling in disguise - more on that later)
model <- lm(Rating ~ Instr * Gender, data = ratings)
# check the normality of the model residuals
shapiro_test(residuals(model))
ggqqplot(residuals(model))
# now we check it by group
ratings %>%
group_by(Instr, Gender) %>%
shapiro_test(Rating)
# you can confirm visually if you like (I like)
ggqqplot(ratings, "Rating", ggtheme = theme_bw()) +
facet_grid(Instr ~ Gender)
# Levene's test to check our variances are equal
ratings %>% levene_test(Rating ~ Instr*Gender)
# okay, all good - let's look at those interactions
res.aov <- ratings %>% anova_test(Rating ~ Instr * Gender)
res.aov
# what about if we remove the outliers?
model_alt <- lm(Rating ~ Instr * Gender, data = ratings.alt)
shapiro_test(residuals(model_alt))
ggqqplot(residuals(model_alt))
ratings.alt %>%
group_by(Instr, Gender) %>%
shapiro_test(Rating)
ggqqplot(ratings.alt, "Rating", ggtheme = theme_bw()) +
facet_grid(Instr ~ Gender)
ratings.alt %>% levene_test(Rating ~ Instr*Gender)
res.aov <- ratings.alt %>% anova_test(Rating ~ Instr * Gender)
res.aov
# results are similar - it's up to you to decide. In this case, we'd probably just leave them in
# let's check the simple main effect of our focal variable (Instr) at each level of Gender
model_main <- lm(Rating ~ Gender * Instr, data = ratings)
ratings %>%
group_by(Gender) %>%
anova_test(Rating ~ Instr, error = model_main)
# the effect of Instrument was significant for both males, F(2, 41) = 18.2, p < 0.05, and females, F(2, 41) = 46.2, p < 0.05,
# where principal instrument was voice, strings, or winds
# so now we run pairwise comparisons between instruments, separately for each Gender
pairwise <- ratings %>%
group_by(Gender) %>%
emmeans_test(Rating ~ Instr, p.adjust.method = "bonferroni")
pairwise
# the effect on Rating was significant for all pairwise instrument comparisons, except between wind players and vocalists among the female group
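# as with the t-test earlier, the pairwise comparisons can be drawn onto a boxplot
# (a sketch reusing the same ggpubr helpers as above; here Gender goes on the x-axis so
# the within-gender instrument comparisons sit over the dodged boxes - exact bracket
# placement may need tweaking for your data)
pairwise <- pairwise %>% add_xy_position(x = "Gender")
ggboxplot(ratings, x = "Gender", y = "Rating", color = "Instr", palette = "jco") +
  stat_pvalue_manual(pairwise, tip.length = 0)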