# RED WINE QUALITY MACHINE LEARNING PROJECT
# Damiano Pincolini
# Loading libraries
library(readr)
library(tidyverse)
library(moments)
library(GGally)
library(caret)
library(MASS)
library(stats)
library(rpart.plot) # for decision tree plots
library("xgboost") # for X Gradient Boosting model
# Loading Data
# IMPORTANT NOTE: since it is not possible to download a Kaggle dataset without
# providing account credentials, which cannot be shared with other graders,
# the following code simply loads the dataset into R from my laptop, where it has previously
# been saved.
# This link (https://www.kaggle.com/datasets/uciml/red-wine-quality-cortez-et-al-2009/code?datasetId=4458&language=R)
# leads to the Kaggle page where the dataset I have used can be downloaded
# (provided you log in).
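# Optional guard (a sketch, not part of the original workflow): fail early
# with a clear message if the CSV has not been downloaded yet.
if (!file.exists("wineQualityReds.csv")) {
  stop("Please download wineQualityReds.csv from the Kaggle link above first.")
}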
wines <- read_csv("wineQualityReds.csv")
colnames(wines)[1] <- "wineId"
# 1. EXPLORATORY DATA ANALYSIS
head(wines)
colnames(wines)
str(wines)
summary(wines)
# Univariate analysis of the output (quality): the main goal here is to understand how
# the outcome I want to predict behaves.
# Tukey's five-number summary
summary(wines$quality)
# Distribution graphic analysis
ggplot(wines, aes(quality))+
geom_bar(width=0.75)+
scale_x_continuous(n.breaks=10,limits=c(0,10))+
coord_cartesian(xlim=c(0,10))+
labs(x = "quality (grades)",
y = "number of grades",
title ="Grade frequency")+
theme_bw()
ggplot(wines, aes(quality))+
geom_boxplot(outlier.colour="red", orientation="y")+
labs(x = "quality (grades)",
title ="Grade frequency")+
theme_bw()
# Skewness and kurtosis
skewness(wines$quality)
# A slight positive skewness (0.217) is visible.
kurtosis(wines$quality)
# With a value of 3.29, the distribution is slightly leptokurtic, but this does not seem to
# be a major issue.
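# Sanity check (a sketch): the moments package computes skewness as m3 / m2^(3/2) and
# kurtosis as m4 / m2^2, where m_k is the k-th central moment; recomputing them by hand
# should match the two calls above.
x <- wines$quality
m2 <- mean((x - mean(x))^2)
m3 <- mean((x - mean(x))^3)
m4 <- mean((x - mean(x))^4)
m3 / m2^(3/2)   # matches skewness(wines$quality)
m4 / m2^2       # matches kurtosis(wines$quality)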
# Normality tests: let's run some tests on the distribution of the quality variable, using
# the moments package.
jarque.test(wines$quality)
# In the Jarque-Bera normality test the null hypothesis is that the variable's skewness and
# kurtosis match those of a normal distribution, while the alternative hypothesis is that
# they do not.
# Since the p-value is 0.0001062, which is less than 0.05, we reject the null hypothesis
# of normality.
agostino.test(wines$quality)
# The D'Agostino skewness test returns a p-value of 0.0004182, which is lower than 0.05, so we
# reject the null hypothesis and conclude that the distribution is skewed.
anscombe.test(wines$quality)
# The Anscombe-Glynn kurtosis test returns a p-value of 0.028, lower than 0.05, which leads me
# to reject the null hypothesis and conclude that the kurtosis differs from that of a normal
# distribution.
# It seems we are facing a non-normal distribution of red wine quality grades.
# Is there some issue with outliers?
# As we have already seen, the first quartile corresponds to grade 5 and the third quartile
# corresponds to grade 6.
# So the IQR is simply 1.
# The limit for lower outliers is
as.vector(summary(wines$quality)[2] - 1.5*IQR(wines$quality))
# The limit for higher outliers is
as.vector(summary(wines$quality)[5] + 1.5*IQR(wines$quality))
# It appears that the minimum and the maximum grades (3 and 8 respectively) would have to be
# considered outliers.
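# A quick count (a sketch): how many observations actually fall outside the Tukey fences
# computed above.
lowerFence <- quantile(wines$quality, 0.25) - 1.5 * IQR(wines$quality)
upperFence <- quantile(wines$quality, 0.75) + 1.5 * IQR(wines$quality)
sum(wines$quality < lowerFence | wines$quality > upperFence)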
# According to the dataset documentation, each wine's rating is based on a sensory evaluation,
# namely the median of at least 3 expert evaluations.
# Given this, I don't want to lose the information carried by the grades 3 and 8: the
# evaluation process seems fairly solid, since it takes into account more than a single grade
# (at least 3), which I consider a good reason to rule out the presence of misleading
# information.
# Correlation: I want to detect whether there are specific correlations between quality and
# the other features, and between the features themselves.
# First of all, I need to drop the wineId column, which is a simple progressive counter and
# not a feature relevant to wine quality prediction.
wines2 <- wines %>%
dplyr::select(2:13)
ggpairs(wines2, title="Correlation analysis", proportions = "auto")+
theme_bw()
# 1. The features most correlated with quality (negatively and positively, respectively)
# are volatile acidity (-0.391) and alcohol (0.476).
# 2. Volatile acidity appears to be fairly strongly negatively correlated with citric acid (-0.552).
# 3. Alcohol is fairly strongly negatively correlated with density (-0.496).
# According to the above, I will focus on volatile acidity and alcohol (the two most
# important features, since they are the ones most directly correlated with quality).
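# Numeric cross-check (a sketch): each feature's Pearson correlation with quality,
# ranked by absolute strength, to confirm the two picks above.
corQuality <- cor(wines2)[, "quality"]
sort(abs(corQuality[names(corQuality) != "quality"]), decreasing = TRUE)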
# Dataset reduction and analysis of the selected features: the aim here is to focus on the
# features that seem to have a fairly good correlation with quality, thus managing the
# "curse of dimensionality".
# I rescale the dataset values via square root transformation
wines2sqrt <- wines2 %>%
  transmute(volatile.aciditysqrt = sqrt(volatile.acidity),
            alcoholsqrt = sqrt(alcohol),
            qualitysqrt = sqrt(quality))
# I check how the variables behave after the transformation:
# Volatile acidity
wines2sqrt %>% ggplot(aes(volatile.aciditysqrt))+
geom_boxplot()
skewness(wines2sqrt$volatile.aciditysqrt)
kurtosis(wines2sqrt$volatile.aciditysqrt)
# Alcohol
wines2sqrt %>% ggplot(aes(alcoholsqrt))+
geom_boxplot()
skewness(wines2sqrt$alcoholsqrt)
kurtosis(wines2sqrt$alcoholsqrt)
# Quality
wines2sqrt %>% ggplot(aes(qualitysqrt))+
geom_boxplot()
skewness(wines2sqrt$qualitysqrt)
kurtosis(wines2sqrt$qualitysqrt)
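# Compact overview (a sketch): shape statistics for all three transformed variables
# in a single call.
sapply(wines2sqrt, function(v) c(skewness = skewness(v), kurtosis = kurtosis(v)))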
cor.test(x=wines2$volatile.acidity, y=wines2$quality, method="pearson")
cor.test(x=wines2$alcohol, y=wines2$quality, method="pearson")
# After the square root transformation I can detect the following:
# 1. quality skewness has been significantly reduced (from 0.217 to -0.05),
# 2. quality kurtosis has slightly increased (from 3.29 to 3.58),
# 3. the correlation between quality and both alcohol and volatile acidity has not changed
# (0.476 and -0.391 respectively).
# 2. MODELING
# Pre-processing: the data have already been transformed and the number of features has
# already been reduced. There is no need for further pre-processing.
# Data partitioning
# The starting point of the modeling activity is the dataset called wines2sqrt,
# which contains both the outcome (quality) and the two features (alcohol and volatile acidity).
# I want to split the dataset into the following two objects:
# 1. wineQuality, a vector containing the quality marks.
# 2. wineFeatures, a matrix of the two numeric features.
wineQuality <- wines2sqrt$qualitysqrt
wineFeatures <- wines2sqrt %>%
dplyr::select(c(1:2)) %>%
as.matrix()
# After setting the seed to 1, I proceed with the data partition, splitting both the
# wineQuality vector and the wineFeatures matrix into a 20% test set and an 80% train set.
set.seed(1, sample.kind = "Rounding")
test_index <- createDataPartition(wineQuality, times = 1, p = 0.2, list = FALSE)
wineFeatureTest <- wineFeatures[test_index,]
wineQualityTest <- wineQuality[test_index]
wineFeatureTrain <- wineFeatures[-test_index,]
wineQualityTrain <- wineQuality[-test_index]
# I check that the training and test wineQuality datasets have similar proportions of marks.
mean(wineQuality)
mean(wineQualityTest)
mean(wineQualityTrain)
# Everything seems fine: the average (transformed) mark is 2.37 in the original, train and
# test sets alike.
densityplot(wineQualityTest)
densityplot(wineQualityTrain)
wineQualityTest %>%
as_tibble() %>%
group_by(value) %>%
summarize(numMarks=n()) %>%
mutate(percMark=numMarks/sum(numMarks)*100)
wineQualityTrain %>%
as_tibble() %>%
group_by(value) %>%
summarize(numMarks=n()) %>%
mutate(percMark=numMarks/sum(numMarks)*100)
# The distributions of the test and train sets are similar, with highs and lows at the same
# points, as is visible from the two plots above.
# The 2.24 value appears in about 42/44 per cent of cases (test/train respectively); the 2.45
# value in about 39.9 per cent of cases; the 2.65 value in 13.4/12.2 per cent of cases.
# The marginal (transformed) marks (1.73, 2 and 2.83) show slight differences, but we can
# conclude that data partitioning has not produced unbalanced train and test sets.
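# Optional check (a sketch): a two-sample Kolmogorov-Smirnov test between the train and test
# quality values; a large p-value is consistent with both samples coming from the same
# distribution. Note that ties will trigger a warning, since quality takes only a few
# distinct values.
ks.test(wineQualityTrain, wineQualityTest)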
# Training
# I want to try the following algorithms: linear regression, decision tree, random forest,
# XGBoost and k-nearest neighbors (see the note on resampling just below).
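# Note (a sketch, not applied in the runs below): train() defaults to 25 bootstrap
# resamples; a shared control object, e.g. 10-fold cross-validation, could be passed to
# each call via trControl = ctrl to make the tuning directly comparable across the five
# models.
ctrl <- trainControl(method = "cv", number = 10)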
# Linear regression
train_lm <- train(x = wineFeatureTrain,
y = wineQualityTrain,
method = "lm")
train_lm
summary(train_lm)
# Decision Tree
train_d.tree <- train(x = wineFeatureTrain,
y = wineQualityTrain,
method = "rpart")
train_d.tree
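# The rpart.plot package loaded above can be used to visualize the tuned tree; a minimal
# sketch (train_d.tree$finalModel is the underlying rpart object):
rpart.plot(train_d.tree$finalModel)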
# Random forest model
train_rf <- train(x = wineFeatureTrain,
y = wineQualityTrain,
method = "ranger")
train_rf
# XG Boost
train_xg.boost <- train(x = wineFeatureTrain,
y = wineQualityTrain,
method = "xgbTree")
train_xg.boost
# K-Nearest Neighbor
train_knn <- train(x = wineFeatureTrain,
y = wineQualityTrain,
method = "knn")
train_knn
# Testing: prediction and performance metrics
# For each model I will predict outcomes based on the features of the test set and then,
# with the postResample() command, get the most commonly used KPIs (RMSE, R squared and MAE),
# which will be discussed in the following sections of this report.
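# For reference (a sketch of what postResample() computes): RMSE is sqrt(mean((pred - obs)^2)),
# MAE is mean(abs(pred - obs)), and caret's default R squared is the squared correlation
# between predictions and observations.
rmse <- function(pred, obs) sqrt(mean((pred - obs)^2))
mae <- function(pred, obs) mean(abs(pred - obs))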
# Linear regression
hat_lm <- predict(train_lm, wineFeatureTest)
Perf_lm <- postResample(pred=hat_lm, obs=wineQualityTest)
Perf_lm
ggplot(varImp(train_lm))
# Decision Tree
hat_d.tree <- predict(train_d.tree, wineFeatureTest)
Perf_d.tree <- postResample(pred=hat_d.tree, obs=wineQualityTest)
Perf_d.tree
ggplot(varImp(train_d.tree))
# Random forest
hat_rf <- predict(train_rf, wineFeatureTest)
Perf_rf <- postResample(pred=hat_rf, obs=wineQualityTest)
Perf_rf
# XG Boost
hat_xg.boost <- predict(train_xg.boost, wineFeatureTest)
Perf_xg.boost <- postResample(pred=hat_xg.boost, obs=wineQualityTest)
Perf_xg.boost
ggplot(varImp(train_xg.boost))
# K-Nearest Neighbor
hat_knn <- predict(train_knn, wineFeatureTest)
Perf_knn <- postResample(pred=hat_knn, obs=wineQualityTest)
Perf_knn
ggplot(varImp(train_knn))
# 3. MODEL PERFORMANCES EVALUATION
# The goal is to gather the performances of the models I have tried in order to find out
# which is the most suitable for predicting wine quality.
Perf_overview <- rbind(Perf_lm, Perf_d.tree, Perf_rf, Perf_xg.boost, Perf_knn)
Perf_overview2 <- Perf_overview %>%
as_tibble %>%
cbind(model = row.names(Perf_overview), .)
Perf_overview2 %>% ggplot(aes(x=RMSE, y=Rsquared, color=model))+
geom_point(size=4)+
labs(x = "RMSE", y = "Rsquared",
title ="MODEL EVALUATION")+
theme_bw()
# As said in the introduction section, I am interested in assessing model performance
# using the RMSE and R squared indices.
# As far as our models are concerned, we can see an inverse relation between RMSE and
# R squared.
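# A quick tabular view (a sketch): models ranked from lowest (best) to highest RMSE.
Perf_overview2 %>% arrange(RMSE)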