# Caret.R
library(caret)
library(mlbench)
data(Sonar)
View(Sonar)
summary(Sonar)
# Modelling
set.seed(1996)
inTrain <- createDataPartition(y = Sonar$Class,
                               p = 0.75,
                               list = FALSE)
# By default, createDataPartition performs a stratified random split of the data.
str(inTrain)
nrow(inTrain)
# Now to split the data into training and testing
training <- Sonar[inTrain, ]
testing <- Sonar[-inTrain, ]
nrow(testing)
nrow(training)
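
# Optional sanity check (a minimal sketch, not part of the original script):
# a stratified split should preserve the class proportions of the full data set.
prop.table(table(Sonar$Class))     # class proportions in the full data
prop.table(table(training$Class))  # should be nearly identical after the split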
# Fitting a Partial Least Squares (PLS) model
fit <- train(Class ~ .,
             data = training,
             method = "pls",
             preProcess = c("center", "scale"))
fit
plot(fit)
fit$bestTune
fit$finalModel
fit$control
fit.1 <- train(Class ~ .,
               data = training,
               method = "pls",
               tuneLength = 15,
               preProcess = c("center", "scale"))
fit.1
plot(fit.1)
fit.1$bestTune
fit.1$finalModel
"To modify the resampling method, a trainControl function is used. The option method controls
the type of resampling and defaults to "boot".
Another method, "repeatedcv", is used to specify repeated
K-fold cross-validation (and the argument repeats controls the number of repetitions).
K is controlled by the number argument and defaults to 10."
cntrl <- trainControl(method = "repeatedcv", # Repeated K-fold cross-validation
                      repeats = 3)           # Number of repeats (K defaults to 10)
fit.2 <- train(Class ~ .,
               data = training,
               method = "pls",
               tuneLength = 15,
               trControl = cntrl,
               preProcess = c("center", "scale"))
fit.2
plot(fit.2)
fit.2$bestTune
fit.2$finalModel
"The methods for measuring performance. If unspecied, overall accuracy and the Kappa
statistic are computed. For regression models, root mean squared error and R2 are computed.
Here, the function will be altered to estimate the area under the ROC curve, the sensitivity
and specicity"
"Finally, to choose different measures of performance, additional arguments are given to trainControl.
The summaryFunction argument is used to pass in a function that takes the observed and predicted
values and estimate some measure of performance. Two such functions are already included in the
package: defaultSummary and twoClassSummary. The latter will compute measures specific to two-class
problems, such as the area under the ROC curve, the sensitivity and specicity. Since the ROC
curve is based on the predicted class probabilities (which are not computed automatically), another
option is required. The classProbs = TRUE option is used to include these calculations
Lastly, the function will pick the tuning parameters associated with the best results. Since we are
using custom performance measures, the criterion that should be optimized must also be specified.
In the call to train, we can use metric = "ROC" to do this"
cntrl.1 <- trainControl(method = "repeatedcv",
                        repeats = 3,
                        classProbs = TRUE,
                        summaryFunction = twoClassSummary)
fit.3 <- train(Class ~ .,
               data = training,
               method = "pls",
               tuneLength = 15,
               trControl = cntrl.1,
               preProcess = c("center", "scale"),
               metric = "ROC")
fit.3
plot(fit.3) # shows the relationship between the number of PLS components and the resampled estimate of the area under the ROC curve.
attributes(fit.3)
fit.3$finalModel
fit.3$results
# Make new predictions based on testing data
predictions <- predict(fit.3, newdata = testing)
# Construct Confusion Matrix for results
confusion <- confusionMatrix(data = predictions, reference = testing$Class)
confusion
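
# Beyond the confusion matrix, a test-set ROC curve can be estimated from the
# class probabilities. A minimal sketch, assuming the pROC package is installed
# (it is not loaded elsewhere in this script) and treating "M" as the positive
# class:
library(pROC)
test.probs <- predict(fit.3, newdata = testing, type = "prob")
roc.obj <- roc(response = testing$Class,
               predictor = test.probs[, "M"],
               levels = c("R", "M")) # controls first, cases second
auc(roc.obj)  # area under the test-set ROC curve
plot(roc.obj) # draw the ROC curve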
# ------------------------------------------------------------------------------
fitControl <- trainControl(method = "repeatedcv",
                           number = 10,  # 10-fold cross-validation...
                           repeats = 10) # ...repeated 10 times
# Fit a Stochastic Gradient Boosted Tree model
gbmFit <- train(Class ~ .,
                data = training,
                method = "gbm",
                trControl = fitControl,
                verbose = FALSE)
gbmFit
plot(gbmFit)
gbmFit$bestTune
gbmFit$finalModel
"For a gradient boosting machine (GBM) model, there are three main tuning parameters:
number of iterations, i.e. trees, (called n.trees in the gbm function)
complexity of the tree, called interaction.depth
learning rate: how quickly the algorithm adapts, called shrinkage
the minimum number of training set samples in a node to commence splitting (n.minobsinnode)"
# Create an alternate tuning Grid
gbmGrid <- expand.grid(interaction.depth = c(1, 5, 9),
                       n.trees = (1:30) * 50,
                       shrinkage = 0.1,
                       n.minobsinnode = 20)
head(gbmGrid)
set.seed(1996)
gbmFit2 <- train(Class ~ .,
                 data = training,
                 method = "gbm",
                 trControl = fitControl,
                 verbose = FALSE,
                 tuneGrid = gbmGrid)
gbmFit2
plot(gbmFit2)
gbmFit2$finalModel
# Plot the results
trellis.par.set(caretTheme())
plot(gbmFit2)
plot(gbmFit2, metric = "Kappa") # Plot using Kappa as the metric
# Using ROC to Optimise model
fitControl2 <- trainControl(method = "repeatedcv",
                            number = 10,
                            repeats = 10,
                            classProbs = TRUE,
                            summaryFunction = twoClassSummary)
gbmFit3 <- train(Class ~ .,
                 data = training,
                 method = "gbm",
                 trControl = fitControl2,
                 tuneGrid = gbmGrid,
                 metric = "ROC",
                 verbose = FALSE)
gbmFit3
plot(gbmFit3)
# Choosing the final model (using tolerance: within 2% of the best model)
TwoPerc <- tolerance(gbmFit3$results, metric = "ROC",
                     tol = 2, maximize = TRUE) # ROC: larger is better
gbmFit3$results[TwoPerc, ]
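
# A possible next step (a sketch, not part of the original script): refit the
# model at the simpler parameter values selected by tolerance(), using caret's
# update() method for train objects.
gbmFit3.simple <- update(gbmFit3,
                         param = gbmFit3$results[TwoPerc,
                                                 c("shrinkage", "interaction.depth",
                                                   "n.minobsinnode", "n.trees")])
gbmFit3.simple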
# Extracting Predictions and Class Probabilities
gbmpredictions <- predict(gbmFit3, newdata = testing)
gbmpredictions
gbmpredictions.probs <- predict(gbmFit3, newdata = testing, type = "prob")
head(gbmpredictions.probs)
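
# The same confusion-matrix summary as for the PLS model can be produced for the
# boosted tree predictions:
confusionMatrix(data = gbmpredictions, reference = testing$Class)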
# Build SVM Model
set.seed(1996)
svmFit <- train(Class ~ .,
                data = training,
                method = "svmRadial",
                trControl = fitControl2,
                preProcess = c("center", "scale"),
                metric = "ROC",
                tuneLength = 8)
svmFit
plot(svmFit)
svmFit$modelInfo
svmFit$finalModel
# Build a Regularised Discriminant Analysis (RDA) model
set.seed(1996)
rdaFit <- train(Class ~ .,
                data = training,
                method = "rda",
                tuneLength = 4,
                metric = "ROC",
                trControl = fitControl2)
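
# Once several models have been fit with the same resampling scheme, their
# resampled performance can be collected and compared with caret's resamples().
# A minimal sketch (not part of the original script); for a strict comparison the
# same seed should be set before each train() call so the folds match exactly.
resamps <- resamples(list(GBM = gbmFit3, SVM = svmFit, RDA = rdaFit))
summary(resamps)
bwplot(resamps, metric = "ROC") # box-and-whisker comparison of resampled ROC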