-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcorrelationReduction_rFE.R
243 lines (169 loc) · 7.73 KB
/
correlationReduction_rFE.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
#12/2/20 correlation reduction, rfe
# Load and install packages
required.packages <- c("caret", "corrplot", "rgdal", "raster", "doParallel", "psych", "maptools", "dplyr", "sp", "snow", "snowfall", "plyr", "randomForest", "fmsb")
new.packages <- required.packages[!(required.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
lapply(required.packages, require, character.only=T)
rm(required.packages, new.packages)
# Load Raster Data
# set working directory
setwd("C:/cov")
# read in raster layer names
rList=list.files(getwd(), pattern="tif$", full.names = FALSE)
rList
#create raster stack of covariates
rStack <- stack(rList)
names(rStack)
# create a dataframe
# for the following data reduction techniques, it is necessary to convert the raster stack into a data frame. This may cause problems with large raster stacks. my recommendation is to subset the raster stack by creating a regular sample of points.
# If you need to create a regular sample due to a large set of covariates, that could be completed as follows:
df <- na.omit(as.data.frame(sampleRegular(rStack, na.rm=T, xy=F, size = 100))) #(length(rStack$calseddry)/2))))
plot(rStack$nd3t1dry)
length(rStack$calseddry)
gc()
#df <- na.omit(as.data.frame(rStack, na.rm=T, xy=F))
# filtering by correlation
# a correlation matrix is determined and covariates with high degree of correlation are returned
# create correlation matrix
corMat <- cor(df)
# visually examine the correlation matrix
corrplot(corMat, method = "circle")
# find high degree of correlation, cutoff is the threshold to set. If cutoff = 0.75 then covariates that are >= 75% correlated are removed
highCorr <- findCorrelation(corMat, cutoff = 0.85)
# remove covariates with high degree of correlation
df <- if(length(highCorr) > 0){
df[, -highCorr]
} else {
df
}
# subset rStack to crate a new raster stack of reduced covariates for use in other applications
r.stack <- subset(rStack, names(df))
## load in shapefile with points, if points are in a different folder change the dsn="." to the directory that it is in
shp.pts <-readOGR(dsn=".", layer="field_data_0206")
# the points and the raster stack need to be in the same projection
r.stack@crs # projection information for raster stack
shp.pts@proj4string # proj info for shapefile
all.equal(r.stack@crs, shp.pts@proj4string) #True - same projection
# if you need to convert the projection of the shape file, it would be done as follows:
#
# shp.pts <- spTransform(shp.pts, CRSobj = r.stack@crs)
# r brings in a points shp file in as a SpatialPointsDataFrame (SPDF) object
# check the names of your SPDF
names(shp.pts)
# we only need the column that we are modeling remove all other columns
# remove columns 1 through 12 from shp.pts and rewrite it to shp.pts
shp.pts <- shp.pts[-c(1:12,14:18)]
# check the names of shp.pts to see if it worked you should only have the column with your modeling class
names(shp.pts)
## Plot to ensure alignment bw points and rasters
plot(r.stack$calsedwet)
plot(shp.pts, add=TRUE)
# this step gets the names of the rasters for use later
# convert raster stack to list of single raster layers
r.stack.list <- unstack(r.stack)
names(r.stack.list) <- names(r.stack)
# this following lines are the extract function this allows you to
# extract the covariate values at each data point collected
## Parallelized extract: (larger datasets)
sfInit(parallel=TRUE, cpus=parallel:::detectCores()-1)
sfLibrary(raster)
sfLibrary(rgdal)
# run parallelized 'extract'
e.df <- sfSapply(r.stack.list, extract, y=shp.pts)
sfStop()
# clean memory
gc()
# now we need to assign the names of the covariates to the extracted values
DF <- as.data.frame(e.df)
names(DF) = tools::file_path_sans_ext(basename(names(r.stack.list)))
names(DF)
# head() looks at the top five rows of a data frame
head(DF)
# create ID field in both dataframe and shp
DF$ID <- seq.int(nrow(DF))
shp.pts$ID <- seq.int(nrow(shp.pts))
# create the training points by merging the extracted covariate values with the shape file
train.pts = merge(shp.pts, DF, by="ID")
#check the names to ensure merge
names(train.pts)
## Save points as a RDS file - this is a R file that contains an R object - save this for use later
#saveRDS(train.pts, file = "points.rds")
# clean junk out of memory
gc()
###################################################################
# this section gets the data ready for covariate reduction, where we select the covariates that are most usefule for modeling
# read in RDS file with points that include the extracted covariate values
#comp <- as.data.frame(readRDS("points.rds"))
comp <- as.data.frame(train.pts)
names(comp)
# remove the ID column and change the first column name to class
comp <- comp[,-c(1,43,42)] # removes the ID column
names(comp) # check the column names to make sure ID is removed
names(comp)[c(1)] <- c('Class') # change name of first col to class
names(comp) # check names of comp
# check the levels (classes) of comp class
levels(comp$Class)
# make sure Class is a factor
comp$Class <- as.factor(comp$Class)
# remove NA values from class
#comp <- comp[complete.cases(comp), ]
##################################################
# Recursive Feature Selection (RFE)
# this section is covariate reduction section
# This process is setup to run as a parallel
# in the make cluster function.
# Next, change the subsets
# to match the number of covariates that you have.
# check the number of covariates
length(comp) # number of covariates plus the class column
subsets <- c(1:(length(comp)-1))
# set seeds to get reporducable results when running the process in parallel
set.seed(238)
seeds <- vector(mode = "list", length=105)
for(i in 1:104) seeds[[i]] <- sample.int(1000, length(subsets) + 1)
seeds[[105]] <- sample.int(1000, 1)
# set up the rfe control
ctrl.RFE <- rfeControl(functions = rfFuncs,
method = "repeatedcv",
number = 15,
repeats = 5,
seeds = seeds,
verbose = FALSE)
## highlight and run everything from c1 to stopCluster(c1) to run RFE
c1 <- makeCluster(detectCores()-1)
registerDoParallel(c1)
set.seed(9)
rf.RFE <- rfe(x = comp[,-1],
y = comp$Class,
sizes = subsets,
rfeControl = ctrl.RFE,
allowParallel = TRUE
)
stopCluster(c1)
gc()
# 4.3 Look at the results
rf.RFE
#confusion matrix
rf.RFE$fit
#plotting rFE
plot(rf.RFE) # default plot is for Accuracy, but it can also be changed to Kappa
plot(rf.RFE, metric="Kappa", main='RFE Kappa')
plot(rf.RFE, metric="Accuracy", main='RFE Accuracy')
# see list of predictors
predictors(rf.RFE)
# the rfe function retuns the covariates with the highest accuracy for the rf model
# view the highest accuracy noted by the *
rf.RFE
# take the accuracy and subtract the accuracySD. look in the results of rf.RFE and find the accuracy that is > or = to this value. this is the number of covariates to use below
predictors(rf.RFE) # top number of covariates
# look at the top number of covariates that are equal to greater than the accuracy minus the accuracySD
predictors(rf.RFE)[1:5]
# assign this to a variable
a <- predictors(rf.RFE)[1:5]
# subset the covariate stack and the data frame by the selected covariates the variable a is your selected covariates
# subsed the raser stack to the selected covariates
r.stack.model <- subset(r.stack, a)
names(r.stack.model)
# subset the data frame points with the number of covariates selected
comp.sub <- (comp[,c("Class", a)])
names(comp.sub)