diff --git a/soils2026/class-5-projects/8-VIC_allProjects/5.modelingTerra0324_30m_notParallel.R b/soils2026/class-5-projects/8-VIC_allProjects/5.modelingTerra0324_30m_notParallel.R new file mode 100644 index 0000000..ca51916 --- /dev/null +++ b/soils2026/class-5-projects/8-VIC_allProjects/5.modelingTerra0324_30m_notParallel.R @@ -0,0 +1,382 @@ +# 8-VIC modeling 10/16/23 + + +# 3/13/24 +# Dave White + + +# load and install packages +required.packages <- c( "caret", "sf", "terra", "randomForest", "doParallel", "aqp", "parallel", "snow", "foreign", "foreach") +new.packages <- required.packages[!(required.packages %in% installed.packages()[,"Package"])] +if(length(new.packages)) install.packages(new.packages) +lapply(required.packages, require, character.only=T) +rm(required.packages, new.packages) + + + +# bring in covariate data and create a stack + +# set working directory to dem covs +setwd("~/data/8-vic/cov30") + +# read in raster file names as a list +rList=list.files(getwd(), pattern="tif$", full.names = FALSE) + +# create a raster stack of dem covs +demStack <- rast(rList) + +# set working directory to spectral covs +setwd("~/data/8-vic/specCov30/clip") + +# read in raster file names as a list +rList=list.files(getwd(), pattern="tif$", full.names = FALSE) + +# create a raster stack of covs +specStack <- rast(rList) + +# combine into one stack +rStack <- c(demStack,specStack) + +rm(specStack, demStack, rList) + +# Bring in training data points +# read in shapefile +# bring in pedon data + +#set working directory to training data. +setwd("~/data/8-vic/data") + +pts <- read_sf("pedons.shp") + +# extract raster covariate values at each training point for the all.pts dataset +pts.sv <- terra::extract(rStack, pts, xy=F, bind=TRUE, na.rm=TRUE) + +# convert SpatVector to sf +all.pts <- sf::st_as_sf(pts.sv) + +names(all.pts) + +# remove unwanted cols +all.pts <- all.pts[-c(1:14)] +colnames(all.pts)[1] <- "class" +names(all.pts) +all.pts$class <- as.factor(all.pts$class) +names(all.pts) +# +levels(all.pts$class) + + +# convert sf to a dataframe +pts.all <- as.data.frame(all.pts)[-c(172)] +names(pts.all) + + +# remove any observations with NAs +comp <- pts.all[complete.cases(pts.all),] + +# convert Class col to a factor +#comp$Class <- as.factor(comp$Class) +is.factor(comp$class) +levels(comp$class) +summary((comp$class)) + +# There are a few classes with < 2 observations remove those classes + +length(levels(comp$class)) + +nlevels(comp$class) + +comp = comp[comp$class %in% names(table(comp$class)) [table(comp$class) >2],] +levels(comp$class) +summary(comp$class) +comp <- droplevels(comp) +levels(comp$class) +summary(comp$class) + +length(levels(comp$class)) +gc() +#--------------------------------------------------------------------------- +# Recursive Feature Selection (RFE) + + +#subsets <- c(1,10,25,50, 75, 100, 150, 200) +subsets <- c(1,10,30,50,70,95,100,110,150,180) + +# set seeds to get reproducible results when running the process in parallel +set.seed(238) +seeds <- vector(mode = "list", length=112) +for(i in 1:111) seeds[[i]] <- sample.int(1000, length(subsets) + 1) +seeds[[112]] <- sample.int(1000, 1) + + +# set up the rfe control +ctrl.RFE <- rfeControl(functions = rfFuncs, + method = "repeatedcv", + number = 10, + repeats = 5, + seeds = seeds, + verbose = FALSE) + +## highlight and run everything from c1 to stopCluster(c1) to run RFE +detectCores()# number of cores +cores <- 123 +cl <- makeCluster(cores) # base R only recognizes 128 cores and about 5 need to be left for OS +registerDoParallel(cl) +set.seed(9) +rfe <- rfe(x = comp[,-c(1)], + y = comp$class, + sizes = subsets, + rfeControl = ctrl.RFE, + allowParallel = TRUE +) +stopCluster(cl) +gc() + +# Look at the results +rfe +plot(rfe) + + + +rfe$fit$confusion +rfe$results +plot(rfe) # default plot is for Accuracy, but it can also be changed to Kappa +plot(rfe, metric="Kappa", main='RFE Kappa') +plot(rfe, metric="Accuracy", main='RFE Accuracy') + +# see list of predictors +predictors(rfe) + +# the rfe function retuns the covariates with the highest accuracy for the rf model +# view the highest accuracy noted by the * +rfe + +# take the accuracy and subtract the accuracySD. look in the results of rf.RFE and find the accuracy that is > or = to this value. this is the number of covariates to use below +#predictors(rfeLand) # top number of covariates + +# look at the top number of covariates that are equal to greater than the accuracy minus the accuracySD +#predictors(rf.RFE)[c(1:9,24:26)] + +# assign this to a variable +a <- predictors(rfe)#[c(1:50)] + + +# subset the covariate stack and the data frame by the selected covariates the variable a is your selected covariates + +# subsed the raser stack to the selected covariates +rsm <- subset(rStack, a) + + +# subset the data frame points with the number of covariates selected +names(comp) +comp.sub <- (comp[,c("class", a)]) + + +names(comp.sub) +is.factor(comp.sub$class) + +# +# determine number of covariates +#length(a) + +subsets <- 161 +#subsets <- c(1,10, 25, 50, 100, 150, 161) +# set seeds to get reproducable results when running the process in parallel +set.seed(12) +seeds <- vector(mode = "list", length=51) +for(i in 1:50) seeds[[i]] <- sample.int(1000, length(1:subsets) + 1) +seeds[[51]] <- sample.int(1000, 1) + + +# set up the train control +fitControl <- trainControl(method = "repeatedcv", + number = 10, + repeats = 5, + p = 0.8, #30% used for test set, 70% used for training set + selectionFunction = 'best', + classProbs = T, + savePredictions = T, + returnResamp = 'final', + search = "random", + seeds = seeds) + +# Random Forest - Parallel process +cl <- makeCluster(cores) +registerDoParallel(cl) +set.seed(48) +rfm = train(x = comp.sub[,-c(1)], + y = comp.sub$class, + "rf", + trControl = fitControl, + ntree = 500, #number of trees default is 500, which seems to work best anyway. + tuneLength=10, + metric = 'Kappa', + na.action=na.pass, + keep.forest=TRUE, # added this line and the next for partial dependence plots + importance=TRUE) +stopCluster(cl) +gc() + + +# Inspect rfFit +rfm$finalModel + + +#look at overall accuracy, kappa, and SD associated with each mtry value +print(rfm) +rfm$results + + +# Convert caret wrapper into randomforest model object for raster prediction +rfm <- rfm$finalModel + + + + + + + +# make predictions + +# set wd to store tiles for prediction - tiles are written to hard drive, make sure there is enough room +setwd("~/data/8-vic/data/tiles/") + + +#numTiles <- 11 # numTiles^2 should <= the number of cores - in this case I have 123 cores available and 11x11 would give 121 tiles +numTiles <- round(sqrt(cores), digits = 0) + +x <- rast(ncol=numTiles, nrow=numTiles, extent=ext(rsm)) + +tl <- makeTiles(rsm, x, overwrite=T, extend=T) + +cl <- parallel::makeCluster(length(tl)) +registerDoParallel(cl) + +pred <- foreach(i = 1:length(tl), + .packages = c("terra", "randomForest")) %dopar% { + pred <- wrap(terra::predict(rast(tl[i]),rfm, na.rm=T)) + return(pred) + } +pred <- do.call(terra::merge,lapply(pred,terra::rast)) +plot(pred) +setwd("~/data/8-vic/results") +# write rasters +writeRaster(pred, overwrite = TRUE, filename = "class.tif", gdal=c("COMPRESS=DEFLATE", "TFW=YES"),datatype='INT1U') +# write raster attribute table +#library(foreign) +write.dbf(levels(pred)[[1]], file='class.tif.vat.dbf') # make sure the first part of the file name is exactly the same as the predicted raster + + + + +# process probability stacks in smaller chunks +setwd("~/data/8-vic/results/prob") +length(tl) + +predProb <- foreach(i = 1:20, #1:lenght(tl) + .packages = c("terra", "randomForest")) %dopar% { + pred <- wrap(terra::predict(rast(tl[i]),rfm, na.rm=T, type="prob")) + return(pred) + } +predProb <- do.call(terra::merge,lapply(predProb,terra::rast)) +writeRaster(predProb, overwrite = TRUE, filename = "prob1.tif", gdal=c("COMPRESS=DEFLATE", "TFW=YES")) +predProb <- foreach(i = 21:40, + .packages = c("terra", "randomForest")) %dopar% { + pred <- wrap(terra::predict(rast(tl[i]),rfm, na.rm=T, type="prob")) + return(pred) + } +predProb <- do.call(terra::merge,lapply(predProb,terra::rast)) +writeRaster(predProb, overwrite = TRUE, filename = "prob2.tif", gdal=c("COMPRESS=DEFLATE", "TFW=YES")) +predProb <- foreach(i = 41:60, + .packages = c("terra", "randomForest")) %dopar% { + pred <- wrap(terra::predict(rast(tl[i]),rfm, na.rm=T, type="prob")) + return(pred) + } +predProb <- do.call(terra::merge,lapply(predProb,terra::rast)) +writeRaster(predProb, overwrite = TRUE, filename = "prob3.tif", gdal=c("COMPRESS=DEFLATE", "TFW=YES")) +predProb <- foreach(i = 61:80, + .packages = c("terra", "randomForest")) %dopar% { + pred <- wrap(terra::predict(rast(tl[i]),rfm, na.rm=T, type="prob")) + return(pred) + } +predProb <- do.call(terra::merge,lapply(predProb,terra::rast)) +writeRaster(predProb, overwrite = TRUE, filename = "prob4.tif", gdal=c("COMPRESS=DEFLATE", "TFW=YES")) +predProb <- foreach(i = 81:100, + .packages = c("terra", "randomForest")) %dopar% { + pred <- wrap(terra::predict(rast(tl[i]),rfm, na.rm=T, type="prob")) + return(pred) + } +predProb <- do.call(terra::merge,lapply(predProb,terra::rast)) +writeRaster(predProb, overwrite = TRUE, filename = "prob5.tif", gdal=c("COMPRESS=DEFLATE", "TFW=YES")) +predProb <- foreach(i = 101:121, + .packages = c("terra", "randomForest")) %dopar% { + pred <- wrap(terra::predict(rast(tl[i]),rfm, na.rm=T, type="prob")) + return(pred) + } +predProb <- do.call(terra::merge,lapply(predProb,terra::rast)) +writeRaster(predProb, overwrite = TRUE, filename = "prob6.tif", gdal=c("COMPRESS=DEFLATE", "TFW=YES")) + +stopCluster(cl) + +rm(predProb) + +gc() + + + +#set directory to prob tiles +setwd("~/data/8-vic/results/prob") + +# read in raster file names as a list +rList=list.files(getwd(), pattern="tif$", full.names = FALSE) + +vrtfile <- paste0(tempfile(), ".vrt") +v <- vrt(rList, vrtfile) + +## shannon entropy keeps failing, maybe write prob rasters to disk then bring in as vrt and calc shan + +cl <- parallel::makeCluster(length(rList)) +registerDoParallel(cl) +shan <- foreach(i = 1:length(rList), + .packages = c("terra", "aqp")) %dopar% { + shan <- wrap(aqp::shannonEntropy(terra::rast(rList[i]))) + return(shan) + } +shan <- do.call(terra::merge,lapply(shan, terra::rast)) +plot(shan) + +gc() + +#normalized shan entropy +b <- length(names(v)) +shanNorm1 <- foreach(i = 1:3, + .packages = c("terra", "aqp")) %dopar% { + shanNorm1 <- wrap(aqp::shannonEntropy(terra::rast(rList[i]), b=b)) + return(shanNorm1) + } +shanNorm1 <- do.call(terra::merge,lapply(shanNorm1, terra::rast)) +shanNorm2 <- foreach(i = 4:6, + .packages = c("terra", "aqp")) %dopar% { + shanNorm2 <- wrap(aqp::shannonEntropy(terra::rast(rList[i]), b=b)) + return(shanNorm2) + } +shanNorm2 <- do.call(terra::merge,lapply(shanNorm2, terra::rast)) + +shanNorm <- merge(shanNorm1, shanNorm2) + + +plot(shanNorm) + +setwd("~/data/8-vic/results") +writeRaster(shan, overwrite = TRUE, filename = "shan.tif", gdal=c("COMPRESS=DEFLATE", "TFW=YES")) +writeRaster(shanNorm, overwrite = TRUE, filename = "shanNorm.tif", gdal=c("COMPRESS=DEFLATE", "TFW=YES")) + +stopCluster(cl) + +# get confusion matrix from model +cm <- confusionMatrix(rfm$predicted, rfm$y) +cm$byClass + + + +save.image(file = "data032624.RData")