README.Rmd

---
title: "Documentation for running pbj simulations on AWS"
author: "Simon Vandekar"
date: "2/7/2020"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE, eval=TRUE, message=FALSE, warning=FALSE, fig.width=15, fig.height=9)
path = Sys.getenv('PATH')
path = Sys.setenv('PATH'=paste(path, '/home/rstudio/.local/bin', sep=':'))
```

## AWS machine image setup

I use the directions [here](https://jagg19.github.io/2019/08/aws-r/#short-easy) to create an AMI to run Rstudio on.
The `Welcome.R` script in the [NIsim](https://github.com/simonvandekar/NIsim) package has code to setup this machine image with Dropbox access to the files.


## Setup simulations

```{r simconfig}
#devtools::install_github('simonvandekar/pbj', ref='ftest')
#devtools::install_github('simonvandekar/NIsim')
### LIBRARIES ###
library(RNifti)
library(parallel)
library(splines)
library(mmand)
library(fslr)
library(progress)
library(abind)
library(pbj)
library(PDQutils)
library(NIsim)


### LOAD IN DATA FROM DROPBOX ###
dbimagedir = '~/Dropbox (VUMC)/pbj/data/abide/neuroimaging/cpac/alff_cropped'
dbresimagedir = '~/Dropbox (VUMC)/pbj/data/abide/neuroimaging/cpac/alff_cropped_res'
dbdatafile = '~/Dropbox (VUMC)/pbj/data/abide/demographic/n1035_phenotypic_20190509.rds'
  maskfile = '~/Dropbox (VUMC)/pbj/data/abide/neuroimaging/cpac/cropped_n1035_mask.nii.gz'


# load in data and get directories
dat = readRDS(dbdatafile)
dat$imgname = paste(dat$file_id, 'alff.nii.gz', sep='_')
dat$files = file.path(dbimagedir, dat$imgname)


### COMPUTING PARAMETERS ###
computeConfig = list(
  # number of cores to use for computing
  ncores = 32
)


### SIMULATION PARAMETERS ###
simConfig = list(
  # use robust variance estimator?
  robust = TRUE,
  # what transformation to use. Only the first is used
  tranform = c('t', 'edgeworth', 'none'),
  # vector of sample sizes to simulate
  ns = c(200, 400, 800),
  # number of simulations to run
  nsim=500,
  # number of bootstraps
  nboot = 500,
  # number of permutations
  nperm = 0,
  # cluster forming thresholds
  cfts.s = c(0.1, 0.25, 0.4),
  cfts.p = c(0.01, 0.001),
  
  # radius for spheres of signal.
  rs=c(8),
  
  #### MODEL FORMULAS FOR SIMULATIONS ####
  formres = as.formula( paste0(" ~ dx_group + sex + func_mean_fd + ns(age_at_scan, df=10)" )),
  # need age_at_scan in both models for testing nonlinear functions
  form = as.formula(paste0(" ~ sex + func_mean_fd + age_at_scan + fake_covariate1 + scale(fake_covariate1^2) + scale(fake_covariate1^3)" )),
  formred = as.formula(paste0(" ~ sex + func_mean_fd + age_at_scan + fake_covariate1")),
  #  weights for each subject. Can be a character vector
  W = c("func_mean_fd"),
  # where to put residuals
  resdir = dbresimagedir,
  # where to output results
  simdir = '~/temp',
  dat = dat,
  mask = maskfile,
  output = '~/Dropbox (VUMC)/pbj/pbj_ftest/covariance_sim_df2_polynomial_covariate.rdata'
)
# use betas = 0 for global null
# parameters = betas * sd(y)/sd(x).
simConfig$betas = rep(0, length(simConfig$rs))
```


```{r simsetup}
### SETUP THE SIMULATION ANALYSIS ###
# subsets dataset to all people who have the variables
simConfig$dat = simConfig$dat[apply(!is.na(simConfig$dat[ ,c(all.vars(as.formula(simConfig$formres)), simConfig$W)]), 1, all), ]
# Create residualized images
if(class(simConfig$formres)=='formula' | is.character(simConfig$formres)){
  simConfig$dat$rfiles = file.path(simConfig$resdir, basename(simConfig$dat$files))
  if(!all(file.exists(simConfig$dat$rfiles))){
  pbj::residualizeImages(files=simConfig$dat$files, dat=simConfig$dat, mask=simConfig$mask, form=simConfig$formres,
                         outfiles=simConfig$dat$rfiles, mc.cores=computeConfig$ncores)
}
  simConfig$dat$files = simConfig$dat$rfiles
  # clean up. May not be necessary
  gc()
}


simdirs = simSetup(simConfig$dat$files, data=simConfig$dat, outdir=simConfig$simdir, nsim=simConfig$nsim, ns=simConfig$ns, mask=simConfig$mask, rs=simConfig$rs, betas=simConfig$betas )
```

```{r runSims, message=FALSE}
# simfunc should contain a data argument, which is defined within runSim
# Other arguments are identical across simulation runs.
simFunc = function(lmfull, lmred, mask, data, nboot, cfts){
  data$fake_group = factor(ceiling(ppoints(nrow(data))*3 ) )
  data$fake_covariate1 = rnorm(nrow(data))
  data$fake_covariate2 = rnorm(nrow(data))
  statmap = lmPBJ(data$images, form=lmfull, formred=lmred, mask=mask, data=data, transform = 't')
  #k = mmand::shapeKernel(3, 3, type='box')
  #stat = lapply(cfts, function(th) max(c(table(c(mmand::components(stat.statMap(statmap) >th^2*statmap$rdf + statmap$df, k))),0), na.rm=TRUE) )
  #pbj = pbjSEI(statmap, nboot = nboot, cfts.s = cfts)
  #pbj = lapply(pbj[grep('cft', names(pbj))], function(x) x[['boots']])
  return(list(estimates=statmap$normedcoef, covestimator=statmap$sqrtSigma))
}

#debug(lmPBJ)
#test = simFunc(simConfig$form, simConfig$formred, simConfig$mask, readRDS(file.path(simdirs$simdir[101], 'data.rds')), simConfig$nboot, simConfig$cfts.s)


results = runSim(simdirs$simdir, method='synthetic',
       simfunc = simFunc, mask = simConfig$mask,
simfuncArgs = list(
  lmfull= simConfig$form,
  lmred = simConfig$formred,
  mask = simConfig$mask, nboot=simConfig$nboot, cfts=simConfig$cfts.s), ncores = computeConfig$ncores)

dir.create(dirname(simConfig$output), showWarnings = FALSE, recursive = TRUE)
# clean up files
save.image(file=simConfig$output)
#Sys.sleep(5*60)
#unlink(list.files(tempdir(), full.names = TRUE))
#gc()
#unlink(simdirs)
#system('sudo shutdown -h now')

# summarize the results
# apply(rowMeans(simplify2array(x[!is.na(x)]), dims = 2), 2, quantile)
```

```{r, message=FALSE, eval=FALSE}

resultsFixedX = runSim(rep(simdirs$simdir[seq(1, nrow(simdirs), by=simConfig$nsim)], each=simConfig$nsim), method='synthetic',
       simfunc = simFunc, mask = simConfig$mask,
simfuncArgs = list(
  lmfull= simConfig$form,
  lmred = simConfig$formred,
  mask = simConfig$mask), ncores = computeConfig$ncores)

```


## Compare covariance estimator to simulations estimator

```{r, eval=FALSE}
colMeans(do.call(rbind, lapply(results, function(x) c(crossprod(x$covestimator[1,,], x$covestimator[2,,])))))

cov(do.call(rbind, lapply(results, function(x) c(x$estimates))))

```


```{r, eval=FALSE}
load('~/Dropbox (VUMC)/pbj/pbj_ftest/synthsim_transform_images.rdata')
simdirs$results = resultsFixedX# lapply(results, simplify2array)
x =simdirs[simdirs$n==100,]
simdirs$results[ !sapply(simdirs$results, is.numeric) ] = NA
#simdirs$results = lapply(simdirs$results, function(x){ x[,'edgeworth'] = ifelse(is.infinite(x[,'edgeworth']), x[, 't'], x[,'edgeworth']); x})
by(simdirs, simdirs$n, function(x) sum(!is.na(x$results)))
by(simdirs, simdirs$n, function(x) apply(rowMeans(simplify2array(x$results[!is.na(x$results)]), dims = 2), 2, function(x) quantile(x)))
by(simdirs, simdirs$n, function(x) apply(apply(simplify2array(x$results[!is.na(x$results)]), 1:2, var ), 2, function(x) quantile(x)) )
by(simdirs, simdirs$n, function(x) apply(apply(simplify2array(x$results[!is.na(x$results)]), 1:2, function(y) var(y) ), 2, function(x) x) )
by(simdirs, simdirs$n, function(x) apply(apply(simplify2array(x$results[!is.na(x$results)]), 1:2, function(y) sd(y)/sqrt(length(y)) ), 2, function(x) x) )
```

```{r, eval=FALSE}
simConfig$dat$images = simConfig$dat$files
test2 = simFuncCoefs(lmfull= simConfig$form,
 lmred = simConfig$formred,
 mask = simConfig$mask,
 data=simConfig$dat[1:50,])

results = runSim(simdirs$simdir, method='synthetic',
       simfunc = simFuncCoefs, mask = simConfig$mask,
simfuncArgs = list(
  lmfull= simConfig$form,
  lmred = simConfig$formred,
  mask = simConfig$mask), ncores = computeConfig$ncores)

```


```{r, eval=TRUE}
# plotting function for below sections
plots = function(rdata){
  load(rdata)
simdirs$results = results# lapply(results, simplify2array)
x =simdirs[simdirs$n==100,]
simdirs[, paste0('obsMaxClust_cft.s', simConfig$cfts.s)] = do.call(rbind, by(simdirs, simdirs$n, function(x) do.call(rbind, lapply(x$results, function(y) unlist(y[['obs']]) ) ) ))

cex=1.5
par(mgp=c(1.7,.7,0), lwd=1.5, lend=2, cex.lab=0.8*cex, cex.axis=0.8*cex, cex.main=1*cex, mfrow=c(1,1), mar=c(2.8,2.8,1.8,.2), bty='l')
layout(mat=matrix(1:(length(simConfig$cfts.s)*length(simConfig$ns)), nrow=length(simConfig$cfts.s)) )
# axes are based on tail quantiles
probs = seq(0.75, 1, length.out=100) #length.out=pmin(simConfig$nsim, simConfig$nboot)

trash = by(simdirs, simdirs$n, function(df){
  for(cftInd in 1:length(simConfig$cfts.s)){
    ylims = range(sapply(df$results, function(x) range(quantile(x$boot[[cftInd]][[1]], probs=probs))))
    colname = paste0('obsMaxClust_cft.s', simConfig$cfts.s[cftInd])
    x = df[,colname]
    xlims = range(quantile(x, probs=probs) )
    xaxlab = c(0.9, 0.95, 0.99, 0.999) 
    xaxt = quantile(x, probs=xaxlab)
    plot(x, ylim=ylims, xlim=xlims, type='n', xlab='Observed cluster quantile', ylab='Estimated cluster quantile', main=paste('n =', df$n[1], 'cft =', simConfig$cfts.s[cftInd]))
    #axis(side=1, at=xaxt, labels=xaxlab)
    abline(v=xaxt, col='orange', lty=2)
    for(ind in 1:simConfig$nsim){
      points(quantile(x, probs=probs), quantile(df$results[[ind]]$boot[[cftInd]][[1]], probs=probs), type='l')
    }
    abline(a=0,b=1, col='blue')
  }
})

trash = by(simdirs, simdirs$n, function(df){
  for(cftInd in 1:length(simConfig$cfts.s)){
    ylims = range(sapply(df$results, function(x) range(quantile(x$boot[[cftInd]][[1]], probs=probs))))
    colname = paste0('obsMaxClust_cft.s', simConfig$cfts.s[cftInd])
    x = df[,colname]
    xlims = range(quantile(x, probs=probs) )
    xaxlab = c(0.9, 0.95, 0.99, 0.999) 
    xaxt = quantile(x, probs=xaxlab)
    y=colMeans(do.call(rbind, lapply(1:nrow(df), function(ind) quantile(df$results[[ind]]$boot[[cftInd]][[1]], probs=xaxlab)<df[ind,colname])) )
    plot(1-xaxlab, y, type='b', xlab='Target type 1 error', ylab='Actual type 1 error', xlim=range(c(y, 1-xaxlab)), ylim=range(c(y, 1-xaxlab)), , main=paste('n =', df$n[1], 'cft =', simConfig$cfts.s[cftInd]))
    abline(a=0,b=1, col='blue')
  }
})
}
```

## Group covariate

```{r, eval=TRUE}
plots('~/Dropbox (VUMC)/pbj/pbj_ftest/df2_group_covariate.rdata')
```


## Independent continuous covariates

```{r, eval=TRUE}
plots('~/Dropbox (VUMC)/pbj/pbj_ftest/df2_independent_covariates.rdata')
```


## Polynomial continuous covariate

Testing the second and third degree terms of a polynomial covariate.

```{r, eval=TRUE}
plots('~/Dropbox (VUMC)/pbj/pbj_ftest/df2_polynomial_covariate.rdata')
```