Add-on: SCENT algorithm

Added Options for Non-binarized counts and Negative Binomial Regression
immunogenomics · Jul 26, 2023 · b3be208 · b3be208
1 parent fcf3acc
commit b3be208
Show file tree

Hide file tree

Showing 10 changed files with 118 additions and 36 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -3,6 +3,7 @@
 export(CreatePeakToGeneList)
 export(CreateSCENTObj)
 export(SCENT_algorithm)
+export(assoc_negbin)
 export(assoc_poisson)
 export(basic_p)
 export(check_dimensions)

diff --git a/Parallelized Bash Script/SCENT_parallelization.R b/Parallelized Bash Script/SCENT_parallelization.R
@@ -4,10 +4,12 @@ library(SCENT)
 ####### INPUTS
 #Obtain arguments: (from Cluster)
 node = commandArgs(trailingOnly = T)[1] # JOB ARRAY number: node usage
-cores = commandArgs(trailingOnly = T)[2] # Number of Cores
+cores = commandArgs(trailingOnly = T)[2] # numeric. Number of Cores
 SCENTobj_rds = commandArgs(trailingOnly = T)[3] # RDS object file type
-celltype = commandArgs(trailingOnly = T)[4] # CellType
-output_dir  = commandArgs(trailingOnly = T)[5] # Output of each text file to a specific folder
+celltype = commandArgs(trailingOnly = T)[4] # character. CellType
+regr = commandArgs(trailingOnly = T)[5] # character. Regression Type
+bin = commandArgs(trailingOnly = T)[6] # logical. Binarize ATAC counts
+output_dir  = commandArgs(trailingOnly = T)[7] # Output of each text file to a specific folder
 
 
 ###Example of inputs from the bash script: parallelizedSCENT.sh
@@ -25,7 +27,7 @@ SCENT_obj <- readRDS(SCENTobj_rds)
 SCENT_obj@peak.info <- SCENT_obj@peak.info[[node]]
 
 #### Run SCENT algorithm of Tnk cell type and use 6 cores for parallelization:
-SCENT_obj <- SCENT_algorithm(SCENT_obj, celltype, cores)
+SCENT_obj <- SCENT_algorithm(SCENT_obj, celltype, cores, regr, bin)
 
 #### Output SCENT results for each gene-peak pair block.
 filename <- paste0(output_dir,"SCENTresult_",node,".txt")

diff --git a/Parallelized Bash Script/parallelizedSCENT.sh b/Parallelized Bash Script/parallelizedSCENT.sh
@@ -9,4 +9,5 @@
 
 
 module load R
-Rscript SCENT_parallelization.R $LSB_JOBINDEX ${num_cores} ${file_SCENT_obj} ${celltype} ${output_dir}
+Rscript SCENT_parallelization.R $LSB_JOBINDEX ${num_cores} ${file_SCENT_obj} ${celltype} ${regr} ${bin} ${output_dir}
+
diff --git a/R/SCENTfunctions.R b/R/SCENTfunctions.R
@@ -40,6 +40,20 @@ assoc_poisson = function(data, idx = seq_len(nrow(data)), formula){
 }
 
 
+#' Perform negative binomial regression: exprs ~ peak + covariates
+#'
+#' @param data contains expr values and associated peak and covariates for a gene.
+#' @param idx rows of the data to use: argument for boot function (bootstrapping)
+#' @param formula user defined formula based on initialization in CreateSCENTObj Constructor
+#'
+#' @return vector: (coefficient of the peak effect on gene, variance of peak effect on gene)
+#' @export
+assoc_negbin = function(data, idx = seq_len(nrow(data)), formula){
+  gg = glm.nb(formula, data = data[idx,,drop = FALSE])
+  c(coef(gg)['atac'], diag(vcov(gg))['atac'])
+}
+
+
 
 #' Validity and Type Checking for CreateSCENTObject Constructor
 #'
@@ -134,19 +148,25 @@ CreateSCENTObj <- setClass(
 #' SCENT Algorithm: Poisson Regression with Empirical P-values through Bootstrapping.
 #'
 #' @param object SCENT object
-#' @param celltype User specified cell type defined in celltypes column of meta.data
-#' @param ncores Number of cores to use for Parallelization
+#' @param celltype character. User specified cell type defined in celltypes column of meta.data
+#' @param ncores numeric. Number of cores to use for Parallelization
+#' @param regr character. Regression type: "poisson" or "negbin" for Poisson regression and Negative Binomial regression, respectively
+#' @param bin logical. TRUE to binarize ATAC counts. FALSE to NOT binarize ATAC counts
 #'
 #' @return SCENT object with updated field SCENT.results
 #' @export
-SCENT_algorithm <- function(object, celltype, ncores){
+SCENT_algorithm <- function(object, celltype, ncores, regr = "poisson", bin = TRUE){
   res <- data.frame()
   for (n in 1:nrow(object@peak.info)){ ####c(1:nrow(chunkinfo))
     gene <- object@peak.info[n,1] #GENE is FIRST COLUMN OF PEAK.INFO
     this_peak <- object@peak.info[n,2] #PEAK is SECOND COLUMN OF PEAK.INFO
     atac_target <- data.frame(cell = colnames(object@atac), atac = object@atac[this_peak,])
+
+
     #binarize peaks:
-    atac_target[atac_target$atac>0,]$atac<-1
+    if(bin){
+      atac_target[atac_target$atac>0,]$atac<-1
+    }
 
     mrna_target <- object@rna[gene,]
     df <- data.frame(cell=names(mrna_target),exprs=as.numeric(mrna_target))
@@ -158,32 +178,40 @@ SCENT_algorithm <- function(object, celltype, ncores){
     nonzero_m  <- length( df2$exprs[ df2$exprs > 0] ) / length( df2$exprs )
     nonzero_a  <- length( df2$atac[ df2$atac > 0] ) / length( df2$atac )
     if(nonzero_m > 0.05 & nonzero_a > 0.05){
-      # poisson Regression
+      #Run Regression Once Before Bootstrapping:
       res_var <- "exprs"
       pred_var <- c("atac", object@covariates) ###need to add log....
       formula <- as.formula(paste(res_var, paste(pred_var, collapse = "+"), sep = "~"))
 
+
       #Estimated Coefficients Obtained without Bootstrapping:
-      base = glm(formula, family = 'poisson', data = df2)
-      coefs<-summary(base)$coefficients["atac",]
+      if(regr == "poisson"){
+        base = glm(formula, family = 'poisson', data = df2)
+        coefs<-summary(base)$coefficients["atac",]
+        assoc <- assoc_poisson
+      } else if (regr == "negbin"){
+        base = glm.nb(formula, data = df2)
+        coefs<-summary(base)$coefficients["atac",]
+        assoc <- assoc_negbin
+      }
 
       ###Iterative Bootstrapping Procedure: Estimate the Beta coefficients and associate a 2-sided p-value.
-      bs = boot::boot(df2,assoc_poisson, R = 100, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores)
+      bs = boot::boot(df2,assoc, R = 100, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores)
       p0 = basic_p(bs$t0[1], bs$t[,1])
       if(p0<0.1){
-        bs = boot::boot(df2,assoc_poisson, R = 500, formula = formula,  stype = 'i', parallel = "multicore", ncpus = ncores)
+        bs = boot::boot(df2,assoc, R = 500, formula = formula,  stype = 'i', parallel = "multicore", ncpus = ncores)
         p0 = basic_p(bs$t0[1], bs$t[,1])
       }
       if(p0<0.05){
-        bs = boot::boot(df2,assoc_poisson, R = 2500, formula = formula,  stype = 'i', parallel = "multicore", ncpus = ncores)
+        bs = boot::boot(df2,assoc, R = 2500, formula = formula,  stype = 'i', parallel = "multicore", ncpus = ncores)
         p0 = basic_p(bs$t0[1], bs$t[,1])
       }
       if(p0<0.01){
-        bs = boot::boot(df2,assoc_poisson, R = 25000, formula = formula,  stype = 'i', parallel = "multicore", ncpus = ncores)
+        bs = boot::boot(df2,assoc, R = 25000, formula = formula,  stype = 'i', parallel = "multicore", ncpus = ncores)
         p0 = basic_p(bs$t0[1], bs$t[,1])
       }
       if(p0<0.001){
-        bs = boot::boot(df2,assoc_poisson, R = 50000, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores)
+        bs = boot::boot(df2,assoc, R = 50000, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores)
         p0 = basic_p(bs$t0[1], bs$t[,1])
       }
       out <- data.frame(gene=gene,peak=this_peak,beta=coefs[1],se=coefs[2],z=coefs[3],p=coefs[4],boot_basic_p=p0)
@@ -201,12 +229,12 @@ SCENT_algorithm <- function(object, celltype, ncores){
 #' Creating Cis Gene-Peak Pair Lists to Parallelize Through
 #'
 #' @param object SCENT object
-#' @param genebed File directory for bed file that contains 500 kb windows for each gene
-#' @param nbatch Number of batches to produce: Length of the list
-#' @param tmpfile Location of temporary file.
-#' @param intersectedfile Location of intersected file.
+#' @param genebed character. File directory for bed file that contains 500 kb windows for each gene
+#' @param nbatch numeric. Number of batches to produce: Length of the list
+#' @param tmpfile character. Location of temporary file.
+#' @param intersectedfile character. Location of intersected file.
 #'
-#' @return SCENT object with updated field of peak.info
+#' @return SCENT object with updated field of peak.info.list
 #' @export
 CreatePeakToGeneList <- function(object,genebed="/path/to/GeneBody_500kb_margin.bed",nbatch,tmpfile="./temporary_atac_peak.bed",intersectedfile="./temporary_atac_peak_intersected.bed.gz"){
   peaknames <- rownames(object@atac) # peak by cell matrix

diff --git a/R/import_packages.R b/R/import_packages.R
@@ -1,4 +1,5 @@
-#' @import methods Hmisc R.utils data.table stringr
+#' @import methods Hmisc data.table stringr
+#' @import R.utils
 #' @import lme4 boot MASS parallel
 #' @import Matrix
 NULL
diff --git a/man/CreatePeakToGeneList.Rd b/man/CreatePeakToGeneList.Rd
diff --git a/man/SCENT-class.Rd b/man/SCENT-class.Rd
diff --git a/man/SCENT_algorithm.Rd b/man/SCENT_algorithm.Rd
diff --git a/man/assoc_negbin.Rd b/man/assoc_negbin.Rd
diff --git a/vignettes/SCENT_interactive.Rmd b/vignettes/SCENT_interactive.Rmd
@@ -55,23 +55,45 @@ head(SCENT_obj@rna[1:10,1:2])
 head(SCENT_obj@atac[1:10,1:2])
 head([email protected])
 head([email protected])
+str(SCENT_obj)
 ```
 
-## SCENT Algorithm: Poisson Regression w/ Bootstrapping
+## SCENT Algorithm: Obtain small list of gene-peak pairs.
 
 ```{r gene_peak}
 #Of the set of peak gene pairs: pick a set of pairs to test: 
 #Example: (first 10 gene-peak pairs)
 [email protected] <- [email protected][1:10,]
 head([email protected])
+```
+## SCENT Algorithm: Options for Regression w/ Bootstrapping.
+
+```{r gene_peak}
+#Run SCENT algorithm of Tnk cell type and use 6 cores for parallelization:
+
+
+#Default: Poisson regression and Binarized ATAC counts
+SCENT_obj_ver1 <- SCENT_algorithm(SCENT_obj, "Tnk", 6) 
+# By default settings the above will perform parallelizations using Poisson regression and Binarized counts.
+
+#Option 1: Poisson regression and Non-Binarized ATAC counts
+SCENT_obj_ver2 <- SCENT_algorithm(SCENT_obj, "Tnk", 6, regr = "poisson", bin = FALSE)
+
+#Option 2: Negative Binomial regression and Binarized ATAC counts
+SCENT_obj_ver3 <- SCENT_algorithm(SCENT_obj, "Tnk", 6, regr = "negbin", bin = TRUE)
+
+#Option 3: Negative Binomial regression and Non-Binarized ATAC counts
+SCENT_obj_ver4 <- SCENT_algorithm(SCENT_obj, "Tnk", 6, regr = "negbin", bin = FALSE)
+
 ```
 
 ## Output of SCENT Algorithm
 
 ```{r SCENT_algo}
-#Run SCENT algorithm of Tnk cell type and use 6 cores for parallelization:
-SCENT_obj <- SCENT_algorithm(SCENT_obj, "Tnk", 6)
-head([email protected])
+head([email protected])
+head([email protected])
+head([email protected])
+head([email protected])
 ```
 
 ```
Original file line number	Diff line number	Diff line change
Expand Up		@@ -9,4 +9,5 @@


		module load R
		Rscript SCENT_parallelization.R $LSB_JOBINDEX ${num_cores} ${file_SCENT_obj} ${celltype} ${output_dir}
		Rscript SCENT_parallelization.R $LSB_JOBINDEX ${num_cores} ${file_SCENT_obj} ${celltype} ${regr} ${bin} ${output_dir}