Merge pull request #4 from immunogenomics/dev

Updated SCENT package
immunogenomics · Jul 27, 2023 · ecc9b21 · ecc9b21
2 parents 0e25ea4 + 2f8ffcd
commit ecc9b21
Show file tree

Hide file tree

Showing 14 changed files with 138 additions and 47 deletions.
diff --git a/.gitignore b/.gitignore
@@ -7,3 +7,4 @@ improvements
 READMEupdate.md
 RData/
 .DS_Store
+.DS_Store
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,13 +1,14 @@
 Package: SCENT
 Type: Package
 Title: Single-Cell ENhancer Target (SCENT) gene mapping for single cell multimodal data 
-Version: 0.1.0
+Version: 1.0.0
 Author: Saori Sakaue and Shakson Isaac
 Maintainer: Shakson Isaac <[email protected]>
 Description: R package that contains functions for the SCENT algorithm. SCENT uses 
     single-cell multimodal data (e.g., 10X Multiome RNA/ATAC) and links 
     ATAC-seq peaks (putative enhancers) to their target genes by modeling association
     between chromatin accessibility and gene expression across individual single cells.
+Depends: R (>= 3.5.0)
 Imports:
     methods, 
     Hmisc, 

diff --git a/NAMESPACE b/NAMESPACE
@@ -3,6 +3,7 @@
 export(CreatePeakToGeneList)
 export(CreateSCENTObj)
 export(SCENT_algorithm)
+export(assoc_negbin)
 export(assoc_poisson)
 export(basic_p)
 export(check_dimensions)
@@ -18,3 +19,8 @@ import(lme4)
 import(methods)
 import(parallel)
 import(stringr)
+importFrom(stats,as.formula)
+importFrom(stats,coef)
+importFrom(stats,glm)
+importFrom(stats,vcov)
+importFrom(utils,write.table)
diff --git a/Parallelized Bash Script/SCENT_parallelization.R b/Parallelized Bash Script/SCENT_parallelization.R
@@ -4,10 +4,12 @@ library(SCENT)
 ####### INPUTS
 #Obtain arguments: (from Cluster)
 node = commandArgs(trailingOnly = T)[1] # JOB ARRAY number: node usage
-cores = commandArgs(trailingOnly = T)[2] # Number of Cores
+cores = commandArgs(trailingOnly = T)[2] # numeric. Number of Cores
 SCENTobj_rds = commandArgs(trailingOnly = T)[3] # RDS object file type
-celltype = commandArgs(trailingOnly = T)[4] # CellType
-output_dir  = commandArgs(trailingOnly = T)[5] # Output of each text file to a specific folder
+celltype = commandArgs(trailingOnly = T)[4] # character. CellType
+regr = commandArgs(trailingOnly = T)[5] # character. Regression Type
+bin = commandArgs(trailingOnly = T)[6] # logical. Binarize ATAC counts
+output_dir  = commandArgs(trailingOnly = T)[7] # Output of each text file to a specific folder
 
 
 ###Example of inputs from the bash script: parallelizedSCENT.sh
@@ -22,10 +24,10 @@ output_dir  = commandArgs(trailingOnly = T)[5] # Output of each text file to a s
 SCENT_obj <- readRDS(SCENTobj_rds)
 
 #### Get the corresponding dataframe from the list:
-SCENT_obj@peak.info <- SCENT_obj@peak.info.list[[node]]
+SCENT_obj@peak.info <- SCENT_obj@peak.info[[node]]
 
 #### Run SCENT algorithm of Tnk cell type and use 6 cores for parallelization:
-SCENT_obj <- SCENT_algorithm(SCENT_obj, celltype, cores)
+SCENT_obj <- SCENT_algorithm(SCENT_obj, celltype, cores, regr, bin)
 
 #### Output SCENT results for each gene-peak pair block.
 filename <- paste0(output_dir,"SCENTresult_",node,".txt")

diff --git a/Parallelized Bash Script/parallelizedSCENT.sh b/Parallelized Bash Script/parallelizedSCENT.sh
@@ -9,4 +9,5 @@
 
 
 module load R
-Rscript SCENT_parallelization.R $LSB_JOBINDEX ${num_cores} ${file_SCENT_obj} ${celltype} ${output_dir}
+Rscript SCENT_parallelization.R $LSB_JOBINDEX ${num_cores} ${file_SCENT_obj} ${celltype} ${regr} ${bin} ${output_dir}
+
diff --git a/R/SCENTfunctions.R b/R/SCENTfunctions.R
@@ -40,6 +40,20 @@ assoc_poisson = function(data, idx = seq_len(nrow(data)), formula){
 }
 
 
+#' Perform negative binomial regression: exprs ~ peak + covariates
+#'
+#' @param data contains expr values and associated peak and covariates for a gene.
+#' @param idx rows of the data to use: argument for boot function (bootstrapping)
+#' @param formula user defined formula based on initialization in CreateSCENTObj Constructor
+#'
+#' @return vector: (coefficient of the peak effect on gene, variance of peak effect on gene)
+#' @export
+assoc_negbin = function(data, idx = seq_len(nrow(data)), formula){
+  gg = glm.nb(formula, data = data[idx,,drop = FALSE])
+  c(coef(gg)['atac'], diag(vcov(gg))['atac'])
+}
+
+
 
 #' Validity and Type Checking for CreateSCENTObject Constructor
 #'
@@ -132,23 +146,28 @@ CreateSCENTObj <- setClass(
   validity = check_dimensions
 )
 
-
 #' SCENT Algorithm: Poisson Regression with Empirical P-values through Bootstrapping.
 #'
 #' @param object SCENT object
-#' @param celltype User specified cell type defined in celltypes column of meta.data
-#' @param ncores Number of cores to use for Parallelization
+#' @param celltype character. User specified cell type defined in celltypes column of meta.data
+#' @param ncores numeric. Number of cores to use for Parallelization
+#' @param regr character. Regression type: "poisson" or "negbin" for Poisson regression and Negative Binomial regression, respectively
+#' @param bin logical. TRUE to binarize ATAC counts. FALSE to NOT binarize ATAC counts
 #'
 #' @return SCENT object with updated field SCENT.results
 #' @export
-SCENT_algorithm <- function(object, celltype, ncores){
+SCENT_algorithm <- function(object, celltype, ncores, regr = "poisson", bin = TRUE){
   res <- data.frame()
   for (n in 1:nrow(object@peak.info)){ ####c(1:nrow(chunkinfo))
     gene <- object@peak.info[n,1] #GENE is FIRST COLUMN OF PEAK.INFO
     this_peak <- object@peak.info[n,2] #PEAK is SECOND COLUMN OF PEAK.INFO
     atac_target <- data.frame(cell = colnames(object@atac), atac = object@atac[this_peak,])
+
+
     #binarize peaks:
-    atac_target[atac_target$atac>0,]$atac<-1
+    if(bin){
+      atac_target[atac_target$atac>0,]$atac<-1
+    }
 
     mrna_target <- object@rna[gene,]
     df <- data.frame(cell=names(mrna_target),exprs=as.numeric(mrna_target))
@@ -160,32 +179,40 @@ SCENT_algorithm <- function(object, celltype, ncores){
     nonzero_m  <- length( df2$exprs[ df2$exprs > 0] ) / length( df2$exprs )
     nonzero_a  <- length( df2$atac[ df2$atac > 0] ) / length( df2$atac )
     if(nonzero_m > 0.05 & nonzero_a > 0.05){
-      # poisson Regression
+      #Run Regression Once Before Bootstrapping:
       res_var <- "exprs"
-      pred_var <- c("atac", object@covariates) ###need to add log....
+      pred_var <- c("atac", object@covariates)
       formula <- as.formula(paste(res_var, paste(pred_var, collapse = "+"), sep = "~"))
 
+
       #Estimated Coefficients Obtained without Bootstrapping:
-      base = glm(formula, family = 'poisson', data = df2)
-      coefs<-summary(base)$coefficients["atac",]
+      if(regr == "poisson"){
+        base = glm(formula, family = 'poisson', data = df2)
+        coefs<-summary(base)$coefficients["atac",]
+        assoc <- assoc_poisson
+      } else if (regr == "negbin"){
+        base = glm.nb(formula, data = df2)
+        coefs<-summary(base)$coefficients["atac",]
+        assoc <- assoc_negbin
+      }
 
       ###Iterative Bootstrapping Procedure: Estimate the Beta coefficients and associate a 2-sided p-value.
-      bs = boot::boot(df2,assoc_poisson, R = 100, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores)
+      bs = boot::boot(df2,assoc, R = 100, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores)
       p0 = basic_p(bs$t0[1], bs$t[,1])
       if(p0<0.1){
-        bs = boot::boot(df2,assoc_poisson, R = 500, formula = formula,  stype = 'i', parallel = "multicore", ncpus = ncores)
+        bs = boot::boot(df2,assoc, R = 500, formula = formula,  stype = 'i', parallel = "multicore", ncpus = ncores)
         p0 = basic_p(bs$t0[1], bs$t[,1])
       }
       if(p0<0.05){
-        bs = boot::boot(df2,assoc_poisson, R = 2500, formula = formula,  stype = 'i', parallel = "multicore", ncpus = ncores)
+        bs = boot::boot(df2,assoc, R = 2500, formula = formula,  stype = 'i', parallel = "multicore", ncpus = ncores)
         p0 = basic_p(bs$t0[1], bs$t[,1])
       }
       if(p0<0.01){
-        bs = boot::boot(df2,assoc_poisson, R = 25000, formula = formula,  stype = 'i', parallel = "multicore", ncpus = ncores)
+        bs = boot::boot(df2,assoc, R = 25000, formula = formula,  stype = 'i', parallel = "multicore", ncpus = ncores)
         p0 = basic_p(bs$t0[1], bs$t[,1])
       }
       if(p0<0.001){
-        bs = boot::boot(df2,assoc_poisson, R = 50000, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores)
+        bs = boot::boot(df2,assoc, R = 50000, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores)
         p0 = basic_p(bs$t0[1], bs$t[,1])
       }
       out <- data.frame(gene=gene,peak=this_peak,beta=coefs[1],se=coefs[2],z=coefs[3],p=coefs[4],boot_basic_p=p0)
@@ -203,12 +230,12 @@ SCENT_algorithm <- function(object, celltype, ncores){
 #' Creating Cis Gene-Peak Pair Lists to Parallelize Through
 #'
 #' @param object SCENT object
-#' @param genebed File directory for bed file that contains 500 kb windows for each gene
-#' @param nbatch Number of batches to produce: Length of the list
-#' @param tmpfile Location of temporary file.
-#' @param intersectedfile Location of intersected file.
+#' @param genebed character. File directory for bed file that contains 500 kb windows for each gene
+#' @param nbatch numeric. Number of batches to produce: Length of the list
+#' @param tmpfile character. Location of temporary file.
+#' @param intersectedfile character. Location of intersected file.
 #'
-#' @return SCENT object with updated field of peak.info
+#' @return SCENT object with updated field of peak.info.list
 #' @export
 CreatePeakToGeneList <- function(object,genebed="/path/to/GeneBody_500kb_margin.bed",nbatch,tmpfile="./temporary_atac_peak.bed",intersectedfile="./temporary_atac_peak_intersected.bed.gz"){
   peaknames <- rownames(object@atac) # peak by cell matrix

diff --git a/R/import_packages.R b/R/import_packages.R
@@ -1,4 +1,6 @@
 #' @import methods Hmisc R.utils data.table stringr
 #' @import lme4 boot MASS parallel
 #' @import Matrix
+#' @importFrom stats as.formula coef glm vcov
+#' @importFrom utils write.table
 NULL
diff --git a/README.md b/README.md
@@ -72,10 +72,11 @@ SCENT_obj <- CreateSCENTObj(rna = mrna, atac = atac, meta.data = meta,
 ```
 
 Followed by SCENT algorithm:
+
 ```r
-SCENT_obj <- SCENT_algorithm(object = SCENT_obj, celltype = "Tcell", ncores = 6)
+SCENT_obj <- SCENT_algorithm(object = SCENT_obj, celltype = "Tcell", ncores = 6, regr = 'poisson', bin = TRUE)
 ```
-Where the user specifies a `celltype` (in this case "Tcell") for association analysis (in `meta.data` slot in SCENT object) and the number of cores for parallelized bootstrapping.
+The user specifies a `celltype` (in this case “Tcell”) for association analysis (in `meta.data` slot in SCENT object), `ncores` for the number of cores for parallelized bootstrapping, `regr` for the regression type (Poisson ‘poisson’ or Negative Binomial ‘negbin’ regression), and `bin` for whether to binarize ATAC counts (TRUE for binarization or FALSE for not).
 
 The output of the SCENT algorithm will be contained in the field:
 ```r
@@ -93,11 +94,11 @@ Further information on Inputs and Outputs of SCENT are detailed below:
 | 1    | rna (sparse matrix) | A gene-by-cell count matrix from multimodal RNA-seq data. This is a raw count matrix without any normalization. The row names should be the gene names used in the `peak.info` file. The column names are the cell names which should be the same names used in the `cell`column of the dataframe specified for `meta.data`. Sparse matrix format is required. |
 | 2    | atac (sparse matrix) | A peak-by-cell count matrix from multimodal ATAC-seq data. This is a raw count matrix without any normalization. The row names should be the peak names used in the `peak.info` file. The column names are the cell names which should be the same names used in `rna` and the `cell`column of dataframe specified for `meta.data`. The matrix may not be binarized while it will be binarized within the function. Sparse matrix format is required. |
 | 3    | meta.data (dataframe)     | A meta data frame for cells (rows are cells, and **cell names should be in the column named as "cell"**; see below example). Additionally, this text should include covariates to use in the model. Examples include: % mitochondrial reads, log(nUMI), sample, and batch as covariates. Dataframe format is required. |
-| 4    | peak.info (dataframe) | A textfile indicating which gene-peak pairs you want to test in this chunk (see below example). We highly recommend splitting gene-peak pairs into many chunks to increase computational efficiency (See Parallelized Jobs Info in Section 2). List(Dataframe) format which is a list of multiple data frames for parallelization is required. \* |
+| 4    | peak.info (dataframe) | A textfile indicating which gene-peak pairs you want to test in this chunk (see below example) **genes should be in the 1st column and peaks in the 2nd column**. We highly recommend splitting gene-peak pairs into many chunks to increase computational efficiency (See Parallelized Jobs Info in Section 2). List(Dataframe) format which is a list of multiple data frames for parallelization is required. \* |
 | 5    | covariates (a vector of character) | A vector of character fields that denote the covariates listed in the meta.data. For example, a set of covariates can be: %mitochondrial reads, log_nUMI, sample, and batch. Additionally the user can specify transformations to the covariates such as log transformation on nUMI counts for direct usage in the SCENT algorithm invoking poisson glm. **We recommend users to at least use log(number_of_total_RNA_UMI_count_per_cell) as the base model is Poisson regression and we do not include the offset term into the default model.** |
 | 6    | celltypes (character)        | User specified naming of the celltype column in the meta.data file. This column should contain the names of the celltypes you want to test in this association analysis. |
 
-\* Extra Argument: The peak.info.list field can be left blank initially and a created List(Dataframes) can be constructed using the CreatePeakToGeneList function in the SCENT package. This function requires the user to specify a bed file that specifies ~500 kb windows of multiple gene loci to identify cis gene-peak pairs to test. The vignette, SCENT_parallelize.Rmd, will show steps to produce a SCENT object with a peak.info.list field that is used for parallelization in the SCENT_parallelization.R script.
+\* Extra Argument: The peak.info.list field can be left blank initially and a created List(Dataframe) can be constructed using the CreatePeakToGeneList function in the SCENT package. This function requires the user to specify a bed file that specifies ~500 kb windows of multiple gene loci to identify cis gene-peak pairs to test. The vignette, SCENT_parallelize.Rmd, will show steps to produce a SCENT object with a peak.info.list field that is used for parallelization in the SCENT_parallelization.R script.
 
 
 
@@ -179,7 +180,7 @@ dependent on the amount of gene-peak pair batches that is user defined (for cont
 SCENT_parallelize.Rmd vignette). The main part of the bash script contains the line:
 
 ```bash
-Rscript SCENT_parallelization.R $LSB_JOBINDEX ${num_cores} ${file_SCENT_obj} ${celltype} ${output_dir}
+Rscript SCENT_parallelization.R $LSB_JOBINDEX ${num_cores} ${file_SCENT_obj} ${celltype} ${regr} ${bin} ${output_dir}
 ```
 
 Arguments in the bash file are user specified as follows:
@@ -189,8 +190,10 @@ Arguments in the bash file are user specified as follows:
 |1    | LSB_JOBINDEX   | jobarray index specified by BSUB -J SCENT[1-100] |
 |2    | num_cores      | number of cores (ex. 6) to parallelize to the SCENT algorithm |
 |3    | file_SCENT_obj | SCENT object that contains atac_matrix, rna_matrix, metafile, peak_gene_list, etc. To run the SCENT algorithm |
-|4    | celltype       |User specified celltype (ex. "Tcells") to run the SCENT algorithm |
-|5    | output_dir     | User specified directory to output the SCENT results to aggregate once completed. |
+|4    | celltype       | User specified celltype (ex. "Tcells") to run the SCENT algorithm |
+|5    | regr           | User specified regression type (ex. "poisson") to run SCENT algorithm |
+|6    | bin            | User specified choice to binarize ATAC counts (ex. TRUE) |
+|7    | output_dir     | User specified directory to output the SCENT results to aggregate once completed |
 
 
 ### Contact

diff --git a/man/CreatePeakToGeneList.Rd b/man/CreatePeakToGeneList.Rd
diff --git a/man/SCENT-class.Rd b/man/SCENT-class.Rd
diff --git a/man/SCENT_algorithm.Rd b/man/SCENT_algorithm.Rd
diff --git a/man/assoc_negbin.Rd b/man/assoc_negbin.Rd
diff --git a/vignettes/.gitignore b/vignettes/.gitignore
@@ -3,4 +3,3 @@
 Output/
 RData/
 temporary_atac_peak_intersected.bed.gz
-.DS_Store
Original file line number	Diff line number	Diff line change
Expand Up		@@ -9,4 +9,5 @@


		module load R
		Rscript SCENT_parallelization.R $LSB_JOBINDEX ${num_cores} ${file_SCENT_obj} ${celltype} ${output_dir}
		Rscript SCENT_parallelization.R $LSB_JOBINDEX ${num_cores} ${file_SCENT_obj} ${celltype} ${regr} ${bin} ${output_dir}
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,4 +3,3 @@ @@
     Output/
     RData/
     temporary_atac_peak_intersected.bed.gz
-    .DS_Store