diff --git a/.gitignore b/.gitignore index 13fde90..5ea3a50 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ improvements READMEupdate.md RData/ .DS_Store +.DS_Store diff --git a/DESCRIPTION b/DESCRIPTION index 11dd659..8992776 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,13 +1,14 @@ Package: SCENT Type: Package Title: Single-Cell ENhancer Target (SCENT) gene mapping for single cell multimodal data -Version: 0.1.0 +Version: 1.0.0 Author: Saori Sakaue and Shakson Isaac Maintainer: Shakson Isaac Description: R package that contains functions for the SCENT algorithm. SCENT uses single-cell multimodal data (e.g., 10X Multiome RNA/ATAC) and links ATAC-seq peaks (putative enhancers) to their target genes by modeling association between chromatin accessibility and gene expression across individual single cells. +Depends: R (>= 3.5.0) Imports: methods, Hmisc, diff --git a/NAMESPACE b/NAMESPACE index 9c346d2..76629e6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,7 @@ export(CreatePeakToGeneList) export(CreateSCENTObj) export(SCENT_algorithm) +export(assoc_negbin) export(assoc_poisson) export(basic_p) export(check_dimensions) @@ -18,3 +19,8 @@ import(lme4) import(methods) import(parallel) import(stringr) +importFrom(stats,as.formula) +importFrom(stats,coef) +importFrom(stats,glm) +importFrom(stats,vcov) +importFrom(utils,write.table) diff --git a/Parallelized Bash Script/SCENT_parallelization.R b/Parallelized Bash Script/SCENT_parallelization.R index 221f59b..c5ad18b 100644 --- a/Parallelized Bash Script/SCENT_parallelization.R +++ b/Parallelized Bash Script/SCENT_parallelization.R @@ -4,10 +4,12 @@ library(SCENT) ####### INPUTS #Obtain arguments: (from Cluster) node = commandArgs(trailingOnly = T)[1] # JOB ARRAY number: node usage -cores = commandArgs(trailingOnly = T)[2] # Number of Cores +cores = commandArgs(trailingOnly = T)[2] # numeric. Number of Cores SCENTobj_rds = commandArgs(trailingOnly = T)[3] # RDS object file type -celltype = commandArgs(trailingOnly = T)[4] # CellType -output_dir = commandArgs(trailingOnly = T)[5] # Output of each text file to a specific folder +celltype = commandArgs(trailingOnly = T)[4] # character. CellType +regr = commandArgs(trailingOnly = T)[5] # character. Regression Type +bin = commandArgs(trailingOnly = T)[6] # logical. Binarize ATAC counts +output_dir = commandArgs(trailingOnly = T)[7] # Output of each text file to a specific folder ###Example of inputs from the bash script: parallelizedSCENT.sh @@ -22,10 +24,10 @@ output_dir = commandArgs(trailingOnly = T)[5] # Output of each text file to a s SCENT_obj <- readRDS(SCENTobj_rds) #### Get the corresponding dataframe from the list: -SCENT_obj@peak.info <- SCENT_obj@peak.info.list[[node]] +SCENT_obj@peak.info <- SCENT_obj@peak.info[[node]] #### Run SCENT algorithm of Tnk cell type and use 6 cores for parallelization: -SCENT_obj <- SCENT_algorithm(SCENT_obj, celltype, cores) +SCENT_obj <- SCENT_algorithm(SCENT_obj, celltype, cores, regr, bin) #### Output SCENT results for each gene-peak pair block. filename <- paste0(output_dir,"SCENTresult_",node,".txt") diff --git a/Parallelized Bash Script/parallelizedSCENT.sh b/Parallelized Bash Script/parallelizedSCENT.sh index ca17142..f604e96 100644 --- a/Parallelized Bash Script/parallelizedSCENT.sh +++ b/Parallelized Bash Script/parallelizedSCENT.sh @@ -9,4 +9,5 @@ module load R -Rscript SCENT_parallelization.R $LSB_JOBINDEX ${num_cores} ${file_SCENT_obj} ${celltype} ${output_dir} +Rscript SCENT_parallelization.R $LSB_JOBINDEX ${num_cores} ${file_SCENT_obj} ${celltype} ${regr} ${bin} ${output_dir} + diff --git a/R/SCENTfunctions.R b/R/SCENTfunctions.R index 8d65af9..e43f5c0 100644 --- a/R/SCENTfunctions.R +++ b/R/SCENTfunctions.R @@ -40,6 +40,20 @@ assoc_poisson = function(data, idx = seq_len(nrow(data)), formula){ } +#' Perform negative binomial regression: exprs ~ peak + covariates +#' +#' @param data contains expr values and associated peak and covariates for a gene. +#' @param idx rows of the data to use: argument for boot function (bootstrapping) +#' @param formula user defined formula based on initialization in CreateSCENTObj Constructor +#' +#' @return vector: (coefficient of the peak effect on gene, variance of peak effect on gene) +#' @export +assoc_negbin = function(data, idx = seq_len(nrow(data)), formula){ + gg = glm.nb(formula, data = data[idx,,drop = FALSE]) + c(coef(gg)['atac'], diag(vcov(gg))['atac']) +} + + #' Validity and Type Checking for CreateSCENTObject Constructor #' @@ -132,23 +146,28 @@ CreateSCENTObj <- setClass( validity = check_dimensions ) - #' SCENT Algorithm: Poisson Regression with Empirical P-values through Bootstrapping. #' #' @param object SCENT object -#' @param celltype User specified cell type defined in celltypes column of meta.data -#' @param ncores Number of cores to use for Parallelization +#' @param celltype character. User specified cell type defined in celltypes column of meta.data +#' @param ncores numeric. Number of cores to use for Parallelization +#' @param regr character. Regression type: "poisson" or "negbin" for Poisson regression and Negative Binomial regression, respectively +#' @param bin logical. TRUE to binarize ATAC counts. FALSE to NOT binarize ATAC counts #' #' @return SCENT object with updated field SCENT.results #' @export -SCENT_algorithm <- function(object, celltype, ncores){ +SCENT_algorithm <- function(object, celltype, ncores, regr = "poisson", bin = TRUE){ res <- data.frame() for (n in 1:nrow(object@peak.info)){ ####c(1:nrow(chunkinfo)) gene <- object@peak.info[n,1] #GENE is FIRST COLUMN OF PEAK.INFO this_peak <- object@peak.info[n,2] #PEAK is SECOND COLUMN OF PEAK.INFO atac_target <- data.frame(cell = colnames(object@atac), atac = object@atac[this_peak,]) + + #binarize peaks: - atac_target[atac_target$atac>0,]$atac<-1 + if(bin){ + atac_target[atac_target$atac>0,]$atac<-1 + } mrna_target <- object@rna[gene,] df <- data.frame(cell=names(mrna_target),exprs=as.numeric(mrna_target)) @@ -160,32 +179,40 @@ SCENT_algorithm <- function(object, celltype, ncores){ nonzero_m <- length( df2$exprs[ df2$exprs > 0] ) / length( df2$exprs ) nonzero_a <- length( df2$atac[ df2$atac > 0] ) / length( df2$atac ) if(nonzero_m > 0.05 & nonzero_a > 0.05){ - # poisson Regression + #Run Regression Once Before Bootstrapping: res_var <- "exprs" - pred_var <- c("atac", object@covariates) ###need to add log.... + pred_var <- c("atac", object@covariates) formula <- as.formula(paste(res_var, paste(pred_var, collapse = "+"), sep = "~")) + #Estimated Coefficients Obtained without Bootstrapping: - base = glm(formula, family = 'poisson', data = df2) - coefs<-summary(base)$coefficients["atac",] + if(regr == "poisson"){ + base = glm(formula, family = 'poisson', data = df2) + coefs<-summary(base)$coefficients["atac",] + assoc <- assoc_poisson + } else if (regr == "negbin"){ + base = glm.nb(formula, data = df2) + coefs<-summary(base)$coefficients["atac",] + assoc <- assoc_negbin + } ###Iterative Bootstrapping Procedure: Estimate the Beta coefficients and associate a 2-sided p-value. - bs = boot::boot(df2,assoc_poisson, R = 100, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores) + bs = boot::boot(df2,assoc, R = 100, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores) p0 = basic_p(bs$t0[1], bs$t[,1]) if(p0<0.1){ - bs = boot::boot(df2,assoc_poisson, R = 500, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores) + bs = boot::boot(df2,assoc, R = 500, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores) p0 = basic_p(bs$t0[1], bs$t[,1]) } if(p0<0.05){ - bs = boot::boot(df2,assoc_poisson, R = 2500, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores) + bs = boot::boot(df2,assoc, R = 2500, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores) p0 = basic_p(bs$t0[1], bs$t[,1]) } if(p0<0.01){ - bs = boot::boot(df2,assoc_poisson, R = 25000, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores) + bs = boot::boot(df2,assoc, R = 25000, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores) p0 = basic_p(bs$t0[1], bs$t[,1]) } if(p0<0.001){ - bs = boot::boot(df2,assoc_poisson, R = 50000, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores) + bs = boot::boot(df2,assoc, R = 50000, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores) p0 = basic_p(bs$t0[1], bs$t[,1]) } out <- data.frame(gene=gene,peak=this_peak,beta=coefs[1],se=coefs[2],z=coefs[3],p=coefs[4],boot_basic_p=p0) @@ -203,12 +230,12 @@ SCENT_algorithm <- function(object, celltype, ncores){ #' Creating Cis Gene-Peak Pair Lists to Parallelize Through #' #' @param object SCENT object -#' @param genebed File directory for bed file that contains 500 kb windows for each gene -#' @param nbatch Number of batches to produce: Length of the list -#' @param tmpfile Location of temporary file. -#' @param intersectedfile Location of intersected file. +#' @param genebed character. File directory for bed file that contains 500 kb windows for each gene +#' @param nbatch numeric. Number of batches to produce: Length of the list +#' @param tmpfile character. Location of temporary file. +#' @param intersectedfile character. Location of intersected file. #' -#' @return SCENT object with updated field of peak.info +#' @return SCENT object with updated field of peak.info.list #' @export CreatePeakToGeneList <- function(object,genebed="/path/to/GeneBody_500kb_margin.bed",nbatch,tmpfile="./temporary_atac_peak.bed",intersectedfile="./temporary_atac_peak_intersected.bed.gz"){ peaknames <- rownames(object@atac) # peak by cell matrix diff --git a/R/import_packages.R b/R/import_packages.R index a6397ae..37f7d39 100644 --- a/R/import_packages.R +++ b/R/import_packages.R @@ -1,4 +1,6 @@ #' @import methods Hmisc R.utils data.table stringr #' @import lme4 boot MASS parallel #' @import Matrix +#' @importFrom stats as.formula coef glm vcov +#' @importFrom utils write.table NULL diff --git a/README.md b/README.md index b31a9f7..9d7eeb2 100644 --- a/README.md +++ b/README.md @@ -72,10 +72,11 @@ SCENT_obj <- CreateSCENTObj(rna = mrna, atac = atac, meta.data = meta, ``` Followed by SCENT algorithm: + ```r -SCENT_obj <- SCENT_algorithm(object = SCENT_obj, celltype = "Tcell", ncores = 6) +SCENT_obj <- SCENT_algorithm(object = SCENT_obj, celltype = "Tcell", ncores = 6, regr = 'poisson', bin = TRUE) ``` -Where the user specifies a `celltype` (in this case "Tcell") for association analysis (in `meta.data` slot in SCENT object) and the number of cores for parallelized bootstrapping. +The user specifies a `celltype` (in this case “Tcell”) for association analysis (in `meta.data` slot in SCENT object), `ncores` for the number of cores for parallelized bootstrapping, `regr` for the regression type (Poisson ‘poisson’ or Negative Binomial ‘negbin’ regression), and `bin` for whether to binarize ATAC counts (TRUE for binarization or FALSE for not). The output of the SCENT algorithm will be contained in the field: ```r @@ -93,11 +94,11 @@ Further information on Inputs and Outputs of SCENT are detailed below: | 1 | rna (sparse matrix) | A gene-by-cell count matrix from multimodal RNA-seq data. This is a raw count matrix without any normalization. The row names should be the gene names used in the `peak.info` file. The column names are the cell names which should be the same names used in the `cell`column of the dataframe specified for `meta.data`. Sparse matrix format is required. | | 2 | atac (sparse matrix) | A peak-by-cell count matrix from multimodal ATAC-seq data. This is a raw count matrix without any normalization. The row names should be the peak names used in the `peak.info` file. The column names are the cell names which should be the same names used in `rna` and the `cell`column of dataframe specified for `meta.data`. The matrix may not be binarized while it will be binarized within the function. Sparse matrix format is required. | | 3 | meta.data (dataframe) | A meta data frame for cells (rows are cells, and **cell names should be in the column named as "cell"**; see below example). Additionally, this text should include covariates to use in the model. Examples include: % mitochondrial reads, log(nUMI), sample, and batch as covariates. Dataframe format is required. | -| 4 | peak.info (dataframe) | A textfile indicating which gene-peak pairs you want to test in this chunk (see below example). We highly recommend splitting gene-peak pairs into many chunks to increase computational efficiency (See Parallelized Jobs Info in Section 2). List(Dataframe) format which is a list of multiple data frames for parallelization is required. \* | +| 4 | peak.info (dataframe) | A textfile indicating which gene-peak pairs you want to test in this chunk (see below example) **genes should be in the 1st column and peaks in the 2nd column**. We highly recommend splitting gene-peak pairs into many chunks to increase computational efficiency (See Parallelized Jobs Info in Section 2). List(Dataframe) format which is a list of multiple data frames for parallelization is required. \* | | 5 | covariates (a vector of character) | A vector of character fields that denote the covariates listed in the meta.data. For example, a set of covariates can be: %mitochondrial reads, log_nUMI, sample, and batch. Additionally the user can specify transformations to the covariates such as log transformation on nUMI counts for direct usage in the SCENT algorithm invoking poisson glm. **We recommend users to at least use log(number_of_total_RNA_UMI_count_per_cell) as the base model is Poisson regression and we do not include the offset term into the default model.** | | 6 | celltypes (character) | User specified naming of the celltype column in the meta.data file. This column should contain the names of the celltypes you want to test in this association analysis. | -\* Extra Argument: The peak.info.list field can be left blank initially and a created List(Dataframes) can be constructed using the CreatePeakToGeneList function in the SCENT package. This function requires the user to specify a bed file that specifies ~500 kb windows of multiple gene loci to identify cis gene-peak pairs to test. The vignette, SCENT_parallelize.Rmd, will show steps to produce a SCENT object with a peak.info.list field that is used for parallelization in the SCENT_parallelization.R script. +\* Extra Argument: The peak.info.list field can be left blank initially and a created List(Dataframe) can be constructed using the CreatePeakToGeneList function in the SCENT package. This function requires the user to specify a bed file that specifies ~500 kb windows of multiple gene loci to identify cis gene-peak pairs to test. The vignette, SCENT_parallelize.Rmd, will show steps to produce a SCENT object with a peak.info.list field that is used for parallelization in the SCENT_parallelization.R script. @@ -179,7 +180,7 @@ dependent on the amount of gene-peak pair batches that is user defined (for cont SCENT_parallelize.Rmd vignette). The main part of the bash script contains the line: ```bash -Rscript SCENT_parallelization.R $LSB_JOBINDEX ${num_cores} ${file_SCENT_obj} ${celltype} ${output_dir} +Rscript SCENT_parallelization.R $LSB_JOBINDEX ${num_cores} ${file_SCENT_obj} ${celltype} ${regr} ${bin} ${output_dir} ``` Arguments in the bash file are user specified as follows: @@ -189,8 +190,10 @@ Arguments in the bash file are user specified as follows: |1 | LSB_JOBINDEX | jobarray index specified by BSUB -J SCENT[1-100] | |2 | num_cores | number of cores (ex. 6) to parallelize to the SCENT algorithm | |3 | file_SCENT_obj | SCENT object that contains atac_matrix, rna_matrix, metafile, peak_gene_list, etc. To run the SCENT algorithm | -|4 | celltype |User specified celltype (ex. "Tcells") to run the SCENT algorithm | -|5 | output_dir | User specified directory to output the SCENT results to aggregate once completed. | +|4 | celltype | User specified celltype (ex. "Tcells") to run the SCENT algorithm | +|5 | regr | User specified regression type (ex. "poisson") to run SCENT algorithm | +|6 | bin | User specified choice to binarize ATAC counts (ex. TRUE) | +|7 | output_dir | User specified directory to output the SCENT results to aggregate once completed | ### Contact diff --git a/man/CreatePeakToGeneList.Rd b/man/CreatePeakToGeneList.Rd index d2be3f9..0da980c 100644 --- a/man/CreatePeakToGeneList.Rd +++ b/man/CreatePeakToGeneList.Rd @@ -15,16 +15,16 @@ CreatePeakToGeneList( \arguments{ \item{object}{SCENT object} -\item{genebed}{File directory for bed file that contains 500 kb windows for each gene} +\item{genebed}{character. File directory for bed file that contains 500 kb windows for each gene} -\item{nbatch}{Number of batches to produce: Length of the list} +\item{nbatch}{numeric. Number of batches to produce: Length of the list} -\item{tmpfile}{Location of temporary file.} +\item{tmpfile}{character. Location of temporary file.} -\item{intersectedfile}{Location of intersected file.} +\item{intersectedfile}{character. Location of intersected file.} } \value{ -SCENT object with updated field of peak.info +SCENT object with updated field of peak.info.list } \description{ Creating Cis Gene-Peak Pair Lists to Parallelize Through diff --git a/man/SCENT-class.Rd b/man/SCENT-class.Rd index 3e108f2..038571c 100644 --- a/man/SCENT-class.Rd +++ b/man/SCENT-class.Rd @@ -20,7 +20,9 @@ SCENT Class Constructor \item{\code{meta.data}}{data.frame. Metadata table with covariates and a cell ID column ("cell")} -\item{\code{peak.info}}{ANY. Gene to Corresponding Peaks Table to Identify Signficance through SCENT: Order Must be (Gene, Peak)} +\item{\code{peak.info}}{data.frame. Dataframe that contains gene-peak pairs for SCENT to search through} + +\item{\code{peak.info.list}}{list. List of dataframes that contain gene-peak pairs to parallelize through} \item{\code{covariates}}{character. Assign covariates that are needed for the analysis. Must be names that are in the columns of meta.data} diff --git a/man/SCENT_algorithm.Rd b/man/SCENT_algorithm.Rd index b3a2465..1782af8 100644 --- a/man/SCENT_algorithm.Rd +++ b/man/SCENT_algorithm.Rd @@ -4,14 +4,18 @@ \alias{SCENT_algorithm} \title{SCENT Algorithm: Poisson Regression with Empirical P-values through Bootstrapping.} \usage{ -SCENT_algorithm(object, celltype, ncores) +SCENT_algorithm(object, celltype, ncores, regr = "poisson", bin = TRUE) } \arguments{ \item{object}{SCENT object} -\item{celltype}{User specified cell type defined in celltypes column of meta.data} +\item{celltype}{character. User specified cell type defined in celltypes column of meta.data} -\item{ncores}{Number of cores to use for Parallelization} +\item{ncores}{numeric. Number of cores to use for Parallelization} + +\item{regr}{character. Regression type: "poisson" or "negbin" for Poisson regression and Negative Binomial regression, respectively} + +\item{bin}{logical. TRUE to binarize ATAC counts. FALSE to NOT binarize ATAC counts} } \value{ SCENT object with updated field SCENT.results diff --git a/man/assoc_negbin.Rd b/man/assoc_negbin.Rd new file mode 100644 index 0000000..a896679 --- /dev/null +++ b/man/assoc_negbin.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SCENTfunctions.R +\name{assoc_negbin} +\alias{assoc_negbin} +\title{Perform negative binomial regression: exprs ~ peak + covariates} +\usage{ +assoc_negbin(data, idx = seq_len(nrow(data)), formula) +} +\arguments{ +\item{data}{contains expr values and associated peak and covariates for a gene.} + +\item{idx}{rows of the data to use: argument for boot function (bootstrapping)} + +\item{formula}{user defined formula based on initialization in CreateSCENTObj Constructor} +} +\value{ +vector: (coefficient of the peak effect on gene, variance of peak effect on gene) +} +\description{ +Perform negative binomial regression: exprs ~ peak + covariates +} diff --git a/vignettes/.gitignore b/vignettes/.gitignore index aafa40d..20af2b5 100644 --- a/vignettes/.gitignore +++ b/vignettes/.gitignore @@ -3,4 +3,3 @@ Output/ RData/ temporary_atac_peak_intersected.bed.gz -.DS_Store diff --git a/vignettes/SCENT_interactive.Rmd b/vignettes/SCENT_interactive.Rmd index cc473dd..f535579 100644 --- a/vignettes/SCENT_interactive.Rmd +++ b/vignettes/SCENT_interactive.Rmd @@ -55,23 +55,45 @@ head(SCENT_obj@rna[1:10,1:2]) head(SCENT_obj@atac[1:10,1:2]) head(SCENT_obj@meta.data) head(SCENT_obj@peak.info) +str(SCENT_obj) ``` -## SCENT Algorithm: Poisson Regression w/ Bootstrapping +## SCENT Algorithm: Obtain small list of gene-peak pairs. ```{r gene_peak} #Of the set of peak gene pairs: pick a set of pairs to test: #Example: (first 10 gene-peak pairs) SCENT_obj@peak.info <- SCENT_obj@peak.info[1:10,] head(SCENT_obj@peak.info) +``` +## SCENT Algorithm: Options for Regression w/ Bootstrapping. + +```{r gene_peak} +#Run SCENT algorithm of Tnk cell type and use 6 cores for parallelization: + + +#Default: Poisson regression and Binarized ATAC counts +SCENT_obj_ver1 <- SCENT_algorithm(SCENT_obj, "Tnk", 6) +# By default settings the above will perform parallelizations using Poisson regression and Binarized counts. + +#Option 1: Poisson regression and Non-Binarized ATAC counts +SCENT_obj_ver2 <- SCENT_algorithm(SCENT_obj, "Tnk", 6, regr = "poisson", bin = FALSE) + +#Option 2: Negative Binomial regression and Binarized ATAC counts +SCENT_obj_ver3 <- SCENT_algorithm(SCENT_obj, "Tnk", 6, regr = "negbin", bin = TRUE) + +#Option 3: Negative Binomial regression and Non-Binarized ATAC counts +SCENT_obj_ver4 <- SCENT_algorithm(SCENT_obj, "Tnk", 6, regr = "negbin", bin = FALSE) + ``` ## Output of SCENT Algorithm ```{r SCENT_algo} -#Run SCENT algorithm of Tnk cell type and use 6 cores for parallelization: -SCENT_obj <- SCENT_algorithm(SCENT_obj, "Tnk", 6) -head(SCENT_obj@SCENT.result) +head(SCENT_obj_ver1@SCENT.result) +head(SCENT_obj_ver2@SCENT.result) +head(SCENT_obj_ver3@SCENT.result) +head(SCENT_obj_ver4@SCENT.result) ``` ```