Skip to content

Commit

Permalink
Add-on: SCENT algorithm
Browse files Browse the repository at this point in the history
Added Options for Non-binarized counts and Negative Binomial Regression
  • Loading branch information
shakson-isaac committed Jul 26, 2023
1 parent fcf3acc commit b3be208
Show file tree
Hide file tree
Showing 10 changed files with 118 additions and 36 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
export(CreatePeakToGeneList)
export(CreateSCENTObj)
export(SCENT_algorithm)
export(assoc_negbin)
export(assoc_poisson)
export(basic_p)
export(check_dimensions)
Expand Down
10 changes: 6 additions & 4 deletions Parallelized Bash Script/SCENT_parallelization.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@ library(SCENT)
####### INPUTS
#Obtain arguments: (from Cluster)
node = commandArgs(trailingOnly = T)[1] # JOB ARRAY number: node usage
cores = commandArgs(trailingOnly = T)[2] # Number of Cores
cores = commandArgs(trailingOnly = T)[2] # numeric. Number of Cores
SCENTobj_rds = commandArgs(trailingOnly = T)[3] # RDS object file type
celltype = commandArgs(trailingOnly = T)[4] # CellType
output_dir = commandArgs(trailingOnly = T)[5] # Output of each text file to a specific folder
celltype = commandArgs(trailingOnly = T)[4] # character. CellType
regr = commandArgs(trailingOnly = T)[5] # character. Regression Type
bin = commandArgs(trailingOnly = T)[6] # logical. Binarize ATAC counts
output_dir = commandArgs(trailingOnly = T)[7] # Output of each text file to a specific folder


###Example of inputs from the bash script: parallelizedSCENT.sh
Expand All @@ -25,7 +27,7 @@ SCENT_obj <- readRDS(SCENTobj_rds)
SCENT_obj@peak.info <- SCENT_obj@peak.info[[node]]

#### Run SCENT algorithm of Tnk cell type and use 6 cores for parallelization:
SCENT_obj <- SCENT_algorithm(SCENT_obj, celltype, cores)
SCENT_obj <- SCENT_algorithm(SCENT_obj, celltype, cores, regr, bin)

#### Output SCENT results for each gene-peak pair block.
filename <- paste0(output_dir,"SCENTresult_",node,".txt")
Expand Down
3 changes: 2 additions & 1 deletion Parallelized Bash Script/parallelizedSCENT.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@


module load R
Rscript SCENT_parallelization.R $LSB_JOBINDEX ${num_cores} ${file_SCENT_obj} ${celltype} ${output_dir}
Rscript SCENT_parallelization.R $LSB_JOBINDEX ${num_cores} ${file_SCENT_obj} ${celltype} ${regr} ${bin} ${output_dir}

62 changes: 45 additions & 17 deletions R/SCENTfunctions.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,20 @@ assoc_poisson = function(data, idx = seq_len(nrow(data)), formula){
}


#' Perform negative binomial regression: exprs ~ peak + covariates
#'
#' @param data contains expr values and associated peak and covariates for a gene.
#' @param idx rows of the data to use: argument for boot function (bootstrapping)
#' @param formula user defined formula based on initialization in CreateSCENTObj Constructor
#'
#' @return vector: (coefficient of the peak effect on gene, variance of peak effect on gene)
#' @export
assoc_negbin = function(data, idx = seq_len(nrow(data)), formula){
gg = glm.nb(formula, data = data[idx,,drop = FALSE])
c(coef(gg)['atac'], diag(vcov(gg))['atac'])
}



#' Validity and Type Checking for CreateSCENTObject Constructor
#'
Expand Down Expand Up @@ -134,19 +148,25 @@ CreateSCENTObj <- setClass(
#' SCENT Algorithm: Poisson Regression with Empirical P-values through Bootstrapping.
#'
#' @param object SCENT object
#' @param celltype User specified cell type defined in celltypes column of meta.data
#' @param ncores Number of cores to use for Parallelization
#' @param celltype character. User specified cell type defined in celltypes column of meta.data
#' @param ncores numeric. Number of cores to use for Parallelization
#' @param regr character. Regression type: "poisson" or "negbin" for Poisson regression and Negative Binomial regression, respectively
#' @param bin logical. TRUE to binarize ATAC counts. FALSE to NOT binarize ATAC counts
#'
#' @return SCENT object with updated field SCENT.results
#' @export
SCENT_algorithm <- function(object, celltype, ncores){
SCENT_algorithm <- function(object, celltype, ncores, regr = "poisson", bin = TRUE){
res <- data.frame()
for (n in 1:nrow(object@peak.info)){ ####c(1:nrow(chunkinfo))
gene <- object@peak.info[n,1] #GENE is FIRST COLUMN OF PEAK.INFO
this_peak <- object@peak.info[n,2] #PEAK is SECOND COLUMN OF PEAK.INFO
atac_target <- data.frame(cell = colnames(object@atac), atac = object@atac[this_peak,])


#binarize peaks:
atac_target[atac_target$atac>0,]$atac<-1
if(bin){
atac_target[atac_target$atac>0,]$atac<-1
}

mrna_target <- object@rna[gene,]
df <- data.frame(cell=names(mrna_target),exprs=as.numeric(mrna_target))
Expand All @@ -158,32 +178,40 @@ SCENT_algorithm <- function(object, celltype, ncores){
nonzero_m <- length( df2$exprs[ df2$exprs > 0] ) / length( df2$exprs )
nonzero_a <- length( df2$atac[ df2$atac > 0] ) / length( df2$atac )
if(nonzero_m > 0.05 & nonzero_a > 0.05){
# poisson Regression
#Run Regression Once Before Bootstrapping:
res_var <- "exprs"
pred_var <- c("atac", object@covariates) ###need to add log....
formula <- as.formula(paste(res_var, paste(pred_var, collapse = "+"), sep = "~"))


#Estimated Coefficients Obtained without Bootstrapping:
base = glm(formula, family = 'poisson', data = df2)
coefs<-summary(base)$coefficients["atac",]
if(regr == "poisson"){
base = glm(formula, family = 'poisson', data = df2)
coefs<-summary(base)$coefficients["atac",]
assoc <- assoc_poisson
} else if (regr == "negbin"){
base = glm.nb(formula, data = df2)
coefs<-summary(base)$coefficients["atac",]
assoc <- assoc_negbin
}

###Iterative Bootstrapping Procedure: Estimate the Beta coefficients and associate a 2-sided p-value.
bs = boot::boot(df2,assoc_poisson, R = 100, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores)
bs = boot::boot(df2,assoc, R = 100, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores)
p0 = basic_p(bs$t0[1], bs$t[,1])
if(p0<0.1){
bs = boot::boot(df2,assoc_poisson, R = 500, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores)
bs = boot::boot(df2,assoc, R = 500, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores)
p0 = basic_p(bs$t0[1], bs$t[,1])
}
if(p0<0.05){
bs = boot::boot(df2,assoc_poisson, R = 2500, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores)
bs = boot::boot(df2,assoc, R = 2500, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores)
p0 = basic_p(bs$t0[1], bs$t[,1])
}
if(p0<0.01){
bs = boot::boot(df2,assoc_poisson, R = 25000, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores)
bs = boot::boot(df2,assoc, R = 25000, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores)
p0 = basic_p(bs$t0[1], bs$t[,1])
}
if(p0<0.001){
bs = boot::boot(df2,assoc_poisson, R = 50000, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores)
bs = boot::boot(df2,assoc, R = 50000, formula = formula, stype = 'i', parallel = "multicore", ncpus = ncores)
p0 = basic_p(bs$t0[1], bs$t[,1])
}
out <- data.frame(gene=gene,peak=this_peak,beta=coefs[1],se=coefs[2],z=coefs[3],p=coefs[4],boot_basic_p=p0)
Expand All @@ -201,12 +229,12 @@ SCENT_algorithm <- function(object, celltype, ncores){
#' Creating Cis Gene-Peak Pair Lists to Parallelize Through
#'
#' @param object SCENT object
#' @param genebed File directory for bed file that contains 500 kb windows for each gene
#' @param nbatch Number of batches to produce: Length of the list
#' @param tmpfile Location of temporary file.
#' @param intersectedfile Location of intersected file.
#' @param genebed character. File directory for bed file that contains 500 kb windows for each gene
#' @param nbatch numeric. Number of batches to produce: Length of the list
#' @param tmpfile character. Location of temporary file.
#' @param intersectedfile character. Location of intersected file.
#'
#' @return SCENT object with updated field of peak.info
#' @return SCENT object with updated field of peak.info.list
#' @export
CreatePeakToGeneList <- function(object,genebed="/path/to/GeneBody_500kb_margin.bed",nbatch,tmpfile="./temporary_atac_peak.bed",intersectedfile="./temporary_atac_peak_intersected.bed.gz"){
peaknames <- rownames(object@atac) # peak by cell matrix
Expand Down
3 changes: 2 additions & 1 deletion R/import_packages.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#' @import methods Hmisc R.utils data.table stringr
#' @import methods Hmisc data.table stringr
#' @import R.utils
#' @import lme4 boot MASS parallel
#' @import Matrix
NULL
10 changes: 5 additions & 5 deletions man/CreatePeakToGeneList.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion man/SCENT-class.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 7 additions & 3 deletions man/SCENT_algorithm.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 21 additions & 0 deletions man/assoc_negbin.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

30 changes: 26 additions & 4 deletions vignettes/SCENT_interactive.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -55,23 +55,45 @@ head(SCENT_obj@rna[1:10,1:2])
head(SCENT_obj@atac[1:10,1:2])
head([email protected])
head([email protected])
str(SCENT_obj)
```

## SCENT Algorithm: Poisson Regression w/ Bootstrapping
## SCENT Algorithm: Obtain small list of gene-peak pairs.

```{r gene_peak}
#Of the set of peak gene pairs: pick a set of pairs to test:
#Example: (first 10 gene-peak pairs)
[email protected] <- [email protected][1:10,]
head([email protected])
```
## SCENT Algorithm: Options for Regression w/ Bootstrapping.

```{r gene_peak}
#Run SCENT algorithm of Tnk cell type and use 6 cores for parallelization:
#Default: Poisson regression and Binarized ATAC counts
SCENT_obj_ver1 <- SCENT_algorithm(SCENT_obj, "Tnk", 6)
# By default settings the above will perform parallelizations using Poisson regression and Binarized counts.
#Option 1: Poisson regression and Non-Binarized ATAC counts
SCENT_obj_ver2 <- SCENT_algorithm(SCENT_obj, "Tnk", 6, regr = "poisson", bin = FALSE)
#Option 2: Negative Binomial regression and Binarized ATAC counts
SCENT_obj_ver3 <- SCENT_algorithm(SCENT_obj, "Tnk", 6, regr = "negbin", bin = TRUE)
#Option 3: Negative Binomial regression and Non-Binarized ATAC counts
SCENT_obj_ver4 <- SCENT_algorithm(SCENT_obj, "Tnk", 6, regr = "negbin", bin = FALSE)
```

## Output of SCENT Algorithm

```{r SCENT_algo}
#Run SCENT algorithm of Tnk cell type and use 6 cores for parallelization:
SCENT_obj <- SCENT_algorithm(SCENT_obj, "Tnk", 6)
head([email protected])
head([email protected])
head([email protected])
head([email protected])
head([email protected])
```

```
Expand Down

0 comments on commit b3be208

Please sign in to comment.