update docs, vignette, and calc confidence options.

rnabioco · Oct 1, 2023 · 2e41592 · 2e41592
1 parent 64d1ad1
commit 2e41592
Show file tree

Hide file tree

Showing 17 changed files with 158 additions and 126 deletions.
diff --git a/R/annot_rse.R b/R/annot_rse.R
@@ -20,7 +20,7 @@
 #' plus strand).
 #' @param drop If TRUE, remove sites overlapping SNPs
 #' @param RLE If TRUE, columns added will returned as [S4Vectors::Rle()] vectors
-#' to reduce memory
+#' to reduce memory usage.
 #'
 #' @param ... For the generic, further arguments to pass to specific methods.
 #' Unused for now.
@@ -179,14 +179,15 @@ annot_snps.SummarizedExperiment <- function(obj, ...) {
 #'
 #' @description Utility function to map annotations from GRanges to rowData of
 #'   SummarizedExperiment or to mcols of GRanges object. If multiple features overlap then
-#'   they will be concatenated as comma separated values.
+#'   they will be concatenated with the specified separtor string .
 #'
 #' @param obj RangedSummarizedExperiment or GRanges object
 #' @param gr GRanges with annotations to map to obj
-#' @param cols_to_map character vector of columns from gr to map to obj. If the
-#'   vector has names, the names will be the column names in the output obj
+#' @param cols_to_map character vector of columns from GRanges to map to SummarizedExperiment. 
+#' If the vector has names, the names will be the column names in the output.
 #' @param RLE If TRUE, columns added will returned as [S4Vectors::Rle()] vectors
 #' to reduce memory
+#' @param sep separator string, defaults to comma. 
 #' @param ... additional arguments to pass to [GenomicRanges::findOverlaps()]
 #'
 #' @return Either a SummarizedExperiment or GRanges object with additional
@@ -210,7 +211,7 @@ annot_snps.SummarizedExperiment <- function(obj, ...) {
 #' @importFrom GenomeInfoDb seqlevelsStyle seqlevelsStyle<- seqlevels
 #'
 #' @export
-annot_from_gr <- function(obj, gr, cols_to_map, RLE = TRUE, ...) {
+annot_from_gr <- function(obj, gr, cols_to_map, RLE = TRUE, sep = ",", ...) {
     if (is(obj, "RangedSummarizedExperiment")) {
         gr_sites <- rowRanges(obj)
         return_se <- TRUE
@@ -241,7 +242,7 @@ annot_from_gr <- function(obj, gr, cols_to_map, RLE = TRUE, ...) {
             overlaps,
             tmp = unstrsplit(
                 unique(eval(parse(text = col))),
-                ","
+                sep = sep
             ),
             drop = FALSE
         )

diff --git a/R/calc_AEI.R b/R/calc_AEI.R
@@ -6,7 +6,7 @@
 #'   ALU elements in the human genome, and these regions have a high A-to-I
 #'   editing signal compared to other regions such as coding exons. This
 #'   function will perform pileup at specified repeat regions and return a
-#'   summary AEI metric.
+#'   summary AEI metric. 
 #'
 #' @references Roth, S.H., Levanon, E.Y. & Eisenberg, E. Genome-wide
 #' quantification of ADAR adenosine-to-inosine RNA editing activity. Nat Methods

diff --git a/R/calc_scAEI.R b/R/calc_scAEI.R
@@ -5,8 +5,9 @@
 #'   bases) observed at A positions. The vast majority A-to-I editing occurs in
 #'   ALU elements in the human genome, and these regions have a high A-to-I
 #'   editing signal compared to other regions such as coding exons. This
-#'   function will examine potential editing sites and return
-#'   summary AEI metric per cell.
+#'   function will examine enumerate edited and non-edited base counts at the supplied
+#'   sites and return summary AEI metric per cell. Potential editing sites within
+#'   repeat regions can be generated using `get_scAEI_sites()`.
 #'
 #' @references Roth, S.H., Levanon, E.Y. & Eisenberg, E. Genome-wide
 #' quantification of ADAR adenosine-to-inosine RNA editing activity. Nat Methods

diff --git a/R/differential_editing.R b/R/differential_editing.R
@@ -6,7 +6,7 @@
 #' for each site (`edit_freq`), depth of coverage computed
 #' using the indicated edited nucleotides (`depth`) and new `colData`
 #' columns with the number of edited sites (`n_sites`) and the
-#' fraction of edits (`edit_idx`) is returned.
+#' fraction of edits (`edit_idx`) is returned. 
 #'
 #' @param rse A [RangedSummarizedExperiment] object created by [pileup_sites()]
 #' @param edit_from This should correspond to a nucleotide or assay
@@ -234,11 +234,8 @@ make_de_object <- function(rse,
 #' Perform differential editing
 #'
 #' @description Use `edgeR` or `DESeq2` to perform differential editing
-#'   analysis. This will work for simple designs that have 1 treatment and 1
-#'   control. For more complex designs, we suggest you perform your own.
-#'
-#'   At the moment, this function will only find editing events specific to the
-#'   treatment.
+#'   analysis. This will work for designs that have 1 treatment and 1
+#'   control group. For more complex designs, we suggest you perform your own modeling.
 #'
 #' @param deobj A [RangedSummarizedExperiment] object prepared for differential
 #' editing analysis by [make_de_object()]

diff --git a/R/filter_rse.R b/R/filter_rse.R
@@ -41,7 +41,7 @@ filter_multiallelic <- function(se) {
 
 #' Extract regions surrounding splice sites
 #'
-#' @description Find intervals containing splice sites and their adjacent
+#' @description Extract intervals at splice sites and their adjacent
 #'   regions.
 #'
 #' @param txdb `GenomicFeatures::TxDb`
@@ -162,10 +162,10 @@ filter_splice_variants <- function(rse, txdb,
 
 #' Filter out clustered sequence variants
 #'
-#' @description Sequence variants of multiple allele types (e.g., `A>G`, `A>C`)
-#'   in nearby regions can be due to mis-alignment. Remove variants if multiple
-#'   allele types are present within a given distance in genomic or
-#'   transcriptome coordinate space.
+#' @description Sequence variants of multiple allele types (e.g., `A -> G`, `A -> C`)
+#'   proximal to a putative editing site can be indicative of a region prone to mis-alignment
+#'   artifacts. Sites will be removed if variants of multiple allele types are present 
+#'   within a given distance in genomic or transcriptome coordinate space.
 #'
 #' @param rse `SummarizedExperiment::SummarizedExperiment` containing editing sites
 #' @param txdb `GenomicFeatures::TxDb`
@@ -298,7 +298,10 @@ filter_clustered_variants <- function(rse, txdb,
 #' @param edit_from non-edited base
 #' @param per_sample if TRUE, calculate confidence per sample, otherwise edited
 #' and non-edited counts will be summed across all samples.
-#' @param exp_fraction Numeric, confidence margin parameter for
+#' @param exp_fraction Numeric value between 0 and 1, specifying the expected error
+#' rate 
+#' @param alpha Pseudo-count to add to non-edited base counts
+#' @param beta  Pseudo-count to add to edited base counts
 #'
 #' @examples
 #' rse_adar_ifn <- mock_rse()
@@ -319,14 +322,21 @@ calc_confidence <- function(se,
     edit_to = "G",
     edit_from = "A",
     per_sample = FALSE,
-    exp_fraction = 0.01) {
-    if (length(exp_fraction) != 1) {
-        cli::cli_abort("exp_fraction must be numeric(1)")
+    exp_fraction = 0.01,
+    alpha = 0L,
+    beta = 0L) {
+    if (length(exp_fraction) != 1 || (exp_fraction < 0 || exp_fraction > 1)) {
+        cli::cli_abort("exp_fraction must be numeric(1) and between 0 and 1")
     }
+
+    if (length(alpha) != 1 || length(beta) != 1) {
+        cli::cli_abort("alpha and beta must be length 1")
+    }
+
     edit_to <- paste0("n", edit_to)
     edit_from <- paste0("n", edit_from)
-    alt <- assay(se, edit_to)
-    ref <- assay(se, edit_from)
+    alt <- assay(se, edit_to) + as.integer(beta)
+    ref <- assay(se, edit_from) + as.integer(alpha)
     if (per_sample) {
         nc <- ncol(se)
         res <- vapply(seq_len(nc), function(i) {

diff --git a/R/pileup.R b/R/pileup.R
@@ -1,19 +1,26 @@
 #' Generate base counts using pileup
-#'
+#' 
+#' @description This function uses a pileup routine to examine numerate base counts from
+#'   alignments at specified sites, regions, or across all read alignments, from one or 
+#'   more BAM files. Alignment and site filtering
+#'   options are controlled by the `FilterParam` class.A [RangedSummarizedExperiment] object 
+#'   is returned, populated with base count statistics for each supplied BAM file.
+#'   
 #' @param bamfiles a character vector, [BamFile] or [BamFileList] indicating 1 or
 #' more BAM files to process. If named, the names will be included in the [colData]
 #' of the [RangedSummarizedExperiment] as a `sample` column, otherwise the names will
 #' be taken from the basename of the BAM file.
-#' @param fasta path to genome fasta file used for read alignment. Can be in
-#' gzip or bgzip format.
+#' @param fasta path to genome fasta file used for read alignment. Can be provided in 
+#' compressed gzip or bgzip format.
 #' @param sites a [GRanges] object containing regions or sites to process.
 #' @param region samtools region query string (i.e. `chr1:100-1000`). Can be combined
 #' with sites, in which case sites will be filtered to keep only sites within the
 #' region.
-#' @param chroms chromosomes to process, not to be used with region.
+#' @param chroms chromosomes to process, provided as a character vector. Not to be used with the
+#' region parameter.
 #' @param param object of class [FilterParam()] which specify various
 #'   filters to apply to reads and sites during pileup.
-#' @param umi_tag The bam tag containing a UMI sequence. If supplied, multiple
+#' @param umi_tag The BAM tag containing a UMI sequence. If supplied, multiple
 #'   reads with the same UMI sequence will only be counted once per position.
 #' @param BPPARAM A [BiocParallel] class to control parallel execution. Parallel
 #'   processing occurs per chromosome and is disabled when run on a single
@@ -33,7 +40,7 @@
 #'   * `REF`: The reference base
 #'   * `rpbz`: Mann-Whitney U test of Read Position Bias from bcftools,
 #'     extreme negative or positive values indicate more bias.
-#'   * `vdb`: Variant Distance Bias for filtering splice-site artefacts from
+#'   * `vdb`: Variant Distance Bias for filtering splice-site artifacts from
 #'     bcftools, lower values indicate more bias.
 #'   * `sor` Strand Odds Ratio Score, strand bias estimated by the Symmetric
 #'     Odds Ratio test, based on GATK code. Higher values indicate more bias.

diff --git a/R/sc-pileup.R b/R/sc-pileup.R
@@ -1,22 +1,22 @@
-#' Pileup sites per cell
+#' Generate base counts per cell
 #'
-#' @description This function performs a pileup operation at specified
-#'   sites, returning counts for Reference (e.g. unedited) or Alternate (e.g.
-#'   edited) bases. `pileup_cells` can process either a 10x Genomic's style
-#'   library, from a aligned BAM file containing a tag with a cell-barcode and a
-#'   tag with a UMI, or smart-seq style libraries without cell-barcodes.
+#' @description This function processes scRNA-seq library to enumerate base counts 
+#'   for Reference (unedited) or Alternate (
+#'   edited) bases at specified sites in single cells. `pileup_cells` can process 
+#'   droplet scRNA-seq libraries, from a BAM file containing a cell-barcode and UMI, 
+#'   or well-based libraries that do not contain cell-barcodes.
 #'
-#'   The `sites` parameter specifies sites to pileup. This must be a [GRanges]
+#'   The `sites` parameter specifies sites to quantify. This must be a [GRanges]
 #'   object with 1 base intervals, a strand (+ or -), and supplemented with
 #'   metadata columns named `REF` and `ALT` containing the reference and
-#'   alternate base to query. See examples for an example format.
+#'   alternate base to query. See examples for the required format.
 #'
 #'   At each site, bases from overlapping reads will be examined, and counts of
 #'   each ref and alt base enumerated for each cell-barcode present. A single
 #'   base will be counted once for each UMI sequence present in each cell.
 #'
-#' @param bamfiles a path to a BAM file (for 10x libraries), or a vector of paths
-#' to BAM files (smart-seq2). Can be supplied as a character vector, [BamFile], or
+#' @param bamfiles a path to a BAM file (for droplet scRNA-seq), or a vector of paths
+#' to BAM files (Smart-seq2). Can be supplied as a character vector, [BamFile], or
 #' [BamFileList].
 #' @param sites a GRanges object containing sites to process. See examples for
 #'   valid formatting.
@@ -25,7 +25,7 @@
 #' @param chroms A character vector of chromosomes to process. If supplied, only
 #'   sites present in the listed chromosomes will be processed
 #' @param cell_barcodes A character vector of single cell barcodes to process. If
-#' processing multiple BAM files (e.g. smart-seq-2), provide a character vector
+#' processing multiple BAM files (e.g. Smart-seq2), provide a character vector
 #' of unique identifiers for each input BAM, to name each BAM file in the output files.
 #' @param param object of class [FilterParam()] which specify various filters to
 #'   apply to reads and sites during pileup. Note that the `min_depth` and
@@ -34,11 +34,9 @@
 #'   set to 2, then at least 2 reads (from any cell) must have a variant in order
 #'   to report the site. Setting `min_depth` and `min_variant_reads` to 0 reports
 #'   all sites present in the `sites` object. The following options are not enabled
-#'   for pileup_cells(): max_mismatch_type, homopolymer_len, and min_allelic_freq.
+#'   for pileup_cells(): `max_mismatch_type`, `homopolymer_len`, and `min_allelic_freq`.
 #' @param umi_tag tag in BAM containing the UMI sequence
 #' @param cb_tag tag in BAM containing the cell-barcode sequence
-#' @param paired_end set to TRUE if data is paired-end to prevent double counting
-#' overlapping read pairs.
 #' @param return_sce if `TRUE`, data is returned as a SingleCellExperiment, if
 #'   `FALSE` a character vector of the output files, specified by
 #'   `outfile_prefix`, will be returned.
@@ -78,16 +76,19 @@
 #' sce <- pileup_cells(bam_fn, gr, cbs, outdir, param = fp)
 #' sce
 #'
-#' # example of processing multiple smart-seq2 style libraries
+#' # example of processing multiple Smart-seq2 style libraries
 #'
 #' many_small_bams <- rep(bam_fn, 10)
 #' bam_ids <- LETTERS[1:10]
+#' 
+#' fp <- FilterParam(library_type = "unstranded",
+#'                   remove_overlaps = TRUE)
+#' 
 #' pileup_cells(many_small_bams,
 #'     sites = gr,
 #'     cell_barcodes = bam_ids,
 #'     cb_tag = NULL,
 #'     umi_tag = NULL,
-#'     paired_end = TRUE,
 #'     outdir,
 #'     param = fp
 #' )
@@ -107,7 +108,6 @@ pileup_cells <- function(bamfiles,
     chroms = NULL,
     umi_tag = "UB",
     cb_tag = "CB",
-    paired_end = FALSE,
     param = FilterParam(),
     BPPARAM = SerialParam(),
     return_sce = TRUE,
@@ -189,7 +189,6 @@ pileup_cells <- function(bamfiles,
                 cb_tag = cb_tag,
                 umi_tag = umi_tag,
                 param = param,
-                pe = paired_end,
                 verbose = verbose
             ),
             BPPARAM = BPPARAM,
@@ -211,7 +210,6 @@ pileup_cells <- function(bamfiles,
                 cb_tag = cb_tag,
                 umi_tag = umi_tag,
                 param = param,
-                pe = paired_end,
                 verbose = verbose
             ),
             BPPARAM = BPPARAM,
@@ -261,7 +259,7 @@ defaultScBamFlags <- Rsamtools::scanBamFlag(
 get_sc_pileup <- function(bamfn, index, id, sites, barcodes,
                           outfile_prefix, chrom,
                           umi_tag, cb_tag, param,
-                          pe, verbose) {
+                          verbose) {
     if (length(chrom) > 0) {
         sites <- sites[seqnames(sites) %in% chrom, ]
     }
@@ -293,7 +291,7 @@ get_sc_pileup <- function(bamfn, index, id, sites, barcodes,
         fp[["library_type"]],
         plp_outfns,
         umi_tag,
-        pe,
+        fp$lgl_args[["remove_overlaps"]],
         fp[["min_mapq"]],
         max(fp$int_args["min_variant_reads"], fp$int_args["min_depth"])
     )

diff --git a/man/annot_from_gr.Rd b/man/annot_from_gr.Rd
diff --git a/man/annot_snps.Rd b/man/annot_snps.Rd
diff --git a/man/calc_confidence.Rd b/man/calc_confidence.Rd
diff --git a/man/calc_scAEI.Rd b/man/calc_scAEI.Rd