From 98b06a3d294df0c6f83a2a28047894925b2fc699 Mon Sep 17 00:00:00 2001 From: Kent Riemondy Date: Thu, 17 Aug 2023 16:47:39 -0600 Subject: [PATCH] standardize fasta naming, and expand fasta file path --- DESCRIPTION | 2 +- NEWS.md | 4 ++++ R/calc_AEI.R | 8 ++++---- R/calc_scAEI.R | 8 +++----- R/pileup.R | 15 ++++++++------- R/utils.R | 10 +++++----- README.md | 18 +++++++++--------- man/calc_AEI.Rd | 4 ++-- man/calc_scAEI.Rd | 6 ++---- man/find_mispriming_sites.Rd | 4 ++-- man/pileup_sites.Rd | 4 ++-- tests/testthat/test_pileup_cells.R | 2 +- vignettes/novel-sites.Rmd | 2 +- vignettes/raer.Rmd | 4 ++-- 14 files changed, 46 insertions(+), 45 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3edccbc..e421209 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: raer Type: Package Title: RNA editing tools in R -Version: 0.99.7 +Version: 0.99.8 Authors@R: c( person("Kent", "Riemondy", , "kent.riemondy@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-0750-1273")), diff --git a/NEWS.md b/NEWS.md index 6bb03db..30c9b02 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# raer 0.99.8 + +* Function arguments involving a fasta file have been renamed to all be `fasta` + # raer 0.99.7 * added a single cell specific AEI calculation (`calc_scAEI()`) diff --git a/R/calc_AEI.R b/R/calc_AEI.R index 4b6675e..3287a02 100644 --- a/R/calc_AEI.R +++ b/R/calc_AEI.R @@ -14,7 +14,7 @@ #' #' @param bamfiles character vector of paths to indexed bam files. If a named #' character vector is supplied the names will be used in the output. -#' @param fasta_fn fasta filename +#' @param fasta fasta filename #' @param alu_ranges [GRanges] with regions to query for #' calculating the AEI, typically ALU repeats. #' @param txdb A [TxDb] object, if supplied, will be used to subset the alu_ranges @@ -64,7 +64,7 @@ #' #' @export calc_AEI <- function(bamfiles, - fasta_fn, + fasta, alu_ranges = NULL, txdb = NULL, snp_db = NULL, @@ -167,7 +167,7 @@ calc_AEI <- function(bamfiles, res <- list() for(i in seq_along(bamfiles)) { bam_fn <- bamfiles[i] - aei <- .AEI_per_bam(bam_fn = bam_fn, fasta_fn = fasta_fn, chroms = chroms, + aei <- .AEI_per_bam(bam_fn = bam_fn, fasta_fn = fasta, chroms = chroms, alu_ranges = alu_ranges, snps = snps, param = param, genes_gr = genes_gr, verbose = verbose, BPPARAM = BPPARAM) res[[path(bam_fn)]] <- aei @@ -267,7 +267,7 @@ calc_AEI <- function(bamfiles, param@only_keep_variants <- FALSE plp <- pileup_sites(bam_fn, - fafile = fasta_fn, + fasta = fasta_fn, sites = alu_sites, chroms = chrom, param = param diff --git a/R/calc_scAEI.R b/R/calc_scAEI.R index f2b8bef..5af80c3 100644 --- a/R/calc_scAEI.R +++ b/R/calc_scAEI.R @@ -121,9 +121,7 @@ calc_scAEI <- function(bamfiles, sites, cell_barcodes, param = FilterParam(), res } -#' @param fasta_fn a path to a BAM file (for 10x libraries), or a vector of paths -#' to BAM files (smart-seq2). Can be supplied as a character vector, `BamFile`, or -#' `BamFileList`. +#' @param fasta Path to a genome fasta file #' @param genes A `GRanges` object with gene coordinates.Alternatively a `TxDb` object, #' which if supplied, will be used extract gene coordinates. #' @param alus `GRanges` with repeat regions to query for calculating the AEI, @@ -138,7 +136,7 @@ calc_scAEI <- function(bamfiles, sites, cell_barcodes, param = FilterParam(), #' #' @rdname calc_scAEI #' @export -get_scAEI_sites <- function(fasta_fn, +get_scAEI_sites <- function(fasta, genes, alus, edit_from = "A", @@ -159,7 +157,7 @@ get_scAEI_sites <- function(fasta_fn, alus$id <- seq_along(alus) gn_alus <- prep_genic_alu_regions(genes, alus) aei_sites <- get_aei_site_positions(gn_alus, - fasta_fn, + fasta, edit_from) aei_sites$ALT <- edit_to diff --git a/R/pileup.R b/R/pileup.R index d48eef6..52d96e2 100644 --- a/R/pileup.R +++ b/R/pileup.R @@ -4,7 +4,7 @@ #' more BAM files to process. If named, the names will be included in the [colData] #' of the [RangedSummarizedExperiment] as a `sample` column, otherwise the names will #' be taken from the basename of the BAM file. -#' @param fafile path to genome fasta file used for read alignment. Can be in +#' @param fasta path to genome fasta file used for read alignment. Can be in #' gzip or bgzip format. #' @param sites a [GRanges] object containing regions or sites to process. #' @param region samtools region query string (i.e. `chr1:100-1000`). Can be combined @@ -107,7 +107,7 @@ #' @rdname pileup_sites #' @export pileup_sites <- function(bamfiles, - fafile, + fasta, sites = NULL, region = NULL, chroms = NULL, @@ -146,9 +146,10 @@ pileup_sites <- function(bamfiles, cli::cli_abort("index file(s) not found for bam: {mi}") } - if (!file.exists(fafile)) { - cli::cli_abort("fasta file not found: {fafile}") + if (!file.exists(fasta)) { + cli::cli_abort("fasta file not found: {fasta}") } + fasta <- path.expand(fasta) if (is.null(outfile_prefix)) { if (!return_data) { @@ -208,7 +209,7 @@ pileup_sites <- function(bamfiles, setdiff(chroms_to_process, missing_chroms) } - chroms_in_fa <- seqnames(Rsamtools::scanFaIndex(fafile)) + chroms_in_fa <- seqnames(Rsamtools::scanFaIndex(fasta)) missing_chroms <- chroms_to_process[!chroms_to_process %in% levels(chroms_in_fa)] @@ -278,7 +279,7 @@ pileup_sites <- function(bamfiles, bfs, bidxs, as.integer(n_files), - fafile, + fasta, region, sites, fp[["int_args"]], @@ -334,7 +335,7 @@ pileup_sites <- function(bamfiles, bfs, bidxs, as.integer(n_files), - fafile, + fasta, ctig, sites, fp[["int_args"]], diff --git a/R/utils.R b/R/utils.R index 9a32dac..ad510a9 100644 --- a/R/utils.R +++ b/R/utils.R @@ -56,7 +56,7 @@ unlist_w_names <- function(x) { #' are returned. #' #' @param bamfile path to bamfile -#' @param fafile path to fasta file +#' @param fasta path to fasta file #' @param pos_5p distance 5' of mispriming site to define mispriming region #' @param pos_3p distance 3' of mispriming site to define mispriming region #' @param min_reads minimum required number of reads at a mispriming site @@ -82,7 +82,7 @@ unlist_w_names <- function(x) { #' find_mispriming_sites(bam_fn, fa_fn) #' #' @export -find_mispriming_sites <- function(bamfile, fafile, pos_5p = 5, pos_3p = 20, +find_mispriming_sites <- function(bamfile, fasta, pos_5p = 5, pos_3p = 20, min_reads = 2, tag = "pa", tag_values = 3:300, n_reads_per_chunk = 1e6, verbose = TRUE){ @@ -140,7 +140,7 @@ find_mispriming_sites <- function(bamfile, fafile, pos_5p = 5, pos_3p = 20, drop = FALSE) res$n_regions <- IRanges::grouplengths(res$grouping) res$grouping <- NULL - res <- pa_seq_context(res, fafile) + res <- pa_seq_context(res, fasta) res } @@ -175,8 +175,8 @@ merge_pa_peaks <- function(gr) { #' @importFrom Rsamtools FaFile scanFa #' @importFrom Biostrings letterFrequency reverseComplement -pa_seq_context <- function(gr, fafile){ - fa <- Rsamtools::FaFile(fafile) +pa_seq_context <- function(gr, fasta){ + fa <- Rsamtools::FaFile(fasta) seqs <- Rsamtools::scanFa(fa, gr) seqs[strand(gr) == "-"] <- Biostrings::reverseComplement(seqs[strand(gr) == "-"]) a_prop <- Biostrings::letterFrequency(seqs, "A") / width(gr) diff --git a/README.md b/README.md index 164b617..93074c3 100644 --- a/README.md +++ b/README.md @@ -138,7 +138,7 @@ sce #> dim: 3 3 #> metadata(0): #> assays(2): nRef nAlt -#> rownames(3): site_2_579_2 site_2_625_2 site_2_589_2 +#> rownames(3): site_2_579_2_AG site_2_589_2_AG site_2_625_2_AG #> rowData names(2): REF ALT #> colnames(3): CACCAAACAACAACAA-1 TATTCCACACCCTCTA-1 GACCTTCAGTTGTAAG-1 #> colData names(0): @@ -150,16 +150,16 @@ sce ``` r assays(sce)$nRef #> 3 x 3 sparse Matrix of class "dgCMatrix" -#> CACCAAACAACAACAA-1 TATTCCACACCCTCTA-1 GACCTTCAGTTGTAAG-1 -#> site_2_579_2 0 0 1 -#> site_2_625_2 0 0 0 -#> site_2_589_2 1 1 2 +#> CACCAAACAACAACAA-1 TATTCCACACCCTCTA-1 GACCTTCAGTTGTAAG-1 +#> site_2_579_2_AG 0 0 1 +#> site_2_589_2_AG 1 1 2 +#> site_2_625_2_AG 0 0 0 assays(sce)$nAlt #> 3 x 3 sparse Matrix of class "dgCMatrix" -#> CACCAAACAACAACAA-1 TATTCCACACCCTCTA-1 GACCTTCAGTTGTAAG-1 -#> site_2_579_2 1 1 1 -#> site_2_625_2 1 1 1 -#> site_2_589_2 0 0 0 +#> CACCAAACAACAACAA-1 TATTCCACACCCTCTA-1 GACCTTCAGTTGTAAG-1 +#> site_2_579_2_AG 1 1 1 +#> site_2_589_2_AG 0 0 0 +#> site_2_625_2_AG 1 1 1 ``` ## Related work diff --git a/man/calc_AEI.Rd b/man/calc_AEI.Rd index aa570fb..4a4d572 100644 --- a/man/calc_AEI.Rd +++ b/man/calc_AEI.Rd @@ -6,7 +6,7 @@ \usage{ calc_AEI( bamfiles, - fasta_fn, + fasta, alu_ranges = NULL, txdb = NULL, snp_db = NULL, @@ -19,7 +19,7 @@ calc_AEI( \item{bamfiles}{character vector of paths to indexed bam files. If a named character vector is supplied the names will be used in the output.} -\item{fasta_fn}{fasta filename} +\item{fasta}{fasta filename} \item{alu_ranges}{\link{GRanges} with regions to query for calculating the AEI, typically ALU repeats.} diff --git a/man/calc_scAEI.Rd b/man/calc_scAEI.Rd index 12f136c..7abd8bc 100644 --- a/man/calc_scAEI.Rd +++ b/man/calc_scAEI.Rd @@ -17,7 +17,7 @@ calc_scAEI( ... ) -get_scAEI_sites(fasta_fn, genes, alus, edit_from = "A", edit_to = "G") +get_scAEI_sites(fasta, genes, alus, edit_from = "A", edit_to = "G") } \arguments{ \item{bamfiles}{a path to a BAM file (for 10x libraries), or a vector of paths @@ -49,9 +49,7 @@ If NULL, a temporary directory will be used.} \item{...}{additional arguments to \code{\link[=pileup_cells]{pileup_cells()}}} -\item{fasta_fn}{a path to a BAM file (for 10x libraries), or a vector of paths -to BAM files (smart-seq2). Can be supplied as a character vector, \code{BamFile}, or -\code{BamFileList}.} +\item{fasta}{Path to a genome fasta file} \item{genes}{A \code{GRanges} object with gene coordinates.Alternatively a \code{TxDb} object, which if supplied, will be used extract gene coordinates.} diff --git a/man/find_mispriming_sites.Rd b/man/find_mispriming_sites.Rd index accf299..f8a188d 100644 --- a/man/find_mispriming_sites.Rd +++ b/man/find_mispriming_sites.Rd @@ -6,7 +6,7 @@ \usage{ find_mispriming_sites( bamfile, - fafile, + fasta, pos_5p = 5, pos_3p = 20, min_reads = 2, @@ -19,7 +19,7 @@ find_mispriming_sites( \arguments{ \item{bamfile}{path to bamfile} -\item{fafile}{path to fasta file} +\item{fasta}{path to fasta file} \item{pos_5p}{distance 5' of mispriming site to define mispriming region} diff --git a/man/pileup_sites.Rd b/man/pileup_sites.Rd index 2ac2b22..74b2070 100644 --- a/man/pileup_sites.Rd +++ b/man/pileup_sites.Rd @@ -7,7 +7,7 @@ \usage{ pileup_sites( bamfiles, - fafile, + fasta, sites = NULL, region = NULL, chroms = NULL, @@ -48,7 +48,7 @@ more BAM files to process. If named, the names will be included in the \link{col of the \link{RangedSummarizedExperiment} as a \code{sample} column, otherwise the names will be taken from the basename of the BAM file.} -\item{fafile}{path to genome fasta file used for read alignment. Can be in +\item{fasta}{path to genome fasta file used for read alignment. Can be in gzip or bgzip format.} \item{sites}{a \link{GRanges} object containing regions or sites to process.} diff --git a/tests/testthat/test_pileup_cells.R b/tests/testthat/test_pileup_cells.R index 91c69ff..58aa593 100644 --- a/tests/testthat/test_pileup_cells.R +++ b/tests/testthat/test_pileup_cells.R @@ -217,7 +217,7 @@ bulkfp <- FilterParam(min_mapq = 255L, bam_flags = scanBamFlag(isSecondaryAlignment = FALSE, isSupplementaryAlignment = FALSE, isNotPassingQualityControls = FALSE)) -rse <- pileup_sites(bam_fn, fafile = fa_fn, chrom = "2", param = bulkfp) +rse <- pileup_sites(bam_fn, fa_fn, chrom = "2", param = bulkfp) rowData(rse)$ALT <- assay(rse, "ALT")[, 1] sites <- rowRanges(rse) diff --git a/vignettes/novel-sites.Rmd b/vignettes/novel-sites.Rmd index 02bd4e7..d6edfdf 100644 --- a/vignettes/novel-sites.Rmd +++ b/vignettes/novel-sites.Rmd @@ -87,7 +87,7 @@ fp <- FilterParam( bams <- c(rna_bam, wgs_bam) names(bams) <- c("rna", "dna") rse <- pileup_sites(bams, - fafile = fafn, + fasta = fafn, chrom = "chr4", param = fp ) diff --git a/vignettes/raer.Rmd b/vignettes/raer.Rmd index 2b2ee41..7f42a73 100644 --- a/vignettes/raer.Rmd +++ b/vignettes/raer.Rmd @@ -84,7 +84,7 @@ fp <- FilterParam( ) rse <- pileup_sites(bam_files, - fafile = fafn, + fasta = fafn, sites = sites, region = "chr18", param = fp @@ -232,7 +232,7 @@ alus[1:3, ] ```{r} alu_index <- calc_AEI(bam_files, - fasta_fn = fafn, + fasta = fafn, snp_db = alu_snps, alu_ranges = alus, param = fp)