Skip to content

Commit

Permalink
Merge pull request #70 from bartongroup/development
Browse files Browse the repository at this point in the history
0.6.7
  • Loading branch information
fruce-ki authored Jun 2, 2022
2 parents 749bf9e + 5d3363d commit b0712f1
Show file tree
Hide file tree
Showing 52 changed files with 8,745 additions and 465 deletions.
3 changes: 0 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,4 @@
.Rhistory
.RData
*.Rproj

.DS_Store
doc
Meta
8 changes: 4 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: rats
Version: 0.6.6
Date: 2019-07-18
Version: 0.6.7
Date: 2022-05-30
Title: Relative Abundance of Transcripts
Encoding: UTF-8
Author: Kimon Froussios [aut], Kira Mourão [aut], Nick Schurch [cre]
Expand All @@ -22,13 +22,13 @@ Imports:
ggplot2 (>= 2.2.0),
rtracklayer,
rhdf5,
wasabi,
GenomicRanges
Suggests:
ggbio,
shiny,
knitr,
rmarkdown,
testthat
URL: https://github.com/bartongroup/Rats, http://www.compbio.dundee.ac.uk
BugReports: https://github.com/bartongroup/RATS/issues
RoxygenNote: 6.1.1
RoxygenNote: 7.1.2
Binary file modified Meta/vignette.rds
Binary file not shown.
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ import(matrixStats)
import(parallel)
import(rhdf5)
import(utils)
import(wasabi)
importFrom(GenomicRanges,makeGRangesListFromDataFrame)
importFrom(GenomicRanges,mcols)
importFrom(parallel,detectCores)
Expand Down
81 changes: 50 additions & 31 deletions R/input.R
Original file line number Diff line number Diff line change
Expand Up @@ -127,51 +127,43 @@ annot2models <- function(annotfile, threads= 1L)


#================================================================================
#' Import abundances directly from salmon and kallisto output.
#' Import abundances directly from kallisto output.
#'
#' Convert Salmon read counts format to Kallisto RHDF5 format,
#' then apply TPM normalisation using the info available from the abundance.h5 files.
#' Apply TPM normalisation using the info available from the abundance.h5 files.
#'
#' Converting, normalising and importing multiple bootstrapped abundance files takes a bit of time.
#' IMPORTANT: This function is currently not intended to be used to import non-bootstrapped quantifications.
#'
#' \code{wasabi} automatically skips format conversion if a folder already contains an \code{abundance.h5} file.
#'
#' @param A_paths (character) A vector of strings, listing the directory paths to the quantifications for the first condition. One directory per replicate, without trailing path dividers. The directory name should be a unique identifier for the sample.
#' @param B_paths (character) A vector of strings, listing the directory paths to the quantifications for the second condition. One directory per replicate, without trailing path dividers.. The directory name should be a unique identifier for the sample.
#' @param annot (data.frame) A table matching transcript identifiers to gene identifiers. This should be the same that you used for quantification and that you will use with \code{call_DTU()}. It is used to order the transcripts consistently throughout RATs.
#' @param scaleto (double) Scaling factor for normalised abundances. (Default 1000000 gives TPM). If a numeric vector is supplied instead, its length must match the total number of samples. The value order should correspond to the samples in group A followed by group B. This allows each sample to be scaled to its own actual library size, allowing higher-throughput samples to carry more weight in deciding DTU.
#' @param half_cooked (logical) If TRUE, input is already in \code{Kallisto} h5 format and \code{wasabi} conversion will be skipped. FALSE can also be used to force wasabi to repeat the conversion even if it exists. (Default FALSE)
#' @param beartext (logical) Instead of importing bootstrap data from the \code{abundance.h5} file of each sample, import it from plaintext files in a \code{bootstrap} subdirectory created by running \code{kallisto}'s \code{h5dump} subcommand (Default FALSE). This workaround circumvents some mysterious .h5 parsing issues on certain systems.
#' @param TARGET_COL The name of the column for the transcript identifiers in \code{annot}. (Default \code{"target_id"})
#' @param PARENT_COL The name of the column for the gene identifiers in \code{annot}. (Default \code{"parent_id"})
#' @param threads (integer) For parallel processing. (Default 1)
#' @return A list of two, representing the TPM abundances per condition. These will be formatted in the RATs generic bootstrapped data input format.
#' @return A list of two, representing the TPM abundances per condition. These will be formatted in the RATs generic data input format, preferably for bootstrapped estimates (if bootstraps are available) or otherwise for plain count estimates.
#'
#' @import parallel
#' @import data.table
#' @import rhdf5
#' @import wasabi
#'
#' @export

fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT_COL="parent_id", half_cooked=FALSE, beartext=FALSE, threads= 1L, scaleto= 1000000)
fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT_COL="parent_id", beartext=FALSE, threads= 1L, scaleto= 1000000)
{
threads <- as.integer(threads) # Can't be decimal.
setDTthreads(threads, restore_after_fork = TRUE)

# Don't assume the annotation is ordered properly.
annot <- tidy_annot(annot, TARGET_COL, PARENT_COL)

# Wasabi?
wasabi::prepare_fish_for_sleuth(c(A_paths, B_paths), force= !half_cooked)

# Sort out scaling factor per sample.
lgth <- c("A"=length(A_paths), "B"=length(B_paths))
sfac <- list("A"=NA_real_, "B"=NA_real_)
if (length(scaleto) == 1) { # uniform scaling
sfac$A <- rep(scaleto, lgth$A)
sfac$B <- rep(scaleto, lgth$B)
sfac$A <- rep(scaleto, lgth["A"])
sfac$B <- rep(scaleto, lgth["B"])
} else { # individual scaling
sfac$A <- scaleto[1:lgth$A]
sfac$B <- scaleto[(1 + lgth$A):(lgth$A + lgth$B)]
Expand Down Expand Up @@ -201,32 +193,59 @@ fish4rodents <- function(A_paths, B_paths, annot, TARGET_COL="target_id", PARENT
# for consistency with the .h5 mode and to allow normalisation to other target sizes
# If data from Salmon/Wasabi or Kallisto abundance.h5...
} else {
content <- rhdf5::h5ls(file.path(fil, "abundance.h5"))
meta <- as.data.table(list( target_id=as.character(rhdf5::h5read(file.path(fil, "abundance.h5"), "/aux/ids")),
eff_length=as.numeric(rhdf5::h5read(file.path(fil, "abundance.h5"), "/aux/eff_lengths")) ))
counts <- as.data.table( rhdf5::h5read(file.path(fil, "abundance.h5"), "/bootstrap") )

if ('bootstrap' %in% content$name) {
counts <- as.data.table( rhdf5::h5read(file.path(fil, "abundance.h5"), "/bootstrap") )
# Normalise raw counts.
tpm <- as.data.table( lapply(counts, function (y) {
cpb <- y / meta$eff_length
tcpb <- sf / sum(cpb)
return(cpb * tcpb)
}) )
# Structure.
dt <- as.data.table( cbind(meta$target_id, tpm) )
with(dt, setkey(dt, V1) )
names(dt)[1] <- TARGET_COL
# Order transcripts to match annotation.
dt <- merge(annot[, c(TARGET_COL), with=FALSE], dt, by=TARGET_COL, all=TRUE)

} else {
counts <- rhdf5::h5read(file.path(fil, "abundance.h5"), "/est_counts")
# Normalize.
cpb <- counts / meta$eff_length
tcpb <- sf / sum(cpb)
tpm <- cpb * tcpb
# Structure
dt <- as.data.table( cbind(meta$target_id, tpm) )
with(dt, setkey(dt, V1) )
names(dt)[1] <- TARGET_COL
names(dt)[2] <- basename(fil)
# Order transcripts to match annotation.
dt <- merge(annot[, c(TARGET_COL), with=FALSE], dt, by=TARGET_COL, all=TRUE)
}

return (dt)
}

# Normalise raw counts.
tpm <- as.data.table( lapply(counts, function (y) {
cpb <- y / meta$eff_length
tcpb <- sf / sum(cpb)
return(cpb * tcpb)
}) )

# Structure.
dt <- as.data.table( cbind(meta$target_id, tpm) )
with(dt, setkey(dt, V1) )
names(dt)[1] <- TARGET_COL
# Order transcripts to match annotation.
dt <- merge(annot[, c(TARGET_COL), with=FALSE], dt, by=TARGET_COL, all=TRUE)

return (dt)
}, mc.cores = min(threads,lgth[cond]), mc.preschedule = TRUE, mc.allow.recursive = FALSE)

# If single measurement for all samples, ie. est_counts instead of bootsraps, merge and unnest to meet un-bootsrapped format.
if( sum(vapply(boots, function(b){length(b)-1}, numeric(1))) == length(boots) ) {
boots <- Reduce(merge, boots)
}

return (boots)
})

names(res) <- c("boot_data_A", "boot_data_B")
if (is.data.table(res[[1]])) {
names(res) <- c("count_data_A", "count_data_B")
} else {
names(res) <- c("boot_data_A", "boot_data_B")
}

return(res)
}

Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,9 @@ BiocManager::install("GenomicRanges")
* Optional dependencies

```
# Gene models
BiocManager::install("ggbio")
# Interactive volcano plot.
install.packages("shiny")
```
Expand Down
Loading

0 comments on commit b0712f1

Please sign in to comment.