From 98d4ebf8be953e5b7326ae04e2457f96dafc404a Mon Sep 17 00:00:00 2001
From: lcolladotor <lcolladotor@gmail.com>
Date: Tue, 26 Mar 2024 13:33:53 -0400
Subject: [PATCH] v0.99.33 -- enable `merge_rse_metrics()` to handle
 `ERCCsumLogErr` via an approximation.

Details of the approximation were discussed at https://jhu-genomics.slack.com/archives/C018ZDB8QLU/p1710454284251579 as part of the `qsvaR` project.

Resolve #3.
---
 DESCRIPTION           |  4 +--
 NEWS.md               |  7 ++++
 R/merge_rse_metrics.R | 74 ++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 79 insertions(+), 6 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 38de78b..ec743b8 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: jaffelab
 Title: Commonly used functions by the Jaffe lab
-Version: 0.99.32
-Date: 2021-12-13
+Version: 0.99.33
+Date: 2024-03-26
 Authors@R: c(person("Leonardo", "Collado-Torres", 
 	role = c("aut", "cre"), email = "lcolladotor@gmail.com",
 	comment = c(ORCID = "0000-0003-2140-308X")), 
diff --git a/NEWS.md b/NEWS.md
index b072722..69a9886 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,10 @@
+# jaffelab 0.99.33
+
+* `merge_rse_metrics()` can now also handle `ERCCsumLogErr`. Note that its an
+approximation given properties of logarithms. To compute the actual correct
+values, you would need access to the original output ERCC quantification output
+files.
+
 # jaffelab 0.99.32
 
 * Added the `skipLines` argument to `junctionCount` to resolve
diff --git a/R/merge_rse_metrics.R b/R/merge_rse_metrics.R
index c18c920..859c9a4 100644
--- a/R/merge_rse_metrics.R
+++ b/R/merge_rse_metrics.R
@@ -8,13 +8,13 @@
 #' @param rse A [RangedSummarizedExperiment-class][SummarizedExperiment::RangedSummarizedExperiment-class]
 #' object with 'concordMapRate', 'overallMapRate', 'mitoRate',
 #' 'rRNA_rate', 'totalAssignedGene', 'numMapped', 'numReads', 'numUnmapped',
-#' 'mitoMapped', 'totalMapped' as either NumericList() or IntegerList() objects
-#' in the colData() columns.
+#' 'mitoMapped', 'totalMapped', 'ERCCsumLogErr' as either NumericList() or
+#' IntegerList() objects in the colData() columns.
 #'
 #' @return A [RangedSummarizedExperiment-class][SummarizedExperiment::RangedSummarizedExperiment-class]
 #' with merged columns for 'concordMapRate', 'overallMapRate', 'mitoRate',
 #' 'rRNA_rate', 'totalAssignedGene', 'numMapped', 'numReads', 'numUnmapped',
-#' 'mitoMapped', 'totalMapped'.
+#' 'mitoMapped', 'totalMapped', 'ERCCsumLogErr'
 #'
 #' @export
 #' @author Leonardo Collado-Torres, Andrew E Jaffe
@@ -52,7 +52,8 @@
 #'     numReads = rand_gen(1e6 + 6e5),
 #'     numUnmapped = rand_gen(5e5),
 #'     mitoMapped = rand_gen(1e5),
-#'     totalMapped = rand_gen(1e6 + 1e5)
+#'     totalMapped = rand_gen(1e6 + 1e5),
+#'     ERCCsumLogErr = rand_gen(-28.5, 4, 6)
 #' )
 #' rse <- SummarizedExperiment(
 #'     assays = SimpleList(counts = counts),
@@ -61,6 +62,41 @@
 #' colData(rse)
 #' rse_merged <- merge_rse_metrics(rse)
 #' colData(rse_merged)
+#'
+#' ## Use the BSP2 real data
+#' local_bsp2 <- tempfile("BSP2_rse_gene.Rdata")
+#' download.file(
+#'     "https://s3.us-east-2.amazonaws.com/libd-brainseq2/rse_gene_unfiltered.Rdata",
+#'     destfile = local_bsp2,
+#'     mode = "wb"
+#' )
+#' load(local_bsp2, verbose = TRUE)
+#' # lobstr::obj_size(rse_gene)
+#' # 872.12 MB
+#' bsp2_rse_merged <- merge_rse_metrics(rse_gene)
+#' colData(bsp2_rse_merged)
+#'
+#' ## Compare the ERCCsumLogErr approximation against the mean of the
+#' ## error values. Note that you would actually have to load the actual
+#' ## ERCC quantification output files in order to compute the correct
+#' ## ERCCsumLogErr values.
+#' bsp2_rse_merged$mean_ERCCsumLogErr <-
+#'     sapply(rse_gene$ERCCsumLogErr, mean)
+#' with(
+#'     colData(bsp2_rse_merged)[lengths(rse_gene$ERCCsumLogErr) > 1, ],
+#'     plot(
+#'         mean_ERCCsumLogErr,
+#'         ERCCsumLogErr,
+#'         ylab = "Approximate ERCCsumLogErr"
+#'     )
+#' )
+#' abline(a = 0, b = 1, col = "red")
+#' with(
+#'     colData(bsp2_rse_merged)[lengths(rse_gene$ERCCsumLogErr) > 1, ],
+#'     summary(mean_ERCCsumLogErr - ERCCsumLogErr)
+#' )
+#'
+
 merge_rse_metrics <- function(rse) {
     stopifnot(is(rse, "RangedSummarizedExperiment"))
     stopifnot(
@@ -101,5 +137,35 @@ merge_rse_metrics <- function(rse) {
     rse$numUnmapped <- sapply(rse$numUnmapped, sum)
     rse$mitoMapped <- sapply(rse$mitoMapped, sum)
     rse$totalMapped <- sapply(rse$totalMapped, sum)
+
+    if("ERCCsumLogErr" %in% colnames(colData(rse))) {
+        message(Sys.time(), " attempting to approximate ERCCsumLogErr.")
+        stopifnot(is(rse$ERCCsumLogErr, "NumericList"))
+
+        # ## Read in the ERCC expected concentration
+        # spikeIns <- read.delim("https://raw.githubusercontent.com/LieberInstitute/SPEAQeasy/master/Annotation/ERCC/ercc_actual_conc.txt",
+        #     as.is = TRUE, row.names = 2
+        # )
+        #
+        # ## Identify the number of ERCC sequences
+        # n_erccs <- nrow(spikeIns)
+        #
+        # ## Calculate the ERCC expected concentration
+        # ercc_expected_conc <- sum(log2(10 * spikeIns[, "concentration.in.Mix.1..attomoles.ul."] + 1))
+        # dput(ercc_expected_conc)
+        # dput(n_erccs)
+        #
+        ## Use pre-computed values
+        ercc_expected_conc <- 643.072778500804
+        n_erccs <- 92L
+
+        ## Then compute the approximate result starting from a NumericList input
+        rse$ERCCsumLogErr <- sapply(rse$ERCCsumLogErr, function(sumlogerr) {
+            log2(mean((2 ^ (
+                sumlogerr + ercc_expected_conc
+            )) ^ (1 / n_erccs)) ^ n_erccs) - ercc_expected_conc
+        })
+    }
+
     return(rse)
 }