very rough kmer code and doc
Bob Carpenter committed Nov 1, 2020
1 parent 39accd2 commit 6d3f6a2
Showing 16 changed files with 21,182 additions and 174 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -34,3 +34,6 @@ vignettes/*.pdf

# Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
rsconnect/

# unpacked
kmers/data/unpacked/
111 changes: 76 additions & 35 deletions bayes-versus/bayes-versus.Rmd
@@ -2,7 +2,7 @@
title: "For Probabilistic Prediction, Full Bayes is Better than Point
Estimators"
author: "Bob Carpenter"
date: "April 2019"
date: "August 2020"
output:
tufte::tufte_html:
toc: true
@@ -36,7 +36,7 @@ library(MASS)
library(rstan)
rstan_options(auto_write = FALSE)
options(mc.cores = 4)
library(tufte)
# library(tufte)
# UTILITY FUNCTIONS
@@ -627,9 +627,9 @@ err_df <-
rbind(data.frame(error = err, loss = err^2 * 0.5 / 0.5^2,
type = "sq error"),
data.frame(error = err, loss = abs(err),
type = "abs error"),
data.frame(error = err, loss = -log(1 - err) * (0.5 / -log(0.5)),
type = "log loss"))
type = "abs error"),
data.frame(error = err, loss = -log(1 - err) * (0.5 / -log(0.5)),
type = "log loss"))
err_plot <-
ggplot(err_df, aes(x = error, y = loss, group = type)) +
geom_line(aes(color = type)) +
@@ -669,7 +669,7 @@ coefficient, $\beta_1$.
model <- stan_model("logistic-regression.stan")
fit <- sampling(model, data = data, iter = 4000,
control = list(adapt_delta = 0.99, stepsize = 0.01),
refresh = 0, open_progress = FALSE)
refresh = 0, open_progress = FALSE)
```
```{r}
print(fit, prob = c(0.01, 0.99),
@@ -1034,6 +1034,14 @@ almost all such draws are worse than the posterior mode and the
posterior mean.
## Acknowledgements {-}
Thanks to Mark Johnson for inspiring me to write up an example
where full Bayesian inference is better than point estimation. Thanks
also to all the commenters in our Bayesian computation reading group.
Thanks in particular to Yuling Yao for all of the clarifying
questions; I tried to answer them all here.
# Appendix A: Some alternative evals
@@ -1072,26 +1080,28 @@ losses <- function(y, y_hat) {
list(mae = mae, rmse = rmse, log_loss_rate = log_loss_rate)
}
losses_table <- function(y, y_hat, N, K, rho, estimator) {
losses_table <- function(y, y_hat, N, K, rho, estimator, rep_in) {
losses = losses(y, y_hat)
data.frame(estimator = estimator, N = N, K = K, rho = rho,
log_loss_rate = losses$log_loss_rate,
rmse = losses$rmse,
mae = losses$mae)
rmse = losses$rmse,
mae = losses$mae,
rep = rep_in)
}
model <- stan_model("logistic-regression.stan")
Ns <- c(8, 32, 128, 512)
Ks <- c(2, 8, 32, 128)
rhos <- c(0, 0.9)
Ns <- c(8, 16, 32, 64, 128, 256, 512) # , 32, 64, 128, 256, 512, 1024)
Ks <- c(2, 8, 32, 64, 256)
rhos <- c(0) # , 0.9)
results <- data.frame()
for (N in Ns) {
for (K in Ks) {
if (K > 2 * N) next
for (rho in rhos) {
for (rep in 1:10) {
cat("N = ", N, "; K = ", K, "; rho = ", rho, "; rep = ", rep, "\n")
Sigma <- cov_matrix(K, rho)
alpha <- rnorm(1, 0, 0.5)
@@ -1108,16 +1118,16 @@ for (N in Ns) {
N_test = N_test, x_test = x_test, y_test = y_test)
# Bayes
fit <- sampling(model, data = data, refresh = 0, init = 0,
open_progress = FALSE)
fit <- sampling(model, data = data, refresh = 0, init = 0.5,
chains = 4, iter = 4000, control = list(stepsize = 0.01, adapt_delta = 0.95),
open_progress = FALSE)
y_test_hat_bayes <- rep(0, N_test)
for (n in 1:N_test) {
y_test_hat_bayes[n] <- mean(extract(fit)$E_y_test[ , n])
}
results <-
rbind(results,
losses_table(y_test, y_test_hat_bayes, N, K, rho, "Bayes"))
bayes_table <- losses_table(y_test, y_test_hat_bayes, N, K, rho, "Bayes", rep)
# MAP plug-in
fit_map <- optimizing(model, data = data, refresh = 0, verbose = FALSE)
@@ -1127,36 +1137,67 @@ for (N in Ns) {
beta_star[k] <- fit_map$par[[paste('beta[', k, ']', sep = "")]]
y_test_star <- inv_logit(alpha_star + (x_test %*% beta_star))
results <-
rbind(results,
losses_table(y_test, y_test_star, N, K, rho, "MAP"))
map_table <- losses_table(y_test, y_test_star, N, K, rho, "MAP", rep)
# VB plug-in
alpha_hat <- mean(extract(fit)$alpha)
beta_hat <- rep(0, K)
for (k in 1:K)
beta_hat[k] <- mean(extract(fit)$beta[ , k])
y_test_hat_vb <- inv_logit(alpha_hat + (x_test %*% beta_hat))
results <-
rbind(results,
losses_table(y_test, y_test_hat_vb, N, K, rho, "VB"))
vb_table <- losses_table(y_test, y_test_hat_vb, N, K, rho, "VB", rep)
# reduce to difference in error from Bayes to standardize
map_table$log_loss_rate <- map_table$log_loss_rate - bayes_table$log_loss_rate
map_table$rmse <- map_table$rmse - bayes_table$rmse
map_table$mae <- map_table$mae - bayes_table$mae
vb_table$log_loss_rate <- vb_table$log_loss_rate - bayes_table$log_loss_rate
vb_table$rmse <- vb_table$rmse - bayes_table$rmse
vb_table$mae <- vb_table$mae - bayes_table$mae
bayes_table$log_loss_rate <- bayes_table$log_loss_rate - bayes_table$log_loss_rate
bayes_table$rmse <- bayes_table$rmse - bayes_table$rmse
bayes_table$mae <- bayes_table$mae - bayes_table$mae
results <- rbind(results, map_table)
results <- rbind(results, vb_table)
results <- rbind(results, bayes_table)
}
}
}
}
# no idea why this doesn't work any more. :-(
knitr::kable(results,
caption = "Log loss rate, root mean square error (rmse), and mean absolute error (mae) for full Bayes and plug-in estimates with posterior mode (MAP) and mean (VB) given $N$ observations, $K$ predictors, and a base $\\rho$ correlation between adjacent predictors.")
```
```{r}
library(kableExtra)  # provides kable_styling() and the %>% pipe
knitr::kable(results, caption = "Foo") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
print(results, digits = 2)
mae_results <- aggregate(mae ~ estimator + N + K + rho, results, mean)
log_loss_rate_results <- aggregate(log_loss_rate ~ estimator + N + K + rho, results, mean)
rmse_results <- aggregate(rmse ~ estimator + N + K + rho, results, mean)
mae_results <- cbind(mae_results,
loss_type = rep("mae", dim(mae_results)[1]))
log_loss_rate_results <- cbind(log_loss_rate_results,
loss_type = rep("log loss", dim(log_loss_rate_results)[1]))
rmse_results <- cbind(rmse_results,
loss_type = rep("rmse", dim(mae_results)[1]))
names(mae_results)[5] <- "loss"
names(log_loss_rate_results)[5] <- "loss"
names(rmse_results)[5] <- "loss"
plot_df <- rbind(log_loss_rate_results, rmse_results) # , mae_results
sub_df <- subset(plot_df, rho == 0)
sub_df$rho <- NULL
results_plot <-
ggplot(sub_df,
aes(x = N, y = loss, color = estimator, group = estimator)) +
geom_line(aes(group = estimator, color = estimator)) +
scale_x_log10(breaks = Ns) +
facet_grid(loss_type ~ K, scales = "free_y") +
ylab("loss - Bayes loss") +
ggtheme_tufte()
results_plot
```
Do I need something afterward here?
# Appendix B: R Session Information
```{r}
213 changes: 77 additions & 136 deletions bayes-versus/bayes-versus.html

Large diffs are not rendered by default.

Binary file added kmers/data/raw/refseq-select-2020-10-22.fasta.gz
Binary file not shown.
172 changes: 172 additions & 0 deletions kmers/doc/model.tex
@@ -0,0 +1,172 @@
\documentclass{article}

\usepackage{hyperref}
\usepackage{amssymb}
\usepackage{amsmath}

\title{Fast and accurate alignment-free RNA expression estimation with $K$-mers}
\author{Bob Carpenter}
\date{\today}

\begin{document}

\maketitle

\abstract{\noindent We show that reducing both the transcriptome and the
short reads produced by RNA-seq to $K$-mers yields estimates of
relative transcript abundance that are faster to compute and more
accurate than those of traditional methods based on aligning short
reads to the transcriptome.}

\section{Background}

\subsection{Bases}

Let
$\mathbb{B} = \{ \texttt{A}, \texttt{C}, \texttt{G}, \texttt{T} \}$ be
a set representing the four DNA bases, adenine (\texttt{A}), cytosine
(\texttt{C}), guanine (\texttt{G}), and thymine (\texttt{T}).

\subsection{$N$-mers}

An $N$-mer is an $N$-tuple of bases,
$x = (x_1, \ldots, x_N)$. The set of $N$-mers is
$\mathbb{B}^N = \{ (x_1, \ldots, x_N) : x_1, \ldots, x_N \in
\mathbb{B}\}$. The set of base sequences of all lengths is
$\mathbb{B}^* = \bigcup_{N=0}^{\infty} \mathbb{B}^N.$

\subsection{Transcriptome}

Transcripts are the strands of messenger RNA (mRNA) transcribed from
DNA. A transcriptome with $G$ transcript types is an indexed
collection of base sequences, $T = T_1, \ldots, T_G \in
\mathbb{B}^*$.


\section{RNA-seq data}

\subsection{Short reads}

RNA-seq data, after processing, consists of a sequence of short reads
$R = R_1, \ldots, R_M$, where each $R_m \in \mathbb{B}^*$.

\subsection{$K$-mer reduced RNA-seq data}

An $N$-mer $x = (x_1, \ldots, x_N)$ can be reduced to the sequence of
its $N - K + 1$ overlapping $K$-mers, $x_{1:K}, x_{2:K+1}, \ldots,
x_{N-K+1:N}$. This sequence may in turn be reduced to a function
$\textrm{count}_K(x):\mathbb{B}^K \rightarrow \mathbb{N}$, where
$\textrm{count}_K(x)(y)$ is the number of times the $K$-mer $y$
appears in the $N$-mer $x$. Henceforth, we will treat
$\textrm{count}_K(x)$ as a (sparse) $4^K$-vector by ordering the
$K$-mers lexicographically.

A set of $N$-mers representing RNA-seq data can then be reduced to a
$K$-mer count by summing the functions representing its elements,
$\textrm{count}_K(X) = \sum_{x \in X} \textrm{count}_K(x)$.
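
As a concrete illustration, here is a minimal R sketch of this
reduction; the helper names \texttt{all\_kmers}, \texttt{kmer\_counts},
and \texttt{kmer\_counts\_total} are hypothetical and not part of the
accompanying code. It enumerates the $4^K$ $K$-mers lexicographically,
counts them in a single sequence, and sums the counts over a set of
reads.
\begin{verbatim}
bases <- c("A", "C", "G", "T")

# all K-mers over the DNA alphabet, in lexicographic order
all_kmers <- function(K) {
  grid <- expand.grid(rep(list(bases), K), stringsAsFactors = FALSE)
  sort(apply(grid, 1, paste0, collapse = ""))
}

# count_K(x): K-mer counts for a single base sequence x (a string)
kmer_counts <- function(x, K) {
  kmers <- all_kmers(K)
  counts <- setNames(rep(0L, length(kmers)), kmers)
  N <- nchar(x)
  if (N >= K) {
    windows <- substring(x, 1:(N - K + 1), K:N)  # overlapping K-mers
    tab <- table(windows)
    counts[names(tab)] <- as.integer(tab)
  }
  counts
}

# count_K(X): sum of K-mer counts over a set of reads X
kmer_counts_total <- function(X, K) {
  Reduce(`+`, lapply(X, kmer_counts, K = K))
}

kmer_counts_total(c("ACGT", "ACGA"), K = 2)  # small worked example
\end{verbatim}
For realistic values of $K$, the count vector would be stored
sparsely rather than as a dense $4^K$-vector.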

\section{Model}

In a transcriptome consisting of $G$ base sequences, the only
parameter is a vector $\alpha \in \mathbb{R}^G$ of intercepts in a
multi-logit regression. The parameter is most easily understood after
transforming it to a simplex, $\theta = \textrm{softmax}(\alpha)$, in
which $\theta_g$ represents the relative abundance of sequence $g$ in
the transcriptome, where
\[
\textrm{softmax}(\alpha)
= \frac{\exp(\alpha)}
{\textrm{sum}(\exp(\alpha))}
\]
and $\exp(\alpha)$ is defined elementwise. By construction,
$\textrm{softmax}(\alpha)_i > 0$ and
$\textrm{sum}(\textrm{softmax}(\alpha)) = 1$.
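
For concreteness, a minimal R sketch of this transform, assuming
nothing beyond base R:
\begin{verbatim}
# numerically stable softmax: subtracting the max does not change
# the result but avoids overflow in exp()
softmax <- function(alpha) {
  e <- exp(alpha - max(alpha))
  e / sum(e)
}

theta <- softmax(c(-1, 0, 2.5))
sum(theta)  # 1, so theta is a simplex
\end{verbatim}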

\subsection{Likelihood}

The observation $y$ is a sparse $4^K$-vector of $K$-mer counts. The
transcriptome $x$ is represented as a sparse $4^K \times G$-matrix of
$K$-mer counts for each gene. The matrix $x$ is standardized so that
the columns, representing genes, are simplexes giving the relative
frequency of $K$-mers in that gene's sequence of bases.

The likelihood is
\[
  y \sim \textrm{multinomial}(x \cdot \textrm{softmax}(\alpha)).
\]
Because the columns of $x$ are taken to be simplexes and
$\textrm{softmax}(\alpha)$ is a simplex, then
$x \cdot \textrm{softmax}(\alpha)$ is also a simplex, and thus
appropriate as a parameter for a multinomial distribution.
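
As an illustrative sketch (not the accompanying implementation), the
likelihood can be evaluated in R with \texttt{dmultinom}, reusing the
\texttt{softmax} sketched above and assuming the columns of \texttt{x}
have already been standardized to simplexes:
\begin{verbatim}
# log multinomial likelihood of K-mer counts y (length 4^K) given
# the transcriptome matrix x (4^K rows, G columns, columns simplexes)
# and unconstrained abundance parameters alpha (length G)
log_likelihood <- function(alpha, x, y) {
  p <- as.vector(x %*% softmax(alpha))  # predicted K-mer probabilities
  dmultinom(y, prob = p, log = TRUE)
}
\end{verbatim}
In practice, $x$ would be held as a sparse matrix, for example using
the \texttt{Matrix} package.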


\subsection{Prior}

The prior is a normal centered at the origin,
\[
\alpha_g \sim \textrm{normal}(0, \lambda),
\]
for some scale $\lambda > 0$.


\subsection{Posterior}

The log posterior is determined by Bayes's rule, up to an additive
constant that does not depend on the parameters $\alpha$, as
\[
\log p(\alpha \mid x, y) = \log p(y \mid x, \alpha) + \log p(\alpha) +
\textrm{const.}
\]
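
Correspondingly, a sketch of the unnormalized log posterior in R,
combining the \texttt{log\_likelihood} sketch above with the normal
prior of scale $\lambda$:
\begin{verbatim}
# unnormalized log posterior: log likelihood plus normal(0, lambda) prior
log_posterior <- function(alpha, x, y, lambda) {
  log_likelihood(alpha, x, y) +
    sum(dnorm(alpha, mean = 0, sd = lambda, log = TRUE))
}
\end{verbatim}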

\subsubsection{Posterior gradient}

Efficient maximum penalized likelihood estimation and full Bayesian inference
require evaluating gradients of the log posterior with respect to the parameters
$\alpha$. The gradient is\footnote{The derivative is easy to work out
by passing \url{matrixcalculus.org} the query
\begin{center}\texttt{y' * log
(x * exp(a) / sum(exp(a))) - a' * a / (2 * lambda * lambda)}\end{center}
and then simplifying using $\textrm{softmax}$.}
%
\[
\begin{array}{l}
\nabla\!_{\alpha} \, \log p(\alpha \mid x, y)
\\[4pt]
\qquad = \
t \odot \textrm{softmax}(\alpha)
- \textrm{softmax}(\alpha)^{\top}\! \cdot t \cdot \textrm{softmax}(\alpha)
- \frac{\displaystyle \alpha}{\displaystyle \lambda^2},
\end{array}
\]
where
\[
t = x^{\top}\! \cdot (y \oslash (x \cdot \textrm{softmax}(\alpha)))
\]
and $\odot$ and $\oslash$ are elementwise multiplication and
division, respectively.
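
The gradient translates directly into R. The following sketch reuses
\texttt{softmax} from above and treats $\lambda$ as the prior scale,
matching the penalized-likelihood objective below; it is illustrative
rather than the accompanying implementation.
\begin{verbatim}
# gradient of the log posterior with respect to alpha;
# assumes every K-mer with a nonzero count has nonzero probability
grad_log_posterior <- function(alpha, x, y, lambda) {
  s <- softmax(alpha)
  p <- as.vector(x %*% s)              # predicted K-mer probabilities
  t <- as.vector(crossprod(x, y / p))  # t = x' (y ./ p)
  t * s - sum(s * t) * s - alpha / lambda^2
}
\end{verbatim}
A finite-difference check against \texttt{log\_posterior} is a cheap
way to validate the algebra.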

\section{Estimating expression}

\subsection{Maximum a posteriori estimate}

The maximum a posteriori (MAP) estimate for $\alpha$ is given by
\[
\alpha^* = \textrm{arg\,max}_{\alpha} \,
\log \textrm{multinomial}(y \mid x \cdot \textrm{softmax}(\alpha))
+ \log \textrm{normal}(\alpha \mid 0, \lambda \cdot \textrm{I}).
\]
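
One way to compute this estimate, sketched with base R's
\texttt{optim} and the \texttt{log\_posterior} and
\texttt{grad\_log\_posterior} sketches above (the actual
implementation may differ):
\begin{verbatim}
# MAP estimate of alpha by quasi-Newton optimization (L-BFGS-B),
# returned on the simplex scale as estimated relative abundances
map_estimate <- function(x, y, lambda) {
  G <- ncol(x)
  fit <- optim(par = rep(0, G),
               fn = function(a) -log_posterior(a, x, y, lambda),
               gr = function(a) -grad_log_posterior(a, x, y, lambda),
               method = "L-BFGS-B")
  softmax(fit$par)
}
\end{verbatim}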

\subsection{Maximum penalized likelihood estimate}

With a normal prior, the maximum a posteriori (MAP) estimate will be
equivalent to the $\textrm{L}_2$-penalized maximum likelihood estimate
(MLE), which is defined as
\[
\alpha^* = \textrm{arg\,max}_{\alpha} \,
\log \textrm{multinomial}(y \mid x \cdot \textrm{softmax}(\alpha))
- \frac{1}{2 \cdot \lambda^2} \cdot \alpha^{\top}\!\! \cdot \alpha.
\]

\subsection{Bayesian posterior mean estimate}

The Bayesian estimate of the relative abundances that minimizes
expected squared error is the posterior mean,
\[
\hat{\theta}
= \mathbb{E}\!\left[\textrm{softmax}(\alpha) \mid x, y\right]
= \int \textrm{softmax}(\alpha) \, p(\alpha \mid x, y) \, \textrm{d}\alpha,
\]
which may be approximated by averaging $\textrm{softmax}(\alpha^{(m)})$
over posterior draws $\alpha^{(m)} \sim p(\alpha \mid x, y)$.
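
A minimal sketch of this computation using a random-walk Metropolis
sampler over $\alpha$ and the \texttt{log\_posterior} sketch above;
this is purely illustrative (no warmup, thinning, or step-size
tuning), and in practice a gradient-based sampler such as Stan's
would be used.
\begin{verbatim}
# posterior mean of theta = softmax(alpha) by random-walk Metropolis
posterior_mean_theta <- function(x, y, lambda, iters = 5000, step = 0.05) {
  G <- ncol(x)
  alpha <- rep(0, G)
  lp <- log_posterior(alpha, x, y, lambda)
  theta_sum <- rep(0, G)
  for (m in 1:iters) {
    proposal <- alpha + rnorm(G, 0, step)
    lp_prop <- log_posterior(proposal, x, y, lambda)
    if (log(runif(1)) < lp_prop - lp) {  # Metropolis accept step
      alpha <- proposal
      lp <- lp_prop
    }
    theta_sum <- theta_sum + softmax(alpha)
  }
  theta_sum / iters  # Monte Carlo estimate of E[softmax(alpha) | x, y]
}
\end{verbatim}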

\end{document}