very rough kmer code and doc
Bob Carpenter committed Nov 1, 2020
1 parent 39accd2 commit 6d3f6a2
Showing 16 changed files with 21,182 additions and 174 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -34,3 +34,6 @@ vignettes/*.pdf

# Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
rsconnect/

# unpacked
kmers/data/unpacked/
111 changes: 76 additions & 35 deletions bayes-versus/bayes-versus.Rmd
@@ -2,7 +2,7 @@
title: "For Probabilistic Prediction, Full Bayes is Better than Point
Estimators"
author: "Bob Carpenter"
date: "April 2019"
date: "August 2020"
output:
tufte::tufte_html:
toc: true
@@ -36,7 +36,7 @@ library(MASS)
library(rstan)
rstan_options(auto_write = FALSE)
options(mc.cores = 4)
library(tufte)
# library(tufte)
# UTILITY FUNCTIONS
@@ -627,9 +627,9 @@ err_df <-
rbind(data.frame(error = err, loss = err^2 * 0.5 / 0.5^2,
type = "sq error"),
data.frame(error = err, loss = abs(err),
type = "abs error"),
data.frame(error = err, loss = -log(1 - err) * (0.5 / -log(0.5)),
type = "log loss"))
type = "abs error"),
data.frame(error = err, loss = -log(1 - err) * (0.5 / -log(0.5)),
type = "log loss"))
err_plot <-
ggplot(err_df, aes(x = error, y = loss, group = type)) +
geom_line(aes(color = type)) +
@@ -669,7 +669,7 @@ coefficient, $\beta_1$.
model <- stan_model("logistic-regression.stan")
fit <- sampling(model, data = data, iter = 4000,
control = list(adapt_delta = 0.99, stepsize = 0.01),
refresh = 0, open_progress = FALSE)
refresh = 0, open_progress = FALSE)
```
```{r}
print(fit, prob = c(0.01, 0.99),
@@ -1034,6 +1034,14 @@ almost all such draws are worse than the posterior mode and the
posterior mean.
## Acknowledgements {-}
Thanks to Mark Johnson for inspiring me to write up an example
where full Bayesian inference is better than point estimation. Thanks
also to all the commenters in our Bayesian computation reading group.
Thanks in particular to Yuling Yao for all of the clarifying
questions; I tried to answer them all here.
# Appendix A: Some alternative evals
@@ -1072,26 +1080,28 @@ losses <- function(y, y_hat) {
list(mae = mae, rmse = rmse, log_loss_rate = log_loss_rate)
}
losses_table <- function(y, y_hat, N, K, rho, estimator) {
losses_table <- function(y, y_hat, N, K, rho, estimator, rep_in) {
losses = losses(y, y_hat)
data.frame(estimator = estimator, N = N, K = K, rho = rho,
log_loss_rate = losses$log_loss_rate,
rmse = losses$rmse,
mae = losses$mae)
rmse = losses$rmse,
mae = losses$mae,
rep = rep_in)
}
model <- stan_model("logistic-regression.stan")
Ns <- c(8, 32, 128, 512)
Ks <- c(2, 8, 32, 128)
rhos <- c(0, 0.9)
Ns <- c(8, 16, 32, 64, 128, 256, 512) # , 32, 64, 128, 256, 512, 1024)
Ks <- c(2, 8, 32, 64, 256)
rhos <- c(0) # , 0.9)
results <- data.frame()
for (N in Ns) {
for (K in Ks) {
if (K > 2 * N) next
for (rho in rhos) {
for (rep in 1:10) {
cat("N = ", N, "; K = ", K, "; rho = ", rho, "; rep = ", rep, "\n")
Sigma <- cov_matrix(K, rho)
alpha <- rnorm(1, 0, 0.5)
@@ -1108,16 +1118,16 @@ for (N in Ns) {
N_test = N_test, x_test = x_test, y_test = y_test)
# Bayes
fit <- sampling(model, data = data, refresh = 0, init = 0,
open_progress = FALSE)
fit <- sampling(model, data = data, refresh = 0, init = 0.5,
chains = 4, iter = 4000, control = list(stepsize = 0.01, adapt_delta = 0.95),
open_progress = FALSE)
y_test_hat_bayes <- rep(0, N_test)
for (n in 1:N_test) {
y_test_hat_bayes[n] <- mean(extract(fit)$E_y_test[ , n])
}
results <-
rbind(results,
losses_table(y_test, y_test_hat_bayes, N, K, rho, "Bayes"))
bayes_table <- losses_table(y_test, y_test_hat_bayes, N, K, rho, "Bayes", rep)
# MAP plug-in
fit_map <- optimizing(model, data = data, refresh = 0, verbose = FALSE)
@@ -1127,36 +1137,67 @@ for (N in Ns) {
beta_star[k] <- fit_map$par[[paste('beta[', k, ']', sep = "")]]
y_test_star <- inv_logit(alpha_star + (x_test %*% beta_star))
results <-
rbind(results,
losses_table(y_test, y_test_star, N, K, rho, "MAP"))
map_table <- losses_table(y_test, y_test_star, N, K, rho, "MAP", rep)
# VB plug-in
alpha_hat <- mean(extract(fit)$alpha)
beta_hat <- rep(0, K)
for (k in 1:K)
beta_hat[k] <- mean(extract(fit)$beta[ , k])
y_test_hat_vb <- inv_logit(alpha_hat + (x_test %*% beta_hat))
results <-
rbind(results,
losses_table(y_test, y_test_hat_vb, N, K, rho, "VB"))
vb_table <- losses_table(y_test, y_test_hat_vb, N, K, rho, "VB", rep)
# reduce to difference in error from Bayes to standardize
map_table$log_loss_rate <- map_table$log_loss_rate - bayes_table$log_loss_rate
map_table$rmse <- map_table$rmse - bayes_table$rmse
map_table$mae <- map_table$mae - bayes_table$mae
vb_table$log_loss_rate <- vb_table$log_loss_rate - bayes_table$log_loss_rate
vb_table$rmse <- vb_table$rmse - bayes_table$rmse
vb_table$mae <- vb_table$mae - bayes_table$mae
bayes_table$log_loss_rate <- bayes_table$log_loss_rate - bayes_table$log_loss_rate
bayes_table$rmse <- bayes_table$rmse - bayes_table$rmse
bayes_table$mae <- bayes_table$mae - bayes_table$mae
results <- rbind(results, map_table)
results <- rbind(results, vb_table)
results <- rbind(results, bayes_table)
}
}
}
}
# no idea why this doesn't work any more. :-(
knitr::kable(results,
caption = "Log loss rate, root mean square error (rmse), and mean absolute error (mae) for full Bayes and plug-in estimates with posterior mode (MAP) and mean (VB) given $N$ observations, $K$ predictors, and a base $\\rho$ correlation between adjacent predictors.")
```
```{r}
library(kableExtra)  # provides kable_styling() and the %>% pipe
knitr::kable(results, caption = "Foo") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
print(results, digits = 2)
mae_results <- aggregate(mae ~ estimator + N + K + rho, results, mean)
log_loss_rate_results <- aggregate(log_loss_rate ~ estimator + N + K + rho, results, mean)
rmse_results <- aggregate(rmse ~ estimator + N + K + rho, results, mean)
mae_results <- cbind(mae_results,
loss_type = rep("mae", dim(mae_results)[1]))
log_loss_rate_results <- cbind(log_loss_rate_results,
loss_type = rep("log loss", dim(log_loss_rate_results)[1]))
rmse_results <- cbind(rmse_results,
loss_type = rep("rmse", dim(mae_results)[1]))
names(mae_results)[5] <- "loss"
names(log_loss_rate_results)[5] <- "loss"
names(rmse_results)[5] <- "loss"
plot_df <- rbind(log_loss_rate_results, rmse_results) # , mae_results
sub_df <- subset(plot_df, rho == 0)
sub_df$rho <- NULL
results_plot <-
ggplot(sub_df,
aes(x = N, y = loss, color = estimator, group = estimator)) +
geom_line(aes(group = estimator, color = estimator)) +
scale_x_log10(breaks = Ns) +
facet_grid(loss_type ~ K, scales = "free_y") +
ylab("loss - Bayes loss") +
ggtheme_tufte()
results_plot
```
Do I need something afterward here?
# Appendix B: R Session Information
```{r}
213 changes: 77 additions & 136 deletions bayes-versus/bayes-versus.html

Large diffs are not rendered by default.

Binary file added kmers/data/raw/refseq-select-2020-10-22.fasta.gz
Binary file not shown.
172 changes: 172 additions & 0 deletions kmers/doc/model.tex
@@ -0,0 +1,172 @@
\documentclass{article}

\usepackage{hyperref}
\usepackage{amssymb}
\usepackage{amsmath}

\title{Fast and accurate alignment-free RNA expression estimation with $K$-mers}
\author{Bob Carpenter}
\date{\today}

\begin{document}

\maketitle

\abstract{\noindent We show that reducing both the transcriptome and the
short reads produced by RNA-seq to $K$-mers yields estimates of
relative transcript abundance that are faster to compute and more
accurate than those of traditional methods based on aligning short
reads to the transcriptome.}

\section{Background}

\subsection{Bases}

Let
$\mathbb{B} = \{ \texttt{A}, \texttt{C}, \texttt{G}, \texttt{T} \}$ be
a set representing the four DNA bases, adenine (\texttt{A}), cytosine
(\texttt{C}), guanine (\texttt{G}), and thymine (\texttt{T}).

\subsection{$N$-mers}

An $N$-mer is an $N$-tuple of bases,
$x = (x_1, \ldots, x_N)$. The set of $N$-mers is
$\mathbb{B}^N = \{ (x_1, \ldots, x_N) : x_1, \ldots, x_N \in
\mathbb{B}\}$. The set of base sequences of all lengths is
$\mathbb{B}^* = \bigcup_{N=0}^{\infty} \mathbb{B}^N.$

\subsection{Transcriptome}

Transcripts are the strands of messenger RNA (mRNA) transcribed from
DNA. A transcriptome with $G$ transcript types is an indexed
collection of base sequences, $T = T_1, \ldots, T_G \in
\mathbb{B}^*$.


\section{RNA-seq data}

\subsection{Short reads}

RNA-seq data, after processing, consists of a sequence of short reads
$R = R_1, \ldots, R_M$, where each $R_m \in \mathbb{B}^*$.

\subsection{$K$-mer reduced RNA-seq data}

An $N$-mer $x = (x_1, \ldots, x_N)$ can be reduced to the sequence of
its $N - K + 1$ overlapping $K$-mers, $x_{1:K}, x_{2:K+1}, \ldots,
x_{N-K+1:N}$. This sequence may in turn be reduced to a function
$\textrm{count}_K(x):\mathbb{B}^K \rightarrow \mathbb{N}$, where
$\textrm{count}_K(x)(y)$ is the number of times the $K$-mer $y$
appears in the $N$-mer $x$. Henceforth, we will treat
$\textrm{count}_K(x)$ as a (sparse) $4^K$-vector by ordering the
$K$-mers lexicographically.

A set of $N$-mers representing RNA-seq data can then be reduced to a
$K$-mer count by summing the functions representing its elements,
$\textrm{count}_K(X) = \sum_{x \in X} \textrm{count}_K(x)$.
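
As a concrete illustration, here is a minimal R sketch of this
reduction; the helper names \texttt{all\_kmers}, \texttt{kmer\_counts},
and \texttt{kmer\_counts\_total} are hypothetical and not part of the
accompanying code. It enumerates the $4^K$ $K$-mers lexicographically,
counts them in a single sequence, and sums the counts over a set of
reads.
\begin{verbatim}
bases <- c("A", "C", "G", "T")

# all K-mers over the DNA alphabet, in lexicographic order
all_kmers <- function(K) {
  grid <- expand.grid(rep(list(bases), K), stringsAsFactors = FALSE)
  sort(apply(grid, 1, paste0, collapse = ""))
}

# count_K(x): K-mer counts for a single base sequence x (a string)
kmer_counts <- function(x, K) {
  kmers <- all_kmers(K)
  counts <- setNames(rep(0L, length(kmers)), kmers)
  N <- nchar(x)
  if (N >= K) {
    windows <- substring(x, 1:(N - K + 1), K:N)  # overlapping K-mers
    tab <- table(windows)
    counts[names(tab)] <- as.integer(tab)
  }
  counts
}

# count_K(X): sum of K-mer counts over a set of reads X
kmer_counts_total <- function(X, K) {
  Reduce(`+`, lapply(X, kmer_counts, K = K))
}

kmer_counts_total(c("ACGT", "ACGA"), K = 2)  # small worked example
\end{verbatim}
For realistic values of $K$, the count vector would be stored
sparsely rather than as a dense $4^K$-vector.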

\section{Model}

In a transcriptome consisting of $G$ base sequences, the only
parameter is a vector $\alpha \in \mathbb{R}^G$ of intercepts in a
multi-logit regression. The parameter is most easily understood after
transforming it to a simplex, $\theta = \textrm{softmax}(\alpha)$, in
which $\theta_g$ represents the relative abundance of sequence $g$ in
the transcriptome, where
\[
\textrm{softmax}(\alpha)
= \frac{\exp(\alpha)}
{\textrm{sum}(\exp(\alpha))}
\]
and $\exp(\alpha)$ is defined elementwise. By construction,
$\textrm{softmax}(\alpha)_i > 0$ and
$\textrm{sum}(\textrm{softmax}(\alpha)) = 1$.
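
For concreteness, a minimal R sketch of this transform, assuming
nothing beyond base R:
\begin{verbatim}
# numerically stable softmax: subtracting the max does not change
# the result but avoids overflow in exp()
softmax <- function(alpha) {
  e <- exp(alpha - max(alpha))
  e / sum(e)
}

theta <- softmax(c(-1, 0, 2.5))
sum(theta)  # 1, so theta is a simplex
\end{verbatim}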

\subsection{Likelihood}

The observation $y$ is a sparse $4^K$-vector of $K$-mer counts. The
transcriptome $x$ is represented as a sparse $4^K \times G$-matrix of
$K$-mer counts for each gene. The matrix $x$ is standardized so that
the columns, representing genes, are simplexes giving the relative
frequency of $K$-mers in that gene's sequence of bases.

The likelihood is
\[
  y \sim \textrm{multinomial}(x \cdot \textrm{softmax}(\alpha)).
\]
Because the columns of $x$ are taken to be simplexes and
$\textrm{softmax}(\alpha)$ is a simplex, then
$x \cdot \textrm{softmax}(\alpha)$ is also a simplex, and thus
appropriate as a parameter for a multinomial distribution.
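
As an illustrative sketch (not the accompanying implementation), the
likelihood can be evaluated in R with \texttt{dmultinom}, reusing the
\texttt{softmax} sketched above and assuming the columns of \texttt{x}
have already been standardized to simplexes:
\begin{verbatim}
# log multinomial likelihood of K-mer counts y (length 4^K) given
# the transcriptome matrix x (4^K rows, G columns, columns simplexes)
# and unconstrained abundance parameters alpha (length G)
log_likelihood <- function(alpha, x, y) {
  p <- as.vector(x %*% softmax(alpha))  # predicted K-mer probabilities
  dmultinom(y, prob = p, log = TRUE)
}
\end{verbatim}
In practice, $x$ would be held as a sparse matrix, for example using
the \texttt{Matrix} package.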


\subsection{Prior}

The prior is a normal centered at the origin,
\[
\alpha_g \sim \textrm{normal}(0, \lambda),
\]
for some scale $\lambda > 0$.


\subsection{Posterior}

The log posterior is determined by Bayes's rule, up to an additive
constant that does not depend on the parameters $\alpha$, as
\[
\log p(\alpha \mid x, y) = \log p(y \mid x, \alpha) + \log p(\alpha) +
\textrm{const.}
\]
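
Correspondingly, a sketch of the unnormalized log posterior in R,
combining the \texttt{log\_likelihood} sketch above with the normal
prior of scale $\lambda$:
\begin{verbatim}
# unnormalized log posterior: log likelihood plus normal(0, lambda) prior
log_posterior <- function(alpha, x, y, lambda) {
  log_likelihood(alpha, x, y) +
    sum(dnorm(alpha, mean = 0, sd = lambda, log = TRUE))
}
\end{verbatim}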

\subsubsection{Posterior gradient}

Efficient maximum penalized likelihood estimation and full Bayesian inference
require evaluating gradients of the log posterior with respect to the parameters
$\alpha$. The gradient is\footnote{The derivative is easy to work out
by passing \url{matrixcalculus.org} the query
\begin{center}\texttt{y' * log
(x * exp(a) / sum(exp(a))) - a' * a / (2 * lambda * lambda)}\end{center}
and then simplifying using $\textrm{softmax}$.}
%
\[
\begin{array}{l}
\nabla\!_{\alpha} \, \log p(\alpha \mid x, y)
\\[4pt]
\qquad = \
t \odot \textrm{softmax}(\alpha)
- \textrm{softmax}(\alpha)^{\top}\! \cdot t \cdot \textrm{softmax}(\alpha)
- \frac{\displaystyle \alpha}{\displaystyle \lambda^2},
\end{array}
\]
where
\[
t = x^{\top}\! \cdot (y \oslash (x \cdot \textrm{softmax}(\alpha)))
\]
and $\odot$ and $\oslash$ are elementwise multiplication and
division, respectively.
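
The gradient translates directly into R. The following sketch reuses
\texttt{softmax} from above and treats $\lambda$ as the prior scale,
matching the penalized-likelihood objective below; it is illustrative
rather than the accompanying implementation.
\begin{verbatim}
# gradient of the log posterior with respect to alpha;
# assumes every K-mer with a nonzero count has nonzero probability
grad_log_posterior <- function(alpha, x, y, lambda) {
  s <- softmax(alpha)
  p <- as.vector(x %*% s)              # predicted K-mer probabilities
  t <- as.vector(crossprod(x, y / p))  # t = x' (y ./ p)
  t * s - sum(s * t) * s - alpha / lambda^2
}
\end{verbatim}
A finite-difference check against \texttt{log\_posterior} is a cheap
way to validate the algebra.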

\section{Estimating expression}

\subsection{Maximum a posteriori estimate}

The maximum a posteriori (MAP) estimate for $\alpha$ is given by
\[
\alpha^* = \textrm{arg\,max}_{\alpha} \,
\log \textrm{multinomial}(y \mid x \cdot \textrm{softmax}(\alpha))
+ \log \textrm{normal}(\alpha \mid 0, \lambda \cdot \textrm{I}).
\]
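
One way to compute this estimate, sketched with base R's
\texttt{optim} and the \texttt{log\_posterior} and
\texttt{grad\_log\_posterior} sketches above (the actual
implementation may differ):
\begin{verbatim}
# MAP estimate of alpha by quasi-Newton optimization (L-BFGS-B),
# returned on the simplex scale as estimated relative abundances
map_estimate <- function(x, y, lambda) {
  G <- ncol(x)
  fit <- optim(par = rep(0, G),
               fn = function(a) -log_posterior(a, x, y, lambda),
               gr = function(a) -grad_log_posterior(a, x, y, lambda),
               method = "L-BFGS-B")
  softmax(fit$par)
}
\end{verbatim}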

\subsection{Maximum penalized likelihood estimate}

With a normal prior, the maximum a posteriori (MAP) estimate will be
equivalent to the $\textrm{L}_2$-penalized maximum likelihood estimate
(MLE), which is defined as
\[
\alpha^* = \textrm{arg\,max}_{\alpha} \,
\log \textrm{multinomial}(y \mid x \cdot \textrm{softmax}(\alpha))
- \frac{1}{2 \cdot \lambda^2} \cdot \alpha^{\top}\!\! \cdot \alpha.
\]

\subsection{Bayesian posterior mean estimate}

The Bayesian estimate of the relative abundances that minimizes
expected squared error is the posterior mean,
\[
\hat{\theta}
= \mathbb{E}\!\left[\textrm{softmax}(\alpha) \mid x, y\right]
= \int \textrm{softmax}(\alpha) \, p(\alpha \mid x, y) \, \textrm{d}\alpha,
\]
which may be approximated by averaging $\textrm{softmax}(\alpha^{(m)})$
over posterior draws $\alpha^{(m)} \sim p(\alpha \mid x, y)$.
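
A minimal sketch of this computation using a random-walk Metropolis
sampler over $\alpha$ and the \texttt{log\_posterior} sketch above;
this is purely illustrative (no warmup, thinning, or step-size
tuning), and in practice a gradient-based sampler such as Stan's
would be used.
\begin{verbatim}
# posterior mean of theta = softmax(alpha) by random-walk Metropolis
posterior_mean_theta <- function(x, y, lambda, iters = 5000, step = 0.05) {
  G <- ncol(x)
  alpha <- rep(0, G)
  lp <- log_posterior(alpha, x, y, lambda)
  theta_sum <- rep(0, G)
  for (m in 1:iters) {
    proposal <- alpha + rnorm(G, 0, step)
    lp_prop <- log_posterior(proposal, x, y, lambda)
    if (log(runif(1)) < lp_prop - lp) {  # Metropolis accept step
      alpha <- proposal
      lp <- lp_prop
    }
    theta_sum <- theta_sum + softmax(alpha)
  }
  theta_sum / iters  # Monte Carlo estimate of E[softmax(alpha) | x, y]
}
\end{verbatim}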

\end{document}