results/19-01-10-EDA-RNA-time-course/01-EDA-time-course.Rmd

---
title: "EDA RNA-seq time courses"
author: "Santiago Medina"
date: "1/10/2019"
output: 
  html_document:
    keep_md: true
---

```{r setup, include=FALSE}
# Global chunk options: do not echo source code, messages or warnings in the
# rendered report, collapse chunk output, and write every figure to ./figures/
# as both PNG and PDF.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
knitr::opts_chunk$set(
  echo = FALSE, message = FALSE, warning = FALSE, collapse = TRUE,
  fig.path = "./figures/", dev = c("png", "pdf")
)

library(tidyverse)
library(ggrepel)
library(ggridges)
library(gridExtra)
library(ggthemes)
library(scales)

# Default ggplot theme for every figure in this notebook.
theme_set(theme_tufte(base_family = "Helvetica"))
```

In this notebook I will visualize the time-course RNA-seq data.

```{r load_data}
# Directory with the expression tables, relative to this notebook.
dpath <- "../../data/19-01-09-RNAseqProfilesFish/rna-seq-profiles/"

# poly(A) and ribo-zero time courses (the `_wt` tables).
polyA_wt <- paste0(dpath, "polyA-profile.csv") %>% read_csv()
ribo_wt <- paste0(dpath, "ribo-zero-profile.csv") %>% read_csv()

# alpha-amanitin time course.
# NOTE(review): "prolife" looks like a typo of "profile"; kept as is because
# it has to match the file name on disk -- confirm before renaming.
polyA_alpha <- paste0(dpath, "alpha-amanitin-prolife.csv") %>% read_csv()

# ERCC count table.
# NOTE(review): the variable is spelled `errc` while the file says `ercc`.
errc <- paste0(dpath, "ercc_count.csv") %>% read_csv()
```

## PCA analysis

```{r pca, fig.height=2.5, fig.width=9}

# Run a PCA across the samples of one expression table and plot PC1 vs PC2.
#
# Args:
#   data:  table with `Gene_ID` and `Name` columns plus one numeric expression
#          column per sample; the time point is parsed from each column name.
#   title: optional plot title.
#
# Returns:
#   A ggplot object with one point per sample, labelled and coloured by the
#   parsed time. Axis labels carry the % of variance explained by each PC.
visualize_pca_decomposition <- function(data, title = NULL) {
  expr <- data %>%
    select(-Gene_ID, -Name) %>%
    as.matrix()

  # lowly expressed genes are not useful for the decomposition
  expr <- expr[rowMeans(expr) > 10, , drop = FALSE]

  # log transformation works well for expression values
  log_expr <- log(expr + 1)

  # prcomp(scale. = TRUE) stops on genes that are constant across samples
  # (they cannot be rescaled to unit variance), so drop them beforehand
  is_variable <- apply(log_expr, 1, function(v) max(v) > min(v))
  log_expr <- log_expr[is_variable, , drop = FALSE]

  if (nrow(log_expr) < 2 || ncol(log_expr) < 2) {
    stop(
      "PCA needs at least two samples and two variable genes with ",
      "mean expression > 10",
      call. = FALSE
    )
  }

  # samples become the observations (rows), genes the scaled variables
  pca <- prcomp(t(log_expr), scale. = TRUE)

  # get % of explained var for the visualization
  pca_var <- pca$sdev^2
  pca_var_per <- round(pca_var / sum(pca_var) * 100, 1)

  ## make plot
  as_tibble(pca$x, rownames = "sample_id") %>%
    mutate(
      # Same pattern tidy_data() uses: one digit, an optional literal dot and
      # an optional second digit. The dot is escaped so that it cannot swallow
      # an arbitrary character of the sample name.
      time = as.numeric(str_extract(sample_id, "\\d{1}\\.?\\d?"))
    ) %>%
    ggplot(aes(x = PC1, y = PC2, label = time, color = time)) +
    geom_text_repel() +
    geom_point(shape = 16, alpha = .8) +
    labs(
      x = str_c("PC1, ", pca_var_per[1], "%"),
      y = str_c("PC2, ", pca_var_per[2], "%"),
      title = title
    ) +
    geom_rug() +
    theme(legend.position = "none", axis.ticks = element_blank())
}

# One PCA panel per data set, arranged side by side in a single row.
p1 <- ribo_wt %>% visualize_pca_decomposition(title = "ribo wt")
p2 <- polyA_wt %>% visualize_pca_decomposition(title = "polyA wt")
p3 <- polyA_alpha %>% visualize_pca_decomposition(title = "polyA alpha")

grid.arrange(p1, p2, p3, nrow = 1)
```

The PCA analysis looks fine!

## Bivariate distributions

```{r  bivariate_polyA, fig.height=2.5, fig.width=9}
# Reshape one wide expression table into long format.
#
# Args:
#   data:   table with `Gene_ID`, `Name` and one column per sample.
#   var_id: label stored in the `condition` column of every output row.
#
# Returns:
#   A table with the columns Gene_ID, sample_id, TPM, time and condition,
#   where `time` is the number parsed from the sample column name.
tidy_data <- function(data, var_id) {
  # every sample column becomes a (sample_id, TPM) pair per gene
  long <- gather(select(data, -Name), key = sample_id, value = TPM, -Gene_ID)

  # one digit, optional decimal point, optional second digit (e.g. 2, 2.5, 10)
  time_label <- str_extract(long[["sample_id"]], "\\d{1}\\.?\\d?")

  mutate(long, time = as.numeric(time_label), condition = var_id)
}

# Long-format table stacking the three time courses, tagged by condition.
dtidy <- bind_rows(
  polyA_wt %>% tidy_data("polyA_wt"),
  polyA_alpha %>% tidy_data("polyA_aamanitin"),
  ribo_wt %>% tidy_data("ribo_wt")
)

# polyA_wt paired with ribo_wt per gene and whole-numbered time point; the
# ribo_wt values end up in `expression_wt`.
d_ribo <- dtidy %>%
  filter(condition != "polyA_aamanitin", time %% 1 == 0) %>%
  select(-sample_id) %>%
  spread(key = condition, value = TPM) %>%
  drop_na() %>%
  rename(expression_wt = ribo_wt) %>%
  mutate(sample = "WT ribo")

# polyA_wt paired with the a-amanitin series in the same way; here the
# a-amanitin values end up in `expression_wt`.
d_poly <- dtidy %>%
  filter(condition != "ribo_wt", time %% 1 == 0) %>%
  select(-sample_id) %>%
  spread(key = condition, value = TPM) %>%
  drop_na() %>%
  rename(expression_wt = polyA_aamanitin) %>%
  mutate(sample = "a-amanitin")

tc_d <- bind_rows(d_poly, d_ribo)

# 2D density of a-amanitin vs polyA_wt expression, one panel per time point
# between 2 and 7. Zeros are removed because both axes are log-scaled
# (log2 spacing, breaks and labels at powers of ten).
tc_d %>%
  filter(
    sample != "WT ribo",
    expression_wt > 0, polyA_wt > 0,
    time >= 2, time <= 7
  ) %>%
  mutate(time = paste0(time, " hrs")) %>%
  ggplot(aes(x = polyA_wt, y = expression_wt)) +
  stat_density_2d(aes(fill = ..level..), geom = "polygon") +
  scale_fill_gradient(low = "grey90", high = "black") +
  scale_x_continuous(
    trans = log2_trans(),
    breaks = trans_breaks("log10", function(x) 10^x),
    labels = trans_format("log10", math_format(10^.x))
  ) +
  scale_y_continuous(
    trans = log2_trans(),
    breaks = trans_breaks("log10", function(x) 10^x),
    labels = trans_format("log10", math_format(10^.x))
  ) +
  facet_grid(. ~ time) +
  labs(
    x = "wild type\nRNA level (TPM)",
    y = "a-amanitin\nRNA level (TPM)"
  )

  
```

## poly/ribo over time

```{r polyOverRiboRatio, fig.width=2.5, fig.height=5}
# Histogram of the per-gene log2(polyA_wt / ribo_wt) ratio, one panel per
# time point divisible by 2.
dtidy %>%
  filter(condition != "polyA_aamanitin", time %% 2 == 0) %>%
  # Pair the two libraries by gene and time only, as d_ribo / d_poly do.
  # Keeping sample_id would make spread() leave each library on its own row
  # whenever the two tables name their sample columns differently.
  select(-sample_id) %>%
  spread(key = condition, value = TPM) %>%
  # log2() is elementwise, so no grouping by gene is needed
  mutate(log2fc = log2(polyA_wt / ribo_wt)) %>%
  # drops NA / NaN (missing values, 0 / 0) and +-Inf (a zero in one library)
  filter(is.finite(log2fc)) %>%
  # ordered factor keeps the panels chronological; plain character labels
  # would sort "10 hrs" before "2 hrs"
  arrange(time) %>%
  mutate(
    time = str_c(time, " hrs"),
    time = factor(time, levels = unique(time))
  ) %>%
  ggplot(aes(x = log2fc)) +
  geom_histogram(bins = 50, color = "white", fill = "grey50") +
  # white lines drawn over the bars act as a light grid at the y breaks
  geom_hline(yintercept = c(0, 1000, 2000, 3000), color = "white", size = 1 / 3) +
  scale_y_continuous(breaks = c(0, 1000, 2000, 3000)) +
  facet_grid(time ~ .) +
  coord_cartesian(xlim = c(-5, 5)) +
  theme(axis.ticks = element_blank()) +
  labs(
    x = "log2 fold change",
    title = "poly(A) / ribo-zero"
  )
```