InitialAnalysis.Rmd

---
title: "Mnemiopsis Extended Analysis"
author: "Kirsten Gotting"
output:
  html_document:
      code_download: true      
      toc: true
      toc_depth: 2
      toc_float:
        collapsed: false
---

```{r setupInitial, message=FALSE, results ='hide', warning=FALSE, error = FALSE, cache = FALSE, echo = FALSE}

# Report-wide knitr defaults: hide code, messages, warnings and printed
# results, keep every figure, write figures under ./analysis_figures/, and
# cache chunk results.
library(knitr)
opts_chunk$set(echo = FALSE, message=FALSE, results = 'hide', fig.keep="all", warning=FALSE, error=FALSE, fig.path="./analysis_figures/", cache = TRUE)
opts_knit$set(root.dir = "./")


# 'kiRsten' is not defined in this file; helpers used below such as
# make_pheatmap_df() and se() presumably come from it -- TODO confirm.
library('kiRsten')
library('tidyverse')
library('GO.db')
library('wesanderson')
library('pheatmap')
library('htmlwidgets')
library('DT')

# Colour vector built from two wesanderson palettes: Rushmore without its
# second colour, followed by FantasticFox.
my_pallete <- c(wes_palettes$Rushmore[-2], wes_palettes$FantasticFox)

# NOTE(review): hard-coded, user-specific working directory; the relative
# paths used below ('output/', 'db/', 'analysis_figures/') depend on it.
setwd("~/projects/Mnemiopsis/website")

```

#Overview

For these analyses I want to see how much overlap there is between the two sets of significant genes between the 2hpf reference timepoint and the 8hpf reference timepoint. 

#Data Used

1. The significant genes list from the 2hpf reference quality control differential expression experiment.
2. The significant genes list from the 8hpf reference quality control differential expression experiment.


```{r fixAnnotInitial, eval = FALSE, echo = FALSE}

# Build the per-gene annotation table (SwissProt hit, Pfam domains, GO ids)
# and write it to output/Mle_annot20160901.txt. The chunk header sets
# eval = FALSE, so this only runs when executed by hand.

# pfam2go: keep the text before " >" in V1, strip the 'Pfam:' prefix, then
# split on the space into the Pfam accession and its description.
pfam2go    <- data.table::fread("~/projects/Mnemiopsis/annotation/pfam2go", header = FALSE, data.table = FALSE)
pfam2go$V1 <- gsub('Pfam:', '', gsub(" >.*", "", pfam2go$V1))

pfam2go <- pfam2go %>% separate(V1, into = c('Pfam', 'pfamDesc'), sep = " ") %>% dplyr::rename('go_id' = V2)

# Gene -> Pfam hits; everything from the first '.' in the accession is dropped
# (presumably the accession version -- confirm) so it matches pfam2go.
ml2pfam  <- data.table::fread('~/projects/Mnemiopsis/annotation/mleidyi2pfam.txt', header = TRUE, data.table = FALSE)
ml2pfam$Pfam <- gsub("\\..*", '', ml2pfam$Pfam)

# Gene -> SwissProt hit table; its `id` column becomes `Gene`.
ml2sp  <- data.table::fread('~/projects/Mnemiopsis/annotation/ml_v_sp_20160526.annot', header = TRUE, data.table = FALSE)
ml2sp  <- ml2sp %>% dplyr::rename('Gene' = id)

# SwissProt id -> GO id pairs, restricted to the SwissProt ids hit by a gene.
uniprot2go <- data.table::fread("~/projects/Mnemiopsis/annotation/uniprot2go.txt", header = FALSE, data.table = FALSE, col.names = c('sp_id', 'go_id'))
uni2go     <- uniprot2go %>% filter(sp_id %in% ml2sp$sp_id)


# One row per gene with all Pfam accessions and Pfam-derived GO ids collapsed
# into ';'-separated strings.
ml_annot1 <- inner_join(ml2pfam, pfam2go) %>%
  group_by(Gene) %>%
  mutate(Pfam = paste(Pfam, collapse = ";")) %>%
  mutate(go_id = paste(go_id, collapse = ';')) %>%
  ungroup() %>%
  dplyr::select(-pfamDesc) %>%
  distinct()


ml_annot2 <- left_join(ml2sp, uni2go)


# Merge the SwissProt-derived (go_id.x) and Pfam-derived (go_id.y) GO ids into
# go_all. unite() pastes missing values as the literal text "NA"; the previous
# sub(";NA", "") only removed a trailing one, so "NA;GO:..." kept a bogus "NA"
# id. Remove every stand-alone NA token, tidy the separator, and store genes
# with no GO id at all as a real NA (written out as "NA", as before).
ml1 <- left_join(ml_annot2, ml_annot1, by = 'Gene') %>%
  unite(go_all, go_id.x, go_id.y, sep = ";") %>%
  mutate(go_all = gsub("(^|;)NA(?=;|$)", "", go_all, perl = TRUE)) %>%
  mutate(go_all = sub('^;', '', go_all)) %>%
  mutate(go_all = ifelse(go_all == "", NA_character_, go_all))


write.table(ml1, 'output/Mle_annot20160901.txt', quote = FALSE, sep = "\t", row.names = FALSE)

```


```{r readdata2Initial}

# Gene annotation written by the (manually run) annotation chunk.
ml_annot <- data.table::fread('output/Mle_annot20160901.txt', data.table = FALSE)

# Long form of the annotation: one row per gene / GO id pair.
ml <- ml_annot %>%
  unnest(go_id = strsplit(go_all, split = ";")) %>%
  dplyr::select(-go_all) %>%
  distinct()


rpkms <- data.table::fread('output/rpkm.txt', data.table = FALSE)

# Per gene and timepoint: mean RPKM over the samples of that timepoint, then a
# per-gene z-score of those means across timepoints. Sample column names are
# split on '_' into Time, Year and Replicate. Genes whose z-score is NA are
# dropped.
rpkm <- rpkms %>%
  gather(Sample, RPKM, -ID, -transcript_length)  %>%
  dplyr::rename('Gene' = ID) %>%
  separate(Sample, c('Time', 'Year', 'Replicate'), sep = '_') %>%
  group_by(Time, Gene) %>%
  mutate(meanRPKM = mean(RPKM)) %>%
  ungroup() %>%
  dplyr::select(-RPKM, -Year, -Replicate) %>% distinct() %>%
  group_by(Gene) %>%
  mutate(zscore = as.vector(scale(meanRPKM))) %>%
  ungroup() %>%
  filter(!is.na(zscore))


# Return the genes that, within at least one contrast, have both
# padj <= padj_cutoff and log2FoldChange >= lfc_cutoff.
#
# expr_tbl:    tidy differential-expression table with columns Gene,
#              contrastID, dea_ID ('padj' / 'log2FoldChange') and dea_Value.
# padj_cutoff: upper bound on the adjusted p-value.
# lfc_cutoff:  lower bound on the log2 fold change (only the positive
#              direction is tested, as in the original inline filter).
# Returns a vector of unique Gene ids.
find_significant_genes <- function(expr_tbl, padj_cutoff = 1e-5, lfc_cutoff = 0.5) {
  expr_tbl %>%
    mutate(id1 = dea_ID == 'padj' & dea_Value <= padj_cutoff) %>%
    mutate(id2 = dea_ID == 'log2FoldChange' & dea_Value >= lfc_cutoff) %>%
    group_by(contrastID, Gene) %>%
    filter(any(id1), any(id2)) %>%
    ungroup() %>%
    distinct(Gene) %>% .$Gene
}


# 8hpf reference experiment.
expr <- data.table::fread('output/all_genes_expressionTidy8hpa.txt', data.table = FALSE)
expr8 <- na.omit(expr)

sig <- find_significant_genes(expr8)

sign8 <- expr8 %>% filter(Gene %in% sig)

write(unique(sign8$Gene), file = 'output/significant_8hr.txt')

# 2hpf reference experiment. `expr` and `sig` are deliberately left holding
# the 2hpf values afterwards because later chunks read `expr`.
expr <- data.table::fread('output/all_genes_expressionTidy2hpa.txt', data.table = FALSE)
expr2 <- na.omit(expr)

sig <- find_significant_genes(expr2)

sign2 <- expr2 %>% filter(Gene %in% sig)

write(unique(sign2$Gene), file = 'output/significant_2hr.txt')

# Genes significant under either reference.
all_sign <- unique(c(sign2$Gene, sign8$Gene))

```


# Clustering Significant Genes


Clustering genes with an adjusted p-value <= 1e-5 and a log2 fold change >= 0.5 in at least one contrast, separately for the two reference (control) experiments.

##8hpf Control
```{r geneHeatInitial}
# Wide log2FoldChange matrix for the 8hpf-reference significant genes:
# one row per gene, one column per contrast.
lfc <- sign8 %>%
  filter(dea_ID == 'log2FoldChange') %>%
  dplyr::select(-dea_ID) %>%
  spread(contrastID, dea_Value) %>%
  data.frame(check.names = FALSE)

# Gene -> SwissProt description lookup, reused by later chunks.
mlhere <- distinct(dplyr::select(ml, Gene, sp_desc))

# Join the descriptions, index rows by gene, then keep only the contrast
# columns and the genes without missing values.
lfc8 <- left_join(lfc, mlhere)
rownames(lfc8) <- lfc8$Gene
lfc8 <- na.omit(lfc8[ , setdiff(colnames(lfc8), c('sp_desc', 'Gene')), drop = FALSE])

lfc8_heat <- make_pheatmap_df(lfc8, max = 3, min = -3)

# Complete-linkage clustering of the genes, cut into four clusters (A-D).
hr   <- hclust(dist(lfc8), method = 'complete')
mycl <- cutree(hr, k = 4)

cluster.letters <- LETTERS[1:4]
clusters        <- paste0("Cluster", cluster.letters)
mycols          <- setNames(clusters[as.vector(mycl)], names(mycl))

# Row annotation and its colours for pheatmap.
annotation         <- data.frame(Cluster = mycols)
annotation$Cluster <- as.factor(annotation$Cluster)
up.colors          <- setNames(wes_palettes$Royal1, clusters)
ann.cols           <- list(Cluster = up.colors)

# Cluster membership table tagged with the reference it was derived from.
annotationDF8 <- separate(
  data.frame(annotation, ID = rownames(annotation)),
  ID, c('Gene', 'sp_id'), remove = FALSE, sep = ":"
)
annotationDF8$control <- '8hpf'

pheatmap(
  lfc8_heat,
  show_rownames = FALSE,
  cluster_cols = FALSE,
  cluster_rows = hr,
  main = paste0('Clustering of ', nrow(lfc8), ' Genes'),
  border_color = NA,
  annotation_row = annotation,
  annotation_colors = ann.cols
)


```

##2hpf Control
```{r hpf2controlheatInitial}

# Wide log2FoldChange matrix for the 2hpf-reference significant genes:
# one row per gene, one column per contrast.
lfc <- sign2 %>%
  filter(dea_ID == 'log2FoldChange') %>%
  dplyr::select(-dea_ID) %>%
  spread(contrastID, dea_Value) %>%
  data.frame(check.names = FALSE)

# Join the descriptions, index rows by gene, then keep only the contrast
# columns and the genes without missing values.
lfc2 <- left_join(lfc, mlhere)
rownames(lfc2) <- lfc2$Gene
lfc2 <- na.omit(lfc2[ , setdiff(colnames(lfc2), c('sp_desc', 'Gene')), drop = FALSE])

lfc2_heat <- make_pheatmap_df(lfc2, max = 6, min = -3)

# Complete-linkage clustering of the genes, cut into three clusters (E-G).
hr   <- hclust(dist(lfc2), method = 'complete')
mycl <- cutree(hr, k = 3)

cluster.letters <- LETTERS[5:7]
clusters        <- paste0("Cluster", cluster.letters)
mycols          <- setNames(clusters[as.vector(mycl)], names(mycl))

# Row annotation and its colours for pheatmap.
annotation         <- data.frame(Cluster = mycols)
annotation$Cluster <- as.factor(annotation$Cluster)
up.colors          <- setNames(wes_palettes$GrandBudapest[1:3], clusters)
ann.cols           <- list(Cluster = up.colors)

# Cluster membership table tagged with the reference it was derived from.
annotationDF2 <- separate(
  data.frame(annotation, ID = rownames(annotation)),
  ID, c('Gene', 'sp_id'), remove = FALSE, sep = ":"
)
annotationDF2$control <- '2hpf'

pheatmap(
  lfc2_heat,
  show_rownames = FALSE,
  cluster_cols = FALSE,
  cluster_rows = hr,
  main = paste0('Clustering of ', nrow(lfc2), ' Genes'),
  border_color = NA,
  annotation_row = annotation,
  annotation_colors = ann.cols
)

```

## Combined Control

For the following heatmaps I took the significant genes for the 2hpf and 8hpf control experiments and hierarchically clustered all of the timepoints for each reference control together. 

In Heatmap A, I highlight the first four time points (1hpf - 4hpf) with the 8hpf control and the second four timepoints (5hpf - 8hpf) with the 2hpf control. In doing this we see the presence and then gradual decrease at 5hpf in the Cluster D genes, while all other clusters contain lowly expressed genes that shift to increasingly higher levels of expression at the 5hpf timepoint.


In Heatmap B I include all of the experimental timepoints for both controls, from which the clusters in Heatmap A were drawn.


```{r clusterAllheatInitial, fig.keep='all'}


# Wide log2FoldChange table for every gene significant under either reference:
# one row per gene, one column per contrast from both expression tables.
sigExpr <- bind_rows(expr2, expr8) %>% 
  filter(Gene %in% all_sign, dea_ID == 'log2FoldChange') %>%
  dplyr::select(-dea_ID) %>%
  spread(contrastID, dea_Value)


lfc <- left_join(sigExpr, mlhere)

# Keep Gene plus the 14 contrast columns (seven against 8hpf, then seven
# against 2hpf) in this fixed order, and drop genes with an NA in any of them.
lfc <- na.omit(lfc[ ,c("Gene", "1hpf/8hpf", "2hpf/8hpf", "3hpf/8hpf", "4hpf/8hpf", "5hpf/8hpf", "6hpf/8hpf", "7hpf/8hpf", "1hpf/2hpf", "3hpf/2hpf", "4hpf/2hpf", "5hpf/2hpf", "6hpf/2hpf", "7hpf/2hpf", "8hpf/2hpf")])

rownames(lfc)      <- lfc$Gene
# sp_desc was already excluded by the column selection above, so this
# assignment has no effect.
lfc$sp_desc        <- NULL
lfc$Gene           <- NULL


lfc_heat <- make_pheatmap_df(lfc, max = 3, min = -3)


# Complete-linkage clustering on the unmodified lfc values; `hr` is reused by
# the following heatmap chunks.
hr  <- hclust(dist(lfc), method = 'complete')


# Cut the tree into five clusters labelled ClusterA..ClusterE.
mycl            <- cutree(hr, k = 5)
cluster.letters <- LETTERS[seq(from = 1, to = 5)]
clusters        <- paste0("Cluster", cluster.letters)
mycols          <- clusters[as.vector(mycl)]
names(mycols)   <- names(mycl)

annotation              <- data.frame(Cluster = mycols)
annotation$Significant <- 'None' 


# Tag each gene with the reference experiment it was significant in. A gene
# present in both lfc2 and lfc8 ends up labelled '8hpf Control' because the
# second assignment overwrites the first.
annotation[which(rownames(annotation) %in% rownames(lfc2)), 'Significant'] <- '2hpf Control'
annotation[which(rownames(annotation) %in% rownames(lfc8)), 'Significant'] <- '8hpf Control'


annotation$Cluster    <- as.factor(annotation$Cluster)
annotation$Significant <- as.factor(annotation$Significant)
up.colors            <- rainbow(5)
names(up.colors)     <- clusters
# NOTE(review): no colour is defined for the 'None' label; confirm that no
# gene keeps it, otherwise pheatmap may reject these annotation colours.
Significant           <- c('turquoise', 'pink')
names(Significant)    <- c('2hpf Control', '8hpf Control')
ann.colors             <- list(Cluster = up.colors, Significant = Significant)


# Column annotation for the full 14-column heatmap: the reference is the part
# of each column name after the '/'.
annot_column1 <- data.frame(row.names = colnames(lfc_heat),
                           Reference = as.factor(sapply(colnames(lfc_heat), function(x){strsplit(x, "/")[[1]][2]})))


# Heatmap A columns: 1-4hpf measured against 8hpf, then 5-8hpf against 2hpf.
keep_cols <- c("1hpf/8hpf", "2hpf/8hpf", "3hpf/8hpf", "4hpf/8hpf","5hpf/2hpf", "6hpf/2hpf", "7hpf/2hpf", "8hpf/2hpf")


exp <- c('rosybrown3', 'cornflowerblue')
names(exp) <- c('8hpf', '2hpf')
ann.colors$Reference <- exp
annot_column <- data.frame(row.names = keep_cols,
                           Reference = as.factor(sapply(keep_cols, function(x){strsplit(x, "/")[[1]][2]})))
 

# Cluster membership with gene descriptions; used by the cluster trend plots
# and the cluster gene table further down.
annotationDF <- data.frame(annotation, Gene = rownames(annotation))
annotationDF <- left_join(annotationDF, mlhere)


# Heatmap A: the eight selected columns, with a gap between the two reference
# groups and plain timepoint labels.
pheatmap(lfc_heat[ ,keep_cols], show_rownames = FALSE, cluster_cols = FALSE, cluster_rows = hr, main = paste0('Heatmap A: Clustering of ', nrow(lfc), ' Genes'), border_color = NA, annotation_row=annotation, annotation_colors = ann.colors, annotation_col = annot_column, annotation_names_col = FALSE, gaps_col = c(4), labels_col = c("1hpf", "2hpf", "3hpf", "4hpf","5hpf", "6hpf", "7hpf", "8hpf"))


```


```{r anotherheatmapInitial}

# Reference colours and the 8-column annotation are rebuilt here exactly as in
# the previous chunk. Only ann.colors is used by the plot below; annot_column
# is recomputed but the heatmap uses annot_column1.
exp <- c('rosybrown3', 'cornflowerblue')
names(exp) <- c('8hpf', '2hpf')
ann.colors$Reference <- exp
annot_column <- data.frame(row.names = keep_cols,
                           Reference = as.factor(sapply(keep_cols, function(x){strsplit(x, "/")[[1]][2]})))
 

# Heatmap B: all 14 contrast columns, with a gap after the seven 8hpf-reference
# columns and the row clustering reused from Heatmap A.
pheatmap(lfc_heat, show_rownames = FALSE, cluster_cols = FALSE, cluster_rows = hr, main = paste0('Heatmap B: Clustering of ', nrow(lfc), ' Genes'), border_color = NA, annotation_row=annotation, annotation_colors = ann.colors, annotation_col = annot_column1, gaps_col = c(7), annotation_names_col = FALSE, labels_col = c("1hpf", "2hpf", "3hpf", "4hpf","5hpf", "6hpf", "7hpf", "1hpf", "3hpf", "4hpf","5hpf", "6hpf", "7hpf", "8hpf"))


```


Here is the same plot, but with only two clusters annotated on the side.

```{r RPKMplotClustersInitial}
##first lets see what the previous plot looks like with just two clusters
# Wide z-score matrix (genes x timepoints) for the significant genes; genes
# with an NA at any timepoint are dropped. It is prepared here and plotted in
# the RPKM clustering chunks that follow.
rpkm_sig <- na.omit(data.frame(rpkm %>% filter(Gene %in% all_sign) %>%
  dplyr::select(Gene, Time, zscore) %>%
  spread(Time, zscore), stringsAsFactors = FALSE, check.names = FALSE))


rownames(rpkm_sig) <- rpkm_sig$Gene
rpkm_sig$Gene      <- NULL

rpkm_sig_heat <- make_pheatmap_df(rpkm_sig, max = 2, min = -2)


# Re-cut the existing tree `hr` (built from the log2FoldChange matrix in the
# combined-control chunk) into two clusters and rebuild the row annotation.
mycl            <- cutree(hr, k = 2)
cluster.letters <- LETTERS[seq(from = 1, to = 2)]
clusters        <- paste0("Cluster", cluster.letters)
mycols          <- clusters[as.vector(mycl)]
names(mycols)   <- names(mycl)

annotation           <- data.frame(Cluster=mycols)
annotation$Cluster   <- as.factor(annotation$Cluster)
up.colors            <- rainbow(2)
names(up.colors)     <- clusters
ann.cols             <- list(Cluster = up.colors)


# Same data and layout as Heatmap B, annotated with the two-cluster cut only.
pheatmap(lfc_heat, show_rownames = FALSE, cluster_cols = FALSE, cluster_rows = hr, main = paste0('Heatmap B: Clustering of ', nrow(lfc), ' Genes'), border_color = NA, annotation_row=annotation, annotation_colors = ann.cols, annotation_col = annot_column1, gaps_col = c(7), annotation_names_col = FALSE, labels_col = c("1hpf", "2hpf", "3hpf", "4hpf","5hpf", "6hpf", "7hpf", "1hpf", "3hpf", "4hpf","5hpf", "6hpf", "7hpf", "8hpf"))
```

#RPKM Clustering

Cluster zscore of the mean RPKMs of significant genes using either the 8hr control or the 2hr control experiments

Heatmap C shows the RPKMs with the clusters derived from Heatmap B, while Heatmap D shows the clusters that are created from clustering purely from the zscore of the RPKM. 

I think this is a pretty good proof of principle that the normalized RPKMs are able to recapitulate similar clusters to the differential expression data for the significant genes, with some minor differences in the order of the genes. The two main clusters are present in both graphs.


```{r anotherheatmap1Initial}
# Heatmap C: per-timepoint RPKM z-scores, with rows ordered and clustered by
# the tree `hr` built from the log2FoldChange matrix, annotated with the
# two-cluster cut.
# The columns here are the timepoints of rpkm_sig_heat, not the 14 contrasts
# of the log2FoldChange heatmaps, so the contrast-based `labels_col` vector
# that was copied from Heatmap B is dropped and pheatmap labels the columns
# with the matrix's own column names.
pheatmap(rpkm_sig_heat[rownames(lfc_heat), ], show_rownames = FALSE, cluster_cols = FALSE, cluster_rows = hr, 
         main = paste0('Heatmap C: Clustering of ', nrow(lfc), ' Genes'), 
         border_color = NA, annotation_row=annotation, 
         annotation_colors = ann.cols, 
         annotation_names_col = FALSE)
```


```{r anotherheatmap2Initial}

# Heatmap D: cluster the significant genes directly on their RPKM z-scores
# (complete linkage) and annotate the two top-level clusters.
hr   <- hclust(dist(rpkm_sig), method = 'complete')
mycl <- cutree(hr, k = 2)

cluster.letters <- LETTERS[1:2]
clusters        <- paste0("Cluster", cluster.letters)
mycols          <- setNames(clusters[as.vector(mycl)], names(mycl))

annotation         <- data.frame(Cluster = mycols)
annotation$Cluster <- as.factor(annotation$Cluster)
up.colors          <- setNames(rainbow(2), clusters)
ann.cols           <- list(Cluster = up.colors)

pheatmap(
  rpkm_sig_heat,
  show_rownames = FALSE,
  cluster_cols = FALSE,
  cluster_rows = hr,
  main = paste0('Heatmap D: Clustering of ', nrow(rpkm_sig), ' Genes'),
  border_color = NA,
  annotation_row = annotation,
  annotation_colors = ann.cols
)


```

# Significant Gene PCA

Here I did a PCA analysis on the mean RPKM per sample for the significant genes in the heatmaps above. We see that PC2 holds the variance that separates the 'early' transcripts from the 'late' transcripts. 

```{r PCAInitial}

#rpkm of significant genes PCA

# Genes x timepoints matrix of mean RPKM. The surrounding text, the comment
# above and both plot titles describe a PCA of the significant genes, so the
# table is restricted to `all_sign` here; previously every gene in `rpkm` was
# included.
rpkm_spread <- data.frame(rpkm %>% filter(Gene %in% all_sign) %>%
                            dplyr::select(-zscore, -transcript_length) %>%
                            spread(Time, meanRPKM), check.names = FALSE)

rownames(rpkm_spread) <- rpkm_spread$Gene
rpkm_spread$Gene      <- NULL

# NOTE(review): this object shadows stats::princomp(); the name is kept
# because code outside this chunk may refer to it.
princomp <- prcomp(rpkm_spread)

# One row of loadings per timepoint column of rpkm_spread.
rot <- data.frame(princomp$rotation)

rot$Sample <- rownames(rot)
# Stage labels are assigned by position: the first five rows are 'Early', the
# last four 'Late'. This assumes exactly nine timepoint columns in that order.
rot$Stage  <- c(rep('Early', 5), rep('Late', 4))

ggplot(data = rot, aes(x = PC1, y = PC2, label = Sample, colour = factor(Stage))) +
          geom_point(size = 1) + theme_minimal() + geom_text(hjust = 0, vjust = 0, size = 3) +
          ggtitle('All Samples PCA of Mean RPKM of Significant Genes') + 
          theme(legend.position="none") + scale_x_continuous(expand = c(0.2, 0))

ggplot(data = rot, aes(x = PC2, y = PC3, label = Sample, colour = factor(Stage))) +
          geom_point(size = 1) + theme_minimal() + geom_text(hjust = 0, vjust = 0, size = 3) +
          ggtitle('All Samples PCA of Mean RPKM of Significant Genes') + 
          theme(legend.position="none") + scale_x_continuous(expand = c(0.2, 0))

```


#Identifying Housekeepers?

Here I'm plotting the zscore of the RPKMs per gene. The red points are genes that have a log2 Fold Change in between 0.5 and -0.5 for all contrasts. I drew box plots over the points to get a better feel for the spread of the data and where the significant genes tended to fall in the quartiles. 

Ariel brought up an interesting point about the genes that don't change. What are the genes that don't change, and are there consistent 'housekeepers' that are present in all of the timepoints at a certain level? This plot tells me that there isn't a definitive RPKM cutoff across the timepoints that could help me define things that are highly expressed but not significant.

```{r housekeepersInitial}


# Both reference experiments stacked into one long table.
all_expr <- rbind(expr2, expr8)

# Genes whose |log2FoldChange| is at most 0.5 in all 14 contrasts.
zeroFC <- all_expr %>%
  filter(dea_ID == 'log2FoldChange', abs(dea_Value) <= 0.5) %>%
  group_by(Gene) %>%
  mutate(num_Contrasts = length(unique(contrastID))) %>%
  filter(num_Contrasts == 14) %>%
  distinct(Gene) %>% .$Gene

# Gene ids read from a plain-text list on disk.
housekeepers <- scan("db/housekeepers.txt", what = 'character')

# Per gene/timepoint expression with membership flags for each gene set.
rpkm_info <- mutate(
  rpkm,
  zeroFC       = Gene %in% zeroFC,
  housekeepers = Gene %in% housekeepers,
  logRPKM      = log(meanRPKM),
  significant2 = Gene %in% sign2$Gene,
  significant8 = Gene %in% sign8$Gene
)

# log(mean RPKM) per timepoint: 2hpf-significant genes in purple,
# 8hpf-significant genes in yellow, with violins for all genes.
ggplot(rpkm_info, aes(x = Time, y = logRPKM)) +
  geom_jitter(data = filter(rpkm_info, significant2), colour = 'purple', alpha = 0.3) +
  geom_jitter(data = filter(rpkm_info, significant8), colour = 'yellow', alpha = 0.3) +
  geom_violin(alpha = 0) +
  theme_minimal()

# z-score per timepoint: unchanged (zeroFC) genes in red over all other genes
# in light grey, with box plots for all genes.
ggplot(rpkm_info, aes(x = Time, y = zscore)) +
  geom_jitter(data = filter(rpkm_info, !zeroFC), colour = 'lightgrey', alpha = 0.3) +
  geom_jitter(data = filter(rpkm_info, zeroFC), colour = 'red', alpha = 0.3) +
  geom_boxplot(colour = 'blue', alpha = 0) +
  theme_minimal() +
  ggtitle('zscore(Mean RPKM) per gene across the timepoints')

  #geom_jitter(data = rpkm_info[which(rpkm_info$housekeepers == TRUE), ], colour = 'blue', alpha = 0.3) +

```


#Cluster Trends

The following is a plot of the trends for each of the clusters in Heatmap A, with the 95% confidence interval on the error bars.

```{r zscoreInitial}

## find the mean RPKM for timepoint for each cluster

# Normal quantile for a two-sided 95% interval. The previous literal 1.9645
# matches neither the 95% (1.96) nor the 90% (1.645) quantile.
ci95_z <- qnorm(0.975)

# Per-gene expression joined to the Heatmap A cluster assignment.
cluster_rpkm <- left_join(annotationDF, rpkm)

# Mean z-score per cluster and timepoint with its 95% interval.
# se() is not defined in this file; it is presumably a standard-error helper
# from the 'kiRsten' package -- TODO confirm.
rpkm_sub <- cluster_rpkm %>%
  dplyr::select(Cluster, Time, zscore) %>%
  group_by(Cluster, Time) %>%
  mutate(meanZscore = mean(zscore)) %>%
  mutate(zscore_error = ci95_z * se(zscore)) %>%
  ungroup() %>%
  dplyr::select(Cluster, Time, meanZscore, zscore_error) %>%
  mutate(ymin = meanZscore - zscore_error) %>%
  mutate(ymax = meanZscore + zscore_error) %>%
  distinct()


ggplot(rpkm_sub, aes(y = meanZscore, x = Time, group = 1)) + 
      geom_line() + 
      theme_minimal() +
      theme(panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(),
            axis.text.x = element_text(angle = -90)
            ) + facet_grid(. ~ Cluster) +
      geom_hline(yintercept = 0, alpha = 0.3, size = 0.1) +
      geom_errorbar(aes(ymax = ymax, ymin = ymin), width = 0.1, colour = "red") +
      ylab('Mean z-score') +
      scale_y_continuous(limits = c(-3, 3), breaks = seq(-3, 3)) + ggtitle('Mean Zscore of Significant Genes Per Cluster')


# Mean RPKM per cluster and timepoint with its 95% interval.
rpkm_sub_rpkm <- cluster_rpkm %>%
  dplyr::select(Cluster, Time, meanRPKM) %>%
  group_by(Cluster, Time) %>%
  mutate(meanRPKMtot = mean(meanRPKM)) %>%
  mutate(rpkm_error = ci95_z * se(meanRPKM)) %>%
  ungroup() %>%
  dplyr::select(Cluster, Time, meanRPKMtot, rpkm_error) %>%
  mutate(ymin = meanRPKMtot - rpkm_error) %>%
  mutate(ymax = meanRPKMtot + rpkm_error) %>%
  distinct()


ggplot(rpkm_sub_rpkm, aes(y = meanRPKMtot, x = Time)) + 
      geom_bar(stat = 'identity') + 
      theme_minimal() +
      theme(panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(),
            axis.text.x = element_text(angle = -90)
            ) + facet_grid(. ~ Cluster) +
      geom_hline(yintercept = 0, alpha = 0.3, size = 0.1) +
      geom_errorbar(aes(ymax = ymax, ymin = ymin), width = 0.1, colour = "red") +
      ylab('Mean RPKM') + ggtitle('Mean RPKM of Significant Genes Per Cluster')

```

## Table of Cluster Genes


```{r clustergenesInitial}


# One row per gene in the Heatmap A clusters, with the per-timepoint z-scores
# and mean RPKMs collapsed into comma-separated strings that the sparkline
# widgets render in the table.
rpkm_sub <- left_join(annotationDF, rpkm) %>%
  group_by(Cluster, Gene) %>%
  mutate(zscore = paste(round(zscore, digits = 2), collapse = ",")) %>%
  mutate(meanRPKM = paste(round(meanRPKM, digits = 2), collapse = ",")) %>%
  ungroup() %>%
  dplyr::select(Gene, sp_desc, Cluster, transcript_length, zscore, meanRPKM) %>%
  distinct()


# Sparkline option strings and draw callbacks. These were previously defined
# twice in this chunk; the first copy (including a `cd` with different column
# targets) was overwritten before use and has been removed. The stray
# `collapse = ""` argument, which was being passed to JS() rather than
# paste0(), is dropped as well.
bar_string <- "type: 'bar', barColor: 'red', negBarColor: 'blue', highlightColor: 'black'"

cb_bar <- JS(paste0("function (oSettings, json) { $('.spark:not(:has(canvas))').sparkline('html', { ", 
    bar_string, " }); }"))


line_string <- "type: 'line', lineColor: 'black', fillColor: '', highlightLineColor: 'red', highlightSpotColor: 'red'"

cb_line <- JS(paste0("function (oSettings, json) { $('.spark:not(:has(canvas))').sparkline('html', { ", 
    line_string, ", chartRangeMin: ", -3, ", chartRangeMax: ", 3, " }); }"))

# Column render hooks (0-based targets): column 5 (meanRPKM) is wrapped for
# the bar sparkline, column 4 (zscore) for the line sparkline.
cd <- list(list(targets = 5, render = JS("function(data, type, full){ return '<span class=sparkSamples>' + data + '</span>' }")), 
    list(targets = 4, render = JS("function(data, type, full){ return '<span class=sparkSeries>' + data + '</span>' }")))

# Draw callback that turns the wrapped cells into sparklines.
cb <- JS(paste0("function (oSettings, json) {\n  $('.sparkSeries:not(:has(canvas))').sparkline('html', { ", 
    line_string, " });\n  $('.sparkSamples:not(:has(canvas))').sparkline('html', { ", 
    bar_string, " });\n}"))

# NOTE(review): not referenced by the datatable() call below, which passes
# `cd` as its columnDefs option.
columnDefs <- list(list(className = 'dt-center', targets = 5))

d5 <- datatable(rpkm_sub, rownames = FALSE, extensions = c('Buttons', 'FixedHeader', 'Scroller'), options = list(
    autoWidth = TRUE,
    columnDefs = cd,
    fnDrawCallback = cb,
    dom = 'frtBip',
    buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
    deferRender = TRUE,
    scrollY = 800,
    scroller = TRUE), caption = 'Expansion of the previous table to include all of the genes. For each gene there is an average zscore per timepoint, computed from the average RPKM of the replicates per timepoint.', width = 1000)
# Attach the sparkline JavaScript dependency to the widget (uses a
# non-exported htmlwidgets function).
d5$dependencies <- append(d5$dependencies, htmlwidgets:::getDependency("sparkline"))


```

```{r outputClusterTableInitial, results='asis', cache = FALSE}
# Print the cluster gene table built in the previous chunk (this chunk is not
# cached), followed by raw HTML line breaks as spacing.
d5

cat('<br><br><br>')
```


# GO Term Plots

## Transcription Factor heatmap

The following heatmap will be significantly differentially expressed genes that are annotated to GO term 'GO:0006355': regulation of transcription, DNA-templated.


'Differentially Expressed' defined as having a p-value less than or equal to 1e-5 and log2Fold Change greater than 0.5 at any time point.

```{r tfheatmapInitial, fig.height=12}


# Genes that, within at least one contrast of either reference experiment,
# have padj <= 1e-5 and log2FoldChange >= 0.5.
sigtf <- bind_rows(expr2, expr8)  %>% 
  mutate(id1 = ifelse(dea_ID == 'padj' & dea_Value <= 1e-5, yes = TRUE, no = FALSE)) %>%
  mutate(id2 = ifelse(dea_ID == 'log2FoldChange' & dea_Value >= 0.5, yes = TRUE, no = FALSE)) %>%
  group_by(contrastID, Gene) %>%
  filter(any(id1), any(id2)) %>%
  ungroup() %>%
  distinct(Gene) %>% .$Gene


# NOTE(review): `expr` still holds the table read last in the data-loading
# chunk (the 2hpa file, before na.omit), so only that table's contrasts reach
# the heatmap even though `sigtf` was selected from both -- confirm intended.
signtf <- expr %>% filter(Gene %in% sigtf)


# Annotation rows whose GO string contains GO:0006355; ID is "Gene;sp_desc"
# and becomes the heatmap row label.
tf_factor <- filter(ml_annot, grepl('GO:0006355', go_all)) %>%
  unite(col = ID, Gene, sp_desc, sep = ";", remove = FALSE) %>%
  distinct()

# Wide log2FoldChange table: one row per gene, one column per contrast.
lfc <- signtf %>% filter(dea_ID == 'log2FoldChange') %>%
  dplyr::select(-dea_ID) %>%
  spread(contrastID, dea_Value) %>%
  distinct()

# Keep only the annotated genes that are also significant, then drop the
# annotation columns that are not plotted.
tflfc <- inner_join(tf_factor, lfc) %>%
  dplyr::select(-sp_id, -sp_evalue, -sp_desc, -go_all)

tfl <- data.frame(tflfc %>% dplyr::select(-Gene, -Pfam), check.names = FALSE)

rownames(tfl) <- tfl$ID
tfl$ID <- NULL

tfl_heat <- make_pheatmap_df(tfl, min = -2, max = 2)


# Rows are clustered on distances computed from the unmodified tfl values,
# while the colours come from tfl_heat.
pheatmap(tfl_heat, cluster_cols = FALSE, clustering_method = 'complete', clustering_distance_rows = dist(tfl), main = paste0('Clustering of ', nrow(tfl), ' Genes'), border_color = NA, cellwidth = 15, cellheight = 10)

```


## Differentially Expressed Genes

The following are heatmaps for differentially expressed genes annotated to the above GO terms.

'Differentially Expressed' defined as having a p-value less than or equal to 1e-5 and log2Fold Change greater than 0.5 at any time point. GO terms that do not have heatmaps have less than five genes annotated.

```{r DEHeatmapsInitial, fig.keep="all", result='asis'}


# Capitalise the first letter of every space-separated word in a single
# string, leaving the rest of each word unchanged.
#
# x: a length-one character string.
# Returns the words re-joined with single spaces.
simpleCap <- function(x) {
  words         <- strsplit(x, " ")[[1]]
  first_letters <- toupper(substr(words, 1, 1))
  remainders    <- substring(words, 2)
  paste(paste0(first_letters, remainders), collapse = " ")
}

# Target GO terms; one heatmap and one bar is produced per term.
GOs   <- c('GO:0007049', 'GO:0030154', 'GO:0007276', 'GO:0007498', 'GO:0007492', 'GO:0007398', 'GO:0048729')


# Genes that, within at least one contrast of either reference experiment,
# have padj <= 1e-5 and log2FoldChange >= 0.5 (kept as a one-column table).
sig <- bind_rows(expr8, expr2)  %>% 
  mutate(id1 = ifelse(dea_ID == 'padj' & dea_Value <= 1e-5, yes = TRUE, no = FALSE)) %>%
  mutate(id2 = ifelse(dea_ID == 'log2FoldChange' & dea_Value >= 0.5, yes = TRUE, no = FALSE)) %>%
  group_by(contrastID, Gene) %>%
  filter(any(id1), any(id2)) %>%
  ungroup() %>%
  distinct(Gene)


# Attach the long-form GO annotation, then the wide log2FoldChange values.
# NOTE(review): `sign` shadows base::sign(), and `expr` is the table read last
# in the data-loading chunk (the 2hpa file) -- confirm both are intended.
sign <- left_join(sig, ml)
sign <- left_join(sign, expr, by = 'Gene') %>%
  filter(dea_ID == 'log2FoldChange') %>%
  spread(contrastID, dea_Value) %>%
  dplyr::select(-dea_ID)

## attach the child term to the data frame of parent terms

# Ancestor and offspring GO ids of each target term, from GO.db.
xx           <- as.list(GOBPANCESTOR)
allAncestor <- xx[GOs]
x           <- as.list(GOBPOFFSPRING)
allOffspring <- x[GOs]

# Long tables pairing each target term (go_id) with one related id
# (associated); the 'all' ancestor entry is removed.
allAnc <- bind_rows(lapply(GOs, function(x){
  data.frame(go_id = x, associated = unlist(allAncestor[x]), stringsAsFactors = FALSE)
})) %>% filter(associated != 'all')

allOff <- bind_rows(lapply(GOs, function(x){
  data.frame(go_id = x, associated = unlist(allOffspring[x]), stringsAsFactors = FALSE)
}))

xxx <- rbind(allAnc, allOff)

## start here

# Each target term is associated with itself ('original') and with all of its
# ancestors and offspring ('derived').
all_associated <- rbind(data.frame(go_id = GOs, associated = GOs, id = 'original', stringsAsFactors = FALSE), 
                        data.frame(xxx, id = 'derived', stringsAsFactors = FALSE)) %>%
  filter(associated != 'all')


# Capitalised name of the target term, looked up in GO.db.
all_associated$Term <- sapply(all_associated$go_id, function(x){simpleCap(Term(GOTERM[[x]]))})


# A gene's own GO id is matched against the `associated` column, so a gene
# counts towards a target term when it is annotated with the term itself or
# with any of the term's ancestors or offspring.
sign <- sign %>% dplyr::rename('associated' = go_id)

dea_go1 <- inner_join(sign, all_associated, by = 'associated') %>%
  unite(ID, Gene, sp_desc, sep = ':') %>%
  dplyr::select(-associated, -id) %>%
  distinct()

dea_go <- dea_go1 %>% dplyr::select(-sp_id, -sp_evalue, -Pfam)


# Per target term, build list(catname, df, title) for plotting in the next
# chunk. Terms with at most one distinct row return NULL.
# NOTE(review): the report text says terms with fewer than five genes get no
# heatmap, but the threshold here is more than one row -- confirm.
heatmaps <- lapply(GOs, function(x){
    df_sub1 <- dea_go[which(dea_go$go_id == x), ] %>%
      dplyr::select(-go_id)
    # N in the title is counted before the distinct() below.
    title <- paste0('Genes (N=', nrow(df_sub1), ')')
    df_sub <- data.frame(df_sub1 %>% dplyr::select(-Term)  %>%
      distinct(), check.names = FALSE, stringsAsFactors = FALSE)
  if(nrow(df_sub) > 1){
    rownames(df_sub) <- df_sub$ID
    df_sub$ID        <- NULL
    # Computed but not returned; the next chunk rebuilds it from `df`.
    df_sub_heat      <- make_pheatmap_df(df_sub, min = -2, max = 2)
    #heatmap <- pheatmap(df_sub_heat, cluster_cols = FALSE, clustering_method = 'complete', clustering_distance_rows = dist(df_sub),
           #main = title, border_color = NA, cellwidth = 15, cellheight = 10, silent = TRUE)
    list(catname = unique(df_sub1$Term), df = df_sub, title = title)
  }
})

# Drop the NULL entries of skipped terms.
heatmaps <- heatmaps[which(sapply(heatmaps, function(x){!is.null(x)}))]

```

###Heatmaps {.tabset}
```{r tabsetGOplotsInitial, fig.keep='all', results='asis', fig.height=6, fig.width=10, fig.show = 'asis'}

# For each retained GO term, emit a level-4 markdown heading and its heatmap.
# Each element of `heatmaps` is list(catname, df, title) from the previous
# chunk, accessed here by position.
for(x in heatmaps){
  cat( "\n\n####", x[[1]], "\n\n")
  df <- make_pheatmap_df(x[[2]])      
  if(nrow(df) > 25){
      # Heatmaps with more than 25 rows are written to a PNG file and linked
      # instead of being drawn inline. `h` is computed but not used below.
      h <- nrow(df)*10.3
      filename <- paste0('analysis_figures/', gsub(" ", "", x[[1]]), 'anyExpressedHeatmap.png')
       pheatmap(df, cluster_cols = FALSE, clustering_method = 'complete', clustering_distance_rows = dist(x[[2]]), main = x[[3]], border_color = NA,
                cellwidth = 15, cellheight = 10, filename = filename)
      # NOTE(review): the link target is a hard-coded host and project path.
      cat(paste0("[", x[[1]],"](http://bioinfo/home/klg/projects/Mnemiopsis/DEA/analysisRR4/", filename, ")<br><br><br><br><br><br><br><br><br><br><br><br>"))
    } else {
  pheatmap(df, cluster_cols = FALSE, clustering_method = 'complete', clustering_distance_rows = dist(x[[2]]), main = x[[3]], border_color = NA, cellwidth = 15, cellheight = 10)
    }
  }

```


### Bar Plot
Number of Genes bar plot for each of the GO terms, differentially expressed as defined above.

```{r DEgenesBarPlotInitial}

# Count the distinct gene IDs per GO term and keep terms with more than one.
dea_num <- dea_go %>%
  group_by(Term) %>%
  mutate(num_rep = length(unique(ID))) %>%
  ungroup() %>%
  filter(num_rep > 1)

# One row per term with its gene count.
gene_bar <- distinct(dplyr::select(dea_num, Term, num_rep))

# Fixed level order so that, after coord_flip(), "Cell Cycle" is drawn at the
# top and "Ectoderm Development" at the bottom.
gene_bar$Term <- factor(
  gene_bar$Term,
  levels = c('Ectoderm Development', 'Endoderm Development', "Mesoderm Development",
             "Gamete Generation", "Tissue Morphogenesis", "Cell Differentiation",
             "Cell Cycle")
)

ggplot(gene_bar, aes(x = Term, y = num_rep)) +
  geom_bar(stat = "identity", fill = 'purple') +
  coord_flip() +
  theme_bw() +
  labs(
    y = "Number of Genes",
    x = "Gene Ontology Ancestral Term",
    title = "Ancestral GO terms of Significantly Up Genes"
  ) +
  theme(
    axis.title.y = element_text(vjust = 1),
    axis.title.x = element_text(vjust = 0.01)
  )

```

Table of genes that created the plots above.

```{r barplotGOInitial, results='asis'}

# Interactive table of the gene / GO term pairs behind the plots above
# (ID is "Gene:sp_desc"), followed by raw HTML line breaks as spacing.
DT::datatable(dplyr::select(dea_go1, ID, sp_evalue, Term))


cat('<br><br><br>')
```


## Expressed Genes

###Heatmaps

Heatmaps for genes that are expressed (no pvalue or log2 Fold Change cutoff).


```{r anyExpressedHeatmapsInitial, fig.keep = 'none'}

# All log2FoldChange rows of the 8hpf-reference table, with no significance
# filter.
sig <- expr8 %>% filter(dea_ID == 'log2FoldChange')


sign <- left_join(sig, ml)

# Wide form: one row per gene / GO id with one column per contrast.
sign <- sign  %>%
  spread(contrastID, dea_Value) %>%
  dplyr::select(-dea_ID) %>%
  dplyr::rename('associated' = go_id)


# Map each gene onto the target GO terms through `all_associated` (built in
# the differentially-expressed heatmap chunk). This overwrites dea_go1 and
# dea_go from that chunk.
dea_go1 <- inner_join(sign, all_associated, by = 'associated') %>%
  unite(ID, Gene, sp_desc, sep = ':') %>%
  dplyr::select(-associated, -id) %>%
  distinct()

dea_go <- dea_go1 %>% dplyr::select(-sp_evalue, -Pfam, -sp_id)

# NOTE(review): looks like a leftover from interactive debugging; the lapply
# below supplies its own `x`.
x <- GOs[1]

# Write one heatmap PNG per target term with more than five distinct rows.
# The chunk sets fig.keep = 'none', so the plots exist only as files.
lapply(GOs, function(x){
  df_sub1 <- dea_go[which(dea_go$go_id == x), ] %>%
    dplyr::select(-go_id)
  title <- paste0(unique(df_sub1$Term), ' Genes (N=', nrow(df_sub1), ')')
  df_sub <- data.frame(df_sub1 %>% dplyr::select(-Term)  %>%
    distinct(), check.names = FALSE, stringsAsFactors = FALSE)
  if(nrow(df_sub) > 5){
    rownames(df_sub) <- df_sub$ID
    df_sub$ID        <- NULL
    df_sub_heat      <- make_pheatmap_df(df_sub, min = -2, max = 2)
    print(nrow(df_sub))
      # More than 300 rows: open a PNG device whose height grows with the row
      # count and draw into it; otherwise let pheatmap write the file itself.
      if(nrow(df_sub) > 300){
      h <- nrow(df_sub)*10.3
      png(file = paste0('analysis_figures/', gsub(" ", "", unique(df_sub1$Term)), 'anyExpressedHeatmap.png'), bg = "transparent", height = h, width = 1500)
      pheatmap(df_sub_heat, cluster_cols = FALSE, clustering_method = 'complete', clustering_distance_rows = dist(df_sub),
           main = title, border_color = NA, cellwidth = 15, cellheight = 10)
      dev.off()  
    } else {
      pheatmap(df_sub_heat, cluster_cols = FALSE, clustering_method = 'complete', clustering_distance_rows = dist(df_sub),
           main = title, filename = paste0('analysis_figures/', gsub(" ", "", unique(df_sub1$Term)), 'anyExpressedHeatmap.png'), border_color = NA, cellwidth = 15, cellheight = 10)
      
    }
}})


```

```{r outputheatmaps2Initial, results = "asis", tidy = FALSE, eval = TRUE}

# Emit one markdown link per ancestral GO term pointing at its heatmap PNG.
#
# The file name of each heatmap is rebuilt from its term (spaces removed,
# 'anyExpressedHeatmap.png' appended), matching how the files are named when
# they are written. Pairing a directory listing with the sorted terms by
# position fails when a term has no heatmap (length mismatch) and can attach
# the wrong label when files and terms sort differently.
heatmap_terms   <- sort(unique(dea_go$Term))
heatmaps        <- paste0(gsub(" ", "", heatmap_terms), 'anyExpressedHeatmap.png')
names(heatmaps) <- heatmap_terms

# Only link heatmaps that were actually written.
heatmaps <- heatmaps[file.exists(file.path("analysis_figures", heatmaps))]


heatmapLs <- lapply(names(heatmaps), function(x){
  paste0("[", x,"](http://bioinfo/home/klg/projects/Mnemiopsis/DEA/analysisRR4/analysis_figures/", heatmaps[x], ")\n\n")
})

# results = "asis": the links are written straight into the document.
cat(unlist(heatmapLs))
```

### Barplot

Bar plot for all expressed genes (no p-value or log2 fold-change cutoff applied).

```{r anyExpressedBarPlotInitial}

# Number of distinct gene IDs per ancestral GO term, one row per term.
gene_bar <- dea_go %>%
  dplyr::select(Term, ID) %>%
  distinct() %>%
  dplyr::group_by(Term) %>%
  mutate(num_terms = n()) %>%
  ungroup() %>%
  dplyr::select(Term, num_terms) %>%
  distinct()

# Fix the bar order through the factor levels; terms not listed here become NA.
gene_bar <- gene_bar %>%
  mutate(Term = factor(
    Term,
    levels = c("Ectoderm Development", "Endoderm Development", "Mesoderm Development",
               "Gamete Generation", "Tissue Morphogenesis", "Cell Differentiation",
               "Cell Cycle")
  ))

# Horizontal bar chart of gene counts per term.
ggplot(gene_bar, aes(x = Term, y = num_terms)) +
  geom_bar(stat = "identity", fill = 'purple') +
  coord_flip() +
  theme_bw() +
  labs(x = "Gene Ontology Ancestral Term",
       y = "Number of Genes",
       title = "Ancestral GO terms of Expressed Genes") +
  theme(axis.title.y = element_text(vjust = 1),
        axis.title.x = element_text(vjust = 0.01))


```

Table of genes that created the GO term plots above.

```{r gotermplotstableInitial, results='asis'}

# Interactive table of the genes behind the GO term plots above
# (ID, sp_evalue and Term columns of dea_go1).
dea_go1 %>%
  dplyr::select(ID, sp_evalue, Term) %>%
  DT::datatable()

```


# GO Term Enrichment


```{r GOenrichmentInitial, fig.width=15, fig.height=15}
# Combined GO enrichment overview: bar chart of enriched terms for both
# reference time points plus an interactive table.

# The 2hpf results are labelled 'Late Stage' and the 8hpf results
# 'Early Stage', matching the per-stage sections below.
table2  <- data.table::fread('output/2hpf_all_go_enrichment.xls', data.table = FALSE)
table2$EnrichID <- 'Late Stage'

table8  <- data.table::fread('output/8hpf_all_go_enrichment.xls', data.table = FALSE)
table8$EnrichID <- 'Early Stage'


table <- bind_rows(table2, table8)

# A missing branch count is treated as zero genes.
table$Branch_Num_Genes <- ifelse(is.na(table$Branch_Num_Genes), 0, table$Branch_Num_Genes)

# Total genes per row = leaf genes + branch genes; only terms with more than
# four genes are plotted. The sum is row-wise, so the grouping does not change
# the values.
table <- table %>% group_by(go_id) %>%
  mutate(Num_Genes = (Num_Leaf_Genes + Branch_Num_Genes)) %>%
  ungroup() %>%
  dplyr::select(go_id, Term, Ontol, BH.correction, Num_Genes, EnrichID) %>%
  filter(Num_Genes > 4) %>%
  mutate(pval = paste0('p <= ', signif(BH.correction, digits = 3)))


# Order the bars by gene count. unique() is required because a term enriched
# in both stages appears twice in the combined table, and factor() rejects
# duplicated levels.
table$Term <- factor(table$Term, levels = unique(table[order(table$Num_Genes, decreasing = FALSE), ]$Term))


ggplot(table, aes(y = Num_Genes, x = Term, fill = EnrichID)) + geom_bar(stat = 'identity') + coord_flip() +
  theme_minimal() + xlab('GO Term') + ylab('Number of Genes') + ggtitle('Enriched GO Terms by Stage') + 
  # p-value label placed 25 units past the end of each bar
  geom_text(aes(y = Num_Genes + 25, x = Term, label = pval), size = 2.5) +
  scale_y_continuous(expand = c(0.12,0)) + 
  theme(axis.title.y = element_text(margin=margin(0,-10,0,0)),
        axis.text.y = element_text(margin = margin(0,-20,0,0)),
        legend.title = element_blank())

DT::datatable(dplyr::select(table, go_id, Term, Ontol, BH.correction, Num_Genes))

```


## Late Stage Significant Genes

```{r lateStageEnrichInitial, results='asis'}

# GO.db is also attached in the setup chunk at the top of the file.
library("GO.db")

# Load the 2hpf GO enrichment table; a missing branch count is treated as 0.
table <- data.table::fread('output/2hpf_all_go_enrichment.xls', data.table = FALSE) %>%
  mutate(Branch_Num_Genes = ifelse(is.na(Branch_Num_Genes), 0, Branch_Num_Genes))

# Total genes per row (leaf + branch), keep terms with more than one gene and
# add the label text for the adjusted p-value.
table <- table %>%
  group_by(go_id) %>%
  mutate(Num_Genes = (Num_Leaf_Genes + Branch_Num_Genes)) %>%
  ungroup() %>%
  dplyr::select(go_id, Term, Ontol, BH.correction, Num_Genes) %>%
  filter(Num_Genes > 1) %>%
  mutate(pval = paste0('p <= ', signif(BH.correction, digits = 3)))

# Horizontal bar chart; each bar is labelled with its p-value text, placed
# 25 units beyond the bar end.
ggplot(table, aes(x = Term, y = Num_Genes)) +
  geom_bar(stat = 'identity') +
  coord_flip() +
  theme_minimal() +
  labs(x = 'GO Term', y = 'Number of Genes', title = 'Select Enriched GO Terms') +
  geom_text(aes(y = Num_Genes + 25, label = pval), size = 2.5) +
  scale_y_continuous(expand = c(0.12, 0)) +
  theme(axis.title.y = element_text(margin = margin(0, -10, 0, 0)),
        axis.text.y  = element_text(margin = margin(0, -20, 0, 0)))

# Interactive table of the plotted terms (label column omitted).
table %>%
  dplyr::select(go_id, Term, Ontol, BH.correction, Num_Genes) %>%
  DT::datatable()


```

## Early Stage Significant Genes

```{r EarlyStageEnrichInitial, results='asis'}

# Load the 8hpf GO enrichment table; a missing branch count is treated as 0.
table <- data.table::fread('output/8hpf_all_go_enrichment.xls', data.table = FALSE) %>%
  mutate(Branch_Num_Genes = ifelse(is.na(Branch_Num_Genes), 0, Branch_Num_Genes))

# Total genes per row (leaf + branch), keep terms with more than one gene and
# add the label text for the adjusted p-value.
table <- table %>%
  group_by(go_id) %>%
  mutate(Num_Genes = (Num_Leaf_Genes + Branch_Num_Genes)) %>%
  ungroup() %>%
  dplyr::select(go_id, Term, Ontol, BH.correction, Num_Genes) %>%
  filter(Num_Genes > 1) %>%
  mutate(pval = paste0('p <= ', signif(BH.correction, digits = 3)))

# Horizontal bar chart; each bar is labelled with its p-value text, placed
# 25 units beyond the bar end.
ggplot(table, aes(x = Term, y = Num_Genes)) +
  geom_bar(stat = 'identity') +
  coord_flip() +
  theme_minimal() +
  labs(x = 'GO Term', y = 'Number of Genes', title = 'Select Enriched GO Terms') +
  geom_text(aes(y = Num_Genes + 25, label = pval), size = 2.5) +
  scale_y_continuous(expand = c(0.12, 0)) +
  theme(axis.title.y = element_text(margin = margin(0, -10, 0, 0)),
        axis.text.y  = element_text(margin = margin(0, -20, 0, 0)))

# Interactive table of the plotted terms (label column omitted).
table %>%
  dplyr::select(go_id, Term, Ontol, BH.correction, Num_Genes) %>%
  DT::datatable()


```

## All Significant Genes

```{r AllEnrichInitial, results='asis'}

# Load the GO enrichment table for all significant genes; a missing branch
# count is treated as 0.
table <- data.table::fread('output/allSignificant_all_go_enrichment.xls', data.table = FALSE) %>%
  mutate(Branch_Num_Genes = ifelse(is.na(Branch_Num_Genes), 0, Branch_Num_Genes))

# Total genes per row (leaf + branch), keep terms with more than one gene and
# add the label text for the adjusted p-value.
table <- table %>%
  group_by(go_id) %>%
  mutate(Num_Genes = (Num_Leaf_Genes + Branch_Num_Genes)) %>%
  ungroup() %>%
  dplyr::select(go_id, Term, Ontol, BH.correction, Num_Genes) %>%
  filter(Num_Genes > 1) %>%
  mutate(pval = paste0('p <= ', signif(BH.correction, digits = 3)))

# Horizontal bar chart; each bar is labelled with its p-value text, placed
# 25 units beyond the bar end.
ggplot(table, aes(x = Term, y = Num_Genes)) +
  geom_bar(stat = 'identity') +
  coord_flip() +
  theme_minimal() +
  labs(x = 'GO Term', y = 'Number of Genes', title = 'Select Enriched GO Terms') +
  geom_text(aes(y = Num_Genes + 25, label = pval), size = 2.5) +
  scale_y_continuous(expand = c(0.12, 0)) +
  theme(axis.title.y = element_text(margin = margin(0, -10, 0, 0)),
        axis.text.y  = element_text(margin = margin(0, -20, 0, 0)))

# Interactive table of the plotted terms (label column omitted).
table %>%
  dplyr::select(go_id, Term, Ontol, BH.correction, Num_Genes) %>%
  DT::datatable()


```


# [R-session information](`r paste0("http://bioinfo/", gsub("/home/", "~", getwd()), "/sessionInfo/StorySessionInfo.txt")`)