Dissertation_KG_Chapter_2.3.2.Rmd

---
title: "Role of Piwi-piRNA pathway in somatic and cancer cells"
author: "__Konstantinos Geles__"
date: "Wed Jul 13 2022, Last Update: `r format(Sys.Date(), '%a %b %d %Y')`"
output:
  html_document:
    toc: yes
    toc_depth: 3
    df_print: paged
  pdf_document:
    toc: yes
    toc_depth: 3
  html_notebook: null
editor_options:
  chunk_output_type: console
subtitle: UMG PhD Programme of Molecular and Translational Oncology - Circle XXXIV
---

```{r setup, include=FALSE}
# Chunk defaults for the whole document: show the code (echo = TRUE) but do
# not execute it when the document is knitted (eval = FALSE).
knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
```

This project contains the scripting part of the Doctoral Dissertation of **Konstantinos Geles** with doi:   

# CHAPTER 2: Data Analysis Workflow for small-RNAseq focused on piRNAs  
  
## 2.3.2 Comparison of the in-house CRC dataset with the GSE160432 dataset results

###  DE piRNAs

Import Libraries
```{r}
suppressPackageStartupMessages({
library(vroom)           #
library(dplyr)           #
library(tidyr)           # data import and manipulation
library(tibble)          # 
library(stringr)         # 
library(forcats)         #
library(readr)           #
library(purrr)           # 

library(ggplot2)         #
library(scales)          # make exploration data plots
library(ggpmisc)         #

library(ComplexHeatmap)  #  Heatmaps
library(circlize)        #

library(clusterProfiler) #
library(wppi)            # Functional enrichment analysis and 
library(ReactomePA)      # protein protein interaction network
})

```


Import the differential expression (DE) results for the sncRNAs of CRC patients from both datasets
```{r}
# DE results of the in-house CRC tissue dataset
CRC_tissues <-
  "Chapter_2_3/DEA_piRNA_CC_tissues_GRCh38_25_Jan_2022/all_comparisons_long_voom_TMM_salmon_fc_LFCs_25_Jan_2022.txt" %>%
  vroom()

# DE results of the public GSE160432 dataset
GSE160432_res <-
  "Chapter_2_2/DEA_piRNA_GSE160432_CRC_GRCh38_20_Jul_2022/all_comparisons_long_voom_TMMQW_salmon_fc_LFCs_20_Jul_2022.txt" %>%
  vroom()

```

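Select the DE piRNAs of each dataset (union of both quantification approaches), then keep the piRNAs that are DE in both datasets and discard those with discordant logFC signs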
```{r}
# Significant piRNAs of the in-house dataset in wide format: the logFC values
# are spread into one column per `quantification` value.
# NOTE(review): assumes `smallRNA:logFC` spans only smallRNA, quantification
# and logFC, so each piRNA ends up on a single row -- confirm.
DE_piRNA <- CRC_tissues %>%
  filter(adj.P.Val < 0.05, gene_type == "piRNA") %>%
  dplyr::select(smallRNA:logFC) %>%
  pivot_wider(names_from = quantification, values_from = logFC)

# Same selection for GSE160432, restricted to the tumour-vs-control contrast.
# `dplyr::select` is namespaced here as well, matching the call above.
DE_piRNA_GSE <- GSE160432_res %>%
  filter(
    contrast == "Tumour_v_Ctrl",
    adj.P.Val < 0.05,
    gene_type == "piRNA"
  ) %>%
  dplyr::select(smallRNA:logFC) %>%
  pivot_wider(names_from = quantification, values_from = logFC)

# piRNAs that are DE in both datasets; computed once and reused below.
# Columns sharing a name get the default join suffixes ".x" (in-house) and
# ".y" (GSE160432).
shared_piRNA <- inner_join(DE_piRNA, DE_piRNA_GSE, by = "smallRNA")

# A piRNA is discordant when at least one logFC column is positive and at
# least one is below -0.1. The original predicate repeated this condition
# twice (`A & B | B & A`); a single `A & B` is equivalent.
# NOTE(review): the thresholds are asymmetric (> 0 vs < -0.1), so a logFC in
# [-0.1, 0) never counts as "down" -- confirm this is intended.
not_concord_piRNA <- shared_piRNA %>%
  filter(
    if_any(c(-smallRNA), ~ .x > 0) & if_any(c(-smallRNA), ~ .x < -0.1)
  ) %>%
  pull(smallRNA)

# Common DE piRNAs with a concordant direction of change.
common_piRNA <- shared_piRNA %>%
  filter(!smallRNA %in% not_concord_piRNA)
```

Import the expression values from featureCounts for the common DE piRNAs
```{r}
# Read the object stored at `rds_path`, take its "TMM" element, convert it to
# log-CPM with edgeR::cpm() (prior.count = 4) and keep the rows named in `ids`.
# `drop = FALSE` keeps the result a matrix even when `ids` has length one;
# without it the later t()/scale()/Heatmap steps would receive a plain vector.
load_tmm_lcpm <- function(rds_path, ids) {
  lcpm <- read_rds(rds_path) %>%
    magrittr::extract2("TMM") %>% # get only the TMM normalized expressions
    edgeR::cpm(log = TRUE, prior.count = 4) # transform to lcpm
  lcpm[ids, , drop = FALSE] # subset to DE piRNAs
}

exprs_mat_CRC_tissues <- load_tmm_lcpm(
  "Chapter_2_3/EDA_piRNA_CC_tissues_GRCh38_24_Jan_2022/featureCounts/list_norm_dgls_featureCounts.rds",
  common_piRNA$smallRNA
)

exprs_mat_GSE160432_res <- load_tmm_lcpm(
  "Chapter_2_2/EDA_CRC_no_batch_GRCh38_17_Jun_2022/featureCounts/list_norm_dgls_featureCounts.rds",
  common_piRNA$smallRNA
)

```

import the groups table
```{r}
# Sample annotation of the in-house dataset: the "colours" table stored
# inside the "TMM" element, reduced to the columns of interest.
table_groups_CRC_tissues <-
  read_rds("Chapter_2_3/EDA_piRNA_CC_tissues_GRCh38_24_Jan_2022/featureCounts/list_norm_dgls_featureCounts.rds") %>%
  .[["TMM"]] %>%
  .[["colours"]] %>%
  select(name, group, group_col)

# Sample annotation of GSE160432. Group labels are renamed
# (healthy -> Control, CRC -> Tumour; anything else is kept as is) and the
# group colours are re-assigned from the renamed labels.
table_groups_GSE160432 <-
  read_rds("Chapter_2_2/EDA_CRC_no_batch_GRCh38_17_Jun_2022/featureCounts/list_norm_dgls_featureCounts.rds") %>%
  .[["TMM"]] %>%
  .[["colours"]] %>%
  select(sample_name, group, group_col) %>%
  mutate(group = as.character(group)) %>%
  mutate(
    group = case_when(
      group == "healthy" ~ "Control",
      group == "CRC" ~ "Tumour",
      TRUE ~ group
    )
  ) %>%
  mutate(
    group_col = case_when(
      group == "Control" ~ "#377eb8",
      group == "Tumour" ~ "#e41a1c",
      TRUE ~ "#f781bf"
    )
  ) %>%
  # back to a factor, levels in order of first appearance
  mutate(group = as_factor(group))
```

Scale the matrices for the heatmap
```{r}
# make the exprs matrix for the heatmap -----
# scale() standardises columns, so the matrices are transposed before and
# after to obtain one z-score profile per piRNA (row).
sc_mat_CRC <- exprs_mat_CRC_tissues %>% t() %>% scale() %>% t()
sc_mat_GSE <- exprs_mat_GSE160432_res %>% t() %>% scale() %>% t()

sc_mat_CRC %>% dim()
sc_mat_GSE %>% dim()

# One-line "min | median | max" summary of a numeric matrix, rounded to three
# decimals. `na.rm` is forwarded to min(), median() and max().
range_summary <- function(mat, na.rm = FALSE) {
  str_c(
    "min = ", round(min(mat, na.rm = na.rm), 3),
    "| median = ", round(median(mat, na.rm = na.rm), 3),
    "| max = ", round(max(mat, na.rm = na.rm), 3)
  )
}

range_summary(sc_mat_CRC)
range_summary(sc_mat_GSE)

# the annotation tables must list the samples in the matrix column order
stopifnot(identical(as.character(table_groups_CRC_tissues$name), colnames(sc_mat_CRC)))

stopifnot(identical(as.character(table_groups_GSE160432$sample_name), colnames(sc_mat_GSE)))

# make the logFC matrix for the heatmap -----
scfc_mat <- common_piRNA %>% column_to_rownames("smallRNA") %>% as.matrix()
scfc_mat %>% dim()
scfc_mat %>% head()
range_summary(scfc_mat, na.rm = TRUE)

# same row order as the expression matrices; drop = FALSE keeps a matrix
# even with a single common piRNA
scfc_mat <- scfc_mat[rownames(sc_mat_CRC), , drop = FALSE]

# shorten the row labels: strip everything from "_GR" onwards
rownames(sc_mat_CRC) <- rownames(sc_mat_CRC) %>% str_remove("_GR.+")
rownames(sc_mat_GSE) <- rownames(sc_mat_GSE) %>% str_remove("_GR.+")
rownames(scfc_mat) <- rownames(scfc_mat) %>% str_remove("_GR.+")

# The three matrices are drawn side by side later on, so their rows must
# describe the same piRNAs in the same order. Stop instead of only printing
# the comparison result.
stopifnot(
  identical(rownames(sc_mat_CRC), rownames(scfc_mat)),
  identical(rownames(sc_mat_GSE), rownames(scfc_mat))
)
```

make the heatmap
```{r}
# Colour ramps with break points at -1, 0 and 1 for the two z-score
# heatmaps and for the logFC heatmap.
f <- colorRamp2(c(-1, 0, 1), c("#53868B", "#8B8878", "#FFD700"))

f_GSE <- colorRamp2(c(-1, 0, 1),
                    c("#1874CD", "#8B8878", "#EEC900"))

f_lFC <- colorRamp2(c(-1, 0, 1),
                    c("#008B8B", "black", "#B22222"))

# fonts shared by the three annotation legends
annot_legend_fonts <- list(
  title_gp = gpar(fontsize = 15, fontface = "bold"),
  labels_gp = gpar(fontsize = 15, fontface = "bold")
)

# Named colour vector (group -> colour) with one entry per distinct
# group/colour pair instead of one entry per sample.
group_colours <- function(tbl) {
  tbl %>%
    select(group, group_col) %>%
    distinct() %>%
    deframe()
}

# mat
ha_1 <- HeatmapAnnotation(
  Group = table_groups_CRC_tissues$group,
  col = list(Group = group_colours(table_groups_CRC_tissues)),
  annotation_name_side = "left",
  annotation_legend_param = annot_legend_fonts
)

# mat GSE
ha_GSE <- HeatmapAnnotation(
  Group = table_groups_GSE160432$group,
  col = list(Group = group_colours(table_groups_GSE160432)),
  annotation_legend_param = annot_legend_fonts
)

# lFCS
# Quantification method of each logFC column: only the ".x"/".y" suffix
# added by the join is removed (the original "..$" cut the last two
# characters whatever they were).
lfc_methods <- colnames(scfc_mat) %>% str_remove("\\.[xy]$")

# One colour per distinct method, independent of the number and order of the
# logFC columns. Two colours are available, as in the original.
method_levels <- unique(lfc_methods)
method_palette <- wesanderson::wes_palettes$Moonrise1[c(1, 4)]
stopifnot(length(method_levels) <= length(method_palette))
method_colours <- method_palette[seq_along(method_levels)] %>%
  set_names(method_levels)

ha_1_LFCs <- HeatmapAnnotation(
  Method = lfc_methods,
  col = list(Method = method_colours),
  annotation_legend_param = annot_legend_fonts
)

# z-scored abundance, in-house dataset
ht_1 <- Heatmap(
  sc_mat_CRC,
  name = "z-score \nabundance \nin-house",
  col = f,
  top_annotation = ha_1,
  # titles
  row_title = "piRNAs",
  column_title = "in-house dataset",
  row_title_gp = gpar(fontsize = 20),
  column_title_gp = gpar(fontsize = 20),
  # row / column labels
  show_row_names = FALSE,
  show_column_names = FALSE,
  column_names_gp = gpar(fontsize = 20),
  column_names_rot = 45,
  # clustering
  clustering_distance_rows = "euclidean",
  clustering_method_rows = "ward.D2",
  clustering_distance_columns = "euclidean",
  clustering_method_columns = "ward.D2",
  row_dend_reorder = TRUE,
  show_row_dend = TRUE,
  # legend
  heatmap_legend_param = list(
    title_gp = gpar(fontsize = 15, fontface = "bold"),
    labels_gp = gpar(fontsize = 15)
  )
)

# z-scored abundance, GSE160432 dataset
ht_1_GSE <- Heatmap(
  sc_mat_GSE,
  name = "z-score \nabundance \nGSE160432",
  col = f_GSE,
  top_annotation = ha_GSE,
  # titles
  row_title = "piRNAs",
  column_title = "GSE160432",
  row_title_gp = gpar(fontsize = 20),
  column_title_gp = gpar(fontsize = 20),
  # row / column labels
  show_row_names = FALSE,
  show_column_names = FALSE,
  column_names_gp = gpar(fontsize = 20),
  column_names_rot = 45,
  # clustering
  clustering_distance_rows = "euclidean",
  clustering_method_rows = "ward.D2",
  clustering_distance_columns = "euclidean",
  clustering_method_columns = "ward.D2",
  row_dend_reorder = TRUE,
  show_row_dend = FALSE,
  show_column_dend = TRUE,
  # legend
  heatmap_legend_param = list(
    title_gp = gpar(fontsize = 15, fontface = "bold"),
    labels_gp = gpar(fontsize = 15)
  )
)

# log2 fold changes of both datasets; the only panel showing the piRNA names
ht_1_lFCs <- Heatmap(
  scfc_mat,
  name = "Log2 Fold Change",
  col = f_lFC,
  top_annotation = ha_1_LFCs,
  # titles
  column_title = "Log2 Fold Changes",
  row_title_gp = gpar(fontsize = 20),
  column_title_gp = gpar(fontsize = 20),
  # row / column labels
  show_row_names = TRUE,
  show_column_names = FALSE,
  column_names_gp = gpar(fontsize = 20),
  # clustering
  clustering_distance_rows = "spearman",
  clustering_method_rows = "ward.D2",
  clustering_distance_columns = "spearman",
  clustering_method_columns = "ward.D2",
  row_dend_reorder = TRUE,
  show_row_dend = TRUE,
  show_column_dend = TRUE,
  # legend
  heatmap_legend_param = list(
    title_gp = gpar(fontsize = 15, fontface = "bold"),
    labels_gp = gpar(fontsize = 15)
  )
)

# Draw the three heatmaps side by side on the current graphics device.
# A single definition is used for the on-screen preview and for the saved
# figure, so both carry the same title and layout (the original preview
# title lacked the word "common").
draw_common_piRNA_heatmap <- function() {
  draw(ht_1 + ht_1_GSE + ht_1_lFCs,
       column_title = str_glue("Heatmap of {nrow(sc_mat_GSE)} common DE piRNAs"),
       merge_legend = TRUE,
       column_title_gp = gpar(fontsize = 20, fontface = "bold"),
       heatmap_legend_side = "bottom", annotation_legend_side = "bottom")
}

# on-screen preview
draw_common_piRNA_heatmap()

# save as an uncompressed 600 dpi TIFF; `finally` closes the device even
# when drawing fails, so no half-written device is left open
tiff(filename = "FIG_44_piRNA_common_DE_CRC_GSE.tiff",
     compression = "none", height = 10, width = 14, units = "in", res = 600)
invisible(tryCatch(draw_common_piRNA_heatmap(), finally = dev.off()))

```


###  Gene predicted targets for the common DE piRNA from CRC patients

import the table with predicted targets
```{r}
# table of predicted piRNA gene targets
total_pred_targ <-
  "Chapter_2_3/piRNA_predicted_Targets.v02.txt" %>%
  vroom()
```

Search the predicted targets of the DE piRNAs (union of featureCounts and salmon) for each dataset and for the union of both datasets
```{r}
# predicted targets of the in-house DE piRNAs, one row per piRNA/gene pair
targets_inhouse <- total_pred_targ %>%
  filter(is.element(piRNA_id, DE_piRNA$smallRNA)) %>%
  distinct(piRNA_id, Target_gene_name, .keep_all = TRUE)

targets_inhouse %>%
  pull(Target_gene_name) %>%
  unique()

# predicted targets of the GSE160432 DE piRNAs
targets_GSE <- total_pred_targ %>%
  filter(is.element(piRNA_id, DE_piRNA_GSE$smallRNA)) %>%
  distinct(piRNA_id, Target_gene_name, .keep_all = TRUE)

targets_GSE %>%
  pull(Target_gene_name) %>%
  unique()

# predicted targets of the piRNAs that are DE in either dataset
piRNA_DE_targets_union <- total_pred_targ %>%
  filter(
    is.element(piRNA_id, union(DE_piRNA$smallRNA, DE_piRNA_GSE$smallRNA))
  ) %>%
  distinct(piRNA_id, Target_gene_name, .keep_all = TRUE)

vroom_write(
  piRNA_DE_targets_union,
  "Chapter_2_3/Pred_targets_union_GSE_inhouse_DE_piRNA.txt"
)

# unique target gene names used in the downstream steps
gene_targets <- piRNA_DE_targets_union %>%
  pull(Target_gene_name) %>%
  unique()
```

find PPIs for genes of interest
```{r}
# Human Phenotype Ontology (HPO) annotations
HPO_data <- wppi_hpo_data()

# keep the HPO annotations whose name mentions cancer or carcinoma
# (the pattern is case-sensitive)
HPO_interest <- filter(HPO_data, str_detect(Name, "cancer|carcinoma"))

# database knowledge used by wppi, including the OmniPath interactions
db <- wppi_data(datasets = c('omnipath', 'kinaseextra'))

# interaction graph built from the OmniPath table
graph_op <- graph_from_op(db$omnipath)

# subgraph around the predicted target genes
graph_op_targets <- subgraph_op(graph_op, gene_targets)
igraph::vcount(graph_op_targets)

# adjacency matrix weighted with the HPO terms of interest, followed by the
# random walk on it
w_adj_targets <- weighted_adj(graph_op_targets, db$hpo, HPO_interest)
rw_targets <- random_walk(w_adj_targets)

# candidate genes scored relative to the predicted targets
scores_targets <- prioritization_genes(
  graph_op_targets,
  rw_targets,
  gene_targets
)

wppi_scores_targets <- scores_targets$gene_symbol %>% unique() # 55 genes

# prioritised candidates together with the predicted targets themselves
total_network_targets <- union(wppi_scores_targets, gene_targets)

```

transform Gene Symbols to ENTREZ IDs
```{r}
# map the gene symbols of the network to Entrez gene IDs (human annotation)
Entr_Gene_Ids <- total_network_targets %>%
  bitr(
    fromType = "SYMBOL",
    toType = "ENTREZID",
    OrgDb = "org.Hs.eg.db"
  )

# number of submitted symbols vs. number of mapped rows
length(total_network_targets)
nrow(Entr_Gene_Ids) #75/76
```

Over-representation analysis of Reactome pathways
```{r}
# Reactome over-representation analysis of the Entrez IDs (p-value cutoff
# 0.05, gene IDs shown as symbols), stored as a tibble
reactome_path <- as_tibble(
  enrichPathway(
    gene = Entr_Gene_Ids$ENTREZID,
    pvalueCutoff = 0.05,
    readable = TRUE
  )
)

vroom_write(
  reactome_path,
  "Chapter_2_3/Pred_targets_Enr_REACTOME_union_CRC_datasets.txt"
)
```

Compute the rich factor of the enriched pathways
```{r}
# richFactor = Count divided by the first number of BgRatio
# (the "/<digits>" part of BgRatio is stripped before the conversion)
reactome_path <- mutate(
  reactome_path,
  richFactor = Count / as.numeric(sub("/\\d+", "", BgRatio))
)
```

PhD theme for plots
```{r}
# Zissou1 palette in reversed order (colour 5 down to colour 1)
wes_cols <- rev(wesanderson::wes_palettes$Zissou1[1:5])

# Shared plot styling: a Set1 fill scale plus a theme_bw() variant without
# panel border or strip background, with bold black text. It is a list so
# that both components can be added to a ggplot with a single `+`.
PhD_theme <- local({
  # bold black text element; extra settings are passed through
  bold_black <- function(...) {
    element_text(colour = "black", face = "bold", ...)
  }
  # 10-unit margin on every side of an axis title
  title_margin <- margin(t = 10, r = 10, b = 10, l = 10)

  list(
    scale_fill_brewer(palette = "Set1"),
    theme_bw() +
      theme(
        text = element_text(size = 20),
        plot.title = bold_black(hjust = 0.5),
        # panel
        panel.border = element_blank(),
        panel.grid.major = element_line(size = 0.3),
        panel.grid.minor = element_line(size = 0.1),
        # facet strips
        strip.background = element_blank(),
        strip.text = bold_black(size = 25),
        # axes
        axis.line = element_line(),
        axis.title.x = bold_black(margin = title_margin),
        axis.title.y = bold_black(margin = title_margin),
        axis.text.x = bold_black(angle = 45, hjust = 1, vjust = 1, size = 20),
        axis.text.y = bold_black(size = 20)
      )
  )
})
```

plot pathways
```{r}
# Lollipop chart of the first 30 rows of `reactome_path`: pathways on the
# y axis ordered by rich factor, point colour = adjusted p-value,
# point size = gene count.
reactomeenr_plot <- reactome_path %>%
  head(30) %>%
  ggplot(aes(richFactor, fct_reorder(Description, richFactor))) +
  geom_segment(aes(xend = 0, yend = Description)) +
  geom_point(aes(color = p.adjust, size = Count)) +
  scale_color_gradientn(colours = wes_cols,
                        guide = guide_colorbar(reverse = TRUE, order = 1)) +
  scale_size_continuous(range = c(2, 10)) +
  #facet_wrap(facets = "Pathway", ncol = 1) +
  xlab("Rich Factor") +
  ylab("Enriched Pathways") +
  ggtitle("Top 30 Enriched Reactome Pathways \nin CRC datasets of the predicted piRNA gene targets") +
  PhD_theme

# Save as an uncompressed 600 dpi TIFF. The plot is printed explicitly: a
# bare `reactomeenr_plot` is rendered only through top-level autoprinting,
# so the file would be empty when this code is source()d or wrapped in a
# function. `finally` closes the device even if rendering fails.
tiff(filename = "FIG_45_Enriched_REACTOME_CRC_targets.tiff",
     compression = "none", height = 12, width = 18, units = "in", res = 600)
invisible(tryCatch(print(reactomeenr_plot), finally = dev.off()))
```