wgcna.Rmd

---
title: "WGCNA Notebook"
output:
  html_document:
    toc: yes
    df_print: paged
  html_notebook:
    toc: yes
    fig_width: 12
    fig_height: 12
    fig_caption: yes
    number_sections: yes
---

```{r}
library(tidyverse)
library(stringr)
library(DESeq2)
library(magrittr)
library(genefilter)
library(readxl)
library(org.Mm.eg.db)
library(AnnotationDbi)
library(WGCNA)
```

```{r}
# Load the effector presence/absence sheet: one text column followed by
# 14 numeric columns.
effector <- read_excel(
  "allStrains_PresentAbsent_effectorlist_forKeshav.xlsx",
  col_types = c("text", rep("numeric", 14))
)
effector <- as.data.frame(effector)
# Use the values of the first column (the effector names) as row names ...
rownames(effector) <- effector[, 1]
# ... and drop that column (read in under the name `...1`) so that only the
# numeric columns remain.
effector <- subset(effector, select = -...1)
head(effector)
```
## Creating metadata for the DGE Analysis

DESeq2 needs sample information (metadata) for performing DGE analysis.
Let's create the sample information

```{r}
# Read the sample sheet. sep = "" splits fields on white space and
# stringsAsFactors = TRUE loads text columns as factors.
sample_ID <- read.csv(
  file = "~/Documents/AlinaRnaSeq/countmatrices/samples.csv",
  stringsAsFactors = TRUE,
  sep = ""
)
head(sample_ID)
```

```{r}
# Condition labels: 14 infected samples followed by 2 controls.
condition <- rep(c("Infected", "control"), times = c(14, 2))
# Combine the sample sheet with the condition labels and name the two columns.
coldata <- setNames(
  data.frame(sample_ID, condition),
  c("Sample_Name", "condition")
)
# The metadata now lives in the data frame `coldata`.
head(coldata)
```

### Tidying up the names for plots later!

#### First from coldata

```{r}
# Shorten the sample names by deleting every piece that matches the pattern
# below (run prefix, bam suffix, "_S<digits>" tag).
coldata[["Sample_Name"]] <- str_remove_all(
  coldata[["Sample_Name"]],
  pattern = "run6_trimmed_|_.bam|_S\\d\\d|_S\\d"
)
# Store the condition labels as a factor.
coldata[["condition"]] <- as.factor(coldata[["condition"]])
```

### Changing the names of samples (as per Alina)
```{r}
# Lookup table: cleaned-up sequencing ID -> short sample label.
# NOTE(review): the last entry uses "_R2" while every other ID ends in "_R1";
# confirm against the sample sheet.
sample_aliases <- c(
  "476_R1" = "T",
  "754_R1" = "S54",
  "755_R1" = "S55",
  "757_R1" = "L57",
  "758_R1" = "A58",
  "760_R1" = "L60",
  "761_R1" = "S61",
  "762_R1" = "A62",
  "763_R1" = "L63",
  "764_R1" = "A64",
  "765_R1" = "S65",
  "766_R1" = "L66",
  "768_R1" = "A68",
  "769_R1" = "L69",
  "Ctrl1_R1" = "C1",
  "Ctrl2_R2" = "C2"
)
# Replace every cell of coldata that equals an old ID with its new label,
# one ID at a time and in the order listed above.
for (old_id in names(sample_aliases)) {
  coldata[coldata == old_id] <- sample_aliases[[old_id]]
}
# Use the sample labels as the row names of coldata.
rownames(coldata) <- coldata$Sample_Name
coldata
```
## Input Count Matrix 
```{r}
# Load the raw count table.
count_matrix0 <- read.csv(
  file = "~/Documents/AlinaRnaSeq/countmatrices/newcounts.csv",
  stringsAsFactors = TRUE
)
# The first column holds the gene identifiers: use it as row names, then drop
# it (it is the column named X).
rownames(count_matrix0) <- count_matrix0[, 1]
count_matrix0 <- subset(count_matrix0, select = -X)
# Strip the same run prefix / suffix pieces that were removed from the sample
# names in coldata.
names(count_matrix0) <- str_remove_all(
  names(count_matrix0),
  pattern = "run6_trimmed_|_.bam|_S\\d\\d|_S\\d"
)
# The columns are then relabelled positionally with coldata$Sample_Name.
# NOTE(review): this assumes the count columns are in the same order as the
# rows of coldata; nothing here verifies that.
names(count_matrix0) <- coldata$Sample_Name
# Display the column names
names(count_matrix0)
```

## Annotating and Exporting ENSEMBL ID into Gene Symbols

Add the gene symbols (and ENTREZ IDs) annotated from the ENSEMBL IDs to the count matrix table. The symbol and ENTREZ columns are kept so that they can be added to the results table later.
```{r}
# Annotate the count table with gene symbols and re-key it by symbol.
# The row names of count_matrix0 are used as ENSEMBL keys.
cm_row <- rownames(count_matrix0)
head(cm_row)
# Map each ENSEMBL ID to a gene symbol; when an ID maps to several symbols
# only the first one is kept (multiVals = "first").
symbols <- mapIds(
  org.Mm.eg.db,
  keys = cm_row,
  column = c("SYMBOL"),
  keytype = "ENSEMBL",
  multiVals = "first"
)
# Drop unmapped IDs, then re-align the symbols to the row order of the count
# table; rows without a symbol become NA again through match().
symbols <- symbols[!is.na(symbols)]
symbols <- symbols[match(rownames(count_matrix0), names(symbols))]
head(symbols, 25)
# Store the symbols in a new column called genename.
count_matrix0$genename <- symbols
# Keep only rows without any NA (this removes genes that have no symbol) and
# drop rows that are exact duplicates of an earlier row.
count_matrix0 <- unique(count_matrix0[rowSums(is.na(count_matrix0)) == 0, ])
nrow(count_matrix0)
# Move the ENSEMBL IDs from the row names into their own column, E_ID.
count_matrix0 <- tibble::rownames_to_column(count_matrix0, "E_ID")
# Keep the first row for every gene symbol so that the symbols are unique and
# can serve as row names.
count_matrix0 <- distinct(count_matrix0[!duplicated(count_matrix0$genename), ])
# Use the gene symbols as row names of the count table.
rownames(count_matrix0) <- count_matrix0[, "genename"]
# Drop the genename and E_ID columns so that only the count columns remain.
count_matrix0 <- subset(count_matrix0, select = -c(genename, E_ID))
# Convert to a numeric matrix, the form passed on to DESeq2 below.
count_matrix0 <- as.matrix(count_matrix0)
class(count_matrix0) <- "numeric"
```

### The Count Matrix is:
```{r}
# Show the size (genes x samples) and the first rows of the annotated counts.
dim(count_matrix0)
head(count_matrix0)
```


## Differential Gene Expression analysis using DESeq2


```{r}
# Sanity checks before building the DESeq2 object.
# Every sample listed in coldata must be a column of the count matrix
# (this tests membership only, not the order) ...
all(rownames(coldata) %in% colnames(count_matrix0))
# ... and both objects must describe the same number of samples.
ncol(count_matrix0) == nrow(coldata)
dim(count_matrix0)
```

## Creating the DESeq Data set Object

```{r}
# Build the DESeq2 data set from the counts and the sample metadata, using
# condition as the only design variable, and run DESeq() on it.
dds <- DESeq(
  DESeqDataSetFromMatrix(
    countData = count_matrix0,
    colData = coldata,
    design = ~condition
  )
)
# Number of genes in the data set.
nrow(dds)
```

## VST Transformation

```{r}
# Keep only genes whose counts summed over all samples exceed 10.
keep <- rowSums(counts(dds)) > 10
dds <- dds[keep, ]
# Number of genes left after filtering.
nrow(dds)
# vsd <- vst(dds, blind = FALSE)
# head(assay(vsd), 3)
# Variance-stabilising transformation of the filtered data set.
vsd <- varianceStabilizingTransformation(dds)
```
```{r}
# Matrix of variance-stabilised values for the filtered data set.
wpn_vsd <- getVarianceStabilizedData(dds)
# Per-gene variance across samples and its 95th percentile.
rv_wpn <- rowVars(wpn_vsd)
# summary(rv_wpn)
q95_wpn <- quantile(rv_wpn, probs = 0.95)

# Keep only the genes whose variance lies above the 95th percentile.
normalized_input <- wpn_vsd[rv_wpn > q95_wpn, ]
head(normalized_input)
```
```{r}
# normalized_input_df <- data.frame(normalized_input)
```


```{r}
# Transpose so that samples are rows and genes are columns, then store the
# result as a data frame.
input_mat <- normalized_input %>%
  t() %>%
  as.data.frame()
head(input_mat)
```


# check for missing values and identify outliers

```{r}
# gsg <- goodSamplesGenes(count_matrix, verbose = 10)
# gsg$allOK
```
```{r}
# if (!gsg$allOK) {
#   # # Optionally, print the gene and sample names that were removed:
#   # if (sum(!gsg$goodGenes) > 0) {
#   #   print(paste(
#   #     "Removing genes:",
#   #     paste(names(count_matrix)[!gsg$goodGenes],
#   #       collapse = ", "
#   #     )
#   #   ))
#   # }
#   # if (sum(!gsg$goodSamples) > 0) {
#   #   print(paste(
#   #     "Removing samples:",
#   #     paste(rownames(count_matrix)[!gsg$goodSamples],
#   #       collapse = ", "
#   #     )
#   #   ))
#   # }
#   # Remove the offending genes and samples from the data:
#   count_matrix <- count_matrix[gsg$goodSamples, gsg$goodGenes]
# }
```

```{r}
# Drop the control samples so that only infected samples enter the network
# analysis. The controls are selected through coldata$condition rather than
# through hard-coded row numbers, so the step keeps working if samples are
# added, removed or reordered.
# The rows of input_mat are the columns of the count matrix, which were
# labelled with coldata$Sample_Name; verify that before using coldata as mask.
stopifnot(identical(rownames(input_mat), rownames(coldata)))
count_matrix <- input_mat[coldata$condition != "control", ]
dim(count_matrix)
```

```{r}
# First rows (samples) of the matrix used as WGCNA input.
head(count_matrix)
```

## Using the log2FC table with zero threshold
```{r}
# count_matrix_zerothresh <- read.csv("~/2D_Analysis/l2fctable_InfectedvsControl_BL6_zerothresh.csv", 
#                               stringsAsFactors = TRUE)
# #
# 
# rownames(count_matrix_zerothresh) <- count_matrix_zerothresh[, 2] # make the symbols as rownames
# count_matrix_zerothresh <- subset(count_matrix_zerothresh, select = -c(X, symbol)) # remove the X and symbol columns
# ```
# 
# ```{r}
# count_matrix <- count_matrix_zerothresh
```

## Cluster the samples

```{r}
# Average-linkage hierarchical clustering of the rows of count_matrix
# (the samples), based on the distances returned by dist().
sampleTree <- count_matrix %>%
  dist() %>%
  hclust(method = "average")
plot(sampleTree, hang = -1)
```

```{r}
# Average-linkage hierarchical clustering of the rows of the effector table.
sampleTree2 <- hclust(dist(effector), method = "average")
# Translate the numeric effector values into colours for the dendrogram plot.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
effColors <- numbers2colors(effector, signed = TRUE)
```
```{r fig.width=10, fig.height=10}
# Dendrogram of the effector table with one colour band per column.
plotDendroAndColors(
  dendro = sampleTree2,
  colors = effColors,
  groupLabels = names(effector),
  main = "Sample Dendrogram of Effector"
)
```

# Part 2

```{r}
# Candidate soft-thresholding powers. Each power is listed exactly once: the
# previous vector contained 1..20 twice, so every power was fitted twice and
# every label in the diagnostic plots was drawn twice.
powers <- seq_len(20)
# Run the network topology analysis for every candidate power.
sft <- pickSoftThreshold(count_matrix,
                         powerVector = powers,
                         dataIsExpr = TRUE,
                         RsquaredCut = 0.80,
                         verbose = 10,
                         blockSize = 10,
                         networkType = "unsigned"
                         )
```

```{r fig.width=10, fig.height=10}
# Diagnostic plots for choosing the soft-thresholding power, side by side.
# sizeGrWindow(9, 5)
par(mfrow = c(1, 2))
cex1 <- 1
# Columns of the fit table used below: 1 = power, 2 = fit R^2, 3 = slope,
# 5 = mean connectivity.
fit_power <- sft$fitIndices[, 1]
signed_r2 <- -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2]
mean_conn <- sft$fitIndices[, 5]

# Left panel: scale-free topology fit index against the power. The points are
# drawn as the power values themselves (type = "n" suppresses the symbols).
plot(fit_power, signed_r2,
  xlab = "Soft Threshold (power)",
  ylab = "Scale Free Topology Model Fit,signed R^2",
  type = "n",
  main = paste("Scale independence")
)
text(fit_power, signed_r2, labels = powers, cex = cex1, col = "red")
# Reference line at the R^2 cut-off of 0.8.
abline(h = 0.8, col = "green")

# Right panel: mean connectivity against the power.
plot(fit_power, mean_conn,
  xlab = "Soft Threshold (power)",
  ylab = "Mean Connectivity",
  type = "n",
  main = paste("Mean connectivity")
)
text(fit_power, mean_conn, labels = powers, cex = cex1, col = "red")
```
```{r}
# Convert the expression data frame into a plain numeric matrix.
# as.matrix() keeps the sample names as row names and the values unchanged;
# the previous apply() call dropped the row names and sent every value through
# a character round-trip.
count_matrix <- as.matrix(count_matrix)
# Fail early if a non-numeric column slipped in (the matrix would then be
# character).
stopifnot(is.numeric(count_matrix))
```

```{r}
# Make the unqualified name `cor` refer to WGCNA's implementation from here on
# (presumably the usual workaround for the cor() name clash inside
# blockwiseModules() - confirm).
cor <- WGCNA::cor
# Build the network and detect modules in one step.
# numericLabels = TRUE makes net$colors hold integer module labels
# (0 = unassigned). The notebook later converts these with labels2colors(),
# which maps integers straight to the standard colour sequence and 0 to grey.
# With numericLabels = FALSE the labels are already colour names, and
# labels2colors() re-codes character labels as factor levels in alphabetical
# order: every module would get a different colour than the one reported by
# blockwiseModules(), and the unassigned "grey" genes would be given the
# colour of a real module.
# NOTE(review): with this fix the colour names used further down ("black",
# "turquoise", ...) refer to the standard size-ordered WGCNA colours; re-check
# which modules are of interest.
net <- blockwiseModules(count_matrix,
  power = 15,
  TOMType = "signed",
  minModuleSize = 5,
  reassignThreshold = 0,
  mergeCutHeight = 0.05,
  numericLabels = TRUE,
  pamRespectsDendro = FALSE,
  saveTOMs = TRUE,
  saveTOMFileBase = "TOM",
  verbose = 5
)
```
```{r}
# Number of genes per module label.
table(net$colors)
```
```{r}
# Convert the module labels returned by blockwiseModules() into colour names,
# one per gene; used for plotting and for the eigengene calculation below.
modulecolors <- labels2colors(net$colors)
```


```{r}
# Gene dendrogram of the first block with the module colours shown underneath.
plotDendroAndColors(
  dendro = net$dendrograms[[1]],
  colors = modulecolors[net$blockGenes[[1]]],
  groupLabels = "Module Colors",
  dendroLabels = FALSE,
  autoColorHeight = TRUE,
  addGuide = TRUE,
  addTextGuide = TRUE,
  hang = 0.05,
  guideHang = 0.05
)
```


```{r}
# Lookup table with one row per gene: the gene identifier (taken from the
# names of net$colors) and the colour of the module it belongs to.
module_df <- data.frame(
  gene_id = names(net$colors),
  colors = labels2colors(net$colors)
)
```

# Determine Eigengenes - hypothetical central genes

## Get module Eigen genes per cluster

```{r}
# Dimensions of the expression matrix: genes are columns, samples are rows.
nGenes <- ncol(count_matrix)
nSamples <- nrow(count_matrix)
```

Recalculate MEs with color labels
```{r}
# Module eigengenes computed from the expression matrix and the colour labels.
MEs0 <- moduleEigengenes(
  expr = count_matrix,
  colors = modulecolors
)[["eigengenes"]]
```

Reorder Modules so that similar modules are next to each other
```{r}
# Transpose the effector table so that its rows line up with the rows
# (samples) of the module eigengenes for the correlation below.
# NOTE(review): this overwrites `effector` in place, so running the chunk a
# second time transposes it back.
effector <- t(effector) # transposing to have same number of rows and cols for correlations
```

```{r}
# Reorder the eigengenes so that similar modules sit next to each other.
MEs <- orderMEs(MEs0)
# Correlate every module eigengene with every effector column
# (use = "p": pairwise complete observations).
moduleTraitCor <- cor(MEs, effector, use = "p")
# Student p-values for those correlations, given the number of samples.
moduleTraitPvalue <- corPvalueStudent(moduleTraitCor, nSamples)
```

# Display correlations and respective Pvalues


```{r}
# Cell labels for the heatmap: the correlation on the first line and the
# p-value in brackets on the second, both rounded to one significant digit.
textMatrix <- paste0(
  signif(moduleTraitCor, 1),
  "\n (",
  signif(moduleTraitPvalue, 1),
  ")\n"
)
# paste0() returns a flat vector; restore the module x effector shape.
dim(textMatrix) <- dim(moduleTraitCor)

textMatrix_df <- as.data.frame(textMatrix)
```


## Display Correlation Value with heatmap plot

1. Each row corresponds to a module eigengene, column to a trait (effector in our case). 
2. Each cell contains the corresponding correlation and p-value (pvalue is displayed in brackets). 
3. The table is color-coded by correlation according to the color legend.

```{r fig.height=10, fig.width=20}
# Heatmap of all module-effector correlations, annotated with the correlation
# and p-value of every cell.
labeledHeatmap(Matrix = moduleTraitCor,
               xLabels = names(data.frame(effector)),
               yLabels = substring(names(MEs), 3),
               ySymbols = substring(names(MEs), 3),
               colorLabels = FALSE,
               colors = blueWhiteRed(50),
               textMatrix = textMatrix,
               setStdMargins = FALSE,
               cex.text = .7,
               main = paste("All relationships of T3SS Effector with Modules of Differential Genes"),
               keepLegendSpace = TRUE,
               #horizontalSeparator.y = seq_len(nrow(moduleTraitCor)),
               # One separator after every effector column. The positions are
               # derived from the matrix instead of a hard-coded 1:56, so the
               # call stays valid when the number of effectors changes.
               verticalSeparator.x = seq_len(ncol(moduleTraitCor)),
               plotLegend = TRUE
)
```

### Filter for only significant P values

```{r}
moduleTraitPvalue_df <- as.matrix(moduleTraitPvalue)
# Copy of the p-value matrix in which every value above 0.05 is replaced by 0;
# values at or below 0.05 (and any NA) are left as they are.
filtered_mat <- moduleTraitPvalue
filtered_mat[which(moduleTraitPvalue > 0.05)] <- 0
```

```{r}
# Cell labels for the "significant only" heatmap.
# Cells with p <= 0.05 show the correlation and, in brackets, the p-value
# (both rounded to one significant digit). All other cells are left empty.
# Previously non-significant cells were labelled with a p-value of "(0)",
# which reads as "extremely significant" - the opposite of what is meant.
is_significant <- !is.na(moduleTraitPvalue) & moduleTraitPvalue <= 0.05
textMatrix_filtered <- ifelse(
  is_significant,
  paste0(
    signif(moduleTraitCor, 1),
    "\n (",
    signif(moduleTraitPvalue, 1),
    ")\n"
  ),
  ""
)
# Make sure the labels have the module x effector shape of the heatmap.
dim(textMatrix_filtered) <- dim(moduleTraitCor)
```

```{r}
# Data-frame copy of the filtered label matrix.
textMatrix_filtered_df <- as.data.frame(textMatrix_filtered)
```
```{r fig.height=10, fig.width=20}
# Heatmap of the module-effector correlations, annotated with the filtered
# cell labels.
labeledHeatmap(Matrix = moduleTraitCor,
               xLabels = names(data.frame(effector)),
               yLabels = substring(names(MEs), 3),
               ySymbols = substring(names(MEs), 3),
               colorLabels = TRUE,
               colors = blueWhiteRed(100),
               textMatrix = textMatrix_filtered,
               setStdMargins = FALSE,
               cex.text = .7,
               main = paste("Statistically significant relationships of T3SS Effectors with Modules of Differential Genes"),
               keepLegendSpace = TRUE,
               # One separator after every module row and every effector
               # column. The positions are derived from the matrix instead of
               # the hard-coded 1:14 and 1:56, so the call stays valid when the
               # number of modules or effectors changes.
               horizontalSeparator.y = seq_len(nrow(moduleTraitCor)),
               verticalSeparator.x = seq_len(ncol(moduleTraitCor)),
               plotLegend = TRUE
)
```


```{r}
# One gene table per module of interest, selected by colour from module_df.
# names(count_matrix)[modulecolors == "black"]
black_module <- module_df %>% filter(colors == "black")
turquoise_module <- module_df %>% filter(colors == "turquoise")
royalblue_module <- module_df %>% filter(colors == "royalblue")
magenta_module <- module_df %>% filter(colors == "magenta")
salmon_module <- module_df %>% filter(colors == "salmon")
tan_module <- module_df %>% filter(colors == "tan")
lightgreen_module <- module_df %>% filter(colors == "lightgreen")
lightyellow_module <- module_df %>% filter(colors == "lightyellow")
grey60_module <- module_df %>% filter(colors == "grey60")

```

```{r}
# Export the gene lists of selected modules, one CSV file per module.
write.csv(magenta_module, file = "magenta_module.csv")
write.csv(black_module, file = "black_module.csv")
write.csv(turquoise_module, file = "turquoise_module.csv")
# The original call wrote `grey_module`, an object that is never created, so
# the chunk stopped with "object not found". grey60_module is the grey table
# that exists, so that one is exported under a matching file name.
# NOTE(review): if the unassigned ("grey") genes were meant instead, build
# them with filter(module_df, colors == "grey") and export that table.
write.csv(grey60_module, file = "grey60_module.csv")
```

# Relationship of Genes to Effectors and Important Modules

### Gene Significance (GS) 
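The association of individual genes with the effectors of interest (the trait, called "weight" in the WGCNA tutorial) can be quantified by the Gene Significance (GS), defined as the (absolute value of the) correlation between the gene's expression profile and the effector trait.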

Here, I don't have any quantitative variable for the effector data, so I will create one called `net_eff` (stored as the column `net_effector`): the row-wise sum of all effector presence values for each strain.


```{r}
# Data-frame copy of the (transposed) effector table.
effector_df <- as.data.frame(effector)
# Row-wise sum over all columns, ignoring missing values.
net_eff <- rowSums(effector_df, na.rm = TRUE, dims = 1)
# Keep the sum next to the individual columns as `net_effector`.
effector_df[["net_effector"]] <- net_eff
head(effector_df)
```
```{r}
# Turn the net effector sums into a one-column data frame named net_eff.
net_eff <- as.data.frame(net_eff)

# Module names (colours): the eigengene column names from the third character
# onwards, i.e. without their two-letter prefix.
modNames <- substring(names(MEs), first = 3)
```

### Module Membership (MM)

For each module, we define a quantitative measure of module membership (MM) as the correlation between the module eigengene and the gene expression profile. This allows us to quantify the similarity of all genes on the array to every module.

```{r}
# Module membership: correlation of every gene (column of count_matrix) with
# every module eigengene.
geneModuleMembership <- cor(count_matrix, MEs, use = "p") %>%
  as.data.frame()
head(geneModuleMembership)
# Student p-values for those correlations, given the number of samples.
MMPvalue <- geneModuleMembership %>%
  as.matrix() %>%
  corPvalueStudent(nSamples) %>%
  as.data.frame()
# Label the columns with the module names.
names(geneModuleMembership) <- paste0("MM_", modNames)
names(MMPvalue) <- paste0("p.MM", modNames)
```
```{r}
# Module memberships and their p-values side by side in one table.
geneModuleMembership_df <- cbind(geneModuleMembership, MMPvalue)
head(geneModuleMembership_df)
```

```{r}
# Gene significance: correlation of every gene (column of count_matrix) with
# the net effector sum.
geneEffSignificance <- as.data.frame(cor(count_matrix, net_eff, use = "p"))
# Student p-values for those correlations, given the number of samples.
GSPvalue <- as.data.frame(corPvalueStudent(as.matrix(geneEffSignificance), nSamples))

# Column names built from the trait name. The separator is "." - the previous
# "," produced the column name "GS,net_eff".
names(geneEffSignificance) <- paste0("GS.", names(net_eff))
names(GSPvalue) <- paste0("p.GS", names(net_eff))

# Combined table with short, fixed column names.
geneEffSignificance_df <- cbind(geneEffSignificance, GSPvalue)
names(geneEffSignificance_df) <- c("GS", "pvalue.GS")

head(geneEffSignificance_df)
```

## Intramodular Analysis

Use GS and MM, to identify genes that have a high significance for NetEff and also high module membership in interesting modules.

```{r}
# Module to inspect, its column position among the module names, and a
# logical mask of the genes assigned to it.
module <- "turquoise"
column <- match(module, modNames)
moduleGenes <- modulecolors == module

# Absolute module membership against absolute gene significance for the genes
# of the selected module, drawn in the module's own colour.
verboseScatterplot(
  x = abs(geneModuleMembership[moduleGenes, column]),
  y = abs(geneEffSignificance[moduleGenes, 1]),
  xlab = paste("Module membership in", module, "module"),
  ylab = "Gene Significance for Net Effectors",
  main = paste("Module Membership vs gene Significance \n"),
  cex.main = 1.2,
  cex.lab = 1.2,
  cex.axis = 1.2,
  col = module
)
```