# DESeq2 differential expression, gene annotation and fgsea pathway analysis


# NOTE(review): hard-coded absolute working directory. Every relative path
# used below (input CSVs and all output files) resolves against it.
setwd("/Users/birongzhang/Downloads/Sheeba/TCGA/DESeq2")
### set up
## DESeq2
# DESeq2 provides the dataset constructor, size-factor estimation, the model
# fit and the variance-stabilising transformation used below.
library(DESeq2)
# tidyverse supplies the %>% pipe and the dplyr verbs (arrange, filter, ...).
library(tidyverse)
library(ggplot2)
# biomaRt is used later for the Ensembl gene-ID -> symbol/Entrez lookup.
library(biomaRt)

# PCA plot
# NOTE(review): pcaplot() below comes from pcaExplorer; no PCAtools function
# appears to be called in the visible script - confirm it is still needed.
library(PCAtools)
library(pcaExplorer)

# volcano plot
library(EnhancedVolcano)


### import data
# Sample metadata: one row per sample, sample IDs taken from the first column.
coldata <- read.table("clin.csv", header = TRUE, sep = ",", row.names = 1)
head(coldata)
coldata$Treatment <- as.factor(coldata$Treatment)
levels(coldata$Treatment)
# read.delim() sanitises column names ("-" becomes "."), so the metadata IDs
# are rewritten the same way to make them comparable with the count header.
rownames(coldata) <- gsub("-", ".", rownames(coldata), fixed = TRUE)

# Count matrix: genes in rows (IDs from the first column), samples in columns.
counts <- read.delim("TP_gene-expression.csv", header = TRUE, sep = ",",
                     row.names = 1)
counts[1:3, 1:3]

# Fail early, with the offending IDs, if any counted sample lacks metadata;
# otherwise the subsetting below would silently create all-NA metadata rows.
missing_samples <- setdiff(colnames(counts), rownames(coldata))
if (length(missing_samples) > 0) {
  stop("Samples in the count matrix without metadata: ",
       paste(missing_samples, collapse = ", "), call. = FALSE)
}

# Put metadata rows in count-column order. drop = FALSE keeps a data frame
# even when the metadata has a single column (otherwise it collapses to a
# vector and loses its row names).
coldata <- coldata[colnames(counts), , drop = FALSE]
counts <- counts[, rownames(coldata), drop = FALSE]
# DESeq2 requires identical sample order in both tables: enforce it rather
# than just printing the comparison.
stopifnot(identical(colnames(counts), rownames(coldata)))

head(colnames(counts))
head(rownames(coldata))

# Save the aligned inputs actually used for the analysis.
write.csv(coldata, "clin_metadata.csv", row.names = TRUE)
write.csv(counts, "clin_counts.csv")


###DESeq2 dim(1309,9)
# Build the DESeq2 dataset with Treatment as the only design factor.
ds <- DESeqDataSetFromMatrix(countData = counts,
                             colData = coldata,
                             design = ~ Treatment)
ds

## filter
# Keep genes with at least 300 reads summed over all samples. This line was
# commented out, which left `keep` undefined for the two statements below.
keep <- rowSums(counts(ds)) >= 300
table(keep)

ds <- ds[keep, ]
ds

# Normalised counts (median-of-ratios family, "poscounts" estimator) for export.
# Equivalent by hand: t(t(counts(dds)) / sizeFactors(dds))
# NOTE(review): these size factors live on `dds` only. DESeq() below is run on
# `ds`, which carries none, so it estimates its own with its default method -
# confirm the exported table and the model are meant to differ in this way.
dds <- estimateSizeFactors(ds, type = "poscounts")
normlised <- counts(dds, normalized = TRUE) %>% data.frame()
write.csv(normlised, "DESeq2_Normalised.csv")

# Fit the negative-binomial model (size factors, dispersions, Wald test).
ds <- DESeq(ds)
alpha <- 0.05


#########PCA plot
levels(coldata$Treatment)
vsd <- varianceStabilizingTransformation(ds)

# Layers shared by both PCA figures: black-and-white theme without grid or
# panel border, solid black axis lines, centred title.
pca_style <- list(
  theme_bw(),
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    axis.line = element_line(colour = "black", size = 1),
    plot.title = element_text(hjust = 0.5)
  ),
  labs(title = "PCAplot")
)

# Figure 1: samples coloured by Treatment, no sample labels.
p <- pcaplot(vsd, intgroup = "Treatment", text_labels = FALSE) + pca_style
p
ggsave("PCAplot.png", p, dpi = 600, width = 8, height = 6)


# Figure 2: same plot with every sample labelled.
p <- pcaplot(vsd, intgroup = "Treatment", text_labels = TRUE) + pca_style
p
ggsave("PCAplot on samples.png", p, dpi = 600, width = 8, height = 6)


################ exceptional_responders Vs early_rapid_responders
# Extract the contrast, most significant genes first, and tag each gene as
# Up / Down / NC (no change).
levels(coldata$Treatment)
res <- results(
  ds,
  contrast = c("Treatment", "exceptional_responders", "early_rapid_responders"),
  alpha = 0.05
)
res <- arrange(data.frame(res), padj)
head(res)

fc_cutoff <- 2
padj_cutoff <- 0.05

# A gene is called only when it passes both the adjusted p-value cutoff and
# the fold-change cutoff (on the log2 scale); which() skips NA comparisons,
# so genes with missing statistics stay "NC".
lfc_cutoff <- log2(fc_cutoff)
passes_padj <- res$padj < padj_cutoff
res$Tag <- "NC"
res$Tag[which(passes_padj & res$log2FoldChange > lfc_cutoff)] <- "Up"
res$Tag[which(passes_padj & res$log2FoldChange < -lfc_cutoff)] <- "Down"
table(res$Tag)
# Down    NC    Up 
# 197 56201   204 

# Full table, then the called genes only.
write.csv(res, "DESeq2.csv")

res_sig <- res[res$Tag %in% c("Up", "Down"), ]
write.csv(res_sig, "DESeq2_sig.csv")

library(rtracklayer)
library(AnnotationDbi)
library(GenomicFeatures)
library(tidyverse)
library(biomaRt)

# Human
###gene annotation
# Re-read the DESeq2 table written above; row names are the Ensembl gene IDs.
dataFilt.brca <- read.delim("DESeq2.csv", header = TRUE, sep = ",",
                            row.names = 1)
head(dataFilt.brca)
# Query the main Ensembl site: the "uswest.ensembl.org" mirror used here
# previously has been retired and no longer answers BioMart requests.
ensembl <- useMart("ENSEMBL_MART_ENSEMBL",
                   dataset = "hsapiens_gene_ensembl",
                   host = "https://www.ensembl.org")

# Look up HGNC symbol and Entrez ID for every Ensembl gene ID in the table.
# (For mouse data the symbol attribute would be 'mgi_symbol' /
# 'external_gene_name' instead of 'hgnc_symbol'.)
bioM.res <- getBM(attributes = c("ensembl_gene_id", "hgnc_symbol",
                                 "entrezgene_id"),
                  filters = "ensembl_gene_id",
                  values = rownames(dataFilt.brca),
                  mart = ensembl)
dim(bioM.res)
colnames(bioM.res) <- c("Geneid", "hgnc_symbol", "GeneID")
bioM.res[1:3, 1:3]

# match() takes the first annotation row per gene, so genes with several
# symbol/Entrez mappings keep one of them and unmatched genes get NA.
dataFilt.brca.anno <- cbind(
  bioM.res[match(rownames(dataFilt.brca), bioM.res$Geneid), ],
  dataFilt.brca
)
dataFilt.brca.anno[1:3, 1:3]

write.csv(dataFilt.brca.anno, "DESeq2_anno.csv", row.names = FALSE)

# Annotated version of the significant (Up/Down) gene table.
res_sig <- dataFilt.brca.anno[dataFilt.brca.anno$Tag %in% c("Up", "Down"), ]
write.csv(res_sig, "DESeq2_sig_anno.csv", row.names = FALSE)

####select
# Significant genes per direction, most significant first.
res_up <- res_sig %>% dplyr::filter(Tag == "Up") %>% arrange(padj)
res_down <- res_sig %>% dplyr::filter(Tag == "Down") %>% arrange(padj)

# Genes to label on the volcano plot: the 10 most significant per direction
# that actually have a symbol (a blank/NA symbol gives nothing to print and a
# blank entry would match every unnamed gene).
top_symbols <- function(tbl, n = 10) {
  symbols <- tbl$hgnc_symbol
  head(symbols[!is.na(symbols) & symbols != ""], n)
}
selectLab_italics <- c(top_symbols(res_up), top_symbols(res_down))


res <- dataFilt.brca.anno
# Per-gene point colours, using the same cutoffs that defined the Up/Down
# tags (fc_cutoff and padj_cutoff) instead of repeating them as literals.
lfc_threshold <- log2(fc_cutoff)
keyvals <- ifelse(
  res$log2FoldChange < -lfc_threshold & res$padj < padj_cutoff, "royalblue",
  ifelse(res$log2FoldChange > lfc_threshold & res$padj < padj_cutoff, "red",
         "black"))
# Genes with missing statistics are drawn as unchanged.
keyvals[is.na(keyvals)] <- "black"

# The names of the colour vector become the legend labels.
names(keyvals) <- unname(
  c(royalblue = "down regulation", black = "NC", red = "up regulation")[keyvals]
)


# Volcano plot
res$Tag <- as.factor(res$Tag)
# NOTE(review): col_li is not referenced anywhere below; kept unchanged in
# case code outside this section relies on it.
col_li <- c("#cd2631", "#e9e9e9", "#4b79ae")
names(col_li) <- c("up", "NC", "down")

# "N=<count>" annotations for the Up and Down groups, placed at x = +5 / -5.
deseq2_info <- res %>% group_by(Tag) %>% summarise(Num = n())
deseq2_info$Text <- sprintf("N=%d", deseq2_info$Num)
deseq2_info <- deseq2_info[which(deseq2_info$Tag != "NC"), ]
# Assign positions row-wise so this also works when one direction is empty
# (copying the negated "Down" position into "Up" failed in that case).
deseq2_info$x <- ifelse(deseq2_info$Tag == "Up", 5, -5)
deseq2_info

# colCustom overrides the default four-colour scheme, so no `col` is passed.
p <- EnhancedVolcano(res,
                     lab = res$hgnc_symbol,
                     x = "log2FoldChange",
                     y = "padj",
                     title = "exceptional_responders VS. early_rapid_responders",
                     selectLab = selectLab_italics,
                     colCustom = keyvals,
                     pCutoff = padj_cutoff,
                     FCcutoff = lfc_threshold,
                     pointSize = 2.0,
                     labSize = 3.0,
                     shape = 16,
                     colAlpha = 1) +
  geom_text(
    aes(x = x, y = 11, label = Text),
    data = deseq2_info,
    size = 9,
    color = "black"
  )
p
ggsave("Volcanoplot.png", p, dpi = 300, width = 8, height = 6)


# https://github.com/ctlab/fgsea
# http://bioconductor.org/packages/devel/bioc/vignettes/fgsea/inst/doc/fgsea-tutorial.html
############ build the ranked gene list for GSEA
setwd("/Users/birongzhang/Downloads/Sheeba/TCGA/DESeq2")
# Start from the annotated DESeq2 table already in memory. The script used to
# read "DESeq2_anno.csv" first and then immediately overwrite the result with
# `res`, so that read was dead work and has been dropped.
res_entrez <- res
colnames(res_entrez)[1] <- "ensembl_gene_id"
colnames(res_entrez)[3] <- "entrezgene_id"
head(res_entrez)

## Remove any ensembl_gene_id duplicates
res_entrez <- res_entrez[!duplicated(res_entrez$ensembl_gene_id), ]

## Remove genes without an Entrez ID (reduces the data by quite a bit)
res_entrez <- res_entrez %>% dplyr::filter(!is.na(entrezgene_id))

## Remove any entrezgene_id duplicates (first occurrence is kept)
res_entrez <- res_entrez[!duplicated(res_entrez$entrezgene_id), ]

## Remove genes without a test statistic
res_entrez <- res_entrez %>% dplyr::filter(!is.na(stat))

# data conversion
# Ranking metric: the DESeq2 test statistic of every remaining gene
kegg_gene_list <- res_entrez$stat
# Name vector with ENTREZ ids
names(kegg_gene_list) <- res_entrez$entrezgene_id
# omit any NA values
kegg_gene_list <- na.omit(kegg_gene_list)
# sort the list in decreasing order (required for clusterProfiler)
kegg_gene_list <- sort(kegg_gene_list, decreasing = TRUE)
head(kegg_gene_list)


# Sanity check: how many genes rank positive vs. negative
a <- res_entrez$stat
table(a > 0)
#FALSE  TRUE 
#22369 21509 


########GSEA
setwd("/Users/birongzhang/Downloads/Sheeba/TCGA/DESeq2/pathway")
# One-time package setup: run interactively if needed, not on every analysis
# run. biocLite() is defunct (and was never defined in this script), and
# install_version() was called without its package being attached, so both
# calls stopped the script here.
#   BiocManager::install("GSEABase")
#   remotes::install_version("rvcheck", version = "0.1.8",
#                            repos = "http://cran.us.r-project.org")

library(data.table)
library(fgsea)
library(ggplot2)
library(clusterProfiler)
# NOTE(review): mouse annotation package in a human analysis; nothing in the
# visible script uses it - confirm whether it can go.
library(org.Mm.eg.db)
library(GSEABase)
library(ggplot2)
library(dplyr)
library(stringr)
library(forcats)

# http://bioinf.wehi.edu.au/software/MSigDB/
# download reference

# Each .rdata file loads one named list of gene sets (Hs.H, Hs.c2, Hs.c7).
# The list is flattened to a long table (one row per set/gene pair) and saved
# for reference. ldply() is called as plyr::ldply() because plyr is never
# attached here, and attaching it after dplyr would mask dplyr verbs.

# h
load("/Users/birongzhang/Downloads/Sheeba/TCGA/DESeq2/pathway/human_H_v5p2.rdata")
h <- plyr::ldply(Hs.H, data.frame)
head(h)
write.csv(h, "H hallmark gene sets.csv")

# C2
load("/Users/birongzhang/Downloads/Sheeba/TCGA/DESeq2/pathway/human_c2_v5p2.rdata")
c2 <- plyr::ldply(Hs.c2, data.frame)
head(c2)
write.csv(c2, "C2.csv")


# C7 immunologic signatures
load("/Users/birongzhang/Downloads/Sheeba/TCGA/DESeq2/pathway/human_c7_v5p2.rdata")
c7 <- plyr::ldply(Hs.c7, data.frame)
head(c7)
write.csv(c7, "C7 immunologic signatures .csv")

## fgsea
## C7 4872
head(Hs.c7)
head(kegg_gene_list)

# Pre-ranked GSEA of the DESeq2 statistic against the C7 collection.
msig <- fgsea(pathways = Hs.c7,
              stats    = kegg_gene_list,
              minSize  = 15,
              maxSize  = 500)

# Plain data-frame copy sorted by adjusted p-value (NA last). `msig` itself
# stays a data.table for the data.table-style queries further down.
msig_df <- as.data.frame(msig)
msig_df <- msig_df[order(msig_df$padj), ]
rownames(msig_df) <- NULL
# Explicit NA check so pathways without an adjusted p-value never show up as
# all-NA rows in the significant set.
msig_sig <- msig_df[!is.na(msig_df$padj) & msig_df$padj < 0.05, ] #1071

# leadingEdge is a list column that cannot be written to CSV; drop it by
# name rather than by position.
flat_cols <- setdiff(names(msig_df), "leadingEdge")
msig_DF <- msig_df[, flat_cols, drop = FALSE]
write.csv(msig_DF, "fgsea_c7_human.csv")

msig_sig_DF <- msig_sig[, flat_cols, drop = FALSE]
write.csv(msig_sig_DF, "fgsea_c7_human_sig.csv", quote = FALSE)

# Ten lowest-p-value pathways per enrichment direction; the negative ones are
# reversed so the GSEA table below reads from most up to most down.
topPathwaysUp <- msig[ES > 0][head(order(pval), n = 10), pathway]
topPathwaysDown <- msig[ES < 0][head(order(pval), n = 10), pathway]
topPathways <- c(topPathwaysUp, rev(topPathwaysDown))

## barplot
# NES of the top pathways that are also significant.
p <- ggplot(msig_sig[msig_sig$pathway %in% topPathways, ],
            aes(x = NES, y = fct_reorder(pathway, NES), fill = NES)) +
  geom_bar(stat = "identity") +
  scale_fill_continuous(low = "red", high = "blue",
                        guide = guide_colorbar(reverse = TRUE)) +
  labs(title = "C7_top10") +
  theme_minimal() + ylab(NULL)
p
ggsave("C7_significant_pathway_top15.eps", p, dpi = 300, width = 8, height = 5)

## dotplot
# clusterProfiler's dotplot() does not accept fgsea output (the previous call
# on as.matrix(msig) failed and left the png device open), so the same view
# is drawn with ggplot2: up to 15 most significant pathways per direction,
# point size = gene-set size, colour = adjusted p-value.
dot_df <- msig_sig_DF
dot_df$.sign <- ifelse(dot_df$NES > 0, "activated", "suppressed")
dot_df <- dot_df %>%
  dplyr::group_by(.sign) %>%
  dplyr::filter(dplyr::row_number() <= 15) %>%
  dplyr::ungroup()
p_dot <- ggplot(dot_df,
                aes(x = NES, y = fct_reorder(pathway, NES),
                    size = size, colour = padj)) +
  geom_point() +
  facet_grid(. ~ .sign, scales = "free_x") +
  labs(title = "C7_immunologic signatures_top15 significant") +
  theme_bw() + ylab(NULL)
ggsave("C7_significant_pathway_top15_dotplot.png", p_dot,
       dpi = 600, width = 10, height = 6)

## GSEA table of the top pathways
png("C7_significant_pathway_top20.png", width = 15, height = 6, units = 'in', res = 600)
plotGseaTable(Hs.c7[topPathways], kegg_gene_list, msig, gseaParam = 0.5)
dev.off()


## C2 3739
# Pre-ranked GSEA of the DESeq2 statistic against the C2 collection.
msig <- fgsea(pathways = Hs.c2,
              stats    = kegg_gene_list,
              minSize  = 15,
              maxSize  = 500)

# Plain data-frame copy sorted by adjusted p-value (NA last). `msig` itself
# stays a data.table for the data.table-style queries further down.
msig_df <- as.data.frame(msig)
msig_df <- msig_df[order(msig_df$padj), ]
rownames(msig_df) <- NULL
# Explicit NA check so pathways without an adjusted p-value never show up as
# all-NA rows in the significant set.
msig_sig <- msig_df[!is.na(msig_df$padj) & msig_df$padj < 0.05, ] #494

# leadingEdge is a list column that cannot be written to CSV; drop it by
# name rather than by position.
flat_cols <- setdiff(names(msig_df), "leadingEdge")
msig_DF <- msig_df[, flat_cols, drop = FALSE]
write.csv(msig_DF, "fgsea_c2_human.csv")

msig_sig_DF <- msig_sig[, flat_cols, drop = FALSE]
write.csv(msig_sig_DF, "fgsea_c2_human_sig.csv", quote = FALSE)

# Ten lowest-p-value pathways per enrichment direction; the negative ones are
# reversed so the GSEA table below reads from most up to most down.
topPathwaysUp <- msig[ES > 0][head(order(pval), n = 10), pathway]
topPathwaysDown <- msig[ES < 0][head(order(pval), n = 10), pathway]
topPathways <- c(topPathwaysUp, rev(topPathwaysDown))

## barplot
# NES of the top pathways that are also significant.
p <- ggplot(msig_sig[msig_sig$pathway %in% topPathways, ],
            aes(NES, fct_reorder(pathway, NES), fill = NES)) +
  geom_bar(stat = "identity") +
  scale_fill_continuous(low = "red", high = "blue",
                        guide = guide_colorbar(reverse = TRUE)) +
  labs(title = "C2_top10") +
  theme_minimal() + ylab(NULL)
p
ggsave("C2_significant_pathway_top15.eps", p, dpi = 300, width = 8, height = 5)

## dotplot
# clusterProfiler's dotplot() does not accept fgsea output (the previous call
# on as.matrix(msig) failed and left the png device open), so the same view
# is drawn with ggplot2: up to 15 most significant pathways per direction,
# point size = gene-set size, colour = adjusted p-value. The title no longer
# carries the "immunologic signatures" label copied from the C7 section.
dot_df <- msig_sig_DF
dot_df$.sign <- ifelse(dot_df$NES > 0, "activated", "suppressed")
dot_df <- dot_df %>%
  dplyr::group_by(.sign) %>%
  dplyr::filter(dplyr::row_number() <= 15) %>%
  dplyr::ungroup()
p_dot <- ggplot(dot_df,
                aes(x = NES, y = fct_reorder(pathway, NES),
                    size = size, colour = padj)) +
  geom_point() +
  facet_grid(. ~ .sign, scales = "free_x") +
  labs(title = "C2_curated gene sets_top15 significant") +
  theme_bw() + ylab(NULL)
ggsave("C2_significant_pathway_top15_dotplot.png", p_dot,
       dpi = 600, width = 10, height = 6)

## GSEA table of the top pathways
png("C2_significant_pathway_top20.png", width = 15, height = 6, units = 'in', res = 600)
plotGseaTable(Hs.c2[topPathways], kegg_gene_list, msig, gseaParam = 0.5)
dev.off()

## h 50
# Pre-ranked GSEA of the DESeq2 statistic against the hallmark collection.
msig <- fgsea(pathways = Hs.H,
              stats    = kegg_gene_list,
              minSize  = 15,
              maxSize  = 500)

# Plain data-frame copy sorted by adjusted p-value (NA last). `msig` itself
# stays a data.table for the data.table-style queries further down.
msig_df <- as.data.frame(msig)
msig_df <- msig_df[order(msig_df$padj), ]
rownames(msig_df) <- NULL
# Explicit NA check so pathways without an adjusted p-value never show up as
# all-NA rows in the significant set.
msig_sig <- msig_df[!is.na(msig_df$padj) & msig_df$padj < 0.05, ] #18

# leadingEdge is a list column that cannot be written to CSV; drop it by
# name rather than by position.
flat_cols <- setdiff(names(msig_df), "leadingEdge")
msig_DF <- msig_df[, flat_cols, drop = FALSE]
write.csv(msig_DF, "fgsea_h_human.csv")

msig_sig_DF <- msig_sig[, flat_cols, drop = FALSE]
write.csv(msig_sig_DF, "fgsea_h_human_sig.csv", quote = FALSE)

## barplot
# All significant hallmark sets. The row range used to be hard-coded as 1:18,
# which adds all-NA rows whenever fewer than 18 sets are significant and
# truncates the plot when there are more.
p <- ggplot(msig_sig, aes(NES, fct_reorder(pathway, NES), fill = NES)) +
  geom_bar(stat = "identity") +
  scale_fill_continuous(low = "red", high = "blue",
                        guide = guide_colorbar(reverse = TRUE)) +
  labs(title = "h_hallmark gene sets_significant") +
  theme_minimal() + ylab(NULL)
p
ggsave("h_significant_pathway_top15.eps", p, dpi = 300, width = 8, height = 5)

## dotplot
# clusterProfiler's dotplot() does not accept fgsea output (the previous call
# on as.matrix(msig) failed and left the png device open), so the same view
# is drawn with ggplot2: up to 15 most significant sets per direction,
# point size = gene-set size, colour = adjusted p-value.
dot_df <- msig_sig_DF
dot_df$.sign <- ifelse(dot_df$NES > 0, "activated", "suppressed")
dot_df <- dot_df %>%
  dplyr::group_by(.sign) %>%
  dplyr::filter(dplyr::row_number() <= 15) %>%
  dplyr::ungroup()
p_dot <- ggplot(dot_df,
                aes(x = NES, y = fct_reorder(pathway, NES),
                    size = size, colour = padj)) +
  geom_point() +
  facet_grid(. ~ .sign, scales = "free_x") +
  labs(title = "h_hallmark gene sets_top15 significant") +
  theme_bw() + ylab(NULL)
ggsave("h_significant_pathway_top15_dotplot.png", p_dot,
       dpi = 600, width = 10, height = 6)

# Ten lowest-p-value sets per enrichment direction; the negative ones are
# reversed so the GSEA table reads from most up to most down.
topPathwaysUp <- msig[ES > 0][head(order(pval), n = 10), pathway]
topPathwaysDown <- msig[ES < 0][head(order(pval), n = 10), pathway]
topPathways <- c(topPathwaysUp, rev(topPathwaysDown))

## GSEA table of the top sets
# The gene-set list loaded above is `Hs.H`; the previous `Hs.h` does not exist
# (R names are case-sensitive) and stopped the script with the device open.
png("h_significant_pathway_top20.png", width = 15, height = 6, units = 'in', res = 600)
plotGseaTable(Hs.H[topPathways], kegg_gene_list, msig, gseaParam = 0.5)
dev.off()