-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess-protein-groups-with-gene-names.R
178 lines (138 loc) · 6.98 KB
/
process-protein-groups-with-gene-names.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
## ---------------------------
##
## Script name: process-protein-groups.R
##
## Purpose of script: process proteinGroups.txt file from MaxQuant to include in Expression Atlas
##
## Author: Dr. Andrew Jarnuczak
##
## Date Created: 2019-07-01
##
## Copyright (c) Andrew Jarnuczak, 2019
## Email: [email protected]
##
## ---------------------------
##
## Notes: Steps performed in the script
## 1. clean up proteinGroups file (remove CONTAMINANTS and REVERSE)
## 2. for MS1 based quantification results (i.e. label free or SILAC) normalise iBAQ intensities to ppb
## 3. perform proteinGroup to Gene ID mapping based on mapping provided by UniProt
## 4. perform any other necessary clean-up steps (this will depend on the dataset, for example if there are any "technical" assays included in the original dataset that should be removed before data is included in Expression Atlas)
##
##
## ---------------------------
# https://bioconductor.org/packages/release/bioc/manuals/mygene/man/mygene.pdf
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("mygene")
library(mygene)
setwd("/Users/ananth/Documents/MaxQuant_Bechmarking/Human/PXD012203_Reanalysis/")
#In case that the library mygene was not already installed
#if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install("mygene")
if ( !exists("EXP_TYPE")) warning("Please specify experiment type variable: EXP_TYE")
## Specify experiment type, i.e. what type of quantification is used, MS1-based or MS2-based. As a rule of thumb, label free and SILAC is MS1-based, and iTRAQ and TMT is MS2-based.
# EXP_TYPE <- "MS2-quant"
EXP_TYPE <- "MS2-quant"
##############
# define FOT normalisation function
# FOT stands for Fraction Of Total. In this normalisation method each protein iBAQ intensity value is scaled to the total amount of signal in a given MS run (column) and transformed to parts per billion (ppb)
fot.normalise <- function(x){
data.sum <- apply(x, 2, function(y){sum(y, na.rm=TRUE)})
# barplot((data.sum), log = "y")
#
##### do ppm normalisation
x.mat <- as.matrix(x)
x.mat.ppb <- apply(x.mat, 2, function(i) i/sum(i, na.rm = T) * 1000000000 )
x.mat.ppb <- as.data.frame(x.mat.ppb)
colnames(x.mat.ppb) <- paste("ppb.", colnames(x.mat.ppb), sep = "")
return(x.mat.ppb)
}
##############
##### read the protein expression matrix produced by MaxQuant
tmp <- read.table( "proteinGroups.txt" , quote = "\"", header = TRUE, sep = "\t", stringsAsFactors = FALSE, comment.char = "#")
## clean up
tmp <- tmp[ tmp[, "Reverse"] != "+", ]
tmp <- tmp[ tmp[, "Potential.contaminant"] != "+", ]
## consider those protein groups with more than 1 peptides matched
tmp <- tmp[ tmp[, "Peptides"] > 1, ]
##
# if experiment is label free use iBAQ quant:
if(EXP_TYPE == "MS1-quant"){
message("Collecting iBAQ quantification")
tmp <- tmp[ ,c(2, grep("iBAQ.", colnames(tmp))) ]
}
# if experiment is TMT, use reporter intensities, if experiment iTRAQ use Intensities. Note, MaxQuant might change how it reports iTRAQ and TMT intensities.
if(EXP_TYPE == "MS2-quant"){
message("Collecting MS2 intensities")
if( any(grepl("Reporter.intensity.corrected", colnames(tmp))) ){
# code modified by Ananth:
# Filter to consider only protein groups for which the at least 50 % of the sample replicates
# have non-zero intensity values
##### Read sample replicate information
##### Note: User defined annotation file
sample_replicates <- read.table("sample_replicates.txt" , quote = "\"", header = TRUE, sep = "\t", stringsAsFactors = FALSE, comment.char = "#")
for(i in 1:nrow(sample_replicates))
{
sample_replicates[i,] <- gsub("^","Reporter.intensity.corrected.", sample_replicates[i,], perl=TRUE)
subdata <- tmp[, c(unname(unlist(sample_replicates[i,-1])))]
tmp$replicate_nonzero_instensity_count <- apply(subdata[1:ncol(subdata)], 1, function(x) length(which(x != 0)) )
tmp <- tmp[tmp$replicate_nonzero_instensity_count >= ncol(subdata)/2,]
}
### code modification stops
tmp <- tmp[ ,c(2, grep("Reporter.intensity.corrected.[0-9].{1,}", colnames(tmp))) ]
} else {
tmp <- tmp[ ,c(2, grep("Intensity.", colnames(tmp))) ]
}
}
#####
Majority.protein.IDs <- tmp$Majority.protein.IDs
tmp <- tmp[ , -1]
# for iTRAQ and TMT ppb normalization might not be the best method
if(EXP_TYPE == "MS1-quant"){
tmp <- fot.normalise(tmp)
}
#
tmp <- data.frame( cbind(Majority.protein.IDs, tmp, stringsAsFactors = FALSE) )
##############
tmp[tmp == 0] <- NA
tmp[ tmp == "NaN"] <- NA
##### perform the protein group to gene mapping
# this loop will take some time
data.to.map <- tmp
data.to.map$"ENSG" <- "NA"
data.to.map$"Gene.Name" <- "NA"
for(i in 1:nrow(data.to.map)){
x <- data.frame(strsplit(data.to.map[ i, "Majority.protein.IDs"], split = ";"), stringsAsFactors = FALSE)
x_temp <- regmatches(x[,1],regexpr("[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}", x[,1]))
f = file()
sink(file=f)
a <- tryCatch(queryMany(x_temp, scopes="uniprot", fields=c("ensembl.gene", "symbol"), species="human"),
error = function(e) {print(0)})
sink()
close(f)
# DFrame not DataFrame
if (class(a)=="DFrame"){
data.to.map[ i, "ENSG"] <- paste( unique(unlist(a$ensembl.gene[!is.na(a$ensembl.gene)])), collapse = ";")
data.to.map[ i, "Gene.Name"] <- paste( unique(unlist(a$symbol[!is.na(a$symbol)])), collapse = ";")}
print(paste0("Processing protein groups... ", as.character(round(i*100/nrow(data.to.map),1)),"%"))
}
# remove protein groups that have no mapping to an ENSG gene IDs
data.to.map <- data.to.map[ data.to.map$ENSG != "" , ]
data.to.map <- data.to.map[ data.to.map$ENSG != "NA" , ]
#tmp <- data.to.map
# remove all protein groups that map to multiple ENSG gene IDs (this removes a lot of proteins) - the reasoning to remove these cases is that we cannot establish for sure which gene is contributing the signal to the protein abundance; all genes contribute equally or one gene is a majority?
data.to.map <- data.to.map[ grep(";", data.to.map$ENSG, invert = TRUE) , ]
# for genes that map to multiple proteins, in order to determine the amount of protein that gene is producing we sum the protein quantification values
#xx.Majority.protein.IDs <- aggregate(data.to.map$Majority.protein.IDs, list(ESNG = data.to.map$ENSG ), function(x) paste0( (x) ) )
#
colnames(data.to.map)
## select which columns to aggregate
data.to.map <- aggregate(data.to.map[ , 2:(ncol(data.to.map)-2)], list("Gene ID" = data.to.map$ENSG, "Gene.Name" = data.to.map$Gene.Name), sum, na.rm =TRUE)
#
# data.to.map <- cbind(Majority.protein.IDs = (as.character(xx.Majority.protein.IDs$x)), data.to.map )
if(EXP_TYPE == "MS2-quant"){
write.table(data.to.map, "proteinGroups_final.txt", sep = "\t", row.names = FALSE, quote = FALSE )} else {
write.table(data.to.map, "proteinGroups_ppb_final.txt", sep = "\t", row.names = FALSE, quote = FALSE )
}