forked from chferte/KRAS_Analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathALL_KRAS_model_GOLD.R
367 lines (291 loc) · 14.2 KB
/
ALL_KRAS_model_GOLD.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
# Charles Ferté
# Sage Bionetworks
# 14 Sept 2012
# train a model of differential gene expression per KRAS codon in Lung Adenocarcinoma
# both for G12C and G12V, combining CHEMORES and in TCGA LUAD
#load the different packages
options(stringsAsFactors=FALSE)
library(affy)
library(corpcor)
library(lattice)
library(limma)
library(caret)
library(glmnet)
source("/home/cferte/FELLOW/cferte/KRAS_Project/JUSTIN_PREDICT_CCLE/code/lung_analysis_functions.R")
synapseLogin("[email protected]","charles")
###############################################################
# run the analysis for All KRAS in the TCGA LUAD
###############################################################
load("/home/cferte/FELLOW/cferte/KRAS_Project/OBJECTS/LUAD_EXP.RData")
load("/home/cferte/FELLOW/cferte/KRAS_Project/OBJECTS/KRAS_LUAD_STATUS.RData")
table(KRAS_LUAD)
# keep only the patients with available KRAS WT or G12C
tmp <- intersect(names(KRAS_LUAD),colnames(LUAD_EXP))
LUAD_EXP <- LUAD_EXP[,tmp]
KRAS_LUAD <- KRAS_LUAD[tmp]
KRAS_LUAD <- ifelse(KRAS_LUAD=="WT",0,1)
table(KRAS_LUAD)
# check if there is any latent structure in the exp data
s <- svd(LUAD_EXP)
plot(s$v[,1],s$v[,2], col=ifelse(KRAS_LUAD==1,"royalblue","orange"),pch=20,xlab="PC1",ylab="PC2",main="LUAD EXP n=122")
# train the model on LUAD
fit <- eBayes(lmFit(LUAD_EXP,model.matrix(~KRAS_LUAD)))
hist(fit$p.value[,2],breaks=100,main="unadjusted p values (G12C vs. rest TCGA)")
abline(v=.05,col="red",lty=2)
###############################################################
# load the CHEMORES data
###############################################################
load("/home/cferte/FELLOW/cferte/KRAS_Project/OBJECTS/CHEMORES_EXP.RData")
load("/home/cferte/FELLOW/cferte/KRAS_Project/OBJECTS/CHEMORES_CLIN.RData")
load("/home/cferte/FELLOW/cferte/KRAS_Project/OBJECTS/KRAS_CHEMORES_STATUS.RData")
# retain only the patients that are evaluated for KRAS
KRAS_CHEMORES <- KRAS_CHEMORES[which(!is.na(KRAS_CHEMORES))]
CHEMORES_EXP <- CHEMORES_EXP[,names(KRAS_CHEMORES)]
CHEMORES_CLIN <- CHEMORES_CLIN[names(KRAS_CHEMORES),]
# retain only the ADENOS only from CHEMORES
CHEMORES_CLIN <- CHEMORES_CLIN[ CHEMORES_CLIN$Histology=="AC",]
CHEMORES_EXP <- CHEMORES_EXP[,rownames(CHEMORES_CLIN)]
KRAS_CHEMORES <- KRAS_CHEMORES[rownames(CHEMORES_CLIN)]
# transform KRAS_CHEMORES into a numeric variable
KRAS_CHEMORES <- ifelse(KRAS_CHEMORES=="WT",0,1)
table(KRAS_CHEMORES)
# check if there is any latent structure in the exp data
s <- svd(CHEMORES_EXP)
plot(s$v[,1],s$v[,2], col=ifelse(KRAS_CHEMORES==1,"royalblue","orange"),pch=20,xlab="PC1",ylab="PC2",main="CHEMORES EXP n=43")
# train the model on CHEMORES
fit <- eBayes(lmFit(CHEMORES_EXP,model.matrix(~KRAS_CHEMORES)))
hist(fit$p.value[,2],breaks=100,main="unadjusted p values
(G12C vs. rest CHEMORES)")
abline(v=.05,col="red",lty=2)
#hist(p.adjust(fit$p.value[,2],method="BH"),breaks=200)
################################################################
# merge the two dataset into the GOLD one to generate the model
################################################################
################################################################
# load the ccle data from synapse
################################################################
ccle_eset <- loadEntity('syn1354643')
ccle_drug <- loadEntity('syn1354656')
ccle_kras <- loadEntity('syn1443160')
# make all the db have the same features
tmp <- intersect(featureNames(ccle_eset$objects$ccle_eset),intersect(rownames(LUAD_EXP),rownames(CHEMORES_EXP)))
LUAD_EXP <- LUAD_EXP[tmp,]
CHEMORES_EXP <- CHEMORES_EXP[tmp,]
ccle_eset <- ccle_eset$objects$ccle_eset[tmp,]
# second quantile Normalization
require(caret)
CHEMO_EXP2 <- normalize2Reference(CHEMORES_EXP,rowMeans(LUAD_EXP))
# rescale the data (chemores) to have the same mean and variance than the LUAD
# Justin's function to rescale the VS to get the same mean/var than the TS
normalize_to_X <- function(mean.x, sd.x, Y){
m.y <- rowMeans(Y)
sd.y <- apply(Y, 1, sd)
Y.adj <- (Y - m.y) * sd.x / sd.y + mean.x
Y.adj[sd.y == 0] <- mean.x[sd.y==0]
Y.adj
}
CHEMO_EXP2 <- normalize_to_X(rowMeans(LUAD_EXP),apply(LUAD_EXP,1,sd),CHEMO_EXP2)
# attempt to draw a qq plot
#qqplot(CHEMO_EXP2,LUAD_EXP)
# combine the two datasets
GOLD_EXP <- cbind(LUAD_EXP,CHEMO_EXP2)
DB_GOLD <- c(rep(1,times=137),rep(0,times=51))
KRAS_GOLD <- c(KRAS_LUAD,KRAS_CHEMORES)
# check if there is any latent structure in the exp data
s <- svd(GOLD_EXP)
plot(s$v[,1],s$v[,2], col=ifelse(KRAS_GOLD==1,"royalblue","orange"),pch=20,xlab="PC1",ylab="PC2",main="GOLD EXP n=188")
plot(s$v[,1],s$v[,2], col=ifelse(DB_GOLD==1,"black","red"),pch=20,xlab="PC1",ylab="PC2",main="GOLD EXP n=188")
# SNM to remove the batch and adjustment variables effect
require(snm)
bio.var <- model.matrix(~ as.factor(KRAS_GOLD))
adj.var <- model.matrix(~ DB_GOLD)
snm.fit <- snm(GOLD_EXP,
bio.var=bio.var,
adj.var=adj.var,
rm.adj=TRUE)
new.dat <- snm.fit$norm.dat
# svd plots again
s <- svd(new.dat)
plot(s$v[,1],s$v[,2], col=ifelse(KRAS_GOLD==1,"royalblue","orange"),pch=20,xlab="PC1",ylab="PC2",main="GOLD EXP n=43")
plot(s$v[,1],s$v[,2], col=ifelse(DB_GOLD==1,"black","red"),pch=20,xlab="PC1",ylab="PC2",main="GOLD EXP n=43")
# train the model on GOLD1
GOLD1 <- new.dat
colnames(GOLD1) <- colnames(GOLD_EXP)
rownames(GOLD1) <- rownames(GOLD_EXP)
fit <- eBayes(lmFit(GOLD1,model.matrix(~KRAS_GOLD)))
hist(fit$p.value[,2],breaks=100,main="unadjusted p values
(G12C vs. all GOLD)")
abline(v=.05,col="red",lty=2)
GOLD1_names <- fit$p.value[,2]
table(fit$p.value[,2]<.05)
#################################################################################################################
# apply the models in the CCLE database and correlate them with drug sensitivity
#################################################################################################################
ccle_kras <- ccle_kras$objects$KRAS_CCLE
ccle_kras <- ifelse(ccle_kras=="0",0,1)
table(ccle_kras)
ccle_drug <- ccle_drug$objects$ccle_drug
ccle_drug <- ccle_drug[names(ccle_kras),]
drug <- ccle_drug@data
ccle_eset <- ccle_eset[,names(ccle_kras)]
# load the cell lines metadata and exclude the SCLC and the undifferenciated carcinoma
mData <- loadEntity('syn1443797')
mData <- mData$objects$CCLE_metadata
rownames(mData) <- mData$CCLE.name
mData <- mData[sampleNames(ccle_eset),]
table(mData$Hist.Subtype1)
#mData <- mData[mData$Hist.Subtype1!= "small_cell_carcinoma",]
#mData <- mData[mData$Hist.Subtype1!= "undifferentiated_carcinoma",]
#mData <- mData[mData$Hist.Subtype1!= "squamous_cell_carcinoma",]
ccle_eset <- ccle_eset[,rownames(mData)]
ccle_kras <- ccle_kras[rownames(mData)]
drug <- drug[rownames(mData),]
table(ccle_kras)
# perform quantile Normalization on ccle_eset with ref =GOLD1
ccle_eset1 <- normalize2Reference(exprs(ccle_eset),rowMeans(GOLD1))
# rescale the ccle_eset1 to have the same mean and variance than the GOLD1
# Justin's function to rescale the VS to get the same mean/var than the TS
normalize_to_X <- function(mean.x, sd.x, Y){
m.y <- rowMeans(Y)
sd.y <- apply(Y, 1, sd)
Y.adj <- (Y - m.y) * sd.x / sd.y + mean.x
Y.adj[sd.y == 0] <- mean.x[sd.y==0]
Y.adj
}
ccle_eset1 <- normalize_to_X(rowMeans(GOLD1),apply(GOLD1,1,sd),ccle_eset1)
# check if there is any latent structure in the data
s <- svd(ccle_eset1)
plot(s$v[,1],s$v[,2], col=ifelse(KRAS_GOLD==1,"royalblue","orange"),pch=20,xlab="PC1",ylab="PC2",main="ccle_eset n=82")
# check if there is any signal in the ccle data
fit <- eBayes(lmFit(ccle_eset1,model.matrix(~ccle_kras+s$v[,4]+s$v[,5])))
hist(fit$p.value[,2],breaks=100,main="unadjusted p values
(G12C vs. G12V ccle)")
abline(v=.05,col="red",lty=2)
ccle_names <- fit$p.value[,2]
table(fit$p.value[,2]<.05)
# remove the 4th, 5th and 6th SVDw from ccle_eset since this is improving
# remove the first svd of the battle
s$d[4] <- 0
s$d[5] <- 0
ccle_eset2 <- s$u %*% diag(s$d) %*% t(s$v)
rownames(ccle_eset2) <- rownames(ccle_eset1)
colnames(ccle_eset2) <- colnames(ccle_eset1)
ccle_eset1 <- ccle_eset2
rm(ccle_eset2)
# check if there is any latent structure in the data
s <- svd(ccle_eset1)
plot(s$v[,1],s$v[,2], col=ifelse(KRAS_GOLD==1,"royalblue","orange"),pch=20,xlab="PC1",ylab="PC2",main="ccle_eset n=71")
## plot the PVAL across GOLD1 and ccle
require(geneplotter)
require("RColorBrewer")
dim <- c(0,1)
par(mfrow=c(1,1),oma=c(0,0,3,0))
tmp <- intersect(names(GOLD1_names),names(ccle_names))
x <- GOLD1_names[tmp]
y <- ccle_names[tmp]
colors <- densCols(x,y)
plot(x,y, col=colors, pch=20,xlab="GOLD1 (TCGA+CHEMORES) p values",ylab="CCLE p values",xlim=dim,ylim=dim)
abline(h=.05,col="red",lty=2)
abline(v=.05,col="red",lty=2)
table(GOLD1_names<.05)
table(ccle_names<.05)
#################################################################
# perform a feature selection on the overlapping probes only
#################################################################
tmp <- intersect(names(GOLD1_names)[which(GOLD1_names<.2)],names(ccle_names)[which(ccle_names<.2)])
length(tmp)
###################################################################################################################
# perform the prediction using the glmnet package with alpha=.1 (more ridge) and determine lambda using nfolds= 10
###################################################################################################################
require(glmnet)
set.seed(1234567)
cv.fit <- cv.glmnet(t(GOLD1[tmp,]), factor(KRAS_GOLD), nfolds=10, alpha=.1, family="binomial")
plot(cv.fit)
fit <- glmnet(x=t(GOLD1[tmp,]),y=KRAS_GOLD,family="binomial",alpha=.1,lambda=cv.fit$lambda.1se)
table(as.numeric(fit$beta)!=0)
####################################
# apply this model in the ccle data
####################################
y_hat <- predict(fit, t(ccle_eset1[tmp,]),type="response")
########################################################
#evaluate the performance of the model
########################################################
boxplot(y_hat~ccle_kras,xlab=c("KRAS G12C mutational status"),ylab="model of G12C",main="predicting KRAS in ccle
(modele trained in tcga + chemores)")
stripchart(y_hat~ccle_kras,vertical=TRUE,pch=20,add=T,col="royalblue",cex=.7 )
########################################################
# plot a ROC curve to asses the performance of our model
########################################################
require(ROCR)
Pred <- prediction(as.numeric(y_hat),as.numeric(ccle_kras))
Perf <- performance(prediction.obj=Pred,"tpr","fpr")
AUC <- performance(prediction.obj=Pred,"auc")
plot(Perf, col="royalblue",main="predicting KRAS G12C in ccle
(modele trained in tcga + chemores)")
text(x=.7,y=.4,labels=paste("AUC=",format([email protected],digits=2)),col="royalblue")
###############################################################################
# boxplot the model across the different KRAS codon mutations within ccle
###############################################################################
codon_kras <- loadEntity('syn1443160')
codon_kras_ccle <- codon_kras$objects$KRAS_CCLE[colnames(ccle_eset1)]
codon_kras_ccle[codon_kras_ccle=="0"] <- "WT"
boxplot(y_hat~codon_kras_ccle,xlab=c("KRAS G12C mutational status"),ylab="KRAS lung model",main="predicting KRAS in ccle
(modele trained in tcga + chemores)")
stripchart(y_hat~codon_kras_ccle,vertical=TRUE,pch=20,add=T,col="royalblue",cex=.7 )
###################################################################################################################
# improve the model by a dynamic penalization using the G12V/G12C diff p val vector
###################################################################################################################
load("/home/cferte/FELLOW/cferte/KRAS_Project/OBJECTS/KRAS_LUAD_STATUS.RData")
load("/home/cferte/FELLOW/cferte/KRAS_Project/OBJECTS/KRAS_CHEMORES_STATUS.RData")
KCV <- c(KRAS_LUAD,KRAS_CHEMORES)
KCV <- KCV[colnames(GOLD1)]
KCV <- KCV[which(KCV %in% c("G12C","G12V"))]
GOLDCV <- GOLD1[,names(KCV)]
fit <- eBayes(lmFit(GOLDCV,model.matrix(~KCV)))
hist(fit$p.value[,2],breaks=100,main="unadjusted p values (G12C vs.G12V)")
abline(v=.05,col="red",lty=2)
table(fit$p.value[,2]<.05)
pen <- fit$p.value[,2]
pen <- ifelse(pen<.05,.5,1)
set.seed(1234567)
cv.fit <- cv.glmnet(t(GOLD1), factor(KRAS_GOLD), nfolds=10, alpha=.1, family="binomial",penalty.factor=pen)
plot(cv.fit)
fit <- glmnet(x=t(GOLD1[tmp,]),y=KRAS_GOLD,family="binomial",alpha=.1,lambda=cv.fit$lambda.1se,penalty.factor=pen)
table(as.numeric(fit$beta)!=0)
y_hat2 <- predict(fit, t(ccle_eset1[tmp,]),type="response")
###############################################################################
# boxplot the model across the different KRAS codon mutations within ccle
###############################################################################
codon_kras <- loadEntity('syn1443160')
codon_kras_ccle <- codon_kras$objects$KRAS_CCLE[colnames(ccle_eset1)]
codon_kras_ccle[codon_kras_ccle=="0"] <- "WT"
boxplot(y_hat2~codon_kras_ccle,xlab=c("KRAS G12C mutational status"),ylab="KRAS lung model",main="predicting KRAS in ccle
(modele trained in tcga + chemores)")
stripchart(y_hat2~codon_kras_ccle,vertical=TRUE,pch=20,add=T,col="royalblue",cex=.7 )
table(codon_kras_ccle)
########################################################
# correlate the y_hat2 with the drug response
########################################################
drug1 <- drug[rownames(y_hat2),]
dim(drug1)
p <- c()
D <- c()
yhat2 <- sapply(colnames(drug1),function(x){
p <- cor.test(drug[,x],y_hat2,method="spearman",use="pairwise.complete.obs")
D <- rbind(D,c(p$estimate,p$p.value))
})
rownames(yhat2) <- c("rho","p.value")
class(yhat2)
yhat2 <- as.data.frame(t(yhat2))
yhat2 <- yhat2[sort(yhat2$rho,decreasing=TRUE,index.return=TRUE)$ix,]
yhat2
yhat2$color <- ifelse(yhat2$p.value<.05,"red",ifelse(yhat2$p.value<.1,"orange","black"))
# we prefer to get rid of irinotecan given there are too many missing values
yhat2 <- yhat2[-which(rownames(yhat2)=="Irinotecan"),]
dotchart(yhat2$rho,labels=row.names(yhat2),pch=20,xlab="correlation (spearman rho)",main="correlation of drug sensitivity
with G12C model in lung ccle",col=yhat2$color)
abline(v=0,lty=3,lwd=.3)
# do it in the G12C only ones...
# redo your model for KRAS only
# train in TCGA KRAS mutant only validate it in CHEMORES kras only - try to predict G12C
# if you get similar AUC in ccle then infer drug sensitivity information