Skip to content

Latest commit

 

History

History
345 lines (308 loc) · 11.8 KB

20161216_RV144.Fig4.code.md

File metadata and controls

345 lines (308 loc) · 11.8 KB
title author date output
R code to generate Figure 4
Slim Fourati
December 16, 2016
github_documents

loading require packages

suppressPackageStartupMessages(library(package = "knitr"))
suppressPackageStartupMessages(library(package = "Biobase"))
suppressPackageStartupMessages(library(package = "limma"))
suppressPackageStartupMessages(library(package = "readr"))
suppressPackageStartupMessages(library(package = "ggplot2"))
suppressPackageStartupMessages(library(package = "dplyr"))
suppressPackageStartupMessages(library(package = "ROCR"))
suppressPackageStartupMessages(library(package = "pROC"))
suppressPackageStartupMessages(library(package = "tibble"))
suppressPackageStartupMessages(library(package = "tidyr"))

set default options/variables

workDir <- dirname(getwd())
opts_chunk$set(tidy = FALSE, fig.path = "../figure/")
options(stringsAsFactors  = FALSE,
        width             = 80,
        readr.num_columns = 0)

load genesets ExpressionSet

gsSetFile <- file.path(workDir, "output/rv144.gsSet.RData")
load(file = gsSetFile)
gsSet$pin <- gsSet$donor

read clinical annotation

clinicalAnnotFile <- file.path(workDir, "input/rv144.master_wk26.csv")
clinicalAnnotation <- read_csv(file      = clinicalAnnotFile,
                               col_types = paste(rep("c", times = 21),
                                                 collapse = ""))
clinicalAnnotation <- lapply(clinicalAnnotation,
                             FUN   = type.convert,
                             as.is = TRUE) %>%
                      data.frame(check.names = FALSE)
# remove unused columns
clinicalAnnotation <- clinicalAnnotation %>%
  select(-v7, -v9)
rownames(clinicalAnnotation) <- clinicalAnnotation$pin

read primary correlates ExpressionSet

primCorrelatesFile <- file.path(workDir,
				"output/rv144.primCorrelatesSet.RData")
load(file = primCorrelatesFile)

read intracellular cytokine staining ExpressionSet

icsFile <- file.path(workDir, "output/rv144.icsSet.RData")
load(file = icsFile)

read haplotype ExpressionSet

haplotypeFile <- file.path(workDir,
                           "output/rv144.haplotypeSet.RData")
load(file = haplotypeFile)
allMat <- keyClass <- NULL
for (omic in ls(pattern = "Set$")) {
  # print(omic)
  eval(parse(text = paste("eset <-", omic)))
  allMat <- rbind(allMat,
                  exprs(eset)[,
                              match(clinicalAnnotation$pin,
                                    table = eset$pin),
                              drop = FALSE])
  keyClass <- c(keyClass,
               rep(gsub(pattern = "Set$", replacement = "", omic),
                   times = nrow(eset)))
}
clinicalAnnotation <- cbind(clinicalAnnotation, t(allMat)) %>%
  select(pin, dem_sex, BRA_risk,
         primary_iga_score_tomaras_wk26, primary_v2_gp70_v1v2_zollapazner_wk26,
         COMPASSPolyfunctionalityScore,
         DQB06, DPB13)
pdata <- merge(pData(gsSet),
               clinicalAnnotation,
               by.x = "donor", by.y = "pin", all.x = TRUE) %>%
  mutate(`HIV infection` = factor(`HIV infection`, levels = c("Yes", "No")))
rownames(pdata) <- sampleNames(gsSet)
pData(gsSet) <- pdata
# filter esetBaselined on vaccinated participants
gsSet <- gsSet[, gsSet$treatment %in% "VACCINE" &
               !is.na(gsSet$DQB06) &
               !is.na(gsSet$DPB13)]

merge all datasets

# merge all datasets
mergedDF <- cbind(t(exprs(gsSet)), pData(gsSet)) %>%
  dplyr::rename(iga = primary_iga_score_tomaras_wk26,
                igg = primary_v2_gp70_v1v2_zollapazner_wk26,
                pfs = COMPASSPolyfunctionalityScore) %>%
  mutate(infect   = factor(`HIV infection`, levels = c("No", "Yes")),
         BRA_risk = factor(BRA_risk, levels = c("Low", "Medium", "High")))
gsNames <- c("HALLMARK_INTERFERON_GAMMA_RESPONSE",
             "HALLMARK_MTORC1_SIGNALING",
             "HALLMARK_TNFA_SIGNALING_VIA_NFKB")

create iteration of the 10-fold cross-validation

cv10LS <- split(1:ncol(gsSet), f = gsSet$"HIV infection") %>%
  lapply(FUN =  function(x) {
    set.seed(seed = 1)
    f <- rep(1:10, times = ceiling(length(x)/10))
    f <- f[1:length(x)]
    return(value = split(sample(x), f = f))
  }) %>%
  do.call(what = c) %>%
  stack() %>%
  mutate(ind = gsub(pattern     = ".+\\.([0-9]+)",
             replacement = "\\1",
             ind)) %>%
  unstack()

build logistic regression models and calculate accuracies

# 10-fold cross validation for the reduced model (all previous correlates of
# protection, minus transcriptomic data)
reducedModelVars <- c("iga", "dem_sex", "BRA_risk", "igg", "pfs", "DQB06",
                      "DPB13")
logitRed <- NULL
for (i in 1:length(cv10LS)) {
  trainDF <- mergedDF[-cv10LS[[i]], c(reducedModelVars, "infect")]
  fit0 <- glm(formula = infect ~ dem_sex + BRA_risk + pfs + iga * DQB06 +
                        igg * DPB13,
              data = trainDF, family = binomial())
  logitRed <- c(logitRed,
                predict(fit0, newdata = mergedDF[cv10LS[[i]], , drop = FALSE]))
}
logitRed <- logitRed[order(as.numeric(names(logitRed)))]
# 10-fold cross validation for the reduced model (only transcriptomic data)
minVars <- c("dem_sex", "BRA_risk", gsNames)
logitMin <- NULL
for (i in 1:length(cv10LS)) {
  trainDF <- mergedDF[-cv10LS[[i]], c(reducedModelVars, "infect")]
  fit0 <- glm(formula = infect ~ ., data = trainDF, family = binomial())
  logitMin <- c(logitMin,
                predict(fit0, newdata = mergedDF[cv10LS[[i]], , drop = FALSE]))
}
logitMin <- logitMin[order(as.numeric(names(logitMin)))]
# 10-fold cross validation for the full  model (all previous correlates of
# protection, plus transcriptomic data)
logitFull <- NULL
for (i in 1:length(cv10LS)) {
  trainDF <- mergedDF[-cv10LS[[i]], c(reducedModelVars, gsNames, "infect")]
  fit <- glm(formula = infect ~ dem_sex + BRA_risk + pfs + iga * DQB06 +
             igg * DPB13 +
             HALLMARK_INTERFERON_GAMMA_RESPONSE +
             HALLMARK_MTORC1_SIGNALING +
             HALLMARK_TNFA_SIGNALING_VIA_NFKB,
             data = trainDF, family = binomial())
  logitFull <- c(logitFull,
                 predict(fit, newdata = mergedDF[cv10LS[[i]], , drop = FALSE]))
}
logitFull <- logitFull[order(as.numeric(names(logitFull)))]

jitter plot with prediction of the logistic regression models as a function of the real status

plotDF <- data.frame(logpp  = c(logitRed, logitFull),
                     infect = rep(mergedDF$infect, times = 2),
                     model  = rep(c("gender+BRA_risk+IgAxDQB1*06+IgGxDPB1*13+PFS",
                         "...+transcriptomic"),
                         each = nrow(mergedDF))) %>%
  mutate(model = factor(model,
             levels = c("gender+BRA_risk+IgAxDQB1*06+IgGxDPB1*13+PFS",
                 "...+transcriptomic")))
# print accuracies of each models
accDF <- plotDF %>%
  filter(!is.na(logpp)) %>%
  group_by(model) %>%
  do(sen = sum(.$logpp > 0 & .$infect %in% "Yes")/sum(.$infect %in% "Yes"),
     spe = sum(.$logpp < 0 & .$infect %in% "No")/sum(.$infect %in% "No")) %>%
  mutate(sen = unlist(sen), spe = unlist(spe), acc = (sen+spe)/2)
accDF %>%
  print()
## Source: local data frame [2 x 4]
## Groups: <by row>
## 
## # A tibble: 2 x 4
##   model                                         sen   spe   acc
##   <fct>                                       <dbl> <dbl> <dbl>
## 1 gender+BRA_risk+IgAxDQB1*06+IgGxDPB1*13+PFS 0.267 0.979 0.623
## 2 ...+transcriptomic                          0.4   0.957 0.679
plotLabels <- data.frame(infect = c(1.5, 1.5),
                         logpp = c(10, 10),
                         model = levels(plotDF$model),
                         label = paste0("Acc=",
                             signif(accDF$acc, digits = 3) * 100,
                             "%")) %>%
  mutate(model = factor(model,
             levels = c("gender+BRA_risk+IgAxDQB1*06+IgGxDPB1*13+PFS",
                 "...+transcriptomic")))
plotJit <- ggplot(data    = plotDF,
                  mapping = aes(x = infect, y = logpp)) +
  geom_dotplot(binaxis = "y", binwidth = 0.25, stackdir = "center",
               color = "transparent", mapping = aes(fill = infect)) +
  geom_hline(yintercept = 0) +
  geom_text(data = plotLabels, mapping = aes(label = label)) +
  labs(x = "Infection status",
       y = "log(probability of infection)") +
  scale_fill_manual(values = c(Yes = "red", No = "black")) +
  facet_wrap(facets = ~model) +
  theme_bw() +
  theme(legend.position = "none",
        strip.text = element_text(siz = 6))
print(plotJit)

plot of chunk jitter-plot

plot ROC curves

redDF <- plotDF %>%
  filter(model %in% "gender+BRA_risk+IgAxDQB1*06+IgGxDPB1*13+PFS")
predRed <- prediction(redDF$logpp, as.numeric(redDF$infect %in% "Yes"))
perfRed <- performance(predRed, measure = "tpr", x.measure = "fpr")
fullDF <- plotDF %>%
  filter(model %in% "...+transcriptomic")
predFull <- prediction(fullDF$logpp, as.numeric(fullDF$infect %in% "Yes"))
perfFull <- performance(predFull, measure = "tpr", x.measure = "fpr")

plotDF2 <- rbind(data.frame(fpr = unlist(perfRed@x.values),
                            tpr = unlist(perfRed@y.values),
                            model = "reduced"),
                 data.frame(fpr = unlist(perfFull@x.values),
                            tpr = unlist(perfFull@y.values),
                            model = "full"))
plotRoc <- ggplot(data = plotDF2,
                  mapping = aes(x = fpr, y = tpr, lty = model)) +
  geom_line() +
  geom_abline(slope = 1, intercept = 0) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  labs(x = "False positive rate", y = "True positive rate") +
  theme_bw() +
  theme(legend.key = element_blank())
print(plotRoc)

plot of chunk roc

test diffences in term of accuracies, specificity and sensitivities between the two models

# test difference in accuracies
fitAcc <- roc.test(as.numeric(fullDF$infect %in% "Yes"),
                 fullDF$logpp,
                 redDF$logpp)
print(fitAcc)
## 
## 	DeLong's test for two correlated ROC curves
## 
## data:  fullDF$logpp and redDF$logpp by as.numeric(fullDF$infect %in% "Yes") (0, 1)
## Z = 0.83024, p-value = 0.4064
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2 
##   0.7885714   0.7590476

print session info

sessionInfo()
## R version 3.5.3 (2019-03-11)
## Platform: x86_64-apple-darwin18.2.0 (64-bit)
## Running under: macOS Mojave 10.14.4
## 
## Matrix products: default
## BLAS/LAPACK: /usr/local/Cellar/openblas/0.3.5/lib/libopenblasp-r0.3.5.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] tidyr_0.8.3         tibble_2.1.1        pROC_1.14.0        
##  [4] ROCR_1.0-7          gplots_3.0.1.1      dplyr_0.8.0.1      
##  [7] ggplot2_3.1.1       readr_1.3.1         limma_3.38.3       
## [10] Biobase_2.42.0      BiocGenerics_0.28.0 knitr_1.22         
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.1         highr_0.8          pillar_1.3.1       compiler_3.5.3    
##  [5] plyr_1.8.4         bitops_1.0-6       tools_3.5.3        digest_0.6.18     
##  [9] evaluate_0.13      gtable_0.3.0       pkgconfig_2.0.2    rlang_0.3.4       
## [13] cli_1.1.0          xfun_0.6           withr_2.1.2        stringr_1.4.0     
## [17] gtools_3.8.1       caTools_1.17.1.2   hms_0.4.2          grid_3.5.3        
## [21] tidyselect_0.2.5   glue_1.3.1         R6_2.4.0           fansi_0.4.0       
## [25] gdata_2.18.0       purrr_0.3.2        magrittr_1.5       scales_1.0.0      
## [29] assertthat_0.2.1   colorspace_1.4-1   labeling_0.3       utf8_1.1.4        
## [33] KernSmooth_2.23-15 stringi_1.4.3      lazyeval_0.2.2     munsell_0.5.0     
## [37] crayon_1.3.4