diff --git a/docs/index.html b/docs/index.html new file mode 100644 index 0000000..a0f7f85 --- /dev/null +++ b/docs/index.html @@ -0,0 +1,2266 @@ + + + + + + + + + + + + Copy number alteration fingerprint predicts the clinical response of oxaliplatin-based chemotherapy in metastatic colorectal cancer + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + +
+
+
+ + + +
+
+ + + +

Copy number alteration fingerprint predicts the +clinical response of oxaliplatin-based chemotherapy in metastatic +colorectal cancer

+ + + + + + + +


+
+

EQUAL CONTRIBUTION

+

J.W. and Z.T. contributed equally to this work as joint first +authors; X.L. and X.S.L contributed equally to this work as joint senior +authors.

+
+
+

LICENSE

+

If you want to reuse the code in this report, please note the +license below should be followed.

+

The code is made available for non commercial research purposes only +under the MIT. However, notwithstanding any provision of the MIT +License, the software currently may not be used for commercial purposes +without explicit written permission after contacting Ziyu Tao or Xue-Song Liu .

+
+
+

PART 1: Data preprocessing

+

To develop a stable, effective, and cost-efficient predictive +biomarker for oxaliplatin-based chemotherapy response in metastatic +colorectal cancer patients, we utilized shallow whole-genome +sequencing(sWGS) data to extract copy number alteration(CNA) features. +These features, combined with clinical characteristics, were used to +construct a machine learning model, ultimately resulting in the +development of a highly predictive model known as the CNA +fingerprint.

+
+

1.1 Raw data processing

+

suppose i is the sample name,Using follow tools: ### fastp QC

+
fastp -i $workdir/$fq1 -I $workdir/$fq2 \
+-o ${i}_fp_1.fq.gz -O  ${i}_fp_2.fq.gz 
+-h ${i}_fastp.html \
+-w 10
+
+

BWA MEM

+

blast

+
bwa mem -M -R "@RG\tID:$i\tLM:$i\tSM:$i\tPL:illumina\tPU:$i" 
+-t 8 \
+/path_to_hg38_genome/GRCh38.d1.vd1.fa \
+$workdir/${i}_fp_1.fq.gz \
+$workdir/${i}_fp_2.fq.gz > $workdir/${i}.sam
+
+
+

Samtools

+

transform sam to bam file

+
samtools view -@ 8 -bS $workdir/${i}.sam > $workdir/${i}_unsort.bam
+
+
+

Picard

+

sort the bam file

+
java -jar -Xmx12G -Djava.io.tmpdir=/path_to_tmp/tmp \
+/path_to_picard-2.20.6-0/picard.jar SortSam \
+I=$workdir/${i}_unsort.bam \
+O=$workdir/${i}_sorted.bam \
+SORT_ORDER=coordinate
+
+
+
+

1.1 CNA features calling

+

suppose you have got your bam file,Using follow tools:

+
+

QDNAseq

+

CBC segment,the defult genome is hg19,here we use hg38,and binsize as +100

+
library(QDNAseq)
+library(QDNAseq.hg38)
+library(stringr)
+library(ACE)
+library(DNAcopy)
+
+args <- commandArgs(trailingOnly = TRUE)
+#args 1,sample name
+#args 2,path to bam
+#args 3,the output path
+
+i <- args[1]
+s <- paste0(i,"_sorted.bam")
+#binsize 设置为100
+bins <- getBinAnnotations(binSize=100,genome="hg38")
+readCounts <- binReadCounts(bins,bamfile=paste0(args[2],"/",s))
+readCountsFiltered <- applyFilters(readCounts,residual=TRUE, blacklist=TRUE)
+readCountsFiltered <- estimateCorrection(readCountsFiltered)
+copyNumbers <- correctBins(readCountsFiltered)
+copyNumbersNormalized <- normalizeBins(copyNumbers)
+copyNumbersSmooth <- smoothOutlierBins(copyNumbersNormalized)
+copyNumbersSegmented <- segmentBins(copyNumbersSmooth, transformFun="log2")
+copyNumbersSegmented <- normalizeSegmentedBins(copyNumbersSegmented)
+#save(copyNumbersSegmented,file="/your_path/copyNumbersSegmented.rda")
+
+
+

ACE

+

cellularity and ploidy

+
sqm <- squaremodel(copyNumbersSegmented, QDNAseqobjectsample = 1, 
+                   penalty = 0.65, penploidy = 0.65, 
+                   ptop = 5.3, pbottom = 1.8, prows = 250,
+                   cellularities = seq(50, 100, length.out = 50),
+                   sgc = c("X", "Y"),exclude=c())
+
+###-----------------------------------------------------------------------待修改
+mdf <- sqm$minimadf
+mdf <- mdf[order(mdf$error,-mdf$cellularity),]
+
+ploidy <- mdf$ploidy[1]
+cellularity <- mdf$cellularity[1]
+
+
+# Absoult CNA calling
+m<-ACEcall(copyNumbersSegmented, QDNAseqobjectsample = TRUE, scellularity = cellularity,
+           ploidy = ploidy, standard =1, plot = FALSE,
+           qcutoff = -3, subcutoff = 0.2, trncname = FALSE,
+           cap = 12, bottom = 0, 
+           onlyautosomes = TRUE
+           )
+
+##提取信息并平滑
+#seg <- m[,6]
+segment <- data.frame(copyNumbersSegmented@featureData@data[1:nrow(m),1:3],segment_mean=m$segment_mean)
+segment <- na.omit(segment)
+segment$segment_mean <- round(segment$segment_mean)
+
+
+

sigminer

+

CN features calling here we got the CNA segment files,first we do a +filter,delect all segments with num_marks <10,make the file colnames +as: chr,start,end segval,sample

+
+
+
+
+

PART 2: Model training

+
+

2.1 Training XGBoost model

+

Considering the vast number of combinations, it would be advisable to +first conduct parameter searches for a subset, then utilize the +optimized subset to search for other parameters, rather than running all +parameters simultaneously.

+

Here, we omit the step of parameter searching and directly present +all the considered parameters.

+
    +
  • We use scale_pos_weight_value to address the issue of +class imbalance. The weight is set as the sum of negative samples +divided by the sum of positive samples.
  • +
+
library(xgboost)
+library(caret)
+library(Matrix)
+
+train.mat <- readRDS("../data/train_mat.rds")
+dtrain_mat <- train.mat
+colnames(dtrain_mat)[1:(dim(dtrain_mat)[2] - 1)] <- paste0("feature", seq(1:(dim(dtrain_mat)[2] - 1)))
+dtrain_input <- sparse.model.matrix(res ~ ., data = dtrain_mat)[, -1] %>%
+  as.matrix()
+train_label <- dtrain_mat$res == "response"
+
control <- trainControl(
+  method = "cv",
+  number = 5,
+  classProbs = TRUE,
+  summaryFunction = twoClassSummary
+)
+
+scale_pos_weight_value <- sum(dtrain_mat3$res == "nonresponse") / sum(dtrain_mat3$res == "response")
+
+param_grid <- expand.grid(
+  max_depth = c(2:15),
+  eta = c(1e-5, 1e-4, 0.001, 0.01, 0.1),
+  max_delta_step = c(1:30),
+  alpha = c(0, 0.1, 0.5),
+  lambda = c(0, 1, 0.5),
+  gamma = c(0, 1, 5, 10),
+  scale_pos_weight = scale_pos_weight_value,
+  colsample_bytree = seq(0.1, 1, 0.1),
+  subsample = seq(0.5, 1, 0.1), 
+  min_child_weight = c(1:10) 
+)
+paramlist <- apply(param_grid, 1, as.list)
+
+for (i in seq_along(paramlist)) {
+  x <- paramlist[[i]] 
+  tryCatch(
+    {
+      set.seed(333) 
+      xgb_last <- xgboost(
+        data = dtrain_input,
+        label = train_label,
+        objective = "binary:logistic",
+        params = x %>% as.list(), 
+        nrounds = 1000,
+        early_stopping_rounds = 20,
+        eval_metric = "auc"
+      )
+      imp <- xgb.importance(model = xgb_last) %>%
+        as.data.frame() %>%
+        dplyr::arrange(desc(Gain))
+      headimp <- head(imp, n = 20)
+      headimp$Feature
+      ## ---------------------------------------------------
+      set.seed(333) 
+      out <- xgb.cv(
+        data = dtrain_input[, colnames(dtrain_input) %in% headimp$Feature] %>% as.matrix(),
+        label = train_label,
+        params = x %>% as.list(),
+        nrounds = 1000,
+        showsd = TRUE,
+        booster = "gbtree",
+        print_every_n = 5,
+        early_stopping_rounds = 100,
+        metrics = c("auc", "aucpr"),
+        verbose = TRUE,
+        nfold = 5,
+        objective = "binary:logistic"
+      )
+
+      result <- out[["evaluation_log"]]
+      result$best_nround <- out$best_iteration
+      result$params <- out$params %>% list()
+      result$params_input <- list(x)
+      result$i <- i
+      print(x)
+      print(paste0("Here is ", i))
+      if (max(out$evaluation_log$test_auc_mean) > 0.80) {
+        saveRDS(
+          result,
+          file = paste0("/home/data/sde/tzy/project/swgs_coloncaner/script/final/tune_param_top20/iteronehot_", i, "_tune_param.rds")
+        )
+        saveRDS(
+          out,
+          file = paste0("/home/data/sde/tzy/project/swgs_coloncaner/script/final/tune_param_top20/iteronehot_", i, "_out.rds")
+        )
+      }
+    },
+    error = function(e) {
+      message("Error occurred with parameters: ", x, ". Error message: ", e$message)
+      list(error_occurred = TRUE, params = x)
+    }
+  )
+}
+
+
+

2.2 Final model

+

The best model is selected based on cross-validation results. The +parameters of final model as followed.

+
cvmodel <- readRDS("../data/iteronehot_1587_out.rds")
+cvparam <- readRDS("../data/iteronehot_1587_tune_param.rds")
+cvparam[[12]][[11]]
+
$nrounds
+[1] 100
+
+$max_depth
+[1] 3
+
+$eta
+[1] 0.1
+
+$max_delta_step
+[1] 8
+
+$alpha
+[1] 0
+
+$lambda
+[1] 1
+
+$gamma
+[1] 5
+
+$scale_pos_weight
+[1] 1.2
+
+$colsample_bytree
+[1] 0.7
+
+$subsample
+[1] 0.6
+
+$min_child_weight
+[1] 1
+

Attention, it has been tested that even with the same seed number, +there may still be differences in feature importance output between +different versions of the tool. To ensure result reproducibility, it is +recommended to run this section using the same version, or +alternatively, directly apply the final model provided by us. The +version of the tool used should be specified at the end.

+
dtrain_input <- readRDS("../data/dtrain_input.rds")
+train_label <- readRDS("../data/train_label.rds")
+
+set.seed(333)
+xgb_last_all <- xgboost(
+  data = dtrain_input,
+  label = train_label,
+  objective = "binary:logistic",
+  params = cvparam[[12]][[11]] %>% as.list(), 
+  nrounds = 1000,
+  early_stopping_rounds = 20,
+  eval_metric = c("auc")
+)
+saveRDS(xgb_last_all, file = "../data/xgb_last_all.rds")
+
+
+imp <- xgb.importance(model = xgb_last_all) %>%
+  as.data.frame() %>%
+  dplyr::arrange(desc(Gain))
+headimp <- head(imp, n = 20)
+headimp$Feature
+
+set.seed(123)
+xgb_last <- xgboost(
+  data = dtrain_input[, headimp$Feature] %>% as.matrix(),
+  label = train_label,
+  objective = "binary:logistic",
+  params = cvparam[[12]][[11]] %>% as.list(), 
+  nrounds = 1000,
+  early_stopping_rounds = 20,
+  eval_metric = c("auc")
+)
+saveRDS(xgb_last, file = "../data/xgb_last.rds")
+

The feature importances in the model are ranked based on gains.

+
library(tidyverse)
+library(xgboost)
+library(ggplot2)
+xgb_last_all <- readRDS("../data/xgb_last_all.rds")
+importance_all <- xgb.importance(model = xgb_last_all) %>%
+  as.data.frame() %>%
+  dplyr::arrange(desc(Gain)) %>%
+  head(n = 20L) %>%
+  dplyr::mutate(
+    Feature = case_when(
+      Feature == "feature84" ~ "CN[>8]",
+      Feature == "feature15" ~ "cna_burden",
+      Feature == "feature7" ~ "AFP",
+      Feature == "feature12" ~ "chr[20]AMP",
+      Feature == "feature70" ~ "NC50[>7]",
+      Feature == "feature74" ~ "CNCP[3]",
+      Feature == "feature81" ~ "CN[2]", 
+      Feature == "feature5" ~ "CA19-9",
+      Feature == "feature42" ~ "L:LL:9+:BB",
+      Feature == "feature60" ~ "E:HH:2:BB",
+      Feature == "feature75" ~ "CNCP[>4 & <=8]",
+      Feature == "feature52" ~ "E:LL:9+:BB",
+      Feature == "feature47" ~ "E:LL:3:AA",
+      Feature == "feature59" ~ "E:HH:2:AA",
+      Feature == "feature3有" ~ "Cancerous node",
+      Feature == "feature53" ~ "E:LL:5-8:BB",
+      Feature == "feature73" ~ "CNCP[1]",
+      Feature == "feature38" ~ "L:HH:2:AA",
+      Feature == "feature83" ~ "CN[>4 & <=8]",
+      Feature == "feature16" ~ "SS[>8]",
+      .default = Feature
+    )
+  )
+
[10:50:24] WARNING: src/learner.cc:553: 
+  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
+  older XGBoost, please export the model by calling `Booster.save_model` from that version
+  first, then load it back in current version. See:
+
+    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
+
+  for more details about differences between saving model and serializing.
+
import_allgg <- ggplot(importance_all, aes(x = Gain, y = reorder(Feature, Gain))) +
+  geom_bar(stat = "identity", fill = "steelblue") +
+  labs(x = "Feature importance", y = "") +
+  theme_bw() +
+  theme(
+    panel.grid.major = element_blank(), 
+    panel.background = element_blank() 
+  ) + 
+  theme(axis.text.x = element_text(angle = 45, hjust = 1)) 
+import_allgg
+

+

The feature

+
library(ggpubr) 
+library(ggsignif) 
+
+boxplot.df <- train.mat %>%
+  dplyr::select(
+    "CN[>8]",
+    # "chr[20]AMP",
+    "CN[2]",
+    "L:LL:9+:BB",
+    "E:LL:9+:BB",
+    "E:HH:2:AA",
+    "L:HH:2:AA",
+    "res"
+  ) %>%
+  tibble::rownames_to_column(var = "sample") %>%
+  reshape2::melt(id = c("res", "sample")) %>%
+  dplyr::mutate(value = as.numeric(value))
+
+feature_boxplot <- ggplot(boxplot.df, aes(x = res, y = value)) +
+  geom_boxplot(
+    position = position_dodge(width = 0.7)
+  ) +
+  facet_wrap(~variable, scales = "free_y") + 
+  geom_signif(
+    comparisons = list(c("response", "nonresponse")),
+    map_signif_level = TRUE,
+    test = "wilcox.test",
+    step_increase = 0.15,
+    y_position = c(9, 12), 
+    tip_length = 0.03
+  ) +
+  labs(
+    title = "",
+    x = "",
+    y = "Count",
+    fill = ""
+  ) +
+  theme_bw() +
+  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
+  theme(
+    panel.grid.major = element_blank(),
+    panel.grid.minor = element_blank()
+  ) 
+feature_boxplot
+

+
+
+

2.3 Feature distribution

+
library(sigminer)
+ace_tally_W <- readRDS("../data/ace_tally_W.rds")
+tcga_tally_W <- readRDS("../data/tcga_tally_W.rds")
+show_catalogue(t(ace_tally_W[, -1]), style = "cosmic", mode = "copynumber", x_label_angle = 90)
+show_catalogue(t(tcga_tally_W[, -1]), style = "cosmic", mode = "copynumber", x_label_angle = 90)
+
+ace_tally_X <- readRDS("../data/ace_tally_X.rds")
+tcga_tally_X <- readRDS("../data/tcga_tally_X.rds")
+show_catalogue(t(ace_tally_X[, -1]), style = "cosmic", mode = "copynumber", method = "T", x_label_angle = 90)
+show_catalogue(t(tcga_tally_X[, -1]), style = "cosmic", mode = "copynumber", method = "T", x_label_angle = 90)
+

The W method feature distribution in TCGA mCRCs.

+

+

The W method feature distribution in this research.

+

+

The X method feature distribution in TCGA mCRCs.

+

+

The X method feature distribution in this research.

+

+
+
+

2.4 5-fold CV

+

CV results. Red line indicated the best iteration. Grey line +indicated the standard deviation.

+
library(gridExtra)
+
+CV5_evaluate <- cvmodel[["evaluation_log"]] %>%
+  head(50)
+
+## AUROC
+# Fit a curve by calculating the starting and ending positions of the bar chart.
+fit_y1 <- predict(loess(train_auc_mean ~ iter, data = CV5_evaluate))
+fit_y2 <- predict(loess(test_auc_mean ~ iter, data = CV5_evaluate))
+
+df_bar.auroc <- data.frame(
+  x = CV5_evaluate$iter,
+  y1 = fit_y1,
+  y2 = fit_y2,
+  bar_y1 = CV5_evaluate$train_auc_std,
+  bar_y2 = CV5_evaluate$test_auc_std
+)
+
+iterauc_plot <- ggplot(CV5_evaluate, aes(x = iter)) +
+  geom_smooth(aes(y = train_auc_mean, color = "Train"), method = "loess", se = FALSE, size = 0.5) +
+  geom_smooth(aes(y = test_auc_mean, color = "Test"), method = "loess", se = FALSE, size = 0.5) +
+  geom_point(aes(y = train_auc_mean), color = "grey", alpha = 0) +
+  geom_point(aes(y = test_auc_mean), color = "grey", alpha = 0) +
+  geom_segment(data = df_bar.auroc, aes(x = x, xend = x, y = y1 - bar_y1 / 2, yend = y1 + bar_y1 / 2), color = "grey", size = 0.5) +
+  geom_segment(data = df_bar.auroc, aes(x = x, xend = x, y = y2 - bar_y2 / 2, yend = y2 + bar_y2 / 2), color = "grey", size = 0.5) +
+  scale_color_manual(values = c("Train" = "blue", "Test" = "green")) +
+  labs(x = "Iterations", y = "Area Under the ROC Curve", color = "Lines") +
+  theme_bw() +
+  ylim(0, 1) +
+  scale_y_continuous(breaks = seq(0, 1, by = 0.2), limits = c(0, 1)) + 
+  geom_vline(xintercept = 16, linetype = "dashed", color = "red") 
+
+## AUCPR
+fit_y1_pr <- predict(loess(train_aucpr_mean ~ iter, data = CV5_evaluate))
+fit_y2_pr <- predict(loess(test_aucpr_mean ~ iter, data = CV5_evaluate))
+
+df_bar.aucpr <- data.frame(
+  x = CV5_evaluate$iter,
+  y1 = fit_y1,
+  y2 = fit_y2,
+  bar_y1 = CV5_evaluate$train_aucpr_std,
+  bar_y2 = CV5_evaluate$test_aucpr_std
+)
+
+iteraucpr_plot <- ggplot(CV5_evaluate, aes(x = iter)) +
+  geom_smooth(aes(y = train_aucpr_mean, color = "Train"), method = "loess", se = FALSE, size = 0.5) +
+  geom_smooth(aes(y = test_aucpr_mean, color = "Test"), method = "loess", se = FALSE, size = 0.5) +
+  geom_point(aes(y = train_aucpr_mean), color = "grey", alpha = 0) +
+  geom_point(aes(y = test_aucpr_mean), color = "grey", alpha = 0) +
+  geom_segment(data = df_bar.aucpr, aes(x = x, xend = x, y = y1 - bar_y1 / 2, yend = y1 + bar_y1 / 2), color = "grey", size = 0.5) +
+  geom_segment(data = df_bar.aucpr, aes(x = x, xend = x, y = y2 - bar_y2 / 2, yend = y2 + bar_y2 / 2), color = "grey", size = 0.5) +
+  scale_color_manual(values = c("Train" = "blue", "Test" = "green")) +
+  labs(x = "Iterations", y = "Area Under the Precision-Recall Curve", color = "Lines") +
+  theme_bw() +
+  ylim(0, 1) +
+  scale_y_continuous(breaks = seq(0, 1, by = 0.2), limits = c(0, 1)) + 
+  geom_vline(xintercept = 16, linetype = "dashed", color = "red") 
+
grid.arrange(iterauc_plot, iteraucpr_plot, ncol = 2)
+

+
+
+
+

PART 3: Model evaluation

+
+

Final Model Evaluation

+

The performance of CNA fingerprint on two independent test sets.

+
vali1roc <- readRDS("../data/vali1roc.rds")
+vali2roc <- readRDS("../data/vali2roc.rds")
+
+pROC::plot.roc(
+  vali1roc,
+  xlim = c(1, 0),
+  col = "darkgreen",
+  main = "ROC Curves of response prediction",
+  grid = c(0.1, 0.1),
+  print.thres.adj = c(0, 1),
+  add = FALSE
+)
+
+lines(vali2roc, col = "orange")
+
+legend("bottomright",
+  legend = c(
+    "Test1(AUROC=0.832)",
+    "Test2(AUROC=0.914)"
+  ), col = c("darkgreen", "orange"), lty = 1
+) 
+

+
+
+
+

PART 4: Biomarker comparison

+
+

4.1 HRD Performance Evaluation

+

HRDCNA can detect HRD from sWGS. Using the threshold values 0.2 +recommended by the tool guideline.

+
library(HRDCNA)
+library(caret)
+data(testdata)
+# nmfcn_wgs <- sigminercopy(data = cn_wgs, "hg19")
+# saveRDS(nmfcn_wgs, file = "./data/nmfcn_wgs.rds")
+
+nmfcn_wgs <- readRDS("../data/nmfcn_wgs.rds")
+validation.hrd <- readRDS("../data/validation_for_hrd.rds")
+
+validation.hrd.all <- validation.hrd %>%
+  dplyr::filter(batch_label %in% c("BATCH1", "BATCH2")) %>%
+  dplyr::select(all_of(colnames(nmfcn_wgs)))
+
+score_swgs <- HRDprediction(data = validation.hrd.all)
+
+Test.hrd_label <- validation.hrd %>%
+  dplyr::select(label, sample, batch_label) %>%
+  dplyr::inner_join(score_swgs) %>%
+  dplyr::mutate(hrd = ifelse(HRDCNAScore > 0.2, "response", "nonresponse")) %>%
+  dplyr::mutate(right = ifelse(hrd == label, "correct", "wrong"))
+
+Test.hrd_label1 <- Test.hrd_label %>% 
+  dplyr::filter(batch_label == "BATCH1")
+Test.hrd_label2 <- Test.hrd_label%>% 
+  dplyr::filter(batch_label == "BATCH2")
+
+test1.hrd <- confusionMatrix(
+  ifelse(Test.hrd_label1$hrd  == "response", 1, 0) %>% as.factor(),
+  ifelse(Test.hrd_label1$label == "response", 1, 0) %>% as.factor()
+)
+
+test2.hrd <- confusionMatrix(
+  ifelse(Test.hrd_label2$hrd  == "response", 1, 0) %>% as.factor(),
+  ifelse(Test.hrd_label2$label == "response", 1, 0) %>% as.factor()
+)
+
+print(paste0("The evaluation of HRD in Test 1 as followed: ", test1.hrd$overall[1]))
+
[1] "The evaluation of HRD in Test 1 as followed: 0.354838709677419"
+
test1.hrd$overall[1]
+
 Accuracy 
+0.3548387 
+
test1.hrd$byClass
+
         Sensitivity          Specificity       Pos Pred Value 
+           0.8181818            0.1000000            0.3333333 
+      Neg Pred Value            Precision               Recall 
+           0.5000000            0.3333333            0.8181818 
+                  F1           Prevalence       Detection Rate 
+           0.4736842            0.3548387            0.2903226 
+Detection Prevalence    Balanced Accuracy 
+           0.8709677            0.4590909 
+
print(paste0("The evaluation of HRD in Test 2 as followed: ", test2.hrd$overall[1]))
+
[1] "The evaluation of HRD in Test 2 as followed: 0.333333333333333"
+
test2.hrd$overall[1]
+
 Accuracy 
+0.3333333 
+
test2.hrd$byClass
+
         Sensitivity          Specificity       Pos Pred Value 
+           0.8750000            0.0625000            0.3181818 
+      Neg Pred Value            Precision               Recall 
+           0.5000000            0.3181818            0.8750000 
+                  F1           Prevalence       Detection Rate 
+           0.4666667            0.3333333            0.2916667 
+Detection Prevalence    Balanced Accuracy 
+           0.9166667            0.4687500 
+
+
+

4.2 compare to hotspot mutation

+
mut.df.train <- readRDS("../data/mut.df.train.rds")
+
+kras.cm <- confusionMatrix(
+  ifelse(mut.df.train$mutation  == "KRASmut", 1, 0) %>% as.factor(),
+  ifelse(mut.df.train$res == "yes", 1, 0) %>% as.factor()
+)
+print(paste0("KRAS evaluation as followed:"))
+
[1] "KRAS evaluation as followed:"
+
kras.cm$byClass
+
         Sensitivity          Specificity       Pos Pred Value 
+           0.5294118            0.3956044            0.4500000 
+      Neg Pred Value            Precision               Recall 
+           0.4736842            0.4500000            0.5294118 
+                  F1           Prevalence       Detection Rate 
+           0.4864865            0.4829545            0.2556818 
+Detection Prevalence    Balanced Accuracy 
+           0.5681818            0.4625081 
+
nras.cm <- confusionMatrix(
+  ifelse(mut.df.train$mutation  == "NRASmut", 1, 0) %>% as.factor(),
+  ifelse(mut.df.train$res == "yes", 1, 0) %>% as.factor()
+)
+print(paste0("NRAS evaluation as followed:"))
+
[1] "NRAS evaluation as followed:"
+
nras.cm$byClass
+
         Sensitivity          Specificity       Pos Pred Value 
+          1.00000000           0.01098901           0.48571429 
+      Neg Pred Value            Precision               Recall 
+          1.00000000           0.48571429           1.00000000 
+                  F1           Prevalence       Detection Rate 
+          0.65384615           0.48295455           0.48295455 
+Detection Prevalence    Balanced Accuracy 
+          0.99431818           0.50549451 
+
braf.cm <- confusionMatrix(
+  ifelse(mut.df.train$mutation  == "BRAFmut", 1, 0) %>% as.factor(),
+  ifelse(mut.df.train$res == "yes", 1, 0) %>% as.factor())
+print(paste0("BRAF evaluation as followed:"))
+
[1] "BRAF evaluation as followed:"
+
braf.cm$byClass
+
         Sensitivity          Specificity       Pos Pred Value 
+          0.97647059           0.01098901           0.47976879 
+      Neg Pred Value            Precision               Recall 
+          0.33333333           0.47976879           0.97647059 
+                  F1           Prevalence       Detection Rate 
+          0.64341085           0.48295455           0.47159091 
+Detection Prevalence    Balanced Accuracy 
+          0.98295455           0.49372980 
+
+
+ + + +
+
+
+
+
+ + + + + + + + + + + + diff --git a/pic/TCGA_CNF_distribution.png b/pic/TCGA_CNF_distribution.png new file mode 100644 index 0000000..4f0744d Binary files /dev/null and b/pic/TCGA_CNF_distribution.png differ diff --git a/pic/Train_CNF_distribution_T.png b/pic/Train_CNF_distribution_T.png new file mode 100644 index 0000000..66e6f75 Binary files /dev/null and b/pic/Train_CNF_distribution_T.png differ diff --git a/pic/Train_CNF_distribution_W.png b/pic/Train_CNF_distribution_W.png new file mode 100644 index 0000000..7bf0f37 Binary files /dev/null and b/pic/Train_CNF_distribution_W.png differ diff --git a/pic/criteria.png b/pic/criteria.png new file mode 100644 index 0000000..e8ed543 Binary files /dev/null and b/pic/criteria.png differ