vignette

victorvelasco · May 10, 2023 · 1e5c16e · 1e5c16e
1 parent b9cf0d7
commit 1e5c16e
Show file tree

Hide file tree

Showing 3 changed files with 139 additions and 18 deletions.
diff --git a/R/hdpExtra_diagnostic_plots.R b/R/hdpExtra_diagnostic_plots.R
@@ -20,3 +20,21 @@ hdpExtra_diagnostic_plots <- function(hdpExtraChainMulti, param_names) {
     # ggmcmc::ggs_compare_partial(ggs_object)
   )
 }
+
+#' Autocorrelation plots for the concentration parameters
+#'
+#' @param hdpExtraChainMulti An object of class HdpExtraChainMulti
+#' @param param_names Name of the concentration parameters, to display in
+#' the diagnostic plots
+#'
+#' @export
+hdpExtra_acf <- function(hdpExtraChainMulti, param_names) {
+  list_cp_values <- lapply(
+    1:length(hdpExtraChainMulti@cp_values),
+    function(i) coda::as.mcmc(hdpExtraChainMulti@cp_values[[i]][-(1:chlist@burnin[i]), ])
+  )
+  list_cp_values <- coda::as.mcmc.list(list_cp_values)
+  coda::varnames(list_cp_values) <- param_names
+  list_cp_values <- ggmcmc::ggs(list_cp_values, keep_original_order = TRUE)
+  list(ggmcmc::ggs_autocorrelation(list_cp_values, nLags = 200))
+}
diff --git a/rds/chlist.rds b/rds/chlist.rds
diff --git a/vignettes/uncertainty_mutational_signatures2.Rmd b/vignettes/uncertainty_mutational_signatures2.Rmd
@@ -51,7 +51,7 @@ hdp_mut <- hdp::hdp_init(ppindex = c(0, rep(1, each = nrow(breast_counts))), # i
                          cpindex = c(1, rep(2, each = nrow(breast_counts))), # index of concentration param
                          hh=rep(1, 96), # prior is uniform over the 96 mutation categories
                          alphaa=c(1.0,1.0), # shape hyperparams for five different CPs
-                         alphab=c(0.1,1.0)) # rate hyperparams for five different CPs
+                         alphab=c(0.1,0.1)) # rate hyperparams for five different CPs
 
 # add data to leaf nodes (one per cancer sample, in row order of mut_count)
 hdp_mut <- hdp::hdp_setdata(hdp_mut,
@@ -82,13 +82,13 @@ we discourage users from setting $G_j \sim \text{DP}(\alpha_j,G_0)$.
 
 Concentration parameters, $\gamma$ and $\alpha$, determine how the number of
 mutational signatures grows with the sample size, in the sample and in individual
-patients respectively. Because this number is not know in advance, we set a
-vague (hyper)prior on both hyperparameters. Following the original work of Teh et al.,
+patients respectively. Because this number is not known in advance, we place a
+vague (hyper)prior on both hyper-parameters. Following the original work of Teh et al.,
 we set the following vague gamma priors:
 $$
 \begin{align}
 \gamma &\sim \Gamma(1, 0.1) \\
-\alpha &\sim \Gamma(1, 1)
+\alpha &\sim \Gamma(1, 0.1)
 \end{align}
 $$
 
@@ -107,19 +107,19 @@ We set the initial number of cluster to a number much higher than the one we exp
 In this example, we set the initial number of cluster to 100.
 
 ```{r}
-chlist <- vector("list", 2)
-for (i in 1:2){
-
-  # activate DPs, 10 initial components
-  hdp_activated <- hdp::dp_activate(hdp_mut, 1:numdp(hdp_mut), initcc=100, seed=i*200)
-
-  chlist[[i]] <- hdpExtra_posterior(hdp_activated,
-                               burnin=100,
-                               n=10,
-                               space=10,
-                               cpiter=3,
-                               seed=i*1e3)
-}
+library(parallel)
+
+Sys.time()
+chlist <- mclapply(1:4, function(i) {
+  hdp_activated <- dp_activate(hdp_mut, 1:numdp(hdp_mut), initcc=200, seed=i*200)
+  return(hdpExtra_posterior(hdp_activated,
+                       burnin=10000,
+                       n=100,
+                       space=100,
+                       cpiter=3,
+                       seed=i*1e3))
+})
+Sys.time()
 ```
 
 Now chlist contains a list of objects of class HdpExtraChain. In turn, each such
@@ -137,8 +137,111 @@ chlist <- HdpExtraChainMulti(chlist)
 
 # Assessig convergence
 
-# Plotting mutational signatures
+## Diagnostic plots
+Provided with output from the MCMC sampler, it is important to check convergence
+of the MCMC chains. For this purpose, we run several chains in parallel. hdpExtra 
+provides standard diagnostic plots, to check that the chains appear to converge
+to the same mode of the posterior distribution. 
+To assess convergence, it is important to assess the posterior over the parameters of 
+interest at every iteration of the sampler, and not just at those iterations that were
+saved in memory for inferece.
+
+```{r}
+hdpExtra_diagnostic_plots(chlist, c("gamma", "alpha"))
+```
+The first figure indicates that the marginal posteriors of the concentration parameters,
+as sampled from each of the four chains, have converged to the same region of the 
+posterior distribution. The second figure displays the traceplots for the number of clusters 
+sampled in each of the four chains. is similar in all four chains. Having initialised each
+chain with 200 clusters, it can be observed that the sampler merges the initial clusters
+into a smaller number of around 25-35 in each of the four chains. Once the sampler has
+converged, it attempts to split and merge clusters but the model dimension stays stable
+at around 25-35. 
+
+It should be noted that some of these clusters are artefacts of the model
+and will not appear consistently across iterations of the sampler. Mutations attributed to
+those artefactual clusters at a certain iteration of the samplers will be attributed to 
+different ones at other iterations. It is thus important that the partition that we report
+is representative of the MCMC output. 
+
+## Autocorrelation function (ACF)
+
+Despite our demanding hundreds of thousands of iterations, in general it is not
+practical to store the MCMC output for such a large number of iterations. Instead,
+we recommend to store every $s$th draw, with $s$ large enough so that stored 
+draws of the marginal posterior over the concentration parameters do not show
+autocorrelation.
+
+```{r}
+hdpExtra_acf(chlist, c("gamma", "alpha"))
+````
+
+# Post-processing the posterior distribution
+
+Having obtained $S = 20$ draws from the posterior distribution, the challenging task is now
+to summarise the posterior distribution and to quantify the uncertainty around the reported
+parameters. To do so, we follow (Molitor et al. 2010) in choosing a "model averaging" approach s.t.
+
+* We aim to obtain a partition $z^{\text{best}}$ that represents the center of the posterior distibution.
+* For that chosen partition, we assess the uncertainty around the signature parameters.
+
+## Obtaining a most representative partition with SALSO
+
+First, we construct the allocations matrix from the different chains:
+
+```{r}
+chlist <- HdpExtraChainMulti(chlist)
+```
+
+Provided with a matrix of dimension $N \times S$, where each column represents a partition of
+the $N$ mutations, we aim to find the center of the distribution.
+
+
+
+```{r}
+best_partition <- salso(t(hdp_allocations(chlist)), loss = VI(a = 0.5))
+table(best_partition)
+```
+
+Here is the table of exposures:
+
+```{r}
+J <- ncol(counts_gbm)# Number of patients
+K <- max(best_partition) # Number of clusters (signatures)
+
+nmuts <- colSums(counts_gbm)
+index_end <- cumsum(nmuts)
+index_beg <- c(0, index_end[1:(length(nmuts)-1)])+1
+names(index_beg) <- names(index_end)
+
+sort(table(best_partition), decreasing = TRUE)
+
+exposures <- matrix(0, nrow = J, ncol = K)
+rownames(exposures) <- colnames(counts_gbm)
+colnames(exposures) <- paste("Sig", 01:K)
+
+for (j in 1:J) {
+  patient_allocs <- table(best_partition[index_beg[j]:index_end[j]])
+  exposures[j, as.integer(names(patient_allocs))] <- unname(patient_allocs)
+}
 
+knitr::kable(exposures)
+```
+
+
+Now that we are provided with a best partition, we proceed to evaluate
+the uncertainty around that partition. To do so, we use the function
+<tt>hdp_postprocessing</tt>:
+```{r}
+Phi_best <- hdpExtra::hdp_postprocessing(chlist, best_partition)
+dim(Phi_best)
+```
+
+Now we can plot the signatures:
+
+```{r}
+hdp_plot_sig_uncertainty(Phi_best)
+```
 # Compatibility with hdp
 
 The output of hdpExtra_posterior can also be used for the post-processing tools