Ruby Xiong and Simon Vandekar 2023-04-02
Gamma Mixture Models for Multiplexed Immunofluorescence Imaging Data
Install Package
if( !require(devtools) ) install.packages("devtools")
devtools::install_github( "statimagcoll/gluster" )
## Downloading GitHub repo statimagcoll/gluster@HEAD
## gdtools (0.3.2 -> 0.3.3) [CRAN]
## Installing 1 packages: gdtools
##
## The downloaded binary packages are in
## /var/folders/lf/krz2s0js7jj03g10q52y5mhc0000gn/T//RtmpMZHwA0/downloaded_packages
## ── R CMD build ─────────────────────────────────────────────────────────────────
## checking for file ‘/private/var/folders/lf/krz2s0js7jj03g10q52y5mhc0000gn/T/RtmpMZHwA0/remotes14c4a4173fbdc/statimagcoll-gluster-5c5c987/DESCRIPTION’ ... ✔ checking for file ‘/private/var/folders/lf/krz2s0js7jj03g10q52y5mhc0000gn/T/RtmpMZHwA0/remotes14c4a4173fbdc/statimagcoll-gluster-5c5c987/DESCRIPTION’
## ─ preparing ‘gluster’:
## checking DESCRIPTION meta-information ... ✔ checking DESCRIPTION meta-information
## ─ checking for LF line-endings in source and make files and shell scripts
## ─ checking for empty or unneeded directories
## Removed empty directory ‘gluster/vignettes’
## Omitted ‘LazyData’ from DESCRIPTION
## ─ building ‘gluster_0.1.0.tar.gz’
##
##
library(gluster)
Install instructions can be found at https://github.com/julia-wrobel/VectraPolarisData. After installation, run the following:
library(VectraPolarisData)
spe_lung <- HumanLungCancerV3()
# get the cell level imaging data with spatial coordinates
cell = cbind(as.data.frame(colData(spe_lung)), as.data.frame(spatialCoords(spe_lung)) )
Before fitting the model, the expression levels are normalized to reduce batch effect and variation among slides. Quantile boundaries and fixed value boundaries are picked based on prior knowledge.
allMarkers = grep('^entire_cell_.*total$', names(cell), value=TRUE)
# normalize the data
nzNormedMarkers = paste0('nzNorm_',allMarkers)
cell = do.call(rbind, lapply(split(cell, cell$slide_id), function(df){df[,nzNormedMarkers] = log10(1+sweep(df[,allMarkers], 2, colMeans(replace(df[,allMarkers], df[,allMarkers]==0, NA), na.rm = TRUE ), FUN = '/' ) ); df }) )
# split the data frame
cells = split(cell, cell$slide_id)
# set up boundary for all markers
quantileBoundaries = list(matrix(c(0,.8, .9, 1), byrow = TRUE, nrow=2),
matrix(c(0,.7, .8, 1), byrow = TRUE, nrow=2),
matrix(c(0,.6, .8, 1), byrow = TRUE, nrow=2),
matrix(c(0,.6, .8, 1), byrow = TRUE, nrow=2),
matrix(c(0,.7, .8, 1), byrow = TRUE, nrow=2),
NULL,
NULL,
NULL)
boundaries = list(matrix(c(0,.3, .5, 1), byrow = TRUE, nrow=2),
NULL,
NULL,
NULL,
matrix(c(0,.3, .45, Inf), byrow = TRUE, nrow=2),
matrix(c(0,.3, .5, Inf), byrow = TRUE, nrow=2),
matrix(c(0,.4, .45, Inf), byrow = TRUE, nrow=2),
NULL)
names(quantileBoundaries) = names(boundaries) = nzNormedMarkers
Here the fitting is run both with and without constraints. Fitting run without constraints are susceptible to statistically optimal results (local maximum) that are not reasonable biologically.
unconstrGMMfit = gluster(cells[[1]][,nzNormedMarkers])
constrGMMfit = gluster(cells[[1]][,nzNormedMarkers], boundaryMarkers = boundaries,
qboundaryMarkers=quantileBoundaries)
# check convergence
convCheck(constrGMMfit)
# visualize constrained fits with quantile drawn on histogram
plots_hist <- sapply(nzNormedMarkers, function(x) plot(constrGMMfit, marker = x, boundary = quantile(constrGMMfit$expressionX[,x], probs=quantileBoundaries[[x]][2,1]), title=x))
# comparing constrained and unconstrained fits
contrast_hist = sapply(nzNormedMarkers, function(x) hist_sr_constrast(constrGMMfit, unconstrGMMfit, marker=x, title=x) )
Sub-batches can be used if there are multiple regions per slide to assess the fit within each region. The model is fit at the slide level and then model fit is assessed by batch.
# fitting on the same slide and getting model fit results by batch
batchConstrGMMfit = gluster(cells[[1]][,nzNormedMarkers], boundaryMarkers = boundaries,
qboundaryMarkers=quantileBoundaries, subBatch = cells[[1]]$sample_id)
# visualize by subBatch for the first markers
trash = sapply(unique(cells[[1]]$sample_id), function(x) plot(batchConstrGMMfit, subBatch = x, title=x) )
We can use mclapply from the parallel package to run gluster on multiple
slides. The example line below runs on a subset of 3 slides from the
cell
data frame.
Alternatively, use groupGluster:
# subset of cell, first 3 slides
slide3 <- names(cells)[1:3]
cells3 <- cell[cell$slide_id %in% slide3,]
# fit cfGMM with groupGluster
constrCfGMMbunch <- groupGluster(cells3[,nzNormedMarkers], slide = cells3$slide_id,
boundaryMarkers = boundaries,qboundaryMarkers=quantileBoundaries, n.cores = 5)
Check for convergence:
convCheck(constrCfGMMbunch)
## [1] "All converged."
Before plotting and looking over the (# slides
for(i in nzNormedMarkers){
plot(constrCfGMMbunch, diagnostic=TRUE, interactive=FALSE, histogram=FALSE, marker=i, title=i)
}
slides.fit <- names(constrCfGMMbunch)
for(i in slides.fit){
cat('\n#### Slide:', i, " \n")
for(j in nzNormedMarkers){
plot(constrCfGMMbunch, marker=j, slide=i, title=paste0(i,"\n",j), diagnostic=FALSE, histogram=TRUE,
boundary = quantile(constrCfGMMbunch[[i]]$expressionX[,j], probs=quantileBoundaries[[j]][2,1]))
}
cat('\n\n')
}
Again, we can compare it fitting with no boundaries.
# same as above , but fitting without boundaries
CfGMMbunch = groupGluster(cells3[,nzNormedMarkers], slide = cells3$slide_id,n.cores = 5)
convCheck(CfGMMbunch)
[1] “All converged.”
for(i in slides.fit){
cat('\n#### Slide:', i, " \n")
for(j in nzNormedMarkers){
hist_sr_constrast(constrCfGMMbunch, CfGMMbunch, marker=j, slide=i, title=paste0(i,"\n",j), diagnostic=FALSE, histogram=TRUE,boundary = quantile(constrGMMbunch[[i]]$expressionX[,j], probs=quantileBoundaries[[j]][2,1]))
}
cat('\n\n')
}
We can overlay the diagnostic plot to further check what are the difference between the two fitted models.
for(i in nzNormedMarkers){
print(diag_contrast(constrCfGMMbunch, CfGMMbunch, marker=i, title=i, fit.names = c("Constrained", "Un-constrained")))
}
To compare agreement between two binomial variables, we can use cohen’s kappa or rand index. Here is the usage of the function with simulated data:
test1 <- matrix(sample(c(0,1),400, replace = TRUE), nrow=200)
test2 <- matrix(sample(c(0,1),400, replace = TRUE), nrow=200)
standard.mat <- matrix(sample(c(0,1),400, replace = TRUE), nrow=200)
colnames(test1) <- colnames(test2) <- colnames(standard.mat) <- c("a", "b")
evaluateGroupGluster(test1, test2, standard=standard.mat, method="cohen",batch=rep(1:5, each=10))
## a b batch method
## 1 -0.25000000 -0.19047619 1 method1
## 2 0.04040404 0.26108374 2 method1
## 3 0.01234568 -0.18518519 3 method1
## 4 -0.10000000 0.15422886 4 method1
## 5 0.05472637 -0.15662651 5 method1
## 11 -0.10000000 0.18987342 1 method2
## 21 -0.06060606 0.10000000 2 method2
## 31 0.13253012 -0.25000000 3 method2
## 41 -0.26903553 0.19799499 4 method2
## 51 -0.22448980 0.07317073 5 method2
evaluateGroupGluster(test1, test2, standard=standard.mat, method="rand",batch=rep(1:5, each=10))
## a b batch method
## 1 0.03789224 0.037892244 1 method1
## 2 0.02271383 0.038423606 2 method1
## 3 0.02473150 0.016257759 3 method1
## 4 0.01578532 0.002920169 4 method1
## 5 0.02344027 0.015481708 5 method1
## 11 0.01578532 0.016257759 1 method2
## 21 0.02271383 0.015785320 2 method2
## 31 0.01528449 0.037892244 3 method2
## 41 0.03842361 0.014859985 4 method2
## 51 0.01561761 0.022471910 5 method2