improved seurat

Matiss Ozols committed Jun 18, 2024
1 parent e85cabe commit 3003d72
Showing 7 changed files with 404 additions and 136 deletions.
269 changes: 153 additions & 116 deletions bin/2.integrate.R

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions bin/2.process_donor_data_for_integration.R
@@ -108,13 +108,13 @@ for (donor in names(sobj_per_donor)) {

# Check number of PCs
dims2 <- dim(sobj_per_donor[[donor]]@assays$RNA$counts)
- num_pcs <- min(20, dims2[2] - 1)
- if (num_pcs < 20) {
+ num_pcs <- min(50, dims2[2] - 1)
+ if (num_pcs < 50) {
donors_to_drop <- append(donors_to_drop, donor)
}

# Run PCA on all variable genes
- sobj_per_donor[[donor]] <- RunPCA(sobj_per_donor[[donor]], dims = 1:10, npcs = 20, verbose = FALSE)
+ sobj_per_donor[[donor]] <- RunPCA(sobj_per_donor[[donor]], dims = 1:10, npcs = num_pcs, verbose = FALSE)
p2 <- DimPlot(sobj_per_donor[[donor]]) + ggtitle('Before cell cycle correction (all genes)')
sobj_per_donor[[donor]]@project.name <- paste0(sample_name, '-', donor)
}
139 changes: 139 additions & 0 deletions bin/totalVI.py
@@ -0,0 +1,139 @@
#!/usr/bin/env python

__date__ = '2020-05-26'
__version__ = '0.0.1'

import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import gridspec
import numpy as np
import pandas as pd
import scipy as sci
import scanpy as sc
import copy
import scvi
import argparse
import glob



# This script runs totalVI on the normalised single-cell object together with the CITE-seq protein data,
# then writes an integrated AnnData object and UMAP plots.
parser = argparse.ArgumentParser(
description="""
Collect inputs
"""
)

parser.add_argument(
'-h5ad_file', '--h5ad_file',
action='store',
dest='h5ad_file',
required=True,
help='Input h5ad_file after normalisation'
)



options = parser.parse_args()

sc.set_figure_params(figsize=(5,5), dpi=150)

#import single cell data and CITE-seq data
# SLEmap = sc.read('adata-normalized.h5ad')
SLEmap = sc.read(options.h5ad_file)

all_cite_files = glob.glob("./*/*.matrix.csv")
CITE = pd.DataFrame()
for f1 in all_cite_files:
c1 = pd.read_csv(f1,index_col=0)
CITE=pd.concat([CITE,c1])

# Merge the CITE-seq data into the obs of the single-cell object
SLEmap.obs["Barcode"] = SLEmap.obs.index.str.split('__').str[0]
SLEmap.obs = SLEmap.obs.merge(CITE, left_on=['Barcode'],right_index=True,how='left', indicator=True)


# Store the CITE-seq data in obsm of the single-cell object: required for totalVI
CITE_2 = SLEmap.obs[CITE.columns].copy()
SLEmap.obsm['protein_expression'] = CITE_2

# keep only cells passing QC and highly variable genes
SLEmap = SLEmap[SLEmap.obs["cell_passes_qc"],:]
SLEmap = SLEmap[SLEmap.obs["cell_passes_hard_filters"],:]
SLEmap = SLEmap[:,SLEmap.var["highly_variable"]]
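# (Assumption: cell_passes_qc and cell_passes_hard_filters are boolean columns in .obs,
# and highly_variable is a boolean column in .var, as produced by the upstream QC step.)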

#run totalVI
SLEmap = SLEmap.copy()
scvi.model.TOTALVI.setup_anndata(SLEmap, protein_expression_obsm_key="protein_expression",batch_key="experiment_id")
model = scvi.model.TOTALVI(SLEmap,latent_distribution="normal",n_layers_decoder=2)
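# batch_key="experiment_id" registers the experiment as totalVI's batch covariate,
# so cells from different experiments are integrated in the shared latent space.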

model.train()
model.save("./scvi_model_300samples",adata=SLEmap, overwrite=True)


# Get latent expression from model: used for UMAP calculations
SLEmap.obsm["X_totalVI"] = model.get_latent_representation()


# Get denoised protein values: this can fail on low memory (use a minimum of 400 for n_samples=25).
# If it keeps failing, lower n_samples, or skip this step and use the dsb values instead.
rna, protein = model.get_normalized_expression(n_samples=25,return_mean=True)
SLEmap.obsm["denoised_protein"] = protein

sc.pp.neighbors(SLEmap, use_rep="X_totalVI",n_neighbors=15)
sc.tl.umap(SLEmap)


sc.set_figure_params(figsize=(5,5), dpi=150)

def panel_grid(hspace, wspace, ncols, num_panels):
"""Init plot."""
n_panels_x = min(ncols, num_panels)
n_panels_y = np.ceil(num_panels / n_panels_x).astype(int)
if wspace is None:
# try to set a wspace that is not too large or too small given the
# current figure size
wspace = 0.75 / rcParams['figure.figsize'][0] + 0.02
# each panel will have the size of rcParams['figure.figsize']
fig = plt.figure(
figsize=(
n_panels_x * rcParams['figure.figsize'][0] * (1 + wspace),
n_panels_y * rcParams['figure.figsize'][1],
)
)
left = 0.2 / n_panels_x
bottom = 0.13 / n_panels_y
gs = gridspec.GridSpec(
nrows=n_panels_y,
ncols=n_panels_x,
left=left,
right=1 - (n_panels_x - 1) * left - 0.01 / n_panels_x,
bottom=bottom,
top=1 - (n_panels_y - 1) * bottom - 0.1 / n_panels_y,
hspace=hspace,
wspace=wspace
)
return fig, gs

fig, grid = panel_grid(
hspace=0.125*2,
wspace=None,
ncols=4,
num_panels=2
)

sc.pl.umap(
SLEmap,
color=["Azimuth:predicted.celltype.l1","Azimuth:predicted.celltype.l2"],
# Set a smaller point size to prevent overlap
size=0.7,save='plots.pdf'
)

fig.savefig(
'umap.png',
#dpi=300,
bbox_inches='tight'
)

SLEmap.write("./totalVI_integrated.h5ad")
36 changes: 33 additions & 3 deletions modules/nf-core/modules/citeseq/main.nf
@@ -75,8 +75,7 @@ process DSB {
"""
}

-process DSB_INTEGRATE{
-
+process COMBINE_INTEGRATIONS{
label 'process_medium'
tag "${sample_name}"
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
@@ -91,6 +90,36 @@ process DSB_INTEGRATE{
output:
// path("out"), emit: outdir
path("*all_samples_integrated.RDS"), emit: tmp_rds_file

input:
path(rds_files)

script:

"""
echo 'running1'
2.combine.R
"""

}


process DSB_INTEGRATE{

label 'process_high'
tag "${sample_name}"
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://yascp.cog.sanger.ac.uk/public/singularity_images/azimuth_dsb_6_03_2024.sif"
} else {
container "mercury/azimuth_dsb:6_03_2024"
}

publishDir path: "${params.outdir}/citeseq/all_data_integrated", mode: "${params.copy_mode}",
overwrite: "true"

output:
// path("out"), emit: outdir
path("*__integration_*.RDS"), emit: tmp_rds_file
path("figures__*/*"), emit: figs


@@ -103,6 +132,7 @@ process DSB_INTEGRATE{
val(ndim_sct)
val(ndim_citeBgRemoved)
val(ndim_cite_integrated)
+val(method)
script:

if (vars_to_regress == ''){
@@ -111,7 +141,7 @@ process DSB_INTEGRATE{

"""
echo 'running1'
-2.integrate.R ${vars_to_regress} ${k_anchor} ${dims} ${ndim_sct} ${ndim_citeBgRemoved} ${ndim_cite_integrated}
+2.integrate.R ${vars_to_regress} ${k_anchor} ${dims} ${ndim_sct} ${ndim_citeBgRemoved} ${ndim_cite_integrated} ${method}
"""

}
29 changes: 29 additions & 0 deletions modules/nf-core/modules/totalVi/main.nf
@@ -0,0 +1,29 @@
process TOTAL_VI_INTEGRATION{
label 'process_medium'
if (params.utilise_gpu){
label 'gpu'
}else{
label 'process_medium'
// label 'process_low'
}

if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/sle_project/5.sle_noCB_fullRun_MAD_perPool/yascp_totalvi_v1.sif"
} else {
container "wtsihgi/nf_scrna_qc:6bb6af5"
}

input:
path(adata)
path(citedata)

// output:
// val(outdir, emit: outdir)

script:

"""
totalVI.py -h5ad_file ${adata}
"""

}
47 changes: 37 additions & 10 deletions subworkflows/qc.nf
@@ -14,7 +14,8 @@ include {UMAP; UMAP as UMAP_HARMONY; UMAP as UMAP_BBKNN;} from "$projectDir/modu
include {CLUSTERING; CLUSTERING as CLUSTERING_HARMONY; CLUSTERING as CLUSTERING_BBKNN;} from "$projectDir/modules/nf-core/modules/clustering/main"
include {CELL_HARD_FILTERS} from "$projectDir/modules/nf-core/modules/cell_hard_filters/main"
include {DONT_INTEGRATE} from "$projectDir/modules/nf-core/modules/reduce_dims/main"
-include { DSB_PROCESS; PREPROCESS_PROCESS; DSB_INTEGRATE; MULTIMODAL_INTEGRATION; VDJ_INTEGRATION } from '../modules/nf-core/modules/citeseq/main'
+include {TOTAL_VI_INTEGRATION} from "$projectDir/modules/nf-core/modules/totalVi/main"
+include { DSB_PROCESS; PREPROCESS_PROCESS; DSB_INTEGRATE as DSB_INTEGRATE_SCT;DSB_INTEGRATE as DSB_INTEGRATE_CITE;DSB_INTEGRATE as DSB_INTEGRATE_cite_bgRemoved; MULTIMODAL_INTEGRATION; VDJ_INTEGRATION } from '../modules/nf-core/modules/citeseq/main'

workflow qc {
take:
@@ -86,6 +87,10 @@ workflow qc {
NORMALISE_AND_PCA.out.sample_QCd_adata.flatten().map{sample -> tuple("${sample}".replaceFirst(/___sample_QCd_adata.h5ad/,"").replaceFirst(/.*\//,""),sample)}.set{alt_input}
channel_dsb2 = channel_dsb.combine(alt_input, by: 0)
DSB_PROCESS(channel_dsb2)
+// SCT,CITE,cite_bgRemoved
+if(params.totalVi.run_process){
+TOTAL_VI_INTEGRATION(NORMALISE_AND_PCA.out.anndata,DSB_PROCESS.out.citeseq_rsd.collect())
+}

DSB_PROCESS.out.ch_for_norm.subscribe { println "1:: DSB_PROCESS.out.ch_for_norm: $it" }

@@ -100,24 +105,46 @@ workflow qc {
matched_donors.subscribe { println "1:: matched_donors $it" }
PREPROCESS_PROCESS(inp4,params.reduced_dims.vars_to_regress.value)

DSB_INTEGRATE(
DSB_INTEGRATE_SCT(
PREPROCESS_PROCESS.out.tmp_rsd.collect(),
params.reduced_dims.vars_to_regress.value,
params.reduced_dims.seurat_integration.k_anchor,
params.reduced_dims.seurat_integration.dims,
params.reduced_dims.seurat_integration.ndim_sct,
params.reduced_dims.seurat_integration.ndim_citeBgRemoved,
params.reduced_dims.seurat_integration.ndim_cite_integrated
)

MULTIMODAL_INTEGRATION(
DSB_INTEGRATE.out.tmp_rds_file,
params.reduced_dims.seurat_integration.ndim_cite_integrated,
'SCT'
)

VDJ_INTEGRATION(
chanel_cr_outs.collect(),
MULTIMODAL_INTEGRATION.out.wnn_integrated_file
DSB_INTEGRATE_CITE(
PREPROCESS_PROCESS.out.tmp_rsd.collect(),
params.reduced_dims.vars_to_regress.value,
params.reduced_dims.seurat_integration.k_anchor,
params.reduced_dims.seurat_integration.dims,
params.reduced_dims.seurat_integration.ndim_sct,
params.reduced_dims.seurat_integration.ndim_citeBgRemoved,
params.reduced_dims.seurat_integration.ndim_cite_integrated,
'CITE'
)
DSB_INTEGRATE_cite_bgRemoved(
PREPROCESS_PROCESS.out.tmp_rsd.collect(),
params.reduced_dims.vars_to_regress.value,
params.reduced_dims.seurat_integration.k_anchor,
params.reduced_dims.seurat_integration.dims,
params.reduced_dims.seurat_integration.ndim_sct,
params.reduced_dims.seurat_integration.ndim_citeBgRemoved,
params.reduced_dims.seurat_integration.ndim_cite_integrated,
'cite_bgRemoved'
)

// MULTIMODAL_INTEGRATION(
// DSB_INTEGRATE.out.tmp_rds_file,
// )

// VDJ_INTEGRATION(
// chanel_cr_outs.collect(),
// MULTIMODAL_INTEGRATION.out.wnn_integrated_file
// )
}


14 changes: 10 additions & 4 deletions workflows/yascp.nf
@@ -201,10 +201,16 @@ workflow YASCP {

if (params.citeseq){

-vireo_paths = params.outdir
-? Channel.fromPath("${params.outdir}/deconvolution/vireo/*/vireo_*", checkIfExists:true, type: 'dir')
-: Channel.fromPath("${launchDir}/${params.outdir}/deconvolution/vireo/*/vireo_*", type: 'dir')

+// vireo_paths1 = params.outdir
+// ? Channel.fromPath("${params.outdir}/deconvolution/vireo/*/vireo_*", checkIfExists:true, type: 'dir')
+// : Channel.fromPath("${launchDir}/${params.outdir}/deconvolution/vireo/*/vireo_*", checkIfExists:true, type: 'dir')
+vireo_paths2 = params.outdir
+? Channel.fromPath("${params.outdir}/deconvolution/vireo/vireo_*", checkIfExists:true, type: 'dir')
+: Channel.fromPath("${launchDir}/${params.outdir}/deconvolution/vireo/vireo_*", checkIfExists:true, type: 'dir')
+vireo_paths3 = params.outdir
+? Channel.fromPath("${params.outdir}/deconvolution/vireo/*/*/vireo_*", checkIfExists:true, type: 'dir')
+: Channel.fromPath("${launchDir}/${params.outdir}/deconvolution/vireo/*/*/vireo_*", checkIfExists:true, type: 'dir')
+vireo_paths = vireo_paths3.mix(vireo_paths2)//.mix(vireo_paths3)
GENOTYPE_MATCHER(vireo_paths.collect())
matched_donors = GENOTYPE_MATCHER.out.matched_donors
}else{
