Merge pull request #56 from Mye-InfoBank/implement-modes

Implement multiple pipeline modes
Mye-InfoBank · Apr 8, 2024 · 122d5e5 · 122d5e5
2 parents 5d95205 + 7616379
commit 122d5e5
Show file tree

Hide file tree

Showing 43 changed files with 519 additions and 333 deletions.
diff --git a/bin/merge_datasets.py b/bin/merge_datasets.py
@@ -4,53 +4,53 @@
 import anndata as ad
 import scanpy as sc
 from scipy.sparse import csr_matrix
-import numpy as np
-
-
 
 parser = argparse.ArgumentParser(description="Merge datasets")
 parser.add_argument("--input", help="Input file", type=str, nargs="+")
-parser.add_argument("--output_integration", help="Output file containing only cells which do not require transfer learning", type=str)
-parser.add_argument("--output_intersection", help="Output file containing all cells but gene intersection", type=str)
-parser.add_argument("--output_transfer", help="Output file containing all cells which require transfer learning", type=str)
-parser.add_argument("--output_counts", help="Output file, outer join of cells and genes", type=str)
+parser.add_argument("--base", help="Base dataset to use as reference", type=str, required=False)
+parser.add_argument("--output_intersection", help="Output file containing all cells but gene intersection", type=str, required=True)
+parser.add_argument("--output_union", help="Output file, outer join of cells and genes", type=str, required=True)
+parser.add_argument("--output_transfer", help="Output file, cells to project onto base", type=str, required=False)
 parser.add_argument("--min_cells", help='Minimum number of cells to keep a gene', type=int, required=False, default=50)
 parser.add_argument("--custom_genes", help="Additional genes to include", type=str, nargs="*")
 
 args = parser.parse_args()
 
 datasets = [ad.read_h5ad(f) for f in args.input]
 
-adata = ad.concat(datasets)
-adata_outer = ad.concat(datasets, join='outer')
+if args.base:
+    if not args.output_transfer:
+        raise ValueError("Transfer file required when using base dataset")
+
+    adata_base = ad.read_h5ad(args.base)
+    datasets = [adata_base] + datasets
+
+adata_intersection = ad.concat(datasets)
+adata_union = ad.concat(datasets, join='outer')
 
-additional_genes = [gene for gene in args.custom_genes if gene not in adata.var_names and gene in adata_outer.var_names]
+additional_genes = [gene for gene in args.custom_genes if gene not in adata_intersection.var_names and gene in adata_union.var_names]
 
 # Add custom genes from outer join to the intersection
 if additional_genes:
-    adata_additional = adata_outer[adata.obs_names, additional_genes]
-    adata_concatenated = ad.concat([adata, adata_additional], join="outer", axis=1)
-    adata_concatenated.obs, adata_concatenated.obsm = adata.obs, adata.obsm
-    adata = adata_concatenated
+    adata_additional = adata_union[adata_intersection.obs_names, additional_genes]
+    adata_concatenated = ad.concat([adata_intersection, adata_additional], join="outer", axis=1)
+    adata_concatenated.obs, adata_concatenated.obsm = adata_intersection.obs, adata_intersection.obsm
+    adata_intersection = adata_concatenated
 
 # Convert to CSR matrix
-adata.X = csr_matrix(adata.X)
-adata_outer.X = csr_matrix(adata_outer.X)
-
-# Filter genes with no counts in core atlas
-gene_mask, _ = sc.pp.filter_genes(adata[~adata.obs["transfer"]], min_cells=1, inplace=False)
-adata = adata[:, gene_mask]
+adata_intersection.X = csr_matrix(adata_intersection.X)
+adata_union.X = csr_matrix(adata_union.X)
 
 # Filter cells with no counts
-cell_mask, _ = sc.pp.filter_cells(adata, min_genes=1, inplace=False)
-adata = adata[cell_mask, :]
-adata_outer = adata_outer[cell_mask, :]
+cell_mask, _ = sc.pp.filter_cells(adata_intersection, min_genes=1, inplace=False)
+adata_intersection = adata_intersection[cell_mask, :]
+adata_union = adata_union[cell_mask, :]
 
 # Filter genes with too few occurrences in outer join
-sc.pp.filter_genes(adata_outer, min_cells=args.min_cells)
+sc.pp.filter_genes(adata_union, min_cells=args.min_cells)
 
-adata.obs["batch"] = adata.obs["dataset"].astype(str) + "_" + adata.obs["batch"].astype(str)
-adata.obs["patient"] = adata.obs["dataset"].astype(str) + "_" + adata.obs["patient"].astype(str)
+adata_intersection.obs["batch"] = adata_intersection.obs["dataset"].astype(str) + "_" + adata_intersection.obs["batch"].astype(str)
+adata_intersection.obs["patient"] = adata_intersection.obs["dataset"].astype(str) + "_" + adata_intersection.obs["patient"].astype(str)
 
 def to_Florent_case(s: str):
     corrected = s.lower().strip()
@@ -77,25 +77,25 @@ def to_Florent_case(s: str):
 
     return corrected[0].upper() + corrected[1:]
 
-for column in adata.obs.columns:
-    if column == "transfer":
-        continue
-    if not adata.obs[column].dtype.name == "category" and not adata.obs[column].dtype.name == "object":
+for column in adata_intersection.obs.columns:
+    if not adata_intersection.obs[column].dtype.name == "category" and not adata_intersection.obs[column].dtype.name == "object":
         continue
     # Convert first to string and then to category
-    adata.obs[column] = adata.obs[column].astype(str).fillna("Unknown").apply(to_Florent_case).astype("category")
+    adata_intersection.obs[column] = adata_intersection.obs[column].astype(str).fillna("Unknown").apply(to_Florent_case).astype("category")
+
+adata_union.obs = adata_intersection.obs
+
+adata_intersection.layers["counts"] = adata_intersection.X
+adata_union.layers["counts"] = adata_union.X
 
-adata_outer.obs = adata.obs
+if args.base:
+    adata_transfer = adata_intersection[~adata_intersection.obs.index.isin(adata_base.obs.index)]
 
-adata.layers["counts"] = adata.X
-adata_outer.layers["counts"] = adata_outer.X
+    known_celltypes = adata_base.obs["cell_type"].unique()
+    adata_transfer.obs["cell_type"] = adata_transfer.obs["cell_type"].map(lambda x: x if x in known_celltypes else "Unknown")
 
-if any(adata.obs["transfer"]):
-    adata_transfer = adata[adata.obs["transfer"]]
     adata_transfer.write_h5ad(args.output_transfer)
 
-adata_notransfer = adata[~adata.obs["transfer"]]
-adata_notransfer.write_h5ad(args.output_integration)
+adata_intersection.write_h5ad(args.output_intersection)
+adata_union.write_h5ad(args.output_union)
 
-adata.write_h5ad(args.output_intersection)
-adata_outer.write_h5ad(args.output_counts)
diff --git a/bin/preprocess.py b/bin/preprocess.py
@@ -13,7 +13,6 @@
     "patient": True,
     "tissue": True,
     "dataset": True,
-    "transfer": True
 }
 
 parser = argparse.ArgumentParser(description="Filter dataset")
@@ -22,7 +21,6 @@
 parser.add_argument("--output", help="Output file", type=str)
 parser.add_argument("--problems", help="Problems file", type=str)
 parser.add_argument("--no-symbols", help="Convert varnames to gene symbols", action="store_true")
-parser.add_argument("--transfer", help="Apply transfer leanring on dataset", action="store_true")
 parser.add_argument("--sure_raw", help="Skip check for raw counts", action="store_true")
 parser.add_argument("--custom_metadata", help="Additional metadata columns to include", type=str, nargs="*")
 
@@ -57,7 +55,6 @@ def aggregate_duplicate_var(adata, aggr_fun=np.mean):
 print("Reading input")
 adata = sc.read_h5ad(args.input)
 adata.obs["dataset"] = args.id
-adata.obs["transfer"] = args.transfer
 
 if adata.__dict__["_raw"] and "_index" in adata.__dict__["_raw"].__dict__["_var"]:
     adata.__dict__["_raw"].__dict__["_var"] = (

diff --git a/conf/modes.config b/conf/modes.config
@@ -0,0 +1,39 @@
+params {
+    leiden_resolutions = [0.25, 0.5, 0.75, 1, 1.5, 2]
+    entropy = false
+    entropy_initial_smoothness = 0.5
+    scshc = false
+}
+
+process {
+    withName: SCSHC_CLUSTERING {
+        ext.when = { params.scshc }
+    }
+
+    withName: SCSHC_CLUSTERING_QC {
+        ext.when = { params.scshc }
+    }
+
+    withName: ENTROPY {
+        ext.when = { params.entropy }
+    }
+}
+
+profiles {
+    build {
+        params.mode = "build"
+        includeConfig "modes/build.config"
+        includeConfig "modes/build-extend.config"
+    }
+
+    extend {
+        params.mode = "extend"
+        includeConfig "modes/extend.config"
+        includeConfig "modes/build-extend.config"
+    }
+
+    sub {
+        params.mode = "sub"
+        includeConfig "modes/sub.config"
+    }
+}
diff --git a/conf/modules.config b/conf/modes/build-extend.config
rename from conf/modules.config
rename to conf/modes/build-extend.config
@@ -1,3 +1,17 @@
+params {
+    samplesheet = null
+    celltypist_model = null
+    min_cells = 50
+    cell_cycle = true
+
+    normalization_method = "log_total"
+    upset_only = false
+    decontX = true
+
+    has_celltypes = true
+    custom_metadata = []
+}
+
 process {
     withName: CELLTYPIST {
         ext.when = { params.celltypist_model != null }
@@ -7,18 +21,6 @@ process {
         ext.when = { params.cell_cycle }
     }
 
-    withName: SCSHC_CLUSTERING {
-        ext.when = { params.scshc }
-    }
-
-    withName: SCSHC_CLUSTERING_QC {
-        ext.when = { params.scshc }
-    }
-
-    withName: ENTROPY {
-        ext.when = { params.entropy }
-    }
-
     withName: BENCHMARK_INTEGRATIONS {
         ext.when = { params.benchmark_hvgs > 0 }
     }

diff --git a/conf/modes/build.config b/conf/modes/build.config
@@ -0,0 +1,6 @@
+params {
+    integration_methods = ["scvi", "scanvi", "harmony", "scgen", "scanorama", "bbknn", "desc", "combat", "trvaep"]
+    custom_hvgs = []
+    integration_hvgs = 10000
+    benchmark_hvgs = 0
+}
diff --git a/conf/modes/extend.config b/conf/modes/extend.config
@@ -0,0 +1,4 @@
+params {
+    base = null
+    model = null
+}
diff --git a/conf/modes/sub.config b/conf/modes/sub.config
@@ -0,0 +1,6 @@
+params {
+    input = null
+    integration = null
+    annotation = null
+    split_on = null
+}
diff --git a/example/.gitignore b/example/build/.gitignore
rename from example/.gitignore
rename to example/build/.gitignore
diff --git a/example/minimal_example.h5ad b/example/build/minimal_example.h5ad
rename from example/minimal_example.h5ad
rename to example/build/minimal_example.h5ad
diff --git a/example/nextflow.config b/example/build/nextflow.config
rename from example/nextflow.config
rename to example/build/nextflow.config
@@ -1,20 +1,11 @@
+includeConfig "../test.config"
+
 params {
     samplesheet = "samplesheet.csv"
 
     benchmark_hvgs = 100
-    scshc = false
-    entropy = false
     cell_cycle = true
-    leiden_resolutions = [0.5, 1]
 
     celltypist_model = "Cells_Intestinal_Tract.pkl"
     integration_methods = ["scvi", "scanvi", "harmony", "desc", "combat"]
-
-    max_cpus = 4
-    max_memory = "12G"
-    max_time = "6.h"
-}
-
-process {
-    executor = "local"
 }
diff --git a/example/build/run.sh b/example/build/run.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+nextflow run ../.. -resume -profile apptainer,build
diff --git a/example/samplesheet.csv b/example/build/samplesheet.csv
rename from example/samplesheet.csv
rename to example/build/samplesheet.csv
diff --git a/example/extend/.gitignore b/example/extend/.gitignore
@@ -0,0 +1,6 @@
+*
+!.gitignore
+!minimal_example.h5ad
+!nextflow.config
+!run.sh
+!samplesheet.csv
diff --git a/example/extend/nextflow.config b/example/extend/nextflow.config
@@ -0,0 +1,12 @@
+includeConfig "../test.config"
+
+params {
+    samplesheet = "samplesheet.csv"
+    base = "atlas.h5ad"
+    model = "model"
+
+    benchmark_hvgs = 100
+    cell_cycle = true
+
+    celltypist_model = "Cells_Intestinal_Tract.pkl"
+}
diff --git a/example/extend/run.sh b/example/extend/run.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+nextflow run ../.. -resume -profile apptainer,extend
diff --git a/example/extend/samplesheet.csv b/example/extend/samplesheet.csv
@@ -0,0 +1,2 @@
+id,input_adata
+transfer,transfer.h5ad
diff --git a/example/run.sh b/example/run.sh
diff --git a/example/sub/.gitignore b/example/sub/.gitignore
@@ -0,0 +1,6 @@
+*
+!.gitignore
+!minimal_example.h5ad
+!nextflow.config
+!run.sh
+!samplesheet.csv
diff --git a/example/sub/nextflow.config b/example/sub/nextflow.config
@@ -0,0 +1,8 @@
+includeConfig "../test.config"
+
+params {
+    input = "atlas.h5ad"
+    annotation = "annotation.csv"
+    integration = 'scanvi'
+    split_on = 'scanvi_leiden_0.5'
+}
diff --git a/example/sub/run.sh b/example/sub/run.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+nextflow run ../.. -resume -profile apptainer,sub
diff --git a/example/test.config b/example/test.config
@@ -0,0 +1,13 @@
+params {
+    scshc = false
+    entropy = false
+    leiden_resolutions = [0.5, 1]
+
+    max_cpus = 4
+    max_memory = "12G"
+    max_time = "6.h"
+}
+
+process {
+    executor = "local"
+}