Skip to content

Commit

Permalink
Merge pull request #56 from Mye-InfoBank/implement-modes
Browse files Browse the repository at this point in the history
Implement multiple pipeline modes
  • Loading branch information
nictru authored Apr 8, 2024
2 parents 5d95205 + 7616379 commit 122d5e5
Show file tree
Hide file tree
Showing 43 changed files with 519 additions and 333 deletions.
80 changes: 40 additions & 40 deletions bin/merge_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,53 +4,53 @@
import anndata as ad
import scanpy as sc
from scipy.sparse import csr_matrix
import numpy as np



parser = argparse.ArgumentParser(description="Merge datasets")
parser.add_argument("--input", help="Input file", type=str, nargs="+")
parser.add_argument("--output_integration", help="Output file containing only cells which do not require transfer learning", type=str)
parser.add_argument("--output_intersection", help="Output file containing all cells but gene intersection", type=str)
parser.add_argument("--output_transfer", help="Output file containing all cells which require transfer learning", type=str)
parser.add_argument("--output_counts", help="Output file, outer join of cells and genes", type=str)
parser.add_argument("--base", help="Base dataset to use as reference", type=str, required=False)
parser.add_argument("--output_intersection", help="Output file containing all cells but gene intersection", type=str, required=True)
parser.add_argument("--output_union", help="Output file, outer join of cells and genes", type=str, required=True)
parser.add_argument("--output_transfer", help="Output file, cells to project onto base", type=str, required=False)
parser.add_argument("--min_cells", help='Minimum number of cells to keep a gene', type=int, required=False, default=50)
parser.add_argument("--custom_genes", help="Additional genes to include", type=str, nargs="*")

args = parser.parse_args()

datasets = [ad.read_h5ad(f) for f in args.input]

adata = ad.concat(datasets)
adata_outer = ad.concat(datasets, join='outer')
if args.base:
if not args.output_transfer:
raise ValueError("Transfer file required when using base dataset")

adata_base = ad.read_h5ad(args.base)
datasets = [adata_base] + datasets

adata_intersection = ad.concat(datasets)
adata_union = ad.concat(datasets, join='outer')

additional_genes = [gene for gene in args.custom_genes if gene not in adata.var_names and gene in adata_outer.var_names]
additional_genes = [gene for gene in args.custom_genes if gene not in adata_intersection.var_names and gene in adata_union.var_names]

# Add custom genes from outer join to the intersection
if additional_genes:
adata_additional = adata_outer[adata.obs_names, additional_genes]
adata_concatenated = ad.concat([adata, adata_additional], join="outer", axis=1)
adata_concatenated.obs, adata_concatenated.obsm = adata.obs, adata.obsm
adata = adata_concatenated
adata_additional = adata_union[adata_intersection.obs_names, additional_genes]
adata_concatenated = ad.concat([adata_intersection, adata_additional], join="outer", axis=1)
adata_concatenated.obs, adata_concatenated.obsm = adata_intersection.obs, adata_intersection.obsm
adata_intersection = adata_concatenated

# Convert to CSR matrix
adata.X = csr_matrix(adata.X)
adata_outer.X = csr_matrix(adata_outer.X)

# Filter genes with no counts in core atlas
gene_mask, _ = sc.pp.filter_genes(adata[~adata.obs["transfer"]], min_cells=1, inplace=False)
adata = adata[:, gene_mask]
adata_intersection.X = csr_matrix(adata_intersection.X)
adata_union.X = csr_matrix(adata_union.X)

# Filter cells with no counts
cell_mask, _ = sc.pp.filter_cells(adata, min_genes=1, inplace=False)
adata = adata[cell_mask, :]
adata_outer = adata_outer[cell_mask, :]
cell_mask, _ = sc.pp.filter_cells(adata_intersection, min_genes=1, inplace=False)
adata_intersection = adata_intersection[cell_mask, :]
adata_union = adata_union[cell_mask, :]

# Filter genes with too few occurrences in outer join
sc.pp.filter_genes(adata_outer, min_cells=args.min_cells)
sc.pp.filter_genes(adata_union, min_cells=args.min_cells)

adata.obs["batch"] = adata.obs["dataset"].astype(str) + "_" + adata.obs["batch"].astype(str)
adata.obs["patient"] = adata.obs["dataset"].astype(str) + "_" + adata.obs["patient"].astype(str)
adata_intersection.obs["batch"] = adata_intersection.obs["dataset"].astype(str) + "_" + adata_intersection.obs["batch"].astype(str)
adata_intersection.obs["patient"] = adata_intersection.obs["dataset"].astype(str) + "_" + adata_intersection.obs["patient"].astype(str)

def to_Florent_case(s: str):
corrected = s.lower().strip()
Expand All @@ -77,25 +77,25 @@ def to_Florent_case(s: str):

return corrected[0].upper() + corrected[1:]

for column in adata.obs.columns:
if column == "transfer":
continue
if not adata.obs[column].dtype.name == "category" and not adata.obs[column].dtype.name == "object":
for column in adata_intersection.obs.columns:
if not adata_intersection.obs[column].dtype.name == "category" and not adata_intersection.obs[column].dtype.name == "object":
continue
# Convert first to string and then to category
adata.obs[column] = adata.obs[column].astype(str).fillna("Unknown").apply(to_Florent_case).astype("category")
adata_intersection.obs[column] = adata_intersection.obs[column].astype(str).fillna("Unknown").apply(to_Florent_case).astype("category")

adata_union.obs = adata_intersection.obs

adata_intersection.layers["counts"] = adata_intersection.X
adata_union.layers["counts"] = adata_union.X

adata_outer.obs = adata.obs
if args.base:
adata_transfer = adata_intersection[~adata_intersection.obs.index.isin(adata_base.obs.index)]

adata.layers["counts"] = adata.X
adata_outer.layers["counts"] = adata_outer.X
known_celltypes = adata_base.obs["cell_type"].unique()
adata_transfer.obs["cell_type"] = adata_transfer.obs["cell_type"].map(lambda x: x if x in known_celltypes else "Unknown")

if any(adata.obs["transfer"]):
adata_transfer = adata[adata.obs["transfer"]]
adata_transfer.write_h5ad(args.output_transfer)

adata_notransfer = adata[~adata.obs["transfer"]]
adata_notransfer.write_h5ad(args.output_integration)
adata_intersection.write_h5ad(args.output_intersection)
adata_union.write_h5ad(args.output_union)

adata.write_h5ad(args.output_intersection)
adata_outer.write_h5ad(args.output_counts)
3 changes: 0 additions & 3 deletions bin/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
"patient": True,
"tissue": True,
"dataset": True,
"transfer": True
}

parser = argparse.ArgumentParser(description="Filter dataset")
Expand All @@ -22,7 +21,6 @@
parser.add_argument("--output", help="Output file", type=str)
parser.add_argument("--problems", help="Problems file", type=str)
parser.add_argument("--no-symbols", help="Convert varnames to gene symbols", action="store_true")
parser.add_argument("--transfer", help="Apply transfer leanring on dataset", action="store_true")
parser.add_argument("--sure_raw", help="Skip check for raw counts", action="store_true")
parser.add_argument("--custom_metadata", help="Additional metadata columns to include", type=str, nargs="*")

Expand Down Expand Up @@ -57,7 +55,6 @@ def aggregate_duplicate_var(adata, aggr_fun=np.mean):
print("Reading input")
adata = sc.read_h5ad(args.input)
adata.obs["dataset"] = args.id
adata.obs["transfer"] = args.transfer

if adata.__dict__["_raw"] and "_index" in adata.__dict__["_raw"].__dict__["_var"]:
adata.__dict__["_raw"].__dict__["_var"] = (
Expand Down
39 changes: 39 additions & 0 deletions conf/modes.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Default parameters declared by conf/modes.config.
params {
// List of resolution values. NOTE(review): presumably one Leiden clustering is run per entry — confirm against the clustering module.
leiden_resolutions = [0.25, 0.5, 0.75, 1, 1.5, 2]
// Off by default; read by the ENTROPY selector's ext.when closure in this file.
entropy = false
// NOTE(review): meaning is not visible here; presumably a tuning value for the entropy step — confirm.
entropy_initial_smoothness = 0.5
// Off by default; read by the SCSHC_CLUSTERING and SCSHC_CLUSTERING_QC selectors' ext.when closures in this file.
scshc = false
}

// Per-process settings: each selector assigns ext.when a closure that returns a boolean parameter.
// NOTE(review): presumably each module's `when:` consults task.ext.when so that a false value skips the process — confirm in the module definitions.
process {
withName: SCSHC_CLUSTERING {
ext.when = { params.scshc }
}

withName: SCSHC_CLUSTERING_QC {
// Same parameter as SCSHC_CLUSTERING, so both selectors follow params.scshc together.
ext.when = { params.scshc }
}

withName: ENTROPY {
ext.when = { params.entropy }
}
}

// One profile per pipeline mode. Each profile records its name in params.mode
// and pulls in the config file(s) holding that mode's parameters.
profiles {
build {
params.mode = "build"
includeConfig "modes/build.config"
// Shared with the extend profile below.
includeConfig "modes/build-extend.config"
}

extend {
params.mode = "extend"
includeConfig "modes/extend.config"
// Shared with the build profile above.
includeConfig "modes/build-extend.config"
}

sub {
params.mode = "sub"
// The sub profile includes only its own config, not build-extend.config.
includeConfig "modes/sub.config"
}
}
26 changes: 14 additions & 12 deletions conf/modules.config → conf/modes/build-extend.config
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
// Parameters for conf/modes/build-extend.config, the file included by both the build and extend profiles.
params {
// null by default; the example extend config sets it to "samplesheet.csv".
samplesheet = null
// null by default, which makes the CELLTYPIST condition `params.celltypist_model != null` below false.
celltypist_model = null
// Same value as the default of --min_cells in bin/merge_datasets.py. NOTE(review): presumably forwarded to that script — confirm.
min_cells = 50
// Read by an ext.when closure in the process scope below.
cell_cycle = true

normalization_method = "log_total"
upset_only = false
decontX = true

has_celltypes = true
// Empty list by default. NOTE(review): presumably forwarded to --custom_metadata of bin/preprocess.py — confirm.
custom_metadata = []
}

process {
withName: CELLTYPIST {
ext.when = { params.celltypist_model != null }
Expand All @@ -7,18 +21,6 @@ process {
ext.when = { params.cell_cycle }
}

withName: SCSHC_CLUSTERING {
ext.when = { params.scshc }
}

withName: SCSHC_CLUSTERING_QC {
ext.when = { params.scshc }
}

withName: ENTROPY {
ext.when = { params.entropy }
}

withName: BENCHMARK_INTEGRATIONS {
ext.when = { params.benchmark_hvgs > 0 }
}
Expand Down
6 changes: 6 additions & 0 deletions conf/modes/build.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// Parameters for conf/modes/build.config, which the build profile includes.
params {
// Names of integration methods. NOTE(review): presumably every listed method is run — confirm in the workflow.
integration_methods = ["scvi", "scanvi", "harmony", "scgen", "scanorama", "bbknn", "desc", "combat", "trvaep"]
custom_hvgs = []
integration_hvgs = 10000
// 0 makes the BENCHMARK_INTEGRATIONS condition `params.benchmark_hvgs > 0` (conf/modes/build-extend.config) false.
benchmark_hvgs = 0
}
4 changes: 4 additions & 0 deletions conf/modes/extend.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
// Parameters for conf/modes/extend.config, which the extend profile includes; both default to null.
params {
// The example extend config sets this to "atlas.h5ad". NOTE(review): presumably the file passed to merge_datasets.py --base — confirm.
base = null
// The example extend config sets this to "model". NOTE(review): presumably a previously trained model to reuse — confirm.
model = null
}
6 changes: 6 additions & 0 deletions conf/modes/sub.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// Parameters for conf/modes/sub.config, which the sub profile includes; all default to null.
// The example sub config supplies a value for each of them.
params {
// Example value: "atlas.h5ad".
input = null
// Example value: 'scanvi'.
integration = null
// Example value: "annotation.csv".
annotation = null
// Example value: 'scanvi_leiden_0.5'.
split_on = null
}
File renamed without changes.
File renamed without changes.
13 changes: 2 additions & 11 deletions example/nextflow.config → example/build/nextflow.config
Original file line number Diff line number Diff line change
@@ -1,20 +1,11 @@
includeConfig "../test.config"

params {
samplesheet = "samplesheet.csv"

benchmark_hvgs = 100
scshc = false
entropy = false
cell_cycle = true
leiden_resolutions = [0.5, 1]

celltypist_model = "Cells_Intestinal_Tract.pkl"
integration_methods = ["scvi", "scanvi", "harmony", "desc", "combat"]

max_cpus = 4
max_memory = "12G"
max_time = "6.h"
}

process {
executor = "local"
}
3 changes: 3 additions & 0 deletions example/build/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash

# Run the pipeline located two directories up with the apptainer and build
# profiles; -resume asks Nextflow to continue from a previous run.
nextflow run ../.. -resume -profile apptainer,build
File renamed without changes.
6 changes: 6 additions & 0 deletions example/extend/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Ignore everything in this directory except the files re-included below.
*
!.gitignore
!minimal_example.h5ad
!nextflow.config
!run.sh
!samplesheet.csv
12 changes: 12 additions & 0 deletions example/extend/nextflow.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Example configuration for the extend mode; shared test settings come from ../test.config.
includeConfig "../test.config"

params {
samplesheet = "samplesheet.csv"
// Values for the two parameters that conf/modes/extend.config declares as null.
base = "atlas.h5ad"
model = "model"

// Greater than 0, so the BENCHMARK_INTEGRATIONS condition `params.benchmark_hvgs > 0` is true.
benchmark_hvgs = 100
cell_cycle = true

// Non-null, so the CELLTYPIST condition `params.celltypist_model != null` is true.
celltypist_model = "Cells_Intestinal_Tract.pkl"
}
3 changes: 3 additions & 0 deletions example/extend/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash

# Run the pipeline located two directories up with the apptainer and extend
# profiles; -resume asks Nextflow to continue from a previous run.
nextflow run ../.. -resume -profile apptainer,extend
2 changes: 2 additions & 0 deletions example/extend/samplesheet.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
id,input_adata
transfer,transfer.h5ad
3 changes: 0 additions & 3 deletions example/run.sh

This file was deleted.

6 changes: 6 additions & 0 deletions example/sub/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Ignore everything in this directory except the files re-included below.
*
!.gitignore
!minimal_example.h5ad
!nextflow.config
!run.sh
!samplesheet.csv
8 changes: 8 additions & 0 deletions example/sub/nextflow.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
// Example configuration for the sub mode; shared test settings come from ../test.config.
includeConfig "../test.config"

params {
// Values for the four parameters that conf/modes/sub.config declares as null.
input = "atlas.h5ad"
annotation = "annotation.csv"
integration = 'scanvi'
// NOTE(review): looks like <integration>_leiden_<resolution>; 0.5 is one of the leiden_resolutions in ../test.config — confirm the naming scheme.
split_on = 'scanvi_leiden_0.5'
}
3 changes: 3 additions & 0 deletions example/sub/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash

# Run the pipeline located two directories up with the apptainer and sub
# profiles; -resume asks Nextflow to continue from a previous run.
nextflow run ../.. -resume -profile apptainer,sub
13 changes: 13 additions & 0 deletions example/test.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// Parameters shared by the example configs that include this file (example/test.config).
params {
// Both flags are false here, the same values conf/modes.config uses as defaults.
scshc = false
entropy = false
// Shorter list than the default declared in conf/modes.config.
leiden_resolutions = [0.5, 1]

// NOTE(review): presumably upper bounds applied to per-process resource requests — confirm where max_* is consumed.
max_cpus = 4
max_memory = "12G"
max_time = "6.h"
}

// Set the executor to "local" at process scope, i.e. without a per-process selector.
process {
executor = "local"
}
Loading

0 comments on commit 122d5e5

Please sign in to comment.