diff --git a/.gitignore b/.gitignore index d152cba..626cb25 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ __pycahe__/ *.py[cod] scplus_rtd .python-version +build/* +*.egg* diff --git a/pyproject.toml b/pyproject.toml index 4dfddd6..40f22f3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,8 +10,8 @@ authors = [ ] description = "SCENIC+ is a python package to build gene regulatory networks (GRNs) using combined or separate single-cell gene expression (scRNA-seq) and single-cell chromatin accessibility (scATAC-seq) data." readme = "README.md" -version = "1.0a1" -requires-python = ">=3.8" +version = "1.0a2" +requires-python = ">=3.8,<=3.11.8" keywords = ["scATAC", "GRN inference", "eGRN inference"] license = { file = "LICENCE.txt" } classifiers = [ diff --git a/src/scenicplus/cli/commands.py b/src/scenicplus/cli/commands.py index 7485347..7b318ff 100644 --- a/src/scenicplus/cli/commands.py +++ b/src/scenicplus/cli/commands.py @@ -817,7 +817,8 @@ def infer_grn( keep_only_activating: bool, rho_threshold: float, min_target_genes: int, - n_cpu: int): + n_cpu: int, + seed: int): """ Infer gene regulatory network. @@ -869,6 +870,8 @@ def infer_grn( Minimum number of target genes. n_cpu : int Number of parallel processes to run. + seed: int + Random seed to use. """ from scenicplus.grn_builder.gsea_approach import build_grn @@ -906,7 +909,8 @@ def infer_grn( min_target_genes=min_target_genes, n_cpu=n_cpu, merge_eRegulons=True, - disable_tqdm=False) + disable_tqdm=False, + seed=seed) log.info("Formatting eGRN as table.") eRegulon_metadata = _format_egrns( diff --git a/src/scenicplus/cli/scenicplus.py b/src/scenicplus/cli/scenicplus.py index 2150422..7cba71d 100644 --- a/src/scenicplus/cli/scenicplus.py +++ b/src/scenicplus/cli/scenicplus.py @@ -880,7 +880,8 @@ def eGRN(arg): keep_only_activating=arg.keep_only_activating_eRegulons, rho_threshold=arg.rho_threshold, min_target_genes=arg.min_target_genes, - n_cpu=arg.n_cpu) + n_cpu=arg.n_cpu, + seed=arg.seed) parser.set_defaults(func=eGRN) # Required arguments @@ -984,6 +985,11 @@ def eGRN(arg): action="store", type=int, required=False, default=1, help="Number of cores to use. Default is 1.") + parser.add_argument( + "--seed", dest="seed", + action="store", type=int, required=False, + default=666, + help="Seed to use. Default is 666.") def add_parser_for_aucell(subparser:argparse._SubParsersAction): parser:argparse.ArgumentParser = subparser.add_parser( diff --git a/src/scenicplus/data_wrangling/adata_cistopic_wrangling.py b/src/scenicplus/data_wrangling/adata_cistopic_wrangling.py index d56f666..4357f7e 100644 --- a/src/scenicplus/data_wrangling/adata_cistopic_wrangling.py +++ b/src/scenicplus/data_wrangling/adata_cistopic_wrangling.py @@ -91,8 +91,10 @@ def process_multiome_data( GEX_gene_metadata = GEX_anndata.var.copy(deep=True) GEX_cell_metadata = GEX_anndata.obs.copy(deep=True).loc[ common_cells] - ACC_region_metadata = cisTopic_obj.region_data.copy(deep=True).loc[ - imputed_acc_obj.feature_names] + ACC_region_metadata = pd.DataFrame(index = imputed_acc_obj.feature_names) + ACC_region_metadata = ACC_region_metadata.merge( + cisTopic_obj.region_data, left_index = True, right_index = True, how = "left") + ACC_region_metadata = ACC_region_metadata.loc[imputed_acc_obj.feature_names] ACC_cell_metadata = cisTopic_obj.cell_data.copy(deep=True).loc[ common_cells] @@ -331,7 +333,9 @@ def process_non_multiome_data( # generate cell metadata metadata_cell = pd.DataFrame(index=meta_cell_names_ACC, data={key_to_group_by: [x.split(meta_cell_split)[0] for x in meta_cell_names_ACC]}) - ACC_region_metadata = cisTopic_obj.region_data.copy(deep=True) + ACC_region_metadata = pd.DataFrame(index = imputed_acc_obj.feature_names) + ACC_region_metadata = ACC_region_metadata.merge( + cisTopic_obj.region_data, left_index = True, right_index = True, how = "left") ACC_region_metadata_subset = ACC_region_metadata.loc[imputed_acc_obj.feature_names] mudata = MuData( diff --git a/src/scenicplus/grn_builder/gsea.py b/src/scenicplus/grn_builder/gsea.py index ef3aded..83d70a2 100644 --- a/src/scenicplus/grn_builder/gsea.py +++ b/src/scenicplus/grn_builder/gsea.py @@ -1,18 +1,14 @@ from gseapy.algorithm import enrichment_score from gseapy.algorithm import gsea_compute -from gseapy.plot import GSEAPlot import numpy as np import pandas as pd -seed = 666 - - def run_gsea(ranked_gene_list: pd.Series, gene_set: list, n_perm: int = 1000, ascending: bool = False, - return_res: bool = False, - name: str = 'gene_set'): + name: str = 'gene_set', + seed: int = 555): """ Calculates gene set enrichment score (and estimates its significance) and leading edge for the gene set in the ranked gene list using gseapy prerank. @@ -39,7 +35,7 @@ def run_gsea(ranked_gene_list: pd.Series, gmt = {name: list(gene_set)} gsea_results, ind, rank_ES, gs = gsea_compute(data=ranked_gene_list, n=n_perm, gmt=gmt, weighted_score_type=1, permutation_type='gene_set', method=None, - pheno_pos='Pos', pheno_neg='Neg', classes=None, ascending=ascending) + pheno_pos='Pos', pheno_neg='Neg', classes=None, ascending=ascending, seed = seed) # extract enrichment scores gseale = list(gsea_results)[0] @@ -62,13 +58,7 @@ def run_gsea(ranked_gene_list: pd.Series, pval = gseale[2] fdr = gseale[3] LE = list(map(str, ranked_gene_list.iloc[ldg_pos].index)) - if return_res: - res = GSEAPlot(ranked_gene_list, name, ind, nes, pval, fdr, RES, - pheno_pos='', pheno_neg='', figsize=(6, 5.5), - cmap='seismic', ofname=None) - return nes, pval, LE, res - else: - return nes, pval, LE + return nes, pval, LE def run_enrichr(ranked_gene_list: np.array, diff --git a/src/scenicplus/grn_builder/gsea_approach.py b/src/scenicplus/grn_builder/gsea_approach.py index 00268ba..603121a 100644 --- a/src/scenicplus/grn_builder/gsea_approach.py +++ b/src/scenicplus/grn_builder/gsea_approach.py @@ -62,11 +62,13 @@ logging.basicConfig(level=level, format=format, handlers=handlers) log = logging.getLogger('GSEA') + def _run_gsea_for_e_module( e_module:eRegulon, rnk:pd.Series, gsea_n_perm:int, - context: frozenset): + context: frozenset, + seed: int): """ Helper function to run gsea for single e_module @@ -92,11 +94,12 @@ def _run_gsea_for_e_module( NES, pval, LE_genes = run_gsea( ranked_gene_list=rnk, gene_set=gene_set, - n_perm=gsea_n_perm) + n_perm=gsea_n_perm, + seed=seed) except: NES = np.nan pval = np.nan - LE_genes = np.nan + LE_genes = [np.nan] return eRegulon( transcription_factor=TF, cistrome_name=e_module.cistrome_name, @@ -134,6 +137,7 @@ def build_grn( n_cpu=1, merge_eRegulons=True, disable_tqdm=False, + seed=555, **kwargs) -> List[eRegulon]: log.info('Thresholding region to gene relationships') # some tfs are missing from tf_to_gene because they are not @@ -185,9 +189,10 @@ def build_grn( e_module=e_module, rnk=TF_to_ranking_pos[e_module.transcription_factor], gsea_n_perm=gsea_n_perm, - context=frozenset(['positive tf2g'])) + context=frozenset(['positive tf2g']), + seed=seed) for e_module in tqdm( - e_modules, + e_modules, total = len(e_modules), desc="Running for Positive TF to gene") if e_module.transcription_factor in pos_TFs) @@ -198,7 +203,8 @@ def build_grn( e_module=e_module, rnk=TF_to_ranking_neg[e_module.transcription_factor], gsea_n_perm=gsea_n_perm, - context=frozenset(['negative tf2g'])) + context=frozenset(['negative tf2g']), + seed=seed) for e_module in tqdm( e_modules, total = len(e_modules),