From a137cd16bbbb327d56d2a4e9d454c7b316199437 Mon Sep 17 00:00:00 2001 From: Chris Flerin Date: Thu, 6 May 2021 13:39:48 +0200 Subject: [PATCH 1/5] Remove files that now belong to ctxcore - https://github.com/aertslab/ctxcore --- src/pyscenic/genesig.py | 418 -------------------------- src/pyscenic/recovery.py | 289 ------------------ src/pyscenic/rnkdb.py | 623 --------------------------------------- tests/test_featherdb.py | 45 --- tests/test_genesig.py | 171 ----------- tests/test_parquetdb.py | 50 ---- tests/test_recovery.py | 106 ------- tests/test_sqlitedb.py | 45 --- 8 files changed, 1747 deletions(-) delete mode 100644 src/pyscenic/genesig.py delete mode 100644 src/pyscenic/recovery.py delete mode 100644 src/pyscenic/rnkdb.py delete mode 100644 tests/test_featherdb.py delete mode 100644 tests/test_genesig.py delete mode 100644 tests/test_parquetdb.py delete mode 100644 tests/test_recovery.py delete mode 100644 tests/test_sqlitedb.py diff --git a/src/pyscenic/genesig.py b/src/pyscenic/genesig.py deleted file mode 100644 index 7c11333..0000000 --- a/src/pyscenic/genesig.py +++ /dev/null @@ -1,418 +0,0 @@ -# -*- coding: utf-8 -*- - -import re -import os -from collections.abc import Iterable, Mapping -from itertools import repeat -from typing import Mapping, List, FrozenSet, Type - -import attr -import yaml -from cytoolz import merge_with, dissoc, keyfilter, first, second -from frozendict import frozendict -from itertools import chain -import gzip -from cytoolz import memoize, merge - - -def convert(genes): - # Genes supplied as dictionary. - if isinstance(genes, Mapping): - return frozendict(genes) - # Genes supplied as iterable of (gene, weight) tuples. - elif isinstance(genes, Iterable) and all(isinstance(n, tuple) for n in genes): - return frozendict(genes) - # Genes supplied as iterable of genes. - elif isinstance(genes, Iterable) and all(isinstance(n, str) for n in genes): - return frozendict(zip(genes, repeat(1.0))) - - -def openfile(filename, mode='r'): - if filename.endswith('.gz'): - return gzip.open(filename, mode) - else: - return open(filename, mode) - - -@attr.s(frozen=True) -class GeneSignature(yaml.YAMLObject): - """ - A class of gene signatures, i.e. a set of genes that are biologically related. - """ - - yaml_tag = u'!GeneSignature' - - @classmethod - def to_yaml(cls, dumper, data): - dict_representation = {'name': data.name, 'genes': list(data.genes), 'weights': list(data.weights)} - return dumper.represent_mapping(cls.yaml_tag, dict_representation, cls) - - @classmethod - def from_yaml(cls, loader, node): - data = loader.construct_mapping(node, cls) - return GeneSignature(name=data['name'], gene2weight=zip(data['genes'], data['weights'])) - - @classmethod - def from_gmt(cls, fname: str, field_separator: str = '\t', gene_separator: str = '\t') -> List['GeneSignature']: - """ - Load gene signatures from a GMT file. - - :param fname: The filename. - :param field_separator: The separator that separates fields in a line. - :param gene_separator: The separator that separates the genes. - :return: A list of signatures. - """ - # https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats - assert os.path.exists(fname), "{} does not exist.".format(fname) - - def signatures(): - with openfile(fname, "r") as file: - for line in file: - if isinstance(line, (bytes, bytearray)): - line = line.decode() - if line.startswith("#") or not line.strip(): - continue - columns = re.split(field_separator, line.rstrip()) - genes = columns[2:] - yield GeneSignature(name=columns[0], gene2weight=genes) - - return list(signatures()) - - @classmethod - def to_gmt( - cls, - fname: str, - signatures: List[Type['GeneSignature']], - field_separator: str = '\t', - gene_separator: str = '\t', - ) -> None: - """ - Save list of signatures as GMT file. - - :param fname: Name of the file to generate. - :param signatures: The collection of signatures. - :param field_separator: The separator that separates fields in a line. - :param gene_separator: The separator that separates the genes. - """ - # assert not os.path.exists(fname), "{} already exists.".format(fname) - with openfile(fname, "wt") as file: - for signature in signatures: - genes = gene_separator.join(signature.genes) - file.write( - "{}{}{}{}{}\n".format( - signature.name, field_separator, signature.metadata(','), field_separator, genes - ) - ) - - @classmethod - def from_grp(cls, fname, name: str) -> 'GeneSignature': - """ - Load gene signature from GRP file. - - :param fname: The filename. - :param name: The name of the resulting signature. - :return: A signature. - """ - # https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats - assert os.path.exists(fname), "{} does not exist.".format(fname) - with openfile(fname, "r") as file: - return GeneSignature( - name=name, gene2weight=[line.rstrip() for line in file if not line.startswith("#") and line.strip()] - ) - - @classmethod - def from_rnk(cls, fname: str, name: str, field_separator=",") -> 'GeneSignature': - """ - Reads in a signature from an RNK file. This format associates weights with the genes part of the signature. - - :param fname: The filename. - :param name: The name of the resulting signature. - :param field_separator: The separator that separates fields in a line. - :return: A signature. - """ - # https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats - assert os.path.exists(fname), "{} does not exist.".format(fname) - - def columns(): - with openfile(fname, "r") as file: - for line in file: - if line.startswith("#") or not line.strip(): - continue - columns = tuple(map(str.rstrip, re.split(field_separator, line))) - assert len(columns) == 2, "Invalid file format." - yield columns - - return GeneSignature(name=name, gene2weight=list(columns())) - - name = attr.ib() # str - gene2weight = attr.ib(converter=convert) # Mapping[str, float] - - @name.validator - def name_validator(self, attribute, value): - if len(value) == 0: - raise ValueError("A gene signature must have a non-empty name.") - - @gene2weight.validator - def gene2weight_validator(self, attribute, value): - if len(value) == 0: - raise ValueError("A gene signature must have at least one gene.") - - @property - @memoize - def genes(self): - """ - Return genes in this signature. Genes are sorted in descending order according to weight. - """ - return tuple(map(first, sorted(self.gene2weight.items(), key=second, reverse=True))) - - @property - @memoize - def weights(self): - """ - Return the weights of the genes in this signature. Genes are sorted in descending order according to weight. - """ - return tuple(map(second, sorted(self.gene2weight.items(), key=second, reverse=True))) - - def metadata(self, field_separator: str = ",") -> str: - """ - Textual representation of metadata for this signature. - Is used as description when storing this signature as part of a GMT file. - - :param field_separator: the separator to use within fields. - :return: The string representation of the metadata of this signature. - """ - return "" - - def copy(self, **kwargs) -> Type['GeneSignature']: - # noinspection PyTypeChecker - try: - return GeneSignature(**merge(vars(self), kwargs)) - except TypeError: - # Pickled gene signatures might still have nomenclature property. - args = merge(vars(self), kwargs) - del args['nomenclature'] - return GeneSignature(**args) - - def rename(self, name: str) -> Type['GeneSignature']: - """ - Rename this signature. - - :param name: The new name. - :return: the new :class:`GeneSignature` instance. - """ - return self.copy(name=name) - - def add(self, gene_symbol, weight=1.0) -> Type['GeneSignature']: - """ - Add an extra gene symbol to this signature. - :param gene_symbol: The symbol of the gene. - :param weight: The weight. - :return: the new :class:`GeneSignature` instance. - """ - return self.copy(gene2weight=list(chain(self.gene2weight.items(), [(gene_symbol, weight)]))) - - def union(self, other: Type['GeneSignature']) -> Type['GeneSignature']: - """ - Creates a new :class:`GeneSignature` instance which is the union of this signature and the other supplied - signature. - - The weight associated with the genes in the intersection is the maximum of the weights in the composing signatures. - - :param other: The other :class:`GeneSignature`. - :return: the new :class:`GeneSignature` instance. - """ - return self.copy( - name="({} | {})".format(self.name, other.name) if self.name != other.name else self.name, - gene2weight=frozendict(merge_with(max, self.gene2weight, other.gene2weight)), - ) - - def difference(self, other: Type['GeneSignature']) -> Type['GeneSignature']: - """ - Creates a new :class:`GeneSignature` instance which is the difference of this signature and the supplied other - signature. - - The weight associated with the genes in the difference are taken from this gene signature. - - :param other: The other :class:`GeneSignature`. - :return: the new :class:`GeneSignature` instance. - """ - return self.copy( - name="({} - {})".format(self.name, other.name) if self.name != other.name else self.name, - gene2weight=frozendict(dissoc(dict(self.gene2weight), *other.genes)), - ) - - def intersection(self, other: Type['GeneSignature']) -> Type['GeneSignature']: - """ - Creates a new :class:`GeneSignature` instance which is the intersection of this signature and the supplied other - signature. - - The weight associated with the genes in the intersection is the maximum of the weights in the composing signatures. - - :param other: The other :class:`GeneSignature`. - :return: the new :class:`GeneSignature` instance. - """ - genes = set(self.gene2weight.keys()).intersection(set(other.gene2weight.keys())) - return self.copy( - name="({} & {})".format(self.name, other.name) if self.name != other.name else self.name, - gene2weight=frozendict( - keyfilter(lambda k: k in genes, merge_with(max, self.gene2weight, other.gene2weight)) - ), - ) - - def noweights(self): - """ - Create a new gene signature with uniform weights, i.e. all weights are equal and set to 1.0. - """ - return self.copy(gene2weight=self.genes) - - def head(self, n: int = 5) -> Type['GeneSignature']: - """ - Returns a gene signature with only the top n targets. - """ - assert n >= 1, "n must be greater than or equal to one." - genes = self.genes[0:n] # Genes are sorted in ascending order according to weight. - return self.copy(gene2weight=keyfilter(lambda k: k in genes, self.gene2weight)) - - def jaccard_index(self, other: Type['GeneSignature']) -> float: - """ - Calculate the symmetrical similarity metric between this and another signature. - The JI is a value between 0.0 and 1.0. - """ - ss = set(self.genes) - so = set(other.genes) - return float(len(ss.intersection(so))) / len(ss.union(so)) - - def __len__(self): - """ - The number of genes in this signature. - """ - return len(self.genes) - - def __contains__(self, item): - """ - Checks if a gene is part of this signature. - """ - return item in self.gene2weight.keys() - - def __getitem__(self, item): - """ - Return the weight associated with a gene. - """ - return self.gene2weight[item] - - def __str__(self): - """ - Returns a readable string representation. - """ - return "[]".format(",".join(self.genes)) - - def __repr__(self): - """ - Returns an unambiguous string representation. - """ - return "{}(name=\"{}\",gene2weight=[{}])".format( - self.__class__.__name__, - self.name, - "[" + ",".join(map(lambda g, w: "(\"{}\",{})".format(g, w), zip(self.genes, self.weights))) + "]", - ) - - -@attr.s(frozen=True) -class Regulon(GeneSignature, yaml.YAMLObject): - """ - A regulon is a gene signature that defines the target genes of a Transcription Factor (TF) and thereby defines - a subnetwork of a larger Gene Regulatory Network (GRN) connecting a TF with its target genes. - """ - - yaml_tag = u'!Regulon' - - @classmethod - def to_yaml(cls, dumper, data): - dict_representation = { - 'name': data.name, - 'genes': list(data.genes), - 'weights': list(data.weights), - 'score': data.score, - 'context': list(data.context), - 'transcription_factor': data.transcription_factor, - } - return dumper.represent_mapping(cls.yaml_tag, dict_representation, cls) - - @classmethod - def from_yaml(cls, loader, node): - data = loader.construct_mapping(node, cls) - return Regulon( - name=data['name'], - gene2weight=list(zip(data['genes'], data['weights'])), - score=data['score'], - context=frozenset(data['context']), - transcription_factor=data['transcription_factor'], - ) - - gene2occurrence = attr.ib(converter=convert) # Mapping[str, float] - transcription_factor = attr.ib() # str - context = attr.ib(default=frozenset()) # FrozenSet[str] - score = attr.ib(default=0.0) # float - nes = attr.ib(default=0.0) # float - orthologous_identity = attr.ib(default=0.0) # float - similarity_qvalue = attr.ib(default=0.0) # float - annotation = attr.ib(default='') # str - - @transcription_factor.validator - def non_empty(self, attribute, value): - if len(value) == 0: - raise ValueError("A regulon must have a transcription factor.") - - def metadata(self, field_separator: str = ',') -> str: - return "tf={}{}score={}".format(self.transcription_factor, field_separator, self.score) - - def copy(self, **kwargs) -> 'Regulon': - try: - return Regulon(**merge(vars(self), kwargs)) - except TypeError: - # Pickled regulons might still have nomenclature property. - args = merge(vars(self), kwargs) - del args['nomenclature'] - return Regulon(**args) - - def union(self, other: Type['GeneSignature']) -> 'Regulon': - assert self.transcription_factor == getattr( - other, 'transcription_factor', self.transcription_factor - ), "Union of two regulons is only possible when same factor." - # noinspection PyTypeChecker - return ( - super() - .union(other) - .copy( - context=self.context.union(getattr(other, 'context', frozenset())), - score=max(self.score, getattr(other, 'score', 0.0)), - ) - ) - - def difference(self, other: Type['GeneSignature']) -> 'Regulon': - assert self.transcription_factor == getattr( - other, 'transcription_factor', self.transcription_factor - ), "Difference of two regulons is only possible when same factor." - # noinspection PyTypeChecker - return ( - super() - .difference(other) - .copy( - context=self.context.union(getattr(other, 'context', frozenset())), - score=max(self.score, getattr(other, 'score', 0.0)), - ) - ) - - def intersection(self, other: Type['GeneSignature']) -> 'Regulon': - assert self.transcription_factor == getattr( - other, 'transcription_factor', self.transcription_factor - ), "Intersection of two regulons is only possible when same factor." - # noinspection PyTypeChecker - return ( - super() - .intersection(other) - .copy( - context=self.context.union(getattr(other, 'context', frozenset())), - score=max(self.score, getattr(other, 'score', 0.0)), - ) - ) diff --git a/src/pyscenic/recovery.py b/src/pyscenic/recovery.py deleted file mode 100644 index 9c9f011..0000000 --- a/src/pyscenic/recovery.py +++ /dev/null @@ -1,289 +0,0 @@ -# -*- coding: utf-8 -*- - -import pandas as pd -import numpy as np -from itertools import repeat -from typing import Type, Optional, List, Tuple -from numba import jit -import logging - -from .rnkdb import RankingDatabase -from .genesig import GeneSignature, Regulon - - -__all__ = ["recovery", "aucs", "enrichment4features", "enrichment4cells", "leading_edge4row"] - - -LOGGER = logging.getLogger(__name__) - - -def derive_rank_cutoff(auc_threshold, total_genes, rank_threshold: Optional[int] = None): - """ """ - if not rank_threshold: - rank_threshold = total_genes - 1 - - assert 0 < rank_threshold < total_genes, "Rank threshold must be an integer between 1 and {0:d}".format(total_genes) - assert 0.0 < auc_threshold <= 1.0, "AUC threshold must be a fraction between 0.0 and 1.0" - - # In the R implementation the cutoff is rounded. - rank_cutoff = int(round(auc_threshold * total_genes)) - assert 0 < rank_cutoff <= rank_threshold, ( - "An AUC threshold of {0:f} corresponds to {1:d} top ranked genes/regions in the database. " - "Please increase the rank threshold or decrease the AUC threshold.".format(auc_threshold, rank_cutoff) - ) - # Make sure we have exacly the same AUC values as the R-SCENIC pipeline. - # In the latter the rank threshold is not included in AUC calculation. - rank_cutoff -= 1 - return rank_cutoff - - -# Do not use numba as it dwarfs the performance. -def rcc2d(rankings: np.ndarray, weights: np.ndarray, rank_threshold: int) -> np.ndarray: - """ - Calculate recovery curves. - - :param rankings: The features rankings for a gene signature (n_features, n_genes). - :param weights: The weights of these genes. - :return: Recovery curves (n_features, rank_threshold). - """ - n_features = rankings.shape[0] - rccs = np.empty(shape=(n_features, rank_threshold)) # Pre-allocation. - for row_idx in range(n_features): - curranking = rankings[row_idx, :] - rccs[row_idx, :] = np.cumsum(np.bincount(curranking, weights=weights)[:rank_threshold]) - return rccs - - -def recovery( - rnk: pd.DataFrame, total_genes: int, weights: np.ndarray, rank_threshold: int, auc_threshold: float, no_auc=False -) -> (np.ndarray, np.ndarray): - """ - Calculate recovery curves and AUCs. This is the workhorse of the recovery algorithm. - - :param rnk: A dataframe containing the rank number of genes of interest. Columns correspond to genes. - :param total_genes: The total number of genes ranked. - :param weights: the weights associated with the selected genes. - :param rank_threshold: The total number of ranked genes to take into account when creating a recovery curve. - :param auc_threshold: The fraction of the ranked genome to take into account for the calculation of the - Area Under the recovery Curve. - :return: A tuple of numpy arrays. The first array contains the recovery curves (n_features/n_cells x rank_threshold), - the second array the AUC values (n_features/n_cells). - """ - rank_cutoff = derive_rank_cutoff(auc_threshold, total_genes, rank_threshold) - features, genes, rankings = rnk.index.values, rnk.columns.values, rnk.values - weights = np.insert(weights, len(weights), 0.0) - n_features = len(features) - rankings = np.append(rankings, np.full(shape=(n_features, 1), fill_value=total_genes), axis=1) - - # Calculate recovery curves. - rccs = rcc2d(rankings, weights, rank_threshold) - if no_auc: - return rccs, np.array([]) - - # Calculate AUC. - # For reason of generating the same results as in R we introduce an error by adding one to the rank_cutoff - # for calculationg the maximum AUC. - maxauc = float((rank_cutoff + 1) * weights.sum()) - assert maxauc > 0 - # The rankings are 0-based. The position at the rank threshold is included in the calculation. - # The maximum AUC takes this into account. - aucs = rccs[:, :rank_cutoff].sum(axis=1) / maxauc - - return rccs, aucs - - -def enrichment4cells(rnk_mtx: pd.DataFrame, regulon: Type[GeneSignature], auc_threshold: float = 0.05) -> pd.DataFrame: - """ - Calculate the enrichment of the regulon for the cells in the ranking dataframe. - - :param rnk_mtx: The ranked expression matrix (n_cells, n_genes). - :param regulon: The regulon the assess for enrichment - :param auc_threshold: The fraction of the ranked genome to take into account for the calculation of the - Area Under the recovery Curve. - :return: - """ - total_genes = len(rnk_mtx.columns) - index = pd.MultiIndex.from_tuples(list(zip(rnk_mtx.index.values, repeat(regulon.name))), names=["Cell", "Regulon"]) - rnk = rnk_mtx.iloc[:, rnk_mtx.columns.isin(regulon.genes)] - if rnk.empty or float(len(rnk.columns)) / len(regulon) < 0.80: - LOGGER.warning("Less than 80% of the genes in {} are present in the expression matrix.".format(regulon.name)) - return pd.DataFrame(index=index, data={"AUC": np.zeros(shape=(rnk_mtx.shape[0]), dtype=np.float64)}) - else: - weights = np.asarray([regulon[gene] if gene in regulon.genes else 1.0 for gene in rnk.columns.values]) - return pd.DataFrame(index=index, data={"AUC": aucs(rnk, total_genes, weights, auc_threshold)}) - - -def enrichment4features( - rnkdb: Type[RankingDatabase], gs: Type[GeneSignature], rank_threshold: int = 5000, auc_threshold: float = 0.05 -) -> pd.DataFrame: - """ - Calculate AUC and NES for all regulatory features in the supplied database using the genes of the give signature. - - :param rnkdb: The database. - :param gs: The gene signature the assess for enrichment - :param rank_threshold: The total number of ranked genes to take into account when creating a recovery curve. - :param auc_threshold: The fraction of the ranked genome to take into account for the calculation of the - Area Under the recovery Curve. - :return: A dataframe containing all information. - """ - assert rnkdb, "A database must be supplied" - assert gs, "A gene signature must be supplied" - - # Load rank of genes from database. - df = rnkdb.load(gs) - features = df.index.values - genes = df.columns.values - rankings = df.values - weights = np.asarray([gs[gene] for gene in genes]) - - rccs, aucs = recovery(df, rnkdb.total_genes, weights, rank_threshold, auc_threshold) - ness = (aucs - aucs.mean()) / aucs.std() - - # The creation of a dataframe is a severe performance penalty. - df_nes = pd.DataFrame(index=features, data={("Enrichment", "AUC"): aucs, ("Enrichment", "NES"): ness}) - df_rnks = pd.DataFrame( - index=features, columns=pd.MultiIndex.from_tuples(list(zip(repeat("Ranking"), genes))), data=rankings - ) - df_rccs = pd.DataFrame( - index=features, - columns=pd.MultiIndex.from_tuples(list(zip(repeat("Recovery"), np.arange(rank_threshold)))), - data=rccs, - ) - return pd.concat([df_nes, df_rccs, df_rnks], axis=1) - - -def leading_edge( - rcc: np.ndarray, avg2stdrcc: np.ndarray, ranking: np.ndarray, genes: np.ndarray, weights: Optional[np.array] = None -) -> Tuple[List[Tuple[str, float]], int]: - """ - Calculate the leading edge for a given recovery curve. - - :param rcc: The recovery curve. - :param avg2stdrcc: The average + 2 standard deviation recovery curve. - :param ranking: The rank numbers of the gene signature for a given regulatory feature. - :param genes: The genes corresponding to the ranking available in the aforementioned parameter. - :param weights: The weights for these genes. - :return: The leading edge returned as a list of tuple. Each tuple associates a gene part of the leading edge with - its rank or with its importance (if gene signature supplied). In addition, the rank at maximum difference is - returned - """ - - def critical_point(): - """Returns (rank_at_max, max_recovery).""" - rank_at_max = np.argmax(rcc - avg2stdrcc) - return rank_at_max, rcc[rank_at_max] - - def get_genes(rank_at_max): - sorted_idx = np.argsort(ranking) - sranking = ranking[sorted_idx] - gene_ids = genes[sorted_idx] - # Make sure to include the gene at the leading edge itself. This is different from the i-cisTarget implementation - # but is inline with the RcisTarget implementation. - filtered_idx = sranking <= rank_at_max - filtered_gene_ids = gene_ids[filtered_idx] - return list( - zip(filtered_gene_ids, weights[sorted_idx][filtered_idx] if weights is not None else sranking[filtered_idx]) - ) - - rank_at_max, n_recovered_genes = critical_point() - # noinspection PyTypeChecker - return get_genes(rank_at_max), rank_at_max - - -def leading_edge4row( - row: pd.Series, avg2stdrcc: np.ndarray, genes: np.ndarray, weights: Optional[np.array] = None -) -> pd.Series: - """ - Calculate the leading edge for a row of a dataframe. Should be used with partial function application to make this - function amenable to the apply idiom common for dataframes. - - :param row: The row of the dataframe to calculate the leading edge for. - :param avg2stdrcc: The average + 2 standard deviation recovery curve. - :param genes: The genes corresponding to the ranking available in the supplied row. - :param weights: The weights for these genes. - :return: The leading edge returned as a list of tuple. Each tuple associates a gene part of the leading edge with - its rank or with its importance (if gene signature supplied). - """ - return pd.Series(data=leading_edge(row['Recovery'].values, avg2stdrcc, row['Ranking'].values, genes, weights)) - - -# Giving numba a signature makes the code marginally faster but with losing flexibility (only being able to use one -# type of integers used in rankings). -# @jit(signature_or_function=float64(int16[:], int_, float64), nopython=True) -@jit(nopython=True) -def auc1d(ranking, rank_cutoff, max_auc): - """ - Calculate the AUC of the recovery curve of a single ranking. [DEPRECATED] - - :param ranking: The rank numbers of the genes. - :param rank_cutoff: The maximum rank to take into account when calculating the AUC. - :param max_auc: The maximum AUC. - :return: The normalized AUC. - """ - # Using concatenate and full constructs required by numba. - # The rankings are 0-based. The position at the rank threshold is included in the calculation. - x = np.concatenate((np.sort(ranking[ranking < rank_cutoff]), np.full((1,), rank_cutoff, dtype=np.int_))) - y = np.arange(x.size - 1) + 1.0 - return np.sum(np.diff(x) * y) / max_auc - - -@jit(nopython=True) -def weighted_auc1d(ranking, weights, rank_cutoff, max_auc): - """ - Calculate the AUC of the weighted recovery curve of a single ranking. - - :param ranking: The rank numbers of the genes. - :param weights: The associated weights. - :param rank_cutoff: The maximum rank to take into account when calculating the AUC. - :param max_auc: The maximum AUC. - :return: The normalized AUC. - """ - # Using concatenate and full constructs required by numba. - # The rankings are 0-based. The position at the rank threshold is included in the calculation. - filter_idx = ranking < rank_cutoff - x = ranking[filter_idx] - y = weights[filter_idx] - sort_idx = np.argsort(x) - x = np.concatenate((x[sort_idx], np.full((1,), rank_cutoff, dtype=np.int_))) - y = y[sort_idx].cumsum() - return np.sum(np.diff(x) * y) / max_auc - - -def auc2d(rankings, weights, rank_cutoff, max_auc): - """ - Calculate the AUCs of multiple rankings. - - :param ranking: The rankings. - :param auc_func: The 1d AUC function to use. - :param rank_cutoff: The maximum rank to take into account when calculating the AUC. - :param max_auc: The maximum AUC. - :return: The normalized AUCs. - """ - n_features = rankings.shape[0] - aucs = np.empty(shape=(n_features,), dtype=np.float64) # Pre-allocation. - for row_idx in range(n_features): - aucs[row_idx] = weighted_auc1d(rankings[row_idx, :], weights, rank_cutoff, max_auc) - return aucs - - -def aucs(rnk: pd.DataFrame, total_genes: int, weights: np.ndarray, auc_threshold: float) -> np.ndarray: - """ - Calculate AUCs (implementation without calculating recovery curves first). - - :param rnk: A dataframe containing the rank number of genes of interest. Columns correspond to genes. - :param total_genes: The total number of genes ranked. - :param weights: the weights associated with the selected genes. - :param auc_threshold: The fraction of the ranked genome to take into account for the calculation of the - Area Under the recovery Curve. - :return: An array with the AUCs. - """ - rank_cutoff = derive_rank_cutoff(auc_threshold, total_genes) - features, genes, rankings = rnk.index.values, rnk.columns.values, rnk.values - y_max = weights.sum() - # The rankings are 0-based. The position at the rank threshold is included in the calculation. - # The maximum AUC takes this into account. - # For reason of generating the same results as in R we introduce an error by adding one to the rank_cutoff - # for calculationg the maximum AUC. - maxauc = float((rank_cutoff + 1) * y_max) - assert maxauc > 0 - return auc2d(rankings, weights, rank_cutoff, maxauc) diff --git a/src/pyscenic/rnkdb.py b/src/pyscenic/rnkdb.py deleted file mode 100644 index acccf86..0000000 --- a/src/pyscenic/rnkdb.py +++ /dev/null @@ -1,623 +0,0 @@ -# -*- coding: utf-8 -*- - -import os -import numpy as np -import pandas as pd -import pyarrow as pa -import pyarrow.parquet as pq -import sqlite3 - -from abc import ABCMeta, abstractmethod -from operator import itemgetter -from typing import Dict, Set, Tuple, Type - -from cytoolz import memoize -from pyarrow.feather import write_feather, FeatherReader -from tqdm import tqdm - -from .genesig import GeneSignature - - -class PyArrowThreads: - """ - A static class to control how many threads PyArrow is allowed to use to convert a Feather database to a pandas - dataframe. - - By default the number of threads is set to 4. - Overriding the number of threads is possible by using the environment variable "PYARROW_THREADS=nbr_threads". - """ - - pyarrow_threads = 4 - - if os.environ.get("PYARROW_THREADS"): - try: - # If "PYARROW_THREADS" is set, check if it is a number. - pyarrow_threads = int(os.environ.get("PYARROW_THREADS")) - except ValueError: - pass - - if pyarrow_threads < 1: - # Set the number of PyArrow threads to 1 if a negative number or zero was specified. - pyarrow_threads = 1 - - @staticmethod - def set_nbr_pyarrow_threads(nbr_threads=None): - # Set number of threads to use for PyArrow when converting Feather database to pandas dataframe. - pa.set_cpu_count(nbr_threads if nbr_threads else PyArrowThreads.pyarrow_threads) - - -PyArrowThreads.set_nbr_pyarrow_threads() - - -class RankingDatabase(metaclass=ABCMeta): - """ - A class of a database of whole genome rankings. The whole genome is ranked for regulatory features of interest, e.g. - motifs for a transcription factor. - - The rankings of the genes are 0-based. - """ - - def __init__(self, name: str): - """ - Create a new instance. - - :param name: The name of the database. - """ - assert name, "Name must be specified." - - self._name = name - - @property - def name(self) -> str: - """ - The name of this database of rankings. - """ - return self._name - - @property - @abstractmethod - def total_genes(self) -> int: - """ - The total number of genes ranked. - """ - pass - - @property - @abstractmethod - def genes(self) -> Tuple[str]: - """ - List of genes ranked according to the regulatory features in this database. - """ - pass - - @property - @memoize - def geneset(self) -> Set[str]: - """ - Set of genes ranked according to the regulatory features in this database. - """ - return set(self.genes) - - @abstractmethod - def load_full(self) -> pd.DataFrame: - """ - Load the whole database into memory. - - :return: a dataframe. - """ - pass - - @abstractmethod - def load(self, gs: Type[GeneSignature]) -> pd.DataFrame: - """ - Load the ranking of the genes in the supplied signature for all features in this database. - - :param gs: The gene signature. - :return: a dataframe. - """ - pass - - def __str__(self): - """ - Returns a readable string representation. - """ - return self.name - - def __repr__(self): - """ - Returns a unambiguous string representation. - """ - return "{}(name=\"{}\")".format(self.__class__.__name__, self._name) - - -# SQL query to get the total number of genes in the database. -GENE_ID_COUNT_QUERY = r"SELECT COUNT(*) FROM rankings;" -# SQL query for retrieving the rankings for a particular set of genes. -RANKINGS_QUERY = r"SELECT geneID, ranking FROM rankings WHERE geneID IN ({0:s}) ORDER BY geneID;" -# SQL query that retrieves the ordered list of features in the database. -FEATURE_IDS_QUERY = r"SELECT motifName FROM motifs ORDER BY idx;" -# SQL query for retrieving the full list of genes scored in this database. -ALL_GENE_IDS_QUERY = r"SELECT geneID FROM rankings ORDER BY geneID;" -# SQL query for retrieving the the whole database. -ALL_RANKINGS_QUERY = r"SELECT geneID, ranking FROM rankings ORDER BY geneID;" - - -class SQLiteRankingDatabase(RankingDatabase): - """ - A class of a database of whole genome rankings. The whole genome is ranked for regulatory features of interest, e.g. - motifs for a transcription factor. - """ - - def __init__(self, fname: str, name: str): - """ - Create a new instance. - - :param fname: The name of the SQLite database file. - :param name: The name of the database. - """ - super().__init__(name) - - assert os.path.isfile(fname), "Database {0:s} doesn't exist.".format(fname) - - self._fname = fname - # Read-only view on SQLite database. - self._uri = 'file:{}?mode=ro'.format(os.path.abspath(fname)) - - with sqlite3.connect(self._uri, uri=True) as db: - cursor = db.cursor() - count = cursor.execute(GENE_ID_COUNT_QUERY).fetchone() - cursor.close() - self._gene_count = count[0] - - # Because of problems on same architectures use of unsigned integers is avoided. - def derive_dtype(n): - """Derive datatype for storing 0-based rankings for a given set length.""" - if n <= 2 ** 15: - # Range int16: -2^15 (= -32768) to 2^15 - 1 (= 32767). - return np.int16 - else: - # Range int32: -2^31 (= -2147483648) to 2^31 - 1 (= 2147483647). - return np.int32 - - self._dtype = derive_dtype(self._gene_count) - - @property - def total_genes(self) -> int: - """ - The total number of genes ranked. - """ - return self._gene_count - - @property - @memoize - def features(self) -> Tuple[str]: - """ - List of regulatory features for which whole genome rankings are available in this database. - """ - with sqlite3.connect(self._uri, uri=True) as db: - cursor = db.cursor() - features = tuple(map(itemgetter(0), cursor.execute(FEATURE_IDS_QUERY).fetchall())) - cursor.close() - return features - - @property - @memoize - def genes(self) -> Tuple[str]: - """ - List of genes ranked according to the regulatory features in this database. - """ - with sqlite3.connect(self._uri, uri=True) as db: - cursor = db.cursor() - genes = tuple(map(itemgetter(0), cursor.execute(ALL_GENE_IDS_QUERY).fetchall())) - cursor.close() - return genes - - def load_full(self) -> pd.DataFrame: - """ - Load the whole database into memory. - - :return: a dataframe. - """ - # Pre-allocate the matrix. - rankings = np.empty(shape=(len(self.features), len(self.genes)), dtype=self._dtype) - with sqlite3.connect(self._uri, uri=True) as db: - cursor = db.cursor() - for idx, (_, ranking) in enumerate(cursor.execute(ALL_RANKINGS_QUERY)): - rankings[:, idx] = np.frombuffer(ranking, dtype=self._dtype) - cursor.close() - - return pd.DataFrame(index=self.features, columns=self.genes, data=rankings) - - def load(self, gs: Type[GeneSignature]) -> pd.DataFrame: - """ - Load the ranking of the genes in the supplied signature for all features in this database. - - :param gs: The gene signature. - :return: A dataframe. - """ - assert gs, "A gene signature must be supplied" - - def quoted_csv(values): - # Escape single quotes (') by using (''), because sometimes ID's contain a single quote. - def quote(value): - return "'" + value.replace("'", "''") + "'" - - return ','.join(map(quote, values)) - - # For some genes in the signature there might not be a rank available in the database. - gene_set = self.geneset.intersection(set(gs.genes)) - # Pre-allocate the matrix. - rankings = np.empty(shape=(len(self.features), len(gene_set)), dtype=self._dtype) - with sqlite3.connect(self._uri, uri=True) as db: - cursor = db.cursor() - genes = [] - for idx, (gene, ranking) in enumerate(cursor.execute(RANKINGS_QUERY.format(quoted_csv(gene_set)))): - rankings[:, idx] = np.frombuffer(ranking, dtype=self._dtype) - genes.append(gene) - cursor.close() - - return pd.DataFrame(index=self.features, columns=genes, data=rankings) - - -INDEX_NAME = "features" - - -class FeatherRankingDatabase(RankingDatabase): - def __init__(self, fname: str, name: str): - """ - Create a new feather database. - - :param fname: The filename of the database. - :param name: The name of the database. - """ - super().__init__(name=name) - - assert os.path.isfile(fname), "Database {0:s} doesn't exist.".format(fname) - # FeatherReader cannot be pickle (important for dask framework) so filename is field instead. - self._fname = fname - - @property - @memoize - def total_genes(self) -> int: - # Do not count column 1 as it contains the index with the name of the features. - return FeatherReader(self._fname).num_columns - 1 - - @property - @memoize - def genes(self) -> Tuple[str]: - # noinspection PyTypeChecker - reader = FeatherReader(self._fname) - # Get all gene names (exclude "features" column). - return tuple( - reader.get_column_name(idx) - for idx in range(reader.num_columns) - if reader.get_column_name(idx) != INDEX_NAME - ) - - @property - @memoize - def genes2idx(self) -> Dict[str, int]: - return {gene: idx for idx, gene in enumerate(self.genes)} - - def load_full(self) -> pd.DataFrame: - df = FeatherReader(self._fname).read_pandas() - # Avoid copying the whole dataframe by replacing the index in place. - # This makes loading a database twice as fast in case the database file is already in the filesystem cache. - df.set_index(INDEX_NAME, inplace=True) - return df - - def load(self, gs: Type[GeneSignature]) -> pd.DataFrame: - # For some genes in the signature there might not be a rank available in the database. - gene_set = self.geneset.intersection(set(gs.genes)) - # Read ranking columns for genes in order they appear in the Feather file. - df = FeatherReader(self._fname).read_pandas( - columns=(INDEX_NAME,) + tuple(sorted(gene_set, key=lambda gene: self.genes2idx[gene])) - ) - # Avoid copying the whole dataframe by replacing the index in place. - # This makes loading a database twice as fast in case the database file is already in the filesystem cache. - df.set_index(INDEX_NAME, inplace=True) - return df - - -class ParquetRankingDatabase(RankingDatabase): - def __init__(self, fname: str, name: str): - """ - Create a new parquet database. - - :param fname: The filename of the database. - :param name: The name of the database. - """ - super().__init__(name=name) - - assert os.path.isfile(fname), "Database {0:s} doesn't exist.".format(fname) - # FeatherReader cannot be pickle (important for dask framework) so filename is field instead. - self._fname = fname - - @property - @memoize - def total_genes(self) -> int: - # Do not count column 1 as it contains the index with the name of the features. - return pq.read_metadata(self._fname).num_columns - 1 - - @property - @memoize - def genes(self) -> Tuple[str]: - # noinspection PyTypeChecker - metadata = pq.read_metadata(self._fname) - assert metadata.num_row_groups == 1, "Parquet database {0:s} has more than one row group.".format(self._fname) - metadata_row_group = metadata.row_group(0) - # Get all gene names (exclude "features" column). - return tuple( - metadata_row_group.column(idx).path_in_schema - for idx in range(0, metadata.num_columns) - if metadata_row_group.column(idx).path_in_schema != INDEX_NAME - ) - - @property - @memoize - def genes2idx(self) -> Dict[str, int]: - return {gene: idx for idx, gene in enumerate(self.genes)} - - def load_full(self) -> pd.DataFrame: - df = pq.read_pandas(self._fname).to_pandas() - # Avoid copying the whole dataframe by replacing the index in place. - # This makes loading a database twice as fast in case the database file is already in the filesystem cache. - df.set_index(INDEX_NAME, inplace=True) - return df - - def load(self, gs: Type[GeneSignature]) -> pd.DataFrame: - # For some genes in the signature there might not be a rank available in the database. - gene_set = self.geneset.intersection(set(gs.genes)) - # Read rankings columns for genes in order they appear in the Parquet file. - df = pq.read_pandas( - self._fname, columns=(INDEX_NAME,) + tuple(sorted(gene_set, key=lambda gene: self.genes2idx[gene])) - ).to_pandas() - # Avoid copying the whole dataframe by replacing the index in place. - # This makes loading a database twice as fast in case the database file is already in the filesystem cache. - df.set_index(INDEX_NAME, inplace=True) - return df - - -class MemoryDecorator(RankingDatabase): - """ - A decorator for a ranking database which loads the entire database in memory. - """ - - def __init__(self, db: Type[RankingDatabase]): - assert db, "Database should be supplied." - self._db = db - self._df = db.load_full() - super().__init__(db.name) - - @property - def total_genes(self) -> int: - return self._db.total_genes - - @property - def genes(self) -> Tuple[str]: - return self._db.genes - - def load_full(self) -> pd.DataFrame: - return self._df - - def load(self, gs: Type[GeneSignature]) -> pd.DataFrame: - return self._df.loc[:, self._df.columns.isin(gs.genes)] - - -class DataFrameRankingDatabase(RankingDatabase): - """ - A ranking database from a dataframe. - """ - - def __init__(self, df: pd.DataFrame, name: str): - self._df = df - super().__init__(name) - - @property - def total_genes(self) -> int: - return len(self._df.columns) - - @property - def genes(self) -> Tuple[str]: - return tuple(self._df.columns) - - def load_full(self) -> pd.DataFrame: - return self._df - - def load(self, gs: Type[GeneSignature]) -> pd.DataFrame: - return self._df.loc[:, self._df.columns.isin(gs.genes)] - - def save(self, fname: str): - """ - Save database as feather file. - - :param fname: The name of the file to create. - """ - assert not os.path.exists(fname) - df = self._df.copy() - df.index.name = INDEX_NAME - df.reset_index( - inplace=True - ) # Index is not stored in feather format. https://github.com/wesm/feather/issues/200 - write_feather(df, fname) - - -IDENTIFIERS_FNAME_EXTENSION = "identifiers.txt" -INVERTED_DB_DTYPE = np.uint32 - - -class InvertedRankingDatabase(RankingDatabase): - @classmethod - def _derive_identifiers_fname(cls, fname): - return '{}.{}'.format(os.path.splitext(fname)[0], IDENTIFIERS_FNAME_EXTENSION) - - @classmethod - def invert(cls, db: Type[RankingDatabase], fname: str, top_n_identifiers: int = 50000) -> None: - """ - Create an inverted whole genome rankings database keeping only the top n genes/regions for a feature. - - Inverted design: not storing the rankings for all regions in the dataframe but instead store the identifier of the - top n genes/regions in the dataframe introduces an enormous reduction in disk and memory size. - - :param db: The rankings database. - :param fname: the filename of the inverted database to be created. - :param top_n_identifiers: The number of genes to keep in the inverted database. - """ - - df_original = db.load_full() - n_features = len(df_original) - - index_fname = InvertedRankingDatabase._derive_identifiers_fname(fname) - assert not os.path.exists(index_fname), "Database index {0:s} already exists.".format(index_fname) - identifiers = df_original.columns.values - with open(index_fname, 'w') as f: - f.write('\n'.join(identifiers)) - identifier2idx = {identifier: idx for idx, identifier in enumerate(identifiers)} - - inverted_data = np.empty(shape=(n_features, top_n_identifiers), dtype=INVERTED_DB_DTYPE) - df_original.columns = [identifier2idx[identifier] for identifier in df_original.columns] - for idx, (_, row) in tqdm(enumerate(df_original.iterrows())): - inverted_data[idx, :] = np.array( - row.sort_values(ascending=True).head(top_n_identifiers).index, dtype=INVERTED_DB_DTYPE - ) - df = pd.DataFrame(data=inverted_data, index=df_original.index, columns=list(range(top_n_identifiers))) - - df.index.name = INDEX_NAME - df.reset_index( - inplace=True - ) # Index is not stored in feather format. https://github.com/wesm/feather/issues/200 - write_feather(df, fname) - - @classmethod - def revert(cls, db: 'InvertedRankingDatabase', fname: str) -> None: - """ - Revert an inverted database to a normal format. - - :param db: The inverted database - :param fname: the filename for the new database. - """ - # TODO: The memory requirement of this method might be prohibitively large! - n = len(db.genes) - db.max_rank - rank_unknown = np.iinfo(INVERTED_DB_DTYPE).max - df = db.load(GeneSignature(name="all", gene2weight=db.genes)) - for ridx, row in df.iterrows(): - df[ridx, row == rank_unknown] = np.random.randint(low=0, high=n, size=n) - DataFrameRankingDatabase(df=df, name=db.name).save(fname) - - def __init__(self, fname: str, name: str): - """ - Create a new inverted database. - - :param fname: The filename of the database. - :param name: The name of the database. - """ - super().__init__(name=name) - - assert os.path.isfile(fname), "Database {0:s} doesn't exist.".format(fname) - - # Load mapping from gene/region identifiers to index values used in stored in inverted database. - index_fname = InvertedRankingDatabase._derive_identifiers_fname(fname) - assert os.path.isfile(fname), "Database index {0:s} doesn't exist.".format(index_fname) - self.identifier2idx = self._load_identifier2idx(index_fname) - self.idx2identifier = {idx: identifier for identifier, idx in self.identifier2idx.items()} - - # Load dataframe into memory in a format most suited for fast loading of gene signatures. - df = FeatherReader(fname).read_pandas() - # Avoid copying the whole dataframe by replacing the index in place. - # This makes loading a database twice as fast in case the database file is already in the filesystem cache. - df.set_index(INDEX_NAME, inplace=True) - self.max_rank = len(df.columns) - self.features = [pd.Series(index=row.values, data=row.index, name=name) for name, row in df.iterrows()] - - def _load_identifier2idx(self, fname): - with open(fname, 'r') as f: - return {line.strip(): idx for idx, line in enumerate(f)} - - @property - def total_genes(self) -> int: - return len(self.identifier2idx) - - @property - @memoize - def genes(self) -> Tuple[str]: - # noinspection PyTypeChecker - return tuple(self.identifier2idx.keys()) - - def is_valid_rank_threshold(self, rank_threshold: int) -> bool: - return rank_threshold <= self.max_rank - - def load_full(self) -> pd.DataFrame: - # Loading the whole database into memory is not possible with an inverted database. - # Decoration with a MemoryDecorator is not possible and will be prevented by raising - # an exception. - raise NotImplemented - - def load(self, gs: Type[GeneSignature]) -> pd.DataFrame: - rank_unknown = np.iinfo(INVERTED_DB_DTYPE).max - reference_identifiers = np.array([self.identifier2idx[identifier] for identifier in gs.genes]) - return ( - pd.concat( - [col.reindex(index=reference_identifiers, fill_value=rank_unknown) for col in self.features], axis=1 - ) - .T.astype(INVERTED_DB_DTYPE) - .rename(columns=self.idx2identifier) - ) - - -def convert_sqlitedb_to_featherdb(fname: str, out_folder: str, name: str, extension: str = "feather") -> str: - """ - Convert a whole genome SQLite rankings database to a feather format based database. - - More information on this format can be found here: - .. feather-format: https://blog.rstudio.com/2016/03/29/feather/ - - :param fname: The filename of the legacy SQLite rankings database. - :param out_folder: The name of the folder to write the new database to. - :param name: The name of the rankings database. - :param extension: The extension of the new database file. - :return: The filename of the new database. - """ - assert os.path.isfile(fname), "{} does not exist.".format(fname) - assert os.path.isdir(out_folder), "{} is not a directory.".format(out_folder) - - feather_fname = os.path.join(out_folder, "{}.{}".format(os.path.splitext(os.path.basename(fname))[0], extension)) - assert not os.path.exists(feather_fname), "{} already exists.".format(feather_fname) - - # Load original database into memory. - # Caveat: the original storage format of whole genome rankings does not store the metadata, i.e. name. - db = SQLiteRankingDatabase(fname=fname, name=name) - df = db.load_full() - df.index.name = INDEX_NAME - # Index is not stored in feather format: https://github.com/wesm/feather/issues/200 - df.reset_index(inplace=True) - write_feather(df, feather_fname) - return feather_fname - - -def opendb(fname: str, name: str) -> Type['RankingDatabase']: - """ - Open a ranking database. - - :param fname: The filename of the database. - :param name: The name of the database. - :return: A ranking database. - """ - assert os.path.isfile(fname), "{} does not exist.".format(fname) - assert name, "A database should be given a proper name." - - extension = os.path.splitext(fname)[1] - if extension == ".parquet": - # noinspection PyTypeChecker - return ParquetRankingDatabase(fname, name=name) - elif extension == ".feather": - if fname.endswith(".inverted.feather"): - # noinspection PyTypeChecker - return InvertedRankingDatabase(fname, name=name) - else: - # noinspection PyTypeChecker - return FeatherRankingDatabase(fname, name=name) - elif extension in (".db", ".sqlite", ".sqlite3"): - # noinspection PyTypeChecker - return SQLiteRankingDatabase(fname, name=name) - else: - raise ValueError("{} is an unknown extension.".format(extension)) diff --git a/tests/test_featherdb.py b/tests/test_featherdb.py deleted file mode 100644 index 40eebf0..0000000 --- a/tests/test_featherdb.py +++ /dev/null @@ -1,45 +0,0 @@ -# -*- coding: utf-8 -*- - -import pytest -from pkg_resources import resource_filename - -from pyscenic.genesig import GeneSignature -from pyscenic.rnkdb import FeatherRankingDatabase as RankingDatabase - -TEST_DATABASE_FNAME = resource_filename('resources.tests', "hg19-tss-centered-10kb-10species.mc9nr.feather") -TEST_DATABASE_NAME = "hg19-tss-centered-10kb-10species.mc9nr" -TEST_SIGNATURE_FNAME = resource_filename('resources.tests', "c6.all.v6.1.symbols.gmt") - - -@pytest.fixture -def db(): - return RankingDatabase(TEST_DATABASE_FNAME, TEST_DATABASE_NAME) - - -@pytest.fixture -def gs(): - return GeneSignature.from_gmt(TEST_SIGNATURE_FNAME, gene_separator="\t", field_separator="\t", )[0] - - -def test_init(db): - assert db.name == TEST_DATABASE_NAME - - -def test_total_genes(db): - assert db.total_genes == 22284 - - -def test_genes(db): - assert len(db.genes) == 22284 - - -def test_load_full(db): - rankings = db.load_full() - assert len(rankings.index) == 5 - assert len(rankings.columns) == 22284 - - -def test_load(db, gs): - rankings = db.load(gs) - assert len(rankings.index) == 5 - assert len(rankings.columns) == 29 diff --git a/tests/test_genesig.py b/tests/test_genesig.py deleted file mode 100644 index 10c05b1..0000000 --- a/tests/test_genesig.py +++ /dev/null @@ -1,171 +0,0 @@ -# -*- coding: utf-8 -*- - -import attr -import pytest -from pkg_resources import resource_filename - -from pyscenic.genesig import GeneSignature, Regulon - -TEST_SIGNATURE = "msigdb_cancer_c6" -TEST_SIGNATURE_FNAME = resource_filename('resources.tests', "c6.all.v6.1.symbols.gmt") - - -def test_init1(): - gs1 = GeneSignature(name="test1", gene2weight=['TP53', 'SOX4']) - assert 'TP53' in gs1 - assert 'SOX4' in gs1 - assert gs1.name == 'test1' - assert len(gs1) == 2 - assert gs1.gene2weight['TP53'] == 1.0 - assert gs1.gene2weight['SOX4'] == 1.0 - - -def test_init2(): - gs1 = GeneSignature(name="test1", gene2weight=[('TP53', 0.5), ('SOX4', 0.75)]) - assert 'TP53' in gs1 - assert 'SOX4' in gs1 - assert gs1.name == 'test1' - assert len(gs1) == 2 - assert gs1.gene2weight['TP53'] == 0.5 - assert gs1.gene2weight['SOX4'] == 0.75 - - -def test_init3(): - gs1 = GeneSignature(name="test1", gene2weight={'TP53': 0.5, 'SOX4': 0.75}) - assert 'TP53' in gs1 - assert 'SOX4' in gs1 - assert gs1.name == 'test1' - assert len(gs1) == 2 - assert gs1.gene2weight['TP53'] == 0.5 - assert gs1.gene2weight['SOX4'] == 0.75 - - -def test_immut(): - gs1 = GeneSignature(name="test1", gene2weight={'TP53': 0.5, 'SOX4': 0.75}) - with pytest.raises(attr.exceptions.FrozenInstanceError): - gs1.name = 'rename' - with pytest.raises(TypeError): - gs1.gene2weight['TP53'] = 0.6 - - -def test_genes(): - gs1 = GeneSignature(name="test1", gene2weight={'TP53': 0.5, 'SOX4': 0.75}) - assert gs1.genes == ('SOX4', 'TP53') - - -def test_dict(): - gs1 = GeneSignature(name="test1", gene2weight={'TP53': 0.5, 'SOX4': 0.75}) - assert gs1['TP53'] == 0.5 - assert gs1['SOX4'] == 0.75 - - -def test_rename(): - gs1 = GeneSignature(name="test1", gene2weight={'TP53': 0.5, 'SOX4': 0.75}) - gs2 = gs1.rename('test2') - assert 'TP53' in gs2 - assert 'SOX4' in gs2 - assert gs2.name == 'test2' - assert len(gs2) == 2 - assert gs2.gene2weight['TP53'] == 0.5 - assert gs2.gene2weight['SOX4'] == 0.75 - - -def test_union1(): - gs1 = GeneSignature(name="test1", gene2weight=['TP53', 'SOX4']) - gs2 = GeneSignature(name="test1", gene2weight=['TP53', 'SOX2']) - gsu = gs1.union(gs2) - assert 'TP53' in gsu - assert 'SOX4' in gsu - assert 'SOX2' in gsu - assert len(gsu) == 3 - - -def test_union3(): - gs1 = GeneSignature(name="test1", gene2weight={'TP53': 0.8, 'SOX4': 0.75}) - gs2 = GeneSignature(name="test1", gene2weight={'TP53': 0.3, 'SOX2': 0.60}) - gsu = gs1.union(gs2) - assert 'TP53' in gsu - assert gsu.gene2weight['TP53'] == 0.8 - assert 'SOX4' in gsu - assert gsu.gene2weight['SOX4'] == 0.75 - assert 'SOX2' in gsu - assert gsu.gene2weight['SOX2'] == 0.6 - assert len(gsu) == 3 - - -def test_diff1(): - gs1 = GeneSignature(name="test1", gene2weight=['TP53', 'SOX4']) - gs2 = GeneSignature(name="test1", gene2weight=['TP53', 'SOX2']) - gsu = gs1.difference(gs2) - assert 'SOX4' in gsu - assert len(gsu) == 1 - - -def test_diff3(): - gs1 = GeneSignature(name="test1", gene2weight={'TP53': 0.8, 'SOX4': 0.75}) - gs2 = GeneSignature(name="test1", gene2weight={'TP53': 0.3, 'SOX2': 0.60}) - gsu = gs1.difference(gs2) - assert 'SOX4' in gsu - assert gsu.gene2weight['SOX4'] == 0.75 - assert len(gsu) == 1 - - -def test_intersection1(): - gs1 = GeneSignature(name="test1", gene2weight=['TP53', 'SOX4']) - gs2 = GeneSignature(name="test1", gene2weight=['TP53', 'SOX2']) - gsu = gs1.intersection(gs2) - assert 'TP53' in gsu - assert len(gsu) == 1 - - -def test_intersection3(): - gs1 = GeneSignature(name="test1", gene2weight={'TP53': 0.8, 'SOX4': 0.75}) - gs2 = GeneSignature(name="test1", gene2weight={'TP53': 0.3, 'SOX2': 0.60}) - gsu = gs1.intersection(gs2) - assert len(gsu) == 1 - assert 'TP53' in gsu - assert gsu.gene2weight['TP53'] == 0.8 - - -def test_regulon(): - reg = Regulon(name='TP53 regulon', gene2weight={'TP53': 0.8, 'SOX4': 0.75}, transcription_factor="TP53", gene2occurrence={"TP53": 1}) - assert reg.transcription_factor == "TP53" - - -def test_load_gmt(): - gss = GeneSignature.from_gmt(field_separator='\t', gene_separator='\t', fname=TEST_SIGNATURE_FNAME) - # http://software.broadinstitute.org/gsea/msigdb/collections.jsp#C6 - assert len(gss) == 189 - assert gss[0].name == "GLI1_UP.V1_DN" - assert "COPZ1" in gss[0] - assert len(gss[0]) == 29 - - -def test_add(): - gss = GeneSignature.from_gmt(field_separator='\t', gene_separator='\t', fname=TEST_SIGNATURE_FNAME) - res = gss[0].add("MEF2") - assert "MEF2" in res - - -def test_noweights(): - gs1 = GeneSignature(name="test1", gene2weight={'TP53': 0.8, 'SOX4': 0.75}) - gs2 = gs1.noweights() - assert gs1['TP53'] == 0.8 - assert gs2['TP53'] == 1.0 - - reg1 = Regulon(name='TP53 regulon', gene2weight={'TP53': 0.8, 'SOX4': 0.75}, transcription_factor="TP53", gene2occurrence={"TP53": 1}) - reg2 = reg1.noweights() - assert reg1['TP53'] == 0.8 - assert reg2['TP53'] == 1.0 - assert isinstance(reg2, Regulon) - - -def test_head(): - gs1 = GeneSignature(name="test1", gene2weight={'TP53': 0.8, 'SOX4': 0.75}) - gs2 = gs1.head(1) - assert gs2['TP53'] == 0.8 - assert len(gs2) == 1 - gs2 = gs1.head(2) - assert gs2['TP53'] == 0.8 - assert gs2['SOX4'] == 0.75 - assert len(gs2) == 2 diff --git a/tests/test_parquetdb.py b/tests/test_parquetdb.py deleted file mode 100644 index 21791f6..0000000 --- a/tests/test_parquetdb.py +++ /dev/null @@ -1,50 +0,0 @@ -# -*- coding: utf-8 -*- - -import pytest -from pkg_resources import resource_filename - -from pyscenic.genesig import GeneSignature -from pyscenic.rnkdb import ParquetRankingDatabase as RankingDatabase - -TEST_DATABASE_FNAME = resource_filename('resources.tests', "hg19-tss-centered-10kb-10species.mc9nr.parquet") -TEST_DATABASE_NAME = "hg19-tss-centered-10kb-10species.mc9nr" -TEST_SIGNATURE_FNAME = resource_filename('resources.tests', "c6.all.v6.1.symbols.gmt") - -################################################## -# temporarily disable testing of the parqet databases until we have a working test database -from os import path -pytestmark = pytest.mark.skipif(not path.exists(TEST_DATABASE_FNAME), reason="Parquet testing is temporarily disabled.") -################################################## - -@pytest.fixture -def db(): - return RankingDatabase(TEST_DATABASE_FNAME, TEST_DATABASE_NAME) - - -@pytest.fixture -def gs(): - return GeneSignature.from_gmt(TEST_SIGNATURE_FNAME, gene_separator="\t", field_separator="\t", )[0] - - -def test_init(db): - assert db.name == TEST_DATABASE_NAME - - -def test_total_genes(db): - assert db.total_genes == 22284 - - -def test_genes(db): - assert len(db.genes) == 22284 - - -def test_load_full(db): - rankings = db.load_full() - assert len(rankings.index) == 5 - assert len(rankings.columns) == 22284 - - -def test_load(db, gs): - rankings = db.load(gs) - assert len(rankings.index) == 5 - assert len(rankings.columns) == 29 diff --git a/tests/test_recovery.py b/tests/test_recovery.py deleted file mode 100644 index 16fb7ae..0000000 --- a/tests/test_recovery.py +++ /dev/null @@ -1,106 +0,0 @@ -# -*- coding: utf-8 -*- - -import numpy as np -import pytest -from pkg_resources import resource_filename - -from pyscenic.genesig import GeneSignature -from pyscenic.recovery import enrichment4features as enrichment, auc1d, weighted_auc1d, rcc2d -from pyscenic.rnkdb import FeatherRankingDatabase as RankingDatabase - -TEST_DATABASE_FNAME = resource_filename('resources.tests', "hg19-tss-centered-10kb-10species.mc9nr.feather") -TEST_DATABASE_NAME = "hg19-tss-centered-10kb-10species.mc9nr" -TEST_SIGNATURE_FNAME = resource_filename('resources.tests', "c6.all.v6.1.symbols.gmt") - - -@pytest.fixture -def db(): - return RankingDatabase(TEST_DATABASE_FNAME, TEST_DATABASE_NAME, ) - - -@pytest.fixture -def gs(): - return GeneSignature.from_gmt(TEST_SIGNATURE_FNAME, gene_separator="\t", field_separator="\t", )[0] - - -def test_enrichment(db, gs): - df = enrichment(db, gs) - - -def test_auc1d_1(): - # Check if AUC is calculated correctly when a gene is recovered at the rank threshold. - # In the python implementation it should be included in the AUC calculation. - # For the current gene list the non-normalized AUC should be: (2*1) + (2*2) + (2*3) + (1*4) = 16. - - total_genes = 100 - # This rank threshold is inclusive. - auc_rank_threshold = 8 - # The databases have a zero-based ranking system. - ranking = np.asarray([2, 4, 6, 8]) - 1 - auc_max = auc_rank_threshold * total_genes - assert 16.0 / 800 == auc1d(ranking, auc_rank_threshold, auc_max) - - -def test_auc1d_2(): - # For the current gene list the non-normalized AUC should be: (2*1) + (2*2) + (3*3) = 15. - - total_genes = 100 - # This rank threshold is inclusive. - auc_rank_threshold = 8 - # The databases have a zero-based ranking system. - ranking = np.asarray([2, 4, 6]) - 1 - auc_max = auc_rank_threshold * total_genes - assert 15.0 / 800 == auc1d(ranking, auc_rank_threshold, auc_max) - - -def test_weighted_auc1d(): - # CAVE: In python the ranking databases are 0-based. The only difference a 1-based system has on the calc - # of the AUC is that in the latter the rank threshold would not be included. This has an influence on the - # normalization factor max AUC but nothing else. - - total_genes = 100 - auc_rank_threshold = 8 - # The databases have a zero-based ranking system. - ranking = np.asarray([2, 4, 6]) - 1 - weights = np.ones(len(ranking)) - # Disable normalization. - auc_max = 1.0 - assert 15.0 == weighted_auc1d(ranking, weights, auc_rank_threshold, auc_max) - - -def test_weighted_auc1d_batch(): - # The assumption taken here is that for weights uniformly being set to 1.0, - # auc1d and weighted_auc1d should always have the same output. - - N = 100000 - total_genes = 100 - for _ in range(N): - auc_rank_threshold = np.random.randint(2, high=total_genes) - n_genes = np.random.randint(20, high=total_genes) - ranking = np.random.randint(low=0, high=total_genes, size=n_genes) - weights = np.ones(n_genes) - # Disable normalization. - auc_max = 1.0 - assert auc1d(ranking, auc_rank_threshold, auc_max) \ - == weighted_auc1d(ranking, weights, auc_rank_threshold, auc_max) - - -def test_weighted_rcc2d_batch(): - # The assumption taken here is that for weights uniformly being set to 1.0, - # auc1d and weighted_auc1d should always have the same output. - - N = 100000 - total_genes = 100 - for _ in range(N): - # rccs[row_idx, :] = np.cumsum(np.bincount(curranking, weights=weights)[:rank_threshold]) - # ValueError: could not broadcast input array from shape (89) into shape (97) - auc_rank_threshold = np.random.randint(2, high=total_genes) - n_genes = np.random.randint(20, high=total_genes) - ranking = np.random.randint(low=0, high=total_genes, size=n_genes) - rankings = ranking.reshape((1, n_genes)) - rankings = np.append(rankings, np.full(shape=(1, 1), fill_value=total_genes), axis=1) - weights = np.ones(n_genes) - # Disable normalization. - auc_max = 1.0 - assert rcc2d(rankings, np.insert(weights, len(weights), 0.0), total_genes)[:, :auc_rank_threshold].sum(axis=1) \ - == weighted_auc1d(ranking, weights, auc_rank_threshold, auc_max) diff --git a/tests/test_sqlitedb.py b/tests/test_sqlitedb.py deleted file mode 100644 index ef6a9a6..0000000 --- a/tests/test_sqlitedb.py +++ /dev/null @@ -1,45 +0,0 @@ -# -*- coding: utf-8 -*- - -import pytest -from pkg_resources import resource_filename - -from pyscenic.genesig import GeneSignature -from pyscenic.rnkdb import SQLiteRankingDatabase as RankingDatabase - -TEST_DATABASE_FNAME = resource_filename('resources.tests', "hg19-tss-centered-5kb-10species.mc9nr.db") -TEST_DATABASE_NAME = "hg19-tss-centered-5kb-10species.mc9nr" -TEST_SIGNATURE_FNAME = resource_filename('resources.tests', "c6.all.v6.1.symbols.gmt") - - -@pytest.fixture -def db(): - return RankingDatabase(TEST_DATABASE_FNAME, TEST_DATABASE_NAME) - - -@pytest.fixture -def gs(): - return GeneSignature.from_gmt(TEST_SIGNATURE_FNAME, gene_separator="\t", field_separator="\t", )[0] - - -def test_init(db): - assert db.name == TEST_DATABASE_NAME - - -def test_total_genes(db): - assert db.total_genes == 29 - - -def test_genes(db): - assert len(db.genes) == 29 - - -def test_load_full(db): - rankings = db.load_full() - assert len(rankings.index) == 24453 - assert len(rankings.columns) == 29 - - -def test_load(db, gs): - rankings = db.load(gs) - assert len(rankings.index) == 24453 - assert len(rankings.columns) == 29 From e2f2babba3b1dee7e40c52e792ce79cd2a5129bf Mon Sep 17 00:00:00 2001 From: Chris Flerin Date: Thu, 6 May 2021 13:49:34 +0200 Subject: [PATCH 2/5] Update imports to use ctxcore functions --- docs/faq.rst | 2 +- docs/tutorial.rst | 2 +- requirements.txt | 1 + src/pyscenic/aucell.py | 4 ++-- src/pyscenic/cli/db2feather.py | 2 +- src/pyscenic/cli/gmt2regions.py | 2 +- src/pyscenic/cli/invertdb.py | 2 +- src/pyscenic/cli/pyscenic.py | 2 +- src/pyscenic/cli/utils.py | 2 +- src/pyscenic/export.py | 2 +- src/pyscenic/prune.py | 4 ++-- src/pyscenic/regions.py | 4 ++-- src/pyscenic/transform.py | 8 ++++---- src/pyscenic/utils.py | 2 +- 14 files changed, 20 insertions(+), 19 deletions(-) diff --git a/docs/faq.rst b/docs/faq.rst index c6a62c9..c0f06d8 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -101,7 +101,7 @@ Yes you can. The code snippet below shows you how to create your own databases: .. code-block:: python - from pyscenic.rnkdb import DataFrameRankingDatabase as RankingDatabase + from ctxcore.rnkdb import DataFrameRankingDatabase as RankingDatabase import numpy as np import pandas as pd diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 1f682f3..bbc33e4 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -56,7 +56,7 @@ First we import the necessary modules and declare some constants: from arboreto.utils import load_tf_names from arboreto.algo import grnboost2 - from pyscenic.rnkdb import FeatherRankingDatabase as RankingDatabase + from ctxcore.rnkdb import FeatherRankingDatabase as RankingDatabase from pyscenic.utils import modules_from_adjacencies, load_motifs from pyscenic.prune import prune2df, df2regulons from pyscenic.aucell import aucell diff --git a/requirements.txt b/requirements.txt index 6422672..727bd4b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ cytoolz +ctxcore multiprocessing_on_dill llvmlite numba>=0.51.2 diff --git a/src/pyscenic/aucell.py b/src/pyscenic/aucell.py index 34a59e5..3b0109a 100644 --- a/src/pyscenic/aucell.py +++ b/src/pyscenic/aucell.py @@ -1,10 +1,10 @@ # -*- coding: utf-8 -*- import pandas as pd -from .recovery import enrichment4cells +from ctxcore.recovery import enrichment4cells from tqdm import tqdm from typing import Sequence, Type -from .genesig import GeneSignature +from ctxcore.genesig import GeneSignature from multiprocessing import cpu_count, Process, Array from boltons.iterutils import chunked from multiprocessing.sharedctypes import RawArray diff --git a/src/pyscenic/cli/db2feather.py b/src/pyscenic/cli/db2feather.py index 1c5e5d3..c3d5fc3 100644 --- a/src/pyscenic/cli/db2feather.py +++ b/src/pyscenic/cli/db2feather.py @@ -2,7 +2,7 @@ import os import argparse -from pyscenic.rnkdb import convert_sqlitedb_to_featherdb +from ctxcore.rnkdb import convert_sqlitedb_to_featherdb def derive_db_name(fname: str) -> str: diff --git a/src/pyscenic/cli/gmt2regions.py b/src/pyscenic/cli/gmt2regions.py index 04df179..dc34cd8 100644 --- a/src/pyscenic/cli/gmt2regions.py +++ b/src/pyscenic/cli/gmt2regions.py @@ -3,7 +3,7 @@ import os import argparse import sys -from pyscenic.genesig import GeneSignature +from ctxcore.genesig import GeneSignature from pyscenic.regions import RegionRankingDatabase, Delineation, convert diff --git a/src/pyscenic/cli/invertdb.py b/src/pyscenic/cli/invertdb.py index c92d0c8..ee394ea 100644 --- a/src/pyscenic/cli/invertdb.py +++ b/src/pyscenic/cli/invertdb.py @@ -2,7 +2,7 @@ import os import argparse -from pyscenic.rnkdb import opendb, InvertedRankingDatabase +from ctxcore.rnkdb import opendb, InvertedRankingDatabase def derive_db_name(fname: str) -> str: diff --git a/src/pyscenic/cli/pyscenic.py b/src/pyscenic/cli/pyscenic.py index f706f2b..72c1047 100644 --- a/src/pyscenic/cli/pyscenic.py +++ b/src/pyscenic/cli/pyscenic.py @@ -17,7 +17,7 @@ from arboreto.utils import load_tf_names from pyscenic.utils import modules_from_adjacencies, add_correlation -from pyscenic.rnkdb import opendb, RankingDatabase +from ctxcore.rnkdb import opendb, RankingDatabase from pyscenic.prune import prune2df, find_features, _prepare_client from pyscenic.aucell import aucell from pyscenic.log import create_logging_handler diff --git a/src/pyscenic/cli/utils.py b/src/pyscenic/cli/utils.py index 169280a..b9ab8f8 100644 --- a/src/pyscenic/cli/utils.py +++ b/src/pyscenic/cli/utils.py @@ -10,7 +10,7 @@ import loompy as lp from operator import attrgetter from typing import Type, Sequence -from pyscenic.genesig import GeneSignature, openfile +from ctxcore.genesig import GeneSignature, openfile from pyscenic.transform import df2regulons from pyscenic.utils import load_motifs, load_from_yaml, save_to_yaml from pyscenic.binarization import binarize diff --git a/src/pyscenic/export.py b/src/pyscenic/export.py index c4a5313..f75f022 100644 --- a/src/pyscenic/export.py +++ b/src/pyscenic/export.py @@ -6,7 +6,7 @@ import loompy as lp from sklearn.manifold import TSNE from .aucell import aucell -from .genesig import Regulon +from ctxcore.genesig import Regulon from typing import List, Mapping, Union, Sequence, Optional from operator import attrgetter from multiprocessing import cpu_count diff --git a/src/pyscenic/prune.py b/src/pyscenic/prune.py index e0d7b13..873bf5e 100644 --- a/src/pyscenic/prune.py +++ b/src/pyscenic/prune.py @@ -25,9 +25,9 @@ from dask.distributed import LocalCluster, Client from .log import create_logging_handler -from .genesig import Regulon, GeneSignature +from ctxcore.genesig import Regulon, GeneSignature from .utils import load_motif_annotations -from .rnkdb import RankingDatabase, MemoryDecorator +from ctxcore.rnkdb import RankingDatabase, MemoryDecorator from .utils import add_motif_url from .transform import module2features_auc1st_impl, modules2regulons, modules2df, df2regulons, DF_META_DATA diff --git a/src/pyscenic/regions.py b/src/pyscenic/regions.py index 510673c..296c5de 100644 --- a/src/pyscenic/regions.py +++ b/src/pyscenic/regions.py @@ -6,9 +6,9 @@ from enum import Enum from pkg_resources import resource_stream from typing import Tuple, Type, Iterator, Sequence -from .rnkdb import InvertedRankingDatabase +from ctxcore.rnkdb import InvertedRankingDatabase from .featureseq import FeatureSeq -from .genesig import GeneSignature, Regulon +from ctxcore.genesig import GeneSignature, Regulon from cytoolz import merge_with, memoize from itertools import repeat, chain import pandas as pd diff --git a/src/pyscenic/transform.py b/src/pyscenic/transform.py index d74067e..96faae7 100644 --- a/src/pyscenic/transform.py +++ b/src/pyscenic/transform.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -from .recovery import recovery, aucs as calc_aucs +from ctxcore.recovery import recovery, aucs as calc_aucs import logging import traceback import pandas as pd @@ -15,11 +15,11 @@ REPRESSING_MODULE, ) from itertools import repeat -from .rnkdb import RankingDatabase +from ctxcore.rnkdb import RankingDatabase from functools import reduce from typing import Type, Sequence, Optional -from .genesig import Regulon -from .recovery import leading_edge4row +from ctxcore.genesig import Regulon +from ctxcore.recovery import leading_edge4row import math from itertools import chain from functools import partial diff --git a/src/pyscenic/utils.py b/src/pyscenic/utils.py index e0cc6fd..d646c6c 100644 --- a/src/pyscenic/utils.py +++ b/src/pyscenic/utils.py @@ -2,7 +2,7 @@ import pandas as pd from urllib.parse import urljoin -from .genesig import Regulon, GeneSignature, openfile +from ctxcore.genesig import Regulon, GeneSignature, openfile from .math import masked_rho4pairs from itertools import chain import numpy as np From 906dbee7f6a91022e34e8a715ace87d4a81180ae Mon Sep 17 00:00:00 2001 From: Chris Flerin Date: Thu, 6 May 2021 14:31:08 +0200 Subject: [PATCH 3/5] Fix import in aucell test --- tests/test_aucell.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_aucell.py b/tests/test_aucell.py index aa2a230..e565010 100644 --- a/tests/test_aucell.py +++ b/tests/test_aucell.py @@ -4,7 +4,7 @@ import pandas as pd -from pyscenic.genesig import GeneSignature +from ctxcore.genesig import GeneSignature from pyscenic.aucell import derive_auc_threshold, aucell, create_rankings from pkg_resources import resource_filename From 7c54b77bda76d74cd432652e15b74e762ce27186 Mon Sep 17 00:00:00 2001 From: Chris Flerin Date: Fri, 7 May 2021 20:46:11 +0200 Subject: [PATCH 4/5] Update docs --- README.rst | 29 ++++++++++++++++------ docs/faq.rst | 17 +++++-------- docs/installation.rst | 57 +++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 83 insertions(+), 20 deletions(-) diff --git a/README.rst b/README.rst index 96239e3..8ebc367 100644 --- a/README.rst +++ b/README.rst @@ -3,21 +3,35 @@ pySCENIC |buildstatus|_ |pypipackage|_ |docstatus|_ + pySCENIC is a lightning-fast python implementation of the SCENIC_ pipeline (Single-Cell rEgulatory Network Inference and Clustering) which enables biologists to infer transcription factors, gene regulatory networks and cell types from single-cell RNA-seq data. The pioneering work was done in R and results were published in Nature Methods [1]_. -A new and comprehensive description of this Python implementation of the SCENIC pipeline is available in Nature Protocols [5]_ (`see here `_). +A new and comprehensive description of this Python implementation of the SCENIC pipeline is available in Nature Protocols [4]_. pySCENIC can be run on a single desktop machine but easily scales to multi-core clusters to analyze thousands of cells in no time. The latter is achieved via the dask_ framework for distributed computing [2]_. -**Full documentation** is available on `Read the Docs `_ +**Full documentation** for pySCENIC is available on `Read the Docs `_ + +---- + +pySCENIC is part of the SCENIC Suite of tools! +See the main `SCENIC website `_ for additional information and a full list of tools available. + +---- + News and releases ----------------- +0.11.2 | 2021-05-07 +^^^^^^^^^^^^^^^^^^^ + +* Split some core cisTarget functions out into a separate repository, `ctxcore `_. This is now a required package for pySCENIC. + 0.11.1 | 2021-02-11 ^^^^^^^^^^^^^^^^^^^ @@ -96,7 +110,9 @@ All the functionality of the original R implementation is available and in addit Additional resources -------------------- -For more information, please visit LCB_, or SCENIC_ (R version). +For more information, please visit LCB_, +the main `SCENIC website `_, +or `SCENIC (R version) `_. The CLI to pySCENIC has also been streamlined into a pipeline that can be run with a single command, using the Nextflow workflow manager. There are two Nextflow implementations available: @@ -113,11 +129,10 @@ We are grateful to all providers of TF-annotated position weight matrices, in pa References ---------- -.. [1] Aibar, S. et al. SCENIC: single-cell regulatory network inference and clustering. Nat Meth 14, 1083–1086 (2017). +.. [1] Aibar, S. et al. SCENIC: single-cell regulatory network inference and clustering. Nat Meth 14, 1083–1086 (2017). `doi:10.1038/nmeth.4463 `_ .. [2] Rocklin, M. Dask: parallel computation with blocked algorithms and task scheduling. conference.scipy.org -.. [3] Huynh-Thu, V. A. et al. Inferring regulatory networks from expression data using tree-based methods. PLoS ONE 5, (2010). -.. [4] Zeisel, A. et al. Cell types in the mouse cortex and hippocampus revealed by single-cell RNA-seq. Science 347, 1138–1142 (2015). -.. [5] Van de Sande B., Flerin C., et al. A scalable SCENIC workflow for single-cell gene regulatory network analysis. Nat Protoc. June 2020:1-30. doi:10.1038/s41596-020-0336-2 +.. [3] Huynh-Thu, V. A. et al. Inferring regulatory networks from expression data using tree-based methods. PLoS ONE 5, (2010). `doi:10.1371/journal.pone.0012776 `_ +.. [4] Van de Sande B., Flerin C., et al. A scalable SCENIC workflow for single-cell gene regulatory network analysis. Nat Protoc. June 2020:1-30. `doi:10.1038/s41596-020-0336-2 `_ .. |buildstatus| image:: https://travis-ci.org/aertslab/pySCENIC.svg?branch=master .. _buildstatus: https://travis-ci.org/aertslab/pySCENIC diff --git a/docs/faq.rst b/docs/faq.rst index c0f06d8..89481b6 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -4,17 +4,8 @@ Frequently Asked Questions I am having problems with Dask ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The Arboreto package :code:`v0.1.5`, and some steps of the cisTarget step within pySCENIC, seem to depend on an older version of Dask/Distributed. -Using a more recent version of Dask/Distributed can result in some cryptic errors. -It is recommended to use the older version of Dask and Distributed for stability here: - -.. code-block:: bash - - pip install dask==1.0.0 distributed'>=1.21.6,<2.0.0' - - -But in many cases this still results in issues with the GRN step. -An alternative is to use the multiprocessing implementation of Arboreto recently included in pySCENIC (`arboreto_with_multiprocessing.py `_). +An alternative is to use the multiprocessing implementation of Arboreto included in pySCENIC (`arboreto_with_multiprocessing.py `_). +This scrips is also available on the path once pySCENIC is installed. This script uses the Arboreto and pySCENIC codebase to run GRNBoost2 (or GENIE3) without Dask. The eliminates the possibility of running the GRN step across multiple nodes, but brings provides additional stability. The run time is generally equivalent to the Dask implementation using the same number of workers. @@ -114,6 +105,10 @@ Yes you can. The code snippet below shows you how to create your own databases: dtype=np.int32) RankingDatabase(df, 'custom').save('custom.db') +Please also see +`create_cisTarget_databases `_ +for more detailed and flexible methods to create custom cisTarget databases. + Can I draw the distribution of AUC values for a regulon across cells? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/installation.rst b/docs/installation.rst index 62a20c8..8f59bad 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -44,7 +44,7 @@ You can also install the bleeding edge (i.e. less stable) version of the package Containers -~~~~~~~~~ +~~~~~~~~~~ **pySCENIC containers** are also available for download and immediate use. In this case, no compiling or installation is required, provided either Docker or Singularity software is installed on the user's system. Images are available from `Docker Hub`_. Usage of the containers is shown below (`Docker and Singularity Images`_). To pull the docker images, for example: @@ -194,7 +194,60 @@ Note that in this case, a bind needs to be specified. .. code-block:: bash - singularity exec -B /data:/data aertslab-pyscenic-0.10.0.sif ipython kernel -f {connection_file} + singularity exec -B /data:/data aertslab-pyscenic-latest.sif ipython kernel -f {connection_file} + +More generally, a local or remote kernel can be set up by using the following examples. +These would go in a kernel file in ``~/.local/share/jupyter/kernels/pyscenic-latest/kernel.json`` (for example). + +**Remote singularity kernel:** + +.. code-block:: bash + + { + "argv": [ + "/software/jupyter/bin/python", + "-m", + "remote_ikernel", + "--interface", + "ssh", + "--host", + "r23i27n14", + "--workdir", + "~/", + "--kernel_cmd", + "singularity", + "exec", + "-B", + "/path/to/mounts", + "/path/to/aertslab-pyscenic-latest.sif", + "ipython", + "kernel", + "-f", + "{connection_file}" + ], + "display_name": "pySCENIC singularity remote", + "language": "Python" + } + +**Local singularity kernel:** + +.. code-block:: bash + + { + "argv": [ + "singularity", + "exec", + "-B", + "/path/to/mounts", + "/path/to/aertslab-pyscenic-latest.sif", + "ipython", + "kernel", + "-f", + "{connection_file}" + ], + "display_name": "pySCENIC singularity local", + "language": "python" + } Nextflow From 917323d920e012695560dc05b6e785fdca1996a8 Mon Sep 17 00:00:00 2001 From: Chris Flerin Date: Fri, 7 May 2021 20:46:30 +0200 Subject: [PATCH 5/5] Update requirements - remove pyarrow --- requirements.txt | 3 +-- requirements_docker.txt | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 727bd4b..ae99289 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -cytoolz ctxcore +cytoolz multiprocessing_on_dill llvmlite numba>=0.51.2 @@ -10,7 +10,6 @@ pandas>=0.20.1 cloudpickle dask distributed -pyarrow>=0.11.1,<0.17.0 arboreto>=0.1.6 boltons setuptools diff --git a/requirements_docker.txt b/requirements_docker.txt index 87969c7..b5745d1 100644 --- a/requirements_docker.txt +++ b/requirements_docker.txt @@ -18,6 +18,7 @@ cffi==1.14.4 chardet==3.0.4 click==7.1.2 cloudpickle==1.6.0 +ctxcore==0.1.1 cycler==0.10.0 Cython==0.29.21 cytoolz==0.11.0