diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 22f82a2..0aaf343 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.0.20 +current_version = 0.0.21 commit = True message = [skip ci] {current_version} → {new_version} tag = False diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..0fce304 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,13 @@ +*.py linguist-detectable=true +*Makefile linguist-detectable=false +*.md linguist-detectable=false +*.h linguist-detectable=false +*.cpp linguist-detectable=false +*.rst linguist-detectable=false +*.yaml linguist-detectable=false +*.yml linguist-detectable=false +*.cfg linguist-detectable=false +*.txt linguist-detectable=false +*.pyx linguist-detectable=false +*.R linguist-detectable=false +*.sh linguist-detectable=false diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index adee86e..02a5902 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ default_stages: minimum_pre_commit_version: 2.9.3 repos: - repo: https://github.com/psf/black - rev: 21.11b1 + rev: 22.3.0 hooks: - id: black additional_dependencies: [toml] @@ -19,7 +19,7 @@ repos: additional_dependencies: [toml] args: [--order-by-type, --profile, 'black'] - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.0.1 + rev: v4.2.0 hooks: - id: detect-private-key - id: check-merge-conflict @@ -40,22 +40,22 @@ repos: - id: check-toml - id: requirements-txt-fixer - repo: https://github.com/jumanjihouse/pre-commit-hooks - rev: 2.1.5 + rev: 2.1.6 hooks: - id: script-must-have-extension name: Check executable files use .sh extension types: [shell, executable] - repo: https://github.com/myint/rstcheck - rev: 3f92957478422df87bd730abde66f089cc1ee19b + rev: v5.0.0 hooks: - id: rstcheck - repo: https://github.com/asottile/blacken-docs - rev: v1.12.0 + rev: v1.12.1 hooks: - id: blacken-docs additional_dependencies: [black==20.8b1] - repo: https://github.com/asottile/pyupgrade - rev: v2.29.1 + rev: v2.32.0 hooks: - id: pyupgrade args: [--py3-plus, --py37-plus] @@ -69,7 +69,7 @@ repos: - id: rst-directive-colons - id: rst-inline-touching-normal - repo: https://github.com/PyCQA/doc8 - rev: 0.10.1 + rev: 0.11.1 hooks: - id: doc8 args: [--max-line-length=88] diff --git a/.readthedocs.yml b/.readthedocs.yml index 57d76b9..2f8abb0 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -1,18 +1,21 @@ version: 2 +build: + os: ubuntu-20.04 + tools: + python: "3.9" + apt_packages: + - pandoc + sphinx: builder: html configuration: docs/source/conf.py fail_on_warning: false -formats: -- htmlzip -- pdf - -build: - image: latest - python: - version: 3.7 install: - requirements: docs/requirements_rtfd.txt + +formats: +- htmlzip +- pdf diff --git a/README.rst b/README.rst index 2d30037..f4a60d9 100644 --- a/README.rst +++ b/README.rst @@ -27,8 +27,7 @@ Please see our `preprint`_ on **bioRxiv** to learn more. Support ------- -Feel free to submit an `issue `_ -or send us an `email `_. +Feel free to submit an `issue `_. Your help to improve Trisicell is highly appreciated. Trisicell was developed in collaboration between the `Cancer Data Science Laboratory (CDSL) `_ and the `Laboratory of Cancer Biology and Genetics (LCBG) `_ at the `National Cancer Institute (NCI) `_. 
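Note on the version bump above: bump2version rewrites the release string in three places at once, and all three appear later in this patch with the new 0.0.21 value (.bumpversion.cfg, setup.py, trisicell/__init__.py). A minimal sketch of a release sanity check for that invariant, assuming the file layout shown in this patch; the helper script itself is hypothetical and not part of the repository:

# check_versions.py -- hypothetical release helper, not part of this patch.
# Verifies that the three version strings bump2version rewrites stay in sync.
import configparser
import re
from pathlib import Path

def read_versions(repo_root="."):
    root = Path(repo_root)
    # .bumpversion.cfg keeps the canonical version under [bumpversion].
    cfg = configparser.ConfigParser()
    cfg.read(root / ".bumpversion.cfg")
    bump = cfg["bumpversion"]["current_version"]
    # setup.py and trisicell/__init__.py both hard-code __version__ = "x.y.z".
    pattern = re.compile(r'__version__\s*=\s*"([^"]+)"')
    setup = pattern.search((root / "setup.py").read_text()).group(1)
    init = pattern.search((root / "trisicell" / "__init__.py").read_text()).group(1)
    return {".bumpversion.cfg": bump, "setup.py": setup, "trisicell/__init__.py": init}

if __name__ == "__main__":
    versions = read_versions()
    assert len(set(versions.values())) == 1, f"version mismatch: {versions}"
    print(f"all version strings agree: {versions['.bumpversion.cfg']}")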
diff --git a/docs/source/_static/thumbnails/trisicell-boost.png b/docs/source/_static/thumbnails/trisicell-boost.png new file mode 100644 index 0000000..918452c Binary files /dev/null and b/docs/source/_static/thumbnails/trisicell-boost.png differ diff --git a/docs/source/citing.rst b/docs/source/citing.rst index c289cfa..c3e20d8 100644 --- a/docs/source/citing.rst +++ b/docs/source/citing.rst @@ -23,6 +23,6 @@ BibTeX month = mar, publisher = {Cold Spring Harbor Laboratory}, author = {Farid Rashidi Mehrabadi and Kerrie L. Marie and Eva Perez-Guijarro and Salem Malikic and Erfan Sadeqi Azer and Howard H. Yang and Can Kizilkale and Charli Gruen and Wells Robinson and Huaitian Liu and Michael C. Kelly and Christina Marcelus and Sandra Burkett and Aydin Buluc and Funda Ergun and Maxwell P. Lee and Glenn Merlino and Chi-Ping Day and S. Cenk Sahinalp}, - title = {{Profiles of expressed mutations in single cells reveal subclonal expansion patterns and therapeutic impact of intratumor heterogeneity}} + title = {{Profiles of expressed mutations in single cells reveal subclonal expansion patterns and therapeutic impact of intratumor heterogeneity}}, journal = {bioRxiv} } diff --git a/docs/source/conf.py b/docs/source/conf.py index 2f0d8d6..3aa5bfe 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -11,7 +11,6 @@ # import os import sys -from datetime import datetime from pathlib import Path from pybtex.plugin import register_plugin @@ -46,7 +45,7 @@ title = ( "Scalable intratumor heterogeneity inference and validation from single-cell data" ) -copyright = f"{datetime.now():%Y}, {author}" +copyright = f"2021-2022, {author}" release = "master" version = f"master ({trisicell.__version__})" diff --git a/docs/source/index.rst b/docs/source/index.rst index c4a3ada..8222136 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -9,7 +9,6 @@ installation release_notes citing - references .. toctree:: :caption: Docs diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index ce3d847..5254032 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -7,6 +7,15 @@ Release Notes ============= +Version 0.0.21 :small:`April 22, 2022` ----------------------------------------- + +This version includes: + + - Fix some bugs. + - Improve the documentation. + + Version 0.0.20 :small:`November 22, 2021` ----------------------------------------- diff --git a/examples/reconstruction/compute_booster.py b/examples/reconstruction/compute_booster.py index 23b6107..d410d97 100644 --- a/examples/reconstruction/compute_booster.py +++ b/examples/reconstruction/compute_booster.py @@ -8,6 +8,8 @@ import trisicell as tsc +# sphinx_gallery_thumbnail_path = "_static/thumbnails/trisicell-boost.png" + # %% # First, we load a binary test single-cell genotype dataset.
df_in = tsc.datasets.test() diff --git a/setup.cfg b/setup.cfg index c091121..e7d05b9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,6 +19,7 @@ ignore = D107, # D107 Missing docstring in __init__ W503, # W503 line break before binary operator D202, # D202 No blank lines allowed after function docstring + B020, # B020 Found for loop that reassigns the iterable it is iterating with each B902, # B902 blind except Exception: statement C400, # C400 Unnecessary generator - rewrite as a list comprehension [tool:pytest] diff --git a/setup.py b/setup.py index a8abe24..b9224c9 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ __author__ = ", ".join(["Farid Rashidi"]) __maintainer__ = ", ".join(["Farid Rashidi"]) __email__ = ", ".join(["farid.rsh@gmail.com"]) - __version__ = "0.0.20" + __version__ = "0.0.21" if platform == "linux" or platform == "linux2": os.environ["CC"] = "g++" diff --git a/tests/test_commands.py b/tests/test_commands.py index 567e27e..2617d75 100644 --- a/tests/test_commands.py +++ b/tests/test_commands.py @@ -57,10 +57,9 @@ def test_scite_experiment(self): tsc.ul.get_file("trisicell.datasets/test/test.tsv"), "0.0000001", "0.1", - "-r 3", - "-l 1000", "-e", - "-h 0.005", + "-t 1", + "-s 1", ], ) assert result.exit_code == 0 diff --git a/tests/test_logging.py b/tests/test_logging.py index 5ee38dd..f436763 100644 --- a/tests/test_logging.py +++ b/tests/test_logging.py @@ -1,3 +1,5 @@ +import pytest + import trisicell as tsc @@ -9,4 +11,6 @@ def test_logging(self): tsc.logg.info("INFO") tsc.logg.warn("WARN") tsc.logg.info("TIME", time=True, color="red") + with pytest.raises(RuntimeError): + tsc.logg.error("ERROR") assert True diff --git a/tests/test_pp.py b/tests/test_pp.py index 9c2a143..4704d32 100644 --- a/tests/test_pp.py +++ b/tests/test_pp.py @@ -41,3 +41,10 @@ def test_readcount(self): tsc.pp.remove_cell_by_list(adata, ["C15_1"]) tsc.pp.keep_cell_by_list(adata, ["C15_2", "C15_3"]) assert adata.shape == (2, 267) + + def test_local_cluster_cells_then_merge_muts_pseudo_bulk(self): + geno = tsc.datasets.example() + geno_merged, geno = tsc.pp.local_cluster_cells_then_merge_muts_pseudo_bulk( + geno, by="mut", n_clusters=11, min_n_cells=2, attr="group" + ) + assert geno_merged.shape == (11, 452) diff --git a/tests/test_tl_scores.py b/tests/test_tl_scores.py index aac5064..12bccf6 100644 --- a/tests/test_tl_scores.py +++ b/tests/test_tl_scores.py @@ -11,6 +11,10 @@ def setup_method(self): self.grnd = tsc.io.read(f1) self.sol = tsc.io.read(f2) + def test_gs(self): + gs = tsc.tl.gs(self.grnd, self.sol) + assert np.abs(gs - 0.9895) < 0.0001 + def test_ad(self): ad = tsc.tl.ad(self.grnd, self.sol) assert np.abs(ad - 0.9778) < 0.0001 diff --git a/tests/test_tl_solvers.py b/tests/test_tl_solvers.py index de806ca..09a7ce0 100644 --- a/tests/test_tl_solvers.py +++ b/tests/test_tl_solvers.py @@ -32,13 +32,13 @@ def test_phiscsb(self): df_out = tsc.tl.phiscsb(self.df_in, alpha=0.0000001, beta=0.1) assert tsc.ul.is_conflict_free_gusfield(df_out) - def test_huntress_both(self): - df_out = tsc.tl.huntress(self.df_in, alpha=0.0000001, beta=0.1, kind="both") - assert tsc.ul.is_conflict_free_gusfield(df_out) + # def test_huntress_both(self): + # df_out = tsc.tl.huntress(self.df_in, alpha=0.0000001, beta=0.1) + # assert tsc.ul.is_conflict_free_gusfield(df_out) - def test_huntress_fn(self): - df_out = tsc.tl.huntress(self.df_in, alpha=0, beta=0, kind="fn") - assert tsc.ul.is_conflict_free_gusfield(df_out) + # def test_huntress_fn(self): + # df_out = tsc.tl.huntress(self.df_in, alpha=0, beta=0.1) + 
# assert tsc.ul.is_conflict_free_gusfield(df_out) @skip_rpy2 def test_onconem(self): diff --git a/trisicell/__init__.py b/trisicell/__init__.py index 73ffb31..4e47aa3 100644 --- a/trisicell/__init__.py +++ b/trisicell/__init__.py @@ -7,5 +7,5 @@ __author__ = ", ".join(["Farid Rashidi"]) __maintainer__ = ", ".join(["Farid Rashidi"]) __email__ = ", ".join(["farid.rsh@gmail.com"]) -__version__ = "0.0.20" +__version__ = "0.0.21" __all__ = (datasets, io, logg, pl, pp, settings, tl, ul) diff --git a/trisicell/commands/_bnb.py b/trisicell/commands/_bnb.py index dc91b0b..d0b8a72 100644 --- a/trisicell/commands/_bnb.py +++ b/trisicell/commands/_bnb.py @@ -35,7 +35,7 @@ def bnb(genotype_file, bounding, time_limit): A fast branch and bound algorithm for the perfect tumor phylogeny reconstruction problem :cite:`PhISCS-BnB`. - trisicell bnb input.SC 0.0001 0.1 -b real + trisicell bnb input.SC -b real """ outfile = os.path.splitext(genotype_file)[0] diff --git a/trisicell/commands/_gpps.py b/trisicell/commands/_gpps.py new file mode 100644 index 0000000..0689a03 --- /dev/null +++ b/trisicell/commands/_gpps.py @@ -0,0 +1,115 @@ +import os + +import click + +import trisicell as tsc + + +@click.command(short_help="Run gpps.") +@click.argument( + "genotype_file", + required=True, + type=click.Path( + exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True + ), +) +@click.argument( + "alpha", + required=True, + type=float, +) +@click.argument( + "beta", + required=True, + type=float, +) +@click.option( + "--k_dollo", + "-k", + default=0, + type=int, + show_default=True, + help="k-Dollo.", +) +@click.option( + "--max_del", + "-d", + default=-1, + type=int, + show_default=True, + help="Maximum number of deletions allowed.", +) +@click.option( + "--neighbor_size", + "-s", + default=30, + type=int, + show_default=True, + help="Hill climbing neighborhood size.", +) +@click.option( + "--n_iters", + "-l", + default=100, + type=int, + show_default=True, + help="Hill climbing maximum iterations.", +) +@click.option( + "--time_limit", + "-t", + default=86400, + type=int, + show_default=True, + help="Time limit of the program (in seconds).", +) +@click.option( + "--n_threads", + "-p", + default=1, + type=int, + show_default=True, + help="Number of threads.", +) +def gpps( + genotype_file, + alpha, + beta, + k_dollo, + max_del, + neighbor_size, + n_iters, + time_limit, + n_threads, +): + """gpps. + + An ILP-based approach for inferring cancer progression with mutation losses from + single-cell data :cite:`gpps`.
+ + trisicell gpps input.SC 0.0001 0.1 -k 0 -s 30 -l 100 -t 86400 -p 1 + """ + + outfile = os.path.splitext(genotype_file)[0] + + tsc.settings.verbosity = "info" + tsc.settings.logfile = f"{outfile}.gpps.log" + + df_in = tsc.io.read(genotype_file) + + if alpha == 0: + alpha = 0.000000000001 + df_out = tsc.tl.gpps( + df_in, + alpha=alpha, + beta=beta, + k_dollo=k_dollo, + max_del=max_del, + neighbor_size=neighbor_size, + n_iters=n_iters, + time_limit=time_limit, + n_threads=n_threads, + ) + tsc.io.write(df_out, f"{outfile}.gpps.CFMatrix") + + return None diff --git a/trisicell/commands/_grmt.py b/trisicell/commands/_grmt.py index 2bff962..7768772 100644 --- a/trisicell/commands/_grmt.py +++ b/trisicell/commands/_grmt.py @@ -23,6 +23,14 @@ required=True, type=float, ) +@click.option( + "--n_iters", + "-l", + default=30, + type=int, + show_default=True, + help="Number of iterations.", +) @click.option( "--n_threads", "-p", @@ -31,13 +39,13 @@ show_default=True, help="Number of threads.", ) -def grmt(genotype_file, alpha, beta, n_threads): +def grmt(genotype_file, alpha, beta, n_iters, n_threads): """GRMT. Generative Reconstruction of Mutation Tree From Scratch Using Single-Cell Sequencing Data :cite:`GRMT`. - trisicell grmt input.SC 0.0001 0.1 + trisicell grmt input.SC 0.0001 0.1 -l 100 -p 2 """ outfile = os.path.splitext(genotype_file)[0] @@ -46,7 +54,9 @@ def grmt(genotype_file, alpha, beta, n_threads): tsc.settings.logfile = f"{outfile}.grmt.log" df_in = tsc.io.read(genotype_file) - df_out = tsc.tl.grmt(df_in, alpha=alpha, beta=beta, n_threads=n_threads) + df_out = tsc.tl.grmt( + df_in, alpha=alpha, beta=beta, n_iters=n_iters, n_threads=n_threads + ) tsc.io.write(df_out, f"{outfile}.grmt.CFMatrix") return None diff --git a/trisicell/commands/_huntress.py b/trisicell/commands/_huntress.py index 831ea0b..8e934cb 100644 --- a/trisicell/commands/_huntress.py +++ b/trisicell/commands/_huntress.py @@ -23,14 +23,6 @@ required=True, type=float, ) -@click.option( - "--method", - "-m", - default="both", - type=click.Choice(["both", "fn"]), - show_default=True, - help="Method of the HUNTRESS", -) @click.option( "--n_threads", "-p", @@ -39,10 +31,10 @@ show_default=True, help="Number of threads.", ) -def huntress(genotype_file, alpha, beta, method, n_threads): +def huntress(genotype_file, alpha, beta, n_threads): """HUNTRESS. 
- trisicell huntress input.SC 0.0001 0.1 -m both -p 8 + trisicell huntress input.SC 0.0001 0.1 -p 8 """ outfile = os.path.splitext(genotype_file)[0] @@ -50,10 +42,7 @@ def huntress(genotype_file, alpha, beta, method, n_threads): tsc.settings.verbosity = "info" tsc.settings.logfile = f"{outfile}.huntress.log" - df_in = tsc.io.read(genotype_file) - df_out = tsc.tl.huntress( - df_in, alpha=alpha, beta=beta, kind=method, n_threads=n_threads - ) + df_out = tsc.tl.huntress(genotype_file, alpha=alpha, beta=beta, n_threads=n_threads) tsc.io.write(df_out, f"{outfile}.huntress.CFMatrix") return None diff --git a/trisicell/commands/_onconem.py b/trisicell/commands/_onconem.py new file mode 100644 index 0000000..561909a --- /dev/null +++ b/trisicell/commands/_onconem.py @@ -0,0 +1,43 @@ +import os + +import click + +import trisicell as tsc + + +@click.command(short_help="Run OncoNEM.") +@click.argument( + "genotype_file", + required=True, + type=click.Path( + exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True + ), +) +@click.argument( + "alpha", + required=True, + type=float, +) +@click.argument( + "beta", + required=True, + type=float, +) +def onconem(genotype_file, alpha, beta): + """OncoNEM. + + Inferring tumor evolution from single-cell sequencing data :cite:`OncoNEM`. + + trisicell onconem input.SC 0.0001 0.1 + """ + + outfile = os.path.splitext(genotype_file)[0] + + tsc.settings.verbosity = "info" + tsc.settings.logfile = f"{outfile}.onconem.log" + + df_in = tsc.io.read(genotype_file) + df_out = tsc.tl.onconem(df_in, alpha=alpha, beta=beta) + tsc.io.write(df_out, f"{outfile}.onconem.CFMatrix") + + return None diff --git a/trisicell/commands/_scite.py b/trisicell/commands/_scite.py index 3249f00..f110ada 100644 --- a/trisicell/commands/_scite.py +++ b/trisicell/commands/_scite.py @@ -51,17 +51,29 @@ help="Is in experiment mode.", ) @click.option( - "--n_hours", - "-h", - default=24, + "--time_limit", + "-t", + default=86400, type=float, show_default=True, - help="Number of hours for the experiment part.", + help="Time limit for the experiment part (in seconds).", ) -def scite(genotype_file, alpha, beta, n_iters, n_restarts, experiment, n_hours): - """Tree inference for single-cell data :cite:`SCITE`. +@click.option( + "--smooth_rate", + "-s", + default=2, + type=float, + show_default=True, + help="Smooth rate for the experiment part.", +) +def scite( + genotype_file, alpha, beta, n_iters, n_restarts, experiment, time_limit, smooth_rate +): + """SCITE. + + Tree inference for single-cell data :cite:`SCITE`. 
- trisicell scite input.SC 0.0001 0.1 -l 1000000 -r 3 -e -h 24 + trisicell scite input.SC 0.0001 0.1 -l 1000000 -r 3 -e -t 86400 -s 2 """ outfile = os.path.splitext(genotype_file)[0] @@ -89,7 +101,7 @@ def scite(genotype_file, alpha, beta, n_iters, n_restarts, experiment, n_hours): n_restarts=1, experiment=True, ) - n_iters = int(2 * 30000 * n_hours * 60 * 60 / running_time) + n_iters = int(smooth_rate * 30000 * time_limit / running_time) def run(i): do, r, s, b = tsc.tl.scite( @@ -102,7 +114,7 @@ def run(i): ) return do, r, s, b - output = Parallel(n_jobs=3)(delayed(run)(i) for i in range(3)) + output = Parallel(n_jobs=n_restarts)(delayed(run)(i) for i in range(n_restarts)) scores = [x[2] for x in output] betas = [x[3] for x in output] diff --git a/trisicell/commands/_score.py b/trisicell/commands/_score.py index 441cb83..d3ab2a5 100644 --- a/trisicell/commands/_score.py +++ b/trisicell/commands/_score.py @@ -29,6 +29,7 @@ def score(ground_file, inferred_file): df_g = tsc.io.read(ground_file) df_s = tsc.io.read(inferred_file) + gs = tsc.tl.gs(df_g, df_s) ad = tsc.tl.ad(df_g, df_s) dl = tsc.tl.dl(df_g, df_s) mltd = tsc.tl.mltd(df_g, df_s)["normalized_similarity"] @@ -39,8 +40,10 @@ def score(ground_file, inferred_file): rf = tsc.tl.rf(df_g, df_s) tsc.logg.info( - f"AD={ad:0.4f}\nDL={dl:0.4f}\nMLTSM={mltd:0.4f}\nTPTED={tpted:0.4f}\n" - f"CASet={caset:0.4f}\nDISC={disc:0.4f}\nMP3={mp3:0.4f}\nRF={rf:0.4f}" + f"GS={gs:0.4f}\nAD={ad:0.4f}\nDL={dl:0.4f}\n" + f"MLTSM={mltd:0.4f}\nTPTED={tpted:0.4f}\n" + f"CASet={caset:0.4f}\nDISC={disc:0.4f}\nMP3={mp3:0.4f}\n" + f"RF={rf:0.4f}" ) return None diff --git a/trisicell/commands/_siclonefit.py b/trisicell/commands/_siclonefit.py new file mode 100644 index 0000000..38b51af --- /dev/null +++ b/trisicell/commands/_siclonefit.py @@ -0,0 +1,141 @@ +import os + +import click +import numpy as np +from joblib import Parallel, delayed + +import trisicell as tsc + + +@click.command(short_help="Run SiCloneFit.") +@click.argument( + "genotype_file", + required=True, + type=click.Path( + exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True + ), +) +@click.argument( + "alpha", + required=True, + type=float, +) +@click.argument( + "beta", + required=True, + type=float, +) +@click.option( + "--n_iters", + "-l", + default=600, + type=int, + show_default=True, + help="Number of iterations.", +) +@click.option( + "--n_restarts", + "-r", + default=3, + type=int, + show_default=True, + help="Number of restarts.", +) +@click.option( + "--experiment", + "-e", + is_flag=True, + default=False, + type=bool, + show_default=True, + help="Is in experiment mode.", +) +@click.option( + "--time_limit", + "-t", + default=86400, + type=float, + show_default=True, + help="Time limit for the experiment part (in seconds).", +) +@click.option( + "--smooth_rate", + "-s", + default=2, + type=float, + show_default=True, + help="Smooth rate for the experiment part.", +) +def siclonefit( + genotype_file, alpha, beta, n_iters, n_restarts, experiment, time_limit, smooth_rate +): + """SiCloneFit. + + Bayesian inference of population structure, genotype, and phylogeny of tumor clones + from single-cell genome sequencing data :cite:`SiCloneFit`. 
+ + trisicell siclonefit input.SC 0.0001 0.1 -l 1000000 -r 3 -e -t 86400 -s 2 + """ + + outfile = os.path.splitext(genotype_file)[0] + + tsc.settings.verbosity = "info" + + df_in = tsc.io.read(genotype_file) + if not experiment: + tsc.settings.logfile = f"{outfile}.siclonefit.log" + df_out = tsc.tl.siclonefit( + df_in, + alpha=alpha, + beta=beta, + n_iters=n_iters, + n_restarts=n_restarts, + ) + tsc.io.write(df_out, f"{outfile}.siclonefit.CFMatrix") + else: + tsc.settings.logfile = f"{outfile}.siclonefit.log" + df_out, running_time, _, _ = tsc.tl.siclonefit( + df_in, + alpha=alpha, + beta=beta, + n_iters=500, + n_restarts=1, + experiment=True, + ) + n_iters = int(smooth_rate * 500 * time_limit / running_time) + + def run(i): + do, r, cf, nll = tsc.tl.siclonefit( + df_in, + alpha=alpha, + beta=beta, + n_iters=n_iters, + n_restarts=1, + experiment=True, + ) + return do, r, cf, nll + + output = Parallel(n_jobs=n_restarts)(delayed(run)(i) for i in range(n_restarts)) + + scores = [x[3] for x in output] + iscfs = [x[2] for x in output] + best_i = np.Inf + best = np.Inf + for i, items in enumerate(zip(scores, iscfs)): + score, iscf = items + if iscf and score < best: + best_i = i + best = score + df_out = output[best_i][0] + + tsc.ul.stat(df_in, df_out, alpha, beta, output[best_i][1]) + tsc.logg.info(f"score: {output[best_i][3]}") + tsc.logg.info(f"iscf: {output[best_i][2]}") + tsc.logg.info(f"n_iters: {n_iters}") + tsc.logg.info(f"scores: {','.join(list(map(str, scores)))}") + tsc.logg.info(f"iscfs: {','.join(list(map(str, iscfs)))}") + tsc.logg.info(f"picked: {best_i}") + + tsc.io.write(df_out, f"{outfile}.siclonefit.CFMatrix") + + return None diff --git a/trisicell/commands/_sphyr.py b/trisicell/commands/_sphyr.py index dc0b7a8..fa9d36a 100644 --- a/trisicell/commands/_sphyr.py +++ b/trisicell/commands/_sphyr.py @@ -54,6 +54,8 @@ def sphyr(genotype_file, alpha, beta, n_restarts, n_threads): tsc.settings.logfile = f"{outfile}.sphyr.log" df_in = tsc.io.read(genotype_file) + if alpha == 0: + alpha = 0.000000000001 df_out = tsc.tl.sphyr( df_in, alpha=alpha, beta=beta, n_restarts=n_restarts, n_threads=n_threads ) diff --git a/trisicell/commands/trisicell.py b/trisicell/commands/trisicell.py index addee42..24046e2 100644 --- a/trisicell/commands/trisicell.py +++ b/trisicell/commands/trisicell.py @@ -6,15 +6,18 @@ from trisicell.commands._bnb import bnb from trisicell.commands._booster import booster from trisicell.commands._consensus import consensus +from trisicell.commands._gpps import gpps from trisicell.commands._grmt import grmt from trisicell.commands._huntress import huntress from trisicell.commands._mcalling import mcalling +from trisicell.commands._onconem import onconem from trisicell.commands._partf import partf from trisicell.commands._phiscs import phiscsb, phiscsi from trisicell.commands._scistree import scistree from trisicell.commands._scite import scite from trisicell.commands._score import score from trisicell.commands._search import search +from trisicell.commands._siclonefit import siclonefit from trisicell.commands._sphyr import sphyr from trisicell.commands._trees import cf2newick, cf2tree @@ -67,6 +70,9 @@ def cli(): cli.add_command(huntress) cli.add_command(grmt) cli.add_command(sphyr) +cli.add_command(gpps) +cli.add_command(onconem) +cli.add_command(siclonefit) cli.add_command(cf2newick) cli.add_command(cf2tree) cli.add_command(score) diff --git a/trisicell/external/_betabinom.py b/trisicell/external/_betabinom.py index 38c5867..8229461 100644 --- 
a/trisicell/external/_betabinom.py +++ b/trisicell/external/_betabinom.py @@ -237,10 +237,10 @@ def Generalabminuscd(a, b, c, d): bmr, be = math.frexp(b) cmr, ce = math.frexp(c) dmr, de = math.frexp(d) - am = int(amr * (2 ** 53)) - bm = int(bmr * (2 ** 53)) - cm = int(cmr * (2 ** 53)) - dm = int(dmr * (2 ** 53)) + am = int(amr * (2**53)) + bm = int(bmr * (2**53)) + cm = int(cmr * (2**53)) + dm = int(dmr * (2**53)) abe = ae + be cde = ce + de abm = am * bm @@ -251,14 +251,14 @@ def Generalabminuscd(a, b, c, d): scale = 0 cdm = 1 # return math.ldexp(abm * (2**scale) - cdm, abe - scale - 106) - return float(abm * (2 ** scale) - cdm) * (2.0 ** (abe - scale - 106)) + return float(abm * (2**scale) - cdm) * (2.0 ** (abe - scale - 106)) else: scale = -scale if scale > 106: scale = 0 abm = 1 # return math.ldexp(abm - cdm * (2**scale), cde - scale - 106) - return float(abm - cdm * (2 ** scale)) * (2.0 ** (cde - scale - 106)) + return float(abm - cdm * (2**scale)) * (2.0 ** (cde - scale - 106)) def logfbit2dif(x): @@ -519,8 +519,8 @@ def logfbit(x): x = x - 1.0 i = len(coeffs2) + 2 lgam = ((x + 2.5) * logcf(-x / 2.0, i, 1.0) - (2.0 / (i - 1.0))) * ( - 2.0 ** -i - ) + (3.0 ** -i) * logcf(-x / 3.0, i, 1.0) + 2.0**-i + ) + (3.0**-i) * logcf(-x / 3.0, i, 1.0) # print(i,coeffs2[i-3],lgam) for i in range(i - 3, -1, -1): lgam = coeffs2[i] - x * lgam @@ -576,9 +576,9 @@ def lfbaccdif1(a, b): return -lfbaccdif1(-a, b + a) elif b >= 8.0: y1 = b + 1.0 - y2 = y1 ** -2 + y2 = y1**-2 x1 = a + b + 1.0 - x2 = x1 ** -2 + x2 = x1**-2 x3 = x2 * lfbc9 y3 = y2 * lfbc9 acc = x2 * (a * (x1 + y1) * y3) @@ -615,14 +615,14 @@ def lfbaccdif1(a, b): s1 = (0.5 * (Start + 1.0)) ** 2 s2 = (0.5 * (Start + 1.5)) ** 2 - ty = y1 * math.sqrt(1.0 + s1 * (y1 ** -2)) - tx = x1 * math.sqrt(1.0 + s1 * (x1 ** -2)) + ty = y1 * math.sqrt(1.0 + s1 * (y1**-2)) + tx = x1 * math.sqrt(1.0 + s1 * (x1**-2)) y2 = ty - y1 x2 = tx - x1 acc = a * (1.0 - (2.0 * y1 + a) / (tx + ty)) # Seems to work better without the next 2 lines. 
- Not with modification to s2 - ty = y1 * math.sqrt(1.0 + s2 * (y1 ** -2)) - tx = x1 * math.sqrt(1.0 + s2 * (x1 ** -2)) + ty = y1 * math.sqrt(1.0 + s2 * (y1**-2)) + tx = x1 * math.sqrt(1.0 + s2 * (x1**-2)) acc = 0.25 * ( acc + s1 / ((y1 + ty) * (x1 + tx)) * a * (1.0 + (2.0 * y1 + a) / (tx + ty)) ) @@ -661,8 +661,8 @@ def lfbaccdif1(a, b): y1 = b - 1.0 x1 = y1 + a i = len(coeffs2) + 2 - scale2 = 2.0 ** -i - scale3 = 3.0 ** -i + scale2 = 2.0**-i + scale3 = 3.0**-i y2 = ( (y1 + 2.5) * logcf(-y1 / 2.0, i, 1.0) - (2.0 / (i - 1.0)) ) * scale2 + scale3 * logcf(-y1 / 3.0, i, 1.0) @@ -1103,7 +1103,7 @@ def ccBNB5(ilim, rr, a, bb): if r <= 0.001: temp = a + (b + r) * 0.5 ccbnb5 = ccbnb5 - b * r * ( - logfbit2(temp) + (b ** 2 + r ** 2) * logfbit4(temp) / 24.0 + logfbit2(temp) + (b**2 + r**2) * logfbit4(temp) / 24.0 ) else: ccbnb5 = ccbnb5 + (lfbaccdif1(b, r + a) - lfbaccdif1(b, a)) diff --git a/trisicell/external/_mp3.py b/trisicell/external/_mp3.py index d8f0f5b..dad5c8a 100644 --- a/trisicell/external/_mp3.py +++ b/trisicell/external/_mp3.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - from collections import Counter, defaultdict from functools import partial from itertools import combinations diff --git a/trisicell/external/gpps/__init__.py b/trisicell/external/gpps/__init__.py new file mode 100644 index 0000000..147958a --- /dev/null +++ b/trisicell/external/gpps/__init__.py @@ -0,0 +1,2 @@ +from ._gpps_hc import gpps_hc +from ._gpps_ilp import gpps_ilp diff --git a/trisicell/external/gpps/_gpps_hc.py b/trisicell/external/gpps/_gpps_hc.py new file mode 100644 index 0000000..cd50333 --- /dev/null +++ b/trisicell/external/gpps/_gpps_hc.py @@ -0,0 +1,158 @@ +import random +from functools import lru_cache + +import numpy as np + +import trisicell as tsc +from trisicell.external.gpps._utils_hc import ( + copy_tree, + import_ilp_out, + prune_and_reattach, +) + +__author__ = "Simone Ciccolella" +__date__ = "11/30/21" + + +@lru_cache(maxsize=None) +def cell_row_likelihood(input_row, node_genotype, alpha, beta): + likelihood = 0 + for j in range(len(input_row)): + if input_row[j] == "0": + if node_genotype[j] == "0": + likelihood += np.log(1 - beta) + elif node_genotype[j] == "1": + likelihood += np.log(alpha) + else: + likelihood += -np.inf + elif input_row[j] == "1": + if node_genotype[j] == "0": + likelihood += np.log(beta) + elif node_genotype[j] == "1": + likelihood += np.log(1 - alpha) + else: + likelihood += -np.inf + + elif input_row[j] == "2": + likelihood += 0 + return likelihood + + +def greedy_tree_likelihood(tree, nid_dict, input_scs, alpha, beta): + tree + likelihood = 0 + attachment = [] + for row in input_scs: + str_row = "".join(str(int(x)) for x in row) + best_lh = -np.inf + best_attachment = -1 + for node in nid_dict: + str_gt = "".join(str(int(x)) for x in nid_dict[node].genotype_profile) + + lh = cell_row_likelihood(str_row, str_gt, alpha, beta) + if lh > best_lh: + best_lh = lh + best_attachment = node + likelihood += best_lh + attachment.append(best_attachment) + + return likelihood, attachment + + +def get_expect_matrix(tree, nid_dict, input_scs, alpha, beta): + _, attachment = greedy_tree_likelihood(tree, nid_dict, input_scs, alpha, beta) + e_matrix = [] + + for node in nid_dict: + t_node = nid_dict[node] + if ( + t_node.loss + and t_node.id_node not in attachment + and len(t_node.children) == 0 + ): + tsc.logg.debug(t_node.name) + + for c in range(len(attachment)): + e_matrix.append(nid_dict[attachment[c]].genotype_profile) + return e_matrix + + +def 
generate_neighborhood(start_tree, start_nid_dict, neighborhood_size): + start_nid_dict + neighbors = [] + while len(neighbors) < neighborhood_size: + # prune-reattach only + cp_tree, cp_dict = copy_tree(start_tree) + node_ids = list(cp_dict.keys()) + prune = random.choice(node_ids) + reattach = random.choice(node_ids) + + if prune != reattach: + if prune_and_reattach(cp_dict[prune], cp_dict[reattach], cp_dict): + neighbors.append((cp_tree, cp_dict)) + else: + cp_tree = None + cp_dict = None + return neighbors + + +def hill_climbing( + start_tree, + start_nid_dict, + neighborhood_size, + max_iterations, + alpha, + beta, + input_scs, +): + current_tree = start_tree + current_dict = start_nid_dict + current_lh, _ = greedy_tree_likelihood( + start_tree, start_nid_dict, input_scs, alpha, beta + ) + tsc.logg.debug("Initial log-likelihood: %f" % current_lh) + + current_iteration = 1 + while current_iteration < max_iterations: + + if current_iteration % 10 == 0: + tsc.logg.debug("Current iteration: %d" % current_iteration) + + neighbors = generate_neighborhood(current_tree, current_dict, neighborhood_size) + next_eval = -np.inf + next_sol = None + + for ng in neighbors: + # tsc.logg.debug(ng) + ng_lh, _ = greedy_tree_likelihood(ng[0], ng[1], input_scs, alpha, beta) + + if ng_lh > next_eval: + next_eval = ng_lh + next_sol = (ng[0], ng[1]) + + if next_eval > current_lh: + current_tree = next_sol[0] + current_dict = next_sol[1] + current_lh = next_eval + tsc.logg.debug("Found a better solution with likelihood: %f" % current_lh) + current_iteration += 1 + + return current_tree, current_dict + + +def gpps_hc(input_matrix, ilp_matrix, alpha, beta, k_dollo, mut_names, ns=30, mi=100): + imported_tree, imported_nid_dict = import_ilp_out(ilp_matrix, k_dollo, mut_names) + + hc_best_tree, hc_best_dict = hill_climbing( + imported_tree, + imported_nid_dict, + neighborhood_size=ns, + max_iterations=mi, + alpha=alpha, + beta=beta, + input_scs=input_matrix, + ) + + e_mat = get_expect_matrix(hc_best_tree, hc_best_dict, input_matrix, alpha, beta) + + return e_mat diff --git a/trisicell/external/gpps/_gpps_ilp.py b/trisicell/external/gpps/_gpps_ilp.py new file mode 100644 index 0000000..0c0f7cb --- /dev/null +++ b/trisicell/external/gpps/_gpps_ilp.py @@ -0,0 +1,226 @@ +import itertools +import math + +import trisicell as tsc +from trisicell.external.gpps._utils_ilp import expand_name + +__author__ = "Simone Ciccolella" +__date__ = "11/30/21" + + +def gpps_ilp( + input_matrix, + alpha, + beta, + k_dollo, + max_del=-1, + time_limit=86400, + n_threads=1, +): + gp, gp_is_not_imported = tsc.ul.import_gurobi() + if gp_is_not_imported: + tsc.logg.error("Unable to import a package!") + + # ==================================================================# + # ========================= PREREQUISITES ==========================# + # ==================================================================# + + # parser.add_argument( + # "--mps", + # action="store_true", + # help="This will output the model in MPS format instead of running the solver", + # ) + + # ----------------------Initialize program----------------------# + # Fixed parameters + # num_samples = len(input_matrix) + # num_clones = int(num_mutations * n_clones) + num_clones = len(input_matrix) + num_mutations = len(input_matrix[0]) + + # tsc.logg.debug("Num samples: %d" % num_samples) + tsc.logg.debug("Num mutations: %d" % num_mutations) + tsc.logg.debug("Num clones: %d" % num_clones) + + # ==================================================================# + # 
========================== GUROBI MODEL ==========================# + # ==================================================================# + model = gp.Model("Parsimony Phylogeny Model") + model.Params.OutputFlag = 0 + model.Params.LogFile = "" + model.setParam("Threads", n_threads) + model.setParam("TimeLimit", time_limit) + + # ---------------------------------------------------# + # ------------------- VARIABLES ---------------------# + # ---------------------------------------------------# + + # -----------Variable Y and B--------------- + + lalpha = math.log(alpha) + lbeta = math.log(beta) + l_alpha = math.log(1 - alpha) + l_beta = math.log(1 - beta) + + tsc.logg.debug(lalpha, lbeta, l_alpha, l_beta) + + tsc.logg.debug("Generating variables I, F, w, P.") + I_mtx = {} + F = {} + fminus = {} + P = {} + # False positive should be only a few + FP = {} + + for row_index, row in enumerate(input_matrix): + I_mtx[row_index] = {} + F[row_index] = {} + fminus[row_index] = {} + P[row_index] = {} + FP[row_index] = {} + for col_index, cell in enumerate(row): + # P + names = expand_name(str(col_index), 1, k_dollo) + for name in names: + P[row_index][name] = model.addVar( + vtype=gp.GRB.BINARY, obj=0, name=f"P{row_index}-{name}" + ) + if cell < 2: + # I + I_mtx[row_index][col_index] = model.addVar( + vtype=gp.GRB.BINARY, + obj=0, + name=f"I{row_index}-{col_index}", + ) + model.update() + model.addConstr( + I_mtx[row_index][col_index] == cell, + f"constr_I{row_index}-{col_index}", + ) + # F and fminus. fminus is equal to 1-F + if cell == 0: + F[row_index][col_index] = model.addVar( + vtype=gp.GRB.BINARY, + obj=lalpha, + name=f"F{row_index}-{col_index}", + ) + fminus[row_index][col_index] = model.addVar( + vtype=gp.GRB.BINARY, + obj=l_beta, + name=f"f{row_index}-{col_index}", + ) + if cell == 1: + F[row_index][col_index] = model.addVar( + vtype=gp.GRB.BINARY, + obj=l_alpha, + name=f"F{row_index}-{col_index}", + ) + fminus[row_index][col_index] = model.addVar( + vtype=gp.GRB.BINARY, + obj=lbeta, + name=f"f{row_index}-{col_index}", + ) + FP[row_index][col_index] = model.addVar( + vtype=gp.GRB.BINARY, + obj=0, + name=f"FP{row_index}-{col_index}", + ) + model.update() + model.addConstr( + FP[row_index][col_index] == 1 - P[row_index][names[0]], + name=f"constr_FP{row_index}-{col_index}", + ) + model.update() + model.addConstr( + F[row_index][col_index] == 1 - fminus[row_index][col_index], + name=f"constr_def_fminus{row_index}-{col_index}", + ) + model.addConstr( + F[row_index][col_index] + == P[row_index][names[0]] + - gp.quicksum(P[row_index][name] for name in names[1:]), + name=f"constr_balance_F{row_index}-{col_index}", + ) + model.addConstr( + P[row_index][names[0]] + >= gp.quicksum(P[row_index][name] for name in names[1:]), + name=f"constr_imbalance_P{row_index}-{col_index}", + ) + + # There are only a few false positives + model.addConstr( + 4 >= gp.quicksum(FP[r][c] for r in FP.keys() for c in FP[r].keys()), + name="constr_few_FP", + ) + + model.update() + + tsc.logg.debug("Generating variables B.") + B = {} + columns = list( + itertools.chain.from_iterable( + [expand_name(str(name), 1, k_dollo) for name in range(num_mutations)] + ) + ) + for p in columns: + B[p] = {} + for p, q in itertools.combinations(columns, 2): + B[p][q] = {} + B[p][q]["01"] = model.addVar(vtype=gp.GRB.BINARY, obj=0, name=f"B[{p},{q},0,1]") + B[p][q]["10"] = model.addVar(vtype=gp.GRB.BINARY, obj=0, name=f"B[{p},{q},1,0]") + B[p][q]["11"] = model.addVar(vtype=gp.GRB.BINARY, obj=0, name=f"B[{p},{q},1,1]") + model.update() + for 
row_index, _ in enumerate(input_matrix): + model.addConstr( + B[p][q]["01"] >= P[row_index][q] - P[row_index][p], + f"constr_B01-{p}-{q}-{row_index}", + ) + model.addConstr( + B[p][q]["10"] >= P[row_index][p] - P[row_index][q], + f"constr_B10-{p}-{q}-{row_index}", + ) + model.addConstr( + B[p][q]["11"] >= P[row_index][p] + P[row_index][q] - 1, + f"constr_B11-{p}-{q}-{row_index}", + ) + model.addConstr( + B[p][q]["01"] + B[p][q]["10"] + B[p][q]["11"] <= 2, + f"constr_sum_B{p}-{q}", + ) + + deletions = {} + del_names = [] + for p in columns: + if "-" in p: + del_names.append(p) + deletions[p] = model.addVar(vtype=gp.GRB.BINARY, obj=0, name=f"Del[{p}]") + for row_index, _ in enumerate(input_matrix): + model.addConstr(deletions[p] >= P[row_index][p]) + + if max_del != -1: + model.addConstr(gp.quicksum(deletions[x] for x in del_names) <= max_del) + + model.update() + model.modelSense = gp.GRB.MAXIMIZE + model.update() + + # ---------------------------------------------------# + # -------------------- OPTIMIZE ---------------------# + # ---------------------------------------------------# + tsc.logg.debug("#----- GUROBI OPTIMIZATION ----#") + model.optimize() + + # ==================================================================# + # ======================= POST OPTIMIZATION ========================# + # ==================================================================# + + output_matrix = [] + for row_index, row in enumerate(input_matrix): + row_out = [] + for col_index, _ in enumerate(row): + names = expand_name(str(col_index), 1, k_dollo) + for name in names: + row_out.append(int(float(P[row_index][name].X))) + output_matrix.append(row_out) + + return output_matrix diff --git a/trisicell/external/gpps/_nh2lgf.py b/trisicell/external/gpps/_nh2lgf.py new file mode 100644 index 0000000..1948577 --- /dev/null +++ b/trisicell/external/gpps/_nh2lgf.py @@ -0,0 +1,103 @@ +import sys + +__author__ = "Simone Ciccolella" +__date__ = "11/30/21" + +name = "nh2lgf" + +desc = """ + +convert newhampshire (or newick) format (nh) +to lemon graph format (lgf) +-- Murray Patterson, Sept 2014 + +""" + +note = "" + + +# for processing a 'node' object, i.e., of the form 'A:B', where 'A' +# is a name (or a bootstrap value in the case of an internal node), +# and 'B' is a branch length +def node(suffix, count, nodes, arcs): + + assert not suffix.startswith("("), "\n\ninput not in nh format" + + # eat until ',', ')' or ';' + if ")" not in suffix: # we know we're at the end + a, b, c = suffix.partition(";") # if not, search for ',' + elif "," in suffix and suffix.find(",") < suffix.find(")"): + a, b, c = suffix.partition(",") + else: # o.w., it is the only terminal + a, b, c = suffix.partition(")") + + name, x, brlen = a.partition(":") + nodes.append((count + 1, name)) + + return b + c, count + 1, brlen, nodes, arcs + + +# for processing a 'subtree' object, i.e., of the form 'N' or 'X', +# where 'N' is a 'node' object (base case), and 'X' is a 'subtrees' +# object (recursive case) +def subtree(suffix, count, nodes, arcs): + + if suffix.startswith("("): # recursive case + return subtrees(suffix, count, nodes, arcs) + + else: # base case, we're at a node (which is necessarily a leaf, here) + return node(suffix, count, nodes, arcs) + + +# for receiving and processing a 'subtrees' object, i.e.
of the form +# '(S,S,..,S)' where 'S' is a 'subtree' object +def subtrees(suffix, count, nodes, arcs): + + assert ")" in suffix, "\n\ninput not in nh format" + + suffix = suffix[1:] # eat the '(' + children = [] # store info for each 'S' in '(S,S,..,S)' + + # loop through the 'S' in '(S,S,..,S)' + while True: # note : contains at least one 'S', even if '' + suffix, count, brlen, nodes, arcs = subtree(suffix, count, nodes, arcs) + children.append((count, brlen)) + + if suffix.startswith(")"): # stopping condition + break + if suffix.startswith(","): # in case of multiple subtrees + suffix = suffix[1:] # eat the ',' + + # now, here, suffix[0] == ')' + suffix = suffix[1:] # eat the ')' + # get parent of '(S,S,..,S)' + suffix, count, brlen, nodes, arcs = node(suffix, count, nodes, arcs) + for child in children: # now add the arcs to each child 'S' + arcs.append((child[0], count, child[1])) + + return suffix, count, brlen, nodes, arcs + + +def newick_to_edgelist(input_string): + s = input_string + assert s.find(";") == len(s) - 1, "\n\nerror: input not in nh format" + assert s.count("(") == s.count(")"), "\n\nerror: input not in nh format" + + # parse the input + nodes = [] + arcs = [] + subtree(s, -1, nodes, arcs) + + try: + # output in lgf format + node_dict = {} + for label, name in nodes: + node_dict[label] = name + + edges = [] + for u, v, _ in arcs: + edges.append((u, v)) + + return node_dict, edges + except OSError: + sys.exit(0) diff --git a/trisicell/external/gpps/_utils_hc.py b/trisicell/external/gpps/_utils_hc.py new file mode 100644 index 0000000..2e75c78 --- /dev/null +++ b/trisicell/external/gpps/_utils_hc.py @@ -0,0 +1,311 @@ +from subprocess import PIPE, Popen + +import numpy as np + +import trisicell as tsc +from trisicell.external.gpps._nh2lgf import newick_to_edgelist + +__author__ = "Simone Ciccolella" +__date__ = "11/30/21" + + +class Node: + def __init__( + self, + name, + parent, + id_node, + mutation_id, + loss=False, + tot_mutations=0, + gt_build=True, + ): + self.name = name + self.id_node = id_node + self.parent = parent + self.children = [] + self.loss = loss + self.mut_id = mutation_id + self.tot_mutations = tot_mutations + if parent: + parent.children.append(self) + + if gt_build: + gt_par_cp = parent.genotype_profile.copy() + if self.loss: + gt_par_cp[mutation_id] -= 1 + else: + gt_par_cp[mutation_id] += 1 + + self.genotype_profile = gt_par_cp + else: + self.genotype_profile = [0 for x in range(tot_mutations)] + + def is_ancestor_of(self, node): + par = node.parent + while par: + if par == self: + return True + par = par.parent + return False + + def print_node_dot(self): + # only non-root nodes have an incoming edge to print + if self.parent is not None: + print(f'\t"{self.parent.id_node}" -> "{self.id_node}";') + if "-" not in self.name: + print(f'\t"{self.id_node}" [label="{self.name}"];') + else: + print( + '\t"{}" [color=indianred1, style=filled, label="{}"];'.format( + self.id_node, self.name[:-1] + ) + ) + + def print_node_dot_file(self, fout): + # only non-root nodes have an incoming edge to print + if self.parent is not None: + fout.write(f'\t"{self.parent.id_node}" -> "{self.id_node}";\n') + if "-" not in self.name: + fout.write(f'\t"{self.id_node}" [label="{self.name}"];\n') + else: + fout.write( + '\t"{}" [color=indianred1, style=filled, label="{}"];\n'.format( + self.id_node, self.name[:-1] + ) + ) + + +def add_edge(start, end): + start.children.append(end) + end.parent = start + + +def __print_tree(node): + if len(node.children) == 0: + node.print_node_dot() + else: + node.print_node_dot() + for child in node.children: + __print_tree(child) + + +def
__print_tree_file(node, fout): + if len(node.children) == 0: + node.print_node_dot_file(fout) + else: + node.print_node_dot_file(fout) + for child in node.children: + __print_tree_file(child, fout) + + +def print_dot_tree(node): + print("digraph phylogeny {") + print(f'\t"{node.id_node}" [label="{node.name}"];') + __print_tree(node) + print("}") + + +def print_dot_tree_file(node, fout): + fout.write("digraph phylogeny {\n") + fout.write(f'\t"{node.id_node}" [label="{node.name}"];\n') + __print_tree_file(node, fout) + fout.write("}\n") + + +def __copy_tree_rec(node, cp_parent, nid_dict): + node_cp = Node(node.name, cp_parent, node.id_node, node.mut_id, loss=node.loss) + nid_dict[node_cp.id_node] = node_cp + if len(node.children) == 0: + return + for child in node.children: + __copy_tree_rec(child, node_cp, nid_dict) + + +def copy_tree(root): + nid_nodes = {} + cp_root = Node( + root.name, + root.parent, + root.id_node, + root.mut_id, + tot_mutations=root.tot_mutations, + ) + nid_nodes[root.id_node] = cp_root + for child in root.children: + __copy_tree_rec(child, cp_root, nid_nodes) + return cp_root, nid_nodes + + +def contains(col1, col2): + for i in range(len(col1)): + if not col1[i] >= col2[i]: + return False + return True + + +def build_tree_from_file(ilp_matrix, mutations_names, mutations_ids, tot_mutations): + executable = tsc.ul.executable("gpps_tree", "gpps tree scripts") + tmpdir = tsc.ul.tmpdirsys(suffix=".gpps") + np.savetxt( + f"{tmpdir.name}/input.mtx", ilp_matrix.values, delimiter=" ", fmt="%1.0f" + ) + + rb_tree = Popen(["ruby", executable, "-m", f"{tmpdir.name}/input.mtx"], stdout=PIPE) + stdout, _ = rb_tree.communicate() + tree = stdout.decode("utf-8").strip() + + tmpdir.cleanup() + + node_dict, edges = newick_to_edgelist(tree) + + building_dictionary = {} + nid = 0 + + # Get germline nid: + for k in node_dict: + if node_dict[k] == "germline": + nid = k + + root = Node("germline", None, nid, -1, tot_mutations=tot_mutations) + building_dictionary[nid] = root + + for edge in edges: + e, s = edge + + x = None + try: + x = building_dictionary[s] + except KeyError: + x_column_index = int(node_dict[s][1:]) - 1 + if "---" in mutations_names[x_column_index]: + loss = True + else: + loss = False + x = Node( + mutations_names[x_column_index], + None, + s, + mutations_ids[x_column_index], + gt_build=False, + loss=loss, + ) + building_dictionary[s] = x + + y = None + try: + y = building_dictionary[e] + except KeyError: + y_column_index = int(node_dict[e][1:]) - 1 + if "---" in mutations_names[y_column_index]: + loss = True + else: + loss = False + y = Node( + mutations_names[y_column_index], + None, + e, + mutations_ids[y_column_index], + gt_build=False, + loss=loss, + ) + building_dictionary[e] = y + add_edge(x, y) + + calculate_genotype_profile_subtree(root, building_dictionary) + + return (root, building_dictionary) + + +def calculate_genotype_profile_subtree(node, nid_dict): + if node.parent: + gt_par_cp = node.parent.genotype_profile.copy() + if node.loss: + gt_par_cp[node.mut_id] -= 1 + else: + gt_par_cp[node.mut_id] += 1 + node.genotype_profile = gt_par_cp + + else: + # This assumes that the root is correctly initiated at all 0s + pass + if len(node.children) == 0: + return + for child in node.children: + calculate_genotype_profile_subtree(child, nid_dict) + + +def prune_and_reattach(node_prune, node_reattach, nid_dict): + if node_prune.is_ancestor_of(node_reattach): + return False + node_prune.parent.children.remove(node_prune) + node_prune.parent = node_reattach + 
node_reattach.children.append(node_prune) + + check_subtree_losses(node_reattach, nid_dict) + + # rebuild genotype profile of pruned subtree + calculate_genotype_profile_subtree(node_reattach, nid_dict) + + return True + + +def is_loss_valid(node, mut_id): + par = node.parent + while par: + if par.mut_id == mut_id: + return True + par = par.parent + return False + + +def is_already_lost(node, mut_id): + par = node.parent + while par: + if par.loss and par.mut_id == mut_id: + return True + par = par.parent + return False + + +def delete_node(node, nid_dict): + parent = node.parent + # node.parent = None + parent.children.remove(node) + for child in node.children: + child.parent = parent + parent.children.append(child) + nid_dict.pop(node.id_node) + node = None + + +def check_subtree_losses(node, nid_dict): + if node.loss: + valid = is_loss_valid(node, node.mut_id) + lost = is_already_lost(node, node.mut_id) + + if not valid or lost: + delete_node(node, nid_dict) + + if len(node.children) == 0: + return + for child in node.children: + check_subtree_losses(child, nid_dict) + + +def import_ilp_out(ilp_matrix, k_dollo, mutation_names): + mut_names = [] + mut_ids = [] + + mut_index = 0 + for mut in mutation_names: + mut_names.append(mut) + mut_ids.append(mut_index) + for _ in range(k_dollo): + mut_names.append("%s---" % mut) + mut_ids.append(mut_index) + mut_index += 1 + + imported_tree, imported_dict = build_tree_from_file( + ilp_matrix, mut_names, mut_ids, len(mutation_names) + ) + + return imported_tree, imported_dict diff --git a/trisicell/external/gpps/_utils_ilp.py b/trisicell/external/gpps/_utils_ilp.py new file mode 100644 index 0000000..bd114fc --- /dev/null +++ b/trisicell/external/gpps/_utils_ilp.py @@ -0,0 +1,8 @@ +__author__ = "Simone Ciccolella" +__date__ = "11/30/21" + + +def expand_name(s, max_gains, max_losses): + positive_names = [s + "+" + str(i) for i in range(max_gains)] + negative_names = [s + "-" + str(i) for i in range(max_losses)] + return positive_names + negative_names diff --git a/trisicell/logging/_logging.py b/trisicell/logging/_logging.py index 2642a2a..03b5efd 100644 --- a/trisicell/logging/_logging.py +++ b/trisicell/logging/_logging.py @@ -16,7 +16,7 @@ def error(*args, **kwargs): """Write errors message.""" args = ("Error:",) + args msg(*args, v="error", **kwargs) - exit(1) + raise RuntimeError def warn(*args, **kwargs): diff --git a/trisicell/pp/__init__.py b/trisicell/pp/__init__.py index 1fb7d15..5009d08 100644 --- a/trisicell/pp/__init__.py +++ b/trisicell/pp/__init__.py @@ -17,6 +17,7 @@ group_obs_apply_func, keep_cell_by_list, keep_mut_by_list, + local_cluster_cells_then_merge_muts_pseudo_bulk, remove_cell_by_list, remove_mut_by_list, statistics, @@ -42,4 +43,5 @@ statistics, collapse, group_obs_apply_func, + local_cluster_cells_then_merge_muts_pseudo_bulk, ) diff --git a/trisicell/pp/_readcount.py b/trisicell/pp/_readcount.py index d62ee8e..95625fd 100644 --- a/trisicell/pp/_readcount.py +++ b/trisicell/pp/_readcount.py @@ -1,3 +1,4 @@ +import anndata as ad import numpy as np import pandas as pd @@ -211,3 +212,62 @@ def filter_snpeff(adata, exome=False): adata.var["Total"] = adata.layers["total"][0] adata.var["VAF"] = adata.var["Mutant"] / adata.var["Total"] adata.var["SAMPLE"] = tumor_obs + + +def local_cluster_cells_then_merge_muts_pseudo_bulk( + adata, by="mut", n_clusters=100, min_n_cells=5, attr="group" +): + def _pseudo_bulk_caller(sub_adata): + genotype = 2 * np.ones(sub_adata.shape[1]) + mutant = sub_adata.layers["mutant"].sum(axis=0) + total = 
sub_adata.layers["total"].sum(axis=0) + + genotype[total == mutant] = 3 + genotype[total > mutant] = 1 + genotype[mutant == 0] = 0 + genotype[total == 0] = 2 + + return genotype, mutant, total + + tsc.pp.build_scmatrix(adata) + if by == "mut": + clusters = tsc.ul.hclustering(adata.to_df()) + elif by == "cna": + clusters = tsc.ul.hclustering(adata.obsm["cna"], metric="cosine") + else: + tsc.logg.error("Wrong `by` choice!") + + cluster = clusters[n_clusters] + count = cluster.value_counts() + cluster = cluster[cluster.isin(count[count >= min_n_cells].index)] + cluster = cluster.apply(lambda x: f"G{x}") + + adata = adata[cluster.index, :] + adata.obs[attr] = cluster + + genotypes = [] + mutants = [] + totals = [] + indices = [] + for index, subgroup in adata.obs.groupby(adata.obs[attr]): + # genotype, mutant, total = _pseudo_bulk_caller_better( + # adata[subgroup.index, :], min_vaf=0.2, min_cell=1 + # ) + genotype, mutant, total = _pseudo_bulk_caller(adata[subgroup.index, :]) + + indices.append(index) + genotypes.append(genotype) + mutants.append(mutant) + totals.append(total) + + adata_merged = ad.AnnData( + X=3 * np.ones((len(indices), adata.shape[1])), + obs=pd.DataFrame.from_dict({"cells": indices}).set_index("cells"), + var=adata.var, + layers={ + "total": np.array(totals), + "mutant": np.array(mutants), + "genotype": np.array(genotypes), + }, + ) + return adata_merged, adata diff --git a/trisicell/tl/__init__.py b/trisicell/tl/__init__.py index 44a2e17..986f7e3 100644 --- a/trisicell/tl/__init__.py +++ b/trisicell/tl/__init__.py @@ -4,12 +4,13 @@ from trisicell.tl.consensus import consensus, consensus_day from trisicell.tl.fitch import fitch from trisicell.tl.partition_function import partition_function -from trisicell.tl.score import ad, caset, cc, disc, dl, mltd, mp3, rf, tpted +from trisicell.tl.score import ad, caset, cc, disc, dl, gs, mltd, mp3, rf, tpted from trisicell.tl.solver import ( bnb, booster, cardelino, dendro, + gpps, grmt, huntress, infscite, @@ -65,4 +66,5 @@ sphyr, grmt, sciphi, + gpps, ) diff --git a/trisicell/tl/score/__init__.py b/trisicell/tl/score/__init__.py index 698698d..b516744 100644 --- a/trisicell/tl/score/__init__.py +++ b/trisicell/tl/score/__init__.py @@ -1,2 +1,2 @@ from trisicell.tl.score._others import caset, disc, mp3, rf -from trisicell.tl.score._ours import ad, cc, dl, mltd, tpted +from trisicell.tl.score._ours import ad, cc, dl, gs, mltd, tpted diff --git a/trisicell/tl/score/_others.py b/trisicell/tl/score/_others.py index abdbe84..bfee2a2 100644 --- a/trisicell/tl/score/_others.py +++ b/trisicell/tl/score/_others.py @@ -33,6 +33,38 @@ def bourque(df_grnd, df_sol): return None +def PCSS(df_grnd, df_sol): + """Pairwise cell shortest-path similarity score. + + For every pair of cells :math:`i` and :math:`j`, we computed the shortest-path + :math:`d_{ij}` between the two cells in each tree. If the two cells belong to the + same clone, their shortest-path distance is 0, otherwise the shortest-path distance + equals the number of edges (regardless of direction) that separate the clones of the + two cells. Finally, we summed up the absolute differences between the shortest-path + distances of all unordered pairs of cells in the two trees. + This measure is metric. The proof is given in the paper. + + This measure was introduced in :cite:`OncoNEM`. + + Parameters + ---------- + df_grnd : :class:`pandas.DataFrame` + The first genotype matrix (e.g. ground truth) + This matrix must be conflict-free. 
+ df_sol : :class:`pandas.DataFrame` + The second genotype matrix (e.g. solution/inferred) + This matrix must be conflict-free. + + Returns + ------- + :obj:`float` + Similarity out of one. + """ + + # TODO: implement + return None + + def mp3(df_grnd, df_sol): """Triplet-based similarity score. diff --git a/trisicell/tl/score/_ours.py b/trisicell/tl/score/_ours.py index 59ea47c..ccb3b1c 100644 --- a/trisicell/tl/score/_ours.py +++ b/trisicell/tl/score/_ours.py @@ -8,6 +8,37 @@ from trisicell.ul._trees import _split_labels, _to_apted +def gs(df_grnd, df_sol): + """Genotype-similarity accuracy. + + This measure was introduced in :cite:`SiCloneFit`. + + Parameters + ---------- + df_grnd : :class:`pandas.DataFrame` + The first genotype matrix (e.g. ground truth) + This matrix must be conflict-free. + df_sol : :class:`pandas.DataFrame` + The second genotype matrix (e.g. solution/inferred) + This matrix must be conflict-free. + + Returns + ------- + :obj:`float` + Similarity out of one. + """ + + muts = np.intersect1d(df_grnd.columns, df_sol.columns) + cells = np.intersect1d(df_grnd.index, df_sol.index) + if len(muts) == 0: + tsc.logg.error("No common mutations found between two trees!") + if len(cells) == 0: + tsc.logg.error("No common cells found between two trees!") + M_grnd = df_grnd.loc[cells, muts].values + M_sol = df_sol.loc[cells, muts].values + return 1 - np.abs(M_grnd - M_sol).sum() / M_grnd.size + + def ad(df_grnd, df_sol): """Ancestor-descendent accuracy. diff --git a/trisicell/tl/solver/__init__.py b/trisicell/tl/solver/__init__.py index 7202db8..6ab7153 100644 --- a/trisicell/tl/solver/__init__.py +++ b/trisicell/tl/solver/__init__.py @@ -1,6 +1,7 @@ from trisicell.tl.solver._bnb import bnb from trisicell.tl.solver._cardelino import cardelino from trisicell.tl.solver._dendro import dendro +from trisicell.tl.solver._gpps import gpps from trisicell.tl.solver._grmt import grmt from trisicell.tl.solver._onconem import onconem from trisicell.tl.solver._phiscs import ( diff --git a/trisicell/tl/solver/_gpps.py b/trisicell/tl/solver/_gpps.py index 182525d..255292b 100644 --- a/trisicell/tl/solver/_gpps.py +++ b/trisicell/tl/solver/_gpps.py @@ -1,3 +1,96 @@ -def gpps(): - # TODO: implement - return None +import time + +import pandas as pd + +import trisicell as tsc +from trisicell.external.gpps import gpps_hc, gpps_ilp + + +def gpps( + df_input, + alpha, + beta, + k_dollo=0, + max_del=-1, + neighbor_size=30, + n_iters=100, + time_limit=86400, + n_threads=1, +): + """Solving using gpps. + + An ILP-based approach for inferring cancer progression with mutation losses from + single-cell data :cite:`gpps`. + + Parameters + ---------- + df_input : :class:`pandas.DataFrame` + Input genotype matrix in which rows are cells and columns are mutations. + Values inside this matrix show the presence (1), absence (0) and missing + entries (3). + alpha : :obj:`float` + False positive error rate. + beta : :obj:`float` + False negative error rate.
+ k_dollo : :obj:`int`, optional + k for Dollo model, by default 0 + max_del : :obj:`int`, optional + Maximum number of deletion allowed, by default -1 + neighbor_size : :obj:`int`, optional + Hill climbing neighborhood size, by default 30 + n_iters : :obj:`int`, optional + Hill climbing maximum iterations, by default 100 + time_limit : :obj:`int`, optional + Time limit (in seconds), by default 86400 + n_threads : :obj:`int`, optional + Number of threads, by default 1 + + Returns + ------- + :class:`pandas.DataFrame` + A conflict-free matrix in which rows are cells and columns are mutations. + Values inside this matrix show the presence (1) and absence (0). + """ + + tsc.logg.info( + f"running gpps with alpha={alpha}, beta={beta}, k_dollo={k_dollo}, " + f"max_del={max_del}, neighbor_size={neighbor_size}, n_iters={n_iters}, " + f"time_limit={time_limit}, n_threads={n_threads}" + ) + + cells = list(df_input.index) + snvs = list(df_input.columns) + + s_time = time.time() + ilp_matrix = gpps_ilp( + df_input.values, + alpha=beta, # gpps takes a as false-negative and b as false-positive + beta=alpha, + k_dollo=k_dollo, + max_del=max_del, + time_limit=time_limit, + n_threads=n_threads, + ) + ilp_matrix = pd.DataFrame(ilp_matrix) + + output_matrix = gpps_hc( + df_input.values, + ilp_matrix, + alpha=beta, + beta=alpha, + k_dollo=k_dollo, + mut_names=snvs, + ns=neighbor_size, + mi=n_iters, + ) + e_time = time.time() + running_time = e_time - s_time + + df_output = pd.DataFrame(output_matrix) + df_output.columns = snvs + df_output.index = cells + df_output.index.name = "cellIDxmutID" + + tsc.ul.stat(df_input, df_output, alpha, beta, running_time) + + return df_output diff --git a/trisicell/tl/solver/_grmt.py b/trisicell/tl/solver/_grmt.py index 5197d0f..fb69008 100644 --- a/trisicell/tl/solver/_grmt.py +++ b/trisicell/tl/solver/_grmt.py @@ -6,7 +6,7 @@ import trisicell as tsc -def grmt(df_input, alpha, beta, n_threads=1, n_iters=30): +def grmt(df_input, alpha, beta, n_iters=30, n_threads=1): """Solving using GRMT. Generative Reconstruction of Mutation Tree From Scratch Using Single-Cell @@ -22,12 +22,10 @@ def grmt(df_input, alpha, beta, n_threads=1, n_iters=30): False positive error rate. beta : :obj:`float` False negative error rate. 
- n_threads : :obj:`int`, optional - Number of threads, by default 1 n_iters : :obj:`int`, optional Number of iterations, by default 30 - experiment : :obj:`bool`, optional - Is in the experiment mode (the log won't be shown), by default False + n_threads : :obj:`int`, optional + Number of threads, by default 1 Returns ------- diff --git a/trisicell/tl/solver/_phiscs.py b/trisicell/tl/solver/_phiscs.py index 243c48b..9a7042c 100644 --- a/trisicell/tl/solver/_phiscs.py +++ b/trisicell/tl/solver/_phiscs.py @@ -177,7 +177,21 @@ def phiscsi(df_input, alpha, beta, time_limit=86400, n_threads=1): B = {} for c in range(num_cells): for m in range(num_mutations): - Y[c, m] = model.addVar(vtype=gp.GRB.BINARY, name=f"Y({c},{m})") + if alpha == 0: + # 0->1 + if I_mtr[c, m] == 0: + Y[c, m] = model.addVar( + vtype=gp.GRB.BINARY, obj=1, name=f"Y({c},{m})" + ) + elif I_mtr[c, m] == 1: + Y[c, m] = 1 + else: + Y[c, m] = model.addVar( + vtype=gp.GRB.BINARY, obj=0, name=f"Y({c},{m})" + ) + else: + # 0->1 & 1->0 + Y[c, m] = model.addVar(vtype=gp.GRB.BINARY, name=f"Y({c},{m})") for p in range(num_mutations): for q in range(p + 1, num_mutations): B[p, q, 1, 1] = model.addVar( @@ -197,16 +211,21 @@ def phiscsi(df_input, alpha, beta, time_limit=86400, n_threads=1): model.addConstr(-Y[i, p] + Y[i, q] - B[p, q, 0, 1] <= 0) model.addConstr(Y[i, p] - Y[i, q] - B[p, q, 1, 0] <= 0) - objective = 0 - for j in range(num_mutations): - for i in range(num_cells): - # 0->1 & 1->0 - if I_mtr[i, j] == 0: - objective += np.log(1 - alpha) + np.log(beta / (1 - alpha)) * Y[i, j] - if I_mtr[i, j] == 1: - objective += np.log(alpha) + np.log((1 - beta) / alpha) * Y[i, j] - - model.setObjective(objective, gp.GRB.MAXIMIZE) + if alpha == 0: + # 0->1 + model.Params.ModelSense = gp.GRB.MINIMIZE + else: + # 0->1 & 1->0 + objective = 0 + for j in range(num_mutations): + for i in range(num_cells): + if I_mtr[i, j] == 0: + objective += ( + np.log(1 - alpha) + np.log(beta / (1 - alpha)) * Y[i, j] + ) + if I_mtr[i, j] == 1: + objective += np.log(alpha) + np.log((1 - beta) / alpha) * Y[i, j] + model.setObjective(objective, gp.GRB.MAXIMIZE) s_time = time.time() model.optimize() @@ -216,7 +235,13 @@ def phiscsi(df_input, alpha, beta, time_limit=86400, n_threads=1): sol_Y = np.zeros((num_cells, num_mutations), dtype=np.int8) for i in range(num_cells): for j in range(num_mutations): - sol_Y[i, j] = Y[i, j].X > 0.5 + if alpha == 0: + if I_mtr[i, j] != 1: + sol_Y[i, j] = Y[i, j].X > 0.5 + else: + sol_Y[i, j] = 1 + else: + sol_Y[i, j] = Y[i, j].X > 0.5 df_output = pd.DataFrame(sol_Y) df_output.columns = snvs diff --git a/trisicell/tl/solver/_siclonefit.py b/trisicell/tl/solver/_siclonefit.py index d00aa8a..403bdde 100644 --- a/trisicell/tl/solver/_siclonefit.py +++ b/trisicell/tl/solver/_siclonefit.py @@ -1,3 +1,4 @@ +import glob import os import time @@ -7,13 +8,54 @@ import trisicell as tsc -def siclonefit(df_input, alpha, beta, n_iters): - # TODO: implement +def siclonefit( + df_input, + alpha, + beta, + n_restarts=3, + n_iters=500, + burnin=100, + return_tree=False, + experiment=False, +): + """Solving using SiCloneFit. + + Bayesian inference of population structure, genotype, and phylogeny of tumor clones + from single-cell genome sequencing data :cite:`SiCloneFit`. + + Parameters + ---------- + df_input : :class:`pandas.DataFrame` + Input genotype matrix in which rows are cells and columns are mutations. + Values inside this matrix show the presence (1), absence (0) and missing + entires (3). + alpha : :obj:`float` + False positive error rate. 
+    beta : :obj:`float`
+        False negative error rate.
+    n_restarts : :obj:`int`, optional
+        Number of restarts, by default 3
+    n_iters : :obj:`int`, optional
+        Number of iterations, by default 500
+    burnin : :obj:`int`, optional
+        Number of burn-in iterations, by default 100
+    return_tree : :obj:`bool`, optional
+        Return the inferred cell-lineage tree, by default False
+    experiment : :obj:`bool`, optional
+        Is in the experiment mode (the log won't be shown), by default False
+
+    Returns
+    -------
+    :class:`pandas.DataFrame`
+        A conflict-free matrix in which rows are cells and columns are mutations.
+        Values inside this matrix show the presence (1) and absence (0).
+        If `return_tree` is True, the inferred cell-lineage tree is returned
+        as well.
+    """
+
     executable = tsc.ul.executable("SiCloneFiTComplete.jar", "SiCloneFit")
-    tsc.logg.info(
-        f"running SiCloneFit with alpha={alpha}, beta={beta}, n_iters={n_iters}"
-    )
+
+    if not experiment:
+        tsc.logg.info(
+            f"running SiCloneFit with alpha={alpha}, beta={beta}, n_iters={n_iters}"
+        )
+
     tmpdir = tsc.ul.tmpdirsys(suffix=".siclonefit")
     df_input.T.reset_index(drop=True).to_csv(
@@ -32,20 +74,19 @@
         f"-ipMat {tmpdir.name}/siclonefit.input "
         f"-fp {alpha} "
         f"-fn {beta} "
-        # "-df 0 "
+        "-df 0 "
         f"-missing {np.sum(I_mtr == 3)/(I_mtr.size)} "
-        # "-f 3 "
+        f"-iter {n_iters} "
+        f"-cellNames {tmpdir.name}/siclonefit.cellnames "
+        f"-geneNames {tmpdir.name}/siclonefit.genenames "
+        f"-r {n_restarts} "
+        f"-burnin {burnin} "
         # "-recurProb 0 "
         # "-delProb 0 "
         # "-LOHProb 0 "
-        # f"-iter {n_iters} "
-        f"-cellNames {tmpdir.name}/siclonefit.cellnames "
-        f"-geneNames {tmpdir.name}/siclonefit.genenames "
-        # "-r "
-        # "-burnin "
+        # "-doublet "
         # "-printIter "
         # "-treeIter "
-        # "-doublet "
         f"-outDir {tmpdir.name} > {tmpdir.name}/siclonefit.log"
     )
     s_time = time.time()
@@ -53,8 +94,10 @@
     e_time = time.time()
     running_time = e_time - s_time
 
+    out_dir = glob.glob(f"{tmpdir.name}/*samples/best")[0]
+
     df_output = pd.read_csv(
-        f"{tmpdir.name}/samples/best/best_MAP_predicted_genotype.txt",
+        f"{out_dir}/best_MAP_predicted_genotype.txt",
         sep=" ",
         header=None,
         index_col=0,
@@ -63,8 +106,18 @@
     df_output.index = df_input.index
     df_output.index.name = "cellIDxmutID"
 
-    tmpdir.cleanup()
+    with open(f"{out_dir}/best_MAP_tree.txt") as fin:
+        tree = fin.readline().strip()
 
-    tsc.ul.stat(df_input, df_output, alpha, beta, running_time)
+    tmpdir.cleanup()
 
-    return df_output
+    if not experiment:
+        tsc.ul.stat(df_input, df_output, alpha, beta, running_time)
+        if return_tree:
+            return df_output, tree
+        else:
+            return df_output
+    else:
+        is_cf = tsc.ul.is_conflict_free_gusfield(df_output)
+        nll = tsc.ul.calc_nll_matrix(df_input, df_output, alpha, beta)
+        return df_output, running_time, is_cf, nll
diff --git a/trisicell/tl/solver/_sphyr.py b/trisicell/tl/solver/_sphyr.py
index 1039e82..fe1877c 100644
--- a/trisicell/tl/solver/_sphyr.py
+++ b/trisicell/tl/solver/_sphyr.py
@@ -41,8 +41,6 @@
         Number of cell clusters, by default 10
     n_mut_clusters : :obj:`int`, optional
         Number of mutation clusters, by default 15
-    experiment : :obj:`bool`, optional
-        Is in the experiment mode (the log won't be shown), by default False
 
     Returns
     -------
@@ -51,7 +49,7 @@
         Values inside this matrix show the presence (1) and absence (0).
""" - executable = tsc.ul.executable("kDPFC", "SPhyR") + executable = tsc.ul.executable("sphyr_kDPFC", "SPhyR") tsc.logg.info( f"running SPhyR with alpha={alpha}, beta={beta}, n_restarts={n_restarts}, " diff --git a/trisicell/tl/solver/booster/_booster.py b/trisicell/tl/solver/booster/_booster.py index c6bc7ba..6ca6062 100644 --- a/trisicell/tl/solver/booster/_booster.py +++ b/trisicell/tl/solver/booster/_booster.py @@ -88,7 +88,7 @@ def booster( n_muts = df_input.shape[1] if sample_size is None: - sample_size = int(dep_weight * (n_muts ** 2) / (sample_size ** 2)) + sample_size = int(dep_weight * (n_muts**2) / (sample_size**2)) detail = {} @@ -113,7 +113,7 @@ def booster( # preparing dependencies file if not no_dependencies: - max_num_submatrices = int(dep_weight * (n_muts ** 2) / (sample_size ** 2)) + max_num_submatrices = int(dep_weight * (n_muts**2) / (sample_size**2)) prepare_dependencies( df_input.columns, tmpdir, diff --git a/trisicell/tl/solver/huntress/__init__.py b/trisicell/tl/solver/huntress/__init__.py index b5cf4db..bf9939b 100644 --- a/trisicell/tl/solver/huntress/__init__.py +++ b/trisicell/tl/solver/huntress/__init__.py @@ -4,7 +4,7 @@ from trisicell.tl.solver.huntress._huntress import Reconstruct -def huntress(df_input, alpha, beta, kind="both", n_threads=1): +def huntress(df_input_filepath, alpha, beta, n_threads=1): """Solving using HUNTRESS. HUNTRESS: Provably fast intratumor heterogeneity inference from single-cell @@ -20,8 +20,6 @@ def huntress(df_input, alpha, beta, kind="both", n_threads=1): False positive error rate. beta : :obj:`float` False negative error rate. - kind : :obj:`str` - What type of noise rate was observed in the data {'both', 'fn'} n_threads : :obj:`int`, optional Number of threads, by default 1 @@ -33,33 +31,36 @@ def huntress(df_input, alpha, beta, kind="both", n_threads=1): """ tsc.logg.info( - f"running HUNTRESS with alpha={alpha}, beta={beta}, kind={kind}," - f" n_threads={n_threads}" + f"running HUNTRESS with alpha={alpha}, beta={beta}, n_threads={n_threads}" ) + tmpdir = tsc.ul.tmpdirsys(suffix=".huntress") + running_time = 0 - if kind == "both": + if alpha == 0: + running_time = Reconstruct( + df_input_filepath, + f"{tmpdir.name}/huntress.CFMatrix", + Algchoice="FN", + n_proc=n_threads, + ) + df_output = pd.read_table(f"{tmpdir.name}/huntress.CFMatrix", index_col=0) + else: fn_conorm = 0.1 fp_conorm = fn_conorm * alpha / beta - matrix_out, running_time = Reconstruct( - df_input, + running_time = Reconstruct( + df_input_filepath, + f"{tmpdir.name}/huntress", n_proc=n_threads, fnfp=51, post_fn=fn_conorm, post_fp=fp_conorm, ) - elif kind == "fn": - matrix_out, running_time = Reconstruct( - df_input, - Algchoice="FN", - n_proc=n_threads, - ) - - df_output = pd.DataFrame(matrix_out) - df_output.columns = df_input.columns - df_output.index = df_input.index - df_output.index.name = "cellIDxmutID" + df_output = pd.read_table(f"{tmpdir.name}/huntress.CFMatrix", index_col=0) + df_input = pd.read_table(df_input_filepath, index_col=0) tsc.ul.stat(df_input, df_output, alpha, beta, running_time) + tmpdir.cleanup() + return df_output diff --git a/trisicell/tl/solver/huntress/_huntress.py b/trisicell/tl/solver/huntress/_huntress.py index 6cce13c..6e67526 100644 --- a/trisicell/tl/solver/huntress/_huntress.py +++ b/trisicell/tl/solver/huntress/_huntress.py @@ -1,21 +1,30 @@ +#!/usr/bin/env python + +import argparse import itertools +import multiprocessing as mp import time from multiprocessing import Process, Queue import numpy as np +import pandas as pd 
__author__ = "Can Kizilkale" -__date__ = "3/19/21" +__date__ = "12/30/21" + +np.set_printoptions(threshold=np.inf) prfp = 5 def Reconstruct( - df_input, + input_file, + output_file, Algchoice="FPNA", auto_tune=1, overlapp_coeff=0.3, hist_coeff=20, + postprocessing=0, fnfp=100, fnc=1, postprcoef=5, @@ -26,12 +35,14 @@ def Reconstruct( global prfp prfp = postprcoef q = Queue() - matrix_input = ReadFileNA(df_input.copy()) - matrix_input_raw = ReadFasis(df_input.copy()) - matrix_NA_est = Estimated_Matrix(df_input.copy()) + Flog = open(output_file + ".LOG", "a+") + matrix_input = ReadFileNA(input_file) + matrix_input_raw = ReadFasis(input_file) + matrix_NA_est = Estimated_Matrix(input_file) + print(np.sum(matrix_input), np.sum(matrix_NA_est), file=Flog) h_range = list(range(1, 100, 2)) oc_range = [x / 10 for x in range(1, 5)] - tune_var = itertools.product(h_range, oc_range) + tune_var = list(itertools.product(h_range, oc_range)) running_time = 0 @@ -40,8 +51,10 @@ def Reconstruct( matrix_recons = greedyPtreeNew(matrix_input.astype(bool))[1] e_time = time.time() running_time = e_time - s_time + WriteTfile(output_file, matrix_recons, input_file) if Algchoice == "FPNA" and auto_tune == 0: + s_time = time.time() apprx_ordr = sum(matrix_NA_est) matrix_recons = greedyPtreeNA( @@ -49,10 +62,12 @@ def Reconstruct( )[2] e_time = time.time() running_time = e_time - s_time + output_file = output_file + f"_optH_{hist_coeff}TEMP.CFMatrix" + WriteTfile(output_file, matrix_recons, input_file) if Algchoice == "FPNA" and auto_tune == 1: - tune_var = list(tune_var) proc_size = np.ceil(len(tune_var) / n_proc).astype(int) + print(proc_size) cpu_range = [] for i in range(n_proc): @@ -73,9 +88,12 @@ def Reconstruct( matrix_input_raw, fnfp, fnc, + i, + input_file, + output_file, ), ) - for _ in range(len(cpu_range)) + for i in range(len(cpu_range)) ] for i in p: i.start() @@ -83,27 +101,52 @@ def Reconstruct( i.join() ret = [] while not q.empty(): + print(" Reading the queue ...") ret.append(q.get()) [m_r, d_min] = ret[0] - matrix_recons = m_r + matrix_recons = ReadFfile(m_r) for i in range(1, len(ret)): [m, d] = ret[i] + print(d_min) if d < d_min: - matrix_recons = m + matrix_recons = ReadFfile(m) + print(d_min) d_min = d + print("closed", q.empty()) + e_time = time.time() running_time = e_time - s_time - - matrix_recons = postprocess_col(df_input, matrix_recons, pfn=post_fn, pfp=post_fp) - - return matrix_recons, running_time - - -def Auto_fnfp(q, tune_ran, m_input, m_NA_est, m_input_raw, fnfp, fnc): + print(running_time) + output_befpost = output_file + "BefPost.CFMatrix" + output_file = output_file + "TEMP.CFMatrix" + WriteTfile(output_befpost, matrix_recons, input_file) + WriteTfile(output_file, matrix_recons, input_file) + print( + " 1->0 : ", + np.sum(matrix_recons < matrix_input), + " 0->1 : ", + np.sum(matrix_recons > matrix_input_raw), + " NA->1 : ", + np.sum(matrix_recons > matrix_input) + - np.sum(matrix_recons > matrix_input_raw), + ) + + Flog.close() + postprocess_col( + input_file, output_file, pfn=post_fn, pfp=post_fp + ) # Column based post processing, seem to give the best result. 
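+    # postprocess_col strips the trailing "TEMP.CFMatrix" from `output_file`
+    # and writes the final conflict-free matrix to "<base>.CFMatrix"; callers
+    # such as tsc.tl.huntress read that file, since Reconstruct itself only
+    # returns the running time.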
+ + return running_time + + +def Auto_fnfp( + q, tune_ran, m_input, m_NA_est, m_input_raw, fnfp, fnc, procid, in_file, out_file +): # multiprocessing worker unit, takes a tuning range and tries everything in between apprx_ordr = sum(m_NA_est) + print("An instance of auto tune has started...", procid) matrix_recon = greedyPtreeNA( m_input.astype(bool), apprx_ordr, tune_ran[0][1], tune_ran[0][0] )[2] @@ -113,6 +156,8 @@ def Auto_fnfp(q, tune_ran, m_input, m_NA_est, m_input_raw, fnfp, fnc): distance = fnfp * n_10 + fnc * n_01 + h_current = tune_ran[0][0] + oc_current = tune_ran[0][1] for [h_i, overlapp_coeff] in tune_ran: matrix_rec_i = greedyPtreeNA( m_input.astype(bool), apprx_ordr, overlapp_coeff, h_i @@ -120,11 +165,17 @@ def Auto_fnfp(q, tune_ran, m_input, m_NA_est, m_input_raw, fnfp, fnc): matrix_rec_Temp = deleteNas(m_input_raw, matrix_rec_i) n_10 = np.sum(matrix_rec_Temp < m_input, dtype="int64") n_01 = np.sum(matrix_rec_Temp > m_input, dtype="int64") + print(procid, h_i, overlapp_coeff) distance_i = fnfp * n_10 + fnc * n_01 if distance_i < distance: matrix_recon = matrix_rec_i.copy() distance = distance_i - q.put([matrix_recon, distance]) + h_current = h_i + oc_current = overlapp_coeff + print(procid, "th process finished", q.full(), h_current, oc_current) + output_file = out_file + f"_TEMP_{procid}.CFMatrix" + WriteTfile(output_file, matrix_recon, in_file) + q.put([output_file, distance]) def deleteNas(M_in, M_out): @@ -132,10 +183,48 @@ def deleteNas(M_in, M_out): NA_position = np.argwhere(M_in == 3) for j in NA_position: M_o[j[0], j[1]] = 0 + return M_o -def greedyPtreeNew(M_input): +def deletemutations(M_in, M_out): # Finds the rows which are too far from the input, + # just replicates them to their closest neighbour. + x = M_in.shape + M_return = M_out.copy() + treshold_cut = 0.5 + dt = np.divide(sum(M_in), sum(M_out)) <= treshold_cut + for i in range(x[1]): + if dt[i] == 1: + M_return[:, i] = np.zeros(x[0]) + + return M_return + + +def precombination(M_in): + x = M_in.shape + trsh = 0.22 + M_return = M_in.copy() + for i in range(x[0]): + for j in range(i + 1, x[0]): + rij = np.sum(M_in[i, :] != M_in[j, :]) / np.sum(M_in[i, :] + M_in[j, :]) + print(rij) + if rij < trsh: + cupi = M_in[i, :] + M_in[j, :] + M_return[i, :] = cupi + M_return[j, :] = cupi + print("combined ", i, j) + return M_return + + +# Both algorithms take inut matrices of BOOL type. +def greedyPtreeNew( + M_input, +): # very greedy algorithm that constructs a ptree matrix from M_inputs by adding 1's + # M_input has to be of type BOOLEAN ! 
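+    # Strategy: columns are visited from the largest column sum to the
+    # smallest; each pivot column is united with every column that has a
+    # non-empty intersection with the growing union, so 0s that violate the
+    # perfect-phylogeny structure are flipped to 1 and reported in bret as
+    # assumed false negatives.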
+ # Returns + # M_copy(reconstructed matrix), + # bret (list of positions of assumed false negatives) + # M_apprx=greedy_row_rec(M_input)[1] M_copy = M_input.copy() ISet1 = np.argsort(sum(M_copy)) @@ -143,21 +232,27 @@ def greedyPtreeNew(M_input): for i in range(M_copy.shape[1]): ISet.append(ISet1[i]) - bret = [] + bret = [] # Location of detected false negatives + print(M_copy.shape, len(ISet)) while len(ISet) > 1: - pivot_index = ISet[-1] + pivot_index = ISet[-1] # index of the pivot vector Sremaining = ISet.copy() - pivot_vector = M_copy[:, pivot_index] - cum_vector = np.copy(pivot_vector) + pivot_vector = M_copy[ + :, pivot_index + ] # vector used for pivoting the current iteration + cum_vector = np.copy(pivot_vector) # holds the union vector while_cont = 1 - while while_cont == 1: + while while_cont == 1: # main loop while_cont = 0 for j in Sremaining: - cap_j = cum_vector * M_copy[:, j] + cap_j = ( + cum_vector * M_copy[:, j] + ) # intersection of the pivot and the jth column - if np.any(cap_j): + if np.any(cap_j): # continue as long as there is a column + # having non-empty intersection cum_vector = cum_vector + M_copy[:, j] while_cont = 1 Sremaining.remove(j) @@ -167,9 +262,16 @@ def greedyPtreeNew(M_input): return [bret, M_copy] -def greedyPtreeNA(M_input, approx_order, oc, hc): - overlap_coeff = oc - hist_coeff = hc +def greedyPtreeNA( + M_input, approx_order, oc, hc +): # Modified Greedy algorithm for NA and false positive values. + # Matrix M_input has to be BOOLEAN for the code to work right + # Returns: + # M_copy(reconstructed matrix), + # bret (list of positions of assumed false negatives) + # pret (list of approximate false positives) + overlap_coeff = oc # 0.1 + hist_coeff = hc # 25 M_copy = M_input.copy() ISet1 = np.argsort(approx_order) @@ -177,63 +279,86 @@ def greedyPtreeNA(M_input, approx_order, oc, hc): for i in range(M_copy.shape[1]): ISet.append(ISet1[i]) - bret = [] - pret = [] + bret = [] # Location of detected false negatives + pret = [] # Location of detected false positives while len(ISet) > 1: - pivot_index = ISet[-1] - Sremaining = ISet.copy() - pivot_vector = M_copy[:, pivot_index] + pivot_index = ISet[-1] # index of the pivot vector + Sremaining = ISet.copy() # set of indices that are not included in the union + pivot_vector = M_copy[ + :, pivot_index + ] # vector used for pivoting the current iteration + cum_vector = np.copy(pivot_vector) # holds the union vector while_cont = 1 - cum_hist = np.zeros(M_input.shape[0]) + cum_hist = np.zeros(M_input.shape[0]) # holds the histogram for the union - while while_cont == 1: + while while_cont == 1: # Continue uniting vectors until no candidate remains while_cont = 0 for j in Sremaining: cap_i = pivot_vector * M_copy[:, j] min_vec_size = np.min([np.sum(M_copy[:, j]), np.sum(pivot_vector)]) cap_size = np.sum(cap_i) - if cap_size / min_vec_size > overlap_coeff: + if ( + cap_size / min_vec_size > overlap_coeff + ): # we check if the columns have a meaningful overlap cum_hist = cum_hist + M_copy[:, j].astype(int) - while_cont = 1 - Sremaining.remove(j) + while_cont = 1 # found an overlapping vector so we keep on going + Sremaining.remove(j) # united vector is removed from the search set - cnumT = np.floor(cum_hist.max() / hist_coeff) + cnumT = np.floor( + cum_hist.max() / hist_coeff + ) # the elements that repeated few times are considered to be false positives cum_vector = cum_hist > cnumT - for j in ISet: - capj = cum_vector * M_copy[:, j] - difj = M_copy[:, j] > capj + for j in ISet: # clean up the false positives 
wrt. the established pivot + capj = cum_vector * M_copy[:, j] # intersection of union with column j + difj = M_copy[:, j] > capj # difference of column j from the union if np.sum(capj) > np.sum(difj): M_copy[:, j] = capj else: M_copy[:, j] = difj - M_copy[:, pivot_index] = cum_vector - ISet.remove(pivot_index) + M_copy[ + :, pivot_index + ] = cum_vector # correcting the false negatives in the pivot + ISet.remove(pivot_index) # removing the pivot from the search space bret = np.argwhere(M_copy.astype(int) > M_input.astype(int)) pret = np.argwhere(np.argwhere(M_copy.astype(int) < M_input.astype(int))) return [bret, pret, M_copy] -def ReadFfile(df): +def ReadFfile(filename): # reads a matrix from file and returns it in BOOL type + df = pd.read_csv(filename, sep="\t", index_col=0) M = df.values.astype(bool) return M -def ReadFileNA(df): +def ReadFile(filename): # reads a matrix from file and returns it in BOOL type + df = pd.read_csv(filename, sep="\t", index_col=0) M = df.values + return M + + +def ReadFileNA(filename): # reads the file and fills the NA with 0's. + df = pd.read_csv(filename, sep="\t", index_col=0) + M = df.values + NA_position = np.argwhere(M == 3) for j in NA_position: M[j[0], j[1]] = 0 return M.astype(bool) -def Estimated_Matrix(df): +def Estimated_Matrix( + filename, +): # Creates an estimate of the matrix such that each element is given + # the expectation wrt the column 1/0 frequencies. + df = pd.read_csv(filename, sep="\t", index_col=0) M = df.values.astype(float) + for i in range(M.shape[1]): if np.sum(M[:, i] != 3) == 0: one_ratio = 0 @@ -242,32 +367,227 @@ def Estimated_Matrix(df): for j in range(M.shape[0]): if M[j, i] == 3: M[j, i] = one_ratio + return M -def ReadFasis(df): +def WriteTfile( + filename, matrix, filename2 +): # writes matrix output as an integer matrix + df_input = pd.read_csv(filename2, sep="\t", index_col=0) + matrix_output = matrix.astype(int) + df_output = pd.DataFrame(matrix_output) + df_output.columns = df_input.columns + df_output.index = df_input.index + df_output.index.name = "cellIDxmutID" + df_output.to_csv(filename, sep="\t") + + +def isPtree(matrix_in): # brute force check if matrix_in is a pTree + M = matrix_in.astype(bool) + for j in range(M.shape[1]): + for i in range(j, M.shape[1]): + cap = M[:, i] * M[:, j] + cap_size = np.sum(cap) + Mi_size = np.sum(M[:, i]) + Mj_size = np.sum(M[:, j]) + if cap_size != 0: + if cap_size != Mi_size: + if cap_size != Mj_size: + return False + + print("Seems to be a PTree ...") + return True + + +def compareAD(M1, M2): # M1 is the ground truth + error_pairs = [] + n_adpairs = 0 + for i in range(M1.shape[1]): + for j in range(i, M1.shape[1]): + cap1 = M1[:, i] * M1[:, j] + cap2 = M2[:, i] * M2[:, j] + if np.sum(cap1) > 0 and np.sum(M1[:, i]) != np.sum(M1[:, j]): + n_adpairs = n_adpairs + 1 + if np.sum(cap2) == 0: + error_pairs.append([i, j]) + else: + if np.sum(M1[:, j]) > np.sum(M1[:, i]) and np.sum( + M2[:, j] + ) <= np.sum(M2[:, i]): + error_pairs.append([i, j]) + else: + if np.sum(M1[:, i]) > np.sum(M1[:, j]) and np.sum( + M2[:, i] + ) <= np.sum(M2[:, j]): + error_pairs.append([i, j]) + print( + "Number of AD pairs = ", + n_adpairs, + "errors : ", + len(error_pairs), + "AD score = ", + 1 - len(error_pairs) / n_adpairs, + ) + return error_pairs + + +def compareDF(M_orj, M_rec): + error_pairs = [] + d_pairs = 0 + for i in range(M_orj.shape[1]): + for j in range(i, M_orj.shape[1]): + cap1 = M_orj[:, i] * M_orj[:, j] + cap2 = M_rec[:, i] * M_rec[:, j] + if np.sum(cap1) == 0: + d_pairs = d_pairs + 1 + if 
np.sum(cap2) > 0: + error_pairs.append([i, j]) + print( + "Number of Diff pairs = ", + d_pairs, + "errors :", + len(error_pairs), + "score :", + 1 - len(error_pairs) / d_pairs, + ) + return + + +def ReadFasis(filename): # reads a matrix from file and returns it in BOOL type + df = pd.read_csv(filename, sep="\t", index_col=0) M = df.values return M -def postprocess_col(df_input, matrix_recons, pfn, pfp): - M_noisy = ReadFasis(df_input.copy()) - M_nds = matrix_recons.copy().astype(bool) - Mtemp = c_m_col(ReadFasis(df_input.copy()), M_nds, pc_fn=pfn, pc_fp=pfp) +def compute_fnfp(M_n, M_r): + n_01 = 0 + n_10 = 0 + n_1 = 0 + n_0 = 0 + for x in np.argwhere(M_n < 3): + if M_r[x[0], x[1]] == 0: + n_0 = n_0 + 1 + if M_r[x[0], x[1]] == 1: + n_1 = n_1 + 1 + if M_n[x[0], x[1]] > M_r[x[0], x[1]]: + + n_10 = n_10 + 1 + + if M_n[x[0], x[1]] < M_r[x[0], x[1]]: + n_01 = n_01 + 1 + n_1 = n_1 + 1 + print("computed fn :", n_01 / n_1, " fp : ", n_10 / n_0) + return [n_01 / n_1, n_10 / n_0] + + +def find_dist(node_piv, M_samples): + M_nodes = M_samples.copy() + for j in range(M_nodes.shape[1]): + if node_piv[j] == 3: + node_piv[j] = 0 + M_nodes[:, j] = 0 * M_nodes[:, j] + distances = np.zeros(M_nodes.shape[0]) + for i in range(M_nodes.shape[0]): + d_10 = np.sum(node_piv > M_nodes[i, :], dtype="int64") + d_01 = np.sum(node_piv < M_nodes[i, :], dtype="int64") + distances[i] = np.square(d_10) + d_01 + + return distances + + +def find_dist_col(node_piv, M_samples): + M_nodes = M_samples.copy() + for j in range(M_nodes.shape[0]): + if node_piv[j] == 3: + node_piv[j] = 0 + M_nodes[j, :] = 0 * M_nodes[j, :] + distances = np.zeros(M_nodes.shape[1]) + for i in range(M_nodes.shape[1]): + d_10 = np.sum(node_piv > M_nodes[:, i], dtype="int64") + d_01 = np.sum(node_piv < M_nodes[:, i], dtype="int64") + # distances[i]=np.square(d_10) + d_01 + # distances[i]=d_10 + d_01 + distances[i] = prfp * d_10 + d_01 + # distances[i]=np.sum(M_nodes[:,i]*node_piv)/np.sqrt(np.sum(M_nodes[:,i])*np.sum(node_piv)) + distances[i] = -((0.005) ** d_10) * (0.1) ** d_01 + return distances + + +def closest_matrix(M_input, M_nodes, M_rec): + M_out = M_input.copy() + for i in range(M_input.shape[0]): + pivot_v = M_input[i, :] + distance_i = find_dist(pivot_v, M_nodes) + + min_index = np.argmin(distance_i) + M_out[i, :] = M_nodes[min_index, :] + return M_out + + +def closest_matrix_col(M_input, M_nodes, M_rec): + M_out = M_input.copy() + for i in range(M_input.shape[1]): + pivot_v = M_input[:, i] + distance_i = find_dist_col(pivot_v, M_nodes) + + min_index = np.argmin(distance_i) + M_out[:, i] = M_nodes[:, min_index] + + return M_out + + +def postprocess_col(input_file, out_file, pfn, pfp): + s = time.time() + + M_noisy = ReadFasis(input_file) + M_n_copy = M_noisy.copy() + M_nds = ReadFfile(out_file) + + Mtemp = c_m_col(ReadFasis(input_file), M_nds, pc_fn=pfn, pc_fp=pfp) Mtemp2 = Mtemp.copy() d10min = np.sum(Mtemp < (M_noisy == 1)) + d10c = d10min imp = 1 while imp: - Mtemp2 = c_m_row(ReadFasis(df_input.copy()), Mtemp2, pc_fn=pfn, pc_fp=pfp) - Mtemp2 = c_m_col(ReadFasis(df_input.copy()), Mtemp2, pc_fn=pfn, pc_fp=pfp) + Mtemp2 = c_m_row(ReadFasis(input_file), Mtemp2, pc_fn=pfn, pc_fp=pfp) + Mtemp2 = c_m_col(ReadFasis(input_file), Mtemp2, pc_fn=pfn, pc_fp=pfp) d10c = np.sum(Mtemp2 < (M_noisy == 1)) + print(d10c) if d10c < d10min: d10min = d10c Mtemp = Mtemp2.copy() else: imp = 0 - return Mtemp + + M_postprocessed = Mtemp + processed_file = out_file[:-13] + ".CFMatrix" + WriteTfile(processed_file, M_postprocessed, input_file) + e = time.time() + print( + 
"Postprocessed 1->0 : ", + np.sum(M_postprocessed < (M_n_copy == 1)), + " 0->1 : ", + np.sum(M_postprocessed > M_n_copy), + " NA->1 : ", + np.sum(M_postprocessed > (M_n_copy == 1)) - np.sum(M_postprocessed > M_n_copy), + ) + print("Post processing time : ", e - s) + + +def preproc_row(M_o, c=0.8): + M_pre = M_o.copy() + for i in range(M_o.shape[0]): + for j in range(i + 1, M_o.shape[0]): + prdct = np.sum(M_o[i, :] * M_o[j, :]) / np.sqrt( + np.sum(M_o[i, :]) * np.sum(M_o[j, :]) + ) + if prdct > c: + print(i, j) + M_pre[i, :] = M_o[i, :] + M_o[j, :] + M_pre[j, :] = M_o[i, :] + M_o[j, :] + return M_pre def f_d_col(node_piv, M_samples, p_fp=0.005, p_fn=0.1): @@ -276,39 +596,85 @@ def f_d_col(node_piv, M_samples, p_fp=0.005, p_fn=0.1): D11 = (M_nodes.T == 1).astype(int).dot(node_piv == 1) D00 = (M_nodes.T == 0).astype(int).dot(node_piv == 0) D01 = (M_nodes.T == 1).astype(int).dot(node_piv == 0) - distances = -np.multiply(np.power(p_fp, D10), np.power(1 - p_fp, D00)) - distances = np.multiply(distances, np.power(p_fn, D01)) - distances = np.multiply(distances, np.power(1 - p_fn, D11)) + distances = -( + np.multiply(np.log(p_fp), D10) + + np.multiply(np.log(1 - p_fp), D00) + + np.multiply(np.log(p_fn), D01) + + np.multiply(np.log(1 - p_fn), D11) + ) + # For large matrices the probability becomes too small, + # taking log to prevent precision issues + + return distances + + +def f_d_row(node_piv, M_samples, p_fp=0.005, p_fn=0.1): + M_nodes = M_samples.copy() + + distances = np.zeros(M_nodes.shape[0]) + D10 = (M_nodes == 0).astype(int).dot(node_piv == 1) + D11 = (M_nodes == 1).astype(int).dot(node_piv == 1) + D00 = (M_nodes == 0).astype(int).dot(node_piv == 0) + D01 = (M_nodes == 1).astype(int).dot(node_piv == 0) + + distances = -( + np.multiply(np.log(p_fp), D10) + + np.multiply(np.log(1 - p_fp), D00) + + np.multiply(np.log(p_fn), D01) + + np.multiply(np.log(1 - p_fn), D11) + ) + # For large matrices the probability becomes too small, + # taking log to prevent precision issues return distances def c_m_col(M_input, M_nodes, pc_fp=0.0001, pc_fn=0.1): M_out = M_input.copy() + for i in range(M_input.shape[1]): pivot_v = M_input[:, i] + distance_i = f_d_col(pivot_v, M_nodes, p_fn=pc_fn, p_fp=pc_fp) + min_index = np.argmin(distance_i) M_out[:, i] = M_nodes[:, min_index] - return M_out - -def f_d_row(node_piv, M_samples, p_fp=0.005, p_fn=0.1): - M_nodes = M_samples.copy() - D10 = (M_nodes == 0).astype(int).dot(node_piv == 1) - D11 = (M_nodes == 1).astype(int).dot(node_piv == 1) - D00 = (M_nodes == 0).astype(int).dot(node_piv == 0) - D01 = (M_nodes == 1).astype(int).dot(node_piv == 0) - distances = -np.multiply(np.power(p_fp, D10), np.power(1 - p_fp, D00)) - distances = np.multiply(distances, np.power(p_fn, D01)) - distances = np.multiply(distances, np.power(1 - p_fn, D11)) - return distances + return M_out def c_m_row(M_input, M_nodes, pc_fp=0.0001, pc_fn=0.1): M_out = M_input.copy() + for i in range(M_input.shape[0]): pivot_v = M_input[i, :] distance_i = f_d_row(pivot_v, M_nodes, p_fn=pc_fn, p_fp=pc_fp) + min_index = np.argmin(distance_i) M_out[i, :] = M_nodes[min_index, :] + return M_out + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("inputfile") + parser.add_argument("outputfile") + parser.add_argument("--nofcpus", default=mp.cpu_count(), type=int, nargs="?") + parser.add_argument("--algorithmchoice", default="FPNA", nargs="?") + parser.add_argument("--fn_fpratio", default=51, type=int, nargs="?") + parser.add_argument("--fp_coeff", default=0.00001, 
type=float, nargs="?") + parser.add_argument("--fn_coeff", default=0.1, type=float, nargs="?") + args = parser.parse_args() + + fn_conorm = 0.1 + fp_conorm = fn_conorm * args.fp_coeff / args.fn_coeff + + Reconstruct( + args.inputfile, + args.outputfile, + Algchoice=args.algorithmchoice, + n_proc=args.nofcpus, + fnfp=args.fn_fpratio, + post_fn=fn_conorm, + post_fp=fp_conorm, + ) diff --git a/trisicell/ul/__init__.py b/trisicell/ul/__init__.py index 7f8384a..a865e9b 100644 --- a/trisicell/ul/__init__.py +++ b/trisicell/ul/__init__.py @@ -24,7 +24,7 @@ to_tree, ) from trisicell.ul._utils import ( - calc_score_tree, + calc_nll_matrix, cleanup, count_flips, dir_base, @@ -67,7 +67,7 @@ to_cfmatrix, to_mtree, to_tree, - calc_score_tree, + calc_nll_matrix, cleanup, dir_base, dirbase, diff --git a/trisicell/ul/_utils.py b/trisicell/ul/_utils.py index 63e3b1c..fdbb9dd 100644 --- a/trisicell/ul/_utils.py +++ b/trisicell/ul/_utils.py @@ -73,7 +73,7 @@ def log_flip(df_in, df_out): tsc.logg.info(f"rates -- NA: {na_rate:.3f}") -def calc_score_tree(df_in, df_out, alpha, beta): +def calc_nll_matrix(df_in, df_out, alpha, beta): if alpha == 0 or beta == 0: return None columns = np.intersect1d(df_in.columns, df_out.columns) @@ -98,36 +98,63 @@ def calc_score_tree(df_in, df_out, alpha, beta): objective -= numZeros * np.log(1 - alpha) + numOnes * ( np.log(alpha) + np.log((1 - beta) / alpha) ) - tsc.logg.info(f"score -- NLL: {-objective}") - return None + return -objective def stat(df_in, df_out, alpha, beta, running_time): log_input(df_in) log_output(df_out, running_time) log_flip(df_in, df_out) - calc_score_tree(df_in, df_out, alpha, beta) + nll = calc_nll_matrix(df_in, df_out, alpha, beta) + tsc.logg.info(f"score -- NLL: {nll}") def get_param(filename): + def _get_param_helper(param): + try: + value = basename.split(f"{param}_")[1] + if "-" in value: + value = value.split("-")[0] + else: + value = value.split(".")[0] + return float(value) if "." in value else int(value) + except IndexError: + return None + data = {} _, basename = dir_base(filename) - data["simNo"] = int(basename.split("-")[0].split("_")[1]) - data["s"] = int(basename.split("-")[1].split("_")[1]) - data["m"] = int(basename.split("-")[2].split("_")[1]) - data["h"] = int(basename.split("-")[3].split("_")[1]) - data["minVAF"] = float(basename.split("-")[4].split("_")[1]) - data["ISAV"] = int(basename.split("-")[5].split("_")[1]) - data["n"] = int(basename.split("-")[6].split("_")[1]) - data["fp"] = float(basename.split("-")[7].split("_")[1]) - data["fn"] = float(basename.split("-")[8].split("_")[1]) - data["na"] = float(basename.split("-")[9].split("_")[1]) - data["d"] = float(basename.split("-")[10].split("_")[1]) - last = basename.split("-")[11] - if "." 
in last:
-        data["l"] = int(last.split(".")[0].split("_")[1])
-    else:
-        data["l"] = int(last.split("_")[1])
+    for param in [
+        "simNo",
+        "s",
+        "m",
+        "h",
+        "minVAF",
+        "ISAV",
+        "n",
+        "fp",
+        "fn",
+        "na",
+        "d",
+        "l",
+    ]:
+        value = _get_param_helper(param)
+        if value is not None:
+            data[param] = value
     return data
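+
+
+# Example of the tolerant parsing above (hypothetical filename):
+#
+#     get_param(
+#         "simNo_1-s_7-m_20-h_1-minVAF_0.1-ISAV_0"
+#         "-n_100-fp_0.0001-fn_0.1-na_0-d_0-l_1000.SC"
+#     )
+#
+# returns {"simNo": 1, "s": 7, "m": 20, "h": 1, "minVAF": 0.1, "ISAV": 0,
+# "n": 100, "fp": 0.0001, "fn": 0.1, "na": 0, "d": 0, "l": 1000}; any tag
+# missing from the filename is simply omitted from the result.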