diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 00000000..ea27a584 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,27 @@ +{ + "name": "nfcore", + "image": "nfcore/gitpod:latest", + "remoteUser": "gitpod", + + // Configure tool-specific properties. + "customizations": { + // Configure properties specific to VS Code. + "vscode": { + // Set *default* container specific settings.json values on container create. + "settings": { + "python.defaultInterpreterPath": "/opt/conda/bin/python", + "python.linting.enabled": true, + "python.linting.pylintEnabled": true, + "python.formatting.autopep8Path": "/opt/conda/bin/autopep8", + "python.formatting.yapfPath": "/opt/conda/bin/yapf", + "python.linting.flake8Path": "/opt/conda/bin/flake8", + "python.linting.pycodestylePath": "/opt/conda/bin/pycodestyle", + "python.linting.pydocstylePath": "/opt/conda/bin/pydocstyle", + "python.linting.pylintPath": "/opt/conda/bin/pylint" + }, + + // Add the IDs of extensions you want installed when the container is created. + "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"] + } + } +} diff --git a/.editorconfig b/.editorconfig index b6b31907..b78de6e6 100644 --- a/.editorconfig +++ b/.editorconfig @@ -8,7 +8,7 @@ trim_trailing_whitespace = true indent_size = 4 indent_style = space -[*.{md,yml,yaml,html,css,scss,js}] +[*.{md,yml,yaml,html,css,scss,js,cff}] indent_size = 2 # These files are edited and tested upstream in nf-core/modules diff --git a/.gitattributes b/.gitattributes index 050bb120..7a2dabc2 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,3 +1,4 @@ *.config linguist-language=nextflow +*.nf.test linguist-language=nextflow modules/nf-core/** linguist-generated subworkflows/nf-core/** linguist-generated diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index c84e9f3c..12d0b7ac 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -101,3 +101,19 @@ If you are using a new feature from core Nextflow, you may bump the minimum requ ### Images and figures For overview images and other documents we follow the nf-core [style guidelines and examples](https://nf-co.re/developers/design_guidelines). + +## GitHub Codespaces + +This repo includes a devcontainer configuration which will create a GitHub Codespaces for Nextflow development! This is an online developer environment that runs in your browser, complete with VSCode and a terminal. + +To get started: + +- Open the repo in [Codespaces](https://github.com/nf-core/circdna/codespaces) +- Tools installed + - nf-core + - Nextflow + +Devcontainer specs: + +- [DevContainer config](.devcontainer/devcontainer.json) +- [Dockerfile](.devcontainer/Dockerfile) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 0e64d1af..a6ad9632 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -42,7 +42,7 @@ body: attributes: label: System information description: | - * Nextflow version _(eg. 21.10.3)_ + * Nextflow version _(eg. 22.10.1)_ * Hardware _(eg. HPC, Desktop, Cloud)_ * Executor _(eg. slurm, local, awsbatch)_ * Container engine: _(e.g. 
Docker, Singularity, Conda, Podman, Shifter or Charliecloud)_ diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 021b5a1f..779cdf2e 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -15,8 +15,7 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/circ - [ ] This comment contains a description of changes (with reason). - [ ] If you've fixed a bug or added code that should be tested, add tests! - - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/circdna/tree/master/.github/CONTRIBUTING.md) - - [ ] If necessary, also make a PR on the nf-core/circdna _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. +- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/circdna/tree/master/.github/CONTRIBUTING.md)- [ ] If necessary, also make a PR on the nf-core/circdna _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. - [ ] Make sure your code lints (`nf-core lint`). - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir `). - [ ] Usage Documentation in `docs/usage.md` is updated. diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index e23b6848..bdf169ee 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -27,3 +27,7 @@ jobs: "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/circdna/results-${{ github.sha }}" } profiles: test_full,aws_tower + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: tower_action_*.log diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index 8764760a..641e981e 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -23,3 +23,7 @@ jobs: "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/circdna/results-test-${{ github.sha }}" } profiles: test,aws_tower + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: tower_action_*.log diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 37c3b866..c5425869 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,7 +10,10 @@ on: env: NXF_ANSI_LOG: false - CAPSULE_LOG: none + +concurrency: + group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}" + cancel-in-progress: true jobs: test: @@ -20,27 +23,17 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - # Nextflow versions - include: - # Test pipeline minimum Nextflow version - - NXF_VER: "21.10.3" - NXF_EDGE: "" - # Test latest edge release of Nextflow - - NXF_VER: "" - NXF_EDGE: "1" + NXF_VER: + - "22.10.1" + - "latest-everything" steps: - name: Check out pipeline code - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Install Nextflow - env: - NXF_VER: ${{ matrix.NXF_VER }} - # Uncomment only if the edge release is more recent than the latest stable release - # See https://github.com/nextflow-io/nextflow/issues/2467 - # NXF_EDGE: ${{ matrix.NXF_EDGE }} - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ + uses: nf-core/setup-nextflow@v1 + with: + version: "${{ matrix.NXF_VER }}" - name: Run pipeline with test data # For example: adding multiple test runs with different parameters @@ -48,6 +41,54 @@ jobs: run: | nextflow run ${GITHUB_WORKSPACE} -profile 
test,docker --outdir ./results + test_keep_duplicates: + name: Run pipeline with test data, but remove marked duplicates + # Only run on push if this is the nf-core dev branch (merged PRs) + if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/circdna') }}" + runs-on: ubuntu-latest + strategy: + matrix: + NXF_VER: + - "22.10.1" + - "latest-everything" + steps: + - name: Check out pipeline code + uses: actions/checkout@v2 + + - name: Install Nextflow + uses: nf-core/setup-nextflow@v1 + with: + version: "${{ matrix.NXF_VER }}" + - name: Run pipeline with test data, but remove marked duplicates + # For example: adding multiple test runs with different parameters + # Remember that you can parallelise this by using strategy.matrix + run: | + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results --keep_duplicates false + + test_skip_markduplicates: + name: Run pipeline with test data, but remove marked duplicates + # Only run on push if this is the nf-core dev branch (merged PRs) + if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/circdna') }}" + runs-on: ubuntu-latest + strategy: + matrix: + NXF_VER: + - "22.10.1" + - "latest-everything" + steps: + - name: Check out pipeline code + uses: actions/checkout@v2 + + - name: Install Nextflow + uses: nf-core/setup-nextflow@v1 + with: + version: "${{ matrix.NXF_VER }}" + - name: Run pipeline with test data, but remove marked duplicates + # For example: adding multiple test runs with different parameters + # Remember that you can parallelise this by using strategy.matrix + run: | + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results --skip_markduplicates + ampliconarchitect: if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/circdna') }}" runs-on: ubuntu-latest diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix-linting.yml index 9efbc47f..8de169a4 100644 --- a/.github/workflows/fix-linting.yml +++ b/.github/workflows/fix-linting.yml @@ -24,7 +24,7 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }} - - uses: actions/setup-node@v2 + - uses: actions/setup-node@v3 - name: Install Prettier run: npm install -g prettier @prettier/plugin-php @@ -34,9 +34,9 @@ jobs: id: prettier_status run: | if prettier --check ${GITHUB_WORKSPACE}; then - echo "::set-output name=result::pass" + echo "result=pass" >> $GITHUB_OUTPUT else - echo "::set-output name=result::fail" + echo "result=fail" >> $GITHUB_OUTPUT fi - name: Run 'prettier --write' diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 77358dee..858d622e 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -4,6 +4,8 @@ name: nf-core linting # that the code meets the nf-core guidelines. 
on: push: + branches: + - dev pull_request: release: types: [published] @@ -12,9 +14,9 @@ jobs: EditorConfig: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - - uses: actions/setup-node@v2 + - uses: actions/setup-node@v3 - name: Install editorconfig-checker run: npm install -g editorconfig-checker @@ -25,9 +27,9 @@ jobs: Prettier: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - - uses: actions/setup-node@v2 + - uses: actions/setup-node@v3 - name: Install Prettier run: npm install -g prettier @@ -35,22 +37,48 @@ jobs: - name: Run Prettier --check run: prettier --check ${GITHUB_WORKSPACE} + PythonBlack: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Check code lints with Black + uses: psf/black@stable + + # If the above check failed, post a comment on the PR explaining the failure + - name: Post PR comment + if: failure() + uses: mshick/add-pr-comment@v1 + with: + message: | + ## Python linting (`black`) is failing + + To keep the code consistent with lots of contributors, we run automated code consistency checks. + To fix this CI test, please run: + + * Install [`black`](https://black.readthedocs.io/en/stable/): `pip install black` + * Fix formatting errors in your pipeline: `black .` + + Once you push these changes the test should pass, and you can hide this comment :+1: + + We highly recommend setting up Black in your code editor so that this formatting is done automatically on save. Ask about it on Slack for help! + + Thanks again for your contribution! + repo-token: ${{ secrets.GITHUB_TOKEN }} + allow-repeats: false + nf-core: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Install Nextflow - env: - CAPSULE_LOG: none - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ + uses: nf-core/setup-nextflow@v1 - - uses: actions/setup-python@v3 + - uses: actions/setup-python@v4 with: - python-version: "3.6" + python-version: "3.7" architecture: "x64" - name: Install dependencies @@ -71,7 +99,7 @@ jobs: - name: Upload linting log file artifact if: ${{ always() }} - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: linting-logs path: | diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index 04758f61..0bbcd30f 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -18,7 +18,7 @@ jobs: - name: Get PR number id: pr_number - run: echo "::set-output name=pr_number::$(cat linting-logs/PR_number.txt)" + run: echo "pr_number=$(cat linting-logs/PR_number.txt)" >> $GITHUB_OUTPUT - name: Post PR comment uses: marocchino/sticky-pull-request-comment@v2 diff --git a/.nf-core.yml b/.nf-core.yml index 605cce5e..003d5d23 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -7,6 +7,6 @@ lint: - assets/email_template.html update: - nf-core/modules: + nf-core: samtools/sort: "1ad73f1b2abdea9398680d6d20014838135c9a35" samtools/index: "1ad73f1b2abdea9398680d6d20014838135c9a35" diff --git a/.prettierignore b/.prettierignore index d0e7ae58..437d763d 100644 --- a/.prettierignore +++ b/.prettierignore @@ -1,4 +1,6 @@ email_template.html +adaptivecard.json +slackreport.json .nextflow* work/ data/ @@ -7,3 +9,4 @@ results/ testing/ testing* *.pyc +bin/ diff --git a/CHANGELOG.md b/CHANGELOG.md index fdb03553..5c3bd6bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,29 @@ The format is based on 
[Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## v1.0.2 - [2023-03-07] + +### `Added` + +- ampliconclassifier/makeinput module added -> Generates the input file used for ampliconclassifier functions +- ampliconclassifier/makeresultstable added -> Generates results table from AmpliconArchitect and AmpliconClassifier +- CNN Reference File For AmpliconArchitect +- mm10 option for AmpliconArchitect +- stub runs for AmpliconArchitect processes +- New module versions +- nf-core template 2.7.2 + +### `Fixed` + +- Fixed ZeroDivisionError by Circle-Map +- Fixed keep_duplicates and skip_markduplicates parameter bug + +### `Dependencies` + +### `Deprecated` + +- AmpliconArchitect Summary Process was deprecated + ## v1.0.1 - [2022-06-22] ### `Added` diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000..4533e2f2 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,56 @@ +cff-version: 1.2.0 +message: "If you use `nf-core tools` in your work, please cite the `nf-core` publication" +authors: + - family-names: Ewels + given-names: Philip + - family-names: Peltzer + given-names: Alexander + - family-names: Fillinger + given-names: Sven + - family-names: Patel + given-names: Harshil + - family-names: Alneberg + given-names: Johannes + - family-names: Wilm + given-names: Andreas + - family-names: Ulysse Garcia + given-names: Maxime + - family-names: Di Tommaso + given-names: Paolo + - family-names: Nahnsen + given-names: Sven +title: "The nf-core framework for community-curated bioinformatics pipelines." +version: 2.4.1 +doi: 10.1038/s41587-020-0439-x +date-released: 2022-05-16 +url: https://github.com/nf-core/tools +prefered-citation: + type: article + authors: + - family-names: Ewels + given-names: Philip + - family-names: Peltzer + given-names: Alexander + - family-names: Fillinger + given-names: Sven + - family-names: Patel + given-names: Harshil + - family-names: Alneberg + given-names: Johannes + - family-names: Wilm + given-names: Andreas + - family-names: Ulysse Garcia + given-names: Maxime + - family-names: Di Tommaso + given-names: Paolo + - family-names: Nahnsen + given-names: Sven + doi: 10.1038/s41587-020-0439-x + journal: nature biotechnology + start: 276 + end: 278 + title: "The nf-core framework for community-curated bioinformatics pipelines." + issue: 3 + volume: 38 + year: 2020 + url: https://dx.doi.org/10.1038/s41587-020-0439-x diff --git a/CITATIONS.md b/CITATIONS.md index 539f8c27..da0b48b9 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -36,7 +36,11 @@ > Wick RR, Judd LM, Gorrie CL, Holt KE. Unicycler: Resolving bacterial genome assemblies from short and long sequencing reads. PLoS Comput Biol. 2017 Jun 8;13(6):e1005595. doi: 10.1371/journal.pcbi.1005595. PMID: 28594827; PMCID: PMC5481147. -- [PrepareAA](https://github.com/jluebeck/PrepareAA) +- [CNVKit](https://github.com/etal/cnvkit) + + > Talevich E, Shain AH, Botton T, Bastian BC. Cnvkit: genome-wide copy number detection and visualization from targeted dna sequencing. PLoS Comput Biol. 2016;12(4):e1004873. doi: 10.1371/journal.pcbi.1004873. PMID: 27100738; PMCID: PMC4839673. + +- [PrepareAA](https://github.com/jluebeck/AmpliconSuite-pipeline) - [AmpliconArchitect](https://github.com/virajbdeshpande/AmpliconArchitect) @@ -44,6 +48,8 @@ - [AmpliconClassifier](https://github.com/jluebeck/AmpliconClassifier) + > Luebeck, Jens, Alvin Wei Tian Ng, Patricia C. Galipeau, Xiaohong Li, Carissa A. 
Sanchez, Annalise Katz-Summercorn, Hoon Kim et al. "Extrachromosomal DNA in the cancerous transformation of Barrett's esophagus." bioRxiv (2022): 2022-07. + - [Samblaster](https://github.com/GregoryFaust/samblaster) > Faust GG, Hall IM. SAMBLASTER: fast duplicate marking and structural variant read extraction. Bioinformatics. 2014 Sep 1;30(17):2503-5. doi: 10.1093/bioinformatics/btu314. Epub 2014 May 7. PMID: 24812344; PMCID: PMC4147885. diff --git a/README.md b/README.md index b4a26ef1..15294573 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,14 @@ # ![nf-core/circdna](docs/images/nf-core-circdna_logo_light.png#gh-light-mode-only) ![nf-core/circdna](docs/images/nf-core-circdna_logo_dark.png#gh-dark-mode-only) -[![GitHub Actions CI Status](https://github.com/nf-core/circdna/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/circdna/actions?query=workflow%3A%22nf-core+CI%22) -[![GitHub Actions Linting Status](https://github.com/nf-core/circdna/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/circdna/actions?query=workflow%3A%22nf-core+linting%22) -[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?logo=Amazon%20AWS)](https://nf-co.re/circdna/results) -[![Cite with Zenodo](https://zenodo.org/badge/DOI/10.5281/zenodo.6685250.svg)](https://doi.org/10.5281/zenodo.6685250) +[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/circdna/results) [![Cite with Zenodo](https://zenodo.org/badge/DOI/10.5281/zenodo.6685250.svg)](https://doi.org/10.5281/zenodo.6685250) -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.10.3-23aa62.svg?labelColor=000000)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.10.1-23aa62.svg)](https://www.nextflow.io/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg)](https://sylabs.io/docs/) [![Launch on Nextflow Tower](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Nextflow%20Tower-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/nf-core/circdna) -[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23circdna-4A154B?logo=slack)](https://nfcore.slack.com/channels/circdna) -[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?logo=twitter)](https://twitter.com/nf_core) -[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?logo=youtube)](https://www.youtube.com/c/nf-core) +[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23circdna-4A154B?logo=slack)](https://nfcore.slack.com/channels/circdna) [![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?logo=twitter)](https://twitter.com/nf_core) [![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?logo=youtube)](https://www.youtube.com/c/nf-core) ## Introduction @@ -21,7 +16,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. 
The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! -On release, automated continuous integration tests run the pipeline on a [full-sized dataset](https://github.com/nf-core/test-datasets/tree/circdna) on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/circdna/results). +On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources.The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/circdna/results). ## Pipeline summary @@ -34,7 +29,7 @@ On release, automated continuous integration tests run the pipeline on a [full-s 1. [`Circle-Map ReadExtractor`](https://github.com/iprada/Circle-Map) -> [`Circle-Map Realign`](https://github.com/iprada/Circle-Map) 1. [`Circle-Map ReadExtractor`](https://github.com/iprada/Circle-Map) -> [`Circle-Map Repeats`](https://github.com/iprada/Circle-Map) 1. [`CIRCexplorer2`](https://circexplorer2.readthedocs.io/en/latest/) - 1. [`Samblaster`](https://github.com/GregoryFaust/samblaster) -> [`Circle_finder`](https://github.com/pk7zuva/Circle_finder) + 1. [`Samblaster`](https://github.com/GregoryFaust/samblaster) -> [`Circle_finder`](https://github.com/pk7zuva/Circle_finder) **Does not use filtered BAM file, specificied with --keep_duplicates false** 1. Identification of circular amplicons [`AmpliconArchitect`](https://github.com/jluebeck/AmpliconArchitect) 1. DeNovo Assembly of circular DNAs [`Unicycler`](https://github.com/rrwick/Unicycler) -> [`Minimap2`](https://github.com/lh3/minimap2) 7. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) @@ -49,19 +44,19 @@ A graphical view of the pipeline and its diverse branches can be seen below. ## Quick Start -1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=21.10.3`) +1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=22.10.1`) 2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) (you can follow [this tutorial](https://singularity-tutorial.github.io/01-installation/)), [`Podman`](https://podman.io/), [`Shifter`](https://nersc.gitlab.io/development/shifter/how-to-use/) or [`Charliecloud`](https://hpc.github.io/charliecloud/) for full pipeline reproducibility _(you can use [`Conda`](https://conda.io/miniconda.html) both to install Nextflow itself and also to manage software within pipelines. 
Please only use it within pipelines as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_. 3. Download the pipeline and test it on a minimal dataset with a single command: - ```console + ```bash nextflow run nf-core/circdna -profile test,YOURPROFILE --outdir ``` To test the `ampliconarchitect` branch functionality, please use the following command: - ```console + ```bash nextflow run nf-core/circdna -profile test_AA,YOURPROFILE --outdir ``` @@ -74,7 +69,7 @@ A graphical view of the pipeline and its diverse branches can be seen below. 4. Start running your own analysis! - ```console + ```bash nextflow run nf-core/circdna --input samplesheet.csv --outdir --genome GRCh38 -profile ``` @@ -104,7 +99,7 @@ Please specify the parameter `circle_identifier` depending on the pipeline branc The user can specify either one or multiple `circle_identifier` in a comma-separated string (see below). -```console +```bash nextflow run nf-core/circdna --input samplesheet.csv --outdir --genome GRCh38 -profile docker --circle_identifier circle_map_realign,unicycler ``` @@ -134,8 +129,7 @@ For further information or help, don't hesitate to get in touch on the [Slack `# ## Citations - - +If you use nf-core/circdna for your analysis, please cite it using the following doi: [10.5281/zenodo.6685250](https://doi.org/10.5281/zenodo.6685250) An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. diff --git a/assets/adaptivecard.json b/assets/adaptivecard.json new file mode 100644 index 00000000..43f9e1ee --- /dev/null +++ b/assets/adaptivecard.json @@ -0,0 +1,67 @@ +{ + "type": "message", + "attachments": [ + { + "contentType": "application/vnd.microsoft.card.adaptive", + "contentUrl": null, + "content": { + "\$schema": "http://adaptivecards.io/schemas/adaptive-card.json", + "msteams": { + "width": "Full" + }, + "type": "AdaptiveCard", + "version": "1.2", + "body": [ + { + "type": "TextBlock", + "size": "Large", + "weight": "Bolder", + "color": "<% if (success) { %>Good<% } else { %>Attention<%} %>", + "text": "nf-core/circdna v${version} - ${runName}", + "wrap": true + }, + { + "type": "TextBlock", + "spacing": "None", + "text": "Completed at ${dateComplete} (duration: ${duration})", + "isSubtle": true, + "wrap": true + }, + { + "type": "TextBlock", + "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors. 
The full error message was: ${errorReport}.<% } %>", + "wrap": true + }, + { + "type": "TextBlock", + "text": "The command used to launch the workflow was as follows:", + "wrap": true + }, + { + "type": "TextBlock", + "text": "${commandLine}", + "isSubtle": true, + "wrap": true + } + ], + "actions": [ + { + "type": "Action.ShowCard", + "title": "Pipeline Configuration", + "card": { + "type": "AdaptiveCard", + "\$schema": "http://adaptivecards.io/schemas/adaptive-card.json", + "body": [ + { + "type": "FactSet", + "facts": [<% out << summary.collect{ k,v -> "{\"title\": \"$k\", \"value\" : \"$v\"}"}.join(",\n") %> + ] + } + ] + } + } + ] + } + } + ] +} diff --git a/assets/email_template.txt b/assets/email_template.txt index 548364a7..19d70481 100644 --- a/assets/email_template.txt +++ b/assets/email_template.txt @@ -6,7 +6,6 @@ `._,._,' nf-core/circdna v${version} ---------------------------------------------------- - Run Name: $runName <% if (success){ diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml new file mode 100644 index 00000000..418eb9b5 --- /dev/null +++ b/assets/methods_description_template.yml @@ -0,0 +1,25 @@ +id: "nf-core-circdna-methods-description" +description: "Suggested text and references to use when describing pipeline usage within the methods section of a publication." +section_name: "nf-core/circdna Methods Description" +section_href: "https://github.com/nf-core/circdna" +plot_type: "html" +## TODO nf-core: Update the HTML below to your prefered methods description, e.g. add publication citation for this pipeline +## You inject any metadata in the Nextflow '${workflow}' object +data: | +

+  <h4>Methods</h4>
+  <p>Data was processed using nf-core/circdna v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (Ewels et al., 2020).</p>
+  <p>The pipeline was executed with Nextflow v${workflow.nextflow.version} (Di Tommaso et al., 2017) with the following command:</p>
+  <pre><code>${workflow.commandLine}</code></pre>
+  <h4>References</h4>
+  <ul>
+    <li>Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316-319. https://doi.org/10.1038/nbt.3820</li>
+    <li>Ewels, P. A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M. U., Di Tommaso, P., & Nahnsen, S. (2020). The nf-core framework for community-curated bioinformatics pipelines. Nature Biotechnology, 38(3), 276-278. https://doi.org/10.1038/s41587-020-0439-x</li>
+  </ul>
+  <div class="alert alert-info">
+    <h5>Notes:</h5>
+    <ul>
+      ${nodoi_text}
+      <li>The command above does not include parameters contained in any configs or profiles that may have been used. Ensure the config file is also uploaded with your publication!</li>
+      <li>You should also cite all software used within this run. Check the "Software Versions" of this report to get version information.</li>
+    </ul>
+  </div>
diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yml similarity index 82% rename from assets/multiqc_config.yaml rename to assets/multiqc_config.yml index 90c0577a..08bed135 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yml @@ -3,9 +3,11 @@ report_comment: > analysis pipeline. For information about how to interpret these results, please see the documentation. report_section_order: - software_versions: + "nf-core-circdna-methods-description": order: -1000 - nf-core-circdna-summary: + software_versions: order: -1001 + "nf-core-circdna-summary": + order: -1002 export_plots: true diff --git a/assets/nf-core-circdna_logo_dark.png b/assets/nf-core-circdna_logo_dark.png deleted file mode 100644 index 232f32be..00000000 Binary files a/assets/nf-core-circdna_logo_dark.png and /dev/null differ diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 37881564..5f653ab7 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,2 +1,3 @@ sample,fastq_1,fastq_2 SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz +SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, diff --git a/assets/samplesheet_bam.csv b/assets/samplesheet_bam.csv deleted file mode 100644 index d2b4eabd..00000000 --- a/assets/samplesheet_bam.csv +++ /dev/null @@ -1,2 +0,0 @@ -sample,bam -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002.bam diff --git a/assets/slackreport.json b/assets/slackreport.json new file mode 100644 index 00000000..043d02f2 --- /dev/null +++ b/assets/slackreport.json @@ -0,0 +1,34 @@ +{ + "attachments": [ + { + "fallback": "Plain-text summary of the attachment.", + "color": "<% if (success) { %>good<% } else { %>danger<%} %>", + "author_name": "sanger-tol/readmapping v${version} - ${runName}", + "author_icon": "https://www.nextflow.io/docs/latest/_static/favicon.ico", + "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors<% } %>", + "fields": [ + { + "title": "Command used to launch the workflow", + "value": "```${commandLine}```", + "short": false + } + <% + if (!success) { %> + , + { + "title": "Full error message", + "value": "```${errorReport}```", + "short": false + }, + { + "title": "Pipeline configuration", + "value": "<% out << summary.collect{ k,v -> k == "hook_url" ? "_${k}_: (_hidden_)" : ( ( v.class.toString().contains('Path') || ( v.class.toString().contains('String') && v.contains('/') ) ) ? "_${k}_: `${v}`" : (v.class.toString().contains('DateTime') ? ("_${k}_: " + v.format(java.time.format.DateTimeFormatter.ofLocalizedDateTime(java.time.format.FormatStyle.MEDIUM))) : "_${k}_: ${v}") ) }.join(",\n") %>", + "short": false + } + <% } + %> + ], + "footer": "Completed at <% out << dateComplete.format(java.time.format.DateTimeFormatter.ofLocalizedDateTime(java.time.format.FormatStyle.MEDIUM)) %> (duration: ${duration})" + } + ] +} diff --git a/bin/AmpliconArchitect.py b/bin/AmpliconArchitect.py index cfb3003f..b1b3ec04 100755 --- a/bin/AmpliconArchitect.py +++ b/bin/AmpliconArchitect.py @@ -49,9 +49,7 @@ __version__ = "1.3_r1" -parser = argparse.ArgumentParser( - description="Reconstruct Amplicons connected to listed intervals." 
-) +parser = argparse.ArgumentParser(description="Reconstruct Amplicons connected to listed intervals.") parser.add_argument( "--bed", dest="rdAlts", @@ -242,36 +240,22 @@ def process(self, msg, kwargs): DATA_REPO = os.environ["AA_DATA_REPO"] except: logging.warning( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + "unable to set AA_DATA_REPO variable. Setting to working directory" + "#TIME " + "%.3f\t" % (time() - TSTART) + "unable to set AA_DATA_REPO variable. Setting to working directory" ) DATA_REPO = "." if DATA_REPO == "." or DATA_REPO == "": logging.warning( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + "AA_DATA_REPO not set or empy. Setting to working directory" + "#TIME " + "%.3f\t" % (time() - TSTART) + "AA_DATA_REPO not set or empy. Setting to working directory" ) DATA_REPO = "." -logging.info( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + "Loading libraries and reference annotations for: " - + args.ref -) +logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Loading libraries and reference annotations for: " + args.ref) import ref_util as hg import bam_to_breakpoint as b2b -logging.info( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + "Initiating bam_to_breakpoint object for: " - + args.bam -) +logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Initiating bam_to_breakpoint object for: " + args.bam) rdList0 = hg.interval_list(rdAlts, "bed", exclude_info_string=True) rdList = hg.interval_list([r for r in rdList0]) cb = bamFile @@ -281,9 +265,7 @@ def process(self, msg, kwargs): cstats = None if args.no_cstats: logging.info( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + "--no_cstats was set. Will not attempt to re-use coverage.stats info" + "#TIME " + "%.3f\t" % (time() - TSTART) + "--no_cstats was set. Will not attempt to re-use coverage.stats info" ) if os.path.exists(os.path.join(hg.DATA_REPO, "coverage.stats")) and not args.no_cstats: @@ -309,11 +291,7 @@ def process(self, msg, kwargs): + str(os.path.join(hg.DATA_REPO, "coverage.stats")) ) else: - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + "cstats not found, generating coverage statistics... " - ) + logging.debug("#TIME " + "%.3f\t" % (time() - TSTART) + "cstats not found, generating coverage statistics... 
") coverage_windows = None if cbed is not None: @@ -352,12 +330,7 @@ def process(self, msg, kwargs): if args.extendmode == "VIRAL": - logging.info( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + "Finding integration sites: " - + str(rdList[0]) - ) + logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Finding integration sites: " + str(rdList[0])) de = bamFileb2b.interval_discordant_edges(rdList) old_stdout = sys.stdout sys.stdout = mystdout = StringIO() @@ -372,11 +345,7 @@ def process(self, msg, kwargs): [ i[0] for i in alist.merge_clusters(extend=5000000) - if len( - hg.interval_list([i[0]]).intersection(amplist) - + hg.interval_list([i[0]]).intersection(rdList) - ) - > 0 + if len(hg.interval_list([i[0]]).intersection(amplist) + hg.interval_list([i[0]]).intersection(rdList)) > 0 ] ) rdList = hg.interval_list( @@ -401,9 +370,7 @@ def process(self, msg, kwargs): irdgroupdict = {ird: frozenset([ird]) for ird in rdList} if args.extendmode == "EXPLORE" or args.extendmode == "VIRAL": for ird in rdList: - logging.info( - "#TIME " + "%.3f\t" % (time() - TSTART) + "Exploring interval: " + str(ird) - ) + logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Exploring interval: " + str(ird)) old_stdout = sys.stdout sys.stdout = mystdout = StringIO() ilist = bamFileb2b.interval_hops(ird, rdlist=all_ilist) @@ -411,14 +378,7 @@ def process(self, msg, kwargs): for i in ilist: irddict[i] = ird iout = open( - outName - + "." - + ird.chrom - + ":" - + str(ird.start) - + "-" - + str(ird.end) - + ".out", + outName + "." + ird.chrom + ":" + str(ird.start) + "-" + str(ird.end) + ".out", "w", ) iout.write(mystdout.getvalue()) @@ -427,9 +387,7 @@ def process(self, msg, kwargs): all_ilist += ilist all_ilist.sort() - allhops = hg.interval_list( - reduce(lambda x, y: x + y, [irdh[1] for irdh in irdhops], []) - ) + allhops = hg.interval_list(reduce(lambda x, y: x + y, [irdh[1] for irdh in irdhops], [])) allhops.sort() allmerge = allhops.merge_clusters() for am in allmerge: @@ -462,9 +420,7 @@ def process(self, msg, kwargs): irdgroups = [hg.interval_list([r]) for r in rdList] -logging.info( - "#TIME " + "%.3f\t" % (time() - TSTART) + "Interval sets for amplicons determined: " -) +logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Interval sets for amplicons determined: ") for il in enumerate(irdgroups): logging.info( "[amplicon" @@ -474,9 +430,7 @@ def process(self, msg, kwargs): ) summary_logger.info("#Amplicons = " + str(len(irdgroups))) -summary_logger.info( - "-----------------------------------------------------------------------------------------" -) +summary_logger.info("-----------------------------------------------------------------------------------------") if args.extendmode == "VIRAL": amplicon_id = 0 @@ -489,9 +443,7 @@ def process(self, msg, kwargs): old_stdout = sys.stdout sys.stdout = mystdout = StringIO() adapter = PrefixAdapter(summary_logger, {"prefix": str(amplicon_id)}) - summaryFormatter = logging.Formatter( - "[amplicon" + str(amplicon_id) + "] %(message)s" - ) + summaryFormatter = logging.Formatter("[amplicon" + str(amplicon_id) + "] %(message)s") for handler in summary_logger.handlers: handler.setFormatter(summaryFormatter) summary_logger.info("AmpliconID = " + str(amplicon_id)) @@ -499,43 +451,22 @@ def process(self, msg, kwargs): ilist1 = hg.interval_list([a[0] for a in ilist.merge_clusters()]) istr = ",".join([i.chrom + ":" + str(i.start) + "-" + str(i.end) for i in ilist1]) summary_logger.info("Intervals = " + str(istr)) - oncolist = ( - ",".join( - set([a[1].info["Name"] 
for a in ilist1.intersection(hg.oncogene_list)]) - ) - + "," - ) + oncolist = ",".join(set([a[1].info["Name"] for a in ilist1.intersection(hg.oncogene_list)])) + "," summary_logger.info("OncogenesAmplified = " + str(oncolist)) amplicon_name = outName + "_amplicon" + str(amplicon_id) if args.runmode in ["FULL", "CYCLES", "BPGRAPH"]: - logging.info( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + "Reconstructing amplicon" - + str(amplicon_id) - ) + logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Reconstructing amplicon" + str(amplicon_id)) graph_handler = logging.FileHandler(amplicon_name + "_graph.txt", "w") cycle_handler = logging.FileHandler(amplicon_name + "_cycles.txt", "w") graph_logger.addHandler(graph_handler) cycle_logger.addHandler(cycle_handler) - bamFileb2b.interval_filter_vertices( - ilist, amplicon_name=amplicon_name, runmode=args.runmode - ) + bamFileb2b.interval_filter_vertices(ilist, amplicon_name=amplicon_name, runmode=args.runmode) graph_logger.removeHandler(graph_handler) cycle_logger.removeHandler(cycle_handler) if args.runmode in ["FULL", "SVVIEW"]: - logging.info( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + "Plotting SV View for amplicon" - + str(amplicon_id) - ) - bamFileb2b.plot_segmentation( - ilist, amplicon_name, segments=segments, font=args.plotstyle - ) - summary_logger.info( - "-----------------------------------------------------------------------------------------" - ) + logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Plotting SV View for amplicon" + str(amplicon_id)) + bamFileb2b.plot_segmentation(ilist, amplicon_name, segments=segments, font=args.plotstyle) + summary_logger.info("-----------------------------------------------------------------------------------------") iout = open(amplicon_name + "_logs.txt", "w") iout.write(mystdout.getvalue()) iout.close() @@ -544,22 +475,12 @@ def process(self, msg, kwargs): continue -if (args.extendmode in ["VIRAL", "VIRAL_CLUSTERED"]) and ( - args.runmode in ["FULL", "SVVIEW", "VIRALVIEW"] -): +if (args.extendmode in ["VIRAL", "VIRAL_CLUSTERED"]) and (args.runmode in ["FULL", "SVVIEW", "VIRALVIEW"]): amplicon_id = 1 for i in irdgroups[0]: - if ( - i.intersects(rdList0[-1]) - or len(hg.interval_list([i]).intersection(rdList)) == 0 - ): + if i.intersects(rdList0[-1]) or len(hg.interval_list([i]).intersection(rdList)) == 0: continue - logging.info( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + "Plotting viral view for interval " - + str(i) - ) + logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Plotting viral view for interval " + str(i)) bamFileb2b.plot_segmentation( hg.interval_list([i, rdList0[-1]]), outName + "_amplicon" + str(amplicon_id), diff --git a/bin/Coverage.py b/bin/Coverage.py index 81e35394..7ce2f9c1 100644 --- a/bin/Coverage.py +++ b/bin/Coverage.py @@ -1,24 +1,24 @@ -#MIT License +# MIT License # -#Copyright (c) 2019 Iñigo Prada Luengo +# Copyright (c) 2019 Iñigo Prada Luengo # -#Permission is hereby granted, free of charge, to any person obtaining a copy -#of this software and associated documentation files (the "Software"), to deal -#in the Software without restriction, including without limitation the rights -#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -#copies of the Software, and to permit persons to whom the Software is -#furnished to do so, subject to the following conditions: +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# 
in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: # -#The above copyright notice and this permission notice shall be included in all -#copies or substantial portions of the Software. +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. # -#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -#SOFTWARE. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. import os import pysam as ps import pybedtools as bt @@ -28,39 +28,33 @@ class coverage: """Class for managing the coverage metrics of circle-map""" - def __init__(self,sorted_bam,eccdna_bed,extension,mapq,inside_length,directory): - + def __init__(self, sorted_bam, eccdna_bed, extension, mapq, inside_length, directory): self.bam = ps.AlignmentFile(directory + "/" + sorted_bam, "rb") self.bed = eccdna_bed - #length of out + # length of out self.ext = extension self.mapq = mapq - #length of the region for the ratio + # length of the region for the ratio self.ilen = inside_length def print_parameters(self): print("Running coverage computations \n") - - def get_wg_coverage(self): """Generator that takes as input a sorted bam and a merged bam of the circles in the whole genome and returns a numpy array for every interval with the coverage""" - - - reference_contigs = self.bam.header['SQ'] + reference_contigs = self.bam.header["SQ"] header_dict = {} for reference in reference_contigs: - header_dict[reference['SN']] = reference['LN'] + header_dict[reference["SN"]] = reference["LN"] merged_bed = self.bed.sort().merge() for interval in merged_bed: - coverage_dict = {} if interval.start - self.ext < 0: start = 0 @@ -79,15 +73,12 @@ def get_wg_coverage(self): # save memory, convert to uint32. 
summ_cov = np.uint32(summarized_cov) - print("Computing coverage on interval %s:%s-%s" % (interval.chrom,interval.start,interval.end)) + print("Computing coverage on interval %s:%s-%s" % (interval.chrom, interval.start, interval.end)) coverage_dict[bt.Interval(interval.chrom, start, end)] = summ_cov - yield(coverage_dict,header_dict) - - - def compute_coverage(self,cov_generator): - + yield (coverage_dict, header_dict) + def compute_coverage(self, cov_generator): """Function that takes as input generator returning coverage numpy arrays and file with summarized statistics of the coverage within the intervals""" @@ -95,26 +86,21 @@ def compute_coverage(self,cov_generator): print("Merging intervals for coverage computation") output = [] - for cov_dict,header_dict in cov_generator: - for key,value in cov_dict.items(): - - + for cov_dict, header_dict in cov_generator: + for key, value in cov_dict.items(): overlaps = bt.BedTool(self.bed.all_hits(key)) - for interval in overlaps: - # compute array slicing indices - start = interval.start -key.start + start = interval.start - key.start end = interval.end - key.start - if start - self.ext < 0: ext_start = 0 else: ext_start = start - self.ext - if header_dict[interval.chrom] < (end+ self.ext): + if header_dict[interval.chrom] < (end + self.ext): ext_end = header_dict[interval.chrom] else: ext_end = end + self.ext @@ -123,13 +109,7 @@ def compute_coverage(self,cov_generator): ext_array = value[ext_start:ext_end] region_array = value[start:end] - - - - - try: - mean = np.mean(region_array) sd = np.std(region_array) @@ -137,37 +117,32 @@ def compute_coverage(self,cov_generator): interval.append(str(sd)) except: - - interval.append('NA') - interval.append('NA') - + interval.append("NA") + interval.append("NA") # compute ratios try: - - start_coverage_ratio = np.sum(region_array[0:self.ilen]) / np.sum( - ext_array[0:(self.ilen + self.ext)]) - end_coverage_ratio = np.sum(region_array[-self.ilen:]) / np.sum(ext_array[-(self.ilen + self.ext):]) + start_coverage_ratio = np.sum(region_array[0 : self.ilen]) / np.sum( + ext_array[0 : (self.ilen + self.ext)] + ) + end_coverage_ratio = np.sum(region_array[-self.ilen :]) / np.sum( + ext_array[-(self.ilen + self.ext) :] + ) interval.append(str(start_coverage_ratio)) interval.append(str(end_coverage_ratio)) except: - - interval.append('NA') - interval.append('NA') + interval.append("NA") + interval.append("NA") try: - - zero_frac = np.count_nonzero(region_array == 0) / len(region_array) interval.append(str(zero_frac)) except: - - interval.append('NA') + interval.append("NA") output.append(interval) - - return(bt.BedTool(output)) + return bt.BedTool(output) diff --git a/bin/abstract_graph.py b/bin/abstract_graph.py index 43ec5a9a..241c7d48 100755 --- a/bin/abstract_graph.py +++ b/bin/abstract_graph.py @@ -17,8 +17,8 @@ # IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
-#Author: Viraj Deshpande -#Contact: virajbdeshpande@gmail.com +# Author: Viraj Deshpande +# Contact: virajbdeshpande@gmail.com # This file defines classes and methods for an abstract undirected graph, vertex and edge. @@ -26,130 +26,135 @@ import logging + class abstract_vertex(object): - """Class describing a graph vertex. - Attributes: - elist: List of abstract_edges - vid: (optional) ID for the abstract_vertex - graph: (optional) abstract_graph to which the vertex belongs""" - def __init__(self, vid=0, graph=None): - """Initiate vertex with optional vid and graph""" - self.elist = [] - self.vid = vid # vertexid - self.graph = graph - if self.vid == 0 and self.graph is not None: - self.vid = self.graph.next_vid() - if self.graph is not None: - if vid in graph.vs: - raise Exception("Adding with duplicate vid") - self.graph.include_vertex(self) - - def neighbors(self): - """Return list of vertices connected to abstract_vertex by a direct edge""" - return [e.v2 for e in self.elist] - - def __hash__(self): - """Return hash based on vid to allow to efficiently check for presence of vid in graph, etc""" - return self.vid - - def __repr__(self): - """Vertex is represented by vid""" - return str(self.vid) + """Class describing a graph vertex. + Attributes: + elist: List of abstract_edges + vid: (optional) ID for the abstract_vertex + graph: (optional) abstract_graph to which the vertex belongs""" + + def __init__(self, vid=0, graph=None): + """Initiate vertex with optional vid and graph""" + self.elist = [] + self.vid = vid # vertexid + self.graph = graph + if self.vid == 0 and self.graph is not None: + self.vid = self.graph.next_vid() + if self.graph is not None: + if vid in graph.vs: + raise Exception("Adding with duplicate vid") + self.graph.include_vertex(self) + + def neighbors(self): + """Return list of vertices connected to abstract_vertex by a direct edge""" + return [e.v2 for e in self.elist] + + def __hash__(self): + """Return hash based on vid to allow to efficiently check for presence of vid in graph, etc""" + return self.vid + + def __repr__(self): + """Vertex is represented by vid""" + return str(self.vid) class abstract_edge(object): - """Class describing a graph edge. - Attributes: - v1, v2: Ordered pair of vertices connected by the edge - eid: (optional) ID for the abstract_edge - graph: (optional) abstract_graph to which the vertex belongs.""" - def __init__(self, v1, v2, eid=0, graph=None, update_vertices=True): - """Initiate edge - Arguments: v1, v2, (optional)eid, (optional) graph. - update_vertices: (optional True/False) to update vertices to include edge in v1.elist, v2.elist. 
(default=True)""" - self.v1, self.v2 = v1, v2 - self.eid = eid - self.graph = graph - if self.eid == 0 and self.graph is not None: - self.eid = self.graph.next_eid() - if self.graph is not None: - if eid in self.graph.es: - raise Exception("Adding edge with duplicate eid") - self.graph.include_edge(self) - if update_vertices: - if v1.graph is not v2.graph: - raise Exception("Adding edge between vertices of different graphs.") - if graph is not None and v1.graph is not graph: - raise Exception("Edge in different graph than vertex.") - if graph is None and v1.graph is not None: - graph = v1.graph - v1.elist.append(self) - v2.elist.append(self) - - def neighbor(self, v): - """Given a vertex, return its neighbor along the edge""" - if v == self.v1: - return self.v2 - if v == self.v2: - return self.v1 - raise Exception("Edge not connected to vertex") - - def __hash__(self): - """Return hash based on eid to allow to efficiently check for presence of eid in graph, etc""" - return self.eid - - def length(self): - """Not implemented""" - pass - - def __repr__(self): - """String representation of the form v1<->v2.""" - return str(self.v1) + '<->' + str(self.v2) + """Class describing a graph edge. + Attributes: + v1, v2: Ordered pair of vertices connected by the edge + eid: (optional) ID for the abstract_edge + graph: (optional) abstract_graph to which the vertex belongs.""" + + def __init__(self, v1, v2, eid=0, graph=None, update_vertices=True): + """Initiate edge + Arguments: v1, v2, (optional)eid, (optional) graph. + update_vertices: (optional True/False) to update vertices to include edge in v1.elist, v2.elist. (default=True) + """ + self.v1, self.v2 = v1, v2 + self.eid = eid + self.graph = graph + if self.eid == 0 and self.graph is not None: + self.eid = self.graph.next_eid() + if self.graph is not None: + if eid in self.graph.es: + raise Exception("Adding edge with duplicate eid") + self.graph.include_edge(self) + if update_vertices: + if v1.graph is not v2.graph: + raise Exception("Adding edge between vertices of different graphs.") + if graph is not None and v1.graph is not graph: + raise Exception("Edge in different graph than vertex.") + if graph is None and v1.graph is not None: + graph = v1.graph + v1.elist.append(self) + v2.elist.append(self) + + def neighbor(self, v): + """Given a vertex, return its neighbor along the edge""" + if v == self.v1: + return self.v2 + if v == self.v2: + return self.v1 + raise Exception("Edge not connected to vertex") + + def __hash__(self): + """Return hash based on eid to allow to efficiently check for presence of eid in graph, etc""" + return self.eid + + def length(self): + """Not implemented""" + pass + + def __repr__(self): + """String representation of the form v1<->v2.""" + return str(self.v1) + "<->" + str(self.v2) class abstract_graph(object): - """Class describing a graph. - Attributes: - vs: Dictionary from vid/key to vertex - es: Dictionary from eid/key to edge - max_vid: (internal) max_vid, used to assign vid for new vertex. Suggested to use function next_vid. - max_eid: (internal) max_eid, used to assign eid for new edge. 
Suggested to use function next_eid.""" - def __init__(self): - """Initiate empty graph""" - self.es = {} # key -->edges - self.vs = {} # key -->vertices - #self.logger = logging.getLogger('Algae') - self.max_eid = 1 - self.max_vid = 1 - - def include_vertex(self, v): - """Include orphan abstract_vertex in graph and update vertex.graph to point to self""" - if v.vid in self.vs and self.vs[v.vid] is not v: - raise "Adding vertex with duplicate vid" - if v.graph is not None and v.graph is not self: - raise "Adding vertex from another graph" - if v.graph is None: - v.graph = self - self.vs[v.vid] = v - - def include_edge(self, e): - """Include orphan abstract_edge in graph and update edge.graph to point to self. Vertices should be updated separately""" - if e.eid in self.es and self.es[e.eid] is not e: - raise "Adding edge with duplicate eid" - if e.graph is not None and e.graph is not self: - raise "Adding edge from another graph" - if e.graph is None: - e.graph = self - self.es[e.eid] = e - - def next_eid (self): - """Find the next eid available for assignment to new edge""" - while self.max_eid in self.es or -1 * self.max_eid in self.es: - self.max_eid += 1 - return self.max_eid - - def next_vid (self): - """Find the next vid available for assignment to new vertex""" - while self.max_vid in self.vs or -1 * self.max_vid in self.vs: - self.max_vid += 1 - return self.max_vid + """Class describing a graph. + Attributes: + vs: Dictionary from vid/key to vertex + es: Dictionary from eid/key to edge + max_vid: (internal) max_vid, used to assign vid for new vertex. Suggested to use function next_vid. + max_eid: (internal) max_eid, used to assign eid for new edge. Suggested to use function next_eid.""" + + def __init__(self): + """Initiate empty graph""" + self.es = {} # key -->edges + self.vs = {} # key -->vertices + # self.logger = logging.getLogger('Algae') + self.max_eid = 1 + self.max_vid = 1 + + def include_vertex(self, v): + """Include orphan abstract_vertex in graph and update vertex.graph to point to self""" + if v.vid in self.vs and self.vs[v.vid] is not v: + raise "Adding vertex with duplicate vid" + if v.graph is not None and v.graph is not self: + raise "Adding vertex from another graph" + if v.graph is None: + v.graph = self + self.vs[v.vid] = v + + def include_edge(self, e): + """Include orphan abstract_edge in graph and update edge.graph to point to self. Vertices should be updated separately""" + if e.eid in self.es and self.es[e.eid] is not e: + raise "Adding edge with duplicate eid" + if e.graph is not None and e.graph is not self: + raise "Adding edge from another graph" + if e.graph is None: + e.graph = self + self.es[e.eid] = e + + def next_eid(self): + """Find the next eid available for assignment to new edge""" + while self.max_eid in self.es or -1 * self.max_eid in self.es: + self.max_eid += 1 + return self.max_eid + + def next_vid(self): + """Find the next vid available for assignment to new vertex""" + while self.max_vid in self.vs or -1 * self.max_vid in self.vs: + self.max_vid += 1 + return self.max_vid diff --git a/bin/amplified_intervals copy.py b/bin/amplified_intervals copy.py deleted file mode 100755 index 04443f33..00000000 --- a/bin/amplified_intervals copy.py +++ /dev/null @@ -1,225 +0,0 @@ -#!/usr/bin/env python2 - -# This software is Copyright 2017 The Regents of the University of California. All Rights Reserved. 
Permission to copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies. Permission to make commercial use of this software may be obtained by contacting: -# -# Office of Innovation and Commercialization -# -# University of California -# -# La Jolla, CA 92093-0910 -# -# (858) 534-5815 -# -# invent@ucsd.edu -# -# This software program and documentation are copyrighted by The Regents of the University of California. The software program and documentation are supplied "as is", without any accompanying services from The Regents. The Regents does not warrant that the operation of the program will be uninterrupted or error-free. The end-user understands that the program was developed for research purposes and is advised not to rely exclusively on the program for any reason. -# -# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. - -# Author: Viraj Deshpande -# Contact: virajbdeshpande@gmail.com - - -import copy -from collections import defaultdict -import sys -from sets import Set -import numpy as np -import re - -sys.setrecursionlimit(10000) -import argparse -import os -import pysam -import global_names - -GAIN = 5.0 -CNSIZE_MIN = 100000 - -parser = argparse.ArgumentParser(description="Filter and merge amplified intervals") -parser.add_argument( - "--bed", - dest="bed", - help="Input bed file with list of amplified intervals", - metavar="FILE", - action="store", - type=str, - required=True, -) -parser.add_argument( - "--out", - dest="out", - help="OPTIONAL: Prefix filename for output bed file. Default: _amplified.bed", - metavar="FILE", - action="store", - type=str, - default="", -) -parser.add_argument( - "--bam", - dest="bam", - help="OPTIONAL: Bamfile, used to avoid large aneuploidies", - metavar="FILE", - action="store", - type=str, - default="", -) -parser.add_argument( - "--gain", - dest="gain", - help="OPTIONAL: CN gain threshold for interval to be considered as a seed. Default: 5", - action="store", - type=float, - default=GAIN, -) -parser.add_argument( - "--cnsize_min", - dest="cnsize_min", - help="OPTIONAL: Minimum size (in bp) for interval to be considered as a seed. Default: 100000", - action="store", - type=int, - default=CNSIZE_MIN, -) -parser.add_argument( - "--ref", - dest="ref", - help='Values: [hg19, GRCh37, GRCh38, None]. "hg19"(default) & "GRCh38" : chr1, .. chrM etc / "GRCh37" : \'1\', \'2\', .. \'MT\' etc/ "None" : Do not use any annotations. AA can tolerate additional chromosomes not stated but accuracy and annotations may be affected. 
Default: hg19', - metavar="STR", - action="store", - type=str, - default="hg19", -) -args = parser.parse_args() - -global_names.REF = args.ref -import hg19util as hg - - -if args.bed != "": - rdAlts = args.bed - -if args.out != "": - outname = args.out + ".bed" -else: - outname = os.path.splitext(rdAlts)[0] + "_amplified.bed" - -GAIN, CNSIZE_MIN = args.gain, args.cnsize_min - -rdList0 = hg.interval_list(rdAlts, "bed") -if rdList0: - try: - if len(rdList0[0].info) == 0: - sys.stderr.write( - "ERROR: CNV estimate bed file had too few columns.\n" - "Must contain: chr pos1 pos2 cnv_estimate\n" - ) - sys.exit(1) - _ = float(rdList0[0].info[-1]) - - except ValueError: - sys.stderr.write("ERROR: CNV estimates must be in last column of bed file.\n") - sys.exit(1) - -rdList = hg.interval_list([r for r in rdList0 if float(r.info[-1]) > GAIN]) - -if args.bam != "": - import bam_to_breakpoint as b2b - - if os.path.splitext(args.bam)[-1] == ".cram": - bamFile = pysam.Samfile(args.bam, "rc") - else: - bamFile = pysam.Samfile(args.bam, "rb") - cstats = None - cb = bamFile - if os.path.exists(os.path.join(hg.DATA_REPO, "coverage.stats")): - coverage_stats_file = open(os.path.join(hg.DATA_REPO, "coverage.stats")) - for l in coverage_stats_file: - ll = l.strip().split() - if ll[0] == os.path.abspath(cb.filename): - cstats = tuple(map(float, ll[1:])) - coverage_stats_file.close() - bamFileb2b = b2b.bam_to_breakpoint(bamFile, coverage_stats=cstats) - rdList = hg.interval_list( - [ - r - for r in rdList - if float(r.info[-1]) - > GAIN - + 2 - * max( - 1.0, - bamFileb2b.median_coverage(refi=r)[0] / bamFileb2b.median_coverage()[0], - ) - - 2 - and bamFileb2b.median_coverage(refi=r)[0] / bamFileb2b.median_coverage()[0] - > 0 - ] - ) - -genome_features = hg.oncogene_list -amplicon_listl = rdList - -cr = hg.conserved_regions -uc_list = hg.interval_list([]) -for a in amplicon_listl: - if ( - len(hg.interval_list([a]).intersection(cr)) == 0 - or a.size() - > max( - 1000000, - 10 - * sum( - [ - a.intersection(ci[1]).size() - for ci in hg.interval_list([a]).intersection(cr) - ] - ), - ) - or a.size() - - sum( - [ - a.intersection(ci[1]).size() - for ci in hg.interval_list([a]).intersection(cr) - ] - ) - > 2000000 - ): - if (len(hg.interval_list([a]).intersection(cr))) == 0: - uc_list.append(a) - else: - cra = hg.interval_list([a]).intersection(cr) - cpos = a.start - for crai in cra: - if cpos < crai[1].start - 1000000: - uc_list.append( - hg.interval(a.chrom, cpos, crai[1].start - 1000000, info=a.info) - ) - cpos = crai[1].end + 1000000 - if a.end > cpos: - uc_list.append(hg.interval(a.chrom, cpos, a.end, info=a.info)) - -uc_list = hg.interval_list( - [ - a - for a in uc_list - if float(a.info[-1]) * a.segdup_uniqueness() > GAIN and a.rep_content() < 2.5 - ] -) -uc_merge = uc_list.merge_clusters(extend=300000) - -with open(outname, "w") as outfile: - for a in uc_merge: - if sum([ai.size() for ai in a[1]]) > CNSIZE_MIN: - outfile.write( - "\t".join( - [ - str(a[0]), - str( - sum([ai.size() * float(ai.info[-1]) for ai in a[1]]) - / sum([ai.size() for ai in a[1]]) - ), - rdAlts, - ] - ) - + "\n" - ) diff --git a/bin/amplified_intervals.py b/bin/amplified_intervals.py index 372000bc..c7e2fbce 100755 --- a/bin/amplified_intervals.py +++ b/bin/amplified_intervals.py @@ -117,8 +117,7 @@ try: if len(rdList0[0].info) == 0: sys.stderr.write( - "ERROR: CNV estimate bed file had too few columns.\n" - "Must contain: chr pos1 pos2 cnv_estimate\n" + "ERROR: CNV estimate bed file had too few columns.\n" "Must contain: chr pos1 pos2 
cnv_estimate\n" ) sys.exit(1) _ = float(rdList0[0].info[-1]) @@ -139,10 +138,7 @@ cstats = None cb = bamFile - if ( - os.path.exists(os.path.join(hg.DATA_REPO, "coverage.stats")) - and not args.no_cstats - ): + if os.path.exists(os.path.join(hg.DATA_REPO, "coverage.stats")) and not args.no_cstats: coverage_stats_file = open(os.path.join(hg.DATA_REPO, "coverage.stats")) for l in coverage_stats_file: ll = l.strip().split() @@ -150,11 +146,7 @@ bamfile_filesize = os.path.getsize(bamfile_pathname) if ll[0] == os.path.abspath(bamfile_pathname): cstats = tuple(map(float, ll[1:])) - if ( - len(cstats) < 15 - or cstats[13] != 3 - or bamfile_filesize != int(cstats[14]) - ): + if len(cstats) < 15 or cstats[13] != 3 or bamfile_filesize != int(cstats[14]): cstats = None coverage_stats_file.close() @@ -163,9 +155,7 @@ pre_int_list = [] for r in rdList: try: - chrom_cov_ratio = ( - bamFileb2b.median_coverage(refi=r)[0] / bamFileb2b.median_coverage()[0] - ) + chrom_cov_ratio = bamFileb2b.median_coverage(refi=r)[0] / bamFileb2b.median_coverage()[0] # print("chrom ratio " + r.chrom + " " + str(chrom_cov_ratio)) if ( float(r.info[-1]) @@ -173,13 +163,10 @@ + 2 * max( 1.0, - bamFileb2b.median_coverage(refi=r)[0] - / bamFileb2b.median_coverage()[0], + bamFileb2b.median_coverage(refi=r)[0] / bamFileb2b.median_coverage()[0], ) - 2 - and bamFileb2b.median_coverage(refi=r)[0] - / bamFileb2b.median_coverage()[0] - > 0 + and bamFileb2b.median_coverage(refi=r)[0] / bamFileb2b.median_coverage()[0] > 0 ): if r.size() < 10000000 or float(r.info[-1]) > 1.5 * GAIN: pre_int_list.append(r) @@ -200,22 +187,9 @@ or a.size() > max( 1000000, - 10 - * sum( - [ - a.intersection(ci[1]).size() - for ci in hg.interval_list([a]).intersection(cr) - ] - ), - ) - or a.size() - - sum( - [ - a.intersection(ci[1]).size() - for ci in hg.interval_list([a]).intersection(cr) - ] + 10 * sum([a.intersection(ci[1]).size() for ci in hg.interval_list([a]).intersection(cr)]), ) - > 2000000 + or a.size() - sum([a.intersection(ci[1]).size() for ci in hg.interval_list([a]).intersection(cr)]) > 2000000 ): if (len(hg.interval_list([a]).intersection(cr))) == 0: uc_list.append(a) @@ -224,19 +198,13 @@ cpos = a.start for crai in cra: if cpos < crai[1].start - 1000000: - uc_list.append( - hg.interval(a.chrom, cpos, crai[1].start - 1000000, info=a.info) - ) + uc_list.append(hg.interval(a.chrom, cpos, crai[1].start - 1000000, info=a.info)) cpos = crai[1].end + 1000000 if a.end > cpos: uc_list.append(hg.interval(a.chrom, cpos, a.end, info=a.info)) uc_list = hg.interval_list( - [ - a - for a in uc_list - if float(a.info[-1]) * a.segdup_uniqueness() > GAIN and a.rep_content() < 2.5 - ] + [a for a in uc_list if float(a.info[-1]) * a.segdup_uniqueness() > GAIN and a.rep_content() < 2.5] ) uc_merge = uc_list.merge_clusters(extend=300000) @@ -247,10 +215,7 @@ "\t".join( [ str(a[0]), - str( - sum([ai.size() * float(ai.info[-1]) for ai in a[1]]) - / sum([ai.size() for ai in a[1]]) - ), + str(sum([ai.size() * float(ai.info[-1]) for ai in a[1]]) / sum([ai.size() for ai in a[1]])), rdAlts, ] ) diff --git a/bin/bam2bam.py b/bin/bam2bam.py index 8b2d4b21..0474fb58 100644 --- a/bin/bam2bam.py +++ b/bin/bam2bam.py @@ -1,24 +1,24 @@ -#MIT License +# MIT License # -#Copyright (c) 2019 Iñigo Prada Luengo +# Copyright (c) 2019 Iñigo Prada Luengo # -#Permission is hereby granted, free of charge, to any person obtaining a copy -#of this software and associated documentation files (the "Software"), to deal -#in the Software without restriction, including without limitation the 
rights -#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -#copies of the Software, and to permit persons to whom the Software is -#furnished to do so, subject to the following conditions: +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: # -#The above copyright notice and this permission notice shall be included in all -#copies or substantial portions of the Software. +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. # -#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -#SOFTWARE. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
from __future__ import division @@ -35,17 +35,40 @@ import datetime - class bam2bam: """Class for managing the realignment and eccDNA indetification of circle-map""" queue = mp.Manager().Queue() - def __init__(self, input_bam,output,qname_bam,genome_fasta,directory,mapq_cutoff,insert_size_mapq,std_extension, - insert_size_sample_size,gap_open,gap_ext,n_hits,prob_cutoff,min_soft_clipped_length, - interval_p_cut,ncores,locker,verbose,pid,edit_distance_frac, - remap_splits,only_discordants,score,insert_size,manager): - #I/O + def __init__( + self, + input_bam, + output, + qname_bam, + genome_fasta, + directory, + mapq_cutoff, + insert_size_mapq, + std_extension, + insert_size_sample_size, + gap_open, + gap_ext, + n_hits, + prob_cutoff, + min_soft_clipped_length, + interval_p_cut, + ncores, + locker, + verbose, + pid, + edit_distance_frac, + remap_splits, + only_discordants, + score, + insert_size, + manager, + ): + # I/O self.edit_distance_frac = edit_distance_frac self.ecc_dna_str = input_bam self.output = output @@ -53,7 +76,7 @@ def __init__(self, input_bam,output,qname_bam,genome_fasta,directory,mapq_cutoff self.directory = directory self.genome_fa = genome_fasta - #realignment parameters + # realignment parameters # probabilistic realignment options self.n_hits = n_hits @@ -70,54 +93,41 @@ def __init__(self, input_bam,output,qname_bam,genome_fasta,directory,mapq_cutoff self.gap_open = gap_open self.gap_ext = gap_ext - - #insert size stimation parameters + # insert size stimation parameters self.insert_size_mapq = insert_size_mapq self.std_extenstion = std_extension self.insert_sample_size = insert_size_sample_size - #regular options + # regular options self.cores = ncores self.verbose = verbose self.lock = locker - - - #for instances running on the same directoiry + # for instances running on the same directoiry self.pid = pid - #parallel enviroment + # parallel enviroment self.read_list = manager.list() - self.read_count = manager.Value('i', 0) - self.write_round = manager.Value('i', 0) - - - - - - + self.read_count = manager.Value("i", 0) + self.write_round = manager.Value("i", 0) - def listener_writer(self,bam): - - f = open('test.sam',"w") + def listener_writer(self, bam): + f = open("test.sam", "w") header = bam.header while True: - # Read from the queue and do nothing read = self.queue.get() - if read == "DONE": f.close() print("breaking") bam.close() break else: - - pysam_read = ps.AlignedSegment.fromstring(read,bam.header) + pysam_read = ps.AlignedSegment.fromstring(read, bam.header) f.write(read + "\n") bam.write(pysam_read) @@ -128,103 +138,109 @@ def kill(self): def beta_version_warning(self): """Warn the user that this is experimental""" print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S: You are using a beta version feature")) - warnings.warn("The bam2bam feature on Circle-Map is experimental. The development of this feature is active, but" - " have in mind that it might produce unintended results. Check https://github.com/iprada/Circle-Map" - " for the development status.") - + warnings.warn( + "The bam2bam feature on Circle-Map is experimental. The development of this feature is active, but" + " have in mind that it might produce unintended results. Check https://github.com/iprada/Circle-Map" + " for the development status." 
+ ) - - - def realign(self,peaks): + def realign(self, peaks): """Function that will iterate trough the bam file containing reads indicating eccDNA structural variants and will output a bed file containing the soft-clipped reads, the discordant and the coverage within the interval""" - #open files for every process + # open files for every process try: - peaks_pd = pd.DataFrame.from_records(peaks,columns=['chrom', 'start', 'end']) + peaks_pd = pd.DataFrame.from_records(peaks, columns=["chrom", "start", "end"]) genome_fa = ps.FastaFile(self.genome_fa) - ecc_dna = ps.AlignmentFile(self.ecc_dna_str,"rb") + ecc_dna = ps.AlignmentFile(self.ecc_dna_str, "rb") begin = time.time() - - - - - - - # compute insert size distribution insert_metrics = self.insert - - #define realignment extension interval - extension = insert_metrics[0] + self.std_extenstion*insert_metrics[1] - + # define realignment extension interval + extension = insert_metrics[0] + self.std_extenstion * insert_metrics[1] iteration = 0 - - - for index,interval in peaks_pd.iterrows(): - - - - + for index, interval in peaks_pd.iterrows(): try: - - - #find out the prior distribution (mate alignment positions). - candidate_mates = get_mate_intervals(ecc_dna,interval,self.mapq_cutoff,self.verbose,self.only_discordants) - - - - - + # find out the prior distribution (mate alignment positions). + candidate_mates = get_mate_intervals( + ecc_dna, + interval, + self.mapq_cutoff, + self.verbose, + self.only_discordants, + ) if len(candidate_mates) > 0 or candidate_mates != None: - - - realignment_interval_extended = get_realignment_intervals(candidate_mates,extension,self.interval_p, - self.verbose) + realignment_interval_extended = get_realignment_intervals( + candidate_mates, extension, self.interval_p, self.verbose + ) if realignment_interval_extended is None: continue - - iteration_results = [] - for index,mate_interval in realignment_interval_extended.iterrows(): - + for ( + index, + mate_interval, + ) in realignment_interval_extended.iterrows(): iteration += 1 - #sample realignment intervals - #fasta file fetch is 1 based that why I do +1 + # sample realignment intervals + # fasta file fetch is 1 based that why I do +1 - plus_coding_interval = genome_fa.fetch(str(mate_interval['chrom']),int(int(mate_interval['start'])+1),int(int(mate_interval['end'])+1)).upper() + plus_coding_interval = genome_fa.fetch( + str(mate_interval["chrom"]), + int(int(mate_interval["start"]) + 1), + int(int(mate_interval["end"]) + 1), + ).upper() interval_length = len(plus_coding_interval) minus_coding_interval = str(Seq(plus_coding_interval).complement()) # precompute the denominators of the error model. They will be constants for every interval plus_base_freqs = background_freqs(plus_coding_interval) - minus_base_freqs = {'T':plus_base_freqs['A'],'A':plus_base_freqs['T'], - 'C':plus_base_freqs['G'],'G':plus_base_freqs['C']} - - minus_base_freqs = np.array([plus_base_freqs['T'],plus_base_freqs['A'],plus_base_freqs['G'],plus_base_freqs['C']]) - plus_base_freqs = np.array([plus_base_freqs['A'],plus_base_freqs['T'],plus_base_freqs['C'],plus_base_freqs['G']]) - - #note that I am getting the reads of the interval. 
Not the reads of the mates - - for read in ecc_dna.fetch(interval['chrom'],int(interval['start']),int(interval['end']),multiple_iterators=True): + minus_base_freqs = { + "T": plus_base_freqs["A"], + "A": plus_base_freqs["T"], + "C": plus_base_freqs["G"], + "G": plus_base_freqs["C"], + } + + minus_base_freqs = np.array( + [ + plus_base_freqs["T"], + plus_base_freqs["A"], + plus_base_freqs["G"], + plus_base_freqs["C"], + ] + ) + plus_base_freqs = np.array( + [ + plus_base_freqs["A"], + plus_base_freqs["T"], + plus_base_freqs["C"], + plus_base_freqs["G"], + ] + ) + + # note that I am getting the reads of the interval. Not the reads of the mates + + for read in ecc_dna.fetch( + interval["chrom"], + int(interval["start"]), + int(interval["end"]), + multiple_iterators=True, + ): if is_soft_clipped(read): - if read.mapq >= self.mapq_cutoff: - # no need to realignment - if read.has_tag('SA') and self.remap != True: - + if read.has_tag("SA") and self.remap != True: # check realignment from SA tag support = circle_from_SA(read, self.mapq_cutoff, mate_interval) @@ -232,79 +248,104 @@ def realign(self,peaks): pass else: - - if support['support'] == True: + if support["support"] == True: self.queue.put(read.to_string()) else: # uninformative read pass - - else: - #sc length - sc_len = len(get_longest_soft_clipped_bases(read)['seq']) - - - if non_colinearity(int(read.cigar[0][0]),int(read.cigar[-1][0]),int(read.pos), - int(mate_interval.start),int(mate_interval.end)) == True: + # sc length + sc_len = len(get_longest_soft_clipped_bases(read)["seq"]) + + if ( + non_colinearity( + int(read.cigar[0][0]), + int(read.cigar[-1][0]), + int(read.pos), + int(mate_interval.start), + int(mate_interval.end), + ) + == True + ): if sc_len >= self.min_sc_length: edits_allowed = adaptative_myers_k(sc_len, self.edit_distance_frac) - #realignment - - realignment_dict = realign(read,self.n_hits,plus_coding_interval,minus_coding_interval, - plus_base_freqs,minus_base_freqs,self.gap_open,self.gap_ext,self.verbose,edits_allowed) - + # realignment + + realignment_dict = realign( + read, + self.n_hits, + plus_coding_interval, + minus_coding_interval, + plus_base_freqs, + minus_base_freqs, + self.gap_open, + self.gap_ext, + self.verbose, + edits_allowed, + ) if realignment_dict == None: - pass else: - #calc edit distance allowed - prob = realignment_probability(realignment_dict,interval_length) - if prob >= self.prob_cutoff and realignment_dict['alignments'][1][3] <= edits_allowed: - + # calc edit distance allowed + prob = realignment_probability( + realignment_dict, + interval_length, + ) + if ( + prob >= self.prob_cutoff + and realignment_dict["alignments"][1][3] <= edits_allowed + ): # here I have to retrieve the nucleotide mapping positions. Which should be the # the left sampling pysam coordinate - edlib coordinates read_end = rightmost_from_read(read) - #aln start on the reference - soft_clip_start = int(mate_interval['start'])+ int(realignment_dict['alignments'][1][0][0]) - - soft_clip_end = int(mate_interval['start']) + int(realignment_dict['alignments'][1][0][1]) + # aln start on the reference + soft_clip_start = int(mate_interval["start"]) + int( + realignment_dict["alignments"][1][0][0] + ) - score = sc_len*prob + soft_clip_end = int(mate_interval["start"]) + int( + realignment_dict["alignments"][1][0][1] + ) + score = sc_len * prob # I store the read name to the output, so that a read counts as 1 no matter it is SC in 2 pieces # Soft-clipped aligned upstream. 
Primary aligned downstream - if read.reference_start < int(mate_interval['start']) + int( - realignment_dict['alignments'][1][0][0]): - # construct tag - sa_tag = realignment_read_to_SA_string(realignment_dict, - prob, interval['chrom'], - soft_clip_start) - - - #read.tags += [('SA', sa_tag)] - - self.queue.put(read.to_string()) - + if read.reference_start < int(mate_interval["start"]) + int( + realignment_dict["alignments"][1][0][0] + ): + # construct tag + sa_tag = realignment_read_to_SA_string( + realignment_dict, + prob, + interval["chrom"], + soft_clip_start, + ) + + # read.tags += [('SA', sa_tag)] + self.queue.put(read.to_string()) # soft-clipped aligned downstream primary alignment is upstream - elif read.reference_start + int(mate_interval['start']) + int( - realignment_dict['alignments'][1][0][0]): - - sa_tag = realignment_read_to_SA_string(realignment_dict, - prob, interval[ - 'chrom'], - soft_clip_start) - - read.tags += [('SA', sa_tag)] + elif ( + read.reference_start + + int(mate_interval["start"]) + + int(realignment_dict["alignments"][1][0][0]) + ): + sa_tag = realignment_read_to_SA_string( + realignment_dict, + prob, + interval["chrom"], + soft_clip_start, + ) + + read.tags += [("SA", sa_tag)] self.queue.put(read.to_string()) @@ -312,12 +353,9 @@ def realign(self,peaks): # uninformative read pass - - else: pass else: - pass else: @@ -325,16 +363,10 @@ def realign(self,peaks): else: pass - except BaseException as e: traceback.print_exc(file=sys.stdout) - warnings.warn( - "Failed on interval %s due to the error %s" % ( - str(interval), str(e))) - return([1,1]) - - - + warnings.warn("Failed on interval %s due to the error %s" % (str(interval), str(e))) + return [1, 1] ecc_dna.close() genome_fa.close() @@ -342,10 +374,9 @@ def realign(self,peaks): except: print("Failed on cluster:") print(traceback.print_exc(file=sys.stdout)) - return([1,1]) + return [1, 1] genome_fa.close() ecc_dna.close() - - return([0,0]) + return [0, 0] diff --git a/bin/bam_to_breakpoint.py b/bin/bam_to_breakpoint.py index 2cec632f..5a9095db 100755 --- a/bin/bam_to_breakpoint.py +++ b/bin/bam_to_breakpoint.py @@ -70,9 +70,7 @@ mosek_major_version = mosek.Env.getversion()[0] if mosek_major_version > 8: logging.warning( - "Mosek version is " - + ".".join([str(x) for x in mosek.Env.getversion()]) - + " AA requires version 8\n" + "Mosek version is " + ".".join([str(x) for x in mosek.Env.getversion()]) + " AA requires version 8\n" ) @@ -154,23 +152,16 @@ def __init__( if self.downsample < 0 or self.downsample > self.basic_stats[0]: self.downsample_ratio = 1 elif self.downsample == 0: - self.downsample_ratio = ( - 10.0 / self.basic_stats[0] if self.basic_stats[0] > 10 else 1 - ) + self.downsample_ratio = 10.0 / self.basic_stats[0] if self.basic_stats[0] > 10 else 1 else: self.downsample_ratio = ( - float(self.downsample) / self.basic_stats[0] - if self.basic_stats[0] > float(self.downsample) - else 1 + float(self.downsample) / self.basic_stats[0] if self.basic_stats[0] > float(self.downsample) else 1 ) if self.downsample_ratio != 1: rr = self.downsample_ratio rsq = math.sqrt(rr) - r = [ - i[0] * i[1] - for i in zip([rr, rr, rsq, rr, rr, rsq, 1, 1, 1, 1, 1, 1, 1], r) - ] + r = [i[0] * i[1] for i in zip([rr, rr, rsq, rr, rr, rsq, 1, 1, 1, 1, 1, 1, 1], r)] r[11] = max((r[4] / 10.0) * ((r[7] - r[6]) / 2 / r[6]) * r[12], 2) self.pair_support = r[11] self.downsample_stats = r @@ -215,15 +206,9 @@ def interval_coverage(self, i, clip=False, gcc=False): j = 0 for w in wc_raw: alist = [a for a in self.fetch(w[0].chrom, 
w[0].start, w[0].end)] - wc_corrected += ( - w[0].size() - * w[1] - / self.gc_scaling()[int(w[0].gc_content() * 10) / 10.0] - ) + wc_corrected += w[0].size() * w[1] / self.gc_scaling()[int(w[0].gc_content() * 10) / 10.0] if ( - w[0].size() - * w[1] - / self.gc_scaling()[int(w[0].gc_content() * 10) / 10.0] + w[0].size() * w[1] / self.gc_scaling()[int(w[0].gc_content() * 10) / 10.0] > 10 * len(alist) * self.read_length ): print( @@ -256,16 +241,12 @@ def interval_coverage(self, i, clip=False, gcc=False): # if clip == True or (clip == False and i.size() >= 100 * self.read_length): # return len(alist) * self.read_length / float(i.size()) if clip == True or (clip is None and e2 - s2 < 1000): - icc = sum( - [sum(a) for a in self.bamfile.count_coverage(i.chrom, s2, e2)] - ) / max(1.0, float(e2 - s2 + 1)) + icc = sum([sum(a) for a in self.bamfile.count_coverage(i.chrom, s2, e2)]) / max(1.0, float(e2 - s2 + 1)) self.interval_coverage_calls[call_args] = icc return self.interval_coverage_calls[call_args] else: self.interval_coverage_calls[call_args] = ( - len([a for a in alist if a.reference_end - 1 <= e2]) - * self.read_length - / max(1.0, float(e2 - s2 + 1)) + len([a for a in alist if a.reference_end - 1 <= e2]) * self.read_length / max(1.0, float(e2 - s2 + 1)) ) return self.interval_coverage_calls[call_args] @@ -308,11 +289,7 @@ def win_breakup(i, window_size): # return [(k, self.interval_coverage(k, gcc=gcc)) for k in jj] def median_coverage(self, window_size=-1, gcc=False, refi=-1, window_list=None): - if ( - (window_size == 10000 or window_size == -1) - and self.basic_stats_set - and refi == -1 - ): + if (window_size == 10000 or window_size == -1) and self.basic_stats_set and refi == -1: return self.downsample_stats if window_size == 300 and self.basic_stats_set and refi == -1: return self.downsample_stats[3:6] @@ -329,9 +306,7 @@ def median_coverage(self, window_size=-1, gcc=False, refi=-1, window_list=None): if len([i for i in hg.centromere_list if i.chrom == refi.chrom]) == 0: chr_cent = None else: - chr_cent = [i for i in hg.centromere_list if i.chrom == refi.chrom][ - 0 - ] + chr_cent = [i for i in hg.centromere_list if i.chrom == refi.chrom][0] if chr_cent is None: sumchrLen = hg.chrLen[hg.chrNum(refi.chrom)] chroffset = hg.absPos(refi.chrom, 1) @@ -350,16 +325,9 @@ def median_coverage(self, window_size=-1, gcc=False, refi=-1, window_list=None): if cp is not None: ii = hg.interval(cp[0], cp[1], cp[1] + sumchrLen) unconserved_len = sumchrLen - sum( - [ - i[0].intersection(i[1]).size() - for i in hg.interval_list([ii]).intersection( - hg.conserved_regions - ) - ] + [i[0].intersection(i[1]).size() for i in hg.interval_list([ii]).intersection(hg.conserved_regions)] ) - if ( - sumchrLen < 1000000 or (refi != -1 and unconserved_len < 1000000) - ) and window_size == -1: + if (sumchrLen < 1000000 or (refi != -1 and unconserved_len < 1000000)) and window_size == -1: return self.downsample_stats elif (sumchrLen < 1000000) and window_size == -1: @@ -378,9 +346,9 @@ def median_coverage(self, window_size=-1, gcc=False, refi=-1, window_list=None): insert_size = [] window_list_index = 0 non_mapping = 0 - while ( - window_list is not None and window_list_index < len(window_list) - ) or (window_list is None and iteri <= num_iter): + while (window_list is not None and window_list_index < len(window_list)) or ( + window_list is None and iteri <= num_iter + ): if window_list is None: newpos = int(random.random() * sumchrLen) + chroffset else: @@ -388,9 +356,7 @@ def median_coverage(self, window_size=-1, 
gcc=False, refi=-1, window_list=None): window_list_index += 1 if cwindow.end - cwindow.start < 10000: continue - newpos = hg.absPos( - cwindow.chrom, ((cwindow.end + cwindow.start) / 2) - 5000 - ) + newpos = hg.absPos(cwindow.chrom, ((cwindow.end + cwindow.start) / 2) - 5000) if hg.chrPos(newpos) is None: logging.debug( "Unable to locate reference position: " @@ -418,25 +384,18 @@ def median_coverage(self, window_size=-1, gcc=False, refi=-1, window_list=None): ) > 0 or len( - hg.interval_list([hg.interval(c, p, p + 10000)]).intersection( - hg.centromere_list, extend=10000 - ) + hg.interval_list([hg.interval(c, p, p + 10000)]).intersection(hg.centromere_list, extend=10000) ) > 0 ): continue read_length += [ - a.infer_query_length(always=False) - for a in self.fetch(c, p, p + 10000) - if not a.is_unmapped + a.infer_query_length(always=False) for a in self.fetch(c, p, p + 10000) if not a.is_unmapped ] insert_size += [ a.template_length for a in self.fetch(c, p, p + 10000) - if a.is_proper_pair - and not a.is_reverse - and a.template_length < 10000 - and a.template_length > 0 + if a.is_proper_pair and not a.is_reverse and a.template_length < 10000 and a.template_length > 0 ] iteri += 1 self.read_length = np.average(read_length) @@ -445,9 +404,7 @@ def median_coverage(self, window_size=-1, gcc=False, refi=-1, window_list=None): self.percent_proper = percent_proper self.insert_std = np.std(insert_size) self.max_insert = self.insert_size + self.num_sdevs * self.insert_std - self.min_insert = max( - 0, self.insert_size - self.num_sdevs * self.insert_std - ) + self.min_insert = max(0, self.insert_size - self.num_sdevs * self.insert_std) if window_size not in [-1, 300, 10000]: ws_list = [window_size] @@ -461,9 +418,9 @@ def median_coverage(self, window_size=-1, gcc=False, refi=-1, window_list=None): wc_ws = [] iteri = 0 window_list_index = 0 - while ( - window_list is not None and window_list_index < len(window_list) - ) or (window_list is None and iteri <= num_iter): + while (window_list is not None and window_list_index < len(window_list)) or ( + window_list is None and iteri <= num_iter + ): if window_list is None: newpos = int(random.random() * sumchrLen) + chroffset else: @@ -471,9 +428,7 @@ def median_coverage(self, window_size=-1, gcc=False, refi=-1, window_list=None): window_list_index += 1 if cwindow.end - cwindow.start < 10000: continue - newpos = hg.absPos( - cwindow.chrom, ((cwindow.end + cwindow.start) / 2) - 5000 - ) + newpos = hg.absPos(cwindow.chrom, ((cwindow.end + cwindow.start) / 2) - 5000) if hg.chrPos(newpos) is None: logging.warning( "Unable to locate reference position: " @@ -494,17 +449,9 @@ def median_coverage(self, window_size=-1, gcc=False, refi=-1, window_list=None): c not in self.bamfile.references or p < ws or hg.chrLen[hg.chrNum(c)] < p + ws - or len( - hg.interval_list([hg.interval(c, p, p + ws)]).intersection( - hg.conserved_regions, extend=ws - ) - ) + or len(hg.interval_list([hg.interval(c, p, p + ws)]).intersection(hg.conserved_regions, extend=ws)) > 0 - or len( - hg.interval_list([hg.interval(c, p, p + ws)]).intersection( - hg.centromere_list, extend=ws - ) - ) + or len(hg.interval_list([hg.interval(c, p, p + ws)]).intersection(hg.centromere_list, extend=ws)) > 0 ): continue @@ -592,10 +539,7 @@ def median_coverage(self, window_size=-1, gcc=False, refi=-1, window_list=None): print("pair support", self.pair_support) coverage_stats_file = open(hg.DATA_REPO + "/coverage.stats", "a") coverage_stats_file.write( - os.path.abspath(self.bamfile.filename.decode("utf-8")) 
- + "\t" - + "\t".join(map(str, rstats)) - + "\n" + os.path.abspath(self.bamfile.filename.decode("utf-8")) + "\t" + "\t".join(map(str, rstats)) + "\n" ) coverage_stats_file.close() @@ -603,22 +547,15 @@ def median_coverage(self, window_size=-1, gcc=False, refi=-1, window_list=None): if self.downsample < 0 or self.downsample > self.basic_stats[0]: self.downsample_ratio = 1 elif self.downsample == 0: - self.downsample_ratio = ( - 10.0 / self.basic_stats[0] if self.basic_stats[0] > 10 else 1 - ) + self.downsample_ratio = 10.0 / self.basic_stats[0] if self.basic_stats[0] > 10 else 1 else: self.downsample_ratio = ( - float(self.downsample) / self.basic_stats[0] - if self.basic_stats[0] > float(self.downsample) - else 1 + float(self.downsample) / self.basic_stats[0] if self.basic_stats[0] > float(self.downsample) else 1 ) if self.downsample_ratio != 1: rr = self.downsample_ratio rsq = math.sqrt(rr) - r = [ - i[0] * i[1] - for i in zip([rr, rr, rsq, rr, rr, rsq, 1, 1, 1, 1, 1, 1, 1, 1, 1], r) - ] + r = [i[0] * i[1] for i in zip([rr, rr, rsq, rr, rr, rsq, 1, 1, 1, 1, 1, 1, 1, 1, 1], r)] r[11] = max((r[4] / 10.0) * ((r[7] - r[6]) / 2 / r[6]) * r[12], 2) self.pair_support = r[11] self.downsample_stats = r @@ -639,9 +576,7 @@ def gc_scaling(self): if hg.chrNum(self.bamfile.references[ri]) not in hg.chrLen: continue # print self.bamfile.references[ri] - wc = self.window_coverage( - hg.interval(self.bamfile.references[ri], 0, self.bamfile.lengths[ri]) - ) + wc = self.window_coverage(hg.interval(self.bamfile.references[ri], 0, self.bamfile.lengths[ri])) lwc = 0 for w in wc: lwc += 1 @@ -662,9 +597,7 @@ def gc_scaling(self): return scale # Methods to find all coverage shifts in amplicon - def meanshift( - self, i, window_size=-1, hb=2, cov=None, rd_global=-1, h0=-1, gcc=False, n=-1 - ): + def meanshift(self, i, window_size=-1, hb=2, cov=None, rd_global=-1, h0=-1, gcc=False, n=-1): if window_size == -1: window_size = self.max_insert - self.read_length if rd_global == -1: @@ -689,6 +622,7 @@ def meanshift( jj = [hg.interval(i.chrom, k, k + window_size) for k in j] i2 = hg.interval(i.chrom, s2, e2) cov = [c for c in self.window_coverage(i2, window_size, gcc, exact=False)] + # cov = [self.interval_coverage(k) for k in jj] # print window_size, len(cov), str(cov[0][0]).strip(), cov[0][1], str(cov[1][0]).strip(), cov[1][1] def hr(wi): @@ -705,9 +639,7 @@ def hr(wi): [ wj * math.exp(-0.5 * wj**2 / hb**2) - * math.exp( - -0.5 * (cov[wi + wj][1] - cov[wi][1]) ** 2 / hr(wi) ** 2 - ) + * math.exp(-0.5 * (cov[wi + wj][1] - cov[wi][1]) ** 2 / hr(wi) ** 2) for wj in range(-1 * n, n + 1) ] ), @@ -746,9 +678,7 @@ def meanshift_segmentation(self, i, window_size=-1, gcc=False, pvalue=0.01): h0 = mc[2] hb_profile = [2, 5, 10, 50, 100] # hb_profile = [2] - n = min( - max(100, 10 * hb_profile[-1]), 10000000 // window_size - ) # number of windows used to calculate meanshift + n = min(max(100, 10 * hb_profile[-1]), 10000000 // window_size) # number of windows used to calculate meanshift logging.debug("MS: " + str(i) + " window_size, n: " + str((window_size, n))) s2 = i.start - window_size * n e2 = i.end + window_size * n @@ -764,15 +694,9 @@ def meanshift_segmentation(self, i, window_size=-1, gcc=False, pvalue=0.01): endskip = n - (hgl - i.end) // window_size i2 = hg.interval(i.chrom, s2, e2) - logging.debug( - "MS: " + str(i) + " startskip,endskip" + str((startskip, endskip)) - ) + logging.debug("MS: " + str(i) + " startskip,endskip" + str((startskip, endskip))) cov = [c for c in self.window_coverage(i2, window_size, gcc, 
exact=False)] - cov = ( - [(None, 0) for ni in range(startskip)] - + cov - + [(None, 0) for ni in range(endskip)] - ) + cov = [(None, 0) for ni in range(startskip)] + cov + [(None, 0) for ni in range(endskip)] frozen = [] def hr(c, wlen): @@ -807,15 +731,8 @@ def hr(c, wlen): # for ff in range(len(frozen)): # print "THIS", ff, frozen[ff][0][0][0].start while msi < len(ms): - if ( - fi >= 0 - and fi < len(frozen) - and ms[msi][0].start == frozen[fi][0][0][0].start - ): - if len(new_seg) > 0 and ( - frozen[fi][1] % 2 == 1 - or (ms[msi][1] > 0 and ms[msi - 1][1] <= 0) - ): + if fi >= 0 and fi < len(frozen) and ms[msi][0].start == frozen[fi][0][0][0].start: + if len(new_seg) > 0 and (frozen[fi][1] % 2 == 1 or (ms[msi][1] > 0 and ms[msi - 1][1] <= 0)): segs.append(new_seg) new_seg = ms[msi : msi + len(frozen[fi][0])] else: @@ -879,15 +796,7 @@ def hr(c, wlen): if ( self.meanshift_pval( [cc[1] for cc in cov[ci : ci + len(segs[si])]], - [ - cs[1] - for cs in cov[ - ci - + len(segs[si]) : ci - + len(segs[si]) - + len(segs[si + 1]) - ] - ], + [cs[1] for cs in cov[ci + len(segs[si]) : ci + len(segs[si]) + len(segs[si + 1])]], ) < pvalue ): @@ -920,17 +829,9 @@ def hr(c, wlen): cms.append(frozen[msi]) if frozen[msi][1] % 4 >= 2: plist.append(frozen[msi][0][-1][0].end) - avgc = np.average( - reduce( - lambda x, y: x + y, [[c[1] for c in cc[4]] for cc in cms], [] - ) - ) + avgc = np.average(reduce(lambda x, y: x + y, [[c[1] for c in cc[4]] for cc in cms], [])) ms1list.append(avgc * 2 / self.median_coverage(window_size, gcc)[0]) - c1list.append( - reduce( - lambda x, y: x + y, [[c[1] for c in cc[4]] for cc in cms], [] - ) - ) + c1list.append(reduce(lambda x, y: x + y, [[c[1] for c in cc[4]] for cc in cms], [])) if len(ms1list) > 1: ms2list.append(avgc * 2 / self.median_coverage(window_size, gcc)[0]) c2list.append( @@ -944,13 +845,9 @@ def hr(c, wlen): ) cms = [] if len(cms) > 0: - avgc = np.average( - reduce(lambda x, y: x + y, [[c[1] for c in cc[4]] for cc in cms], []) - ) + avgc = np.average(reduce(lambda x, y: x + y, [[c[1] for c in cc[4]] for cc in cms], [])) ms2list.append(avgc * 2 / self.median_coverage(window_size, gcc)[0]) - c2list.append( - (reduce(lambda x, y: x + y, [[c[1] for c in cc[4]] for cc in cms], [])) - ) + c2list.append((reduce(lambda x, y: x + y, [[c[1] for c in cc[4]] for cc in cms], []))) shifts = list(zip(plist, ms1list, ms2list, c1list, c2list)) @@ -998,15 +895,11 @@ def hr(c, wlen): if shiftsi not in mergelist: plist.append(shifts[shiftsi][0]) avgc = np.average(c1) - ms1list.append( - avgc * 2 / self.median_coverage(window_size, gcc)[0] - ) + ms1list.append(avgc * 2 / self.median_coverage(window_size, gcc)[0]) c1list.append(c1) if len(plist) > 1: c2list.append(c1) - ms2list.append( - avgc * 2 / self.median_coverage(window_size, gcc)[0] - ) + ms2list.append(avgc * 2 / self.median_coverage(window_size, gcc)[0]) c1 = [] if len(plist) > 0: c1.extend(shifts[-1][4]) @@ -1021,10 +914,7 @@ def hr(c, wlen): shifts_select = [s for s in shifts if abs(s[2] - s[1]) >= 1] else: shifts_select = [ - s - for s in shifts - if abs(s[2] - s[1]) - >= max(1, min(max(s[2], s[1]) / 10.0, math.sqrt(max(s[2], s[1])))) + s for s in shifts if abs(s[2] - s[1]) >= max(1, min(max(s[2], s[1]) / 10.0, math.sqrt(max(s[2], s[1])))) ] if len(shifts_select) == 0: return hg.interval_list( @@ -1034,9 +924,7 @@ def hr(c, wlen): i.start, i.end, info={ - "cn": np.average([c[1] for c in cov[n:-n]]) - * 2 - / self.median_coverage(window_size, gcc)[0] + "cn": np.average([c[1] for c in cov[n:-n]]) * 2 / 
self.median_coverage(window_size, gcc)[0] }, ) ] @@ -1045,21 +933,13 @@ def hr(c, wlen): shift_intervals = hg.interval_list([]) start = i.start for si in shifts_select: - shift_intervals.append( - hg.interval(i.chrom, start, si[0], info={"cn": si[1]}) - ) + shift_intervals.append(hg.interval(i.chrom, start, si[0], info={"cn": si[1]})) start = si[0] + 1 - shift_intervals.append( - hg.interval(i.chrom, start, i.end, info={"cn": shifts_select[-1][2]}) - ) + shift_intervals.append(hg.interval(i.chrom, start, i.end, info={"cn": shifts_select[-1][2]})) return shift_intervals - def meanshift_refined( - self, i, window_size0=10000, window_size1=300, gcc=False, shifts_unrefined=None - ): - logging.debug( - "Meanshift refining " + i.chrom + ":" + str(i.start) + "-" + str(i.end) - ) + def meanshift_refined(self, i, window_size0=10000, window_size1=300, gcc=False, shifts_unrefined=None): + logging.debug("Meanshift refining " + i.chrom + ":" + str(i.start) + "-" + str(i.end)) if hg.chrLen[hg.chrNum(i.chrom)] < 3 * window_size0: logging.debug("small chrom") ms_ws1 = self.meanshift_segmentation(i, window_size1, gcc) @@ -1073,19 +953,13 @@ def meanshift_refined( else: shifts0 = shifts_unrefined - shift1_intervals = hg.interval_list( - hg.interval(msi.chrom, msi.end, msi.end) for msi in shifts0[:-1] - ) - shift1_intervals = [ - msi[0] for msi in shift1_intervals.merge_clusters(extend=3 * window_size0) - ] + shift1_intervals = hg.interval_list(hg.interval(msi.chrom, msi.end, msi.end) for msi in shifts0[:-1]) + shift1_intervals = [msi[0] for msi in shift1_intervals.merge_clusters(extend=3 * window_size0)] shifts1 = reduce( lambda x, y: x + y, [ self.meanshift_segmentation( - hg.interval( - i.chrom, s.start - 3 * window_size0, s.start + 3 * window_size0 - ), + hg.interval(i.chrom, s.start - 3 * window_size0, s.start + 3 * window_size0), window_size1, gcc, pvalue=0.05, @@ -1107,11 +981,7 @@ def meanshift_refined( if abs(shifts0[s0i].end - shifts1[s1i].end) >= window_size0: continue cndiff1 = shifts1[s1i + 1].info["cn"] - shifts1[s1i].info["cn"] - if ( - cndiff0 * cndiff1 < 0 - or cndiff0 / cndiff1 <= 0.5 - and cndiff0 / cndiff1 >= 2 - ): + if cndiff0 * cndiff1 < 0 or cndiff0 / cndiff1 <= 0.5 and cndiff0 / cndiff1 >= 2: continue if bests1i is None: bests1i = s1i @@ -1197,9 +1067,7 @@ def get_meanshift(self, i, window_size0=10000, window_size1=300, gcc=False): ) msr.append(msi) else: - msr = self.meanshift_refined( - i, window_size0=window_size0, window_size1=window_size1, gcc=gcc - ) + msr = self.meanshift_refined(i, window_size0=window_size0, window_size1=window_size1, gcc=gcc) msfile = open(file_name, "w") msfile.write("#chrom\tstart\tend\tcn\tstart_refined\tend_refined\n") for ms in msr: @@ -1221,9 +1089,7 @@ def interval_crossing_arcs(self, chrom, start, end, strand, ilist): if strand == -1: return [ a - for a in self.fetch( - chrom, max(0, start), min(end, hg.chrLen[hg.chrNum(chrom)]) - ) + for a in self.fetch(chrom, max(0, start), min(end, hg.chrLen[hg.chrNum(chrom)])) if not a.is_unmapped and a.is_reverse and ( @@ -1246,9 +1112,7 @@ def interval_crossing_arcs(self, chrom, start, end, strand, ilist): else: return [ a - for a in self.fetch( - chrom, max(0, start), min(end, hg.chrLen[hg.chrNum(chrom)]) - ) + for a in self.fetch(chrom, max(0, start), min(end, hg.chrLen[hg.chrNum(chrom)])) if not a.is_unmapped and not a.is_reverse and ( @@ -1294,9 +1158,7 @@ def get_mates(self, a): self.get_mates_time += time() - gmt return retval - def pair_support_count( - self, chrom, position, strand, meanshift, 
foldup=False, sensitivems=True - ): + def pair_support_count(self, chrom, position, strand, meanshift, foldup=False, sensitivems=True): # str(hg.interval(f[0][0][0].chrom, f[0][0][0].start, f[0][-1][0].end)), f[1], f[2] cd = 1 for fi in range(len(meanshift)): @@ -1327,11 +1189,7 @@ def pair_support_count( if self.sensitivems and sensitivems: cd = min(cd, 10) pcount = max( - mc[4] - * cd - / 20.0 - * ((self.insert_size - self.read_length) / 2 / self.read_length) - * mc[12], + mc[4] * cd / 20.0 * ((self.insert_size - self.read_length) / 2 / self.read_length) * mc[12], 2, ) pmincount = mc[11] @@ -1352,8 +1210,7 @@ def concordant_edge(self, v, bp_margin=0): and a.next_reference_name == v.chrom and a.next_reference_start >= v.pos and a.reference_start < v.pos - bp_margin - and a.next_reference_start - < a.reference_start + self.max_insert - self.read_length + and a.next_reference_start < a.reference_start + self.max_insert - self.read_length ] if len(dlist) > self.pair_support: v2 = breakpoint_vertex( @@ -1362,12 +1219,7 @@ def concordant_edge(self, v, bp_margin=0): -1, ) logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " concordant edges " - + str(v) - + " " - + str(len(dlist)) + "#TIME " + "%.3f\t" % (time() - TSTART) + " concordant edges " + str(v) + " " + str(len(dlist)) ) return (breakpoint_edge(v, v2), dlist) else: @@ -1380,8 +1232,7 @@ def concordant_edge(self, v, bp_margin=0): and a.next_reference_name == v.chrom and a.next_reference_start >= v.pos and a.reference_start < v.pos - bp_margin - and a.next_reference_start - < a.reference_start + self.max_insert - self.read_length + and a.next_reference_start < a.reference_start + self.max_insert - self.read_length ] if len(dlist) > self.pair_support: v2 = breakpoint_vertex( @@ -1390,21 +1241,10 @@ def concordant_edge(self, v, bp_margin=0): 1, ) logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " concordant edges " - + str(v) - + " " - + str(len(dlist)) + "#TIME " + "%.3f\t" % (time() - TSTART) + " concordant edges " + str(v) + " " + str(len(dlist)) ) return (breakpoint_edge(v, v2), dlist) - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " concordant edges " - + str(v) - + " not found" - ) + logging.debug("#TIME " + "%.3f\t" % (time() - TSTART) + " concordant edges " + str(v) + " not found") return (None, dlist) def foldup_count(self, chrom, position, strand, cdiff=-1): @@ -1445,59 +1285,33 @@ def refine_discordant_edge(self, e): # logging.debug("#TIME " + '%.3f\t'%(time() - TSTART) + " refine discordant edge " + str(e)) v1min = max( 0, - ( - e.v1.pos - self.max_insert + self.read_length - if e.v1.strand == 1 - else e.v1.pos - ) - - 1, + (e.v1.pos - self.max_insert + self.read_length if e.v1.strand == 1 else e.v1.pos) - 1, ) v2min = max( 0, - ( - e.v2.pos - self.max_insert + self.read_length - if e.v2.strand == 1 - else e.v2.pos - ) - - 1, + (e.v2.pos - self.max_insert + self.read_length if e.v2.strand == 1 else e.v2.pos) - 1, ) v1max = ( min( - e.v1.pos + self.max_insert - self.read_length - if e.v1.strand == -1 - else e.v1.pos, + e.v1.pos + self.max_insert - self.read_length if e.v1.strand == -1 else e.v1.pos, hg.chrLen[hg.chrNum(e.v1.chrom)], ) - 1 ) v2max = ( min( - e.v2.pos + self.max_insert - self.read_length - if e.v2.strand == -1 - else e.v2.pos, + e.v2.pos + self.max_insert - self.read_length if e.v2.strand == -1 else e.v2.pos, hg.chrLen[hg.chrNum(e.v2.chrom)], ) - 1 ) d1list = [a for a in self.fetch(e.v1.chrom, v1min, v1max) if not a.is_unmapped] d2list = [a for a in self.fetch(e.v2.chrom, 
v2min, v2max) if not a.is_unmapped] - d1Set = set( - [(a.query_name, a.is_read1, a.is_reverse, a.is_secondary) for a in d1list] - ) + d1Set = set([(a.query_name, a.is_read1, a.is_reverse, a.is_secondary) for a in d1list]) if e.v1.strand == e.v2.strand: - d2Set = set( - [ - (a.query_name, a.is_read1, not a.is_reverse, not a.is_secondary) - for a in d2list - ] - ) + d2Set = set([(a.query_name, a.is_read1, not a.is_reverse, not a.is_secondary) for a in d2list]) else: - d2Set = set( - [ - (a.query_name, a.is_read1, a.is_reverse, not a.is_secondary) - for a in d2list - ] - ) + d2Set = set([(a.query_name, a.is_read1, a.is_reverse, not a.is_secondary) for a in d2list]) rSet = d1Set.intersection(d2Set) if len(rSet) == 0: return (e, 0, [], None) @@ -1516,12 +1330,8 @@ def refine_discordant_edge(self, e): not a.is_reverse, not a.is_secondary, ) in d2reads: - multi_r.add( - (a.query_name, a.is_read1, not a.is_reverse, not a.is_secondary) - ) - d2reads[ - (a.query_name, a.is_read1, not a.is_reverse, not a.is_secondary) - ] = a + multi_r.add((a.query_name, a.is_read1, not a.is_reverse, not a.is_secondary)) + d2reads[(a.query_name, a.is_read1, not a.is_reverse, not a.is_secondary)] = a else: for a in d2list: if ( @@ -1530,12 +1340,8 @@ def refine_discordant_edge(self, e): a.is_reverse, not a.is_secondary, ) in d2reads: - multi_r.add( - (a.query_name, a.is_read1, a.is_reverse, not a.is_secondary) - ) - d2reads[ - (a.query_name, a.is_read1, a.is_reverse, not a.is_secondary) - ] = a + multi_r.add((a.query_name, a.is_read1, a.is_reverse, not a.is_secondary)) + d2reads[(a.query_name, a.is_read1, a.is_reverse, not a.is_secondary)] = a dpairs = defaultdict(lambda: [], {}) for aa in rSet: @@ -1571,10 +1377,7 @@ def refine_discordant_edge(self, e): if a1.is_reverse: r1 = ( a1.infer_query_length() + a1clip_suffix - a1.query_alignment_end, - a1.infer_query_length() - + a1clip_suffix - - a1.query_alignment_start - - 1, + a1.infer_query_length() + a1clip_suffix - a1.query_alignment_start - 1, ) else: r1 = ( @@ -1584,10 +1387,7 @@ def refine_discordant_edge(self, e): if a2.is_reverse: r2 = ( a2.infer_query_length() + a2clip_suffix - a2.query_alignment_end, - a2.infer_query_length() - + a2clip_suffix - - a2.query_alignment_start - - 1, + a2.infer_query_length() + a2clip_suffix - a2.query_alignment_start - 1, ) else: r2 = ( @@ -1649,22 +1449,14 @@ def refine_discordant_edge(self, e): a = dpairs[max_p[0]][0][0] if hom >= 0: if vstrand == 1: - hom_seq = a.query_sequence[ - a.query_alignment_end - hom : a.query_alignment_end - ] + hom_seq = a.query_sequence[a.query_alignment_end - hom : a.query_alignment_end] else: - hom_seq = a.query_sequence[ - a.query_alignment_start : a.query_alignment_start + hom - ] + hom_seq = a.query_sequence[a.query_alignment_start : a.query_alignment_start + hom] else: if vstrand == 1: - hom_seq = a.query_sequence[ - a.query_alignment_end : a.query_alignment_end + abs(hom) - ] + hom_seq = a.query_sequence[a.query_alignment_end : a.query_alignment_end + abs(hom)] else: - hom_seq = a.query_sequence[ - a.query_alignment_start - abs(hom) : a.query_alignment_start - ] + hom_seq = a.query_sequence[a.query_alignment_start - abs(hom) : a.query_alignment_start] p1 = max_p[0][1] p2 = max_p[0][2] logging.debug( @@ -1699,11 +1491,7 @@ def refine_discordant_edge(self, e): def edge_has_high_mapq(self, read_list): bp1_mapq = max([rr[0].mapping_quality for rr in read_list]) bp2_mapq = max([rr[1].mapping_quality for rr in read_list]) - logging.debug( - "#TIME " - + "%.3f\t" % (time() - self.tstart) - + " 
breakpoint_mapq: %d %d" % (bp1_mapq, bp2_mapq) - ) + logging.debug("#TIME " + "%.3f\t" % (time() - self.tstart) + " breakpoint_mapq: %d %d" % (bp1_mapq, bp2_mapq)) if bp1_mapq < self.breakpoint_mapping_quality_cutoff: return False if bp2_mapq < self.breakpoint_mapping_quality_cutoff: @@ -1716,11 +1504,7 @@ def edge_has_high_entropy(self, read_list): [ stats.entropy( np.unique( - [ - x - for x in rr[0].get_reference_sequence().upper() - if x != "N" - ], + [x for x in rr[0].get_reference_sequence().upper() if x != "N"], return_counts=True, )[1] ) @@ -1731,11 +1515,7 @@ def edge_has_high_entropy(self, read_list): [ stats.entropy( np.unique( - [ - x - for x in rr[1].get_reference_sequence().upper() - if x != "N" - ], + [x for x in rr[1].get_reference_sequence().upper() if x != "N"], return_counts=True, )[1] ) @@ -1748,11 +1528,7 @@ def edge_has_high_entropy(self, read_list): [ stats.entropy( np.unique( - [ - x - for x in rr[0].query_alignment_sequence.upper() - if x != "N" - ], + [x for x in rr[0].query_alignment_sequence.upper() if x != "N"], return_counts=True, )[1] ) @@ -1763,11 +1539,7 @@ def edge_has_high_entropy(self, read_list): [ stats.entropy( np.unique( - [ - x - for x in rr[1].query_alignment_sequence.upper() - if x != "N" - ], + [x for x in rr[1].query_alignment_sequence.upper() if x != "N"], return_counts=True, )[1] ) @@ -1776,9 +1548,7 @@ def edge_has_high_entropy(self, read_list): ) logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " breakpoint_entropy: %.3f %.3f" % (bp1_entropy, bp2_entropy) + "#TIME " + "%.3f\t" % (time() - TSTART) + " breakpoint_entropy: %.3f %.3f" % (bp1_entropy, bp2_entropy) ) if bp1_entropy < self.breakpoint_entropy_cutoff: return False @@ -1787,12 +1557,7 @@ def edge_has_high_entropy(self, read_list): return True def edge_passes_filters(self, read_list, e=None): - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " edge_breakpoint_filter: " - + str(e) - ) + logging.debug("#TIME " + "%.3f\t" % (time() - TSTART) + " edge_breakpoint_filter: " + str(e)) if self.edge_has_high_mapq(read_list) and self.edge_has_high_entropy(read_list): return True return False @@ -1805,10 +1570,7 @@ def sa_tag_overlaps_primary(self, a): return False if (t[2] == "+") != a.is_reverse: return False - if ( - min(abs(int(t[1]) - a.reference_start), abs(int(t[1]) - a.reference_end)) - > self.read_length - ): + if min(abs(int(t[1]) - a.reference_start), abs(int(t[1]) - a.reference_end)) > self.read_length: return False return True @@ -1827,9 +1589,7 @@ def sa_tag_mismatch_breakpoint(self, a, bp): return True cigar_counts = [int(i) for i in re.findall(r"\d+", t[3])] cigar_op = [i for i in re.findall(r"\D", t[3])] - sa_ref_len = sum( - [i[0] for i in zip(cigar_counts, cigar_op) if i[1] in "MDNX"] - ) + sa_ref_len = sum([i[0] for i in zip(cigar_counts, cigar_op) if i[1] in "MDNX"]) if abs(int(t[1]) + sa_ref_len - bp.pos) > 10: return True return False @@ -1842,12 +1602,7 @@ def interval_discordant_edges( ms=None, amplicon_name=None, ): - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " searching discordant edges in " - + str(interval) - ) + logging.debug("#TIME " + "%.3f\t" % (time() - TSTART) + " searching discordant edges in " + str(interval)) if pair_support == -1: pair_support = self.pair_support if type(interval) != hg.interval_list: @@ -1887,8 +1642,7 @@ def interval_discordant_edges( and not ( a.reference_name == a.next_reference_name and a.mate_is_reverse - and abs(a.reference_start - a.next_reference_start) - < self.max_insert + and 
abs(a.reference_start - a.next_reference_start) < self.max_insert ) ] # this section catches everted sequencing artifacts drlist += [ @@ -1905,8 +1659,7 @@ def interval_discordant_edges( and not ( a.reference_name == a.next_reference_name and not a.mate_is_reverse - and abs(a.reference_start - a.next_reference_start) - < self.max_insert + and abs(a.reference_start - a.next_reference_start) < self.max_insert ) ] # this section catches everted sequencing artifacts # logging.debug("#TIME " + '%.3f\t'%(time() - TSTART) + " discordant edges: fetch discordant " + str(interval) + " " + str(len(dflist)) + " " + str(len(drlist))) @@ -1919,8 +1672,7 @@ def interval_discordant_edges( logging.debug( "#TIME " + "%.3f\t" % (time() - TSTART) - + " discordant edges: discordant read pairs found: %s %s %s" - % (str(interval), len(dflist), len(drlist)) + + " discordant edges: discordant read pairs found: %s %s %s" % (str(interval), len(dflist), len(drlist)) ) # perform biclustering for readpairs using union-find algorithm to give sets of connected read-pairs clist @@ -1930,10 +1682,8 @@ def interval_discordant_edges( for a in dflist + drlist: vlist.append( ( - hg.absPos(a.reference_name, a.reference_start) - * (-1 if a.is_reverse else 1), - hg.absPos(a.next_reference_name, a.next_reference_start) - * (-1 if a.mate_is_reverse else 1), + hg.absPos(a.reference_name, a.reference_start) * (-1 if a.is_reverse else 1), + hg.absPos(a.next_reference_name, a.next_reference_start) * (-1 if a.mate_is_reverse else 1), a, vcount, ) @@ -2027,13 +1777,9 @@ def interval_discordant_edges( hgl.append(hgv) hgl.sort() if c.is_reverse: - mcdrlist.extend( - hgl.merge_clusters(extend=self.max_insert - self.read_length) - ) + mcdrlist.extend(hgl.merge_clusters(extend=self.max_insert - self.read_length)) else: - mcdflist.extend( - hgl.merge_clusters(extend=self.max_insert - self.read_length) - ) + mcdflist.extend(hgl.merge_clusters(extend=self.max_insert - self.read_length)) # logging.debug("#TIME " + '%.3f\t'%(time() - TSTART) + " discordant edges: discordant clusters found: %s %d %d " % (str(interval), len(mcdflist), len(mcdrlist))) @@ -2057,21 +1803,12 @@ def interval_discordant_edges( neighbor_hglist = hg.interval_list( [ a2[0] - for a2 in neighbor_hglist.merge_clusters( - extend=self.max_insert - self.read_length - ) + for a2 in neighbor_hglist.merge_clusters(extend=self.max_insert - self.read_length) if len(a2[1]) >= pair_support ] ) for c2 in mcdflist + mcdrlist: - if ( - len( - hg.interval_list([c2[0]]).intersection( - neighbor_hglist, extend=self.max_insert - ) - ) - == 0 - ): + if len(hg.interval_list([c2[0]]).intersection(neighbor_hglist, extend=self.max_insert)) == 0: continue vl = [] vlSet = set() @@ -2081,16 +1818,11 @@ def interval_discordant_edges( for a2 in c2[1]: aq1 = hgddict[a1] aq2 = hgddict[a2] - if ( - aq1.query_name == aq2.query_name - and aq1.is_read1 != aq2.is_read1 - ): + if aq1.query_name == aq2.query_name and aq1.is_read1 != aq2.is_read1: if ( aq1.reference_name == aq2.reference_name - and abs(aq1.reference_start - aq2.reference_start) - < self.read_length - and abs(aq1.reference_end - aq2.reference_end) - < self.read_length + and abs(aq1.reference_start - aq2.reference_start) < self.read_length + and abs(aq1.reference_end - aq2.reference_end) < self.read_length and aq1.is_reverse != aq2.is_reverse ): continue @@ -2099,8 +1831,7 @@ def interval_discordant_edges( and aq1.is_reverse and not aq2.is_reverse and aq1.reference_start - aq2.reference_end + 1 > 0 - and aq1.reference_start - aq2.reference_end + 1 - 
< self.max_insert - 2 * self.read_length + and aq1.reference_start - aq2.reference_end + 1 < self.max_insert - 2 * self.read_length ): continue if ( @@ -2108,8 +1839,7 @@ def interval_discordant_edges( and aq2.is_reverse and not aq1.is_reverse and aq2.reference_start - aq1.reference_end + 1 > 0 - and aq2.reference_start - aq1.reference_end + 1 - < self.max_insert - 2 * self.read_length + and aq2.reference_start - aq1.reference_end + 1 < self.max_insert - 2 * self.read_length ): continue vl.append((aq1, aq2)) @@ -2123,75 +1853,37 @@ def interval_discordant_edges( ) vl1Set.add((aq1.reference_start, aq1.reference_end)) vl2Set.add((aq2.reference_start, aq2.reference_end)) - if ( - len(vl) == 0 - or len( - [ - v - for v in vl - if v[1].reference_start * v[0].reference_start > 0 - ] - ) - == 0 - ): + if len(vl) == 0 or len([v for v in vl if v[1].reference_start * v[0].reference_start > 0]) == 0: continue if not vl[0][0].is_reverse: bp1 = breakpoint_vertex( c1[0].chrom, - max( - [ - v[0].reference_end - 1 - for v in vl - if v[0].reference_start > 0 - ] - ), + max([v[0].reference_end - 1 for v in vl if v[0].reference_start > 0]), 1, ) else: bp1 = breakpoint_vertex( c1[0].chrom, - min( - [ - v[0].reference_start - for v in vl - if v[0].reference_start > 0 - ] - ), + min([v[0].reference_start for v in vl if v[0].reference_start > 0]), -1, ) if not vl[0][1].is_reverse: bp2 = breakpoint_vertex( c2[0].chrom, - max( - [ - v[1].reference_end - 1 - for v in vl - if v[1].reference_start > 0 - ] - ), + max([v[1].reference_end - 1 for v in vl if v[1].reference_start > 0]), 1, ) else: bp2 = breakpoint_vertex( c2[0].chrom, - min( - [ - v[1].reference_start - for v in vl - if v[1].reference_start > 0 - ] - ), + min([v[1].reference_start for v in vl if v[1].reference_start > 0]), -1, ) if ms is None: ps = pair_support else: ps = self.pair_support_count(bp1.chrom, bp1.pos, bp1.strand, ms) - if ( - len(vl) < ps - or len(vl1Set) < pair_support - or len(vl2Set) < pair_support - ): + if len(vl) < ps or len(vl1Set) < pair_support or len(vl2Set) < pair_support: continue if ( @@ -2206,11 +1898,7 @@ def interval_discordant_edges( bp1c = None bp2c = None vl2 = [] - if ( - bp1.chrom == bp2.chrom - and bp1.strand == bp2.strand - and abs(bp1.pos - bp2.pos) <= self.read_length - ): + if bp1.chrom == bp2.chrom and bp1.strand == bp2.strand and abs(bp1.pos - bp2.pos) <= self.read_length: non_inverted_reads = set() multiple_non_inverted = False if bp1.strand == 1: @@ -2245,8 +1933,7 @@ def interval_discordant_edges( "checking foldback2: " + str(bp1) + str(bp2) - + " %s %s %d %d %d" - % (bp1.strand, bp2.strand, len(vl), num_inverted, ps) + + " %s %s %d %d %d" % (bp1.strand, bp2.strand, len(vl), num_inverted, ps) ) if len(vl2) < ps or (not multiple_non_inverted): @@ -2265,10 +1952,7 @@ def interval_discordant_edges( for v1 in vl if v1[0].reference_end <= v[0].reference_end and v1[0].reference_start - > v[0].reference_end - - 1 - - self.max_insert - + 2 * self.read_length + > v[0].reference_end - 1 - self.max_insert + 2 * self.read_length ] ) > maxn @@ -2280,40 +1964,25 @@ def interval_discordant_edges( for v1 in vl if v1[0].reference_end <= v[0].reference_end and v1[0].reference_end - > v[0].reference_end - - self.max_insert - + 2 * self.read_length + > v[0].reference_end - self.max_insert + 2 * self.read_length ] ) vl = [ v for v in vl if v[0].reference_end - 1 <= maxp - and v[0].reference_end - 1 - > maxp - self.max_insert + 2 * self.read_length + and v[0].reference_end - 1 > maxp - self.max_insert + 2 * self.read_length ] if 
len(vl) < ps: continue bp1 = breakpoint_vertex( c1[0].chrom, - max( - [ - v[0].reference_end - 1 - for v in vl - if v[0].reference_start > 0 - ] - ), + max([v[0].reference_end - 1 for v in vl if v[0].reference_start > 0]), 1, ) bp2 = breakpoint_vertex( c2[0].chrom, - max( - [ - v[1].reference_end - 1 - for v in vl - if v[1].reference_start > 0 - ] - ), + max([v[1].reference_end - 1 for v in vl if v[1].reference_start > 0]), 1, ) if bp1.pos != bp2.pos: @@ -2330,9 +1999,7 @@ def interval_discordant_edges( for v1 in vl if v1[0].reference_start >= v[0].reference_start and v1[0].reference_start - < v[0].reference_start - + self.max_insert - - 2 * self.read_length + < v[0].reference_start + self.max_insert - 2 * self.read_length ] ) > maxn @@ -2344,40 +2011,25 @@ def interval_discordant_edges( for v1 in vl if v1[0].reference_start >= v[0].reference_start and v1[0].reference_start - < v[0].reference_start - + self.max_insert - - 2 * self.read_length + < v[0].reference_start + self.max_insert - 2 * self.read_length ] ) vl = [ v for v in vl if v[0].reference_start >= maxp - and v[0].reference_start - < maxp + self.max_insert - 2 * self.read_length + and v[0].reference_start < maxp + self.max_insert - 2 * self.read_length ] if len(vl) < ps: continue bp1 = breakpoint_vertex( c1[0].chrom, - min( - [ - v[0].reference_start - for v in vl - if v[0].reference_start > 0 - ] - ), + min([v[0].reference_start for v in vl if v[0].reference_start > 0]), -1, ) bp2 = breakpoint_vertex( c2[0].chrom, - min( - [ - v[1].reference_start - for v in vl - if v[1].reference_start > 0 - ] - ), + min([v[1].reference_start for v in vl if v[1].reference_start > 0]), -1, ) if bp1.pos != bp2.pos: @@ -2386,20 +2038,11 @@ def interval_discordant_edges( bre_refine = self.refine_discordant_edge(breakpoint_edge(bp1, bp2)) bre = bre_refine[0] - if ( - bp1.chrom == bp2.chrom - and bp1.strand == bp2.strand - and abs(bp1.pos - bp2.pos) <= self.read_length - ): + if bp1.chrom == bp2.chrom and bp1.strand == bp2.strand and abs(bp1.pos - bp2.pos) <= self.read_length: qname_exclude = set([]) for v in vl: - if ( - bp1.strand == 1 - and max(v[0].reference_start, v[1].reference_start) - > bre.v1.pos - ) or ( - bp1.strand == -1 - and max(v[0].reference_end, v[1].reference_end) < bre.v1.pos + if (bp1.strand == 1 and max(v[0].reference_start, v[1].reference_start) > bre.v1.pos) or ( + bp1.strand == -1 and max(v[0].reference_end, v[1].reference_end) < bre.v1.pos ): qname_exclude.add(v[0].query_name) continue @@ -2413,21 +2056,13 @@ def interval_discordant_edges( continue if ( bp1.strand == 1 - and bre.v1.pos - - v[0].reference_start - + bre.v2.pos - - v[1].reference_start - > self.max_insert + and bre.v1.pos - v[0].reference_start + bre.v2.pos - v[1].reference_start > self.max_insert ): qname_exclude.add(v[0].query_name) continue if ( bp2.strand == 1 - and v[0].reference_end - - bre.v1.pos - + v[1].reference_end - - bre.v2.pos - > self.max_insert + and v[0].reference_end - bre.v1.pos + v[1].reference_end - bre.v2.pos > self.max_insert ): qname_exclude.add(v[0].query_name) continue @@ -2435,19 +2070,14 @@ def interval_discordant_edges( if len(vl) < ps: continue - if ( - bre.type() == "everted" - and abs(bre.v1.pos - bre.v2.pos) < self.max_insert - ): + if bre.type() == "everted" and abs(bre.v1.pos - bre.v2.pos) < self.max_insert: logging.debug("skipping everted edge " + str(bp1) + str(bp2)) continue if bre.type() != "concordant": if self.edge_passes_filters(vl, bre): dnlist0.append((bre, len(vl))) if bp1c is not None and bp2c is not None: - 
brec_refine = self.refine_discordant_edge( - breakpoint_edge(bp1c, bp2c) - ) + brec_refine = self.refine_discordant_edge(breakpoint_edge(bp1c, bp2c)) brec = brec_refine[0] if brec.type() != "concordant" and brec.v1.pos != brec.v2.pos: if self.edge_passes_filters(vl, brec): @@ -2465,17 +2095,13 @@ def interval_discordant_edges( ): continue if ( - (bre2.v2.chrom, bre2.v2.pos, bre2.v2.strand) - == (bre1.v1.chrom, bre1.v1.pos, bre1.v1.strand) - and (bre2.v1.chrom, bre2.v1.pos, bre2.v1.strand) - == (bre1.v2.chrom, bre1.v2.pos, bre1.v2.strand) + (bre2.v2.chrom, bre2.v2.pos, bre2.v2.strand) == (bre1.v1.chrom, bre1.v1.pos, bre1.v1.strand) + and (bre2.v1.chrom, bre2.v1.pos, bre2.v1.strand) == (bre1.v2.chrom, bre1.v2.pos, bre1.v2.strand) ) and bb1 not in dnlist: dnlist.append(bb1) continue if len(dnlist) != len(dnlist0): - logging.debug( - "dnlists do not match " + str(len(dnlist0)) + " " + str(len(dnlist)) - ) + logging.debug("dnlists do not match " + str(len(dnlist0)) + " " + str(len(dnlist))) for bb1 in dnlist0: if bb1 not in dnlist: logging.debug("dnlist0: " + str(bb1[0]) + " " + str(bb1[1])) @@ -2521,12 +2147,7 @@ def interval_discordant_edges( nmatelist = [ a for a in nmatelist - if len( - hg.interval_list( - [hg.interval(a, bamfile=self.bamfile)] - ).intersection(ilist) - ) - == 0 + if len(hg.interval_list([hg.interval(a, bamfile=self.bamfile)]).intersection(ilist)) == 0 ] intersection_time += time() - ict nlist += nmatelist @@ -2548,22 +2169,13 @@ def interval_discordant_edges( vl1Set = set() vl2Set = set() if filter_repeats: - if ( - len( - hg.interval_list([cn[0]]).intersection(hg.conserved_regions) - ) - > 0 - ): + if len(hg.interval_list([cn[0]]).intersection(hg.conserved_regions)) > 0: continue hgmi = 0 for hgm in cn[1]: hgmi += 1 if filter_repeats: - if ( - hgm.filter_repeat() - or hgndict[hgm].mapping_quality - <= self.mapping_quality_cutoff - ): + if hgm.filter_repeat() or hgndict[hgm].mapping_quality <= self.mapping_quality_cutoff: continue for a in self.get_mates(hgndict[hgm]): if filter_repeats: @@ -2590,17 +2202,7 @@ def interval_discordant_edges( ) ) break - if ( - len(vl) == 0 - or len( - [ - v - for v in vl - if v[1].reference_start * v[0].reference_start > 0 - ] - ) - == 0 - ): + if len(vl) == 0 or len([v for v in vl if v[1].reference_start * v[0].reference_start > 0]) == 0: continue if not vl[0][0].is_reverse: bp1 = breakpoint_vertex( @@ -2611,13 +2213,7 @@ def interval_discordant_edges( else: bp1 = breakpoint_vertex( vl[0][0].reference_name, - min( - [ - v[0].reference_start - for v in vl - if v[0].reference_start > 0 - ] - ), + min([v[0].reference_start for v in vl if v[0].reference_start > 0]), -1, ) if not vl[0][1].is_reverse: @@ -2629,13 +2225,7 @@ def interval_discordant_edges( else: bp2 = breakpoint_vertex( vl[0][1].reference_name, - min( - [ - v[1].reference_start - for v in vl - if v[1].reference_start > 0 - ] - ), + min([v[1].reference_start for v in vl if v[1].reference_start > 0]), -1, ) if ms is None: @@ -2643,20 +2233,12 @@ def interval_discordant_edges( else: ps = self.pair_support_count(bp1.chrom, bp1.pos, bp1.strand, ms) - if ( - len(vl) < ps - or len(vl1Set) < pair_support - or len(vl2Set) < pair_support - ): + if len(vl) < ps or len(vl1Set) < pair_support or len(vl2Set) < pair_support: continue num_inverted = 0 non_inverted_reads = set() multiple_non_inverted = False - if ( - bp1.chrom == bp2.chrom - and bp1.pos == bp2.pos - and bp1.strand == bp2.strand - ): + if bp1.chrom == bp2.chrom and bp1.pos == bp2.pos and bp1.strand == bp2.strand: if bp1.strand == 
1: for v in vl: if v[0].reference_start == v[1].reference_start: @@ -2690,9 +2272,7 @@ def interval_discordant_edges( + " " + str(self.get_mates_num_calls) ) - dnlist.sort( - key=lambda x: hg.absPos(x[0].v1.chrom, x[0].v1.pos) + 0.5 * x[0].v1.strand - ) + dnlist.sort(key=lambda x: hg.absPos(x[0].v1.chrom, x[0].v1.pos) + 0.5 * x[0].v1.strand) for e in dnlist: logging.debug( "#TIME %.3f\tdiscordant edges %s %s %s %s %d %f" @@ -2735,9 +2315,7 @@ def load_edges(self, edge_file): hom_seq = "" e = breakpoint_edge(el[0], hom=hom, hom_seq=hom_seq) edges.append((e, int(el[1]))) - edges.sort( - key=lambda x: hg.absPos(x[0].v1.chrom, x[0].v1.pos) + 0.1 * x[0].v1.strand - ) + edges.sort(key=lambda x: hg.absPos(x[0].v1.chrom, x[0].v1.pos) + 0.1 * x[0].v1.strand) return edges def get_sensitive_discordant_edges( @@ -2753,32 +2331,21 @@ def get_sensitive_discordant_edges( gcc=False, amplicon_name=None, ): - if amplicon_name is not None and os.path.exists( - "%s_edges_cnseg.txt" % amplicon_name - ): + if amplicon_name is not None and os.path.exists("%s_edges_cnseg.txt" % amplicon_name): return self.load_edges("%s_edges_cnseg.txt" % amplicon_name) if amplicon_name is not None and os.path.exists("%s_edges.txt" % amplicon_name): eilist = self.load_edges("%s_edges.txt" % amplicon_name) else: if eilist is None: if adaptive_counts: - eilist = self.interval_discordant_edges( - ilist, ms=msrlist, pair_support=pair_support - ) + eilist = self.interval_discordant_edges(ilist, ms=msrlist, pair_support=pair_support) else: - eilist = self.interval_discordant_edges( - ilist, pair_support=pair_support - ) - eilist.sort( - key=lambda x: hg.absPos(x[0].v1.chrom, x[0].v1.pos) - + 0.1 * x[0].v1.strand - ) + eilist = self.interval_discordant_edges(ilist, pair_support=pair_support) + eilist.sort(key=lambda x: hg.absPos(x[0].v1.chrom, x[0].v1.pos) + 0.1 * x[0].v1.strand) if amplicon_name is not None: edge_file = open("%s_edges.txt" % amplicon_name, "w") for e in eilist: - edge_file.write( - "%s\t%s\t%s\t%s\n" % (str(e[0]), e[1], e[0].hom, e[0].hom_seq) - ) + edge_file.write("%s\t%s\t%s\t%s\n" % (str(e[0]), e[1], e[0].hom, e[0].hom_seq)) edge_file.close() eiSet = set( [ @@ -2796,9 +2363,7 @@ def get_sensitive_discordant_edges( for i, msr in zip(ilist, msrlist): elist = [] for e in eilist: - if e[0].v1.pos != -1 and hg.interval( - e[0].v1.chrom, e[0].v1.pos, e[0].v1.pos - ).intersects(i): + if e[0].v1.pos != -1 and hg.interval(e[0].v1.chrom, e[0].v1.pos, e[0].v1.pos).intersects(i): elist.append(e) ms_vlist = [] msv_index = {} @@ -2817,11 +2382,8 @@ def get_sensitive_discordant_edges( msve = [ e for e in elist - if e[0].v1.strand - * (msr[msi].info["cn"] - msr[msi + 1].info["cn"]) - > 0 - and abs(e[0].v1.pos - msr[msi].end) - < self.max_insert + ms_window_size1 + if e[0].v1.strand * (msr[msi].info["cn"] - msr[msi + 1].info["cn"]) > 0 + and abs(e[0].v1.pos - msr[msi].end) < self.max_insert + ms_window_size1 ] if len(msve) == 0: # print("finesearch discordant edges", i.chrom, str(msr[msi]), str(msr[msi + 1])) @@ -2838,9 +2400,7 @@ def get_sensitive_discordant_edges( [ e for e in efine - if e[0].v1.strand - * (msr[msi].info["cn"] - msr[msi + 1].info["cn"]) - > 0 + if e[0].v1.strand * (msr[msi].info["cn"] - msr[msi + 1].info["cn"]) > 0 ] ) > 0 @@ -2850,12 +2410,7 @@ def get_sensitive_discordant_edges( [ (e[1], e[0]) for e in efine - if e[0].v1.strand - * ( - msr[msi].info["cn"] - - msr[msi + 1].info["cn"] - ) - > 0 + if e[0].v1.strand * (msr[msi].info["cn"] - msr[msi + 1].info["cn"]) > 0 and abs(e[0].v1.pos - msv.pos) < 
ms_window_size1 ] ) @@ -2865,12 +2420,7 @@ def get_sensitive_discordant_edges( [ (e[1], e[0]) for e in efine - if e[0].v1.strand - * ( - msr[msi].info["cn"] - - msr[msi + 1].info["cn"] - ) - > 0 + if e[0].v1.strand * (msr[msi].info["cn"] - msr[msi + 1].info["cn"]) > 0 and abs(e[0].v1.pos - msv.pos) < ms_window_size1 ] ) @@ -2879,12 +2429,7 @@ def get_sensitive_discordant_edges( [ (e[1], e[0]) for e in efine - if e[0].v1.strand - * ( - msr[msi].info["cn"] - - msr[msi + 1].info["cn"] - ) - > 0 + if e[0].v1.strand * (msr[msi].info["cn"] - msr[msi + 1].info["cn"]) > 0 ] ) ebest = (ebest[1], ebest[0]) @@ -2934,9 +2479,7 @@ def get_sensitive_discordant_edges( ) not in eiSet: eilist.append( ( - breakpoint_edge( - ebest[0].v2, ebest[0].v1 - ), + breakpoint_edge(ebest[0].v2, ebest[0].v1), ebest[1], ) ) @@ -2950,32 +2493,21 @@ def get_sensitive_discordant_edges( ebest[0].v1.strand, ) ) - elist.sort( - key=lambda x: hg.absPos(x[0].v1.chrom, x[0].v1.pos) - + 0.1 * x[0].v1.strand - ) - eilist.sort( - key=lambda x: hg.absPos(x[0].v1.chrom, x[0].v1.pos) - + 0.1 * x[0].v1.strand - ) + elist.sort(key=lambda x: hg.absPos(x[0].v1.chrom, x[0].v1.pos) + 0.1 * x[0].v1.strand) + eilist.sort(key=lambda x: hg.absPos(x[0].v1.chrom, x[0].v1.pos) + 0.1 * x[0].v1.strand) else: # print("msv end not refined", str(msr[msi]), str(msr[msi + 1])) msve = [ e for e in elist - if e[0].v1.strand - * (msr[msi].info["cn"] - msr[msi + 1].info["cn"]) - > 0 - and abs(e[0].v1.pos - msr[msi].end) - < self.max_insert + ms_window_size0 + if e[0].v1.strand * (msr[msi].info["cn"] - msr[msi + 1].info["cn"]) > 0 + and abs(e[0].v1.pos - msr[msi].end) < self.max_insert + ms_window_size0 ] if amplicon_name is not None: edge_file = open("%s_edges_cnseg.txt" % amplicon_name, "w") for e in eilist: - edge_file.write( - "%s\t%s\t%s\t%s\n" % (str(e[0]), e[1], e[0].hom, e[0].hom_seq) - ) + edge_file.write("%s\t%s\t%s\t%s\n" % (str(e[0]), e[1], e[0].hom, e[0].hom_seq)) edge_file.close() return eilist @@ -3011,17 +2543,9 @@ def interval_neighbors(self, i, ilist=[], rdlist=[], t=0, gcc=False): ms_window_size0 = 10000 ms_window_size1 = 300 merge_thresh = 100000 - logging.info( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " Calculating coverage meanshift segmentation" - ) + logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + " Calculating coverage meanshift segmentation") msrlist = [self.get_meanshift(i2, ms_window_size0, ms_window_size1, gcc)] - logging.info( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " Detecting breakpoint edges (interval neighbors)" - ) + logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + " Detecting breakpoint edges (interval neighbors)") edges = self.interval_discordant_edges(i2, ms=msrlist) edges = [(e[1], e[0]) for e in edges] edges.sort(reverse=True) @@ -3083,9 +2607,7 @@ def interval_neighbors(self, i, ilist=[], rdlist=[], t=0, gcc=False): nn = hg.interval_list([c[0] for c in mc]) for e in nn: logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " interval_neighbors: edges %s %s" % (str(i), str(e)) + "#TIME " + "%.3f\t" % (time() - TSTART) + " interval_neighbors: edges %s %s" % (str(i), str(e)) ) return nn @@ -3095,20 +2617,13 @@ def interval_hops(self, i=None, ilist=[], rdlist=[], gcc=False, explore=True): i = i[0] else: i1list = hg.interval_list([i]) - logging.debug( - "#TIME " + "%.3f\t" % (time() - TSTART) + " interval_hops: init " + str(i) - ) + logging.debug("#TIME " + "%.3f\t" % (time() - TSTART) + " interval_hops: init " + str(i)) ms_window_size0 = 10000 i2list = hg.interval_list([]) for i2 in 
i1list: ii = self.interval_extend(i2) logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " interval_hops: interval extend " - + str(i2) - + " " - + str(ii) + "#TIME " + "%.3f\t" % (time() - TSTART) + " interval_hops: interval extend " + str(i2) + " " + str(ii) ) i2list.append(ii) seen_list = hg.interval_list([]) @@ -3119,10 +2634,7 @@ def interval_hops(self, i=None, ilist=[], rdlist=[], gcc=False, explore=True): while len(seen_list) < 10 and len(unseen_list) > 0: icc = heapq.heappop(unseen_list) ic = icc[1] - if ( - explore == False - and len(hg.interval_list([ic]).intersection(i2list)) == 0 - ): + if explore == False and len(hg.interval_list([ic]).intersection(i2list)) == 0: seen_list.append(ic) continue logging.debug( @@ -3156,9 +2668,7 @@ def interval_hops(self, i=None, ilist=[], rdlist=[], gcc=False, explore=True): ) for ic2 in icn: logging.info( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " New neighbor: %s (weight=%d)" % (str(ic2), ic2.info) + "#TIME " + "%.3f\t" % (time() - TSTART) + " New neighbor: %s (weight=%d)" % (str(ic2), ic2.info) ) contained = False for i2 in clist: @@ -3166,10 +2676,7 @@ def interval_hops(self, i=None, ilist=[], rdlist=[], gcc=False, explore=True): contained = True if contained: continue - if ( - ic2.size() < 2 * ms_window_size0 - and len(self.interval_discordant_edges(ic2)) < 2 - ): + if ic2.size() < 2 * ms_window_size0 and len(self.interval_discordant_edges(ic2)) < 2: continue if explore or len(hg.interval_list([ic]).intersection(i2list)) > 0: heapq.heappush(unseen_list, (-ic2.info, ic2)) @@ -3191,11 +2698,7 @@ def interval_amplified(self, i, filter_conserved=True, filter_small=True): ms_window_size = 10000 num_w = 0 num_high = 0 - if ( - filter_small - and i.size() < 2 * ms_window_size - and len(self.interval_discordant_edges(i)) < 2 - ): + if filter_small and i.size() < 2 * ms_window_size and len(self.interval_discordant_edges(i)) < 2: return False wc = self.window_coverage(i, ms_window_size, exact=False) mc = self.median_coverage() @@ -3212,25 +2715,17 @@ def interval_amplified(self, i, filter_conserved=True, filter_small=True): arm_coverage[0] + 3.0 * mc[0] / 2.0, ): num_high += 1 - elif mc[0] >= arm_coverage[0] and w[1] > max( - mc[0] + 3 * mc[2], 5.0 * mc[0] / 2.0 - ): + elif mc[0] >= arm_coverage[0] and w[1] > max(mc[0] + 3 * mc[2], 5.0 * mc[0] / 2.0): num_high += 1 else: - if mc[0] < arm_coverage[0] and w[1] > arm_coverage[0] + 3 * mc[ - 2 - ] * math.sqrt(arm_coverage[0] / mc[0]): + if mc[0] < arm_coverage[0] and w[1] > arm_coverage[0] + 3 * mc[2] * math.sqrt(arm_coverage[0] / mc[0]): num_high += 1 elif mc[0] >= arm_coverage[0] and w[1] > mc[0] + 3 * mc[2]: num_high += 1 # wc_high = len([w for w in wc if w[1] > mc[1] + 3 * mc[2]]) if num_high > num_w / 5: return True - elif ( - filter_small == False - and i.size() < 2 * ms_window_size - and len(self.interval_discordant_edges(i)) >= 2 - ): + elif filter_small == False and i.size() < 2 * ms_window_size and len(self.interval_discordant_edges(i)) >= 2: return True else: return False @@ -3256,10 +2751,7 @@ def interval_extend(self, i, strand=0, i0=None): if extend_right >= 0: if right_size < 1: extend_right = -1 - elif ( - ic.end + right_size * ms_window_size - > hg.chrLen[hg.chrNum(ic.chrom)] - ): + elif ic.end + right_size * ms_window_size > hg.chrLen[hg.chrNum(ic.chrom)]: if self.interval_amplified( hg.interval(ic.chrom, ic.end, hg.chrLen[hg.chrNum(ic.chrom)]), filter_small=False, @@ -3288,18 +2780,14 @@ def interval_extend(self, i, strand=0, i0=None): if left_size < 1: extend_left = 
-1 elif ic.start - left_size * ms_window_size <= 1: - if self.interval_amplified( - hg.interval(ic.chrom, 1, ic.start), filter_small=False - ): + if self.interval_amplified(hg.interval(ic.chrom, 1, ic.start), filter_small=False): ic.start = 1 extend_left = -1 else: extend_left = 0 left_size = left_size / 2 elif self.interval_amplified( - hg.interval( - ic.chrom, ic.start - left_size * ms_window_size, ic.start - ), + hg.interval(ic.chrom, ic.start - left_size * ms_window_size, ic.start), filter_small=False, ): ic.start = ic.start - left_size * ms_window_size @@ -3341,23 +2829,17 @@ def interval_extend(self, i, strand=0, i0=None): ) for e in ide: if e[0].v1.strand == 1: - ic.end = min( - ic.end + 2 * ms_window_size, hg.chrLen[hg.chrNum(ic.chrom)] - ) + ic.end = min(ic.end + 2 * ms_window_size, hg.chrLen[hg.chrNum(ic.chrom)]) break if strand <= 0: - ide = self.interval_discordant_edges( - hg.interval(ic.chrom, max(0, ic.start - ms_window_size), ic.start - 1) - ) + ide = self.interval_discordant_edges(hg.interval(ic.chrom, max(0, ic.start - ms_window_size), ic.start - 1)) for e in ide: if e[0].v1.strand == -1: ic.start = max(ic.start - 2 * ms_window_size, 0) break # if ic.size() > ms_window_size: logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " interval_extend: %s, %s, %s" % (str(i), strand, str(ic)) + "#TIME " + "%.3f\t" % (time() - TSTART) + " interval_extend: %s, %s, %s" % (str(i), strand, str(ic)) ) return ic @@ -3380,18 +2862,10 @@ def interval_filter_vertices( all_msv = [] msv_diff = {} all_msv_nocover = [] + logging.info("#TIME " + "%.3f\t" % (time() - self.tstart) + " Calculating coverage meanshift segmentation") + msrlist = [self.get_meanshift(i, ms_window_size0, ms_window_size1, gcc) for i in ilist] logging.info( - "#TIME " - + "%.3f\t" % (time() - self.tstart) - + " Calculating coverage meanshift segmentation" - ) - msrlist = [ - self.get_meanshift(i, ms_window_size0, ms_window_size1, gcc) for i in ilist - ] - logging.info( - "#TIME " - + "%.3f\t" % (time() - self.tstart) - + " Detecting breakpoint edges (interval filter vertices" + "#TIME " + "%.3f\t" % (time() - self.tstart) + " Detecting breakpoint edges (interval filter vertices" ) sensitive_elist = self.get_sensitive_discordant_edges( ilist, @@ -3403,9 +2877,7 @@ def interval_filter_vertices( amplicon_name=amplicon_name, ) eilist = sensitive_elist - logging.info( - "#TIME " + "%.3f\t" % (time() - self.tstart) + " Building breakpoint graph" - ) + logging.info("#TIME " + "%.3f\t" % (time() - self.tstart) + " Building breakpoint graph") for i, msr in zip(ilist, msrlist): elist = [] for e in eilist: @@ -3431,21 +2903,15 @@ def interval_filter_vertices( msve = [ e for e in elist - if e[0].v1.strand - * (msr[msi + 1].info["cn"] - msr[msi].info["cn"]) - < 0 - and abs(e[0].v1.pos - msr[msi].end) - < self.max_insert + ms_window_size1 + if e[0].v1.strand * (msr[msi + 1].info["cn"] - msr[msi].info["cn"]) < 0 + and abs(e[0].v1.pos - msr[msi].end) < self.max_insert + ms_window_size1 ] else: msve = [ e for e in elist - if e[0].v1.strand - * (msr[msi + 1].info["cn"] - msr[msi].info["cn"]) - < 0 - and abs(e[0].v1.pos - msr[msi].end) - < self.max_insert + ms_window_size0 + if e[0].v1.strand * (msr[msi + 1].info["cn"] - msr[msi].info["cn"]) < 0 + and abs(e[0].v1.pos - msr[msi].end) < self.max_insert + ms_window_size0 ] if len(msve) > 0: msve_match[msv] = msve @@ -3459,14 +2925,10 @@ def interval_filter_vertices( ms_addlist = [] kce = defaultdict(lambda: 0) # number of concordant reads koe = defaultdict(lambda: 0.0) # number of 
reads mapping outside the interval - kbpe = defaultdict( - lambda: 0.0 - ) # number of discordant reads across breakpoint edge + kbpe = defaultdict(lambda: 0.0) # number of discordant reads across breakpoint edge new_graph = breakpoint_graph() s = new_graph.new_vertex(ilist[0].chrom, -1, -1) - for i, msr, ms_vlist, msv_nocover in zip( - ilist, msrlist, all_msv, all_msv_nocover - ): + for i, msr, ms_vlist, msv_nocover in zip(ilist, msrlist, all_msv, all_msv_nocover): ngvlist = [] elist = [] for e in eilist: @@ -3477,27 +2939,13 @@ def interval_filter_vertices( ei = 0 nei = 0 msi = 0 - if ( - len(elist) == 0 - or elist[ei][0].v1.strand == 1 - or elist[ei][0].v1.pos > i.start - ): - if ( - len(msv_nocover) == 0 - or msv_nocover[msi].strand == 1 - or msv_nocover[msi].pos > i.start - ): + if len(elist) == 0 or elist[ei][0].v1.strand == 1 or elist[ei][0].v1.pos > i.start: + if len(msv_nocover) == 0 or msv_nocover[msi].strand == 1 or msv_nocover[msi].pos > i.start: nv = new_graph.new_vertex(i.chrom, i.start, -1) ne = new_graph.new_edge(s, nv) - koe[ne] = len( - self.interval_crossing_arcs( - i.chrom, i.start, i.start + self.max_insert, -1, ilist - ) - ) + koe[ne] = len(self.interval_crossing_arcs(i.chrom, i.start, i.start + self.max_insert, -1, ilist)) else: # len(ms_vlist) > 0 and ms_vlist[0].strand == -1 and ms_vlist[0].pos > i.start + self.max_insert - nv = new_graph.new_vertex( - i.chrom, msv_nocover[msi].pos, msv_nocover[msi].strand - ) + nv = new_graph.new_vertex(i.chrom, msv_nocover[msi].pos, msv_nocover[msi].strand) ne = new_graph.new_edge(s, nv) koe[ne] = len( self.interval_crossing_arcs( @@ -3512,24 +2960,14 @@ def interval_filter_vertices( msi += 1 else: nv = new_graph.new_vertex(i.chrom, elist[0][0].v1.pos, -1) - oecount = len( - self.interval_crossing_arcs( - nv.chrom, nv.pos, nv.pos + self.max_insert, -1, ilist - ) - ) + oecount = len(self.interval_crossing_arcs(nv.chrom, nv.pos, nv.pos + self.max_insert, -1, ilist)) if oecount >= ( self.pair_support if not adaptive_counts - else self.pair_support_count( - nv.chrom, nv.pos, -1, meanshift=msrlist, sensitivems=False - ) + else self.pair_support_count(nv.chrom, nv.pos, -1, meanshift=msrlist, sensitivems=False) ): ne = new_graph.new_edge(s, nv) - koe[ne] = len( - self.interval_crossing_arcs( - nv.chrom, nv.pos, nv.pos + self.max_insert, -1, ilist - ) - ) + koe[ne] = len(self.interval_crossing_arcs(nv.chrom, nv.pos, nv.pos + self.max_insert, -1, ilist)) ei += 1 ngvlist.append(nv) vc = breakpoint_vertex(ngvlist[0].chrom, ngvlist[0].pos, ngvlist[0].strand) @@ -3547,10 +2985,7 @@ def interval_filter_vertices( elif elist[ei][0].v1.pos < msv_nocover[msi].pos: vc = elist[ei][0].v1 ei += 1 - elif ( - elist[ei][0].v1.pos == msv_nocover[msi].pos - and elist[ei][0].v1.strand < msv_nocover[msi].strand - ): + elif elist[ei][0].v1.pos == msv_nocover[msi].pos and elist[ei][0].v1.strand < msv_nocover[msi].strand: vc = elist[ei][0].v1 ei += 1 else: @@ -3558,23 +2993,14 @@ def interval_filter_vertices( vc_type = "meanshift" ms_addlist.append(msv_nocover[msi]) msi += 1 - if (vc.pos == vp.pos and vc.strand <= vp.strand) or ( - vc.pos == vp.pos + 1 and vc.strand < vp.strand - ): + if (vc.pos == vp.pos and vc.strand <= vp.strand) or (vc.pos == vp.pos + 1 and vc.strand < vp.strand): continue logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + "interval_filter vertices new: " - + str(vc) - + " " - + vc_type + "#TIME " + "%.3f\t" % (time() - TSTART) + "interval_filter vertices new: " + str(vc) + " " + vc_type ) if vc.strand == 1: if 
ngvlist[nei].strand == 1: - nvc_prime = new_graph.new_vertex( - ngvlist[nei].chrom, ngvlist[nei].pos + 1, -1 - ) + nvc_prime = new_graph.new_vertex(ngvlist[nei].chrom, ngvlist[nei].pos + 1, -1) # oecount = len(self.interval_crossing_arcs(nvc_prime.chrom, nvc_prime.pos, nvc_prime.pos + self.max_insert, -1, ilist)) # if oecount >= (self.pair_support if not adaptive_counts else self.pair_support_count(nvc_prime.chrom, nvc_prime.pos, -1, meanshift=zip(ilist, msrlist, cnlist), sensitivems=False)): # ne = new_graph.new_edge(s, nvc_prime) @@ -3619,12 +3045,9 @@ def interval_filter_vertices( + str(vc) ) if ngvlist[nei].strand == 1 and not ( - ngvlist[nei].chrom == vc.chrom - and ngvlist[nei].pos == vc.pos - 1 + ngvlist[nei].chrom == vc.chrom and ngvlist[nei].pos == vc.pos - 1 ): - nvc_prime = new_graph.new_vertex( - ngvlist[nei].chrom, ngvlist[nei].pos + 1, -1 - ) + nvc_prime = new_graph.new_vertex(ngvlist[nei].chrom, ngvlist[nei].pos + 1, -1) oecount = len( self.interval_crossing_arcs( nvc_prime.chrom, @@ -3671,9 +3094,7 @@ def interval_filter_vertices( ) else: oecount = len( - self.interval_crossing_arcs( - nv.chrom, nv.pos, nv.pos + self.max_insert, -1, ilist - ) + self.interval_crossing_arcs(nv.chrom, nv.pos, nv.pos + self.max_insert, -1, ilist) ) # if vc_type == 'meanshift' or oecount >= (self.pair_support if not adaptive_counts else self.pair_support_count(nv.chrom, nv.pos, -1, meanshift=zip(ilist, msrlist, cnlist), sensitivems=False)): if vc_type == "meanshift": @@ -3688,17 +3109,11 @@ def interval_filter_vertices( if ngvlist[nei].strand == -1: nv = new_graph.new_vertex(i.chrom, i.end, 1) ne = new_graph.new_edge(s, nv) - koe[ne] = len( - self.interval_crossing_arcs( - nv.chrom, nv.pos - self.max_insert, nv.pos, 1, ilist - ) - ) + koe[ne] = len(self.interval_crossing_arcs(nv.chrom, nv.pos - self.max_insert, nv.pos, 1, ilist)) ngvlist.append(nv) nei += 1 elif ngvlist[nei].strand == 1 and ngvlist[nei].pos < i.end: - nvc_prime = new_graph.new_vertex( - ngvlist[nei].chrom, ngvlist[nei].pos + 1, -1 - ) + nvc_prime = new_graph.new_vertex(ngvlist[nei].chrom, ngvlist[nei].pos + 1, -1) oecount = len( self.interval_crossing_arcs( nvc_prime.chrom, @@ -3731,11 +3146,7 @@ def interval_filter_vertices( nei += 1 nv = new_graph.new_vertex(i.chrom, i.end, 1) ne = new_graph.new_edge(s, nv) - koe[ne] = len( - self.interval_crossing_arcs( - nv.chrom, nv.pos - self.max_insert, nv.pos, 1, ilist - ) - ) + koe[ne] = len(self.interval_crossing_arcs(nv.chrom, nv.pos - self.max_insert, nv.pos, 1, ilist)) ngvlist.append(nv) nei += 1 ngvlist_full = ngvlist_full + ngvlist @@ -3751,17 +3162,10 @@ def interval_filter_vertices( ) # , self.pair_support_count(msa.chrom, msa.pos, msa.strand, ms, True) for e0 in elist_full: e = e0[0] - if ( - len(ilist.intersection([hg.interval(e.v2.chrom, e.v2.pos, e.v2.pos)])) - > 0 - and e.v1.pos >= e.v2.pos - ): + if len(ilist.intersection([hg.interval(e.v2.chrom, e.v2.pos, e.v2.pos)])) > 0 and e.v1.pos >= e.v2.pos: ne = new_graph.add_edge(e) logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + "interval_filter vertices: added edge e = " - + str(e) + "#TIME " + "%.3f\t" % (time() - TSTART) + "interval_filter vertices: added edge e = " + str(e) ) logging.debug( "#TIME " @@ -3797,45 +3201,29 @@ def interval_filter_vertices( + ",".join(map(str, new_graph.vs.values())) ) kbpe[ne] = e0[1] - elif ( - len(ilist.intersection([hg.interval(e.v2.chrom, e.v2.pos, e.v2.pos)])) - == 0 - ): - ne = new_graph.add_edge( - breakpoint_edge(breakpoint_vertex(s.chrom, s.pos, s.strand), e.v1) - ) + elif 
len(ilist.intersection([hg.interval(e.v2.chrom, e.v2.pos, e.v2.pos)])) == 0: + ne = new_graph.add_edge(breakpoint_edge(breakpoint_vertex(s.chrom, s.pos, s.strand), e.v1)) koe[ne] = e0[1] for nei in range(1, len(ngvlist_full)): if ngvlist_full[nei].strand == 1: - new_graph.new_edge( - ngvlist_full[nei - 1], ngvlist_full[nei], edge_type="sequence" - ) + new_graph.new_edge(ngvlist_full[nei - 1], ngvlist_full[nei], edge_type="sequence") # else: # new_graph.new_edge(ngvlist[nei-1], ngvlist[nei]) for e in koe: koe[e] = max(0.0001, koe[e]) # set up all constants - logging.info( - "#TIME " - + "%.3f\t" % (time() - self.tstart) - + " Optimizing graph copy number flow" - ) + logging.info("#TIME " + "%.3f\t" % (time() - self.tstart) + " Optimizing graph copy number flow") C = self.median_coverage()[0] / 2 print("C (haploid coverage) = ", C) G = new_graph seqlist = [e for e in new_graph.es.values() if e.edge_type == "sequence"] n = len(seqlist) l = [abs(e.v2.pos - e.v1.pos) + 1 for e in seqlist] - k = [ - len([a for a in self.fetch(e.v1.chrom, e.v1.pos, e.v2.pos)]) - for e in seqlist - ] + k = [len([a for a in self.fetch(e.v1.chrom, e.v1.pos, e.v2.pos)]) for e in seqlist] # kgcc = [self.interval_coverage(hg.interval(i.chrom, e.v1.pos, e.v2.pos), gcc=True) * (e.v2.pos - e.v1.pos) / self.read_length for e in seqlist] # k = kgcc kcc = [ - self.interval_coverage(hg.interval(e.v1.chrom, e.v1.pos, e.v2.pos)) - * (e.v2.pos - e.v1.pos) - for e in seqlist + self.interval_coverage(hg.interval(e.v1.chrom, e.v1.pos, e.v2.pos)) * (e.v2.pos - e.v1.pos) for e in seqlist ] ke = {} ke.update(kbpe) @@ -3848,11 +3236,7 @@ def interval_filter_vertices( for e in seqlist ] # edge read count kbpe defined above - bplist = [ - e - for e in new_graph.es.values() - if (e.edge_type == "discordant" or e.edge_type == "breakpoint") - ] + bplist = [e for e in new_graph.es.values() if (e.edge_type == "discordant" or e.edge_type == "breakpoint")] m = len(bplist) bpdict = {bplist[bpi]: bpi for bpi in range(len(bplist))} print( @@ -3915,9 +3299,7 @@ def interval_filter_vertices( opro = [mosek.scopr.log] * (n + m) oprjo = range(n + m) oprfo = [-1 * ki for ki in k] + [-1 * ke[e] for e in bplist] - oprgo = [C * li / self.read_length for li in l] + [ - (self.max_insert) * C / 2 / self.read_length for e in bplist - ] + oprgo = [C * li / self.read_length for li in l] + [(self.max_insert) * C / 2 / self.read_length for e in bplist] oprho = [0.0001] * (n + m) opcj = {cj: C * l[cj] / self.read_length for cj in range(len(l))} for e in bpdict: @@ -3965,27 +3347,14 @@ def streamprinter(msg): for msv_ilist in zip(all_msv, ilist): slist = hg.interval_list( [ - hg.interval( - "\t".join( - map( - str, [sq[0].v1.chrom, sq[0].v1.pos, sq[0].v2.pos, sq[1]] - ) - ) - ) + hg.interval("\t".join(map(str, [sq[0].v1.chrom, sq[0].v1.pos, sq[0].v2.pos, sq[1]]))) for sq in zip(seqlist, res) ] ) slist.sort() - msl = ( - [msv_ilist[1].start] - + [v.pos for v in msv_ilist[0]] - + [msv_ilist[1].end] - ) + msl = [msv_ilist[1].start] + [v.pos for v in msv_ilist[0]] + [msv_ilist[1].end] mslist = hg.interval_list( - [ - hg.interval(msv_ilist[1].chrom, msl[i], msl[i + 1]) - for i in range(len(msl) - 1) - ] + [hg.interval(msv_ilist[1].chrom, msl[i], msl[i + 1]) for i in range(len(msl) - 1)] ) for msi in mslist: if len(hg.interval_list([msi]).intersection(slist)) == 0: @@ -3996,21 +3365,11 @@ def streamprinter(msg): for s in seqlist: print(str(s)) exit() - elif ( - sum( - [ - ap[0].intersection(ap[1]).size() - for ap in hg.interval_list([msi]).intersection(slist) - ] - ) - == 
0 - ): + elif sum([ap[0].intersection(ap[1]).size() for ap in hg.interval_list([msi]).intersection(slist)]) == 0: print("MS0intersection", str(msi)) exit() - edge_code = defaultdict( - lambda: "discordant", {"concordant": "concordant", "source": "source"} - ) + edge_code = defaultdict(lambda: "discordant", {"concordant": "concordant", "source": "source"}) graph_logger.info( "SequenceEdge: StartPosition, EndPosition, PredictedCopyCount, AverageCoverage, Size, NumberReadsMapped" @@ -4058,57 +3417,19 @@ def streamprinter(msg): if len(ilist0) >= 10: lenlist = len(ilist0) all_msv_cat = reduce(lambda x, y: x + y, all_msv, []) - oncolist = ( - ",".join( - set([a[1].info["Name"] for a in ilist.intersection(hg.oncogene_list)]) - ) - + "," - ) - istr = ",".join( - [i.chrom + ":" + str(i.start) + "-" + str(i.end) for i in ilist] - ) - summary_logger.info( - "TotalIntervalSize = " + str(sum([a.size() for a in ilist])) - ) + oncolist = ",".join(set([a[1].info["Name"] for a in ilist.intersection(hg.oncogene_list)])) + "," + istr = ",".join([i.chrom + ":" + str(i.start) + "-" + str(i.end) for i in ilist]) + summary_logger.info("TotalIntervalSize = " + str(sum([a.size() for a in ilist]))) summary_logger.info( "AmplifiedIntervalSize = " - + str( - sum( - [ - seqlist[si].v2.pos - seqlist[si].v1.pos - for si in range(n) - if res[si] >= 2.5 - ] - ) - ) + + str(sum([seqlist[si].v2.pos - seqlist[si].v1.pos for si in range(n) if res[si] >= 2.5])) ) - if ( - len( - [ - seqlist[si].v2.pos - seqlist[si].v1.pos - for si in range(n) - if res[si] >= 2.5 - ] - ) - > 0 - ): + if len([seqlist[si].v2.pos - seqlist[si].v1.pos for si in range(n) if res[si] >= 2.5]) > 0: summary_logger.info( "AverageAmplifiedCopyCount = " + str( - sum( - [ - res[si] * (seqlist[si].v2.pos - seqlist[si].v1.pos) - for si in range(n) - if res[si] >= 2.5 - ] - ) - / sum( - [ - seqlist[si].v2.pos - seqlist[si].v1.pos + 1 - for si in range(n) - if res[si] >= 2.5 - ] - ) + sum([res[si] * (seqlist[si].v2.pos - seqlist[si].v1.pos) for si in range(n) if res[si] >= 2.5]) + / sum([seqlist[si].v2.pos - seqlist[si].v1.pos + 1 for si in range(n) if res[si] >= 2.5]) ) ) else: @@ -4117,38 +3438,17 @@ def streamprinter(msg): summary_logger.info("#SeqenceEdges = " + str(n)) summary_logger.info("#BreakpointEdges = " + str(len(kbpe))) summary_logger.info("#CoverageShifts = " + str(len(all_msv_cat))) - summary_logger.info( - "#MeanshiftSegmentsCopyCount>5 = " - + str(len([v for v in msv_diff.values() if v > 5])) - ) + summary_logger.info("#MeanshiftSegmentsCopyCount>5 = " + str(len([v for v in msv_diff.values() if v > 5]))) summary_logger.info( "#Foldbacks = " - + str( - len( - [ - msa - for msa in all_msv_cat - if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1 - ] - ) - ) + + str(len([msa for msa in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1])) ) summary_logger.info( - "#CoverageShiftsWithBreakpointEdges = " - + str(len([msa for msa in all_msv_cat if msa in ms_addlist])) + "#CoverageShiftsWithBreakpointEdges = " + str(len([msa for msa in all_msv_cat if msa in ms_addlist])) ) # Summary, #intervals, t talsize, size>2.5, AvgCoverage>2.5, #chromosomes, #sequenceedges, #breakpointedges, #meanshiftbreaks, #meanshift>5, #msfoldbackedges, #msfoldbackedges, #mswithoutbreakpoint, oncogenes, representativestring, #bpedgeswithcommonkmers - if ( - len( - [ - seqlist[si].v2.pos - seqlist[si].v1.pos - for si in range(n) - if res[si] >= 2.5 - ] - ) - > 0 - ): + if len([seqlist[si].v2.pos - seqlist[si].v1.pos for si in range(n) if res[si] >= 
2.5]) > 0: # print '\t'.join(map(str, ["Summary:", lenlist, sum([a.size() for a in ilist]), sum([seqlist[si].v2.pos - seqlist[si].v1.pos for si in range(n) if res[si] >= 2.5]), sum([res[si] * (seqlist[si].v2.pos - seqlist[si].v1.pos) for si in range(n) if res[si] >= 2.5]) / sum([seqlist[si].v2.pos - seqlist[si].v1.pos for si in range(n) if res[si] >= 2.5]), len(Set([i.chrom for i in ilist])), n, len(kbpe), len(all_msv_cat), len([v for v in msv_diff.values() if v > 5]), len([msa for msa in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1]), len([msa for msa in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1]), len([msa for msa in all_msv_cat if msa in ms_addlist]), oncolist, istr, len([e for e in kbpe if e.kmer_homology()])])) print( "\t".join( @@ -4158,49 +3458,22 @@ def streamprinter(msg): "Summary:", lenlist, sum([a.size() for a in ilist]), + sum([seqlist[si].v2.pos - seqlist[si].v1.pos + 1 for si in range(n) if res[si] >= 2.5]), sum( [ - seqlist[si].v2.pos - seqlist[si].v1.pos + 1 - for si in range(n) - if res[si] >= 2.5 - ] - ), - sum( - [ - res[si] - * (seqlist[si].v2.pos - seqlist[si].v1.pos + 1) + res[si] * (seqlist[si].v2.pos - seqlist[si].v1.pos + 1) for si in range(n) if res[si] >= 2.5 ] ) - / sum( - [ - seqlist[si].v2.pos - seqlist[si].v1.pos + 1 - for si in range(n) - if res[si] >= 2.5 - ] - ), + / sum([seqlist[si].v2.pos - seqlist[si].v1.pos + 1 for si in range(n) if res[si] >= 2.5]), len(set([i.chrom for i in ilist])), n, len(kbpe), len(all_msv_cat), len([v for v in msv_diff.values() if v > 5]), - len( - [ - msa - for msa in all_msv_cat - if self.foldup_count(msa.chrom, msa.pos, msa.strand) - >= 1 - ] - ), - len( - [ - msa - for msa in all_msv_cat - if self.foldup_count(msa.chrom, msa.pos, msa.strand) - >= 1 - ] - ), + len([msa for msa in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1]), + len([msa for msa in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1]), len([msa for msa in all_msv_cat if msa in ms_addlist]), oncolist, istr, @@ -4278,47 +3551,16 @@ def streamprinter(msg): "Summary:", lenlist, sum([a.size() for a in ilist]), - sum( - [ - seqlist[si].v2.pos - seqlist[si].v1.pos + 1 - for si in range(n) - if res[si] >= 2.5 - ] - ), - sum( - [ - res[si] - * (seqlist[si].v2.pos - seqlist[si].v1.pos + 1) - for si in range(n) - ] - ) - / sum( - [ - seqlist[si].v2.pos - seqlist[si].v1.pos + 1 - for si in range(n) - ] - ), + sum([seqlist[si].v2.pos - seqlist[si].v1.pos + 1 for si in range(n) if res[si] >= 2.5]), + sum([res[si] * (seqlist[si].v2.pos - seqlist[si].v1.pos + 1) for si in range(n)]) + / sum([seqlist[si].v2.pos - seqlist[si].v1.pos + 1 for si in range(n)]), len(set([i.chrom for i in ilist])), n, len(kbpe), len(all_msv_cat), len([v for v in msv_diff.values() if v > 5]), - len( - [ - msa - for msa in all_msv_cat - if self.foldup_count(msa.chrom, msa.pos, msa.strand) - >= 1 - ] - ), - len( - [ - msa - for msa in all_msv_cat - if self.foldup_count(msa.chrom, msa.pos, msa.strand) - >= 1 - ] - ), + len([msa for msa in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1]), + len([msa for msa in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1]), len([msa for msa in all_msv_cat if msa in ms_addlist]), oncolist, istr, @@ -4333,10 +3575,7 @@ def streamprinter(msg): interval_index = 1 for i in ilist: - cycle_logger.info( - "Interval\t" - + "\t".join([str(interval_index), i.chrom, str(i.start), str(i.end)]) - ) + cycle_logger.info("Interval\t" + 
"\t".join([str(interval_index), i.chrom, str(i.start), str(i.end)])) interval_index += 1 new_graph.cycle_decomposition(wehc, s) @@ -4361,9 +3600,7 @@ def plot_segmentation( figvsize = 5.21 fighsize = 24 fig = plt.figure(figsize=(fighsize, figvsize)) - plt.subplots_adjust( - left=73 / 1000.0, right=1 - 73 / 1000.0, bottom=1 / 4.0, top=1 - 1 / 10.0 - ) + plt.subplots_adjust(left=73 / 1000.0, right=1 - 73 / 1000.0, bottom=1 / 4.0, top=1 - 1 / 10.0) # dpi = 300 if font == "large": plt.subplots_adjust( @@ -4373,9 +3610,7 @@ def plot_segmentation( top=90 / 100.0, ) if font == "all_amplicons": - plt.subplots_adjust( - left=73 / 1000.0, right=1 - 73 / 1000.0, bottom=1 / 5.21, top=95 / 100.0 - ) + plt.subplots_adjust(left=73 / 1000.0, right=1 - 73 / 1000.0, bottom=1 / 5.21, top=95 / 100.0) dpi = 1000.0 / fighsize gs = gridspec.GridSpec(2, 1, height_ratios=[8, 2]) @@ -4411,10 +3646,7 @@ def plot_segmentation( scale_max_ms = 0 # msrlist = [self.get_meanshift(i) if i.size() > 50000 else self.meanshift_segmentation(i, window_size=300) for i in ilist] msrlist = [ - self.get_meanshift(i) - if i.size() > 50000 - else self.get_meanshift(i, window_size0=300) - for i in ilist + self.get_meanshift(i) if i.size() > 50000 else self.get_meanshift(i, window_size0=300) for i in ilist ] sensitive_elist = self.get_sensitive_discordant_edges( ilist, @@ -4431,28 +3663,17 @@ def plot_segmentation( de = [ e for e in eilist - if e[0].v1.pos != -1 - and hg.interval(e[0].v1.chrom, e[0].v1.pos, e[0].v1.pos).intersects(i) + if e[0].v1.pos != -1 and hg.interval(e[0].v1.chrom, e[0].v1.pos, e[0].v1.pos).intersects(i) ] # self.interval_discordant_edges(i) elist_dict[i] = de - elist_dict[i].sort( - key=lambda x: hg.absPos(x[0].v1.chrom, x[0].v1.pos) - + 0.1 * x[0].v1.strand - ) + elist_dict[i].sort(key=lambda x: hg.absPos(x[0].v1.chrom, x[0].v1.pos) + 0.1 * x[0].v1.strand) for e in eilist: eposlist = [] if e[0].v1.pos != -1: - eposlist.append( - hg.interval(e[0].v1.chrom, e[0].v1.pos, e[0].v1.pos) - ) + eposlist.append(hg.interval(e[0].v1.chrom, e[0].v1.pos, e[0].v1.pos)) if e[0].v2.pos != -1: - eposlist.append( - hg.interval(e[0].v2.chrom, e[0].v2.pos, e[0].v2.pos) - ) - if ( - len(scale_list) == 0 - or len(hg.interval_list(eposlist).intersection(scale_list)) > 0 - ): + eposlist.append(hg.interval(e[0].v2.chrom, e[0].v2.pos, e[0].v2.pos)) + if len(scale_list) == 0 or len(hg.interval_list(eposlist).intersection(scale_list)) > 0: max_edge = max(max_edge, e[1]) for i in ilist: @@ -4479,22 +3700,9 @@ def plot_segmentation( covl = [] for i, msr in zip(ilist, msrlist): for seg in msr: - avg_cov = np.average( - [ - c[1] - for c in cx0 - if c[0][0] == seg.chrom and seg.start <= c[0][1] <= seg.end - ] - ) - if ( - len(scale_list) == 0 - or len(hg.interval_list([i]).intersection(scale_list)) > 0 - ): - covl += [ - c[1] - for c in cx0 - if c[0][0] == seg.chrom and seg.start <= c[0][1] <= seg.end - ] + avg_cov = np.average([c[1] for c in cx0 if c[0][0] == seg.chrom and seg.start <= c[0][1] <= seg.end]) + if len(scale_list) == 0 or len(hg.interval_list([i]).intersection(scale_list)) > 0: + covl += [c[1] for c in cx0 if c[0][0] == seg.chrom and seg.start <= c[0][1] <= seg.end] scale_max_cov = max(scale_max_cov, avg_cov) if seg.info["cn"] != float("inf"): scale_max_ms = max(scale_max_ms, seg.info["cn"]) @@ -4511,12 +3719,7 @@ def plot_segmentation( color="k", ) - logging.debug( - "Max cov, max ms scales set to: " - + str(scale_max_cov) - + " " - + str(scale_max_ms) - ) + logging.debug("Max cov, max ms scales set to: " + str(scale_max_cov) + " 
" + str(scale_max_ms)) covl.sort() if len(covl) > 0: m95cov = covl[-(len(covl) // 20)] @@ -4546,10 +3749,7 @@ def plot_segmentation( for i in ilist: for el in elist_dict[i]: e = el[0] - if ( - ilist.xpos(e.v2.chrom, e.v2.pos) is None - and ilist.xpos(e.v1.chrom, e.v1.pos) is None - ): + if ilist.xpos(e.v2.chrom, e.v2.pos) is None and ilist.xpos(e.v1.chrom, e.v1.pos) is None: continue elif ilist.xpos(e.v2.chrom, e.v2.pos) is None: ax2.axvline( @@ -4586,22 +3786,12 @@ def plot_segmentation( color=ecolor[e.type()], ) else: - xmid = ( - ilist.xpos(e.v1.chrom, e.v1.pos) - + ilist.xpos(e.v2.chrom, e.v2.pos) - ) / 2 - xdia = abs( - ilist.xpos(e.v2.chrom, e.v2.pos) - - ilist.xpos(e.v1.chrom, e.v1.pos) - ) + xmid = (ilist.xpos(e.v1.chrom, e.v1.pos) + ilist.xpos(e.v2.chrom, e.v2.pos)) / 2 + xdia = abs(ilist.xpos(e.v2.chrom, e.v2.pos) - ilist.xpos(e.v1.chrom, e.v1.pos)) ydia = (1.0 + xdia) * 3 * ymax pseudo_edge = breakpoint_edge( - breakpoint_vertex( - e.v1.chrom, hg.absPos(e.v1.chrom, e.v1.pos), e.v1.strand - ), - breakpoint_vertex( - e.v1.chrom, hg.absPos(e.v2.chrom, e.v2.pos), e.v2.strand - ), + breakpoint_vertex(e.v1.chrom, hg.absPos(e.v1.chrom, e.v1.pos), e.v1.strand), + breakpoint_vertex(e.v1.chrom, hg.absPos(e.v2.chrom, e.v2.pos), e.v2.strand), ) ee = Arc( (xmid, 0), @@ -4674,10 +3864,7 @@ def plot_segmentation( linewidth=ogene_width, ) ax3.text( - ( - ilist.xpos(i.chrom, max(g[1].start, i.start)) - + ilist.xpos(i.chrom, min(g[1].end, i.end)) - ) + (ilist.xpos(i.chrom, max(g[1].start, i.start)) + ilist.xpos(i.chrom, min(g[1].end, i.end))) / 2.0, ty, g[1].info["Name"], @@ -4698,10 +3885,7 @@ def plot_segmentation( linewidth=ogene_width, ) ax3.text( - ( - ilist.xpos(i.chrom, max(g[1].start, i.start)) - + ilist.xpos(i.chrom, min(g[1].end, i.end)) - ) + (ilist.xpos(i.chrom, max(g[1].start, i.start)) + ilist.xpos(i.chrom, min(g[1].end, i.end))) / 2.0, -0.05 + 0.37 * gparity, g[1].info["Name"], @@ -4721,10 +3905,7 @@ def plot_segmentation( linewidth=ogene_width, ) ax3.text( - ( - ilist.xpos(i.chrom, max(g[1].start, i.start)) - + ilist.xpos(i.chrom, min(g[1].end, i.end)) - ) + (ilist.xpos(i.chrom, max(g[1].start, i.start)) + ilist.xpos(i.chrom, min(g[1].end, i.end))) / 2.0, ty, g[1].info["Name"], @@ -4739,8 +3920,7 @@ def plot_segmentation( ax3.add_patch( Rectangle( [ilist.xpos(i.chrom, max(ss.start, i.start)), 0.65], - ilist.xpos(i.chrom, min(ss.end, i.end)) - - ilist.xpos(i.chrom, max(ss.start, i.start)), + ilist.xpos(i.chrom, min(ss.end, i.end)) - ilist.xpos(i.chrom, max(ss.start, i.start)), 0.25, fc=chrcolor[s.info[1]], ec="k", @@ -4748,11 +3928,7 @@ def plot_segmentation( ) if font == "large": ax3.text( - ( - ilist.xpos(i.chrom, max(ss.start, i.start)) - + ilist.xpos(i.chrom, min(ss.end, i.end)) - ) - / 2.0, + (ilist.xpos(i.chrom, max(ss.start, i.start)) + ilist.xpos(i.chrom, min(ss.end, i.end))) / 2.0, 0, s.info[0], horizontalalignment="center", @@ -4761,11 +3937,7 @@ def plot_segmentation( ) elif font == "large" or font == "all_amplicons": ax3.text( - ( - ilist.xpos(i.chrom, max(ss.start, i.start)) - + ilist.xpos(i.chrom, min(ss.end, i.end)) - ) - / 2.0, + (ilist.xpos(i.chrom, max(ss.start, i.start)) + ilist.xpos(i.chrom, min(ss.end, i.end))) / 2.0, 0, s.info[0], horizontalalignment="center", @@ -4774,11 +3946,7 @@ def plot_segmentation( ) else: ax3.text( - ( - ilist.xpos(i.chrom, max(ss.start, i.start)) - + ilist.xpos(i.chrom, min(ss.end, i.end)) - ) - / 2.0, + (ilist.xpos(i.chrom, max(ss.start, i.start)) + ilist.xpos(i.chrom, min(ss.end, i.end))) / 2.0, 0.2 + int(s[0]) % 2 * 0.15, 
s.info[0], horizontalalignment="center", @@ -4826,12 +3994,8 @@ def plot_segmentation( elif imax - imin > 0.02: segname = i.chrom.strip("chr") interval_poslist.append((segname, (imax + imin) / 2)) - ax3.xaxis.set_major_locator( - ticker.FixedLocator([c[1] for c in interval_poslist]) - ) - ax3.xaxis.set_major_formatter( - ticker.FixedFormatter([c[0] for c in interval_poslist]) - ) + ax3.xaxis.set_major_locator(ticker.FixedLocator([c[1] for c in interval_poslist])) + ax3.xaxis.set_major_formatter(ticker.FixedFormatter([c[0] for c in interval_poslist])) else: chrmin = {} chrmax = {} @@ -4851,18 +4015,14 @@ def plot_segmentation( ) ) ax3.xaxis.set_major_locator(ticker.FixedLocator([c[1] for c in chrposlist])) - ax3.xaxis.set_major_formatter( - ticker.FixedFormatter([c[0] for c in chrposlist]) - ) + ax3.xaxis.set_major_formatter(ticker.FixedFormatter([c[0] for c in chrposlist])) xposlist = [] if font != "all_amplicons": for i in ilist: xposlist.append((str(i.start), ilist.xpos(i.chrom, i.start))) xposlist.append((str(i.end), ilist.xpos(i.chrom, i.end))) ax3.xaxis.set_minor_locator(ticker.FixedLocator([c[1] for c in xposlist])) - ax3.xaxis.set_minor_formatter( - ticker.FixedFormatter([c[0] for c in xposlist]) - ) + ax3.xaxis.set_minor_formatter(ticker.FixedFormatter([c[0] for c in xposlist])) plt.setp(ax3.xaxis.get_minorticklabels(), rotation=90) ax3.tick_params(axis="x", which="minor", pad=15) # ax3.tick_params(axis='x', which='minor', pad=-5) @@ -4877,10 +4037,6 @@ def plot_segmentation( fig.savefig(amplicon_name + ".png", dpi=dpi) fig.savefig(amplicon_name + ".pdf", dpi=dpi) except np.linalg.linalg.LinAlgError: - logging.error( - "Numpy LinAlgError when forming amplicon plot! Cannot save " - + amplicon_name - + " image\n" - ) + logging.error("Numpy LinAlgError when forming amplicon plot! Cannot save " + amplicon_name + " image\n") plt.close() diff --git a/bin/breakpoint_graph.py b/bin/breakpoint_graph.py index 684b54a5..4fe16dbc 100755 --- a/bin/breakpoint_graph.py +++ b/bin/breakpoint_graph.py @@ -13,8 +13,8 @@ # This software program and documentation are copyrighted by The Regents of the University of California. The software program and documentation are supplied "as is", without any accompanying services from The Regents. The Regents does not warrant that the operation of the program will be uninterrupted or error-free. The end-user understands that the program was developed for research purposes and is advised not to rely exclusively on the program for any reason. # # IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
-#Author: Viraj Deshpande -#Contact: virajbdeshpande@gmail.com +# Author: Viraj Deshpande +# Contact: virajbdeshpande@gmail.com import sys @@ -25,7 +25,8 @@ from abstract_graph import * import ref_util as hg -cycle_logger = logging.getLogger('cycle') +cycle_logger = logging.getLogger("cycle") + class breakpoint_vertex(abstract_vertex): """Class representing breakpoint vertex derived from abstract_graph.abstract_vertex @@ -36,17 +37,18 @@ class breakpoint_vertex(abstract_vertex): strand = 1/-1 for forward/reverse strand vid = (optional)id of vertex graph = (optional) graph to which vertex belongs""" - def __init__(self, chrom='', pos=-2, strand=1, vid=0, graph=None): + + def __init__(self, chrom="", pos=-2, strand=1, vid=0, graph=None): """2 ways to initialize: - 1) chrom: breakpoint_vertex string in the format chrom:pos("+"/"-"") - 2) chrom, pos, strand: name(STR), pos (INT), strand("+"/"-"")""" + 1) chrom: breakpoint_vertex string in the format chrom:pos("+"/"-"") + 2) chrom, pos, strand: name(STR), pos (INT), strand("+"/"-"")""" if pos == -2: vstring = chrom - chrom = vstring[:vstring.find(':')] - pos = int(vstring[vstring.find(':') + 1:-1]) - strand = 1 if vstring[-1] == '+' else -1 + chrom = vstring[: vstring.find(":")] + pos = int(vstring[vstring.find(":") + 1 : -1]) + strand = 1 if vstring[-1] == "+" else -1 if graph is not None and graph.has_vertex(chrom, pos, strand): - raise Exception('Duplicate vertex added') + raise Exception("Duplicate vertex added") abstract_vertex.__init__(self, vid, graph) self.chrom = chrom self.pos = pos @@ -55,9 +57,9 @@ def __init__(self, chrom='', pos=-2, strand=1, vid=0, graph=None): def __repr__(self): """String format chrom:pos(+/-)""" if self.strand == 1: - return self.chrom + ':' + str(self.pos) + '+' + return self.chrom + ":" + str(self.pos) + "+" else: - return self.chrom + ':' + str(self.pos) + '-' + return self.chrom + ":" + str(self.pos) + "-" def __hash__(self): return str(self).__hash__() @@ -76,7 +78,18 @@ class breakpoint_edge(abstract_edge): edge_type = "discordant"/"breakpoint" or "concordant" : genomic connectivity or source; "sequence": genomic interval eid = (optional) edge id graph = (optional) graph to which edge belongs""" - def __init__(self, v1, v2=None, eid=0, graph=None, update_vertices=True, edge_type="discordant", hom=None, hom_seq=None): + + def __init__( + self, + v1, + v2=None, + eid=0, + graph=None, + update_vertices=True, + edge_type="discordant", + hom=None, + hom_seq=None, + ): """2 ways to initialize: 1) v1 = breakpoint_edge string in the format breakpoint_vertex1->breakpoint_vertex2 2) v1,v2 = breakpoint_point_vertices @@ -87,11 +100,12 @@ def __init__(self, v1, v2=None, eid=0, graph=None, update_vertices=True, edge_ty edge_type: " = "discordant"/"breakpoint" or "concordant" : genomic connectivity or source; "sequence": genomic interval Required: If edge_type = "sequence": v1.chrom = v2.chrom, v1.pos > v2.pos else if equal v1.strand > v2.strand - If edge_type = "concordant": v1.chrom = v2.chrom, |v1.pos - v2.pos| = 1 and the smaller has strand = 1 else -1""" + If edge_type = "concordant": v1.chrom = v2.chrom, |v1.pos - v2.pos| = 1 and the smaller has strand = 1 else -1 + """ if type(v1) == str: estr = v1 - v1 = breakpoint_vertex(estr.split('>')[0][:-1]) - v2 = breakpoint_vertex(estr.split('>')[1]) + v1 = breakpoint_vertex(estr.split(">")[0][:-1]) + v2 = breakpoint_vertex(estr.split(">")[1]) abstract_edge.__init__(self, v1, v2, eid, graph, update_vertices) if edge_type in ["concordant", "sequence"]: if v1.chrom != 
v2.chrom: @@ -100,12 +114,13 @@ def __init__(self, v1, v2=None, eid=0, graph=None, update_vertices=True, edge_ty if v1.strand == v2.strand: raise Exception("Edge of type " + edge_type + " connects same strand.") if edge_type == "concordant": - if ((v1.strand == 1 and v1.pos + 1 != v2.pos) or - (v2.strand == 1 and v2.pos + 1 != v1.pos)): + if (v1.strand == 1 and v1.pos + 1 != v2.pos) or (v2.strand == 1 and v2.pos + 1 != v1.pos): raise Exception("Edge of type " + edge_type + " connects non-adjacent positions.") if edge_type == "sequence": if v1.strand == -1 and v1.pos > v2.pos: - raise Exception("Start position for sequence edge greater than end position:" + str(v1) + '->' + str(v2)) + raise Exception( + "Start position for sequence edge greater than end position:" + str(v1) + "->" + str(v2) + ) if v1.strand == 1 and v2.pos > v1.pos: raise Exception("Start position for sequence edge greater than end position") self.edge_type = edge_type @@ -113,13 +128,17 @@ def __init__(self, v1, v2=None, eid=0, graph=None, update_vertices=True, edge_ty self.hom_seq = hom_seq def sequence(self, flank_size=-1): - if self.edge_type == 'sequence': + if self.edge_type == "sequence": seq = hg.interval(self.v1.chrom, self.v1.pos, self.v2.pos).sequence() if flank_size > 0: - seq = hg.interval(self.v1.chrom, self.v1.pos - flank_size + 1, self.v1.pos).sequence() + seq + hg.interval(self.v2.chrom, self.v2.pos, self.v2.pos + flank_size - 1).sequence() + seq = ( + hg.interval(self.v1.chrom, self.v1.pos - flank_size + 1, self.v1.pos).sequence() + + seq + + hg.interval(self.v2.chrom, self.v2.pos, self.v2.pos + flank_size - 1).sequence() + ) else: if self.hom == None: - seq = 'N' * 20 + seq = "N" * 20 else: seq = self.hom_seq if flank_size == -1: @@ -129,39 +148,93 @@ def sequence(self, flank_size=-1): hom = self.hom else: hom = 0 - if self.edge_type == 'source': + if self.edge_type == "source": if self.v2.strand == -1: - right_seq = hg.interval(self.v2.chrom, self.v2.pos + hom, self.v2.pos + hom + flank_size - 1).sequence() - left_seq = '' + right_seq = hg.interval( + self.v2.chrom, + self.v2.pos + hom, + self.v2.pos + hom + flank_size - 1, + ).sequence() + left_seq = "" else: - left_seq = hg.interval(self.v2.chrom, self.v2.pos - hom - flank_size + 1, self.v2.pos - hom).sequence() - right_seq = '' + left_seq = hg.interval( + self.v2.chrom, + self.v2.pos - hom - flank_size + 1, + self.v2.pos - hom, + ).sequence() + right_seq = "" elif self.v1.strand == 1: - left_seq = hg.interval(self.v1.chrom, self.v1.pos - hom - flank_size + 1, self.v1.pos - hom).sequence() + left_seq = hg.interval( + self.v1.chrom, + self.v1.pos - hom - flank_size + 1, + self.v1.pos - hom, + ).sequence() if self.v2.strand == -1: - right_seq = hg.interval(self.v2.chrom, self.v2.pos + hom, self.v2.pos + hom + flank_size - 1).sequence() + right_seq = hg.interval( + self.v2.chrom, + self.v2.pos + hom, + self.v2.pos + hom + flank_size - 1, + ).sequence() else: - right_seq = hg.interval(self.v2.chrom, self.v2.pos - hom - flank_size + 1, self.v2.pos - hom, strand=-1).sequence() + right_seq = hg.interval( + self.v2.chrom, + self.v2.pos - hom - flank_size + 1, + self.v2.pos - hom, + strand=-1, + ).sequence() else: - right_seq = hg.interval(self.v1.chrom, self.v1.pos + hom, self.v1.pos + hom + flank_size - 1).sequence() + right_seq = hg.interval( + self.v1.chrom, + self.v1.pos + hom, + self.v1.pos + hom + flank_size - 1, + ).sequence() if self.v2.strand == -1: - left_seq = hg.interval(self.v2.chrom, self.v2.pos + hom, self.v2.pos + hom + flank_size - 1, 
strand=-1).sequence() + left_seq = hg.interval( + self.v2.chrom, + self.v2.pos + hom, + self.v2.pos + hom + flank_size - 1, + strand=-1, + ).sequence() else: - left_seq = hg.interval(self.v2.chrom, self.v2.pos - hom - flank_size + 1, self.v2.pos - hom).sequence() + left_seq = hg.interval( + self.v2.chrom, + self.v2.pos - hom - flank_size + 1, + self.v2.pos - hom, + ).sequence() seq = left_seq + seq + right_seq return seq def kmer_homology(self, k=10, span=100): """Number of shared k-mers within "span" distance on either side of vertex positions""" - seq1 = ''.join([a.capitalize() for a in hg.interval(self.v1.chrom, max(1,self.v1.pos - span), min(self.v1.pos + span, hg.chrLen[hg.chrNum(self.v1.chrom)]), self.v1.strand).sequence()]) - seq2 = ''.join([a.capitalize() for a in hg.interval(self.v2.chrom, max(1,self.v2.pos - span), min(self.v2.pos + span, hg.chrLen[hg.chrNum(self.v2.chrom)]), -1 * self.v2.strand).sequence()]) - kset1 = set([seq1[i:i+10] for i in range(len(seq1) - k + 1)]) - kset2 = set([seq2[i:i+10] for i in range(len(seq2) - k + 1)]) + seq1 = "".join( + [ + a.capitalize() + for a in hg.interval( + self.v1.chrom, + max(1, self.v1.pos - span), + min(self.v1.pos + span, hg.chrLen[hg.chrNum(self.v1.chrom)]), + self.v1.strand, + ).sequence() + ] + ) + seq2 = "".join( + [ + a.capitalize() + for a in hg.interval( + self.v2.chrom, + max(1, self.v2.pos - span), + min(self.v2.pos + span, hg.chrLen[hg.chrNum(self.v2.chrom)]), + -1 * self.v2.strand, + ).sequence() + ] + ) + kset1 = set([seq1[i : i + 10] for i in range(len(seq1) - k + 1)]) + kset2 = set([seq2[i : i + 10] for i in range(len(seq2) - k + 1)]) return len(kset1.intersection(kset2)) def type(self, min_insert=0, max_insert=500): """Determine type of "breakpoint"/"discordant edge - Output values: + Output values: "source": Contains v.pos = -1, indicates end of linear contig. "interchromosomal": Different chromosomes. "everted": Forward strand of larger position connected to reverse strand of reverse, indicated by outward orientation of read-pairs, may suggest tandem duplication. @@ -169,7 +242,7 @@ def type(self, min_insert=0, max_insert=500): "reverse": Both vertex/paired-reads map to reverse strand "discordant": Alignment distance larger/smaller than max/min insert, may indicate deletion "concordant": Expected alignment length between min and max insert. 
NOTE: Different from edge_type - """ + """ if self.v1.pos == -1 or self.v2.pos == -1: return "source" elif self.v1.chrom != self.v2.chrom: @@ -191,19 +264,20 @@ def type(self, min_insert=0, max_insert=500): if vmax.pos - vmin.pos > max_insert or vmax.pos - vmin.pos < min_insert: return "discordant" return "concordant" - + def __repr__(self): """breakpoint_vertex1->breakpoint_vertex2""" - return str(self.v1) + '->' + str(self.v2) + return str(self.v1) + "->" + str(self.v2) def __lt__(self, other): - return min((self.v1.chrom, self.v1.pos), (self.v2.chrom, self.v2.pos)) < min((other.v1.chrom, self.v1.pos), - (other.v2.chrom, self.v2.pos)) + return min((self.v1.chrom, self.v1.pos), (self.v2.chrom, self.v2.pos)) < min( + (other.v1.chrom, self.v1.pos), (other.v2.chrom, self.v2.pos) + ) class breakpoint_graph(abstract_graph): - """Class representing breakpoint edge derived from abstract_graph.abstract_graph - """ + """Class representing breakpoint edge derived from abstract_graph.abstract_graph""" + def __init__(self, graphfile=None): """Creates an empty graph if no graphfile provided Loads graph from graph file in format defined in load_graphfile""" @@ -228,7 +302,7 @@ def new_vertex(self, chrom, pos, strand): self.vhash[v.__hash__()] = v return v - def new_edge(self, v1, v2, edge_type='discordant', hom=None, hom_seq=None): + def new_edge(self, v1, v2, edge_type="discordant", hom=None, hom_seq=None): """Create, add and return breakpoint_edge to current graph. Recommend using "add_edge()". "new_edge()" may incorrectly add duplicate edges Arguments: v1,v2: breakpoint_vertex (These need to be vertices(objects) from current breakpoint graph) @@ -239,7 +313,7 @@ def add_vertex(self, v): """Create and add new vertex to graph if no similar vertex exists""" return self.new_vertex(v.chrom, v.pos, v.strand) - def add_edge(self, e, edge_type='discordant'): + def add_edge(self, e, edge_type="discordant"): """Add and return edge similar e to the graph. If e(object) already belongs to graph, return e. Checks if corresponding vertices already present else return None. If edge_type not defined, then inherits e.edge_type. 
@@ -261,20 +335,20 @@ def load_graphfile(self, graphfile): graphfile_handle = open(graphfile) ll = [l.strip().split() for l in graphfile_handle] graphfile_handle.close() - self.copy_count=defaultdict(lambda:0, {}) + self.copy_count = defaultdict(lambda: 0, {}) for l in ll: if len(l) == 0: continue - if l[0] == 'sequence': + if l[0] == "sequence": v1 = self.add_vertex(breakpoint_vertex(l[1])) v2 = self.add_vertex(breakpoint_vertex(l[2])) - e = self.new_edge(v1, v2, edge_type='sequence') + e = self.new_edge(v1, v2, edge_type="sequence") self.copy_count[e] = float(l[3]) - if l[0] == 'concordant': + if l[0] == "concordant": e = self.add_edge(breakpoint_edge(l[1], edge_type=l[0])) self.copy_count[e] = float(l[2]) - if l[0] == 'source' or l[0] == 'discordant' or l[0] == 'breakpoint': - e = self.add_edge(breakpoint_edge(l[1], edge_type='discordant')) + if l[0] == "source" or l[0] == "discordant" or l[0] == "breakpoint": + e = self.add_edge(breakpoint_edge(l[1], edge_type="discordant")) self.copy_count[e] = float(l[2]) return @@ -310,7 +384,7 @@ def djikstra_distance(self, v1, v2, min_count=0): else: e_new = e.v1.elist v = e.v1 - e_new = [e_next for e_next in e_new if e_next.edge_type != 'sequence'] + e_new = [e_next for e_next in e_new if e_next.edge_type != "sequence"] e_search = [] for en in e_new: min_c = min(cc, self.copy_count[en]) @@ -328,7 +402,9 @@ def djikstra_distance(self, v1, v2, min_count=0): min_c = min(min_c, self.copy_count[en] / 2.0) if min_c < min_count: continue - en_seq, en_seqstrand = [(es, 1 if v_seq == es.v1 else -1) for es in v_seq.elist if es.edge_type == 'sequence'][0] + en_seq, en_seqstrand = [ + (es, 1 if v_seq == es.v1 else -1) for es in v_seq.elist if es.edge_type == "sequence" + ][0] min_c = min(min_c, self.copy_count[en_seq]) if min_c < min_count: continue @@ -344,17 +420,24 @@ def djikstra_distance(self, v1, v2, min_count=0): else: dd = d + e2.v2.pos - v2.pos return (dd, path + [(en, en_strand), (en_seq, en_seqstrand)], min_c) - heapq.heappush(a, (d + en_seq.v2.pos - en_seq.v1.pos + 1, path + [(en, en_strand), (en_seq, en_seqstrand)], min_c)) + heapq.heappush( + a, + ( + d + en_seq.v2.pos - en_seq.v1.pos + 1, + path + [(en, en_strand), (en_seq, en_seqstrand)], + min_c, + ), + ) return None def cycle_decomposition(self, w, s): - """ Decompose breakpoint_graph into 'simple' cycles. Simple cycles may contain a sequence edge atmost once along each strand. Reports maximum parsimonious cycles starting from thickest cycle until 80% of genomic content is covered. 
w is dict containing weights (counts) of edges s is source vertex, this vertex has the exception of not having a sequence edge attached""" + def thickest_cycle(hce, wehc): # print hce, wehc v1 = hce[1].v1 @@ -371,10 +454,10 @@ def thickest_cycle(hce, wehc): completed = True break for e in v1.elist: - if e.edge_type == 'sequence': + if e.edge_type == "sequence": continue else: - v2 = e.neighbor(v1) + v2 = e.neighbor(v1) if v2 == s: v3 = v2 if e in hdict[v1][3]: @@ -388,7 +471,7 @@ def thickest_cycle(hce, wehc): seenEdges.add(e) else: for e2 in v2.elist: - if e2.edge_type == 'sequence': + if e2.edge_type == "sequence": se = e2 v3 = e2.neighbor(v2) break @@ -417,7 +500,7 @@ def thickest_cycle(hce, wehc): s2Set = set() tc = hdict[hce[1].v1][1] v2 = hdict[hce[1].v1][2] - while v2 != hce[1].v1: #and not v2 in s2Set: + while v2 != hce[1].v1: # and not v2 in s2Set: # print hce[1].v1, v2, s2Set s2Set.add(v2) if v2 not in hdict: @@ -430,7 +513,7 @@ def thickest_cycle(hce, wehc): # print v2, tc return tc, hdict[hce[1].v1][0] - total_amplicon_content = sum([(e.v2.pos - e.v1.pos) * w[e] for e in w if e.edge_type == 'sequence']) + total_amplicon_content = sum([(e.v2.pos - e.v1.pos) * w[e] for e in w if e.edge_type == "sequence"]) amplicon_content_covered = 0 w2 = w.copy() cycle_number = 1 @@ -441,7 +524,7 @@ def thickest_cycle(hce, wehc): wer = we[::-1] we = wer wei = 0 - tcwmax = -1 + tcwmax = -1 tcmax = None tchwmax = -1 tchmax = None @@ -451,7 +534,7 @@ def thickest_cycle(hce, wehc): # for e in w2: # print "EEEEEEEEEEEE", str(e), e.edge_type, w2[e] # print "EEEEEEEEE========================" - while wei < len(we):# and (tcwmax == -1 or we[wei][0] >= tcwmax / 2.0): + while wei < len(we): # and (tcwmax == -1 or we[wei][0] >= tcwmax / 2.0): # if we[wei][1].edge_type == 'sequence': # wei += 1 # continue @@ -480,9 +563,12 @@ def thickest_cycle(hce, wehc): if -1 in [e.v1.pos for e in tc] + [e.v2.pos for e in tc]: csource = 0 for ci in range(len(tc) - 1): - if -1 in [tc[ci].v1.pos, tc[ci].v2.pos] and -1 in [tc[ci +1].v1.pos, tc[ci + 1].v2.pos]: + if -1 in [tc[ci].v1.pos, tc[ci].v2.pos] and -1 in [ + tc[ci + 1].v1.pos, + tc[ci + 1].v2.pos, + ]: csource = ci + 1 - tc = tc[ci + 1:] + tc[0:ci + 1] + tc = tc[ci + 1 :] + tc[0 : ci + 1] break if tc[0].v1 == tc[1].v1 or tc[0].v1 == tc[1].v2: v2 = tc[0].v1 @@ -495,7 +581,7 @@ def thickest_cycle(hce, wehc): v2 = tc[ci].v2 else: v2 = tc[ci].v1 - if tc[ci].edge_type == 'sequence': + if tc[ci].edge_type == "sequence": if v1.pos > v2.pos: tc = tc[::-1] break @@ -512,20 +598,20 @@ def thickest_cycle(hce, wehc): v2 = tc[ci].v2 else: v2 = tc[ci].v1 - if tc[ci].edge_type == 'sequence': + if tc[ci].edge_type == "sequence": if v1.pos > v2.pos: tc = tc[ci::-1] + tc[:ci:-1] break v1 = v2 ci = 0 - while tc[ci].type() == 'concordant' or tc[ci-1].type() == 'concordant': + while tc[ci].type() == "concordant" or tc[ci - 1].type() == "concordant": ci -= 1 - tc = tc[ci:] + tc[: ci] - + tc = tc[ci:] + tc[:ci] + if tcw == 0: print("tcw is 0") break - print("Cycle ", cycle_number, ": Copy count = ",tcw, tc) + print("Cycle ", cycle_number, ": Copy count = ", tcw, tc) cycle_edge_list = [] ci = 1 v0 = None @@ -536,48 +622,52 @@ def thickest_cycle(hce, wehc): else: v2 = tc[0].v2 v1 = tc[0].v1 - if tc[0].edge_type == 'sequence': + if tc[0].edge_type == "sequence": v0 = v1 v0c = v2 elif v1.pos == -1 or v2.pos == -1: print(v1, "->", v2) - cycle_edge_list.append((v1,v2)) + cycle_edge_list.append((v1, v2)) v1 = v2 while ci < len(tc): - if (tc[ci].v1.chrom, tc[ci].v1.pos, tc[ci].v1.strand) == 
(v1.chrom, v1.pos, v1.strand): + if (tc[ci].v1.chrom, tc[ci].v1.pos, tc[ci].v1.strand) == ( + v1.chrom, + v1.pos, + v1.strand, + ): v2 = tc[ci].v2 else: v2 = tc[ci].v1 if v1.pos == -1 or v2.pos == -1: if v0 is not None: print(v0, "->", v0c) - cycle_edge_list.append((v0,v0c)) + cycle_edge_list.append((v0, v0c)) print(v1, "->", v2) - cycle_edge_list.append((v1,v2)) + cycle_edge_list.append((v1, v2)) v0 = None v0c = None - elif tc[ci].edge_type == 'sequence': + elif tc[ci].edge_type == "sequence": if v0 is None: v0 = v1 v0c = v2 else: v0c = v2 - elif tc[ci].type() != 'concordant': + elif tc[ci].type() != "concordant": if v0 is not None: print(v0, "->", v0c) - cycle_edge_list.append((v0,v0c)) + cycle_edge_list.append((v0, v0c)) v0 = None v0c = None v1 = v2 ci += 1 if v0 is not None: print(v0, "->", v0c) - cycle_edge_list.append((v0,v0c)) + cycle_edge_list.append((v0, v0c)) if amplicon_content_covered <= 0.9 * total_amplicon_content or (tcw > 0.2 * cycle_list[0][1]): cycle_list.append([cycle_number, tcw, tc, cycle_edge_list]) acc = tcw * sum([abs(e[1].pos - e[0].pos) for e in cycle_edge_list if -1 not in [e[0].pos, e[1].pos]]) amplicon_content_covered += acc - cycle_number += 1 + cycle_number += 1 # print tcw, tc for e in tc: w2[e] = w2[e] - tcw @@ -589,63 +679,82 @@ def thickest_cycle(hce, wehc): segment_list = [] for c in cycle_list: max_segment = c[3][0] - max_orientation = '+' + max_orientation = "+" max_segi = 0 segi = 0 for e in c[3]: - if (-1 in (max_segment[0].pos, max_segment[1].pos) and -1 not in (e[0].pos, e[1].pos)) or (abs(e[0].pos-e[1].pos) >= abs(max_segment[0].pos - max_segment[1].pos)): + if (-1 in (max_segment[0].pos, max_segment[1].pos) and -1 not in (e[0].pos, e[1].pos)) or ( + abs(e[0].pos - e[1].pos) >= abs(max_segment[0].pos - max_segment[1].pos) + ): max_segment = e max_segi = segi - if e[0].pos + 0.4*e[0].strand <= e[1].pos + 0.4*e[1].strand: - max_orientation = '+' + if e[0].pos + 0.4 * e[0].strand <= e[1].pos + 0.4 * e[1].strand: + max_orientation = "+" else: - max_orientation = '-' - if e[0].pos + 0.4*e[0].strand <= e[1].pos + 0.4*e[1].strand: + max_orientation = "-" + if e[0].pos + 0.4 * e[0].strand <= e[1].pos + 0.4 * e[1].strand: if e not in segment_list: segment_list.append(e) else: if (e[1], e[0]) not in segment_list: segment_list.append((e[1], e[0])) segi += 1 - if max_orientation == '+': - c[3] = c[3][max_segi: ] + c[3][:max_segi] + if max_orientation == "+": + c[3] = c[3][max_segi:] + c[3][:max_segi] else: - c[3] = [(e[1], e[0]) for e in c[3][:max_segi + 1][::-1]+c[3][max_segi + 1:][::-1]] + c[3] = [(e[1], e[0]) for e in c[3][: max_segi + 1][::-1] + c[3][max_segi + 1 :][::-1]] segment_list.sort() segi = 1 segment_index = {} - for s in [ss for ss in segment_list if ss[0].pos != -1 and ss[1].pos != -1]: + for s in [ss for ss in segment_list if ss[0].pos != -1 and ss[1].pos != -1]: segment_index[s] = segi segi += 1 - cycle_logger.info('List of cycle segments') + cycle_logger.info("List of cycle segments") for s in [ss for ss in segment_list if ss[0].pos == -1 or ss[1].pos == -1]: segment_index[s] = 0 for s in [ss for ss in segment_list if ss[0].pos != -1 and ss[1].pos != -1]: - cycle_logger.info('Segment\t' + '\t'.join([str(segment_index[s]), s[0].chrom, str(s[0].pos), str(s[1].pos)])) + cycle_logger.info( + "Segment\t" + "\t".join([str(segment_index[s]), s[0].chrom, str(s[0].pos), str(s[1].pos)]) + ) for c in cycle_list: seglist = [] orientation_list = [] for e in c[3]: if e in segment_index: seglist.append(segment_index[e]) - orientation_list.append('+') + 
orientation_list.append("+") else: seglist.append(segment_index[(e[1], e[0])]) - orientation_list.append('-') - cycle_logger.info("Cycle=" + str(c[0]) + ";Copy_count=" + str(c[1]) + ";Segments=" + ','.join([str(e[0])+str(e[1]) for e in zip(seglist, orientation_list)])) + orientation_list.append("-") + cycle_logger.info( + "Cycle=" + + str(c[0]) + + ";Copy_count=" + + str(c[1]) + + ";Segments=" + + ",".join([str(e[0]) + str(e[1]) for e in zip(seglist, orientation_list)]) + ) return None def __repr__(self): - return '/n'.join(map(str, self.vs.values() + self.es.values())) + '\n' + return "/n".join(map(str, self.vs.values() + self.es.values())) + "\n" class graph_decomposition(object): """Class represents decomposition of a breakpoint_graph with balanced edge counts into cycles/walks Provides methods to merge and modify cycles into larger walks to represent architecture of complex rearrangements. """ - def __init__(self, segment_list=None, cycle_list=None, ilist=None, file=None, file_content=None): + + def __init__( + self, + segment_list=None, + cycle_list=None, + ilist=None, + file=None, + file_content=None, + ): if file is not None or file_content is not None: self.segment_list = hg.interval_list([]) self.segment_dict = {} @@ -653,30 +762,32 @@ def __init__(self, segment_list=None, cycle_list=None, ilist=None, file=None, fi self.ilist = hg.interval_list([]) if file_content: - lines = file_content.split('\n') + lines = file_content.split("\n") else: - lines = str(open(file).read().decode()).split('\n') + lines = str(open(file).read().decode()).split("\n") ll = [l.strip().split() for l in lines if len(l.strip()) > 0] for l in ll: - if 'Segment' == l[0]: + if "Segment" == l[0]: s = hg.interval(l[2], int(l[3]), int(l[4]), info=[l[1]]) self.segment_dict[l[1]] = s self.segment_list.append(s) - elif 'Cycle=' in l[0]: - ls = l[0].split(';') - ci = ls[0].split('=')[1] - cn = float(ls[1].split('=')[1]) + elif "Cycle=" in l[0]: + ls = l[0].split(";") + ci = ls[0].split("=")[1] + cn = float(ls[1].split("=")[1]) cl = [] - for s in ls[2].split('=')[1].split(','): - if s[-1] == '+': + for s in ls[2].split("=")[1].split(","): + if s[-1] == "+": cl.append((s[:-1], 1)) else: cl.append((s[:-1], -1)) self.cycle_dict[ci] = (ci, cn, cl) - elif 'Interval' == l[0]: + elif "Interval" == l[0]: self.ilist.append(hg.interval(l[2], int(l[3]), int(l[4]), info=[l[1]])) elif cycle_list is None: - segment_set = hg.interval_list([hg.interval(ss[0], ss[1], ss[2]) for ss in {(s.chrom, s.start, s.end) for s in segment_list}]) + segment_set = hg.interval_list( + [hg.interval(ss[0], ss[1], ss[2]) for ss in {(s.chrom, s.start, s.end) for s in segment_list}] + ) segment_set.sort() self.segment_list = segment_set self.segment_dict = {} @@ -690,14 +801,14 @@ def __init__(self, segment_list=None, cycle_list=None, ilist=None, file=None, fi for ii in range(len(self.segment_list)): s = self.segment_list[ii] s.info = [seg_id[(s.chrom, s.start, s.end)]] - self.cycle_dict = {'1':('1', 1, cl)} + self.cycle_dict = {"1": ("1", 1, cl)} self.ilist = hg.interval_list([s[0] for s in segment_set.merge_clusters(extend=1)]) for ii in range(len(self.ilist)): self.ilist[ii].info = [str(ii)] else: self.segment_list = segment_list self.segment_dict = {s.info[0]: s for s in segment_list} - self.cycle_dict = {c[0]:c for c in cycle_list} + self.cycle_dict = {c[0]: c for c in cycle_list} if ilist is not None: self.ilist = ilist else: @@ -722,16 +833,21 @@ def merge(self, c1, c2, si1, si2): cycle1 = self.cycle_dict[c1] cycle2 = self.cycle_dict[c2] # check 
if atmost 1 cycle has source vertex - if '0' in [s[0] for s in cycle1[2]] and '0' in [s[0] for s in cycle2[2]]: + if "0" in [s[0] for s in cycle1[2]] and "0" in [s[0] for s in cycle2[2]]: raise Exception("Cannot merge 2 cycles with source vertices") # if cycle2 has source vertex, exchange c1,c2 - if '0' in [s[0] for s in cycle2[2]]: + if "0" in [s[0] for s in cycle2[2]]: (c1, c2, si1, si2, cycle1, cycle2) = (c2, c1, si2, si1, cycle2, cycle1) if si1 == 0 or si1 == len(cycle1[2]) - 1: raise Exception("Cannot use source segment for merging") # check if segments overlap if not self.segment_dict[cycle1[2][si1][0]].intersects(self.segment_dict[cycle2[2][si2][0]]): - raise Exception("Segments do not overlap" + str(self.segment_dict[cycle1[2][si1][0]]) + " " + str(self.segment_dict[cycle2[2][si2][0]])) + raise Exception( + "Segments do not overlap" + + str(self.segment_dict[cycle1[2][si1][0]]) + + " " + + str(self.segment_dict[cycle2[2][si2][0]]) + ) # cnlist: (merged cn, cycle1cn, cycle2cn) if cycle1[1] == 0 or cycle2[1] == 0: raise Exception("Cycle copy numbers should be > 0 to merge") @@ -744,19 +860,35 @@ def merge(self, c1, c2, si1, si2): seg1_found = False seg2_found = False for i in self.segment_list: - if cycle1[2][si1][1] == 1 and (i.chrom, i.start, i.end) == (seg1.chrom, seg1.start, seg2.end): + if cycle1[2][si1][1] == 1 and (i.chrom, i.start, i.end) == ( + seg1.chrom, + seg1.start, + seg2.end, + ): seg1_found = True ns1 = i.info[0] overlap1 = (ns1, cycle1[2][si1][1]) - elif cycle1[2][si1][1] == -1 and (i.chrom, i.start, i.end) == (seg1.chrom, seg2.start, seg1.end): + elif cycle1[2][si1][1] == -1 and (i.chrom, i.start, i.end) == ( + seg1.chrom, + seg2.start, + seg1.end, + ): seg1_found = True ns1 = i.info[0] overlap1 = (ns1, cycle1[2][si1][1]) - if cycle1[2][si1][1] == 1 and (i.chrom, i.start, i.end) == (seg1.chrom, seg2.start, seg1.end): + if cycle1[2][si1][1] == 1 and (i.chrom, i.start, i.end) == ( + seg1.chrom, + seg2.start, + seg1.end, + ): seg2_found = True ns2 = i.info[0] overlap2 = (ns2, cycle1[2][si1][1]) - elif cycle1[2][si1][1] == -1 and (i.chrom, i.start, i.end) == (seg1.chrom, seg1.start, seg2.end): + elif cycle1[2][si1][1] == -1 and (i.chrom, i.start, i.end) == ( + seg1.chrom, + seg1.start, + seg2.end, + ): seg2_found = True ns2 = i.info[0] overlap2 = (ns2, cycle1[2][si1][1]) @@ -780,10 +912,10 @@ def merge(self, c1, c2, si1, si2): if not cycle1[2][si1][1]: (overlap1, overlap2, ns1, ns2) = (overlap2, overlap1, ns2, ns1) if cycle1[2][si1][1] == cycle2[2][si2][1]: - cycle2_span = cycle2[2][si2 + 1:] + cycle2[2][:si2] + cycle2_span = cycle2[2][si2 + 1 :] + cycle2[2][:si2] else: - cycle2_span = [(s[0], -1 * s[1]) for s in cycle2[2][:si2][::-1] + cycle2[2][si2 + 1:][::-1]] - cycle1_final = cycle1[2][si1 + 1:] + cycle2_span = [(s[0], -1 * s[1]) for s in cycle2[2][:si2][::-1] + cycle2[2][si2 + 1 :][::-1]] + cycle1_final = cycle1[2][si1 + 1 :] mcycle = cycle1_init + [overlap1] + cycle2_span + [overlap2] + cycle1_final mcycle_id = self.next_cycle_id() self.cycle_dict[mcycle_id] = (mcycle_id, cnlist[0], mcycle) @@ -824,9 +956,14 @@ def pivot(self, c1, si1, si2): self.segment_list.append(self.segment_dict[ns2]) cycle1_init = cycle1[2][:si1] if cycle1[2][si1][1] == -1: - (overlap1, overlap2, ns1, ns2) = ((overlap2[0], -1 * overlap2[1]), (overlap1[0], -1 * overlap1[1]), ns2, ns1) - cycle1_span = [(s[0], -1 * s[1]) for s in cycle1[2][si1 + 1:si2][::-1]] - cycle1_final = cycle1[2][si2 + 1:] + (overlap1, overlap2, ns1, ns2) = ( + (overlap2[0], -1 * overlap2[1]), + (overlap1[0], -1 * 
overlap1[1]), + ns2, + ns1, + ) + cycle1_span = [(s[0], -1 * s[1]) for s in cycle1[2][si1 + 1 : si2][::-1]] + cycle1_final = cycle1[2][si2 + 1 :] mcycle = cycle1_init + [overlap1] + cycle1_span + [overlap2] + cycle1_final mcycle_id = self.next_cycle_id() self.cycle_dict[mcycle_id] = (mcycle_id, cycle1[1], mcycle) @@ -839,16 +976,32 @@ def fasta_sequence(self, cycle_list=None, outfasta=None): ccnlist.sort(reverse=True) print(ccnlist) cycle_list = [c[1] for c in ccnlist] - fseq = '' + fseq = "" if outfasta is not None: - outfile = open(outfasta, 'w') + outfile = open(outfasta, "w") for c in cycle_list: if outfasta is None: - fseq += '>Cycle' + c + " Copy_count=" + str(self.cycle_dict[c][1]) + ";Segments=" + ','.join([seg[0] + ('+' if seg[1] == 1 else '-') for seg in self.cycle_dict[c][2]]) + '\n' + fseq += ( + ">Cycle" + + c + + " Copy_count=" + + str(self.cycle_dict[c][1]) + + ";Segments=" + + ",".join([seg[0] + ("+" if seg[1] == 1 else "-") for seg in self.cycle_dict[c][2]]) + + "\n" + ) else: - outfile.write('>Cycle' + c + " Copy_count=" + str(self.cycle_dict[c][1]) + ";Segments=" + ','.join([seg[0] + ('+' if seg[1] == 1 else '-') for seg in self.cycle_dict[c][2]]) + '\n') + outfile.write( + ">Cycle" + + c + + " Copy_count=" + + str(self.cycle_dict[c][1]) + + ";Segments=" + + ",".join([seg[0] + ("+" if seg[1] == 1 else "-") for seg in self.cycle_dict[c][2]]) + + "\n" + ) for s in self.cycle_dict[c][2]: - if s[0] == '0': + if s[0] == "0": continue if s[1] == 1: if outfasta is None: @@ -861,21 +1014,29 @@ def fasta_sequence(self, cycle_list=None, outfasta=None): else: outfile.write(hg.reverse_complement(self.segment_dict[s[0]].sequence(new_fa_file=self.fa_file))) if outfasta is None: - fseq += '\n' + fseq += "\n" else: - outfile.write('\n') + outfile.write("\n") if outfasta is not None: outfile.close() return fseq - def __repr__(self): + def __repr__(self): s = "" for i in self.ilist: - s += '\t'.join(["Interval", i.info[0], i.chrom, str(i.start), str(i.end)]) + '\n' + s += "\t".join(["Interval", i.info[0], i.chrom, str(i.start), str(i.end)]) + "\n" for i in self.segment_list: - s += '\t'.join(["Segment", i.info[0], i.chrom, str(i.start), str(i.end)]) + '\n' + s += "\t".join(["Segment", i.info[0], i.chrom, str(i.start), str(i.end)]) + "\n" ccnlist = [(c[1], c[0]) for c in self.cycle_dict.values()] ccnlist.sort(reverse=True) for c in ccnlist: - s += "Cycle=" + c[1] + ";Copy_count=" + str(c[0]) + ";Segments=" + ','.join([seg[0] + ('+' if seg[1] == 1 else '-') for seg in self.cycle_dict[c[1]][2]]) + '\n' + s += ( + "Cycle=" + + c[1] + + ";Copy_count=" + + str(c[0]) + + ";Segments=" + + ",".join([seg[0] + ("+" if seg[1] == 1 else "-") for seg in self.cycle_dict[c[1]][2]]) + + "\n" + ) return s diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index b98ce458..6b5eefaa 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -56,18 +56,13 @@ def check_samplesheet(file_in, file_out, input_format): sample_mapping_dict = {} with open(file_in, "r") as fin: - if input_format == "FASTQ": ## Check header MIN_COLS = 2 HEADER = ["sample", "fastq_1", "fastq_2"] header = [x.strip('"') for x in fin.readline().strip().split(",")] if header[: len(HEADER)] != HEADER: - print( - "ERROR: Please check samplesheet header -> {} != {}".format( - ",".join(header), ",".join(HEADER) - ) - ) + print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER))) sys.exit(1) ## Check sample entries @@ -84,9 +79,7 @@ def check_samplesheet(file_in, file_out, 
input_format): num_cols = len([x for x in lspl if x]) if num_cols < MIN_COLS: print_error( - "Invalid number of populated columns (minimum = {})!".format( - MIN_COLS - ), + "Invalid number of populated columns (minimum = {})!".format(MIN_COLS), "Line", line, ) @@ -102,9 +95,7 @@ def check_samplesheet(file_in, file_out, input_format): if fastq: if fastq.find(" ") != -1: print_error("FastQ file contains spaces!", "Line", line) - if not fastq.endswith(".fastq.gz") and not fastq.endswith( - ".fq.gz" - ): + if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"): print_error( "FastQ file does not have extension '.fastq.gz' or '.fq.gz'!", "Line", @@ -117,18 +108,14 @@ def check_samplesheet(file_in, file_out, input_format): elif sample and fastq_1 and not fastq_2: ## Single-end short reads sample_info = ["1", fastq_1, fastq_2] else: - print_error( - "Invalid combination of columns provided!", "Line", line - ) + print_error("Invalid combination of columns provided!", "Line", line) ## Create sample mapping dictionary = { sample: [ single_end, fastq_1, fastq_2 ] } if sample not in sample_mapping_dict: sample_mapping_dict[sample] = [sample_info] else: if sample_info in sample_mapping_dict[sample]: - print_error( - "Samplesheet contains duplicate rows!", "Line", line - ) + print_error("Samplesheet contains duplicate rows!", "Line", line) else: sample_mapping_dict[sample].append(sample_info) @@ -137,11 +124,7 @@ def check_samplesheet(file_in, file_out, input_format): HEADER = ["sample", "bam"] header = [x.strip('"') for x in fin.readline().strip().split(",")] if header[: len(HEADER)] != HEADER: - print( - "ERROR: Please check samplesheet header -> {} != {}".format( - ",".join(header), ",".join(HEADER) - ) - ) + print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER))) sys.exit(1) ## Check sample entries @@ -158,9 +141,7 @@ def check_samplesheet(file_in, file_out, input_format): num_cols = len([x for x in lspl if x]) if num_cols < MIN_COLS: print_error( - "Invalid number of populated columns (minimum = {})!".format( - MIN_COLS - ), + "Invalid number of populated columns (minimum = {})!".format(MIN_COLS), "Line", line, ) @@ -188,9 +169,7 @@ def check_samplesheet(file_in, file_out, input_format): sample_mapping_dict[sample] = [sample_info] else: if sample_info in sample_mapping_dict[sample]: - print_error( - "Samplesheet contains duplicate rows!", "Line", line - ) + print_error("Samplesheet contains duplicate rows!", "Line", line) else: sample_mapping_dict[sample].append(sample_info) @@ -204,30 +183,20 @@ def check_samplesheet(file_in, file_out, input_format): make_dir(out_dir) with open(file_out, "w") as fout: if input_format == "FASTQ": - fout.write( - ",".join(["sample", "single_end", "fastq_1", "fastq_2"]) + "\n" - ) + fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2"]) + "\n") for sample in sorted(sample_mapping_dict.keys()): - ## Check that multiple runs of the same sample are of the same datatype - if not all( - x[0] == sample_mapping_dict[sample][0][0] - for x in sample_mapping_dict[sample] - ): + if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]): print_error( "Multiple runs of a sample must be of the same datatype!", "Sample: {}".format(sample), ) for idx, val in enumerate(sample_mapping_dict[sample]): - fout.write( - ",".join(["{}_T{}".format(sample, idx + 1)] + val) - + "\n" - ) + fout.write(",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n") elif input_format == "BAM": 
fout.write(",".join(["sample", "idx", "bam"]) + "\n") for sample in sorted(sample_mapping_dict.keys()): - for idx, val in enumerate(sample_mapping_dict[sample]): fout.write(",".join(["{}".format(sample)] + val) + "\n") diff --git a/bin/circle_map.py b/bin/circle_map.py index 17e9d4b8..13a6dd04 100755 --- a/bin/circle_map.py +++ b/bin/circle_map.py @@ -1,25 +1,25 @@ #!/usr/bin/env python -#MIT License +# MIT License # -#Copyright (c) 2019 Iñigo Prada Luengo +# Copyright (c) 2019 Iñigo Prada Luengo # -#Permission is hereby granted, free of charge, to any person obtaining a copy -#of this software and associated documentation files (the "Software"), to deal -#in the Software without restriction, including without limitation the rights -#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -#copies of the Software, and to permit persons to whom the Software is -#furnished to do so, subject to the following conditions: +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: # -#The above copyright notice and this permission notice shall be included in all -#copies or substantial portions of the Software. +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. # -#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -#SOFTWARE. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
import argparse @@ -33,7 +33,15 @@ from realigner import realignment from bam2bam import bam2bam from repeats import repeat -from utils import merge_final_output, filter_by_ratio, start_realign, start_simulate, mutate, insert_size_dist, write_clipped_read +from utils import ( + merge_final_output, + filter_by_ratio, + start_realign, + start_simulate, + mutate, + insert_size_dist, + write_clipped_read, +) from Coverage import coverage import multiprocessing as mp import pybedtools as bt @@ -44,17 +52,16 @@ from circlemap.__version__ import __version__ as cm_version import datetime -class circle_map: +class circle_map: def __getpid__(self): - pid = os.getpid() - return (pid) + return pid def __init__(self): self.parser = argparse.ArgumentParser( - description='Circle-Map', - usage='''Circle-Map [options] + description="Circle-Map", + usage="""Circle-Map [options] version=%s contact= https://github.com/iprada/Circle-Map/issues @@ -69,65 +76,66 @@ def __init__(self): Repeats Identify circular DNA from repetitive regions Simulate Simulate circular DNA -''' % cm_version) +""" + % cm_version, + ) subparsers = self.parser.add_subparsers() self.readextractor = subparsers.add_parser( name="ReadExtractor", - description='Extracts circular DNA read candidates', + description="Extracts circular DNA read candidates", prog="Circle-Map ReadExtractor", - usage='''Circle-Map ReadExtractor [options]''' - + usage="""Circle-Map ReadExtractor [options]""", ) self.realigner = subparsers.add_parser( name="Realign", - description='Realign circular DNA read candidates', + description="Realign circular DNA read candidates", prog="Circle-Map Realign", - usage='''Circle-Map Realign [options]''' - + usage="""Circle-Map Realign [options]""", ) self.repeats = subparsers.add_parser( name="Repeats", - description='Identify circular DNA from repetitive regions', + description="Identify circular DNA from repetitive regions", prog="Circle-Map Repeats", - usage='''Circle-Map Repeats [options]''' - + usage="""Circle-Map Repeats [options]""", ) self.simulate = subparsers.add_parser( name="Simulate", - description='Simulate eccDNA NGS datastes', + description="Simulate eccDNA NGS datastes", prog="Circle-Map Reepeats", - usage='''Circle-Map Simulate [options]''' - + usage="""Circle-Map Simulate [options]""", ) self.simulate = subparsers.add_parser( name="bam2bam", - description='Realign the soft-clipped reads and report', + description="Realign the soft-clipped reads and report", prog="Circle-Map bam2bam", - usage='''Circle-Map bam2bam [options]''' - + usage="""Circle-Map bam2bam [options]""", ) if len(sys.argv) <= 1: self.parser.print_help() time.sleep(0.01) - sys.stderr.write("\nNo argument given to Circle-Map" - "\nExiting\n") + sys.stderr.write("\nNo argument given to Circle-Map" "\nExiting\n") sys.exit(0) else: if sys.argv[1] == "ReadExtractor": - - self.subprogram = self.args_readextractor() self.args = self.subprogram.parse_args(sys.argv[2:]) - object = readExtractor(self.args.i, self.args.output, self.args.directory, self.args.quality, - self.args.nodiscordant, - self.args.nohardclipped, self.args.nosoftclipped, self.args.verbose, - self.subprogram) + object = readExtractor( + self.args.i, + self.args.output, + self.args.directory, + self.args.quality, + self.args.nodiscordant, + self.args.nohardclipped, + self.args.nosoftclipped, + self.args.verbose, + self.subprogram, + ) object.extract_sv_circleReads() elif sys.argv[1] == "Realign": @@ -135,71 +143,109 @@ def __init__(self): self.args = 
self.subprogram.parse_args(sys.argv[2:]) # get clusters - splitted, sorted_bam, begin = start_realign(self.args.i, self.args.output, self.args.threads, - self.args.verbose, self.__getpid__(), - self.args.clustering_dist) - - - + splitted, sorted_bam, begin = start_realign( + self.args.i, + self.args.output, + self.args.threads, + self.args.verbose, + self.__getpid__(), + self.args.clustering_dist, + ) sorted_bam.close() - #get global insert size prior + # get global insert size prior metrics = insert_size_dist(self.args.sample_size, self.args.insert_mapq, self.args.qbam) - # pool based parallel of religment m = mp.Manager() lock = m.Lock() - object = realignment(self.args.i, self.args.qbam, self.args.sbam, self.args.fasta, - self.args.directory, - self.args.mapq, - self.args.insert_mapq, self.args.std, self.args.sample_size, - self.args.gap_open, - self.args.gap_ext, self.args.nhits, self.args.cut_off, self.args.min_sc, - self.args.merge_fraction, self.args.interval_probability, self.args.output, - self.args.threads, self.args.allele_frequency, lock, self.args.split, - self.args.ratio, self.args.verbose, self.__getpid__(), - self.args.edit_distance_fraction, self.args.remap_splits, - self.args.only_discordants, self.args.split, - self.args.split_quality, metrics,self.args.number_of_discordants) - + object = realignment( + self.args.i, + self.args.qbam, + self.args.sbam, + self.args.fasta, + self.args.directory, + self.args.mapq, + self.args.insert_mapq, + self.args.std, + self.args.sample_size, + self.args.gap_open, + self.args.gap_ext, + self.args.nhits, + self.args.cut_off, + self.args.min_sc, + self.args.merge_fraction, + self.args.interval_probability, + self.args.output, + self.args.threads, + self.args.allele_frequency, + lock, + self.args.split, + self.args.ratio, + self.args.verbose, + self.__getpid__(), + self.args.edit_distance_fraction, + self.args.remap_splits, + self.args.only_discordants, + self.args.split, + self.args.split_quality, + metrics, + self.args.number_of_discordants, + ) pool = mp.Pool(processes=self.args.threads) - - #progress bar + # progress bar with tqdm(total=len(splitted)) as pbar: - for i,exits in tqdm(enumerate(pool.imap_unordered(object.realign, splitted))): + for i, exits in tqdm(enumerate(pool.imap_unordered(object.realign, splitted))): pbar.update() - #kill if process returns 1,1 - if exits == [1,1]: + # kill if process returns 1,1 + if exits == [1, 1]: pool.close() pool.terminate() pbar.close() - print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"), - "An error happenend during execution. Exiting") + print( + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"), + "An error happenend during execution. 
Exiting", + ) sys.exit() pbar.close() pool.close() pool.join() - output = merge_final_output(self.args.sbam, self.args.output, begin, self.args.split, - self.args.directory, - self.args.merge_fraction, self.__getpid__()) + output = merge_final_output( + self.args.sbam, + self.args.output, + begin, + self.args.split, + self.args.directory, + self.args.merge_fraction, + self.__getpid__(), + ) # compute coverage statistics if self.args.no_coverage == False: - - coverage_object = coverage(self.args.sbam, output, - self.args.bases, self.args.cmapq, self.args.extension, - self.args.directory) - - #Generator function for the coverage calculations + coverage_object = coverage( + self.args.sbam, + output, + self.args.bases, + self.args.cmapq, + self.args.extension, + self.args.directory, + ) + + # Generator function for the coverage calculations output = coverage_object.compute_coverage(coverage_object.get_wg_coverage()) filtered_output = filter_by_ratio(output, self.args.ratio) - filtered_output.to_csv(r'%s' % self.args.output, header=None, index=None, sep='\t', mode='w') + filtered_output.to_csv( + r"%s" % self.args.output, + header=None, + index=None, + sep="\t", + mode="w", + ) else: output.saveas("%s" % self.args.output) @@ -209,62 +255,75 @@ def __init__(self): self.args = self.subprogram.parse_args(sys.argv[2:]) # get clusters - splitted, sorted_bam, begin = start_realign(self.args.i, self.args.output, self.args.threads, - self.args.verbose, self.__getpid__(), - self.args.clustering_dist) - - - #create output bam + splitted, sorted_bam, begin = start_realign( + self.args.i, + self.args.output, + self.args.threads, + self.args.verbose, + self.__getpid__(), + self.args.clustering_dist, + ) + + # create output bam circle_sv_reads = ps.AlignmentFile(self.args.output, "wb", template=sorted_bam) sorted_bam.close() - #get global insert size prior + # get global insert size prior metrics = insert_size_dist(self.args.sample_size, self.args.insert_mapq, self.args.qbam) - - - manager = mp.Manager() - - - - lock = manager.Lock() - object = bam2bam(self.args.i,self.args.output,self.args.qbam, self.args.fasta, - self.args.directory, - self.args.mapq, - self.args.insert_mapq, self.args.std, self.args.sample_size, - self.args.gap_open, - self.args.gap_ext, self.args.nhits, self.args.cut_off, self.args.min_sc, - self.args.interval_probability, - self.args.threads, lock, - self.args.verbose, self.__getpid__(), - self.args.edit_distance_fraction, self.args.remap_splits, - self.args.only_discordants, - self.args.split_quality, metrics,manager) + object = bam2bam( + self.args.i, + self.args.output, + self.args.qbam, + self.args.fasta, + self.args.directory, + self.args.mapq, + self.args.insert_mapq, + self.args.std, + self.args.sample_size, + self.args.gap_open, + self.args.gap_ext, + self.args.nhits, + self.args.cut_off, + self.args.min_sc, + self.args.interval_probability, + self.args.threads, + lock, + self.args.verbose, + self.__getpid__(), + self.args.edit_distance_fraction, + self.args.remap_splits, + self.args.only_discordants, + self.args.split_quality, + metrics, + manager, + ) object.beta_version_warning() - pool = mp.Pool(processes=self.args.threads) # create writer process - writer_p = mp.Process(target=object.listener_writer, args=(circle_sv_reads,)) + writer_p = mp.Process(target=object.listener_writer, args=(circle_sv_reads,)) writer_p.daemon = True writer_p.start() - #progress bar + # progress bar with tqdm(total=len(splitted)) as pbar: - for i,exits in 
tqdm(enumerate(pool.imap_unordered(object.realign,splitted))): + for i, exits in tqdm(enumerate(pool.imap_unordered(object.realign, splitted))): pbar.update() # kill if process returns 1,1 if exits == [1, 1]: pool.close() pool.terminate() pbar.close() - print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"), - "An error happenend during execution. Exiting") + print( + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"), + "An error happenend during execution. Exiting", + ) sys.exit() pbar.close() @@ -277,31 +336,41 @@ def __init__(self): circle_sv_reads.close() print("Done") - elif sys.argv[1] == "Repeats": - self.subprogram = self.args_repeats() self.args = self.subprogram.parse_args(sys.argv[2:]) - - object = repeat(self.args.i, self.args.directory, self.args.mismatch, self.args.fraction, - self.args.read_number) + object = repeat( + self.args.i, + self.args.directory, + self.args.mismatch, + self.args.fraction, + self.args.read_number, + ) bed = object.find_circles() - coverage_object = coverage(self.args.i, bed, - self.args.bases, self.args.cmapq, self.args.extension, - self.args.directory) + coverage_object = coverage( + self.args.i, + bed, + self.args.bases, + self.args.cmapq, + self.args.extension, + self.args.directory, + ) output = coverage_object.compute_coverage(coverage_object.get_wg_coverage()) filtered_output = filter_by_ratio(output, self.args.ratio) - filtered_output.to_csv(r'%s' % self.args.output, header=None, index=None, sep='\t', mode='w') - - + filtered_output.to_csv( + r"%s" % self.args.output, + header=None, + index=None, + sep="\t", + mode="w", + ) elif sys.argv[1] == "Simulate": - self.subprogram = self.args_simulate() self.args = self.subprogram.parse_args(sys.argv[2:]) @@ -316,36 +385,58 @@ def __init__(self): # mutate reference genome if self.args.variants == True: - mutate(self.args.g, sim_pid, self.args.Indels, self.args.substitution, self.args.java_memory) - + mutate( + self.args.g, + sim_pid, + self.args.Indels, + self.args.substitution, + self.args.java_memory, + ) manager = mp.Manager() # Shared memory object circle_list = manager.list() - skipped_circles = mp.Value('i', 0) - correct_circles = mp.Value('i', 0) + skipped_circles = mp.Value("i", 0) + correct_circles = mp.Value("i", 0) jobs = [] # init the processes for i in range(self.args.processes): - p = mp.Process(target=sim_ecc_reads, - args=(self.args.g, self.args.read_length, self.args.directory, - int(round(self.args.read_number / self.args.processes)), - self.args.skip_region, self.args.base_name, - self.args.mean_insert_size, self.args.error, - self.args.mean_coverage, lock, i, circle_list, - "%s_1.fastq" % self.args.base_name, "%s_2.fastq" % self.args.base_name, - skipped_circles, - correct_circles, self.args.insRate, self.args.insRate2, self.args.delRate, - self.args.delRate2, sim_pid,)) + p = mp.Process( + target=sim_ecc_reads, + args=( + self.args.g, + self.args.read_length, + self.args.directory, + int(round(self.args.read_number / self.args.processes)), + self.args.skip_region, + self.args.base_name, + self.args.mean_insert_size, + self.args.error, + self.args.mean_coverage, + lock, + i, + circle_list, + "%s_1.fastq" % self.args.base_name, + "%s_2.fastq" % self.args.base_name, + skipped_circles, + correct_circles, + self.args.insRate, + self.args.insRate2, + self.args.delRate, + self.args.delRate2, + sim_pid, + ), + ) jobs.append(p) p.start() # kill the process for p in jobs: p.join() print("Skipped %s circles, that overlapped the provided regions to exclude" % skipped_circles.value) 
- print("Simulated %s circles across %s parallel processes" % ( - correct_circles.value, self.args.processes)) + print( + "Simulated %s circles across %s parallel processes" % (correct_circles.value, self.args.processes) + ) print("Writting to disk bed file containing the simulated circle coordinates") bt.BedTool(list(circle_list)).saveas(self.args.output) @@ -353,92 +444,153 @@ def __init__(self): else: self.parser.print_help() time.sleep(0.01) - sys.stderr.write("\nWrong argument given to Circle-Map" - "\nExiting\n") + sys.stderr.write("\nWrong argument given to Circle-Map" "\nExiting\n") sys.exit(0) def args_readextractor(self): - parser = self.readextractor parser._action_groups.pop() - required = parser.add_argument_group('required arguments') - optional = parser.add_argument_group('optional arguments') + required = parser.add_argument_group("required arguments") + optional = parser.add_argument_group("optional arguments") # prefixing the argument with -- means it's optional # input and output - required.add_argument('-i', metavar='', help="Input: query name sorted bam file") + required.add_argument("-i", metavar="", help="Input: query name sorted bam file") if "-i" in sys.argv: - optional.add_argument('-o', '--output', metavar='', - help="Ouput: Reads indicating circular DNA structural variants", - default="circle_%s" % sys.argv[sys.argv.index("-i") + 1]) - - optional.add_argument('-dir', '--directory', metavar='', - help="Working directory, default is the working directory", - default=os.getcwd()) + optional.add_argument( + "-o", + "--output", + metavar="", + help="Ouput: Reads indicating circular DNA structural variants", + default="circle_%s" % sys.argv[sys.argv.index("-i") + 1], + ) + + optional.add_argument( + "-dir", + "--directory", + metavar="", + help="Working directory, default is the working directory", + default=os.getcwd(), + ) # mapping quality cutoff - optional.add_argument('-q', '--quality', type=int, metavar='', - help="bwa-mem mapping quality cutoff. Default value 10", - default=10) + optional.add_argument( + "-q", + "--quality", + type=int, + metavar="", + help="bwa-mem mapping quality cutoff. 
Default value 10", + default=10, + ) # read extraction options # extract discordant reads - optional.add_argument('-nd', '--nodiscordant', help="Turn off discordant (R2F1 oriented) read extraction", - action='store_true') + optional.add_argument( + "-nd", + "--nodiscordant", + help="Turn off discordant (R2F1 oriented) read extraction", + action="store_true", + ) # soft-clipped argument - optional.add_argument('-nsc', '--nosoftclipped', help="Turn off soft-clipped read extraction", - action='store_true') + optional.add_argument( + "-nsc", + "--nosoftclipped", + help="Turn off soft-clipped read extraction", + action="store_true", + ) # extract hard-clippped reads - optional.add_argument('-nhc', '--nohardclipped', help="Turn off hard-clipped read extraction", - action='store_true') + optional.add_argument( + "-nhc", + "--nohardclipped", + help="Turn off hard-clipped read extraction", + action="store_true", + ) # verbose level - optional.add_argument('-v', '--verbose', type=int, metavar='', - help='Verbose level, 1=error,2=warning, 3=message', - choices=[1, 2, 3], default=3) + optional.add_argument( + "-v", + "--verbose", + type=int, + metavar="", + help="Verbose level, 1=error,2=warning, 3=message", + choices=[1, 2, 3], + default=3, + ) else: - optional.add_argument('-o', '--output', metavar='', - help="Ouput: Reads indicating circular DNA structural variants") - - optional.add_argument('-dir', '--directory', metavar='', - help="Working directory, default is the working directory", - default=os.getcwd()) + optional.add_argument( + "-o", + "--output", + metavar="", + help="Ouput: Reads indicating circular DNA structural variants", + ) + + optional.add_argument( + "-dir", + "--directory", + metavar="", + help="Working directory, default is the working directory", + default=os.getcwd(), + ) # mapping quality cutoff - optional.add_argument('-q', '--quality', type=int, metavar='', - help="bwa-mem mapping quality cutoff. Default value 10", - default=10) + optional.add_argument( + "-q", + "--quality", + type=int, + metavar="", + help="bwa-mem mapping quality cutoff. Default value 10", + default=10, + ) # read extraction options # extract discordant reads - optional.add_argument('-nd', '--nodiscordant', help="Turn off discordant (R2F1 oriented) read extraction", - action='store_true') + optional.add_argument( + "-nd", + "--nodiscordant", + help="Turn off discordant (R2F1 oriented) read extraction", + action="store_true", + ) # soft-clipped argument - optional.add_argument('-nsc', '--nosoftclipped', help="Turn off soft-clipped read extraction", - action='store_true') + optional.add_argument( + "-nsc", + "--nosoftclipped", + help="Turn off soft-clipped read extraction", + action="store_true", + ) # extract hard-clippped reads - optional.add_argument('-nhc', '--nohardclipped', help="Turn off hard-clipped read extraction", - action='store_true') + optional.add_argument( + "-nhc", + "--nohardclipped", + help="Turn off hard-clipped read extraction", + action="store_true", + ) # verbose level - optional.add_argument('-v', '--verbose', type=int, metavar='', - help='Verbose level, 1=error,2=warning, 3=message. Default=3', - choices=[1, 2, 3], default=3) + optional.add_argument( + "-v", + "--verbose", + type=int, + metavar="", + help="Verbose level, 1=error,2=warning, 3=message. 
Default=3", + choices=[1, 2, 3], + default=3, + ) parser.print_help() time.sleep(0.01) sys.stderr.write( "\nNo input or output input given to readExtractor, be sure that you are providing the flags'-i' and '-o'" - "\nExiting\n") + "\nExiting\n" + ) sys.exit(0) # parse the commands @@ -449,268 +601,539 @@ def args_readextractor(self): sys.stderr.write("\nNo arguments given to read extractor. Exiting\n") sys.exit(0) - return (parser) + return parser def args_realigner(self): parser = self.realigner # declare the different groups for the parser parser._action_groups.pop() - io_options = parser.add_argument_group('Input/Output options') - alignment_options = parser.add_argument_group('Alignment options') - out_decision = parser.add_argument_group('eccDNA output options') - i_size_estimate = parser.add_argument_group('Insert size estimation options') - interval = parser.add_argument_group('Interval processing options') - coverage_metrics = parser.add_argument_group('Coverage metrics options') - running = parser.add_argument_group('Running options') - - io_options.add_argument('-i', metavar='', - help="Input: bam file containing the reads extracted by ReadExtractor") - io_options.add_argument('-qbam', metavar='', help="Input: query name sorted bam file") - io_options.add_argument('-sbam', metavar='', help="Input: coordinate sorted bam file") - io_options.add_argument('-fasta', metavar='', help="Input: Reference genome fasta file") + io_options = parser.add_argument_group("Input/Output options") + alignment_options = parser.add_argument_group("Alignment options") + out_decision = parser.add_argument_group("eccDNA output options") + i_size_estimate = parser.add_argument_group("Insert size estimation options") + interval = parser.add_argument_group("Interval processing options") + coverage_metrics = parser.add_argument_group("Coverage metrics options") + running = parser.add_argument_group("Running options") + + io_options.add_argument( + "-i", + metavar="", + help="Input: bam file containing the reads extracted by ReadExtractor", + ) + io_options.add_argument("-qbam", metavar="", help="Input: query name sorted bam file") + io_options.add_argument("-sbam", metavar="", help="Input: coordinate sorted bam file") + io_options.add_argument("-fasta", metavar="", help="Input: Reference genome fasta file") if "-i" and "-qbam" and "-fasta" in sys.argv: # output - io_options.add_argument('-o', '--output', metavar='', help="Output filename", - default="circle_%s.bed" % sys.argv[sys.argv.index("-i") + 1]) + io_options.add_argument( + "-o", + "--output", + metavar="", + help="Output filename", + default="circle_%s.bed" % sys.argv[sys.argv.index("-i") + 1], + ) # alignment - alignment_options.add_argument('-n', '--nhits', type=int, metavar='', - help="Number of realignment attempts. Default: 10", - default=10) - - alignment_options.add_argument('-p', '--cut_off', type=float, metavar='', - help="Probability cut-off for considering a soft-clipped as realigned: Default: 0.99", - default=0.99) - - alignment_options.add_argument('-m', '--min_sc', type=float, metavar='', - help="Minimum soft-clipped length to attempt the realignment. Default: 8", - default=8) - - alignment_options.add_argument('-g', '--gap_open', type=int, metavar='', - help="Gap open penalty in the position specific scoring matrix. Default: 5", - default=5) - - alignment_options.add_argument('-e', '--gap_ext', type=int, metavar='', - help="Gap extension penalty in the position specific scoring matrix. 
Default: 1", - default=1) - - alignment_options.add_argument('-q', '--mapq', type=int, metavar='', - help="Minimum mapping quality allowed in the supplementary alignments. Default: 20", - default=20) - - alignment_options.add_argument('-d', '--edit_distance-fraction', type=float, metavar='', - help="Maximum edit distance fraction allowed in the first realignment. Default (0.05)", - default=0.05) - - alignment_options.add_argument('-Q', '--split_quality', type=float, metavar='', - help="Minium split score to output an interval. Default (0.0)", - default=0.0) - - alignment_options.add_argument('-R', '--remap_splits', help="Remap probabilistacally the split reads", - action='store_true') + alignment_options.add_argument( + "-n", + "--nhits", + type=int, + metavar="", + help="Number of realignment attempts. Default: 10", + default=10, + ) + + alignment_options.add_argument( + "-p", + "--cut_off", + type=float, + metavar="", + help="Probability cut-off for considering a soft-clipped as realigned: Default: 0.99", + default=0.99, + ) + + alignment_options.add_argument( + "-m", + "--min_sc", + type=float, + metavar="", + help="Minimum soft-clipped length to attempt the realignment. Default: 8", + default=8, + ) + + alignment_options.add_argument( + "-g", + "--gap_open", + type=int, + metavar="", + help="Gap open penalty in the position specific scoring matrix. Default: 5", + default=5, + ) + + alignment_options.add_argument( + "-e", + "--gap_ext", + type=int, + metavar="", + help="Gap extension penalty in the position specific scoring matrix. Default: 1", + default=1, + ) + + alignment_options.add_argument( + "-q", + "--mapq", + type=int, + metavar="", + help="Minimum mapping quality allowed in the supplementary alignments. Default: 20", + default=20, + ) + + alignment_options.add_argument( + "-d", + "--edit_distance-fraction", + type=float, + metavar="", + help="Maximum edit distance fraction allowed in the first realignment. Default (0.05)", + default=0.05, + ) + + alignment_options.add_argument( + "-Q", + "--split_quality", + type=float, + metavar="", + help="Minium split score to output an interval. Default (0.0)", + default=0.0, + ) + + alignment_options.add_argument( + "-R", + "--remap_splits", + help="Remap probabilistacally the split reads", + action="store_true", + ) # insert size - i_size_estimate.add_argument('-iq', '--insert_mapq', type=int, metavar='', - help="Mapq cutoff for stimating the insert size distribution. Default 60", - default=60) - - i_size_estimate.add_argument('-sd', '--std', type=int, metavar='', - help="Standard deviations of the insert size to extend the intervals. Default 5", - default=4) - - i_size_estimate.add_argument('-s', '--sample_size', type=int, metavar='', - help="Number of concordant reads (R2F1) to use for estimating the insert size distribution. Default 100000", - default=100000) + i_size_estimate.add_argument( + "-iq", + "--insert_mapq", + type=int, + metavar="", + help="Mapq cutoff for stimating the insert size distribution. Default 60", + default=60, + ) + + i_size_estimate.add_argument( + "-sd", + "--std", + type=int, + metavar="", + help="Standard deviations of the insert size to extend the intervals. Default 5", + default=4, + ) + + i_size_estimate.add_argument( + "-s", + "--sample_size", + type=int, + metavar="", + help="Number of concordant reads (R2F1) to use for estimating the insert size distribution. 
Default 100000", + default=100000, + ) # Interval options - interval.add_argument('-f', '--merge_fraction', type=float, metavar='', - help="Merge intervals reciprocally overlapping by a fraction. Default 0.99", - default=0.99) - - interval.add_argument('-P', '--interval_probability', type=float, metavar='', - help="Skip edges of the graph with a probability below the threshold. Default: 0.01", - default=0.01) - interval.add_argument('-K', '--clustering_dist', type=int, metavar='', - help="Cluster reads that are K nucleotides appart in the same node. Default: 500", - default=500) - - interval.add_argument('-D', '--only_discordants', help="Use only discordant reads to build the graph", - action='store_false') - interval.add_argument('-F', '--allele_frequency', type=float, metavar='', - help="Minimum allele frequency required to report the circle interval. Default (0.1)", - default=0.1) + interval.add_argument( + "-f", + "--merge_fraction", + type=float, + metavar="", + help="Merge intervals reciprocally overlapping by a fraction. Default 0.99", + default=0.99, + ) + + interval.add_argument( + "-P", + "--interval_probability", + type=float, + metavar="", + help="Skip edges of the graph with a probability below the threshold. Default: 0.01", + default=0.01, + ) + interval.add_argument( + "-K", + "--clustering_dist", + type=int, + metavar="", + help="Cluster reads that are K nucleotides appart in the same node. Default: 500", + default=500, + ) + + interval.add_argument( + "-D", + "--only_discordants", + help="Use only discordant reads to build the graph", + action="store_false", + ) + interval.add_argument( + "-F", + "--allele_frequency", + type=float, + metavar="", + help="Minimum allele frequency required to report the circle interval. Default (0.1)", + default=0.1, + ) # When to call a circle - out_decision.add_argument('-S', '--split', type=int, metavar='', - help="Number of required split reads to output a eccDNA. Default: 0", - default=0) - - out_decision.add_argument('-O', '--number_of_discordants', type=int, metavar='', - help="Number of required discordant reads for intervals with only discordants. Default: 3", - default=3) - out_decision.add_argument('-r', '--ratio', type=float, metavar='', - help="Minimum in/out required coverage ratio. Default: 0.0", - default=0.0) + out_decision.add_argument( + "-S", + "--split", + type=int, + metavar="", + help="Number of required split reads to output a eccDNA. Default: 0", + default=0, + ) + + out_decision.add_argument( + "-O", + "--number_of_discordants", + type=int, + metavar="", + help="Number of required discordant reads for intervals with only discordants. Default: 3", + default=3, + ) + out_decision.add_argument( + "-r", + "--ratio", + type=float, + metavar="", + help="Minimum in/out required coverage ratio. Default: 0.0", + default=0.0, + ) # coverage metrics - coverage_metrics.add_argument('-N', '--no_coverage', help="Don't compute coverage statistics", - action='store_true') - - coverage_metrics.add_argument('-b', '--bases', type=int, metavar='', - help="Number of bases to extend for computing the coverage ratio. Default: 200", - default=200) - - coverage_metrics.add_argument('-cq', '--cmapq', type=int, metavar='', - help="Minimum mapping quality treshold for coverage computation. Default: 0", - default=0) - - coverage_metrics.add_argument('-E', '--extension', type=int, metavar='', - help="Number of bases inside the eccDNA breakpoint coordinates to compute the ratio. 
Default: 100", - default=100) + coverage_metrics.add_argument( + "-N", + "--no_coverage", + help="Don't compute coverage statistics", + action="store_true", + ) + + coverage_metrics.add_argument( + "-b", + "--bases", + type=int, + metavar="", + help="Number of bases to extend for computing the coverage ratio. Default: 200", + default=200, + ) + + coverage_metrics.add_argument( + "-cq", + "--cmapq", + type=int, + metavar="", + help="Minimum mapping quality treshold for coverage computation. Default: 0", + default=0, + ) + + coverage_metrics.add_argument( + "-E", + "--extension", + type=int, + metavar="", + help="Number of bases inside the eccDNA breakpoint coordinates to compute the ratio. Default: 100", + default=100, + ) # run options - running.add_argument('-t', '--threads', type=int, metavar='', - help="Number of threads to use.Default 1", - default=1) - - running.add_argument('-dir', '--directory', metavar='', - help="Working directory, default is the working directory", - default=os.getcwd()) - - running.add_argument('-v', '--verbose', type=int, metavar='', - help='Verbose level, 1=error,2=warning, 3=message', - choices=[1, 2, 3], default=3) - - + running.add_argument( + "-t", + "--threads", + type=int, + metavar="", + help="Number of threads to use.Default 1", + default=1, + ) + + running.add_argument( + "-dir", + "--directory", + metavar="", + help="Working directory, default is the working directory", + default=os.getcwd(), + ) + + running.add_argument( + "-v", + "--verbose", + type=int, + metavar="", + help="Verbose level, 1=error,2=warning, 3=message", + choices=[1, 2, 3], + default=3, + ) else: # output - io_options.add_argument('-o', metavar='', help="Output filename") - - alignment_options.add_argument('-n', '--nhits', type=int, metavar='', - help="Number of realignment attempts. Default: 10", - default=10) - - alignment_options.add_argument('-p', '--cut_off', type=float, metavar='', - help="Probability cut-off for considering a soft-clipped as realigned: Default: 0.99", - default=0.99) - - alignment_options.add_argument('-m', '--min_sc', type=float, metavar='', - help="Minimum soft-clipped length to attempt the realignment. Default: 8", - default=8) - - alignment_options.add_argument('-g', '--gap_open', type=int, metavar='', - help="Gap open penalty in the position specific scoring matrix. Default: 5", - default=5) - - alignment_options.add_argument('-e', '--gap_ext', type=int, metavar='', - help="Gap extension penalty in the position specific scoring matrix. Default: 1", - default=1) - - alignment_options.add_argument('-q', '--mapq', type=int, metavar='', - help="Minimum mapping quality allowed in the supplementary alignments. Default: 20", - default=20) - - alignment_options.add_argument('-d', '--edit_distance-fraction', type=float, metavar='', - help="Maximum edit distance fraction allowed in the first realignment. Default (0.05)", - default=0.05) - - alignment_options.add_argument('-Q', '--split_quality', type=float, metavar='', - help="Minium split score to output an interval. Default (0.0)", - default=0.0) - alignment_options.add_argument('-R', '--remap_splits', help="Remap probabilistacally bwa-mem split reads", - action='store_true') + io_options.add_argument("-o", metavar="", help="Output filename") + + alignment_options.add_argument( + "-n", + "--nhits", + type=int, + metavar="", + help="Number of realignment attempts. 
Default: 10", + default=10, + ) + + alignment_options.add_argument( + "-p", + "--cut_off", + type=float, + metavar="", + help="Probability cut-off for considering a soft-clipped as realigned: Default: 0.99", + default=0.99, + ) + + alignment_options.add_argument( + "-m", + "--min_sc", + type=float, + metavar="", + help="Minimum soft-clipped length to attempt the realignment. Default: 8", + default=8, + ) + + alignment_options.add_argument( + "-g", + "--gap_open", + type=int, + metavar="", + help="Gap open penalty in the position specific scoring matrix. Default: 5", + default=5, + ) + + alignment_options.add_argument( + "-e", + "--gap_ext", + type=int, + metavar="", + help="Gap extension penalty in the position specific scoring matrix. Default: 1", + default=1, + ) + + alignment_options.add_argument( + "-q", + "--mapq", + type=int, + metavar="", + help="Minimum mapping quality allowed in the supplementary alignments. Default: 20", + default=20, + ) + + alignment_options.add_argument( + "-d", + "--edit_distance-fraction", + type=float, + metavar="", + help="Maximum edit distance fraction allowed in the first realignment. Default (0.05)", + default=0.05, + ) + + alignment_options.add_argument( + "-Q", + "--split_quality", + type=float, + metavar="", + help="Minium split score to output an interval. Default (0.0)", + default=0.0, + ) + alignment_options.add_argument( + "-R", + "--remap_splits", + help="Remap probabilistacally bwa-mem split reads", + action="store_true", + ) # insert size - i_size_estimate.add_argument('-iq', '--insert_mapq', type=int, metavar='', - help="Mapq cutoff for stimating the insert size distribution. Default 60", - default=60) - - i_size_estimate.add_argument('-sd', '--std', type=int, metavar='', - help="Standard deviations of the insert size to extend the intervals. Default 5", - default=5) - - i_size_estimate.add_argument('-s', '--sample_size', type=int, metavar='', - help="Number of concordant reads (R2F1) to use for estimating the insert size distribution. Default 100000", - default=100000) + i_size_estimate.add_argument( + "-iq", + "--insert_mapq", + type=int, + metavar="", + help="Mapq cutoff for stimating the insert size distribution. Default 60", + default=60, + ) + + i_size_estimate.add_argument( + "-sd", + "--std", + type=int, + metavar="", + help="Standard deviations of the insert size to extend the intervals. Default 5", + default=5, + ) + + i_size_estimate.add_argument( + "-s", + "--sample_size", + type=int, + metavar="", + help="Number of concordant reads (R2F1) to use for estimating the insert size distribution. Default 100000", + default=100000, + ) # Interval options - interval.add_argument('-f', '--merge_fraction', type=float, metavar='', - help="Merge intervals reciprocally overlapping by a fraction. Default 0.99", - default=0.99) - - interval.add_argument('-P', '--interval_probability', type=float, metavar='', - help="Skip edges of the graph with a probability below the threshold. Default: 0.01", - default=0.01) - interval.add_argument('-K', '--clustering_dist', type=int, metavar='', - help="Cluster reads that are K nucleotides appart in the same node. Default: 500", - default=500) - interval.add_argument('-D', '--only_discordants', help="Use only discordant reads to build the graph", - action='store_true') - interval.add_argument('-F', '--allele_frequency', type=float, metavar='', - help="Minimum allele frequency required to report the circle interval. 
Default (0.1)", - default=0.1) + interval.add_argument( + "-f", + "--merge_fraction", + type=float, + metavar="", + help="Merge intervals reciprocally overlapping by a fraction. Default 0.99", + default=0.99, + ) + + interval.add_argument( + "-P", + "--interval_probability", + type=float, + metavar="", + help="Skip edges of the graph with a probability below the threshold. Default: 0.01", + default=0.01, + ) + interval.add_argument( + "-K", + "--clustering_dist", + type=int, + metavar="", + help="Cluster reads that are K nucleotides appart in the same node. Default: 500", + default=500, + ) + interval.add_argument( + "-D", + "--only_discordants", + help="Use only discordant reads to build the graph", + action="store_true", + ) + interval.add_argument( + "-F", + "--allele_frequency", + type=float, + metavar="", + help="Minimum allele frequency required to report the circle interval. Default (0.1)", + default=0.1, + ) # When to call a circle - out_decision.add_argument('-S', '--split', type=int, metavar='', - help="Number of required split reads to output a eccDNA. Default: 0", - default=0) - out_decision.add_argument('-O', '--number_of_discordants', type=int, metavar='', - help="Number of required discordant reads for intervals with only discordants. Default: 3", - default=3) - - out_decision.add_argument('-r', '--ratio', type=float, metavar='', - help="Minimum in/out required coverage ratio. Default: 0.0", - default=0.0) + out_decision.add_argument( + "-S", + "--split", + type=int, + metavar="", + help="Number of required split reads to output a eccDNA. Default: 0", + default=0, + ) + out_decision.add_argument( + "-O", + "--number_of_discordants", + type=int, + metavar="", + help="Number of required discordant reads for intervals with only discordants. Default: 3", + default=3, + ) + + out_decision.add_argument( + "-r", + "--ratio", + type=float, + metavar="", + help="Minimum in/out required coverage ratio. Default: 0.0", + default=0.0, + ) # coverage metrics - coverage_metrics.add_argument('-N', '--no_coverage', help="Don't compute coverage statistics", - action='store_true') - - coverage_metrics.add_argument('-b', '--bases', type=int, metavar='', - help="Number of bases to extend for computing the coverage ratio. Default: 200", - default=200) - - coverage_metrics.add_argument('-cq', '--cmapq', type=int, metavar='', - help="Minimum mapping quality treshold for coverage computation. Default: 0", - default=0) - - coverage_metrics.add_argument('-E', '--extension', type=int, metavar='', - help="Number of bases inside the eccDNA breakpoint coordinates to compute the ratio. Default: 100", - default=100) + coverage_metrics.add_argument( + "-N", + "--no_coverage", + help="Don't compute coverage statistics", + action="store_true", + ) + + coverage_metrics.add_argument( + "-b", + "--bases", + type=int, + metavar="", + help="Number of bases to extend for computing the coverage ratio. Default: 200", + default=200, + ) + + coverage_metrics.add_argument( + "-cq", + "--cmapq", + type=int, + metavar="", + help="Minimum mapping quality treshold for coverage computation. Default: 0", + default=0, + ) + + coverage_metrics.add_argument( + "-E", + "--extension", + type=int, + metavar="", + help="Number of bases inside the eccDNA breakpoint coordinates to compute the ratio. 
Default: 100", + default=100, + ) # Running options - running.add_argument('-t', '--threads', type=int, metavar='', - help="Number of threads to use.Default 1", - default=1) - - running.add_argument('-dir', '--directory', metavar='', - help="Working directory, default is the working directory", - default=os.getcwd()) - - running.add_argument('-v', '--verbose', type=int, metavar='', - help='Verbose level, 1=error,2=warning, 3=message', - choices=[1, 2, 3], default=3) + running.add_argument( + "-t", + "--threads", + type=int, + metavar="", + help="Number of threads to use.Default 1", + default=1, + ) + + running.add_argument( + "-dir", + "--directory", + metavar="", + help="Working directory, default is the working directory", + default=os.getcwd(), + ) + + running.add_argument( + "-v", + "--verbose", + type=int, + metavar="", + help="Verbose level, 1=error,2=warning, 3=message", + choices=[1, 2, 3], + default=3, + ) # find out which arguments are missing parser.print_help() time.sleep(0.01) - sys.stderr.write("\nInput does not match. Check that you provide the -i, -qbam and -fasta options" - "\nExiting\n") + sys.stderr.write( + "\nInput does not match. Check that you provide the -i, -qbam and -fasta options" "\nExiting\n" + ) sys.exit(0) if len(sys.argv[2:]) == 0: @@ -719,197 +1142,366 @@ def args_realigner(self): sys.stderr.write("\nNo arguments given to Realign. Exiting\n") sys.exit(0) - return (parser) - + return parser def args_bam2bam(self): parser = self.realigner # declare the different groups for the parser parser._action_groups.pop() - io_options = parser.add_argument_group('Required') - alignment_options = parser.add_argument_group('Alignment options') - i_size_estimate = parser.add_argument_group('Insert size estimation options') - interval = parser.add_argument_group('Interval processing options') - running = parser.add_argument_group('Running options') - - io_options.add_argument('-i', metavar='', - help="Input: bam file containing the reads extracted by ReadExtractor") - io_options.add_argument('-qbam', metavar='', help="Input: query name sorted bam file") - io_options.add_argument('-fasta', metavar='', help="Input: Reference genome fasta file") - io_options.add_argument('-o', '--output', metavar='', help="Output BAM name") + io_options = parser.add_argument_group("Required") + alignment_options = parser.add_argument_group("Alignment options") + i_size_estimate = parser.add_argument_group("Insert size estimation options") + interval = parser.add_argument_group("Interval processing options") + running = parser.add_argument_group("Running options") + + io_options.add_argument( + "-i", + metavar="", + help="Input: bam file containing the reads extracted by ReadExtractor", + ) + io_options.add_argument("-qbam", metavar="", help="Input: query name sorted bam file") + io_options.add_argument("-fasta", metavar="", help="Input: Reference genome fasta file") + io_options.add_argument("-o", "--output", metavar="", help="Output BAM name") if "-i" and "-qbam" and "-fasta" and "-o" in sys.argv: # output - - # alignment - alignment_options.add_argument('-n', '--nhits', type=int, metavar='', - help="Number of realignment attempts. Default: 10", - default=10) - - alignment_options.add_argument('-p', '--cut_off', type=float, metavar='', - help="Probability cut-off for considering a soft-clipped as realigned: Default: 0.99", - default=0.99) - - alignment_options.add_argument('-m', '--min_sc', type=float, metavar='', - help="Minimum soft-clipped length to attempt the realignment. 
Default: 8", - default=8) - - alignment_options.add_argument('-g', '--gap_open', type=int, metavar='', - help="Gap open penalty in the position specific scoring matrix. Default: 5", - default=5) - - alignment_options.add_argument('-e', '--gap_ext', type=int, metavar='', - help="Gap extension penalty in the position specific scoring matrix. Default: 1", - default=1) - - alignment_options.add_argument('-q', '--mapq', type=int, metavar='', - help="Minimum mapping quality allowed in the supplementary alignments. Default: 20", - default=20) - - alignment_options.add_argument('-d', '--edit_distance-fraction', type=float, metavar='', - help="Maximum edit distance fraction allowed in the first realignment. Default (0.05)", - default=0.05) - - alignment_options.add_argument('-Q', '--split_quality', type=float, metavar='', - help="Minium split score to output an interval. Default (0.0)", - default=0.0) - - alignment_options.add_argument('-R', '--remap_splits', help="Remap probabilistacally the split reads", - action='store_true') + alignment_options.add_argument( + "-n", + "--nhits", + type=int, + metavar="", + help="Number of realignment attempts. Default: 10", + default=10, + ) + + alignment_options.add_argument( + "-p", + "--cut_off", + type=float, + metavar="", + help="Probability cut-off for considering a soft-clipped as realigned: Default: 0.99", + default=0.99, + ) + + alignment_options.add_argument( + "-m", + "--min_sc", + type=float, + metavar="", + help="Minimum soft-clipped length to attempt the realignment. Default: 8", + default=8, + ) + + alignment_options.add_argument( + "-g", + "--gap_open", + type=int, + metavar="", + help="Gap open penalty in the position specific scoring matrix. Default: 5", + default=5, + ) + + alignment_options.add_argument( + "-e", + "--gap_ext", + type=int, + metavar="", + help="Gap extension penalty in the position specific scoring matrix. Default: 1", + default=1, + ) + + alignment_options.add_argument( + "-q", + "--mapq", + type=int, + metavar="", + help="Minimum mapping quality allowed in the supplementary alignments. Default: 20", + default=20, + ) + + alignment_options.add_argument( + "-d", + "--edit_distance-fraction", + type=float, + metavar="", + help="Maximum edit distance fraction allowed in the first realignment. Default (0.05)", + default=0.05, + ) + + alignment_options.add_argument( + "-Q", + "--split_quality", + type=float, + metavar="", + help="Minium split score to output an interval. Default (0.0)", + default=0.0, + ) + + alignment_options.add_argument( + "-R", + "--remap_splits", + help="Remap probabilistacally the split reads", + action="store_true", + ) # insert size - i_size_estimate.add_argument('-iq', '--insert_mapq', type=int, metavar='', - help="Mapq cutoff for stimating the insert size distribution. Default 60", - default=60) - - i_size_estimate.add_argument('-sd', '--std', type=int, metavar='', - help="Standard deviations of the insert size to extend the intervals. Default 5", - default=4) - - i_size_estimate.add_argument('-s', '--sample_size', type=int, metavar='', - help="Number of concordant reads (R2F1) to use for estimating the insert size distribution. Default 100000", - default=100000) + i_size_estimate.add_argument( + "-iq", + "--insert_mapq", + type=int, + metavar="", + help="Mapq cutoff for stimating the insert size distribution. Default 60", + default=60, + ) + + i_size_estimate.add_argument( + "-sd", + "--std", + type=int, + metavar="", + help="Standard deviations of the insert size to extend the intervals. 
Default 5", + default=4, + ) + + i_size_estimate.add_argument( + "-s", + "--sample_size", + type=int, + metavar="", + help="Number of concordant reads (R2F1) to use for estimating the insert size distribution. Default 100000", + default=100000, + ) # Interval options - - - interval.add_argument('-P', '--interval_probability', type=float, metavar='', - help="Skip edges of the graph with a probability below the threshold. Default: 0.01", - default=0.01) - interval.add_argument('-K', '--clustering_dist', type=int, metavar='', - help="Cluster reads that are K nucleotides appart in the same node. Default: 500", - default=500) - - interval.add_argument('-D', '--only_discordants', help="Use only discordant reads to build the graph", - action='store_false') - - + interval.add_argument( + "-P", + "--interval_probability", + type=float, + metavar="", + help="Skip edges of the graph with a probability below the threshold. Default: 0.01", + default=0.01, + ) + interval.add_argument( + "-K", + "--clustering_dist", + type=int, + metavar="", + help="Cluster reads that are K nucleotides appart in the same node. Default: 500", + default=500, + ) + + interval.add_argument( + "-D", + "--only_discordants", + help="Use only discordant reads to build the graph", + action="store_false", + ) # run options - running.add_argument('-t', '--threads', type=int, metavar='', - help="Number of threads to use.Default 1", - default=1) - - running.add_argument('-dir', '--directory', metavar='', - help="Working directory, default is the working directory", - default=os.getcwd()) - - running.add_argument('-v', '--verbose', type=int, metavar='', - help='Verbose level, 1=error,2=warning, 3=message', - choices=[1, 2, 3], default=3) - - + running.add_argument( + "-t", + "--threads", + type=int, + metavar="", + help="Number of threads to use.Default 1", + default=1, + ) + + running.add_argument( + "-dir", + "--directory", + metavar="", + help="Working directory, default is the working directory", + default=os.getcwd(), + ) + + running.add_argument( + "-v", + "--verbose", + type=int, + metavar="", + help="Verbose level, 1=error,2=warning, 3=message", + choices=[1, 2, 3], + default=3, + ) else: # output - alignment_options.add_argument('-n', '--nhits', type=int, metavar='', - help="Number of realignment attempts. Default: 10", - default=10) - - alignment_options.add_argument('-p', '--cut_off', type=float, metavar='', - help="Probability cut-off for considering a soft-clipped as realigned: Default: 0.99", - default=0.99) - - alignment_options.add_argument('-m', '--min_sc', type=float, metavar='', - help="Minimum soft-clipped length to attempt the realignment. Default: 8", - default=8) - - alignment_options.add_argument('-g', '--gap_open', type=int, metavar='', - help="Gap open penalty in the position specific scoring matrix. Default: 5", - default=5) - - alignment_options.add_argument('-e', '--gap_ext', type=int, metavar='', - help="Gap extension penalty in the position specific scoring matrix. Default: 1", - default=1) - - alignment_options.add_argument('-q', '--mapq', type=int, metavar='', - help="Minimum mapping quality allowed in the supplementary alignments. Default: 20", - default=20) - - alignment_options.add_argument('-d', '--edit_distance-fraction', type=float, metavar='', - help="Maximum edit distance fraction allowed in the first realignment. Default (0.05)", - default=0.05) - - alignment_options.add_argument('-Q', '--split_quality', type=float, metavar='', - help="Minium split score to output an interval. 
Default (0.0)", - default=0.0) - alignment_options.add_argument('-R', '--remap_splits', help="Remap probabilistacally bwa-mem split reads", - action='store_true') + alignment_options.add_argument( + "-n", + "--nhits", + type=int, + metavar="", + help="Number of realignment attempts. Default: 10", + default=10, + ) + + alignment_options.add_argument( + "-p", + "--cut_off", + type=float, + metavar="", + help="Probability cut-off for considering a soft-clipped as realigned: Default: 0.99", + default=0.99, + ) + + alignment_options.add_argument( + "-m", + "--min_sc", + type=float, + metavar="", + help="Minimum soft-clipped length to attempt the realignment. Default: 8", + default=8, + ) + + alignment_options.add_argument( + "-g", + "--gap_open", + type=int, + metavar="", + help="Gap open penalty in the position specific scoring matrix. Default: 5", + default=5, + ) + + alignment_options.add_argument( + "-e", + "--gap_ext", + type=int, + metavar="", + help="Gap extension penalty in the position specific scoring matrix. Default: 1", + default=1, + ) + + alignment_options.add_argument( + "-q", + "--mapq", + type=int, + metavar="", + help="Minimum mapping quality allowed in the supplementary alignments. Default: 20", + default=20, + ) + + alignment_options.add_argument( + "-d", + "--edit_distance-fraction", + type=float, + metavar="", + help="Maximum edit distance fraction allowed in the first realignment. Default (0.05)", + default=0.05, + ) + + alignment_options.add_argument( + "-Q", + "--split_quality", + type=float, + metavar="", + help="Minium split score to output an interval. Default (0.0)", + default=0.0, + ) + alignment_options.add_argument( + "-R", + "--remap_splits", + help="Remap probabilistacally bwa-mem split reads", + action="store_true", + ) # insert size - i_size_estimate.add_argument('-iq', '--insert_mapq', type=int, metavar='', - help="Mapq cutoff for stimating the insert size distribution. Default 60", - default=60) - - i_size_estimate.add_argument('-sd', '--std', type=int, metavar='', - help="Standard deviations of the insert size to extend the intervals. Default 5", - default=5) - - i_size_estimate.add_argument('-s', '--sample_size', type=int, metavar='', - help="Number of concordant reads (R2F1) to use for estimating the insert size distribution. Default 100000", - default=100000) + i_size_estimate.add_argument( + "-iq", + "--insert_mapq", + type=int, + metavar="", + help="Mapq cutoff for stimating the insert size distribution. Default 60", + default=60, + ) + + i_size_estimate.add_argument( + "-sd", + "--std", + type=int, + metavar="", + help="Standard deviations of the insert size to extend the intervals. Default 5", + default=5, + ) + + i_size_estimate.add_argument( + "-s", + "--sample_size", + type=int, + metavar="", + help="Number of concordant reads (R2F1) to use for estimating the insert size distribution. Default 100000", + default=100000, + ) # Interval options - - interval.add_argument('-P', '--interval_probability', type=float, metavar='', - help="Skip edges of the graph with a probability below the threshold. Default: 0.01", - default=0.01) - interval.add_argument('-K', '--clustering_dist', type=int, metavar='', - help="Cluster reads that are K nucleotides appart in the same node. 
Default: 500", - default=500) - interval.add_argument('-D', '--only_discordants', help="Use only discordant reads to build the graph", - action='store_true') - + interval.add_argument( + "-P", + "--interval_probability", + type=float, + metavar="", + help="Skip edges of the graph with a probability below the threshold. Default: 0.01", + default=0.01, + ) + interval.add_argument( + "-K", + "--clustering_dist", + type=int, + metavar="", + help="Cluster reads that are K nucleotides appart in the same node. Default: 500", + default=500, + ) + interval.add_argument( + "-D", + "--only_discordants", + help="Use only discordant reads to build the graph", + action="store_true", + ) # Running options - running.add_argument('-t', '--threads', type=int, metavar='', - help="Number of threads to use.Default 1", - default=1) - - running.add_argument('-dir', '--directory', metavar='', - help="Working directory, default is the working directory", - default=os.getcwd()) - - running.add_argument('-v', '--verbose', type=int, metavar='', - help='Verbose level, 1=error,2=warning, 3=message', - choices=[1, 2, 3], default=3) + running.add_argument( + "-t", + "--threads", + type=int, + metavar="", + help="Number of threads to use.Default 1", + default=1, + ) + + running.add_argument( + "-dir", + "--directory", + metavar="", + help="Working directory, default is the working directory", + default=os.getcwd(), + ) + + running.add_argument( + "-v", + "--verbose", + type=int, + metavar="", + help="Verbose level, 1=error,2=warning, 3=message", + choices=[1, 2, 3], + default=3, + ) # find out which arguments are missing parser.print_help() time.sleep(0.01) - sys.stderr.write("\nInput does not match. Check that you provide the -i, -qbam and -fasta options" - "\nExiting\n") + sys.stderr.write( + "\nInput does not match. Check that you provide the -i, -qbam and -fasta options" "\nExiting\n" + ) sys.exit(0) if len(sys.argv[2:]) == 0: @@ -918,106 +1510,183 @@ def args_bam2bam(self): sys.stderr.write("\nNo arguments given to bam2bam. 
Exiting\n") sys.exit(0) - return (parser) + return parser def args_repeats(self): - parser = self.repeats parser._action_groups.pop() - required = parser.add_argument_group('required arguments') - optional = parser.add_argument_group('optional arguments') + required = parser.add_argument_group("required arguments") + optional = parser.add_argument_group("optional arguments") # prefixing the argument with -- means it's optional # input and output - required.add_argument('-i', metavar='', help="Input: coordinate name sorted bam file") + required.add_argument("-i", metavar="", help="Input: coordinate name sorted bam file") if "-i" in sys.argv: - - optional.add_argument('-o', '--output', metavar='', - help="Ouput: Reads indicating circular DNA structural variants from repeat regions", - default="circle_repeats_%s" % sys.argv[sys.argv.index("-i") + 1]) - - optional.add_argument('-dir', '--directory', metavar='', - help="Working directory, default is the working directory", - default=os.getcwd()) + optional.add_argument( + "-o", + "--output", + metavar="", + help="Ouput: Reads indicating circular DNA structural variants from repeat regions", + default="circle_repeats_%s" % sys.argv[sys.argv.index("-i") + 1], + ) + + optional.add_argument( + "-dir", + "--directory", + metavar="", + help="Working directory, default is the working directory", + default=os.getcwd(), + ) # coverage metrics - optional.add_argument('-m', '--mismatch', metavar='', - help="Number of mismatches allowed on the reads", - default=2) - - optional.add_argument('-b', '--bases', type=int, metavar='', - help="Number of bases to extend for computing the coverage ratio. Default: 200", - default=200) - - optional.add_argument('-cq', '--cmapq', type=int, metavar='', - help="Minimum mapping quality treshold for coverage computation. Default: 0", - default=0) - - optional.add_argument('-E', '--extension', type=int, metavar='', - help="Number of bases inside the eccDNA coordinates to compute the ratio. Default: 100", - default=100) - - optional.add_argument('-r', '--ratio', type=float, metavar='', - help="Minimum in/out required ratio. Default: 0.6", - default=0.6) - - optional.add_argument('-f', '--fraction', type=float, metavar='', - help="Required fraction to merge the intervals of the double mapped reads. Default 0.8", - default=0.8) - - optional.add_argument('-n', '--read_number', metavar='', - help="Minimum number of reads required to output", - default=20) - - + optional.add_argument( + "-m", + "--mismatch", + metavar="", + help="Number of mismatches allowed on the reads", + default=2, + ) + + optional.add_argument( + "-b", + "--bases", + type=int, + metavar="", + help="Number of bases to extend for computing the coverage ratio. Default: 200", + default=200, + ) + + optional.add_argument( + "-cq", + "--cmapq", + type=int, + metavar="", + help="Minimum mapping quality treshold for coverage computation. Default: 0", + default=0, + ) + + optional.add_argument( + "-E", + "--extension", + type=int, + metavar="", + help="Number of bases inside the eccDNA coordinates to compute the ratio. Default: 100", + default=100, + ) + + optional.add_argument( + "-r", + "--ratio", + type=float, + metavar="", + help="Minimum in/out required ratio. Default: 0.6", + default=0.6, + ) + + optional.add_argument( + "-f", + "--fraction", + type=float, + metavar="", + help="Required fraction to merge the intervals of the double mapped reads. 
Default 0.8", + default=0.8, + ) + + optional.add_argument( + "-n", + "--read_number", + metavar="", + help="Minimum number of reads required to output", + default=20, + ) else: - - optional.add_argument('-o', '--output', metavar='', - help="Ouput: Reads indicating circular DNA structural variants", - ) - - optional.add_argument('-dir', '--directory', metavar='', - help="Working directory, default is the working directory", - default=os.getcwd()) + optional.add_argument( + "-o", + "--output", + metavar="", + help="Ouput: Reads indicating circular DNA structural variants", + ) + + optional.add_argument( + "-dir", + "--directory", + metavar="", + help="Working directory, default is the working directory", + default=os.getcwd(), + ) # coverage metrics - optional.add_argument('-m', '--mismatch', metavar='', - help="Number of mismatches allowed on the reads", - default=2) - - optional.add_argument('-b', '--bases', type=int, metavar='', - help="Number of bases to extend for computing the coverage ratio. Default: 200", - default=200) - - optional.add_argument('-cq', '--cmapq', type=int, metavar='', - help="Minimum mapping quality treshold for coverage computation. Default: 0", - default=0.6) - - optional.add_argument('-E', '--extension', type=int, metavar='', - help="Number of bases inside the eccDNA coordinates to compute the ratio. Default: 100", - default=100) - - optional.add_argument('-r', '--ratio', type=float, metavar='', - help="Minimum in/out required ratio. Default: 0.6", - default=0.6) - - optional.add_argument('-f', '--fraction', type=float, metavar='', - help="Required fraction to merge the intervals of the double mapped reads. Default 0.8", - default=0.8) - - optional.add_argument('-n', '--read_number', metavar='', - help="Minimum number of reads required to output", - default=20) + optional.add_argument( + "-m", + "--mismatch", + metavar="", + help="Number of mismatches allowed on the reads", + default=2, + ) + + optional.add_argument( + "-b", + "--bases", + type=int, + metavar="", + help="Number of bases to extend for computing the coverage ratio. Default: 200", + default=200, + ) + + optional.add_argument( + "-cq", + "--cmapq", + type=int, + metavar="", + help="Minimum mapping quality treshold for coverage computation. Default: 0", + default=0.6, + ) + + optional.add_argument( + "-E", + "--extension", + type=int, + metavar="", + help="Number of bases inside the eccDNA coordinates to compute the ratio. Default: 100", + default=100, + ) + + optional.add_argument( + "-r", + "--ratio", + type=float, + metavar="", + help="Minimum in/out required ratio. Default: 0.6", + default=0.6, + ) + + optional.add_argument( + "-f", + "--fraction", + type=float, + metavar="", + help="Required fraction to merge the intervals of the double mapped reads. Default 0.8", + default=0.8, + ) + + optional.add_argument( + "-n", + "--read_number", + metavar="", + help="Minimum number of reads required to output", + default=20, + ) parser.print_help() time.sleep(0.01) - sys.stderr.write("\nNo input input given to Repeats, be sure that you are providing the flag '-i'" - "\nExiting\n") + sys.stderr.write( + "\nNo input input given to Repeats, be sure that you are providing the flag '-i'" "\nExiting\n" + ) sys.exit(0) # parse the commands @@ -1028,142 +1697,339 @@ def args_repeats(self): sys.stderr.write("\nNo arguments given to Repeats. 
Exiting\n") sys.exit(0) - return (parser) + return parser def args_simulate(self): - parser = self.simulate parser._action_groups.pop() - required = parser.add_argument_group('required arguments') - optional = parser.add_argument_group('optional arguments') + required = parser.add_argument_group("required arguments") + optional = parser.add_argument_group("optional arguments") # prefixing the argument with -- means it's optional # input and output if "-g" and "-N" in sys.argv: - required.add_argument('-g', metavar='', - help="Genome fasta file (Needs to be indexed with samtools faidx)") - required.add_argument('-N', '--read-number', type=int, metavar='', - help="Number of reads to simulate") - optional.add_argument('-o', '--output', default='simulated.bed', - help="Output file name") - optional.add_argument('-dir', '--directory', metavar='', - help="Working directory, default is the working directory", - default=os.getcwd()) - optional.add_argument('-b', '--base-name', metavar='', default='simulated', - help="Fastq output basename") - optional.add_argument('-s', '--skip-region', metavar='', default=None, - help="Regions of the genome to skip the simulation. The input needs to be in bed format") - optional.add_argument('-r', '--read-length', metavar='', type=int, default=150, - help="Read length to simulate") - optional.add_argument('-m', '--mean-insert-size', metavar='', type=int, default=300, - help="Mean of the insert size distribution") - optional.add_argument('-c', '--mean-coverage', metavar='', type=int, default=30, - help="Mean sequencing coverage within the eccDNA coordinates") - optional.add_argument('-p', '--processes', metavar='', type=int, default=1, - help="Mean sequencing coverage within the eccDNA coordinates") - - optional.add_argument('-v', '--variants', action='store_true', - help="If set to true, introduce mutations in the reference genome prior to simulating" - "reads.") - optional.add_argument('-S', '--substitution', metavar='', type=float, default=0.0001, - help="Fraction of base substitutions to introduce on the genome. Default: 0.0001") - - optional.add_argument('-I', '--Indels', metavar='', type=float, default=0.001, - help="Fraction of indels to introduce on the genome. Default: 0.001") - optional.add_argument('-J', '--java_memory', metavar='', type=str, default="-Xmx16g", - help="Java memory allocation, required for mutating the genome. 
Default: -Xmx16g") - - optional.add_argument('-e', '--error', action='store_true', - help="Introduce sequencing errors ( Uses ART on the background)") - - optional.add_argument('-i', '--instrument', metavar='', type=str, default="HS25", - help="Illumina sequecing instrument to simulate reads from (Default HiSeq 2500)") - - optional.add_argument('-ir', '--insRate', metavar='', type=float, default=0.00009, - help="the first-read insertion rate (default: 0.00009)") - optional.add_argument('-ir2', '--insRate2', metavar='', type=float, default=0.00015, - help="the second-read insertion rate (default: 0.00015)") - optional.add_argument('-dr', '--delRate', metavar='', type=float, default=0.00011, - help="the first-read deletion rate (default: 0.00011)") - optional.add_argument('-dr2', '--delRate2', metavar='', type=float, default=0.00023, - help="the second-read deletion rate (default: 0.00023)") + required.add_argument( + "-g", + metavar="", + help="Genome fasta file (Needs to be indexed with samtools faidx)", + ) + required.add_argument( + "-N", + "--read-number", + type=int, + metavar="", + help="Number of reads to simulate", + ) + optional.add_argument("-o", "--output", default="simulated.bed", help="Output file name") + optional.add_argument( + "-dir", + "--directory", + metavar="", + help="Working directory, default is the working directory", + default=os.getcwd(), + ) + optional.add_argument( + "-b", + "--base-name", + metavar="", + default="simulated", + help="Fastq output basename", + ) + optional.add_argument( + "-s", + "--skip-region", + metavar="", + default=None, + help="Regions of the genome to skip the simulation. The input needs to be in bed format", + ) + optional.add_argument( + "-r", + "--read-length", + metavar="", + type=int, + default=150, + help="Read length to simulate", + ) + optional.add_argument( + "-m", + "--mean-insert-size", + metavar="", + type=int, + default=300, + help="Mean of the insert size distribution", + ) + optional.add_argument( + "-c", + "--mean-coverage", + metavar="", + type=int, + default=30, + help="Mean sequencing coverage within the eccDNA coordinates", + ) + optional.add_argument( + "-p", + "--processes", + metavar="", + type=int, + default=1, + help="Mean sequencing coverage within the eccDNA coordinates", + ) + + optional.add_argument( + "-v", + "--variants", + action="store_true", + help="If set to true, introduce mutations in the reference genome prior to simulating" "reads.", + ) + optional.add_argument( + "-S", + "--substitution", + metavar="", + type=float, + default=0.0001, + help="Fraction of base substitutions to introduce on the genome. Default: 0.0001", + ) + + optional.add_argument( + "-I", + "--Indels", + metavar="", + type=float, + default=0.001, + help="Fraction of indels to introduce on the genome. Default: 0.001", + ) + optional.add_argument( + "-J", + "--java_memory", + metavar="", + type=str, + default="-Xmx16g", + help="Java memory allocation, required for mutating the genome. 
Default: -Xmx16g", + ) + + optional.add_argument( + "-e", + "--error", + action="store_true", + help="Introduce sequencing errors ( Uses ART on the background)", + ) + + optional.add_argument( + "-i", + "--instrument", + metavar="", + type=str, + default="HS25", + help="Illumina sequecing instrument to simulate reads from (Default HiSeq 2500)", + ) + + optional.add_argument( + "-ir", + "--insRate", + metavar="", + type=float, + default=0.00009, + help="the first-read insertion rate (default: 0.00009)", + ) + optional.add_argument( + "-ir2", + "--insRate2", + metavar="", + type=float, + default=0.00015, + help="the second-read insertion rate (default: 0.00015)", + ) + optional.add_argument( + "-dr", + "--delRate", + metavar="", + type=float, + default=0.00011, + help="the first-read deletion rate (default: 0.00011)", + ) + optional.add_argument( + "-dr2", + "--delRate2", + metavar="", + type=float, + default=0.00023, + help="the second-read deletion rate (default: 0.00023)", + ) else: - required.add_argument('-g', metavar='', - help="Genome fasta file (Needs to be indexed with samtools faidx)") - required.add_argument('-N', '--read-number', type=int, metavar='', - help="Number of reads to simulate") - optional.add_argument('-o', '--output', default='simulated.bed', - help="Output file name") - optional.add_argument('-dir', '--directory', metavar='', - help="Working directory, default is the working directory", - default=os.getcwd()) - optional.add_argument('-b', '--base-name', metavar='', default='simulated', - help="Fastq output basename") - optional.add_argument('-s', '--skip-region', metavar='', default=None, - help="Regions of the genome to skip the simulation. The input needs to be in bed format") - optional.add_argument('-r', '--read-length', metavar='', type=int, default=150, - help="Read length to simulate") - optional.add_argument('-m', '--mean-insert', metavar='', type=int, default=300, - help="Mean of the insert size distribution") - - optional.add_argument('-c', '--mean-coverage', metavar='', type=int, default=30, - help="Mean sequencing coverage within the eccDNA coordinates") - - optional.add_argument('-p', '--processes', metavar='', type=int, default=1, - help="Number of parallel processes to use") - - optional.add_argument('-v', '--variants', action='store_true', - help="If set to true, introduce mutations in the reference genome prior to simulating" - "reads.") - optional.add_argument('-S', '--substitution', metavar='', type=float, default=0.0001, - help="Fraction of base substitutions to introduce on the genome. Default: 0.0001") - - optional.add_argument('-I', '--Indels', metavar='', type=float, default=0.001, - help="Fraction of indels to introduce on the genome. Default: 0.001") - optional.add_argument('-J', '--java_memory', metavar='', type=str, default="-Xmx16g", - help="Java memory allocation, required for mutating the genome. Default: -Xmx16g") - - optional.add_argument('-e', '--error', action='store_true', - help="Introduce sequencing errors ( Uses ART on the background)") - - optional.add_argument('-i', '--instrument', metavar='', type=str, default="HS25", - help="Illumina sequecing instrument to simulate reads from (Default HiSeq 2500)") - optional.add_argument('-ir', '--insRate', metavar='', type=float, default=0.00009, - help="the first-read insertion rate (default: 0.00009). Requires -e") - optional.add_argument('-ir2', '--insRate2', metavar='', type=float, default=0.00015, - help="the second-read insertion rate (default: 0.00015). 
Requires -e") - optional.add_argument('-dr', '--delRate', metavar='', type=float, default=0.00011, - help="the first-read deletion rate (default: 0.00011). Requires -e") - optional.add_argument('-dr2', '--delRate2', metavar='', type=float, default=0.00023, - help="the second-read deletion rate (default: 0.00023). Requires -e") + required.add_argument( + "-g", + metavar="", + help="Genome fasta file (Needs to be indexed with samtools faidx)", + ) + required.add_argument( + "-N", + "--read-number", + type=int, + metavar="", + help="Number of reads to simulate", + ) + optional.add_argument("-o", "--output", default="simulated.bed", help="Output file name") + optional.add_argument( + "-dir", + "--directory", + metavar="", + help="Working directory, default is the working directory", + default=os.getcwd(), + ) + optional.add_argument( + "-b", + "--base-name", + metavar="", + default="simulated", + help="Fastq output basename", + ) + optional.add_argument( + "-s", + "--skip-region", + metavar="", + default=None, + help="Regions of the genome to skip the simulation. The input needs to be in bed format", + ) + optional.add_argument( + "-r", + "--read-length", + metavar="", + type=int, + default=150, + help="Read length to simulate", + ) + optional.add_argument( + "-m", + "--mean-insert", + metavar="", + type=int, + default=300, + help="Mean of the insert size distribution", + ) + + optional.add_argument( + "-c", + "--mean-coverage", + metavar="", + type=int, + default=30, + help="Mean sequencing coverage within the eccDNA coordinates", + ) + + optional.add_argument( + "-p", + "--processes", + metavar="", + type=int, + default=1, + help="Number of parallel processes to use", + ) + + optional.add_argument( + "-v", + "--variants", + action="store_true", + help="If set to true, introduce mutations in the reference genome prior to simulating" "reads.", + ) + optional.add_argument( + "-S", + "--substitution", + metavar="", + type=float, + default=0.0001, + help="Fraction of base substitutions to introduce on the genome. Default: 0.0001", + ) + + optional.add_argument( + "-I", + "--Indels", + metavar="", + type=float, + default=0.001, + help="Fraction of indels to introduce on the genome. Default: 0.001", + ) + optional.add_argument( + "-J", + "--java_memory", + metavar="", + type=str, + default="-Xmx16g", + help="Java memory allocation, required for mutating the genome. Default: -Xmx16g", + ) + + optional.add_argument( + "-e", + "--error", + action="store_true", + help="Introduce sequencing errors ( Uses ART on the background)", + ) + + optional.add_argument( + "-i", + "--instrument", + metavar="", + type=str, + default="HS25", + help="Illumina sequecing instrument to simulate reads from (Default HiSeq 2500)", + ) + optional.add_argument( + "-ir", + "--insRate", + metavar="", + type=float, + default=0.00009, + help="the first-read insertion rate (default: 0.00009). Requires -e", + ) + optional.add_argument( + "-ir2", + "--insRate2", + metavar="", + type=float, + default=0.00015, + help="the second-read insertion rate (default: 0.00015). Requires -e", + ) + optional.add_argument( + "-dr", + "--delRate", + metavar="", + type=float, + default=0.00011, + help="the first-read deletion rate (default: 0.00011). Requires -e", + ) + optional.add_argument( + "-dr2", + "--delRate2", + metavar="", + type=float, + default=0.00023, + help="the second-read deletion rate (default: 0.00023). 
Requires -e", + ) parser.print_help() time.sleep(0.01) sys.stderr.write( "\nNo input input given to Simulate, be sure that you are providing the flags '-g' and '-N'" - "\nExiting\n") + "\nExiting\n" + ) sys.exit(0) - if len(sys.argv[2:]) == 0: parser.print_help() time.sleep(0.01) sys.stderr.write("\nNo arguments given to Simulate. Exiting\n") + return parser - - return (parser) - def main(): run = circle_map() pid = run.__getpid__() # clean os.system("rm -rf temp_files_%s" % pid) -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/bin/collect_seeds.py b/bin/collect_seeds.py index 19eff3cd..35e74a43 100755 --- a/bin/collect_seeds.py +++ b/bin/collect_seeds.py @@ -8,6 +8,7 @@ from datetime import datetime from subprocess import call + # Read the CNVkit .cns files def collect_seeds(sample, cns): with open(cns) as infile, open(sample + "_CNV_GAIN.bed", "w") as outfile: @@ -17,9 +18,7 @@ def collect_seeds(sample, cns): s, e = int(fields[1]), int(fields[2]) cn_r = float(fields[4]) cn = 2 ** (cn_r + 1) - if ( - cn >= args.cngain - ): # do not filter on size since amplified_intervals.py will merge small ones. + if cn >= args.cngain: # do not filter on size since amplified_intervals.py will merge small ones. outline = "\t".join(fields[0:3] + ["CNVkit", str(cn)]) + "\n" outfile.write(outline) return sample + "_CNV_GAIN.bed" @@ -28,9 +27,7 @@ def collect_seeds(sample, cns): # MAIN # if __name__ == "__main__": # Parses the command line arguments - parser = argparse.ArgumentParser( - description="Collect AmpliconArchitect Copy Number Seeds" - ) + parser = argparse.ArgumentParser(description="Collect AmpliconArchitect Copy Number Seeds") parser.add_argument("-s", "--sample", help="sample name", required=True) parser.add_argument("--cns", help="CNVKit .cns file of CNV changes.", default="") parser.add_argument( diff --git a/bin/downsample.py b/bin/downsample.py index 8de04151..534c9a9d 100755 --- a/bin/downsample.py +++ b/bin/downsample.py @@ -16,8 +16,8 @@ # # IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. -#Author: Viraj Deshpande -#Contact: virajbdeshpande@gmail.com +# Author: Viraj Deshpande +# Contact: virajbdeshpande@gmail.com import pysam @@ -29,37 +29,74 @@ import os import numpy as np import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") import matplotlib.pyplot as plt from matplotlib.patches import Ellipse import logging import random import hashlib -#plt.rc('text', usetex=True) -#plt.rc('font', family='serif') + +# plt.rc('text', usetex=True) +# plt.rc('font', family='serif') import global_names -parser = argparse.\ -ArgumentParser(description="Reconstruct Amplicons connected to listed intervals.") -parser.add_argument('--bam', dest='bam', - help="Coordinate sorted BAM file with index", metavar='FILE', - action='store', type=str, nargs=1) -parser.add_argument('--final', dest='final', - help="Optional Final coverage. Default is 10. 
If initial coverage is less than final, do nothing.", metavar='FLOAT', - action='store', type=float, default=10.0) -parser.add_argument('--downsample_dir', dest='downsample_dir', - help="Optional directory to output. Default is same as original bamfile", metavar='DIR', - action='store', type=str, default='') -parser.add_argument('--cbam', dest='cbam', - help="Optional bamfile to use for coverage calculation. Also generates new coverage bam file in downsample_dir.", metavar='FILE', - action='store', type=str, default=None) -parser.add_argument('--cbed', dest='cbed', - help="Optional bedfile defining 1000 10kbp genomic windows for coverage calcualtion", metavar='FILE', - action='store', type=str, default=None) -parser.add_argument('--ref', dest='ref', - help="Values: [hg19, GRCh37, GRCh38, mm10, None]. \"hg19\", \"mm10\", \"GRCh38\" : chr1, .. chrM etc / \"GRCh37\" : '1', '2', .. 'MT' etc/ \"None\" : Do not use any annotations. AA can tolerate additional chromosomes not stated but accuracy and annotations may be affected.", metavar='STR', - action='store', type=str, required=True) +parser = argparse.ArgumentParser(description="Reconstruct Amplicons connected to listed intervals.") +parser.add_argument( + "--bam", + dest="bam", + help="Coordinate sorted BAM file with index", + metavar="FILE", + action="store", + type=str, + nargs=1, +) +parser.add_argument( + "--final", + dest="final", + help="Optional Final coverage. Default is 10. If initial coverage is less than final, do nothing.", + metavar="FLOAT", + action="store", + type=float, + default=10.0, +) +parser.add_argument( + "--downsample_dir", + dest="downsample_dir", + help="Optional directory to output. Default is same as original bamfile", + metavar="DIR", + action="store", + type=str, + default="", +) +parser.add_argument( + "--cbam", + dest="cbam", + help="Optional bamfile to use for coverage calculation. Also generates new coverage bam file in downsample_dir.", + metavar="FILE", + action="store", + type=str, + default=None, +) +parser.add_argument( + "--cbed", + dest="cbed", + help="Optional bedfile defining 1000 10kbp genomic windows for coverage calcualtion", + metavar="FILE", + action="store", + type=str, + default=None, +) +parser.add_argument( + "--ref", + dest="ref", + help='Values: [hg19, GRCh37, GRCh38, mm10, None]. "hg19", "mm10", "GRCh38" : chr1, .. chrM etc / "GRCh37" : \'1\', \'2\', .. \'MT\' etc/ "None" : Do not use any annotations. 
AA can tolerate additional chromosomes not stated but accuracy and annotations may be affected.', + metavar="STR", + action="store", + type=str, + required=True, +) args = parser.parse_args() @@ -71,16 +108,16 @@ from breakpoint_graph import * -if os.path.splitext(args.bam[0])[-1] == '.cram': - bamFile = pysam.Samfile(args.bam[0], 'rc') +if os.path.splitext(args.bam[0])[-1] == ".cram": + bamFile = pysam.Samfile(args.bam[0], "rc") else: - bamFile = pysam.Samfile(args.bam[0], 'rb') + bamFile = pysam.Samfile(args.bam[0], "rb") cbam = None if args.cbam is not None: - if os.path.splitext(args.cbam[0])[-1] == '.cram': - cbam = pysam.Samfile(args.cbam, 'rc') + if os.path.splitext(args.cbam[0])[-1] == ".cram": + cbam = pysam.Samfile(args.cbam, "rc") else: - cbam = pysam.Samfile(args.cbam, 'rb') + cbam = pysam.Samfile(args.cbam, "rb") cbed = args.cbed @@ -100,9 +137,9 @@ cstats = None coverage_stats_file.close() -coverage_windows=None +coverage_windows = None if cbed is not None: - coverage_windows=hg.interval_list(cbed, 'bed') + coverage_windows = hg.interval_list(cbed, "bed") coverage_windows.sort() if cstats is None and cbam is not None: cbam2b = b2b.bam_to_breakpoint(cbam, coverage_stats=cstats, coverage_windows=coverage_windows) @@ -115,17 +152,21 @@ final = args.final if cstats[0] <= final: - exit() + exit() ratio = float(final) / float(cstats[0]) downsample_dir = os.path.dirname(os.path.abspath(args.bam[0])) -if args.downsample_dir != '': +if args.downsample_dir != "": downsample_dir = args.downsample_dir -i=0 +i = 0 rulist = [] t0 = time() -b2 = pysam.Samfile(downsample_dir + '/' + os.path.basename(args.bam[0])[:-4] + '.DS.bam', 'wb', template = bamFile) +b2 = pysam.Samfile( + downsample_dir + "/" + os.path.basename(args.bam[0])[:-4] + ".DS.bam", + "wb", + template=bamFile, +) for a in bamFile.fetch(): random.seed(a.query_name + str(t0)) random.uniform(0, 1) @@ -133,7 +174,7 @@ if ru < ratio: b2.write(a) b2.close() -pysam.index(downsample_dir + '/' + os.path.basename(args.bam[0])[:-4] + '.DS.bam') +pysam.index(downsample_dir + "/" + os.path.basename(args.bam[0])[:-4] + ".DS.bam") print("Downsampling:", args.bam[0], float(cstats[0]), final, ratio) # if args.cbam is not None and not os.path.exists(downsample_dir + '/' + os.path.basename(args.cbam)[:-4] + '.DS.bam'): @@ -144,5 +185,3 @@ # c2.write(a) # c2.close() # pysam.index(downsample_dir + '/' + os.path.basename(args.cbam)[:-4] + '.DS.bam') - - diff --git a/bin/extract_circle_SV_reads.py b/bin/extract_circle_SV_reads.py index a8e72d1a..a82215e0 100644 --- a/bin/extract_circle_SV_reads.py +++ b/bin/extract_circle_SV_reads.py @@ -1,24 +1,24 @@ -#MIT License +# MIT License # -#Copyright (c) 2019 Iñigo Prada Luengo +# Copyright (c) 2019 Iñigo Prada Luengo # -#Permission is hereby granted, free of charge, to any person obtaining a copy -#of this software and associated documentation files (the "Software"), to deal -#in the Software without restriction, including without limitation the rights -#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -#copies of the Software, and to permit persons to whom the Software is -#furnished to do so, subject to the following conditions: +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to 
permit persons to whom the Software is +# furnished to do so, subject to the following conditions: # -#The above copyright notice and this permission notice shall be included in all -#copies or substantial portions of the Software. +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. # -#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -#SOFTWARE. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. import pysam as ps import os @@ -30,91 +30,98 @@ class readExtractor: """Class for managing the read extracting part of circle map""" - def __init__(self,sorted_bam,output_bam,working_dir,mapq_cutoff,extract_discordant,extract_soft_clipped,extract_hard_clipped, - verbose,parser - ): - #input-output + + def __init__( + self, + sorted_bam, + output_bam, + working_dir, + mapq_cutoff, + extract_discordant, + extract_soft_clipped, + extract_hard_clipped, + verbose, + parser, + ): + # input-output self.sorted_bam = sorted_bam self.output_bam = output_bam - #working place + # working place self.working_dir = working_dir - #read options + # read options self.no_discordants = extract_discordant self.no_soft_clipped = extract_soft_clipped self.no_hard_clipped = extract_hard_clipped - #mapq cutoff + # mapq cutoff self.mapq_cutoff = mapq_cutoff - #verbose level + # verbose level self.verbose = int(verbose) - #parser options + # parser options self.parser = parser def extract_sv_circleReads(self): - """Function that extracts Structural Variant reads that indicate circular DNA, The programme with extract soft-clipped reads and R2F1 (<- ->) oriented reads""" os.chdir(self.working_dir) - #input + # input raw_bam = ps.AlignmentFile(self.working_dir + "/" + self.sorted_bam, "rb") - #HD the tag for the header line. SO indicates sorting order of the alignements - if 'HD' in raw_bam.header: - - if raw_bam.header['HD']['SO'] != 'queryname': + # HD the tag for the header line. SO indicates sorting order of the alignements + if "HD" in raw_bam.header: + if raw_bam.header["HD"]["SO"] != "queryname": sys.stderr.write( - "The input bam header says that bam is not sorted by queryname. It is sorted by %s\n\n" % (raw_bam.header['HD']['SO'])) + "The input bam header says that bam is not sorted by queryname. 
It is sorted by %s\n\n" + % (raw_bam.header["HD"]["SO"]) + ) sys.stderr.write( - "Sort your bam file queryname with the following command:\n\n\tsamtools sort -n -o output.bam input.bam") + "Sort your bam file queryname with the following command:\n\n\tsamtools sort -n -o output.bam input.bam" + ) time.sleep(0.01) self.parser.print_help() sys.exit(1) else: + if self.verbose >= 2: + warnings.warn( + "WARNING:Circle-Map does not know if the input bam is queryname sorted\n Please check that, the output would be unexpected otherwise" + ) + print( + "As sanity check, sort your bam file queryname with the following command:\n\n\tsamtools sort -n -o output.bam input.bam" + ) - if self.verbose >=2: - warnings.warn("WARNING:Circle-Map does not know if the input bam is queryname sorted\n Please check that, the output would be unexpected otherwise") - print("As sanity check, sort your bam file queryname with the following command:\n\n\tsamtools sort -n -o output.bam input.bam") - - - + circle_sv_reads = ps.AlignmentFile(self.working_dir + "/" + self.output_bam, "wb", template=raw_bam) - circle_sv_reads = ps.AlignmentFile(self.working_dir + "/" + self.output_bam , "wb", template=raw_bam) + # modify the tag to unsorted + if "HD" in raw_bam.header == True: + circle_sv_reads.header["HD"]["SO"] = "unsorted" - - #modify the tag to unsorted - if 'HD' in raw_bam.header == True: - circle_sv_reads.header['HD']['SO'] = 'unsorted' - - if self.verbose >=3: + if self.verbose >= 3: print("Extracting circular structural variants") - #timing + # timing begin = time.time() + # cache read1. operate in read2. this speed-ups the search + read1 = "" - #cache read1. operate in read2. this speed-ups the search - read1 = '' - - #counter for processed reads + # counter for processed reads processed_reads = 0 for read in raw_bam: + if self.verbose >= 3: + processed_reads += 1 - if self.verbose >=3: - processed_reads +=1 - - - if (processed_reads/1000000).is_integer() == True: + if (processed_reads / 1000000).is_integer() == True: partial_timer = time.time() - partial_time = (partial_timer - begin)/60 - print("Processed %s reads in %s mins" % (processed_reads,round(partial_time,3))) + partial_time = (partial_timer - begin) / 60 + print("Processed %s reads in %s mins" % (processed_reads, round(partial_time, 3))) if read.is_read1: read1 = read @@ -128,103 +135,138 @@ def extract_sv_circleReads(self): # both reads in memory read2 = read - #both reads need to be mapped + # both reads need to be mapped if read1.is_unmapped == False and read2.is_unmapped == False: - - if read2.is_reverse and read1.is_reverse == False: - - # read2 leftmost mapping position smaller than read1 leftmost mapping position if read2.reference_start < read1.reference_start: - - - #aligned to the same chromosome + # aligned to the same chromosome if read1.reference_id == read2.reference_id: - - if read1.mapq >= self.mapq_cutoff and read2.mapq >= self.mapq_cutoff: - - - #is discordant extraction turn off? + # is discordant extraction turn off? 
if self.no_discordants == False: - - #add mate mapping quality info - read1.tags += [('MQ',read2.mapq)] - read2.tags += [('MQ', read1.mapq)] + # add mate mapping quality info + read1.tags += [("MQ", read2.mapq)] + read2.tags += [("MQ", read1.mapq)] circle_sv_reads.write(read1) circle_sv_reads.write(read2) else: - pass else: - - #extract soft-clipped if the mapq is high enough - write_clipped_read(circle_sv_reads, read1, read2, self.no_soft_clipped, - self.no_hard_clipped, self.mapq_cutoff) - - write_clipped_read(circle_sv_reads, read2, read1, self.no_soft_clipped, - self.no_hard_clipped, self.mapq_cutoff) - - + # extract soft-clipped if the mapq is high enough + write_clipped_read( + circle_sv_reads, + read1, + read2, + self.no_soft_clipped, + self.no_hard_clipped, + self.mapq_cutoff, + ) + + write_clipped_read( + circle_sv_reads, + read2, + read1, + self.no_soft_clipped, + self.no_hard_clipped, + self.mapq_cutoff, + ) else: - - write_clipped_read(circle_sv_reads, read1, read2, self.no_soft_clipped, - self.no_hard_clipped, self.mapq_cutoff) - - write_clipped_read(circle_sv_reads, read2, read1, self.no_soft_clipped, - self.no_hard_clipped, self.mapq_cutoff) - + write_clipped_read( + circle_sv_reads, + read1, + read2, + self.no_soft_clipped, + self.no_hard_clipped, + self.mapq_cutoff, + ) + + write_clipped_read( + circle_sv_reads, + read2, + read1, + self.no_soft_clipped, + self.no_hard_clipped, + self.mapq_cutoff, + ) else: - - #if the leftmost mapping condition is not met check if they are soft-clipped - write_clipped_read(circle_sv_reads, read1, read2, self.no_soft_clipped, - self.no_hard_clipped, self.mapq_cutoff) - - write_clipped_read(circle_sv_reads, read2, read1, self.no_soft_clipped, - self.no_hard_clipped, self.mapq_cutoff) - + # if the leftmost mapping condition is not met check if they are soft-clipped + write_clipped_read( + circle_sv_reads, + read1, + read2, + self.no_soft_clipped, + self.no_hard_clipped, + self.mapq_cutoff, + ) + + write_clipped_read( + circle_sv_reads, + read2, + read1, + self.no_soft_clipped, + self.no_hard_clipped, + self.mapq_cutoff, + ) else: - - #check soft-clipped if R2F1 orientation is not True - - write_clipped_read(circle_sv_reads, read1, read2, self.no_soft_clipped, - self.no_hard_clipped, self.mapq_cutoff) - - write_clipped_read(circle_sv_reads, read2, read1, self.no_soft_clipped, - self.no_hard_clipped, self.mapq_cutoff) - + # check soft-clipped if R2F1 orientation is not True + + write_clipped_read( + circle_sv_reads, + read1, + read2, + self.no_soft_clipped, + self.no_hard_clipped, + self.mapq_cutoff, + ) + + write_clipped_read( + circle_sv_reads, + read2, + read1, + self.no_soft_clipped, + self.no_hard_clipped, + self.mapq_cutoff, + ) else: - - #check read 1 and read two for independent unmaps + # check read 1 and read two for independent unmaps if read1.is_unmapped == False: - write_clipped_read(circle_sv_reads, read1,read2, self.no_soft_clipped, - self.no_hard_clipped, self.mapq_cutoff, own_mapq=True) + write_clipped_read( + circle_sv_reads, + read1, + read2, + self.no_soft_clipped, + self.no_hard_clipped, + self.mapq_cutoff, + own_mapq=True, + ) if read2.is_unmapped == False: - write_clipped_read(circle_sv_reads, read2, read1, self.no_soft_clipped, - self.no_hard_clipped, self.mapq_cutoff, own_mapq=True) + write_clipped_read( + circle_sv_reads, + read2, + read1, + self.no_soft_clipped, + self.no_hard_clipped, + self.mapq_cutoff, + own_mapq=True, + ) else: # reads are not queryname sorted and cannot be processed in paired mode 
warnings.warn("Unpaired reads found. Is your bam file queryname sorted?") - end = time.time() circle_sv_reads.close() - - - if self.verbose >=3: - - + if self.verbose >= 3: print("finished extracting reads. Elapsed time:", (end - begin) / 60, "mins") print("Thanks for using Circle-Map") diff --git a/bin/get_region_coverage.py b/bin/get_region_coverage.py index e35de9d6..a63a9b17 100755 --- a/bin/get_region_coverage.py +++ b/bin/get_region_coverage.py @@ -11,9 +11,7 @@ parser.add_argument("-i", "--input", metavar="", help="Input bam file") # parser.add_argument('-o', '--output', metavar='', # help="Output bam file") -parser.add_argument( - "-b", "--bed", metavar="", help="CIRCexplorer parse output in bed format" -) +parser.add_argument("-b", "--bed", metavar="", help="CIRCexplorer parse output in bed format") parser.add_argument( "-q", "--mapq", @@ -45,8 +43,6 @@ input_file, input_file_extension = os.path.splitext(os.path.basename(args.input)) output = directory + "/" + input_file + "_coverage.bed" -coverage_object = coverage( - bam_file, bt.BedTool(args.bed), args.bases, args.mapq, args.extension, directory -) +coverage_object = coverage(bam_file, bt.BedTool(args.bed), args.bases, args.mapq, args.extension, directory) bed_coverage = coverage_object.compute_coverage(coverage_object.get_wg_coverage()) bt.BedTool(bed_coverage).saveas(output) diff --git a/bin/global_names.py b/bin/global_names.py index 65c5adb3..46cc69d8 100755 --- a/bin/global_names.py +++ b/bin/global_names.py @@ -1,2 +1,2 @@ -REF="hg19" -TSTART=0 \ No newline at end of file +REF = "hg19" +TSTART = 0 diff --git a/bin/hg19util.py b/bin/hg19util.py index 11796637..f2e5b50c 100755 --- a/bin/hg19util.py +++ b/bin/hg19util.py @@ -40,17 +40,11 @@ DATA_REPO = os.environ["AA_DATA_REPO"] except: logging.warning( - "#TIME " - + "%.3f\t" % clock() - + " Unable to set AA_DATA_REPO variable. Setting to working directory" + "#TIME " + "%.3f\t" % clock() + " Unable to set AA_DATA_REPO variable. Setting to working directory" ) DATA_REPO = "." if DATA_REPO == "." or DATA_REPO == "": - logging.warning( - "#TIME " - + "%.3f\t" % clock() - + " AA_DATA_REPO not set or empy. Setting to working directory" - ) + logging.warning("#TIME " + "%.3f\t" % clock() + " AA_DATA_REPO not set or empy. Setting to working directory") DATA_REPO = "." 
REF = global_names.REF @@ -91,16 +85,12 @@ def fetch(self, a=None, b=0, c=0): chrLen_filename = DATA_REPO + "/" + REF + "/" + REF_files["chrLen_file"] duke35_filename = DATA_REPO + "/" + REF + "/" + REF_files["duke35_filename"] -wgexclude_filename = ( - DATA_REPO + "/" + REF + "/" + REF_files["mapability_exclude_filename"] -) +wgexclude_filename = DATA_REPO + "/" + REF + "/" + REF_files["mapability_exclude_filename"] gene_filename = DATA_REPO + "/" + REF + "/" + REF_files["gene_filename"] exon_filename = DATA_REPO + "/" + REF + "/" + REF_files["exon_file"] oncogene_filename = DATA_REPO + "/" + REF + "/" + REF_files["oncogene_filename"] centromere_filename = DATA_REPO + "/" + REF + "/" + REF_files["centromere_filename"] -conserved_regions_filename = ( - DATA_REPO + "/" + REF + "/" + REF_files["conserved_regions_filename"] -) +conserved_regions_filename = DATA_REPO + "/" + REF + "/" + REF_files["conserved_regions_filename"] segdup_filename = DATA_REPO + "/" + REF + "/" + REF_files["segdup_filename"] complementary_nucleotide = defaultdict( lambda: "N", @@ -145,11 +135,7 @@ def chrNum(chrname, mode="append"): chrLen[chrNum(ll[0], mode="init")] = int(ll[1]) except: logging.warning( - "#TIME " - + "%.3f\t" % clock() - + ' Unable to open chromosome lengths file: "' - + chrLen_filename - + '"' + "#TIME " + "%.3f\t" % clock() + ' Unable to open chromosome lengths file: "' + chrLen_filename + '"' ) chrOffset = {} @@ -241,10 +227,7 @@ def load_line(self, line, file_format, exclude_info_string=False): else: self.strand = -1 if not exclude_info_string: - self.info = { - r[0 : r.find("=")]: r[r.find("=") + 1 :] - for r in ll[8].strip().strip(";").split(";") - } + self.info = {r[0 : r.find("=")]: r[r.find("=") + 1 :] for r in ll[8].strip().strip(";").split(";")} self.info["Variant"] = ll[5] elif file_format == "bed": ll = line.strip().split() @@ -254,19 +237,13 @@ def load_line(self, line, file_format, exclude_info_string=False): ci = int(self.chrom) if 0 < ci < 23: self.chrom = "chr" + self.chrom - logging.info( - "Corrected chromosome name (appended 'chr') " - + self.chrom - + " \n" - ) + logging.info("Corrected chromosome name (appended 'chr') " + self.chrom + " \n") except ValueError: if self.chrom in {"M", "X", "Y"}: self.chrom = "chr" + self.chrom else: - logging.warning( - "Chromosome name " + self.chrom + " may be incompatible" - ) + logging.warning("Chromosome name " + self.chrom + " may be incompatible") self.start, self.end = sorted([int(float(ll[1])), int(float(ll[2]))]) if int(float(ll[2])) >= int(float(ll[1])): @@ -297,9 +274,7 @@ def load_pysamread(self, line, bamfile): if line.reference_end is not None: self.end = line.reference_end else: - logging.warning( - "Reference_end for " + str(self) + " was NoneType. Setting to 0." - ) + logging.warning("Reference_end for " + str(self) + " was NoneType. 
Setting to 0.") if line.is_reverse: self.strand = -1 @@ -326,8 +301,7 @@ def __str__(self): return "\t".join( map( str, - [self.chrom, self.start, self.end] - + [str(s) + "=" + str(self.info[s]) for s in self.info], + [self.chrom, self.start, self.end] + [str(s) + "=" + str(self.info[s]) for s in self.info], ) ) else: @@ -340,9 +314,7 @@ def gc_content(self): # exit() if len(seq) == 0: return 0.5 - return float( - seq.count("G") + seq.count("C") + seq.count("g") + seq.count("c") - ) / len(seq) + return float(seq.count("G") + seq.count("C") + seq.count("g") + seq.count("c")) / len(seq) def sequence(self, new_fa_file=None): if new_fa_file is not None: @@ -358,15 +330,11 @@ def intersects(self, n, extend=0, margin=0.0): if margin > 0.0: if self.intersects( interval(n.chrom, n.start, n.end - (1 - margin) * (n.end - n.start)) - ) and self.intersects( - interval(n.chrom, n.start + (1 - margin) * (n.end - n.start)), n.end - ): + ) and self.intersects(interval(n.chrom, n.start + (1 - margin) * (n.end - n.start)), n.end): return True else: s = self - if n.intersects( - interval(s.chrom, s.start, s.end - (1 - margin) * (s.end - s.start)) - ) and n.intersects( + if n.intersects(interval(s.chrom, s.start, s.end - (1 - margin) * (s.end - s.start))) and n.intersects( interval(s.chrom, s.start + (1 - margin) * (s.end - s.start)), s.end ): return True @@ -402,13 +370,9 @@ def atomize(self, y): if il[0].intersects(il[1]): ilint = il[0].intersection(il[1]) if il[0].start < il[1].start: - ilr.append( - (interval(il[0].chrom, il[0].start, ilint.start - 1), [il[0]]) - ) + ilr.append((interval(il[0].chrom, il[0].start, ilint.start - 1), [il[0]])) elif il[1].start < il[0].start: - ilr.append( - (interval(il[1].chrom, il[1].start, ilint.start - 1), [il[1]]) - ) + ilr.append((interval(il[1].chrom, il[1].start, ilint.start - 1), [il[1]])) ilr.append((ilint, il)) if il[0].end > il[1].end: ilr.append((interval(il[0].chrom, ilint.end + 1, il[0].end), [il[0]])) @@ -429,8 +393,7 @@ def contains(self, x, y=-1, z=-1): z = y if ( self.intersects(interval(x, y, z)) - and self.intersection(interval(x, y, z)).size() - == interval(x, y, z).size() + and self.intersection(interval(x, y, z)).size() == interval(x, y, z).size() ): return True return False @@ -543,9 +506,7 @@ def extend(self, extend_len=0): class interval_list(list, object): - def __init__( - self, ilist=None, file_format=None, sort=True, exclude_info_string=False - ): + def __init__(self, ilist=None, file_format=None, sort=True, exclude_info_string=False): if ilist == None: ilist = [] self.file_format = file_format @@ -576,11 +537,7 @@ def bed_to_list(self, file_name, exclude_info_string=False): f.close() except: logging.warning( - "#TIME " - + "%.3f\t" % clock() - + ' interval_list: Unable to open interval file "' - + file_name - + '".' + "#TIME " + "%.3f\t" % clock() + ' interval_list: Unable to open interval file "' + file_name + '".' 
) def merge_clusters(self, extend=0, margin=0.0): @@ -649,11 +606,7 @@ def intersection(self, l2, extend=0): l2j = len(l2) - 1 il = [] while si >= 0: - while ( - l2i >= 0 - and l2[l2i] > self[si] - and not self[si].intersects(l2[l2i], extend=extend) - ): + while l2i >= 0 and l2[l2i] > self[si] and not self[si].intersects(l2[l2i], extend=extend): l2i -= 1 l2j = l2i while l2j >= 0 and self[si].intersects(l2[l2j], extend=extend): @@ -677,19 +630,12 @@ def atomize(self, h2): # else: # print "%%", i, j, [], [str(aa[0]) for aa in atomlist] if c is not None: - if ( - i < len(self) - and self[i] not in c[1] - and (self[i].intersects(c[0], -1) or c[0] > self[i]) - ): + if i < len(self) and self[i] not in c[1] and (self[i].intersects(c[0], -1) or c[0] > self[i]): atm = self[i].atomize(c[0]) atm = [ ( aa[0], - [ - (lambda x: c[1][0] if x == c[0] else x)(aai) - for aai in aa[1] - ], + [(lambda x: c[1][0] if x == c[0] else x)(aai) for aai in aa[1]], ) for aa in atm ] @@ -697,20 +643,13 @@ def atomize(self, h2): c = atm[-1] i += 1 atomlist += atm[:-1] - elif ( - j < len(h2) - and h2[j] not in c[1] - and (h2[j].intersects(c[0], -1) or c[0] > h2[j]) - ): + elif j < len(h2) and h2[j] not in c[1] and (h2[j].intersects(c[0], -1) or c[0] > h2[j]): # print j, str(h2[j]), str(c[0]), c[0] > h2[j] atm = c[0].atomize(h2[j]) atm = [ ( aa[0], - [ - (lambda x: c[1][0] if x == c[0] else x)(aai) - for aai in aa[1] - ], + [(lambda x: c[1][0] if x == c[0] else x)(aai) for aai in aa[1]], ) for aa in atm ] @@ -762,17 +701,13 @@ def get_repeat_content(self): if not duke_int.intersects(self[i]) and self[i] > duke_int: continue j = i - repc = ( - 5.0 if float(duke_int.info[0]) == 0 else 1 / float(duke_int.info[0]) - ) + repc = 5.0 if float(duke_int.info[0]) == 0 else 1 / float(duke_int.info[0]) while j < len(self) and self[j].intersects(duke_int): sum_duke[j] += self[j].intersection(duke_int).size() * repc len_duke[j] += self[j].intersection(duke_int).size() j += 1 duke35_file.close() - return { - self[i]: sum_duke[i] / len_duke[i] for i in range(len(interval_list)) - } + return {self[i]: sum_duke[i] / len_duke[i] for i in range(len(interval_list))} except: logging.warning( "#TIME " @@ -793,9 +728,7 @@ def offsets(self): vlist = [i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"] hlist = [i for i in self if chrNum(i.chrom) < 100 or i.chrom[:3] == "chr"] - v_count = len( - [i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"] - ) + v_count = len([i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"]) h_count = len(self) - v_count h_sum = sum([i.size() for i in hlist]) v_sum = sum([i.size() for i in vlist]) @@ -840,9 +773,7 @@ def xpos(self, chrom, pos): for i in self: if i.intersects(interval(chrom, max(0, pos - 1), pos)): o = offset[i] - return (o[1] * (pos - i.start) + o[0] * (i.end - pos)) / ( - i.end - i.start - ) + return (o[1] * (pos - i.start) + o[0] * (i.end - pos)) / (i.end - i.start) return None def offset_breaks(self): @@ -852,9 +783,7 @@ def offset_breaks(self): vlist = [i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"] hlist = [i for i in self if chrNum(i.chrom) < 100 or i.chrom[:3] == "chr"] - v_count = len( - [i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"] - ) + v_count = len([i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"]) h_count = len(self) - v_count h_sum = sum([i.size() for i in hlist]) v_sum = sum([i.size() for i in vlist]) @@ -909,23 +838,14 @@ def load_exons(): if ( len(j.strip()) > 0 and 
j.strip()[0] != "#" - and { - r.split("=")[0]: r.split("=")[1] - for r in j.strip().split()[8].strip(";").split(";") - }["color"] + and {r.split("=")[0]: r.split("=")[1] for r in j.strip().split()[8].strip(";").split(";")}["color"] == "000080" ) ] exon_file.close() exon_list.extend((exonFields)) except: - logging.warning( - "#TIME " - + "%.3f\t" % clock() - + 'unable to load exon file: "' - + exon_filename - + '"' - ) + logging.warning("#TIME " + "%.3f\t" % clock() + 'unable to load exon file: "' + exon_filename + '"') conserved_regions = interval_list(conserved_regions_filename, "bed") @@ -936,9 +856,7 @@ def load_exons(): centromere_list = interval_list(centromere_filename, "bed") centromere_list.sort() -centromere_list = interval_list( - [i[0] for i in centromere_list.merge_clusters(extend=1)] -) +centromere_list = interval_list([i[0] for i in centromere_list.merge_clusters(extend=1)]) segdup_list = interval_list(segdup_filename, "bed") diff --git a/bin/make_AmpliconClassifier_input.sh b/bin/make_AmpliconClassifier_input.sh deleted file mode 100755 index 6171c656..00000000 --- a/bin/make_AmpliconClassifier_input.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env bash - -#arg 1: location to search of AA files -#arg 2: output name - -find $1 -name "*_cycles.txt" | grep -v "annotated_cycles" | sort > scf.txt -find $1 -name "*_graph.txt" | sort > sgf.txt -if [ "$(wc -l < scf.txt)" -ne "$(wc -l < sgf.txt)" ]; then - echo "ERROR: Unequal numbers of cycles and graph files found!" - exit -fi -cat scf.txt | rev | cut -d '/' -f 1 | cut -c12- | rev > san.txt -paste san.txt scf.txt sgf.txt > $2.input -rm san.txt scf.txt sgf.txt diff --git a/bin/mycolors.py b/bin/mycolors.py index fa57ba91..458cdb80 100755 --- a/bin/mycolors.py +++ b/bin/mycolors.py @@ -14,127 +14,120 @@ # # IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
-#Author: Viraj Deshpande -#Contact: virajbdeshpande@gmail.com - +# Author: Viraj Deshpande +# Contact: virajbdeshpande@gmail.com chrcolor = { -'b' : 'b', -'g' : 'g', -'r' : 'r', -'c' : 'c', -'m' : 'm', -'y' : 'y', -'k' : 'k', -'w' : 'w', -'chr1' : (153/256.0, 102/256.0, 0/256.0), -'chr2' : (102/256.0, 102/256.0, 0/256.0), -'chr3' : (153/256.0, 153/256.0, 30/256.0), -'chr4' : (204/256.0, 0/256.0, 0/256.0), -'chr5' : (255/256.0, 0/256.0, 0/256.0), -'chr6' : (255/256.0, 0/256.0, 204/256.0), -'chr7' : (255/256.0, 204/256.0, 204/256.0), -'chr8' : (255/256.0, 153/256.0, 0/256.0), -'chr9' : (255/256.0, 204/256.0, 0/256.0), -'chr10': (255/256.0, 255/256.0, 0/256.0), -'chr11': (204/256.0, 255/256.0, 0/256.0), -'chr12': (0/256.0, 255/256.0, 0/256.0), -'chr13': (53/256.0, 128/256.0, 0/256.0), -'chr14': (0/256.0, 0/256.0, 204/256.0), -'chr15': (102/256.0, 153/256.0, 255/256.0), -'chr16': (153/256.0, 204/256.0, 255/256.0), -'chr17': (0/256.0, 255/256.0, 255/256.0), -'chr18': (204/256.0, 255/256.0, 255/256.0), -'chr19': (153/256.0, 0/256.0, 204/256.0), -'chr20': (204/256.0, 51/256.0, 255/256.0), -'chr21': (204/256.0, 153/256.0, 255/256.0), -'chr22': (102/256.0, 102/256.0, 102/256.0), -'chr23': (153/256.0, 153/256.0, 153/256.0), -'chrX' : (153/256.0, 153/256.0, 153/256.0), -'chr24': (204/256.0, 204/256.0, 204/256.0), -'chrY' : (204/256.0, 204/256.0, 204/256.0), -'chrM' : (204/256.0, 204/256.0, 153/256.0), -'chr0' : (204/256.0, 204/256.0, 153/256.0), -'chrUn': (121/256.0, 204/256.0, 61/256.0), -'chrNA': (255/256.0, 255/256.0, 255/256.0), - -'lum90chr1' : (255/256.0,216/256.0,156/256.0), -'lum90chr2' : (230/256.0,230/256.0,165/256.0), -'lum90chr3' : (232/256.0,232/256.0,135/256.0), -'lum90chr4' : (255/256.0,166/256.0,166/256.0), -'lum90chr5' : (255/256.0,147/256.0,147/256.0), -'lum90chr6' : (255/256.0,152/256.0,255/256.0), -'lum90chr7' : (255/256.0,214/256.0,214/256.0), -'lum90chr8' : (255/256.0,202/256.0,102/256.0), -'lum90chr9' : (255/256.0,220/256.0,58/256.0), -'lum90chr10' : (234/256.0,234/256.0,0/256.0), -'lum90chr11' : (194/256.0,245/256.0,0/256.0), -'lum90chr12' : (34/256.0,255/256.0,34/256.0), -'lum90chr13' : (174/256.0,244/256.0,155/256.0), -'lum90chr14' : (215/256.0,215/256.0,255/256.0), -'lum90chr15' : (182/256.0,224/256.0,255/256.0), -'lum90chr16' : (182/256.0,231/256.0,255/256.0), -'lum90chr17' : (0/256.0,252/256.0,252/256.0), -'lum90chr18' : (185/256.0,236/256.0,236/256.0), -'lum90chr19' : (255/256.0,191/256.0,255/256.0), -'lum90chr20' : (255/256.0,177/256.0,255/256.0), -'lum90chr21' : (255/256.0,206/256.0,255/256.0), -'lum90chr22' : (198/256.0,198/256.0,198/256.0), -'lum90chr23' : (153/256.0,153/256.0,153/256.0), -'lum90chrX' : (153/256.0,153/256.0,153/256.0), -'lum90chr24' : (204/256.0,204/256.0,204/256.0), -'lum90chrY' : (204/256.0,204/256.0,204/256.0), -'lum90chrM' : (174/256.0,174/256.0,122/256.0), -'lum90chr0' : (174/256.0,174/256.0,122/256.0), -'lum90chrUn' : (108/256.0,191/256.0,38/256.0), -'lum90chrNA' : (171/256.0,171/256.0,171/256.0), - -'lum80chr1' : (244/256.0,188/256.0,127/256.0), -'lum80chr2' : (202/256.0,202/256.0,136/256.0), -'lum80chr3' : (203/256.0,203/256.0,103/256.0), -'lum80chr4' : (255/256.0,137/256.0,137/256.0), -'lum80chr5' : (255/256.0,116/256.0,116/256.0), -'lum80chr6' : (255/256.0,119/256.0,255/256.0), -'lum80chr7' : (237/256.0,186/256.0,186/256.0), -'lum80chr8' : (255/256.0,174/256.0,62/256.0), -'lum80chr9' : (243/256.0,192/256.0,0/256.0), -'lum80chr10' : (206/256.0,206/256.0,0/256.0), -'lum80chr11' : (166/256.0,216/256.0,0/256.0), -'lum80chr12' : 
(0/256.0,232/256.0,0/256.0), -'lum80chr13' : (146/256.0,216/256.0,126/256.0), -'lum80chr14' : (186/256.0,186/256.0,255/256.0), -'lum80chr15' : (152/256.0,196/256.0,255/256.0), -'lum80chr16' : (152/256.0,203/256.0,254/256.0), -'lum80chr17' : (0/256.0,224/256.0,224/256.0), -'lum80chr18' : (156/256.0,208/256.0,208/256.0), -'lum80chr19' : (250/256.0,161/256.0,255/256.0), -'lum80chr20' : (255/256.0,146/256.0,255/256.0), -'lum80chr21' : (227/256.0,177/256.0,255/256.0), -'lum80chr22' : (198/256.0,198/256.0,198/256.0), -'lum80chr23' : (153/256.0,153/256.0,153/256.0), -'lum80chrX' : (153/256.0,153/256.0,153/256.0), -'lum80chr24' : (204/256.0,204/256.0,204/256.0), -'lum80chrY' : (204/256.0,204/256.0,204/256.0), -'lum80chrM' : (174/256.0,174/256.0,122/256.0), -'lum80chr0' : (174/256.0,174/256.0,122/256.0), -'lum80chrUn' : (108/256.0,191/256.0,38/256.0), -'lum80chrNA' : (171/256.0,171/256.0,171/256.0), - -'vlpurple' : (218/256.0,218/256.0,235/256.0), -'vlorange' : (253/256.0,208/256.0,162/256.0), -'vlpgreen' : (218/256.0,218/256.0,235/256.0) - + "b": "b", + "g": "g", + "r": "r", + "c": "c", + "m": "m", + "y": "y", + "k": "k", + "w": "w", + "chr1": (153 / 256.0, 102 / 256.0, 0 / 256.0), + "chr2": (102 / 256.0, 102 / 256.0, 0 / 256.0), + "chr3": (153 / 256.0, 153 / 256.0, 30 / 256.0), + "chr4": (204 / 256.0, 0 / 256.0, 0 / 256.0), + "chr5": (255 / 256.0, 0 / 256.0, 0 / 256.0), + "chr6": (255 / 256.0, 0 / 256.0, 204 / 256.0), + "chr7": (255 / 256.0, 204 / 256.0, 204 / 256.0), + "chr8": (255 / 256.0, 153 / 256.0, 0 / 256.0), + "chr9": (255 / 256.0, 204 / 256.0, 0 / 256.0), + "chr10": (255 / 256.0, 255 / 256.0, 0 / 256.0), + "chr11": (204 / 256.0, 255 / 256.0, 0 / 256.0), + "chr12": (0 / 256.0, 255 / 256.0, 0 / 256.0), + "chr13": (53 / 256.0, 128 / 256.0, 0 / 256.0), + "chr14": (0 / 256.0, 0 / 256.0, 204 / 256.0), + "chr15": (102 / 256.0, 153 / 256.0, 255 / 256.0), + "chr16": (153 / 256.0, 204 / 256.0, 255 / 256.0), + "chr17": (0 / 256.0, 255 / 256.0, 255 / 256.0), + "chr18": (204 / 256.0, 255 / 256.0, 255 / 256.0), + "chr19": (153 / 256.0, 0 / 256.0, 204 / 256.0), + "chr20": (204 / 256.0, 51 / 256.0, 255 / 256.0), + "chr21": (204 / 256.0, 153 / 256.0, 255 / 256.0), + "chr22": (102 / 256.0, 102 / 256.0, 102 / 256.0), + "chr23": (153 / 256.0, 153 / 256.0, 153 / 256.0), + "chrX": (153 / 256.0, 153 / 256.0, 153 / 256.0), + "chr24": (204 / 256.0, 204 / 256.0, 204 / 256.0), + "chrY": (204 / 256.0, 204 / 256.0, 204 / 256.0), + "chrM": (204 / 256.0, 204 / 256.0, 153 / 256.0), + "chr0": (204 / 256.0, 204 / 256.0, 153 / 256.0), + "chrUn": (121 / 256.0, 204 / 256.0, 61 / 256.0), + "chrNA": (255 / 256.0, 255 / 256.0, 255 / 256.0), + "lum90chr1": (255 / 256.0, 216 / 256.0, 156 / 256.0), + "lum90chr2": (230 / 256.0, 230 / 256.0, 165 / 256.0), + "lum90chr3": (232 / 256.0, 232 / 256.0, 135 / 256.0), + "lum90chr4": (255 / 256.0, 166 / 256.0, 166 / 256.0), + "lum90chr5": (255 / 256.0, 147 / 256.0, 147 / 256.0), + "lum90chr6": (255 / 256.0, 152 / 256.0, 255 / 256.0), + "lum90chr7": (255 / 256.0, 214 / 256.0, 214 / 256.0), + "lum90chr8": (255 / 256.0, 202 / 256.0, 102 / 256.0), + "lum90chr9": (255 / 256.0, 220 / 256.0, 58 / 256.0), + "lum90chr10": (234 / 256.0, 234 / 256.0, 0 / 256.0), + "lum90chr11": (194 / 256.0, 245 / 256.0, 0 / 256.0), + "lum90chr12": (34 / 256.0, 255 / 256.0, 34 / 256.0), + "lum90chr13": (174 / 256.0, 244 / 256.0, 155 / 256.0), + "lum90chr14": (215 / 256.0, 215 / 256.0, 255 / 256.0), + "lum90chr15": (182 / 256.0, 224 / 256.0, 255 / 256.0), + "lum90chr16": (182 / 256.0, 231 / 256.0, 255 / 256.0), + 
"lum90chr17": (0 / 256.0, 252 / 256.0, 252 / 256.0), + "lum90chr18": (185 / 256.0, 236 / 256.0, 236 / 256.0), + "lum90chr19": (255 / 256.0, 191 / 256.0, 255 / 256.0), + "lum90chr20": (255 / 256.0, 177 / 256.0, 255 / 256.0), + "lum90chr21": (255 / 256.0, 206 / 256.0, 255 / 256.0), + "lum90chr22": (198 / 256.0, 198 / 256.0, 198 / 256.0), + "lum90chr23": (153 / 256.0, 153 / 256.0, 153 / 256.0), + "lum90chrX": (153 / 256.0, 153 / 256.0, 153 / 256.0), + "lum90chr24": (204 / 256.0, 204 / 256.0, 204 / 256.0), + "lum90chrY": (204 / 256.0, 204 / 256.0, 204 / 256.0), + "lum90chrM": (174 / 256.0, 174 / 256.0, 122 / 256.0), + "lum90chr0": (174 / 256.0, 174 / 256.0, 122 / 256.0), + "lum90chrUn": (108 / 256.0, 191 / 256.0, 38 / 256.0), + "lum90chrNA": (171 / 256.0, 171 / 256.0, 171 / 256.0), + "lum80chr1": (244 / 256.0, 188 / 256.0, 127 / 256.0), + "lum80chr2": (202 / 256.0, 202 / 256.0, 136 / 256.0), + "lum80chr3": (203 / 256.0, 203 / 256.0, 103 / 256.0), + "lum80chr4": (255 / 256.0, 137 / 256.0, 137 / 256.0), + "lum80chr5": (255 / 256.0, 116 / 256.0, 116 / 256.0), + "lum80chr6": (255 / 256.0, 119 / 256.0, 255 / 256.0), + "lum80chr7": (237 / 256.0, 186 / 256.0, 186 / 256.0), + "lum80chr8": (255 / 256.0, 174 / 256.0, 62 / 256.0), + "lum80chr9": (243 / 256.0, 192 / 256.0, 0 / 256.0), + "lum80chr10": (206 / 256.0, 206 / 256.0, 0 / 256.0), + "lum80chr11": (166 / 256.0, 216 / 256.0, 0 / 256.0), + "lum80chr12": (0 / 256.0, 232 / 256.0, 0 / 256.0), + "lum80chr13": (146 / 256.0, 216 / 256.0, 126 / 256.0), + "lum80chr14": (186 / 256.0, 186 / 256.0, 255 / 256.0), + "lum80chr15": (152 / 256.0, 196 / 256.0, 255 / 256.0), + "lum80chr16": (152 / 256.0, 203 / 256.0, 254 / 256.0), + "lum80chr17": (0 / 256.0, 224 / 256.0, 224 / 256.0), + "lum80chr18": (156 / 256.0, 208 / 256.0, 208 / 256.0), + "lum80chr19": (250 / 256.0, 161 / 256.0, 255 / 256.0), + "lum80chr20": (255 / 256.0, 146 / 256.0, 255 / 256.0), + "lum80chr21": (227 / 256.0, 177 / 256.0, 255 / 256.0), + "lum80chr22": (198 / 256.0, 198 / 256.0, 198 / 256.0), + "lum80chr23": (153 / 256.0, 153 / 256.0, 153 / 256.0), + "lum80chrX": (153 / 256.0, 153 / 256.0, 153 / 256.0), + "lum80chr24": (204 / 256.0, 204 / 256.0, 204 / 256.0), + "lum80chrY": (204 / 256.0, 204 / 256.0, 204 / 256.0), + "lum80chrM": (174 / 256.0, 174 / 256.0, 122 / 256.0), + "lum80chr0": (174 / 256.0, 174 / 256.0, 122 / 256.0), + "lum80chrUn": (108 / 256.0, 191 / 256.0, 38 / 256.0), + "lum80chrNA": (171 / 256.0, 171 / 256.0, 171 / 256.0), + "vlpurple": (218 / 256.0, 218 / 256.0, 235 / 256.0), + "vlorange": (253 / 256.0, 208 / 256.0, 162 / 256.0), + "vlpgreen": (218 / 256.0, 218 / 256.0, 235 / 256.0), } - - - -ecolor = {'interchromosomal' : 'blue', - 'concordant' : 'black', - 'everted' : (139/256.0, 69/256.0, 19/256.0), # 'brown', yellow', - 'forward' : 'magenta', - 'reverse' : (0/256.0, 139/256.0, 139/256.0), #'cyan', - 'discordant' : 'red'} - +ecolor = { + "interchromosomal": "blue", + "concordant": "black", + "everted": (139 / 256.0, 69 / 256.0, 19 / 256.0), # 'brown', yellow', + "forward": "magenta", + "reverse": (0 / 256.0, 139 / 256.0, 139 / 256.0), #'cyan', + "discordant": "red", +} diff --git a/bin/realigner.py b/bin/realigner.py index a9779e19..31dd62e9 100644 --- a/bin/realigner.py +++ b/bin/realigner.py @@ -1,25 +1,25 @@ #!/usr/bin/env python -#MIT License +# MIT License # -#Copyright (c) 2019 Iñigo Prada Luengo +# Copyright (c) 2019 Iñigo Prada Luengo # -#Permission is hereby granted, free of charge, to any person obtaining a copy -#of this software and associated documentation files (the 
"Software"), to deal -#in the Software without restriction, including without limitation the rights -#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -#copies of the Software, and to permit persons to whom the Software is -#furnished to do so, subject to the following conditions: +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: # -#The above copyright notice and this permission notice shall be included in all -#copies or substantial portions of the Software. +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. # -#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -#SOFTWARE. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
from __future__ import division @@ -33,15 +33,44 @@ import traceback - class realignment: """Class for managing the realignment and eccDNA indetification of circle-map""" - def __init__(self, input_bam,qname_bam,sorted_bam,genome_fasta,directory,mapq_cutoff,insert_size_mapq,std_extension, - insert_size_sample_size,gap_open,gap_ext,n_hits,prob_cutoff,min_soft_clipped_length,overlap_frac, - interval_p_cut, output_name,ncores,af,locker,split,ratio,verbose,pid,edit_distance_frac, - remap_splits,only_discordants,splits,score,insert_size,discordant_filter): - #I/O + def __init__( + self, + input_bam, + qname_bam, + sorted_bam, + genome_fasta, + directory, + mapq_cutoff, + insert_size_mapq, + std_extension, + insert_size_sample_size, + gap_open, + gap_ext, + n_hits, + prob_cutoff, + min_soft_clipped_length, + overlap_frac, + interval_p_cut, + output_name, + ncores, + af, + locker, + split, + ratio, + verbose, + pid, + edit_distance_frac, + remap_splits, + only_discordants, + splits, + score, + insert_size, + discordant_filter, + ): + # I/O self.edit_distance_frac = edit_distance_frac self.ecc_dna_str = input_bam self.qname_bam = qname_bam @@ -49,7 +78,7 @@ def __init__(self, input_bam,qname_bam,sorted_bam,genome_fasta,directory,mapq_cu self.directory = directory self.genome_fa = genome_fasta - #realignment parameters + # realignment parameters # probabilistic realignment options self.n_hits = n_hits @@ -61,142 +90,132 @@ def __init__(self, input_bam,qname_bam,sorted_bam,genome_fasta,directory,mapq_cu self.only_discordants = only_discordants self.split = splits self.score = score - self.af= af + self.af = af self.insert = insert_size # affine gap scoring options self.gap_open = gap_open self.gap_ext = gap_ext - - #insert size stimation parameters + # insert size stimation parameters self.insert_size_mapq = insert_size_mapq self.std_extenstion = std_extension self.insert_sample_size = insert_size_sample_size - - - #output options + # output options self.overlap_fraction = overlap_frac self.output = output_name self.discordant_filter = discordant_filter - - #regular options + # regular options self.cores = ncores self.verbose = verbose self.lock = locker - #this two parameters don't work on this class. They are here for printing the parameters + # this two parameters don't work on this class. 
They are here for printing the parameters self.split = split self.ratio = ratio - #for instances running on the same directoiry + # for instances running on the same directoiry self.pid = pid - - - - def print_parameters(self): - print("Running realignment\n") - print("Probabilistic realignment parameters:\n" + print( + "Probabilistic realignment parameters:\n" "\tAlignments to consider: %s \n" "\tProbability cut-off to consider as mapped: %s \n" "\tMinimum soft-clipped length to attemp realignment: %s \n" "\tMinimum bwa mem mapping quality to consider: %s \n" "\tGap open penalty: %s \n" "\tGap extension penalty: %s \n" - % (self.n_hits, self.prob_cutoff,self.min_sc_length,self.mapq_cutoff,self.gap_open, self.gap_ext)) - - print("Interval extension parameters:\n" + % ( + self.n_hits, + self.prob_cutoff, + self.min_sc_length, + self.mapq_cutoff, + self.gap_open, + self.gap_ext, + ) + ) + + print( + "Interval extension parameters:\n" "\tInsert size mapping quality cut-off: %s \n" "\tNumber of read to sample: %s \n" "\tNumber of standard deviations to extend the realignment intervals: %s \n" - % (self.insert_size_mapq,self.insert_sample_size,self.std_extenstion)) + % (self.insert_size_mapq, self.insert_sample_size, self.std_extenstion) + ) - print("eccDNA output options: \n" + print( + "eccDNA output options: \n" "\tSplit read cut-off: %s \n" - "\tCoverage ratio cut-off: %s \n" % (self.split,self.ratio)) - + "\tCoverage ratio cut-off: %s \n" % (self.split, self.ratio) + ) - print("Interval processing options: \n" + print( + "Interval processing options: \n" "\tMerging fraction: %s \n" - "\tInterval probability cut-off: %s \n" - % (self.overlap_fraction,self.interval_p)) - + "\tInterval probability cut-off: %s \n" % (self.overlap_fraction, self.interval_p) + ) - - - - - - def realign(self,peaks): + def realign(self, peaks): """Function that will iterate trough the bam file containing reads indicating eccDNA structural variants and will output a bed file containing the soft-clipped reads, the discordant and the coverage within the interval""" - #open files for every process + # open files for every process try: - peaks_pd = pd.DataFrame.from_records(peaks,columns=['chrom', 'start', 'end']) + peaks_pd = pd.DataFrame.from_records(peaks, columns=["chrom", "start", "end"]) sorted_bam = ps.AlignmentFile(self.sorted_bam_str, "rb") genome_fa = ps.FastaFile(self.genome_fa) - ecc_dna = ps.AlignmentFile(self.ecc_dna_str,"rb") + ecc_dna = ps.AlignmentFile(self.ecc_dna_str, "rb") begin = time.time() - - - - - - - # compute insert size distribution insert_metrics = self.insert - - #define realignment extension interval - extension = insert_metrics[0] + self.std_extenstion*insert_metrics[1] - + # define realignment extension interval + extension = insert_metrics[0] + self.std_extenstion * insert_metrics[1] iteration = 0 - results = [] only_discordants = [] - - - for index,interval in peaks_pd.iterrows(): - - - - if check_size_and_write(results,only_discordants,self.output,self.lock,self.directory,self.overlap_fraction,self.pid) == True: + for index, interval in peaks_pd.iterrows(): + if ( + check_size_and_write( + results, + only_discordants, + self.output, + self.lock, + self.directory, + self.overlap_fraction, + self.pid, + ) + == True + ): results = [] only_discordants = [] try: - - - - - #find out the prior distribution (mate alignment positions). 
- candidate_mates = get_mate_intervals(ecc_dna,interval,self.mapq_cutoff,self.verbose,self.only_discordants) - - - - - + # find out the prior distribution (mate alignment positions). + candidate_mates = get_mate_intervals( + ecc_dna, + interval, + self.mapq_cutoff, + self.verbose, + self.only_discordants, + ) if len(candidate_mates) > 0 or candidate_mates != None: - - - realignment_interval_extended = get_realignment_intervals(candidate_mates,extension,self.interval_p, - self.verbose) + realignment_interval_extended = get_realignment_intervals( + candidate_mates, extension, self.interval_p, self.verbose + ) if realignment_interval_extended is None: continue @@ -204,211 +223,317 @@ def realign(self,peaks): iteration_results = [] iteration_discordants = [] disorcordants_per_it = 0 - for index,mate_interval in realignment_interval_extended.iterrows(): - + for ( + index, + mate_interval, + ) in realignment_interval_extended.iterrows(): iteration += 1 - #sample realignment intervals - #fasta file fetch is 1 based that why I do +1 + # sample realignment intervals + # fasta file fetch is 1 based that why I do +1 - plus_coding_interval = genome_fa.fetch(str(mate_interval['chrom']),int(int(mate_interval['start'])+1),int(int(mate_interval['end'])+1)).upper() + plus_coding_interval = genome_fa.fetch( + str(mate_interval["chrom"]), + int(int(mate_interval["start"]) + 1), + int(int(mate_interval["end"]) + 1), + ).upper() interval_length = len(plus_coding_interval) minus_coding_interval = str(Seq(plus_coding_interval).complement()) # precompute the denominators of the error model. They will be constants for every interval plus_base_freqs = background_freqs(plus_coding_interval) - minus_base_freqs = {'T':plus_base_freqs['A'],'A':plus_base_freqs['T'], - 'C':plus_base_freqs['G'],'G':plus_base_freqs['C']} - - minus_base_freqs = np.array([plus_base_freqs['T'],plus_base_freqs['A'],plus_base_freqs['G'],plus_base_freqs['C']]) - plus_base_freqs = np.array([plus_base_freqs['A'],plus_base_freqs['T'],plus_base_freqs['C'],plus_base_freqs['G']]) - - - #note that I am getting the reads of the interval. Not the reads of the mates - - for read in ecc_dna.fetch(interval['chrom'],int(interval['start']),int(interval['end']),multiple_iterators=True): - - + minus_base_freqs = { + "T": plus_base_freqs["A"], + "A": plus_base_freqs["T"], + "C": plus_base_freqs["G"], + "G": plus_base_freqs["C"], + } + + minus_base_freqs = np.array( + [ + plus_base_freqs["T"], + plus_base_freqs["A"], + plus_base_freqs["G"], + plus_base_freqs["C"], + ] + ) + plus_base_freqs = np.array( + [ + plus_base_freqs["A"], + plus_base_freqs["T"], + plus_base_freqs["C"], + plus_base_freqs["G"], + ] + ) + + # note that I am getting the reads of the interval. 
Not the reads of the mates + + for read in ecc_dna.fetch( + interval["chrom"], + int(interval["start"]), + int(interval["end"]), + multiple_iterators=True, + ): if is_soft_clipped(read): - if read.mapq >= self.mapq_cutoff: - # no need to realignment - if read.has_tag('SA') and self.remap != True: - - - #check realignment from SA tag + if read.has_tag("SA") and self.remap != True: + # check realignment from SA tag support = circle_from_SA(read, self.mapq_cutoff, mate_interval) - - - if support is None: + if support is None: pass else: - - if support['support'] == True: - - score = len(get_longest_soft_clipped_bases(read)['seq'])* (1-phred_to_prob(np.array(int(read.get_tag('SA').split(',')[4]),dtype=np.float64))) - - #compute mapping positions + if support["support"] == True: + score = len(get_longest_soft_clipped_bases(read)["seq"]) * ( + 1 + - phred_to_prob( + np.array( + int(read.get_tag("SA").split(",")[4]), + dtype=np.float64, + ) + ) + ) + + # compute mapping positions read_end = rightmost_from_read(read) - supplementary_end = rightmost_from_sa(support['leftmost'],support['cigar']) - - + supplementary_end = rightmost_from_sa( + support["leftmost"], + support["cigar"], + ) # I store the read name to the output, so that a read counts as 1 no matter it is SC in 2 pieces - if read.reference_start < support['leftmost']: - - iteration_results.append([interval['chrom'],read.reference_start,(supplementary_end-1),read.qname,iteration,float(round(score,2))]) - - elif read.reference_start > support['leftmost']: - + if read.reference_start < support["leftmost"]: iteration_results.append( - [interval['chrom'], (support['leftmost']-1), read_end, read.qname,iteration,float(round(score,2))]) + [ + interval["chrom"], + read.reference_start, + (supplementary_end - 1), + read.qname, + iteration, + float(round(score, 2)), + ] + ) + + elif read.reference_start > support["leftmost"]: + iteration_results.append( + [ + interval["chrom"], + (support["leftmost"] - 1), + read_end, + read.qname, + iteration, + float(round(score, 2)), + ] + ) else: - #uninformative read + # uninformative read pass - - else: - #sc length - sc_len = len(get_longest_soft_clipped_bases(read)['seq']) - - - if non_colinearity(int(read.cigar[0][0]),int(read.cigar[-1][0]),int(read.pos), - int(mate_interval.start),int(mate_interval.end)) == True: - - + # sc length + sc_len = len(get_longest_soft_clipped_bases(read)["seq"]) + + if ( + non_colinearity( + int(read.cigar[0][0]), + int(read.cigar[-1][0]), + int(read.pos), + int(mate_interval.start), + int(mate_interval.end), + ) + == True + ): if sc_len >= self.min_sc_length: edits_allowed = adaptative_myers_k(sc_len, self.edit_distance_frac) - #realignment - - realignment_dict = realign(read,self.n_hits,plus_coding_interval,minus_coding_interval, - plus_base_freqs,minus_base_freqs,self.gap_open,self.gap_ext,self.verbose,edits_allowed) - + # realignment + + realignment_dict = realign( + read, + self.n_hits, + plus_coding_interval, + minus_coding_interval, + plus_base_freqs, + minus_base_freqs, + self.gap_open, + self.gap_ext, + self.verbose, + edits_allowed, + ) if realignment_dict == None: - pass else: - #calc edit distance allowed - prob = realignment_probability(realignment_dict,interval_length) - if prob >= self.prob_cutoff and realignment_dict['alignments'][1][3] <= edits_allowed: - + # calc edit distance allowed + prob = realignment_probability( + realignment_dict, + interval_length, + ) + if ( + prob >= self.prob_cutoff + and realignment_dict["alignments"][1][3] <= edits_allowed + ): # 
here I have to retrieve the nucleotide mapping positions. Which should be the # the left sampling pysam coordinate - edlib coordinates read_end = rightmost_from_read(read) + soft_clip_start = int(mate_interval["start"]) + int( + realignment_dict["alignments"][1][0][0] + ) - soft_clip_start = int(mate_interval['start'])+ int(realignment_dict['alignments'][1][0][0]) - - soft_clip_end = int(mate_interval['start']) + int(realignment_dict['alignments'][1][0][1]) - - score = sc_len*prob + soft_clip_end = int(mate_interval["start"]) + int( + realignment_dict["alignments"][1][0][1] + ) + score = sc_len * prob # I store the read name to the output, so that a read counts as 1 no matter it is SC in 2 pieces - if read.reference_start < int(mate_interval['start']) + int( - realignment_dict['alignments'][1][0][0]): - - iteration_results.append([interval['chrom'], read.reference_start, soft_clip_end+1, read.qname,iteration,float(round(score,2))]) - - elif read.reference_start + int(mate_interval['start']) + int( - realignment_dict['alignments'][1][0][0]): - - iteration_results.append([interval['chrom'], soft_clip_start, read_end, read.qname,iteration,float(round(score,2))]) + if read.reference_start < int(mate_interval["start"]) + int( + realignment_dict["alignments"][1][0][0] + ): + iteration_results.append( + [ + interval["chrom"], + read.reference_start, + soft_clip_end + 1, + read.qname, + iteration, + float(round(score, 2)), + ] + ) + + elif ( + read.reference_start + + int(mate_interval["start"]) + + int(realignment_dict["alignments"][1][0][0]) + ): + iteration_results.append( + [ + interval["chrom"], + soft_clip_start, + read_end, + read.qname, + iteration, + float(round(score, 2)), + ] + ) else: # uninformative read pass - - else: pass else: - pass else: pass else: - #discordant reads - #R2F1 oriented when iterating trough R2 + # discordant reads + # R2F1 oriented when iterating trough R2 if read.is_reverse == True and read.mate_is_reverse == False: if read.is_read2: if read.reference_start < read.next_reference_start: # discordant read - disorcordants_per_it +=1 - iteration_discordants.append([interval['chrom'],read.reference_start,read.next_reference_start + read.infer_query_length(),read.qname]) - - - - - #R2F1 when iterating trough F1 - elif read.is_reverse == False and read.mate_is_reverse == True: + disorcordants_per_it += 1 + iteration_discordants.append( + [ + interval["chrom"], + read.reference_start, + read.next_reference_start + read.infer_query_length(), + read.qname, + ] + ) + + # R2F1 when iterating trough F1 + elif read.is_reverse == False and read.mate_is_reverse == True: if read.is_read2 == False: if read.next_reference_start < read.reference_start: - disorcordants_per_it +=1 - iteration_discordants.append([interval['chrom'], read.next_reference_start,read.reference_start+read.infer_query_length(), read.qname]) - - - #second pass to add discordant read info + disorcordants_per_it += 1 + iteration_discordants.append( + [ + interval["chrom"], + read.next_reference_start, + read.reference_start + read.infer_query_length(), + read.qname, + ] + ) + + # second pass to add discordant read info if len(iteration_results) > 0: - - - results = results + assign_discordants(iteration_results,iteration_discordants,insert_metrics[0],insert_metrics[1]) - + results = results + assign_discordants( + iteration_results, + iteration_discordants, + insert_metrics[0], + insert_metrics[1], + ) elif len(iteration_discordants) > 0: - discordant_bed = 
pd.DataFrame.from_records(iteration_discordants,columns=['chrom','start','end','read']).sort_values(['chrom','start','end']) - - discordant_bed = discordant_bed.groupby(merge_bed(discordant_bed)).agg( - {'chrom': 'first', 'start': 'first', 'end': 'last', 'read': 'count'}) - - - for index,disc_interval in discordant_bed.iterrows(): - only_discordants.append([disc_interval['chrom'],disc_interval['start'],disc_interval['end'],disc_interval['read'],0]) - - + discordant_bed = pd.DataFrame.from_records( + iteration_discordants, + columns=["chrom", "start", "end", "read"], + ).sort_values(["chrom", "start", "end"]) + + discordant_bed = discordant_bed.groupby(merge_bed(discordant_bed)).agg( + { + "chrom": "first", + "start": "first", + "end": "last", + "read": "count", + } + ) + + for index, disc_interval in discordant_bed.iterrows(): + only_discordants.append( + [ + disc_interval["chrom"], + disc_interval["start"], + disc_interval["end"], + disc_interval["read"], + 0, + ] + ) except BaseException as e: - traceback.print_exc(file=sys.stdout) - warnings.warn( - "Failed on interval %s due to the error %s" % ( - str(interval), str(e))) - return([1,1]) - + warnings.warn("Failed on interval %s due to the error %s" % (str(interval), str(e))) + return [1, 1] ecc_dna.close() genome_fa.close() - # Write process output to disk - output = iteration_merge(only_discordants,results, - self.overlap_fraction,self.split,self.score, - self.min_sc_length,sorted_bam,self.af,insert_metrics[0],insert_metrics[1],self.discordant_filter) + output = iteration_merge( + only_discordants, + results, + self.overlap_fraction, + self.split, + self.score, + self.min_sc_length, + sorted_bam, + self.af, + insert_metrics[0], + insert_metrics[1], + self.discordant_filter, + ) write_to_disk(output, self.output, self.lock, self.directory, self.pid) - except: print("Failed on cluster:") print(traceback.print_exc(file=sys.stdout)) - return([1,1]) + return [1, 1] sorted_bam.close() genome_fa.close() ecc_dna.close() - - return([0,0]) + return [0, 0] diff --git a/bin/ref_util.py b/bin/ref_util.py index d980f4b0..1936d9af 100755 --- a/bin/ref_util.py +++ b/bin/ref_util.py @@ -14,8 +14,8 @@ # # IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. -#Author: Viraj Deshpande -#Contact: virajbdeshpande@gmail.com +# Author: Viraj Deshpande +# Contact: virajbdeshpande@gmail.com ##This is a suite to load reference genome (not just hg19, as filename implies), genes, exons, repeat content and perform operations on this genome, compare variants @@ -40,53 +40,87 @@ try: DATA_REPO = os.environ["AA_DATA_REPO"] except: - logging.warning("#TIME " + '%.3f\t'%(time() - TSTART) + " Unable to set AA_DATA_REPO variable. Setting to working directory") - DATA_REPO = '.' -if DATA_REPO == '.' or DATA_REPO == '': - logging.warning("#TIME " + '%.3f\t'%(time() - TSTART) + " AA_DATA_REPO not set or empy. Setting to working directory") - DATA_REPO = '.' 
- -REF_files = defaultdict(lambda: '', {}) + logging.warning( + "#TIME " + "%.3f\t" % (time() - TSTART) + " Unable to set AA_DATA_REPO variable. Setting to working directory" + ) + DATA_REPO = "." +if DATA_REPO == "." or DATA_REPO == "": + logging.warning( + "#TIME " + "%.3f\t" % (time() - TSTART) + " AA_DATA_REPO not set or empy. Setting to working directory" + ) + DATA_REPO = "." + +REF_files = defaultdict(lambda: "", {}) try: - for l in open(DATA_REPO + '/' + REF + '/file_list.txt'): + for l in open(DATA_REPO + "/" + REF + "/file_list.txt"): REF_files[l.strip().split()[0]] = l.strip().split()[1] except: - logging.warning("#TIME " + '%.3f\t'%(time() - TSTART) + " Unable to find reference in $AA_DATA_REPO/REF/file_list.txt. Setting to empty.") + logging.warning( + "#TIME " + + "%.3f\t" % (time() - TSTART) + + " Unable to find reference in $AA_DATA_REPO/REF/file_list.txt. Setting to empty." + ) class fake_fasta(object): def fetch(self, a=None, b=0, c=0): - return ''.join(['N' for i in range(c - b + 1)]) + return "".join(["N" for i in range(c - b + 1)]) + + try: - fa_file = pysam.Fastafile(DATA_REPO + '/' + REF + '/' + REF_files['fa_file']) + fa_file = pysam.Fastafile(DATA_REPO + "/" + REF + "/" + REF_files["fa_file"]) except: - logging.warning("#TIME " + '%.3f\t'%(time() - TSTART) + " Unable to open fasta file: \"" + DATA_REPO + '/' + REF + '/' + REF_files['fa_file'] + "\". Reference sequences will be set to N.") + logging.warning( + "#TIME " + + "%.3f\t" % (time() - TSTART) + + ' Unable to open fasta file: "' + + DATA_REPO + + "/" + + REF + + "/" + + REF_files["fa_file"] + + '". Reference sequences will be set to N.' + ) fa_file = fake_fasta() -chrLen_filename = DATA_REPO + '/' + REF + '/' + REF_files['chrLen_file'] -duke35_filename = DATA_REPO + '/' + REF + '/' + REF_files['duke35_filename'] -wgexclude_filename = DATA_REPO + '/' + REF + '/' + REF_files['mapability_exclude_filename'] -gene_filename = DATA_REPO + '/' + REF + '/' + REF_files['gene_filename'] -exon_filename = DATA_REPO + '/' + REF + '/' + REF_files['exon_file'] -oncogene_filename = DATA_REPO + '/' + REF + '/' + REF_files['oncogene_filename'] -centromere_filename = DATA_REPO + '/' + REF + '/' + REF_files['centromere_filename'] -conserved_regions_filename = DATA_REPO + '/' + REF + '/' + REF_files['conserved_regions_filename'] -segdup_filename = DATA_REPO + '/' + REF + '/' + REF_files['segdup_filename'] -complementary_nucleotide = defaultdict(lambda: 'N', {'A':'T', 'C':'G', 'G':'C', 'T':'A', 'a':'t', 'c':'g', 'g':'c', 't':'a', 'n':'n', 'N':'N'}) +chrLen_filename = DATA_REPO + "/" + REF + "/" + REF_files["chrLen_file"] +duke35_filename = DATA_REPO + "/" + REF + "/" + REF_files["duke35_filename"] +wgexclude_filename = DATA_REPO + "/" + REF + "/" + REF_files["mapability_exclude_filename"] +gene_filename = DATA_REPO + "/" + REF + "/" + REF_files["gene_filename"] +exon_filename = DATA_REPO + "/" + REF + "/" + REF_files["exon_file"] +oncogene_filename = DATA_REPO + "/" + REF + "/" + REF_files["oncogene_filename"] +centromere_filename = DATA_REPO + "/" + REF + "/" + REF_files["centromere_filename"] +conserved_regions_filename = DATA_REPO + "/" + REF + "/" + REF_files["conserved_regions_filename"] +segdup_filename = DATA_REPO + "/" + REF + "/" + REF_files["segdup_filename"] +complementary_nucleotide = defaultdict( + lambda: "N", + { + "A": "T", + "C": "G", + "G": "C", + "T": "A", + "a": "t", + "c": "g", + "g": "c", + "t": "a", + "n": "n", + "N": "N", + }, +) duke35 = [] duke35_exists = [True] # Handling chromosome names, lengths, 
sorting, positions and addition of new chromosomes chr_id = {} chrName = {} -chromList = [str(x) for x in range(1, 23)] + ['X'+'Y'] # must be updated if including an organism with more chroms. +chromList = [str(x) for x in range(1, 23)] + ["X" + "Y"] # must be updated if including an organism with more chroms. -def chrNum(chrname, mode='append'): +def chrNum(chrname, mode="append"): if chrname in chr_id: return chr_id[chrname] else: - if mode == 'init': + if mode == "init": cnum = len(chr_id) else: cnum = 1000000 + len(chr_id) @@ -94,15 +128,20 @@ def chrNum(chrname, mode='append'): chrName[cnum] = chrname return chr_id[chrname] + chrLen = defaultdict(lambda: 0, {}) try: for line in open(chrLen_filename): ll = line.strip().split() - chrLen[chrNum(ll[0], mode='init')] = int(ll[1]) + chrLen[chrNum(ll[0], mode="init")] = int(ll[1]) except: - logging.warning("#TIME " + '%.3f\t'%(time() - TSTART) + " Unable to open chromosome lengths file: \"" + chrLen_filename + "\"") + logging.warning( + "#TIME " + "%.3f\t" % (time() - TSTART) + ' Unable to open chromosome lengths file: "' + chrLen_filename + '"' + ) chrOffset = {} + + def absPos(chrname, pos=0): cnum = chrNum(chrname) if chrNum(chrname) not in chrOffset: @@ -114,10 +153,12 @@ def absPos(chrname, pos=0): sumlen += chrLen[chrkeys[i]] if cnum < chrkeys[i]: break - return chrOffset[chrNum(chrname)] + pos + return chrOffset[chrNum(chrname)] + pos + for c in chrLen: - ap = absPos(chrName[c]) + ap = absPos(chrName[c]) + def chrPos(abspos): for c in chrOffset: @@ -125,6 +166,7 @@ def chrPos(abspos): return (chrName[c], abspos - chrOffset[c]) return None + def update_chrLen(len_list): for l in len_list: chrLen[chrNum(l[0])] = int(l[1]) @@ -132,15 +174,22 @@ def update_chrLen(len_list): cpos = absPos(l[0], 1) - def reverse_complement(seq): - return ''.join([complementary_nucleotide[a] for a in seq][::-1]) - + return "".join([complementary_nucleotide[a] for a in seq][::-1]) class interval(object): - def __init__(self, line, start=-1, end=-1, strand=1, - file_format='', bamfile=None, info='', exclude_info_string=False): + def __init__( + self, + line, + start=-1, + end=-1, + strand=1, + file_format="", + bamfile=None, + info="", + exclude_info_string=False, + ): self.info = "" self.file_format = file_format if type(line) == pysam.AlignedRead or type(line) == pysam.AlignedSegment: @@ -155,46 +204,45 @@ def __init__(self, line, start=-1, end=-1, strand=1, self.info = info def load_line(self, line, file_format, exclude_info_string=False): - if file_format == '': + if file_format == "": if len(line.strip().split()) == 1: - self.chrom = line.split(':')[0] - self.start = int(line.split(':')[1].split('-')[0]) - if '-' not in line: - self.end = int(line.split(':')[1].split('-')[0]) + self.chrom = line.split(":")[0] + self.start = int(line.split(":")[1].split("-")[0]) + if "-" not in line: + self.end = int(line.split(":")[1].split("-")[0]) else: - self.end = int(line.split(':')[1].split('-')[1]) + self.end = int(line.split(":")[1].split("-")[1]) if self.start < self.end: self.strand = 1 else: self.strand = -1 return else: - file_format = 'bed' - if file_format=='gff': + file_format = "bed" + if file_format == "gff": ll = line.strip().split() self.chrom = ll[0] self.start, self.end = sorted([int(float(ll[3])), int(float(ll[4]))]) - if ll[6] =='+': + if ll[6] == "+": self.strand = 1 else: self.strand = -1 if not exclude_info_string: - self.info = {r[0: r.find('=')]: r[r.find('=') + 1: ] - for r in ll[8].strip().strip(';').split(';')} - self.info['Variant'] = ll[5] 
- elif file_format == 'bed': + self.info = {r[0 : r.find("=")]: r[r.find("=") + 1 :] for r in ll[8].strip().strip(";").split(";")} + self.info["Variant"] = ll[5] + elif file_format == "bed": ll = line.strip().split() self.chrom = ll[0] if (REF == "hg19" or REF == "GRCh38" or REF == "mm10" or REF == "GRCm38") and 0 < len(self.chrom) < 3: try: ci = int(self.chrom) if 0 < ci < 23: - self.chrom = 'chr' + self.chrom + self.chrom = "chr" + self.chrom logging.info("Corrected chromosome name (appended 'chr') " + self.chrom + " \n") except ValueError: if self.chrom in {"M", "X", "Y"}: - self.chrom = 'chr' + self.chrom + self.chrom = "chr" + self.chrom else: logging.warning("Chromosome name " + self.chrom + " may be incompatible") @@ -206,10 +254,10 @@ def load_line(self, line, file_format, exclude_info_string=False): if not exclude_info_string: self.info = ll[3:] else: - raise(Exception("Invalid interval format" + str(line))) + raise (Exception("Invalid interval format" + str(line))) def load_pos(self, chrom, start, end, strand): - self.chrom = chrom + self.chrom = chrom self.start = int(start) self.end = int(end) self.strand = strand @@ -247,13 +295,18 @@ def size(self): def __str__(self): if len(str(self.info)) == 0: - return '\t'.join(map(str, [self.chrom, self.start, self.end])) + return "\t".join(map(str, [self.chrom, self.start, self.end])) elif type(self.info) == list: - return '\t'.join(map(str, [self.chrom, self.start, self.end] + list(self.info))) + return "\t".join(map(str, [self.chrom, self.start, self.end] + list(self.info))) elif type(self.info) == dict: - return '\t'.join(map(str, [self.chrom, self.start, self.end] + [str(s) + '=' + str(self.info[s]) for s in self.info])) + return "\t".join( + map( + str, + [self.chrom, self.start, self.end] + [str(s) + "=" + str(self.info[s]) for s in self.info], + ) + ) else: - return '\t'.join(map(str, [self.chrom, self.start, self.end, self.info])) + return "\t".join(map(str, [self.chrom, self.start, self.end, self.info])) def gc_content(self): seq = fa_file.fetch(self.chrom, self.start, self.end) @@ -262,7 +315,7 @@ def gc_content(self): # exit() if len(seq) == 0: return 0.5 - return float(seq.count('G') + seq.count('C') + seq.count('g') + seq.count('c')) / len(seq) + return float(seq.count("G") + seq.count("C") + seq.count("g") + seq.count("c")) / len(seq) def sequence(self, new_fa_file=None): if new_fa_file is not None: @@ -272,28 +325,32 @@ def sequence(self, new_fa_file=None): if self.strand == 1: return seq else: - return ''.join([complementary_nucleotide[a] for a in seq][::-1]) + return "".join([complementary_nucleotide[a] for a in seq][::-1]) def intersects(self, n, extend=0, margin=0.0): if margin > 0.0: - if self.intersects(interval(n.chrom, n.start, n.end - (1 - margin) * (n.end - n.start))) and self.intersects(interval(n.chrom, n.start + (1 - margin) * (n.end - n.start)), n.end): + if self.intersects( + interval(n.chrom, n.start, n.end - (1 - margin) * (n.end - n.start)) + ) and self.intersects(interval(n.chrom, n.start + (1 - margin) * (n.end - n.start)), n.end): return True else: s = self - if n.intersects(interval(s.chrom, s.start, s.end - (1 - margin) * (s.end - s.start))) and n.intersects(interval(s.chrom, s.start + (1 - margin) * (s.end - s.start)), s.end): + if n.intersects(interval(s.chrom, s.start, s.end - (1 - margin) * (s.end - s.start))) and n.intersects( + interval(s.chrom, s.start + (1 - margin) * (s.end - s.start)), s.end + ): return True return False a = [self.chrom, max(0, self.start - extend), self.end + extend] b = 
[n.chrom, n.start, n.end] - if (a[0] != b[0]): + if a[0] != b[0]: return False - if (int(a[1])-int(b[1]))*(int(a[2])-int(b[1])) <= 0: + if (int(a[1]) - int(b[1])) * (int(a[2]) - int(b[1])) <= 0: return True - if (int(a[1])-int(b[2]))*(int(a[2])-int(b[2])) <= 0: + if (int(a[1]) - int(b[2])) * (int(a[2]) - int(b[2])) <= 0: return True - if (int(a[1])-int(b[1]))*(int(a[1])-int(b[2])) <= 0: + if (int(a[1]) - int(b[1])) * (int(a[1]) - int(b[2])) <= 0: return True - if (int(a[2])-int(b[1]))*(int(a[2])-int(b[2])) <= 0: + if (int(a[2]) - int(b[1])) * (int(a[2]) - int(b[2])) <= 0: return True return False @@ -335,7 +392,10 @@ def contains(self, x, y=-1, z=-1): if y != -1: if z == -1: z = y - if self.intersects(interval(x, y, z)) and self.intersection(interval(x, y, z)).size() == interval(x, y, z).size(): + if ( + self.intersects(interval(x, y, z)) + and self.intersection(interval(x, y, z)).size() == interval(x, y, z).size() + ): return True return False @@ -350,9 +410,9 @@ def filter_repeat(self): def rep_content(self): # logging.info("#TIME " + '%.3f\t'%(time() - TSTART) + " rep_content: init ") - if self.chrom == 'chrM' or self.chrom == 'MT': + if self.chrom == "chrM" or self.chrom == "MT": return 5.0 - if self.chrom.strip('chr') not in chromList: + if self.chrom.strip("chr") not in chromList: return 1.0 s34 = interval(self.chrom, self.start, max(self.start, self.end - 34)) # logging.info("#TIME " + '%.3f\t'%(time() - TSTART) + " rep_content: to load duke ") @@ -362,7 +422,13 @@ def rep_content(self): duke35.extend([l.strip() for l in duke35file]) duke35file.close() except: - logging.warning("#TIME " + '%.3f\t'%(time() - TSTART) + " rep_content: Unable to open mapability file \"" + duke35_filename + "\"." ) + logging.warning( + "#TIME " + + "%.3f\t" % (time() - TSTART) + + ' rep_content: Unable to open mapability file "' + + duke35_filename + + '".' 
+ ) duke35_exists[0] = False duke35.extend(["chr_Un 0 1 1"]) # logging.info("#TIME " + '%.3f\t'%(time() - TSTART) + " rep_content: duke loaded") @@ -392,7 +458,18 @@ def rep_content(self): if not s34.intersects(m): p += 1 if p >= len(duke35) or p <= 0: - raise Exception('p index out of range: ' + str(p)+' '+str(lo)+' '+str(self)+' '+str(m) + ' '+str(interval(duke35[lo]))) + raise Exception( + "p index out of range: " + + str(p) + + " " + + str(lo) + + " " + + str(self) + + " " + + str(m) + + " " + + str(interval(duke35[lo])) + ) m = interval(duke35[p]) continue repc = 5.0 if float(m.info[0]) == 0 else 1.0 / float(m.info[0]) @@ -413,15 +490,20 @@ def num_unmasked(self): if self.chrom not in fa_file.references: return self.size() seq = fa_file.fetch(self.chrom, self.start, self.end) - return len([c for c in seq if c in 'ACGT']) + return len([c for c in seq if c in "ACGT"]) def segdup_uniqueness(self): sl = interval_list([self]).intersection(segdup_list) slsd = sum([self.intersection(i[1]).size() for i in sl]) return float(self.size()) / (self.size() + slsd) - + def extend(self, extend_len=0): - return interval(self.chrom, max(0, self.start - extend_len), min(self.end + extend_len, chrLen[chrNum(self.chrom)]), self.strand) + return interval( + self.chrom, + max(0, self.start - extend_len), + min(self.end + extend_len, chrLen[chrNum(self.chrom)]), + self.strand, + ) class interval_list(list, object): @@ -429,10 +511,10 @@ def __init__(self, ilist=None, file_format=None, sort=True, exclude_info_string= if ilist == None: ilist = [] self.file_format = file_format - if file_format in ['bed', 'gff']: + if file_format in ["bed", "gff"]: self.bed_to_list(ilist, exclude_info_string=exclude_info_string) if file_format is None: - list.__init__(self,ilist) + list.__init__(self, ilist) if sort: self.sort() self.offset = None @@ -441,13 +523,27 @@ def bed_to_list(self, file_name, exclude_info_string=False): if file_name is not None: try: f = open(file_name) - list.__init__(self, [interval(l, file_format=self.file_format, exclude_info_string=exclude_info_string) - for l in f if len(l.strip().split()) > 2 - and l.strip()[0] != '#']) + list.__init__( + self, + [ + interval( + l, + file_format=self.file_format, + exclude_info_string=exclude_info_string, + ) + for l in f + if len(l.strip().split()) > 2 and l.strip()[0] != "#" + ], + ) f.close() except: - logging.error("#TIME " + '%.3f\t'%(time() - TSTART) + " interval_list: Unable to open interval file \"" + file_name + "\"." ) - + logging.error( + "#TIME " + + "%.3f\t" % (time() - TSTART) + + ' interval_list: Unable to open interval file "' + + file_name + + '".' 
+ ) def merge_clusters(self, extend=0, margin=0.0): ml = [] @@ -473,7 +569,7 @@ def merge_clusters(self, extend=0, margin=0.0): cstart = 0 cl = self[cstart:cend] if ci is not None: - ml.append((ci,cl)) + ml.append((ci, cl)) return ml[::-1] def repeats(self, count=1): @@ -541,7 +637,13 @@ def atomize(self, h2): if c is not None: if i < len(self) and self[i] not in c[1] and (self[i].intersects(c[0], -1) or c[0] > self[i]): atm = self[i].atomize(c[0]) - atm = [(aa[0], [(lambda x: c[1][0] if x == c[0] else x)(aai) for aai in aa[1]]) for aa in atm] + atm = [ + ( + aa[0], + [(lambda x: c[1][0] if x == c[0] else x)(aai) for aai in aa[1]], + ) + for aa in atm + ] # print "%i", [len(rr[1]) for rr in atm], [str(rr[0]) for rr in atm] c = atm[-1] i += 1 @@ -549,7 +651,13 @@ def atomize(self, h2): elif j < len(h2) and h2[j] not in c[1] and (h2[j].intersects(c[0], -1) or c[0] > h2[j]): # print j, str(h2[j]), str(c[0]), c[0] > h2[j] atm = c[0].atomize(h2[j]) - atm = [(aa[0], [(lambda x: c[1][0] if x == c[0] else x)(aai) for aai in aa[1]]) for aa in atm] + atm = [ + ( + aa[0], + [(lambda x: c[1][0] if x == c[0] else x)(aai) for aai in aa[1]], + ) + for aa in atm + ] # print "%j", [len(rr[1]) for rr in atm], [str(rr[0]) for rr in atm] c = atm[-1] j += 1 @@ -593,7 +701,7 @@ def get_repeat_content(self): for line in duke35_file: lno += 1 duke_int = interval(line) - while not(duke_int.intersects(self[i])) and duke_int > self[i]: + while not (duke_int.intersects(self[i])) and duke_int > self[i]: i += 1 if not duke_int.intersects(self[i]) and self[i] > duke_int: continue @@ -604,12 +712,18 @@ def get_repeat_content(self): len_duke[j] += self[j].intersection(duke_int).size() j += 1 duke35_file.close() - return {self[i]:sum_duke[i] / len_duke[i] for i in range(len(interval_list))} + return {self[i]: sum_duke[i] / len_duke[i] for i in range(len(interval_list))} except: - logging.warning("#TIME " + '%.3f\t'%(time() - TSTART) + " get_repeat_content: Unable to open mapability file \"" + duke35_filename + "\"." ) + logging.warning( + "#TIME " + + "%.3f\t" % (time() - TSTART) + + ' get_repeat_content: Unable to open mapability file "' + + duke35_filename + + '".' 
+ ) duke35_exists[0] = False duke35.extend(["chr_Un 0 1 1"]) - return {self[i]:1.0 for i in range(len(interval_list))} + return {self[i]: 1.0 for i in range(len(interval_list))} def offsets(self): if self.offset is not None: @@ -617,9 +731,9 @@ def offsets(self): gap = 0.1 hratio = 0.8 - vlist = [i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != 'chr'] - hlist = [i for i in self if chrNum(i.chrom) < 100 or i.chrom[:3] == 'chr'] - v_count = len([i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != 'chr']) + vlist = [i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"] + hlist = [i for i in self if chrNum(i.chrom) < 100 or i.chrom[:3] == "chr"] + v_count = len([i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"]) h_count = len(self) - v_count h_sum = sum([i.size() for i in hlist]) v_sum = sum([i.size() for i in vlist]) @@ -628,13 +742,12 @@ def offsets(self): hS = sum([i.size() for i in hlist if i.size() > h_sum * gap / max(1, h_count)]) min_hsize = hS / (max(1, h_count) / gap - hK) h_sum = hS + hK * min_hsize - + vK = len([i for i in vlist if i.size() < v_sum * gap / max(1, v_count)]) vS = sum([i.size() for i in vlist if i.size() > v_sum * gap / max(1, v_count)]) min_vsize = vS / (max(1, v_count) / gap - vK) v_sum = vS + vK * min_vsize - offset = {} h_start = 0 @@ -673,9 +786,9 @@ def offset_breaks(self): gap = 0.1 hratio = 0.8 - vlist = [i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != 'chr'] - hlist = [i for i in self if chrNum(i.chrom) < 100 or i.chrom[:3] == 'chr'] - v_count = len([i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != 'chr']) + vlist = [i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"] + hlist = [i for i in self if chrNum(i.chrom) < 100 or i.chrom[:3] == "chr"] + v_count = len([i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"]) h_count = len(self) - v_count h_sum = sum([i.size() for i in hlist]) v_sum = sum([i.size() for i in vlist]) @@ -693,60 +806,63 @@ def offset_breaks(self): iprev = i continue if i in hlist and iprev.chrom == i.chrom: - breaks.append((offset[i][0] - hscale * hgap / 2, ':', i.chrom)) + breaks.append((offset[i][0] - hscale * hgap / 2, ":", i.chrom)) print(str(i), str(iprev), i in hlist, iprev.chrom == i.chrom) elif i in hlist and iprev.chrom != i.chrom: - breaks.append((offset[i][0] - hscale * hgap / 2, '--', i.chrom)) + breaks.append((offset[i][0] - hscale * hgap / 2, "--", i.chrom)) elif i in vlist and iprev in hlist: - breaks.append((offset[i][0] - vscale * vgap / 2, '-', i.chrom)) + breaks.append((offset[i][0] - vscale * vgap / 2, "-", i.chrom)) elif i in vlist and i.chrom == iprev.chrom: - breaks.append((offset[i][0] - vscale * vgap / 2, ':', i.chrom)) + breaks.append((offset[i][0] - vscale * vgap / 2, ":", i.chrom)) else: - breaks.append((offset[i][0] - vscale * vgap / 2, '--', i.chrom)) + breaks.append((offset[i][0] - vscale * vgap / 2, "--", i.chrom)) iprev = i return breaks def __str__(self): - return str(([str(i) for i in self])) - + return str(([str(i) for i in self])) -oncogene_list = interval_list(oncogene_filename, 'gff') +oncogene_list = interval_list(oncogene_filename, "gff") oncogene_list.sort() -gene_list = interval_list(gene_filename, 'gff') - +gene_list = interval_list(gene_filename, "gff") exon_list = interval_list([]) + + def load_exons(): if len(exon_list) > 0: return try: exon_file = open(exon_filename) - exonFields = [interval(j, file_format='gff') - for j in exon_file.read().strip().split('\n') - if (len(j.strip()) > 0 
and j.strip()[0] != '#' and - {r.split('=')[0]:r.split('=')[1] - for r in j.strip().split()[8].strip(';').split(';') - }['color'] == '000080')] + exonFields = [ + interval(j, file_format="gff") + for j in exon_file.read().strip().split("\n") + if ( + len(j.strip()) > 0 + and j.strip()[0] != "#" + and {r.split("=")[0]: r.split("=")[1] for r in j.strip().split()[8].strip(";").split(";")}["color"] + == "000080" + ) + ] exon_file.close() exon_list.extend((exonFields)) except: - logging.warning("#TIME " + '%.3f\t'%(time() - TSTART) + "unable to load exon file: \"" + exon_filename + "\"") + logging.warning("#TIME " + "%.3f\t" % (time() - TSTART) + 'unable to load exon file: "' + exon_filename + '"') + -conserved_regions = interval_list(conserved_regions_filename, 'bed') +conserved_regions = interval_list(conserved_regions_filename, "bed") conserved_regions.sort() -wgexclude = interval_list(wgexclude_filename, 'bed') +wgexclude = interval_list(wgexclude_filename, "bed") wgexclude.sort() -centromere_list = interval_list(centromere_filename, 'bed') +centromere_list = interval_list(centromere_filename, "bed") centromere_list.sort() centromere_list = interval_list([i[0] for i in centromere_list.merge_clusters(extend=1)]) -segdup_list = interval_list(segdup_filename, 'bed') +segdup_list = interval_list(segdup_filename, "bed") segdup_list.sort() - - diff --git a/bin/repeats.py b/bin/repeats.py index f899df4b..29d1c871 100644 --- a/bin/repeats.py +++ b/bin/repeats.py @@ -1,38 +1,37 @@ #!/usr/bin/env python -#MIT License +# MIT License # -#Copyright (c) 2019 Iñigo Prada Luengo +# Copyright (c) 2019 Iñigo Prada Luengo # -#Permission is hereby granted, free of charge, to any person obtaining a copy -#of this software and associated documentation files (the "Software"), to deal -#in the Software without restriction, including without limitation the rights -#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -#copies of the Software, and to permit persons to whom the Software is -#furnished to do so, subject to the following conditions: +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: # -#The above copyright notice and this permission notice shall be included in all -#copies or substantial portions of the Software. +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. # -#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -#SOFTWARE. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. import pysam as ps import pybedtools as bt import os import time -from utils import merge_coverage_bed,rightmost_from_sa +from utils import merge_coverage_bed, rightmost_from_sa class repeat: """Class for indentifying repeat derived eccDNA by looking of the reads with two alignments""" - - def __init__(self,bam,directory,mismatch,fraction,read_number): + def __init__(self, bam, directory, mismatch, fraction, read_number): self.bam = bam self.dir = directory self.mismatch = mismatch @@ -40,63 +39,60 @@ def __init__(self,bam,directory,mismatch,fraction,read_number): self.number = read_number def find_circles(self): - begin = time.time() os.chdir("%s" % self.dir) - bam = ps.AlignmentFile("%s" % self.bam,'rb') - - + bam = ps.AlignmentFile("%s" % self.bam, "rb") print("Iterating trough the bam file") output = [] for read in bam: - try: - if read.has_tag('XA'): - tag = read.get_tag('XA').split(';')[:-1] - - read_edit_distance = read.get_tag('NM') + if read.has_tag("XA"): + tag = read.get_tag("XA").split(";")[:-1] - if read_edit_distance <= self.mismatch and len(tag) ==1: + read_edit_distance = read.get_tag("NM") + if read_edit_distance <= self.mismatch and len(tag) == 1: read_chrom = bam.get_reference_name(read.reference_id) - chrom = tag[0].split(',')[0] - + chrom = tag[0].split(",")[0] if chrom == read_chrom: - - - aln = int(tag[0].split(',')[1][1:]) + aln = int(tag[0].split(",")[1][1:]) if aln < read.reference_start: - - interval = [chrom,aln,read.reference_start+ read.infer_read_length(),1] + interval = [ + chrom, + aln, + read.reference_start + read.infer_read_length(), + 1, + ] output.append(interval) else: - - interval = [chrom,read.reference_start,rightmost_from_sa(aln,tag[0].split(',')[2]),1] + interval = [ + chrom, + read.reference_start, + rightmost_from_sa(aln, tag[0].split(",")[2]), + 1, + ] output.append(interval) - except BaseException as e: print(e) + bed = merge_coverage_bed(output, self.fraction, self.number) - - bed = merge_coverage_bed(output,self.fraction,self.number) - - #add dots to read metrics stats + # add dots to read metrics stats with_dot = [] for interval in bed: interval.append(".") with_dot.append(interval) - bed= bt.BedTool(with_dot) + bed = bt.BedTool(with_dot) - return(bed) + return bed diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index 69b09987..960f405b 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -5,7 +5,6 @@ results = {} version_files = [x for x in os.listdir(".") if x.endswith(".version.txt")] for version_file in version_files: - software = version_file.replace(".version.txt", "") if software == "pipeline": software = "nf-core/circdna" diff --git a/bin/simulations.py b/bin/simulations.py index 64341f43..26ad1224 100644 --- a/bin/simulations.py +++ b/bin/simulations.py @@ -1,24 +1,24 @@ -#MIT License +# MIT License # -#Copyright (c) 2019 Iñigo Prada Luengo +# Copyright (c) 2019 Iñigo Prada Luengo # -#Permission is hereby granted, free of charge, to any person obtaining a copy -#of this software and associated documentation files (the "Software"), to deal -#in the Software without restriction, including without limitation the rights -#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -#copies of 
the Software, and to permit persons to whom the Software is -#furnished to do so, subject to the following conditions: +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: # -#The above copyright notice and this permission notice shall be included in all -#copies or substantial portions of the Software. +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. # -#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -#SOFTWARE. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
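# --- Illustrative sketch (editorial addition, not part of the diff) ---
# The find_circles routine in bin/repeats.py above keys off BWA's "XA" auxiliary
# tag, which lists alternative hits as semicolon-separated
# "chrom,<strand>pos,CIGAR,NM" records. A small stand-alone parser for that tag
# (the example tag value below is invented) might look like this:

def parse_xa(xa_value):
    """Split an XA tag value into (chrom, pos, strand, cigar, nm) tuples."""
    hits = []
    for record in xa_value.rstrip(";").split(";"):
        chrom, signed_pos, cigar, nm = record.split(",")
        strand = signed_pos[0]      # '+' or '-'
        pos = int(signed_pos[1:])   # leftmost mapping position of the alternative hit
        hits.append((chrom, pos, strand, cigar, int(nm)))
    return hits

print(parse_xa("chr1,+10500,100M,1;"))  # [('chr1', 10500, '+', '100M', 1)]
# --- end of sketch ---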
import numpy as np import os @@ -35,29 +35,45 @@ import warnings -def sim_ecc_reads(genome_fasta,read_length,directory,reads,exclude_regions,fastq,insert_size,errors,mean_cov,locker, - process,sim_circles,paired_end_fastq_1,paired_end_fastq_2,skipped,correct,ins_rate1,ins_rate2,del_rate1, - del_rate2,sim_pid): +def sim_ecc_reads( + genome_fasta, + read_length, + directory, + reads, + exclude_regions, + fastq, + insert_size, + errors, + mean_cov, + locker, + process, + sim_circles, + paired_end_fastq_1, + paired_end_fastq_2, + skipped, + correct, + ins_rate1, + ins_rate2, + del_rate1, + del_rate2, + sim_pid, +): """Function that takes as arguments a genome fasta file, weights each chromosome based on the length and simulates single end eccDNA reads """ - # Get the length of the chromosomes and store them in a sequence dictionary chromosomes = {} whole_genome_len = 0 - for rec in SeqIO.parse(genome_fasta, 'fasta'): + for rec in SeqIO.parse(genome_fasta, "fasta"): name = rec.id seqLen = len(rec) whole_genome_len += seqLen chromosomes[name] = seqLen - #chromosome sampling probability weighted based on its length + # chromosome sampling probability weighted based on its length weighted_chromosomes = {} for contigs in chromosomes: - weighted_chromosomes[contigs] = chromosomes[contigs]/whole_genome_len - - - + weighted_chromosomes[contigs] = chromosomes[contigs] / whole_genome_len contig_list = [] weights = [] @@ -65,103 +81,98 @@ def sim_ecc_reads(genome_fasta,read_length,directory,reads,exclude_regions,fastq weights.append(value) contig_list.append(contigs) - #Simulate the reads: - - + # Simulate the reads: os.chdir(directory) circle_bed = [] - - - set_of_reads = [] set_of_left_reads = [] set_of_right_reads = [] circle_number = 0 - #reads simulated by a process + # reads simulated by a process n_of_reads = 0 # n_of_reads_it = 0 begin = time.time() - #simulated reads + # simulated reads while n_of_reads < reads + 1: - - - #sample weighted chromosome - #set random seed, important for paralell + # sample weighted chromosome + # set random seed, important for paralell np.random.seed() chr = np.random.choice(contig_list, p=weights) # decide ecDNA length - #sample circle length - circle_length = rd.randint(150,350) - + # sample circle length + circle_length = rd.randint(150, 350) # linear decrease in coverage based on circle length - - - - # compute circles sequencing coverage - rounds_of_sim = (circle_length * mean_cov)/(read_length*2) - - - - - - + rounds_of_sim = (circle_length * mean_cov) / (read_length * 2) # take in to account short length contigs - #start position can't be bigger than (chr_length-circle_length) - chr_pos_start = rd.randint(0,(chromosomes[chr] - circle_length)) - #set end + # start position can't be bigger than (chr_length-circle_length) + chr_pos_start = rd.randint(0, (chromosomes[chr] - circle_length)) + # set end if chromosomes[chr] == (chromosomes[chr] - circle_length): chr_pos_end = chromosomes[chr] else: chr_pos_end = chr_pos_start + circle_length - #if user of provides regions to exclude, check within it is on the region. and skip it - if exclude_regions != None and bt.BedTool(exclude_regions).sort().any_hits(bt.Interval(chr,chr_pos_start,chr_pos_end)) != 0: - #hit in a gap region + # if user of provides regions to exclude, check within it is on the region. and skip it + if ( + exclude_regions != None + and bt.BedTool(exclude_regions).sort().any_hits(bt.Interval(chr, chr_pos_start, chr_pos_end)) != 0 + ): + # hit in a gap region # shared memory object between processes. 
It is use to track the number of skipped circles with skipped.get_lock(): - skipped.value+=1 + skipped.value += 1 continue else: - #shared memory object between processes. It is use to track the number of correctly simulated circles + # shared memory object between processes. It is use to track the number of correctly simulated circles with correct.get_lock(): - correct.value+=1 - #save each circle positions, so that then I can check true circles - + correct.value += 1 + # save each circle positions, so that then I can check true circles first_line = [chr, chr_pos_start, chr_pos_end] - #create class object outside the loop - new_read = sim_paired_end(n_of_reads, insert_size, genome_fasta, chr, chr_pos_start, - chr_pos_end, read_length, circle_number,process) - - #simulation rounds - for each_sim in range(0,round(int(rounds_of_sim))): - - + # create class object outside the loop + new_read = sim_paired_end( + n_of_reads, + insert_size, + genome_fasta, + chr, + chr_pos_start, + chr_pos_end, + read_length, + circle_number, + process, + ) + + # simulation rounds + for each_sim in range(0, round(int(rounds_of_sim))): if errors == True: - - - - if (n_of_reads_it+1) !=1000: - + if (n_of_reads_it + 1) != 1000: # sim the read get_seq = new_read.simulate_read() # put it in fastq format - simulated_reads = sim_paired_end.simulate_read_with_errors(new_read, get_seq[0], get_seq[1], - get_seq[2],ins_rate1,ins_rate2,del_rate1, - del_rate2,sim_pid) + simulated_reads = sim_paired_end.simulate_read_with_errors( + new_read, + get_seq[0], + get_seq[1], + get_seq[2], + ins_rate1, + ins_rate2, + del_rate1, + del_rate2, + sim_pid, + ) if simulated_reads != None: # save the read assert len(set_of_left_reads) == len(set_of_right_reads) @@ -174,28 +185,32 @@ def sim_ecc_reads(genome_fasta,read_length,directory,reads,exclude_regions,fastq else: continue - - - else: - # simulate reads and save to disk get_seq = new_read.simulate_read() - simulated_reads = sim_paired_end.simulate_read_with_errors(new_read, get_seq[0], get_seq[1], - get_seq[2],ins_rate1,ins_rate2,del_rate1, - del_rate2,sim_pid) + simulated_reads = sim_paired_end.simulate_read_with_errors( + new_read, + get_seq[0], + get_seq[1], + get_seq[2], + ins_rate1, + ins_rate2, + del_rate1, + del_rate2, + sim_pid, + ) set_of_left_reads.append(simulated_reads[0]) set_of_right_reads.append(simulated_reads[1]) # save to disk assert len(set_of_left_reads) == len(set_of_right_reads) locker.acquire() - print("Process %s: writting to disk 10000 reads" % process ) + print("Process %s: writting to disk 10000 reads" % process) fastq_1 = open(paired_end_fastq_1, "a") - SeqIO.write(set_of_left_reads,fastq_1, "fastq") + SeqIO.write(set_of_left_reads, fastq_1, "fastq") fastq_1.close() fastq_2 = open(paired_end_fastq_2, "a") - SeqIO.write(set_of_right_reads,fastq_2, "fastq") + SeqIO.write(set_of_right_reads, fastq_2, "fastq") fastq_2.close() locker.release() assert len(set_of_left_reads) == len(set_of_right_reads) @@ -204,12 +219,29 @@ def sim_ecc_reads(genome_fasta,read_length,directory,reads,exclude_regions,fastq n_of_reads_it += 1 # sim the first read of the list - new_read = sim_paired_end(n_of_reads, insert_size, genome_fasta, chr, chr_pos_start, - chr_pos_end, read_length, circle_number,process) + new_read = sim_paired_end( + n_of_reads, + insert_size, + genome_fasta, + chr, + chr_pos_start, + chr_pos_end, + read_length, + circle_number, + process, + ) get_seq = new_read.simulate_read() - simulated_reads = sim_paired_end.simulate_read_with_errors(new_read, get_seq[0], 
get_seq[1], - get_seq[2],ins_rate1,ins_rate2,del_rate1, - del_rate2,sim_pid) + simulated_reads = sim_paired_end.simulate_read_with_errors( + new_read, + get_seq[0], + get_seq[1], + get_seq[2], + ins_rate1, + ins_rate2, + del_rate1, + del_rate2, + sim_pid, + ) assert len(set_of_left_reads) == len(set_of_right_reads) set_of_left_reads = [simulated_reads[0]] set_of_right_reads = [simulated_reads[1]] @@ -217,32 +249,30 @@ def sim_ecc_reads(genome_fasta,read_length,directory,reads,exclude_regions,fastq n_of_reads += 1 n_of_reads_it = 1 - else: - - if (n_of_reads_it+1) != 10000: - - #sim the read + if (n_of_reads_it + 1) != 10000: + # sim the read get_seq = new_read.simulate_read() - #put it in fastq format - simulated_reads = sim_paired_end.simulate_perfect_read(new_read,get_seq[0], get_seq[1], get_seq[2]) - #save the read + # put it in fastq format + simulated_reads = sim_paired_end.simulate_perfect_read( + new_read, get_seq[0], get_seq[1], get_seq[2] + ) + # save the read set_of_left_reads.append(simulated_reads[0]) set_of_right_reads.append(simulated_reads[1]) - n_of_reads +=1 + n_of_reads += 1 n_of_reads_it += 1 - - else: - #simulate reads and save to disk + # simulate reads and save to disk get_seq = new_read.simulate_read() - simulated_reads = sim_paired_end.simulate_perfect_read(new_read, get_seq[0], get_seq[1], - get_seq[2]) + simulated_reads = sim_paired_end.simulate_perfect_read( + new_read, get_seq[0], get_seq[1], get_seq[2] + ) set_of_left_reads.append(simulated_reads[0]) set_of_right_reads.append(simulated_reads[1]) - #save to disk + # save to disk locker.acquire() assert len(set_of_left_reads) == len(set_of_right_reads) print("Process %s: writting to disk 10000 reads" % process) @@ -255,29 +285,33 @@ def sim_ecc_reads(genome_fasta,read_length,directory,reads,exclude_regions,fastq assert len(set_of_left_reads) == len(set_of_right_reads) locker.release() - - n_of_reads += 1 n_of_reads_it += 1 - #sim the first read of the list - new_read = sim_paired_end(n_of_reads, insert_size, genome_fasta, chr, chr_pos_start, - chr_pos_end, read_length, circle_number,process) + # sim the first read of the list + new_read = sim_paired_end( + n_of_reads, + insert_size, + genome_fasta, + chr, + chr_pos_start, + chr_pos_end, + read_length, + circle_number, + process, + ) get_seq = new_read.simulate_read() - simulated_reads = sim_paired_end.simulate_perfect_read(new_read, get_seq[0], get_seq[1], - get_seq[2]) + simulated_reads = sim_paired_end.simulate_perfect_read( + new_read, get_seq[0], get_seq[1], get_seq[2] + ) assert len(set_of_left_reads) == len(set_of_right_reads) set_of_left_reads = [simulated_reads[0]] set_of_right_reads = [simulated_reads[1]] assert len(set_of_left_reads) == len(set_of_right_reads) - n_of_reads +=1 + n_of_reads += 1 n_of_reads_it = 1 - - - - circle_bed.append(first_line) # last save to disk @@ -291,20 +325,25 @@ def sim_ecc_reads(genome_fasta,read_length,directory,reads,exclude_regions,fastq fastq_2.close() locker.release() - - - - #shared memory between the processes.This is a list that every process will rate the simulated circles + # shared memory between the processes.This is a list that every process will rate the simulated circles for element in circle_bed: sim_circles.append(element) - - class sim_paired_end: - - #init the class - def __init__(self,read_number,insert_size,genome_fa,chr,chr_pos_start,chr_pos_end,read_length,circle_id,process): + # init the class + def __init__( + self, + read_number, + insert_size, + genome_fa, + chr, + chr_pos_start, + chr_pos_end, + 
read_length, + circle_id, + process, + ): self.read_number = read_number self.insert_size = insert_size self.genome_fa = genome_fa @@ -346,11 +385,16 @@ def simulate_read(self): # assertion to check the error here common_id = "%s|%s|%s:%s-%s:%s|%s:%s|1|%s" % ( - self.read_number, self.chr, start, self.chr_pos_end, self.chr_pos_start, (self.chr_pos_start + right_dntps), right_start, - (right_start + self.read_length), self.circle_id) - - - + self.read_number, + self.chr, + start, + self.chr_pos_end, + self.chr_pos_start, + (self.chr_pos_start + right_dntps), + right_start, + (right_start + self.read_length), + self.circle_id, + ) else: if right_start > self.chr_pos_end: @@ -359,7 +403,14 @@ def simulate_read(self): right_start = self.chr_pos_start + (right_start - self.chr_pos_end) right_read = fastafile.fetch(self.chr, right_start, (right_start + self.read_length)) common_id = "%s|%s|%s:%s|%s:%s|3|%s" % ( - self.read_number, self.chr, start, (start + self.read_length), right_start, (right_start + self.read_length), self.circle_id) + self.read_number, + self.chr, + start, + (start + self.read_length), + right_start, + (right_start + self.read_length), + self.circle_id, + ) else: # right split read scenario assert right_start <= self.chr_pos_end @@ -373,9 +424,16 @@ def simulate_read(self): right_split_read = fastafile.fetch(self.chr, self.chr_pos_start, (self.chr_pos_start + right_dntps)) right_read = left_split_read + right_split_read common_id = "%s|%s|%s:%s|%s:%s-%s:%s|2|%s" % ( - self.read_number,self.chr, start, (start + self.read_length), right_start, self.chr_pos_end, self.chr_pos_start, - (self.chr_pos_start, right_dntps), self.circle_id) - + self.read_number, + self.chr, + start, + (start + self.read_length), + right_start, + self.chr_pos_end, + self.chr_pos_start, + (self.chr_pos_start, right_dntps), + self.circle_id, + ) else: # non split read scenario @@ -383,13 +441,18 @@ def simulate_read(self): # correct right read start right_read = fastafile.fetch(self.chr, right_start, (right_start + self.read_length)) common_id = "%s|%s|%s:%s|%s:%s|0|%s" % ( - self.read_number, self.chr, start, (start + self.read_length), right_start, (right_start + self.read_length), self.circle_id) - - return(right_read,left_read,common_id) - - - - def simulate_perfect_read(self,right_read,left_read,common_id): + self.read_number, + self.chr, + start, + (start + self.read_length), + right_start, + (right_start + self.read_length), + self.circle_id, + ) + + return (right_read, left_read, common_id) + + def simulate_perfect_read(self, right_read, left_read, common_id): # put all together # unique identifiers for right and left reads right_read_id = "2:N:0:CGCTGTG" @@ -405,7 +468,6 @@ def simulate_perfect_read(self,right_read,left_read,common_id): right_read = right_read.upper() - fastq_left = "@%s\n%s\n+\n%s\n" % (left_id, left_read, quality) fastq_right = "@%s\n%s\n+\n%s\n" % (right_id, right_read, quality) @@ -413,9 +475,17 @@ def simulate_perfect_read(self,right_read,left_read,common_id): left_record = SeqIO.read(StringIO(fastq_left), "fastq") return (left_record, right_record) - - def simulate_read_with_errors(self,right_read, left_read, common_id,ins_rate1,ins_rate2,del_rate1, - del_rate2,pid): + def simulate_read_with_errors( + self, + right_read, + left_read, + common_id, + ins_rate1, + ins_rate2, + del_rate1, + del_rate2, + pid, + ): # put all together # unique identifiers for right and left reads dir = os.getcwd() @@ -427,22 +497,30 @@ def simulate_read_with_errors(self,right_read, left_read, 
common_id,ins_rate1,in left_id = common_id + "space" + left_read_id # attemp to use art to simulate the quality scores and the error rate - #create a one read genome + # create a one read genome left_fasta = open("left_read_%s.fa" % (self.process), "w") left_fasta.write(">" + left_id + "\n" + str(left_read) + "\n") # sim the read with art left_fasta.close() - sp.call("art_illumina -q -na -ss HS25 -ir %s -ir2 %s -dr %s -dr2 %s -nf 0 -i left_read_%s.fa -l %s -f 1 -o left%s" % - (ins_rate1,ins_rate2,del_rate1,del_rate2,self.process,self.read_length,self.process), - shell=True,stdout=sp.DEVNULL, stderr=sp.STDOUT) - - - with open("left%s.fq" % (self.process), 'r') as left: - left_read = left.read().replace('space', ' ').replace('1:N:0:CGCTGTG-1', '1:N:0:CGCTGTG') - - - + sp.call( + "art_illumina -q -na -ss HS25 -ir %s -ir2 %s -dr %s -dr2 %s -nf 0 -i left_read_%s.fa -l %s -f 1 -o left%s" + % ( + ins_rate1, + ins_rate2, + del_rate1, + del_rate2, + self.process, + self.read_length, + self.process, + ), + shell=True, + stdout=sp.DEVNULL, + stderr=sp.STDOUT, + ) + + with open("left%s.fq" % (self.process), "r") as left: + left_read = left.read().replace("space", " ").replace("1:N:0:CGCTGTG-1", "1:N:0:CGCTGTG") # get the reverse complement of the right read right_read = Seq(right_read, generic_dna) @@ -453,33 +531,32 @@ def simulate_read_with_errors(self,right_read, left_read, common_id,ins_rate1,in right_fasta.close() # sim the read with art - sp.call("art_illumina -na -q -ss HS25 -ir %s -ir2 %s -dr %s -dr2 %s -nf 0 -i right_read_%s.fa -l %s -f 1 -o right%s" % - (ins_rate1,ins_rate2,del_rate1,del_rate2,self.process,self.read_length,self.process), - shell=True,stdout=sp.DEVNULL, stderr=sp.STDOUT) - - with open("right%s.fq" % (self.process), 'r') as right: - right_read = right.read().replace('space', ' ').replace('1:N:0:CGCTGTG-1', '2:N:0:CGCTGTG') - - #sometimes the reading fails. I introduce this to capture it + sp.call( + "art_illumina -na -q -ss HS25 -ir %s -ir2 %s -dr %s -dr2 %s -nf 0 -i right_read_%s.fa -l %s -f 1 -o right%s" + % ( + ins_rate1, + ins_rate2, + del_rate1, + del_rate2, + self.process, + self.read_length, + self.process, + ), + shell=True, + stdout=sp.DEVNULL, + stderr=sp.STDOUT, + ) + + with open("right%s.fq" % (self.process), "r") as right: + right_read = right.read().replace("space", " ").replace("1:N:0:CGCTGTG-1", "2:N:0:CGCTGTG") + + # sometimes the reading fails. I introduce this to capture it try: - right_record = SeqIO.read(StringIO(right_read), "fastq") left_record = SeqIO.read(StringIO(left_read), "fastq") os.chdir(dir) return (left_record, right_record) except ValueError as v: - - warnings.warn('Catched ValueError in a sampling round. Skipping') + warnings.warn("Catched ValueError in a sampling round. 
Skipping") os.chdir(dir) - return(None) - - - - - - - - - - - + return None diff --git a/bin/summarise_aa.py b/bin/summarise_aa.py index 5b4cf971..2680a5c2 100755 --- a/bin/summarise_aa.py +++ b/bin/summarise_aa.py @@ -35,12 +35,8 @@ df_summary.insert(loc=0, column="id", value=args.id) df_class = pd.read_table(args.class_file, sep="\t") -df_class["amplicon_number"] = df_class["amplicon_number"].str.replace( - "amplicon", "", regex=False -) -df_class = df_class.rename( - columns={"sample_name": "id", "amplicon_number": "AmpliconID"} -) +df_class["amplicon_number"] = df_class["amplicon_number"].str.replace("amplicon", "", regex=False) +df_class = df_class.rename(columns={"sample_name": "id", "amplicon_number": "AmpliconID"}) df_full = pd.merge(df_class, df_summary) df_full.to_csv(args.output, sep="\t", index=False) diff --git a/bin/utils.py b/bin/utils.py index 45492d52..260ca40a 100644 --- a/bin/utils.py +++ b/bin/utils.py @@ -1,24 +1,24 @@ -#MIT License +# MIT License # -#Copyright (c) 2019 Iñigo Prada Luengo +# Copyright (c) 2019 Iñigo Prada Luengo # -#Permission is hereby granted, free of charge, to any person obtaining a copy -#of this software and associated documentation files (the "Software"), to deal -#in the Software without restriction, including without limitation the rights -#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -#copies of the Software, and to permit persons to whom the Software is -#furnished to do so, subject to the following conditions: +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: # -#The above copyright notice and this permission notice shall be included in all -#copies or substantial portions of the Software. +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. # -#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -#SOFTWARE. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
import pysam as ps import pybedtools as bt @@ -40,29 +40,25 @@ import datetime - - - def is_soft_clipped(read): - """Function that checks the CIGAR string of the sam file and returns true if the read is soft-clipped""" # cigar 4 equals to S in pysam sam representation match = 0 for cigar in read.cigar: if cigar[0] == 4: - match +=1 + match += 1 else: pass if match > 0: - return(True) + return True else: - return(False) + return False -def is_hard_clipped(read): +def is_hard_clipped(read): """Function that checks the CIGAR string of the sam file and returns true if the read is hard-clipped""" # cigar 5 equals to H in pysam sam representation @@ -74,19 +70,19 @@ def is_hard_clipped(read): pass if match > 0: - return (True) + return True else: - return (False) + return False + def rightmost_from_read(read): """Function that takes as input a read a returns its rightmost mapping position""" rightmost = 0 - #matches, deletions and ref skip consume reference + # matches, deletions and ref skip consume reference for cigar in read.cigar: - if cigar[0] == 0: rightmost += cigar[1] @@ -96,44 +92,37 @@ def rightmost_from_read(read): elif cigar[0] == 3: rightmost += cigar[1] + return read.reference_start + rightmost - return(read.reference_start + rightmost) -def rightmost_from_sa(leftmost,sa_cigar): +def rightmost_from_sa(leftmost, sa_cigar): """Function that takes as input the leftmost position of a supplementary alignment and returns it rightmost mapping position""" + # the SA alignment is 1 based + rightmost = int(leftmost) - 1 - #the SA alignment is 1 based - rightmost = int(leftmost)-1 - - cigar = [''.join(g) for _, g in it.groupby(sa_cigar, str.isalpha)] + cigar = ["".join(g) for _, g in it.groupby(sa_cigar, str.isalpha)] # matches, deletions and ref skip consume reference - match_index = [x for x in range(len(cigar)) if cigar[x] == 'M'] - deletion_index = [x for x in range(len(cigar)) if cigar[x] == 'D'] - ambiguous_index = [x for x in range(len(cigar)) if cigar[x] == 'N'] - + match_index = [x for x in range(len(cigar)) if cigar[x] == "M"] + deletion_index = [x for x in range(len(cigar)) if cigar[x] == "D"] + ambiguous_index = [x for x in range(len(cigar)) if cigar[x] == "N"] for index in match_index: - rightmost += int(cigar[index-1]) + rightmost += int(cigar[index - 1]) for index in deletion_index: - rightmost += int(cigar[index-1]) - + rightmost += int(cigar[index - 1]) for index in ambiguous_index: - rightmost += int(cigar[index-1]) - - assert rightmost >= (int(leftmost)-1) - - return(rightmost) - + rightmost += int(cigar[index - 1]) + assert rightmost >= (int(leftmost) - 1) + return rightmost def aligned_bases(read): - """Function that counts the number of aligned bases from the CIGAR string and returns and integer""" aligned = 0 @@ -144,46 +133,43 @@ def aligned_bases(read): else: pass assert aligned >= 0 - return(aligned) + return aligned -def aligned_bases_from_sa(sa_cigar): +def aligned_bases_from_sa(sa_cigar): """Function that gets as input the SA tag CIGAR and reports the number of bases that where matched to the genome""" - cigar = [''.join(g) for _, g in it.groupby(sa_cigar, str.isalpha)] - + cigar = ["".join(g) for _, g in it.groupby(sa_cigar, str.isalpha)] - match_index = [x for x in range(len(cigar)) if cigar[x]=='M'] + match_index = [x for x in range(len(cigar)) if cigar[x] == "M"] aligned = 0 - #if only one hit + # if only one hit if type(match_index) == int: - aligned += int(cigar[match_index -1]) + aligned += int(cigar[match_index - 1]) - #when there are more than 1 hits + # 
when there are more than 1 hits else: assert type(match_index) == list for index in match_index: aligned += int(cigar[index - 1]) - assert aligned >=0 - return(aligned) + assert aligned >= 0 + return aligned def genome_alignment_from_cigar(sa_cigar): - """Function that gets as input the SA tag CIGAR and returns the length of the alignment interval in the genome it will look at the number of matches and deletions in the CIGAR, as they are the elements that will explain the genome alignment """ aligned = 0 - cigar = [''.join(g) for _, g in it.groupby(sa_cigar, str.isalpha)] - - #do it for the matches - match_index = [x for x in range(len(cigar)) if cigar[x]=='M'] + cigar = ["".join(g) for _, g in it.groupby(sa_cigar, str.isalpha)] + # do it for the matches + match_index = [x for x in range(len(cigar)) if cigar[x] == "M"] # if only one hit if type(match_index) == int: @@ -196,10 +182,8 @@ def genome_alignment_from_cigar(sa_cigar): for index in match_index: aligned += int(cigar[index - 1]) - - if 'D' in cigar == True: - - deletion_index = cigar.index('D') + if "D" in cigar == True: + deletion_index = cigar.index("D") # if only one hit if type(deletion_index) == int: @@ -212,87 +196,76 @@ def genome_alignment_from_cigar(sa_cigar): for index in deletion_index: aligned += int(cigar[index - 1]) - assert aligned >=0 - return(aligned) - - - + assert aligned >= 0 + return aligned -def bam_circ_sv_peaks(bam,input_bam_name,cores,verbose,pid,clusters): +def bam_circ_sv_peaks(bam, input_bam_name, cores, verbose, pid, clusters): """Function that takes as input a bam file and returns a merged bed file of the genome covered by the bam, it will create and index too""" # check bam header for sorting state + # check the header of the bam file for the sorting state, and sort if necessary - - #check the header of the bam file for the sorting state, and sort if necessary - - if 'HD' in bam.header: - if bam.header['HD']['SO'] == 'queryname': - print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"),"Bam is sorted by queryname, exiting") + if "HD" in bam.header: + if bam.header["HD"]["SO"] == "queryname": + print( + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"), + "Bam is sorted by queryname, exiting", + ) bam.close() sys.exit() - - - - - elif bam.header['HD']['SO'] == 'unsorted': - + elif bam.header["HD"]["SO"] == "unsorted": bam.close() print("Bam is unsorted, exiting") sys.exit() - - - - elif bam.header['HD']['SO'] == 'coordinate': - + elif bam.header["HD"]["SO"] == "coordinate": bam.close() # this handles Circle-Map bam2bam if input_bam_name != None: - sorted_bam = ps.AlignmentFile("%s" % input_bam_name) - - - - + sorted_bam = ps.AlignmentFile("%s" % input_bam_name) else: if verbose < 2: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:")) warnings.warn( - "WARNING: the bam file does not have an SO tag.\nCircle-Map cannot check if the bam file is sorted by coordinate.\n If the bam file is not sorted by coordinate the program will file") + "WARNING: the bam file does not have an SO tag.\nCircle-Map cannot check if the bam file is sorted by coordinate.\n If the bam file is not sorted by coordinate the program will file" + ) print( - "As sanity check, sort your bam file coordinate with the following command:\n\n\tsamtools sort -o output.bam input.bam") - + "As sanity check, sort your bam file coordinate with the following command:\n\n\tsamtools sort -o output.bam input.bam" + ) else: - if verbose < 2: print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:")) warnings.warn( - "WARNING: 
the bam file does not have an HD tag.\nCircle-Map cannot check if the bam file is sorted by coordinate.\n If the bam file is not sorted by coordinate the program will file") + "WARNING: the bam file does not have an HD tag.\nCircle-Map cannot check if the bam file is sorted by coordinate.\n If the bam file is not sorted by coordinate the program will file" + ) print( - "As sanity check, sort your bam file coordinate with the following command:\n\n\tsamtools sort -o output.bam input.bam") - + "As sanity check, sort your bam file coordinate with the following command:\n\n\tsamtools sort -o output.bam input.bam" + ) - #from bam to BedGraph + # from bam to BedGraph - sp.call("bedtools genomecov -bg -ibam %s | sort -T temp_files_%s -k 1,1 -k2,2n | mergeBed -d %s -c 4 -o mean | sort -r -n -k 4,4 > temp_files_%s/peaks.bed" % - (input_bam_name,pid,clusters,pid),shell=True) + sp.call( + "bedtools genomecov -bg -ibam %s | sort -T temp_files_%s -k 1,1 -k2,2n | mergeBed -d %s -c 4 -o mean | sort -r -n -k 4,4 > temp_files_%s/peaks.bed" + % (input_bam_name, pid, clusters, pid), + shell=True, + ) - #Decide number of chunks + # Decide number of chunks chunks = cores * 100 - #Create empty list + # Create empty list split_peaks = [] for i in range(0, chunks): split_peaks.append([]) - #put chunks in the list + # put chunks in the list counter = 0 for interval in bt.BedTool("temp_files_%s/peaks.bed" % pid): if counter == chunks: @@ -308,89 +281,87 @@ def bam_circ_sv_peaks(bam,input_bam_name,cores,verbose,pid,clusters): split_peaks[counter].append(splitted) else: split_peaks[counter].append(splitted) - counter +=1 + counter += 1 else: split_peaks[counter].append([interval.chrom, str(interval.start), str(interval.end)]) counter += 1 + return (sorted_bam, split_peaks) - return(sorted_bam,split_peaks) - - -def get_mate_intervals(sorted_bam,interval,mapq_cutoff,verbose,only_discordants): +def get_mate_intervals(sorted_bam, interval, mapq_cutoff, verbose, only_discordants): """Function that takes as input a sorted bam, an interval and the mapq cutoff and returns the mate alignment positions - (the realignment prior) intervals""" + (the realignment prior) intervals""" try: - - - candidate_mates = [] - for read in sorted_bam.fetch(interval['chrom'], int(interval['start']), int(interval['end']),multiple_iterators=True): - + for read in sorted_bam.fetch( + interval["chrom"], + int(interval["start"]), + int(interval["end"]), + multiple_iterators=True, + ): if read.mapq >= mapq_cutoff: - # create mate interval based on the soft-clipped SA alignments - if is_soft_clipped(read) == True and read.has_tag('SA'): + if is_soft_clipped(read) == True and read.has_tag("SA"): if only_discordants != True: - read_chr = sorted_bam.get_reference_name(read.reference_id) - suplementary = read.get_tag('SA') + suplementary = read.get_tag("SA") # [chr, left_most start, "strand,CIGAR,mapq, edit_distance] - supl_info = [x.strip() for x in suplementary.split(',')] + supl_info = [x.strip() for x in suplementary.split(",")] if read_chr == supl_info[0] and int(supl_info[4]) >= mapq_cutoff: - # split read with the same orientation - if (read.is_reverse == True and supl_info[2] == '-') or ( - read.is_reverse == False and supl_info[2] == '+'): - + if (read.is_reverse == True and supl_info[2] == "-") or ( + read.is_reverse == False and supl_info[2] == "+" + ): # SA is downstream, the interval is start, start+read length if read.reference_start > int(supl_info[1]): - ref_alignment_length = genome_alignment_from_cigar(supl_info[3]) # 
ref_alignment_length * 2 is done for extending the realignment region # "SA" means that the realignment prior has been generated by a supplementary alignment # L means that the SA is aligned to to a rightmost part. - mate_interval = [interval['chrom'], int(supl_info[1]) - (ref_alignment_length), - (int(supl_info[1]) + (ref_alignment_length)), "SA", "L",str( - 1-phred_to_prob(np.array(int(supl_info[4]),dtype=np.float64)))] + mate_interval = [ + interval["chrom"], + int(supl_info[1]) - (ref_alignment_length), + (int(supl_info[1]) + (ref_alignment_length)), + "SA", + "L", + str(1 - phred_to_prob(np.array(int(supl_info[4]), dtype=np.float64))), + ] candidate_mates.append(mate_interval) - # SA is upstream, the interval is end - read length, end elif read.reference_start < int(supl_info[1]): - ref_alignment_length = genome_alignment_from_cigar(supl_info[3]) # ref_alignment_length * 2 is done for extending the realignment region, "SA" means that the realignment prior has been generated # by a supplementary alignment. R means that the SA is aligned to to a rightmost part. - mate_interval = [interval['chrom'], (int(supl_info[1]) - (ref_alignment_length)), - int(supl_info[1]) + (ref_alignment_length), "SA", "R",str(1-phred_to_prob(np.array(int(supl_info[4]),dtype=np.float64)))] + mate_interval = [ + interval["chrom"], + (int(supl_info[1]) - (ref_alignment_length)), + int(supl_info[1]) + (ref_alignment_length), + "SA", + "R", + str(1 - phred_to_prob(np.array(int(supl_info[4]), dtype=np.float64))), + ] candidate_mates.append(mate_interval) else: pass - - - # check discordant reads (R2F1 orientation) elif read.is_unmapped == False and read.mate_is_unmapped == False: - # check R2F1 orientation,when the R2 read if read.is_reverse == True and read.mate_is_reverse == False: - # R2F1 order if read.reference_start < read.next_reference_start: - if read.reference_id == read.next_reference_id: # create mate interval read_length = read.infer_query_length() @@ -398,155 +369,149 @@ def get_mate_intervals(sorted_bam,interval,mapq_cutoff,verbose,only_discordants) # DR means that the realignment prior has been generated by the discordants. 
R means # that the mate has been aligned to a rightmost part - - - - - mate_interval = [interval['chrom'], read.next_reference_start, - (read.next_reference_start + read_length), "DR", - "R",str(1-phred_to_prob(np.array(read.get_tag('MQ'),dtype=np.float64)))] + mate_interval = [ + interval["chrom"], + read.next_reference_start, + (read.next_reference_start + read_length), + "DR", + "R", + str(1 - phred_to_prob(np.array(read.get_tag("MQ"), dtype=np.float64))), + ] candidate_mates.append(mate_interval) - # R2F1 when iterating trough F1 read elif read.is_reverse == False and read.mate_is_reverse == True: - if read.next_reference_start < read.reference_start: - if read.reference_id == read.next_reference_id: # create mate interval read_length = read.infer_query_length() # L means that the mate is aligned to a leftmost part - mate_interval = [interval['chrom'], read.next_reference_start, - (read.next_reference_start + read_length), - "DR", "L",str(1-phred_to_prob(np.array(read.get_tag('MQ'),dtype=np.float64)))] + mate_interval = [ + interval["chrom"], + read.next_reference_start, + (read.next_reference_start + read_length), + "DR", + "L", + str(1 - phred_to_prob(np.array(read.get_tag("MQ"), dtype=np.float64))), + ] candidate_mates.append(mate_interval) else: - if only_discordants != True: # soft clipped without and SA and hard clipped reads (secondary) - - if is_soft_clipped(read) == True and read.has_tag('SA') == False: + if is_soft_clipped(read) == True and read.has_tag("SA") == False: # mate interval is whole chromosome - if 'SQ' in sorted_bam.header: - - for reference in sorted_bam.header['SQ']: - - if reference['SN'] == sorted_bam.get_reference_name(read.reference_id): + if "SQ" in sorted_bam.header: + for reference in sorted_bam.header["SQ"]: + if reference["SN"] == sorted_bam.get_reference_name(read.reference_id): # LR is added just not to crash the program - mate_interval = [interval['chrom'], 1, reference['LN'], "SC", "LR",0] + mate_interval = [ + interval["chrom"], + 1, + reference["LN"], + "SC", + "LR", + 0, + ] candidate_mates.append(mate_interval) - else: - if verbose < 2: - warnings.warn( "WARNING: the bam file does not have a SQ tag. Circle-Map cannot check the reference length for realigning\n" - "soft clipped reads without a SA tag, hence, skipping. Please, check if your bam file is truncated") + "soft clipped reads without a SA tag, hence, skipping. 
Please, check if your bam file is truncated" + ) elif is_hard_clipped(read): - # all hard clipped reads have SA tag with bwa, but just as sanity - if read.has_tag('SA'): - + if read.has_tag("SA"): read_chr = sorted_bam.get_reference_name(read.reference_id) - suplementary = read.get_tag('SA') + suplementary = read.get_tag("SA") # [chr, left_most start, "strand,CIGAR,mapq, edit_distance] - supl_info = [x.strip() for x in suplementary.split(',')] + supl_info = [x.strip() for x in suplementary.split(",")] if read_chr == supl_info[0] and int(supl_info[4]) >= mapq_cutoff: - # SA alignment with the same orientation - if (read.is_reverse == True and supl_info[2] == '-') or ( - read.is_reverse == False and supl_info[2] == '+'): - + if (read.is_reverse == True and supl_info[2] == "-") or ( + read.is_reverse == False and supl_info[2] == "+" + ): # SA is downstream, the interval is start, start+read length if read.reference_start > int(supl_info[1]): - ref_alignment_length = genome_alignment_from_cigar(supl_info[3]) # ref_alignment_length * 2 is done for extending the realignment region # "SA" means that the realignment prior has been generated by a supplementary alignment # L means that the SA is in a downstream region - mate_interval = [interval['chrom'], int(supl_info[1]) - (ref_alignment_length * 2), - (int(supl_info[1]) + (ref_alignment_length * 2)), "SA", "L",str(1-phred_to_prob(int(supl_info[4])))] + mate_interval = [ + interval["chrom"], + int(supl_info[1]) - (ref_alignment_length * 2), + (int(supl_info[1]) + (ref_alignment_length * 2)), + "SA", + "L", + str(1 - phred_to_prob(int(supl_info[4]))), + ] candidate_mates.append(mate_interval) - # SA is upstream, the interval is end - read length, end elif read.reference_start < int(supl_info[1]): - ref_alignment_length = genome_alignment_from_cigar(supl_info[3]) # ref_alignment_length * 2 is done for extending the realignment region, "SA" means that the realignment prior has been generated # by a supplementary alignment # R means that the SA is in a upstream region - mate_interval = [interval['chrom'], - (int(supl_info[1]) - (ref_alignment_length * 2)), - int(supl_info[1]) + (ref_alignment_length * 2), "SA", "R",str(1-phred_to_prob(int(supl_info[4])))] - - + mate_interval = [ + interval["chrom"], + (int(supl_info[1]) - (ref_alignment_length * 2)), + int(supl_info[1]) + (ref_alignment_length * 2), + "SA", + "R", + str(1 - phred_to_prob(int(supl_info[4]))), + ] candidate_mates.append(mate_interval) else: pass - - else: # low mapping quality reads, do nothing pass - #this function should return the candidate mates (realignment prior, discordant intervals/split read intervals and soft-clipped reads) - return(candidate_mates) + # this function should return the candidate mates (realignment prior, discordant intervals/split read intervals and soft-clipped reads) + return candidate_mates except BaseException as e: - warnings.warn( - "WARNING: Could not get mate interval priors for the interval %s due to the following error %s \n Skipping interval" % (str(interval),str(e))) - - - - - - + "WARNING: Could not get mate interval priors for the interval %s due to the following error %s \n Skipping interval" + % (str(interval), str(e)) + ) - - - -def insert_size_dist(sample_size,mapq_cutoff,qname_bam): +def insert_size_dist(sample_size, mapq_cutoff, qname_bam): """Function that takes as input a queryname sorted bam and computes the mean insert a size and the standard deviation from. 
This number is computed from the F1R2 read with a user defined sample size, using a user defined mapping quality cutoff in both reads""" - whole_bam = ps.AlignmentFile(qname_bam, "rb") counter = 0 insert_length = [] - read1 = '' + read1 = "" # this is similar to the code of read extractor. I save the first read in memory and then I operate # in both reads together for read in whole_bam: - - if read.is_read1: read1 = read # Checks for initialization read1 = '' by looking for read1 type == string. @@ -572,206 +537,275 @@ def insert_size_dist(sample_size,mapq_cutoff,qname_bam): else: pass - mean = np.mean(insert_length) std = np.std(insert_length) - return(mean, std) + return (mean, std) -def normalize_probability_matrix(pandas_df): - return(pandas_df) -def get_realignment_intervals(bed_prior,interval_extension,interval_p_cutoff,verbose): +def normalize_probability_matrix(pandas_df): + return pandas_df +def get_realignment_intervals(bed_prior, interval_extension, interval_p_cutoff, verbose): """Function that takes as input a bed file with the read type information and will remove the soft-clipped if there - are more informative priors (DR,SA). If there are only soft-clipped reads, they will be saved to a bed file to attemp - lonely soft-clipped read rescue""" + are more informative priors (DR,SA). If there are only soft-clipped reads, they will be saved to a bed file to attempt + lonely soft-clipped read rescue""" try: - - labels = ['chrom', 'start', 'end', 'read_type', 'orientation','probability'] + labels = ["chrom", "start", "end", "read_type", "orientation", "probability"] candidate_mates_dataframe = pd.DataFrame.from_records(bed_prior, columns=labels) read_types = candidate_mates_dataframe.read_type.unique() orientation = candidate_mates_dataframe.orientation.unique() - - - - - #this contains the sumatory over all probabilities + # this contains the summation over all probabilities sum = 0 - - if np.any(read_types == 'SC') == False: - - # nothing. 
Sort and merge - - candidate_mates_dataframe = candidate_mates_dataframe.sort_values(by=['chrom', 'start','end'],ascending=[True,True,True]) - candidate_mates_dataframe['probability'] = candidate_mates_dataframe.probability.astype(float) - - candidate_mates = candidate_mates_dataframe.groupby((candidate_mates_dataframe.end.shift()-candidate_mates_dataframe.start).lt(0).cumsum()).agg({'chrom':'first','start':'first','end':'last','probability':'sum'}) + candidate_mates_dataframe = candidate_mates_dataframe.sort_values( + by=["chrom", "start", "end"], ascending=[True, True, True] + ) + candidate_mates_dataframe["probability"] = candidate_mates_dataframe.probability.astype(float) + + candidate_mates = candidate_mates_dataframe.groupby( + (candidate_mates_dataframe.end.shift() - candidate_mates_dataframe.start).lt(0).cumsum() + ).agg( + { + "chrom": "first", + "start": "first", + "end": "last", + "probability": "sum", + } + ) sum = np.sum(float(x[3]) for index, x in candidate_mates.iterrows()) - candidate_mates['probability'] = candidate_mates['probability'] / sum - - - elif np.any(read_types == 'SC') == True and (np.any(read_types == 'DR') == True or np.any(read_types == 'SA') == True): - #remove lines with sc - - candidate_mates_no_sc = candidate_mates_dataframe.drop(candidate_mates_dataframe[candidate_mates_dataframe.read_type == 'SC'].index) - candidate_mates_dataframe = candidate_mates_no_sc.sort_values(by=['chrom', 'start', 'end'],ascending=[True, True, True]) - candidate_mates_dataframe['probability'] = candidate_mates_dataframe.probability.astype(float) - - - candidate_mates = candidate_mates_dataframe.groupby((candidate_mates_dataframe.end.shift()-candidate_mates_dataframe.start).lt(0).cumsum()).agg({'chrom':'first','start':'first','end':'last','probability':'sum'}) - - - sum = np.sum(float(x[3]) for index,x in candidate_mates.iterrows()) - candidate_mates['probability'] = candidate_mates['probability']/sum - + candidate_mates["probability"] = candidate_mates["probability"] / sum + + elif np.any(read_types == "SC") == True and ( + np.any(read_types == "DR") == True or np.any(read_types == "SA") == True + ): + # remove lines with sc + + candidate_mates_no_sc = candidate_mates_dataframe.drop( + candidate_mates_dataframe[candidate_mates_dataframe.read_type == "SC"].index + ) + candidate_mates_dataframe = candidate_mates_no_sc.sort_values( + by=["chrom", "start", "end"], ascending=[True, True, True] + ) + candidate_mates_dataframe["probability"] = candidate_mates_dataframe.probability.astype(float) + + candidate_mates = candidate_mates_dataframe.groupby( + (candidate_mates_dataframe.end.shift() - candidate_mates_dataframe.start).lt(0).cumsum() + ).agg( + { + "chrom": "first", + "start": "first", + "end": "last", + "probability": "sum", + } + ) + sum = np.sum(float(x[3]) for index, x in candidate_mates.iterrows()) + candidate_mates["probability"] = candidate_mates["probability"] / sum else: - #only soft clipped + # only soft clipped - return(None) + return None extended = [] - - - - #if argmax is turn on interval_p is 0 + # if argmax is turn on interval_p is 0 if interval_p_cutoff == 0: - #argmax(probability) - - candidate_mates = candidate_mates.loc[candidate_mates['probability'] == candidate_mates['probability'].max()] - - for item,row in candidate_mates.iterrows(): + # argmax(probability) - if ('LR' in orientation) or ('L' and 'R' in orientation): + candidate_mates = candidate_mates.loc[ + candidate_mates["probability"] == candidate_mates["probability"].max() + ] + for item, row in 
candidate_mates.iterrows(): + if ("LR" in orientation) or ("L" and "R" in orientation): + start = row["start"] - interval_extension - start = row['start'] - interval_extension - - end = row['end'] + interval_extension + end = row["end"] + interval_extension if start < 0: - extended.append([row['chrom'], str(0), int(round(end)),float(row['probability'])]) + extended.append( + [ + row["chrom"], + str(0), + int(round(end)), + float(row["probability"]), + ] + ) else: - extended.append([row['chrom'], int(round(start)), int(round(end)),float(row['probability'])]) - - elif 'L' in orientation: - - start = row['start'] - interval_extension + extended.append( + [ + row["chrom"], + int(round(start)), + int(round(end)), + float(row["probability"]), + ] + ) + + elif "L" in orientation: + start = row["start"] - interval_extension if start < 0: - extended.append([row['chrom'], str(0), row['end'],float(row['probability'])]) + extended.append( + [ + row["chrom"], + str(0), + row["end"], + float(row["probability"]), + ] + ) else: - extended.append([row['chrom'], int(round(start)), row['end'],float(row['probability'])]) - - elif 'R' in orientation: - - end = row['end'] + interval_extension - - extended.append([row['chrom'], row['start'], int(round(end)),float(row['probability'])]) - - return (pd.DataFrame.from_records(extended, columns=['chrom', 'start', 'end','probability'])) + extended.append( + [ + row["chrom"], + int(round(start)), + row["end"], + float(row["probability"]), + ] + ) + + elif "R" in orientation: + end = row["end"] + interval_extension + + extended.append( + [ + row["chrom"], + row["start"], + int(round(end)), + float(row["probability"]), + ] + ) + + return pd.DataFrame.from_records(extended, columns=["chrom", "start", "end", "probability"]) else: + for index, interval in candidate_mates.iterrows(): + # small pseudocount to denominator to avoid div by zero - for index,interval in candidate_mates.iterrows(): - - #small pseudocount to denominator to avoid div by zero - - - if interval['probability'] >= interval_p_cutoff: - - if ('LR' in orientation) or ('L' and 'R' in orientation): - + if interval["probability"] >= interval_p_cutoff: + if ("LR" in orientation) or ("L" and "R" in orientation): + start = interval["start"] - interval_extension - start = interval['start'] - interval_extension - - end = interval['end'] + interval_extension + end = interval["end"] + interval_extension if start < 0: - extended.append([interval['chrom'], str(0), int(round(end)),float(interval['probability'])]) + extended.append( + [ + interval["chrom"], + str(0), + int(round(end)), + float(interval["probability"]), + ] + ) else: - extended.append([interval['chrom'], int(round(start)), int(round(end)),float(interval['probability'])]) - - elif 'L' in orientation: - - start = interval['start'] - interval_extension + extended.append( + [ + interval["chrom"], + int(round(start)), + int(round(end)), + float(interval["probability"]), + ] + ) + + elif "L" in orientation: + start = interval["start"] - interval_extension if start < 0: - extended.append([interval['chrom'], str(0), interval['end'],float(interval['probability'])]) + extended.append( + [ + interval["chrom"], + str(0), + interval["end"], + float(interval["probability"]), + ] + ) else: - extended.append([interval['chrom'], int(round(start)), interval['end'],float(interval['probability'])]) - - elif 'R' in orientation: - - end = interval['end'] + interval_extension - - extended.append([interval['chrom'], interval['start'], 
int(round(end)),float(interval['probability'])]) - - - return(pd.DataFrame.from_records(extended,columns=['chrom','start','end','probability']).sort_values(by=['probability'],ascending=[False])) - + extended.append( + [ + interval["chrom"], + int(round(start)), + interval["end"], + float(interval["probability"]), + ] + ) + + elif "R" in orientation: + end = interval["end"] + interval_extension + + extended.append( + [ + interval["chrom"], + interval["start"], + int(round(end)), + float(interval["probability"]), + ] + ) + + return pd.DataFrame.from_records(extended, columns=["chrom", "start", "end", "probability"]).sort_values( + by=["probability"], ascending=[False] + ) except BaseException as e: - - warnings.warn( - "WARNING: Could not compute the probability for the mate interval priors %s due to the following error %s \n Skipping intervals" % ( - str(bed_prior), str(e))) + "WARNING: Could not compute the probability for the mate interval priors %s due to the following error %s \n Skipping intervals" + % (str(bed_prior), str(e)) + ) - - -def circle_from_SA(read,mapq_cutoff,mate_interval): - +def circle_from_SA(read, mapq_cutoff, mate_interval): """Function that takes as input a read (soft-clipped) with a Suplementary alignment the mapping quality cut-off and the mate intervals and checks if it fits the conditions to call a circle. Will return True if the supplementary alignment matches the interval""" - suplementary = read.get_tag('SA') + suplementary = read.get_tag("SA") - #this list will have the following information [chr,left_most start,"strand,CIGAR,mapq, edit_distance] + # this list will have the following information [chr,left_most start,"strand,CIGAR,mapq, edit_distance] - supl_info = [x.strip() for x in suplementary.split(',')] + supl_info = [x.strip() for x in suplementary.split(",")] - #mapq filter + # mapq filter if int(supl_info[4]) > mapq_cutoff: - #chromosome filter - if supl_info[0] == mate_interval['chrom']: - #aligned to the mate interval - if int(mate_interval['start']) < int(supl_info[1]) < int(mate_interval['end']): - - #orientation - if read.is_reverse == True and supl_info[2] == '-': - return{'support' : True, 'leftmost': int(supl_info[1]), 'cigar' : supl_info[3]} - - elif read.is_reverse == False and supl_info[2] == '+': - - return{'support' : True, 'leftmost' : int(supl_info[1]), 'cigar' : supl_info[3]} + # chromosome filter + if supl_info[0] == mate_interval["chrom"]: + # aligned to the mate interval + if int(mate_interval["start"]) < int(supl_info[1]) < int(mate_interval["end"]): + # orientation + if read.is_reverse == True and supl_info[2] == "-": + return { + "support": True, + "leftmost": int(supl_info[1]), + "cigar": supl_info[3], + } + + elif read.is_reverse == False and supl_info[2] == "+": + return { + "support": True, + "leftmost": int(supl_info[1]), + "cigar": supl_info[3], + } else: - - return{'support' : False} + return {"support": False} else: - - return{'support' : False} + return {"support": False} else: - return{'support' : False} + return {"support": False} + + @jit(nopython=True) def number_encoding(seq): """Function that takes as input a DNA sequence, and encodes the sequence to numbers, so that it can be accelerated @@ -786,7 +820,7 @@ def number_encoding(seq): encoded.append(3) elif i == "G": encoded.append(4) - return(np.array(encoded)) + return np.array(encoded) def check_alphabet(sequence): @@ -797,28 +831,28 @@ def check_alphabet(sequence): for base in sequence: if base in code: - return(True) - return(False) + return True + return False 
+ -def check_compatibility(seq1,seq2): +def check_compatibility(seq1, seq2): """Function that takes as input two DNA sequence and checks whether their alphabets have at least one element in common. This due to an old bug in edlib""" for base in seq1: - for base2 in seq2: - if base == base2: + return True - return(True) + return False - return(False) @jit(nopython=True) def phred_to_prob(values): """Function that takes as input a numpy array with phred base quality scores and returns an array with base probabi- lity scores""" - return(10**((values*-1)/10)) + return 10 ** ((values * -1) / 10) + def get_longest_soft_clipped_bases(read): """Function that takes as input the cigar string and returns a dictionary containing the longest soft-clipped part of @@ -826,344 +860,327 @@ def get_longest_soft_clipped_bases(read): read_cigar = read.cigar - - #get index of the soft-clipped in the cigar + # get index of the soft-clipped in the cigar match_index = [x for x in range(len(read_cigar)) if read_cigar[x][0] == 4] - # soft-clipped in only one side if len(match_index) == 1: - - #return first n soft-clipped + # return first n soft-clipped if match_index == [0]: - return{'seq': read.seq[0:read_cigar[0][1]],'qual': read.query_qualities[0:read_cigar[0][1]],'mapq':read.mapq} - - #return last n nucleotides - elif match_index[0] == (len(read_cigar)-1): - - return {'seq':read.seq[-read_cigar[match_index[0]][1]:], - 'qual':read.query_qualities[-read_cigar[match_index[0]][1]:],'mapq':read.mapq} - - - - - - - #soft-clipped in both sides of the read + return { + "seq": read.seq[0 : read_cigar[0][1]], + "qual": read.query_qualities[0 : read_cigar[0][1]], + "mapq": read.mapq, + } + + # return last n nucleotides + elif match_index[0] == (len(read_cigar) - 1): + return { + "seq": read.seq[-read_cigar[match_index[0]][1] :], + "qual": read.query_qualities[-read_cigar[match_index[0]][1] :], + "mapq": read.mapq, + } + + # soft-clipped in both sides of the read else: - - #make sure that is soft-clipped on both sides + # make sure that is soft-clipped on both sides try: - assert read_cigar[0][0] == 4 and read_cigar[-1][0] == 4 # longest soft-clipped are first n nucleotides if read_cigar[0][1] >= read_cigar[-1][1]: - - - - return {'seq': read.seq[0:read_cigar[0][1]],'qual': read.query_qualities[0:read_cigar[0][1]], - 'mapq':read.mapq} + return { + "seq": read.seq[0 : read_cigar[0][1]], + "qual": read.query_qualities[0 : read_cigar[0][1]], + "mapq": read.mapq, + } else: - - return{'seq':read.seq[-read_cigar[-1][1]:],'qual': read.query_qualities[-read_cigar[-1][1]:], - 'mapq': read.mapq} + return { + "seq": read.seq[-read_cigar[-1][1] :], + "qual": read.query_qualities[-read_cigar[-1][1] :], + "mapq": read.mapq, + } except AssertionError as e: - print(e) + def background_freqs(seq): """Function that takes as input the sequence of the nucletide frequencies in the realignment interval""" - return{nucleotide: seq.count(nucleotide)/len(seq) for nucleotide in 'ATCG'} - - - - - -def realign(read,n_hits,plus_strand,minus_strand,plus_base_freqs,minus_base_freqs,gap_open,gap_extend,verbose,max_edit): - - + # return{nucleotide: seq.count(nucleotide)/len(seq) for nucleotide in 'ATCG'} + return {nucleotide: max(1, seq.count(nucleotide)) / len(seq) for nucleotide in "ATCG"} + + +def realign( + read, + n_hits, + plus_strand, + minus_strand, + plus_base_freqs, + minus_base_freqs, + gap_open, + gap_extend, + verbose, + max_edit, +): """Function that takes as input a read, the number of hits to find and the plus and minus strand and will 
return the number of hits, the sequencing qualities for that read and the g+c content of the realignment interval""" - - #get soft-clipped read + # get soft-clipped read soft_clipped_read = get_longest_soft_clipped_bases(read) - #encoding of DNA and operations A,T,C,G,=,X,DI. THis is done for Numba - nuc_and_ops = np.array([1,2,3,4,5,6,7]) - encoded_nucs = number_encoding(soft_clipped_read['seq']) + # encoding of DNA and operations A,T,C,G,=,X,DI. THis is done for Numba + nuc_and_ops = np.array([1, 2, 3, 4, 5, 6, 7]) + encoded_nucs = number_encoding(soft_clipped_read["seq"]) hits = 0 - min_score = len(soft_clipped_read['seq']) - + min_score = len(soft_clipped_read["seq"]) top_hits = {} - if read.is_reverse: - while hits < n_hits and min_score >= -10: + alignment = edlib.align(soft_clipped_read["seq"], minus_strand, mode="HW", task="path") + if hits == 0: + if alignment["editDistance"] > max_edit: + return None - alignment = edlib.align(soft_clipped_read['seq'], minus_strand, mode='HW', task='path') - if hits ==0: - if alignment['editDistance'] > max_edit: - return(None) - + for location in alignment["locations"]: + mask_bases = "X" * (location[1] - location[0]) - - for location in alignment['locations']: - - - - mask_bases = 'X' * ( location[1] - location[0]) - - - minus_strand = minus_strand[:location[0]] + mask_bases + minus_strand[location[1]:] + minus_strand = minus_strand[: location[0]] + mask_bases + minus_strand[location[1] :] hits += 1 - - score = pssm(phred_to_prob(np.array(soft_clipped_read['qual'],dtype=np.float64)), encoded_nucs, - edlib_cigar_to_iterable(alignment['cigar']),minus_base_freqs,gap_open,gap_extend,nuc_and_ops,verbose) + score = pssm( + phred_to_prob(np.array(soft_clipped_read["qual"], dtype=np.float64)), + encoded_nucs, + edlib_cigar_to_iterable(alignment["cigar"]), + minus_base_freqs, + gap_open, + gap_extend, + nuc_and_ops, + verbose, + ) if score < min_score: min_score = score - - top_hits[hits] = (location,alignment['cigar'],score,alignment['editDistance'],"-") + top_hits[hits] = ( + location, + alignment["cigar"], + score, + alignment["editDistance"], + "-", + ) else: # the search was exaustive - hits +=n_hits + hits += n_hits else: - #min socre stops the search if the score is orders of magnitude smaller that the top score given the edit - #distance + # min socre stops the search if the score is orders of magnitude smaller that the top score given the edit + # distance while hits < n_hits and min_score >= -10: + alignment = edlib.align(soft_clipped_read["seq"], plus_strand, mode="HW", task="path") + # stop search if edit distance is to high + if hits == 0: + if alignment["editDistance"] > max_edit: + return None + for location in alignment["locations"]: + mask_bases = "X" * (location[1] - location[0]) - - alignment = edlib.align(soft_clipped_read['seq'], plus_strand, mode='HW', task='path') - #stop search if edit distance is to high - if hits ==0: - if alignment['editDistance'] > max_edit: - return (None) - - - for location in alignment['locations']: - - mask_bases = 'X' * ( location[1] - location[0]) - - plus_strand = plus_strand[:location[0]] + mask_bases + plus_strand[location[1]:] + plus_strand = plus_strand[: location[0]] + mask_bases + plus_strand[location[1] :] hits += 1 - score = pssm(phred_to_prob(np.array(soft_clipped_read['qual'],dtype=np.float64)), encoded_nucs, - edlib_cigar_to_iterable(alignment['cigar']), plus_base_freqs,gap_open,gap_extend,nuc_and_ops,verbose) + score = pssm( + phred_to_prob(np.array(soft_clipped_read["qual"], 
dtype=np.float64)), + encoded_nucs, + edlib_cigar_to_iterable(alignment["cigar"]), + plus_base_freqs, + gap_open, + gap_extend, + nuc_and_ops, + verbose, + ) if score < min_score: min_score = score - top_hits[hits] = (location,alignment['cigar'],score,alignment['editDistance'],"+") + top_hits[hits] = ( + location, + alignment["cigar"], + score, + alignment["editDistance"], + "+", + ) else: + hits += n_hits - hits +=n_hits - - - - - return({'alignments':top_hits,'mapq_prior': soft_clipped_read['mapq']}) + return {"alignments": top_hits, "mapq_prior": soft_clipped_read["mapq"]} def edlib_cigar_to_iterable(edlib_cigar): """Function that takes as input the edlib cigar and parses it to get it in a iterable manner""" - #encoding of DNA and operations A,T,C,G,=,X,ID - #nuc_and_ops = np.array([1,2,3,4,5,6,7]) + # encoding of DNA and operations A,T,C,G,=,X,ID + # nuc_and_ops = np.array([1,2,3,4,5,6,7]) length = [] operations = [] - for i in re.findall(r'\d+[IDX=]',edlib_cigar): + for i in re.findall(r"\d+[IDX=]", edlib_cigar): length.append(int(i[0])) - if i[1] == '=': + if i[1] == "=": operations.append(5) - elif i[1] == 'X': + elif i[1] == "X": operations.append(6) - elif i[1] == 'I' or 'D': + elif i[1] == "I" or "D": operations.append(7) - - return(np.array(length),np.array(operations)) + return (np.array(length), np.array(operations)) @jit(nopython=True) -def pssm(seq_prob,seq_nucl,iterable_cigar,base_freqs,gap_open,gap_extend,nuc_and_ops,verbose): +def pssm( + seq_prob, + seq_nucl, + iterable_cigar, + base_freqs, + gap_open, + gap_extend, + nuc_and_ops, + verbose, +): """Function that takes as input the sequencing probabilities and cigar string and returns the log2 pssm of the read""" - - - - - #start positon to operate in the pssm. This is done to iterate over the operations in the cigar, and keep track of + # start positon to operate in the pssm. 
This is done to iterate over the operations in the cigar, and keep track of # were I am in the seq and quality values seq_pos = 0 indel_penalty = 0 - - - #iterate trough CIGAR operations - for index in range(0,len(iterable_cigar[0])): - + # iterate trough CIGAR operations + for index in range(0, len(iterable_cigar[0])): operation_length = iterable_cigar[0][index] end = operation_length + seq_pos - - operation = iterable_cigar[1][index] - - #match, 1 minus prob(base called wrong) + # match, 1 minus prob(base called wrong) if operation == nuc_and_ops[4]: - - for nucleotide in range(seq_pos,end): - + for nucleotide in range(seq_pos, end): if seq_nucl[nucleotide] == nuc_and_ops[0]: - - seq_prob[nucleotide] = np.log2((1 - (seq_prob[nucleotide]))/base_freqs[0]) + seq_prob[nucleotide] = np.log2((1 - (seq_prob[nucleotide])) / base_freqs[0]) elif seq_nucl[nucleotide] == nuc_and_ops[1]: - seq_prob[nucleotide] = np.log2((1 - (seq_prob[nucleotide])) / base_freqs[1]) elif seq_nucl[nucleotide] == nuc_and_ops[2]: - seq_prob[nucleotide] = np.log2((1 - (seq_prob[nucleotide])) / base_freqs[2]) elif seq_nucl[nucleotide] == nuc_and_ops[3]: - seq_prob[nucleotide] = np.log2((1 - (seq_prob[nucleotide])) / base_freqs[3]) - - seq_pos += operation_length - - elif operation == nuc_and_ops[5]: - - - for nucleotide in range(seq_pos,end): - + for nucleotide in range(seq_pos, end): if seq_nucl[nucleotide] == nuc_and_ops[0]: - - seq_prob[nucleotide] = np.log2( - (seq_prob[nucleotide]/3)/base_freqs[0]) - + seq_prob[nucleotide] = np.log2((seq_prob[nucleotide] / 3) / base_freqs[0]) elif seq_nucl[nucleotide] == nuc_and_ops[1]: - - seq_prob[nucleotide] = np.log2( - (seq_prob[nucleotide]/3)/base_freqs[1]) + seq_prob[nucleotide] = np.log2((seq_prob[nucleotide] / 3) / base_freqs[1]) elif seq_nucl[nucleotide] == nuc_and_ops[2]: - - seq_prob[nucleotide] = np.log2( - (seq_prob[nucleotide]/3)/base_freqs[2]) - + seq_prob[nucleotide] = np.log2((seq_prob[nucleotide] / 3) / base_freqs[2]) elif seq_nucl[nucleotide] == nuc_and_ops[3]: - - seq_prob[nucleotide] = np.log2( - (seq_prob[nucleotide]/3)/base_freqs[3]) - - - + seq_prob[nucleotide] = np.log2((seq_prob[nucleotide] / 3) / base_freqs[3]) elif seq_nucl[nucleotide] == nuc_and_ops[6]: - if verbose < 2: - seq_prob[nucleotide] = 0 - print("Warning:Ambiguous base found in nucleotide sequence. Assigning score of 0 in the log2 pssm") + print( + "Warning:Ambiguous base found in nucleotide sequence. 
Assigning score of 0 in the log2 pssm" + ) seq_pos += operation_length - elif operation == nuc_and_ops[6]: + # affine gap scoring model + indel_penalty += gap_open + gap_extend * (operation_length - 1) - #affine gap scoring model - indel_penalty += gap_open + gap_extend*(operation_length-1) - + return np.sum(seq_prob) - indel_penalty - return(np.sum(seq_prob)-indel_penalty) - -def realignment_probability(hit_dict,interval_length): +def realignment_probability(hit_dict, interval_length): """Function that takes as input the realignment dictionary and returns the alignment probability of the best hit""" + best_hit = hit_dict["alignments"][1][2] - best_hit = hit_dict['alignments'][1][2] - - #this might be included on the denominator + # this might be included on the denominator try: - posterior = 2**best_hit/(np.sum((2**value[2]) for key,value in hit_dict['alignments'].items())) + posterior = 2**best_hit / (np.sum((2 ** value[2]) for key, value in hit_dict["alignments"].items())) except ZeroDivisionError as e: print(e) - warnings.warn("ZeroDivisionError caught while computing the realignment posterior probability." - "Setting posterior probability to 0") + warnings.warn( + "ZeroDivisionError caught while computing the realignment posterior probability." + "Setting posterior probability to 0" + ) posterior = 0 - return(posterior) - - - - - + return posterior -def fraction(start1,start2,end1,end2,read1,read2): +def fraction(start1, start2, end1, end2, read1, read2): """Function that performs a first round of merging. If the realigned intervals and SA intervals overlap, and are ca- lled within the same iteration (which means that it is the same circle probably) they will be merged""" - #check that they come from the same read - read_match = (read1 == read2)*1 + # check that they come from the same read + read_match = (read1 == read2) * 1 + # calculate distance between the two intervals + distance = abs(start1 - start2) + abs(end1 - end2) - #calculate distance between the two intervals - distance = (abs(start1-start2) + abs(end1-end2)) + # overlap of interval 1 on interval 2 + one_overlap_two = 1 - (distance / (end1 - start1)) + # overlap of interval two on interval 1 + two_overlap_one = 1 - (distance / (end2 - start2)) - #overlap of interval 1 on interval 2 - one_overlap_two = 1 - (distance/(end1-start1)) - #overlap of interval two on interval 1 - two_overlap_one = 1 - (distance/(end2-start2)) + return one_overlap_two + two_overlap_one + read_match - return(one_overlap_two + two_overlap_one + read_match) - - -def merge_fraction(chrom1,x1,x2,chrom2,y1,y2): +def merge_fraction(chrom1, x1, x2, chrom2, y1, y2): """compute overlap (reciprocal) of the interval y over interval x""" - distance = (np.minimum(x2.values,y2.values) - np.maximum(x1.values,y1.values)) - - - - one_overlap_two = distance/(y2.values-y1.values) + distance = np.minimum(x2.values, y2.values) - np.maximum(x1.values, y1.values) - two_overlap_one = distance/(x2.values-x1.values) + one_overlap_two = distance / (y2.values - y1.values) + two_overlap_one = distance / (x2.values - x1.values) # check if they are on the same chromosome and the amount of overlap if so - return(pd.Series(chrom1 == chrom2) + pd.Series(two_overlap_one.clip(0)) + pd.Series(one_overlap_two.clip(0))) - - -def iteration_merge(only_discordants,results,fraction,splits,score,sc_len,bam,af,insert,std,n_discordant): + return pd.Series(chrom1 == chrom2) + pd.Series(two_overlap_one.clip(0)) + pd.Series(one_overlap_two.clip(0)) + + +def iteration_merge( + only_discordants, + 
results, + fraction, + splits, + score, + sc_len, + bam, + af, + insert, + std, + n_discordant, +): """finction that merges the results of every iteration and filters the data by allele frequency""" norm_fraction = 3 @@ -1173,32 +1190,39 @@ def iteration_merge(only_discordants,results,fraction,splits,score,sc_len,bam,af interval.append(0) parsed_discordants.append(interval) - discordant_bed = bt.BedTool(parsed_discordants) - - - - - unparsed_pd = pd.DataFrame.from_records(results, - columns=['chrom', 'start', 'end', 'read', 'iteration','score', 'discordants']) - - unparsed_pd = unparsed_pd.sort_values(['iteration','chrom','start','end']).reset_index() - - - grouped = unparsed_pd.groupby(merge_fraction(unparsed_pd.iteration.shift(), unparsed_pd.start.shift(), - unparsed_pd.end.shift(), unparsed_pd.iteration, - unparsed_pd.start, - unparsed_pd.end).lt(norm_fraction).cumsum()).agg( - {'chrom': 'first', 'start': 'min', 'end': 'max', 'discordants': 'max', 'read': 'sum','score':'sum'}) + unparsed_pd = pd.DataFrame.from_records( + results, + columns=["chrom", "start", "end", "read", "iteration", "score", "discordants"], + ) + + unparsed_pd = unparsed_pd.sort_values(["iteration", "chrom", "start", "end"]).reset_index() + + grouped = unparsed_pd.groupby( + merge_fraction( + unparsed_pd.iteration.shift(), + unparsed_pd.start.shift(), + unparsed_pd.end.shift(), + unparsed_pd.iteration, + unparsed_pd.start, + unparsed_pd.end, + ) + .lt(norm_fraction) + .cumsum() + ).agg( + { + "chrom": "first", + "start": "min", + "end": "max", + "discordants": "max", + "read": "sum", + "score": "sum", + } + ) bedtool_output = bt.BedTool.from_dataframe(grouped) - - - - - allele_free = bedtool_output.cat(discordant_bed, postmerge=False) write = [] @@ -1206,78 +1230,96 @@ def iteration_merge(only_discordants,results,fraction,splits,score,sc_len,bam,af try: if int(interval[4]) != 0: if (int(interval[4])) >= splits and float(interval[5]) > score: - start_cov = bam.count(contig=interval[0], - start=int(interval[1]), stop=int(interval[1])+1 - ,read_callback='nofilter') - - end_cov = bam.count(contig=interval[0], - start=int(interval[2])-1, stop=int(interval[2]) - ,read_callback='nofilter') - - - - circle_af = ((int(interval[4]) * 2)) / ((start_cov+end_cov+0.01)/2) - if circle_af >=af: + start_cov = bam.count( + contig=interval[0], + start=int(interval[1]), + stop=int(interval[1]) + 1, + read_callback="nofilter", + ) + + end_cov = bam.count( + contig=interval[0], + start=int(interval[2]) - 1, + stop=int(interval[2]), + read_callback="nofilter", + ) + + circle_af = ((int(interval[4]) * 2)) / ((start_cov + end_cov + 0.01) / 2) + if circle_af >= af: write.append(interval) else: if int(interval[3]) >= n_discordant: - start_cov = bam.count(contig=interval[0],start=int(interval[1]), stop=int(interval[1]) + 1, - read_callback='nofilter') - - end_cov = bam.count(contig=interval[0], - start=int(interval[2]) - 1, stop=int(interval[2]), - read_callback='nofilter') - - circle_af = (int(interval[3])) / ((start_cov+end_cov+0.01)/2) - - if circle_af >= af: - write.append(interval) + start_cov = bam.count( + contig=interval[0], + start=int(interval[1]), + stop=int(interval[1]) + 1, + read_callback="nofilter", + ) + + end_cov = bam.count( + contig=interval[0], + start=int(interval[2]) - 1, + stop=int(interval[2]), + read_callback="nofilter", + ) + + circle_af = (int(interval[3])) / ((start_cov + end_cov + 0.01) / 2) + + if circle_af >= af: + write.append(interval) except BaseException as e: print(e) pass - return(bt.BedTool(write)) + return 
bt.BedTool(write) - - - -def merge_final_output(bam,results,begin,splits,dir,fraction,pid): - +def merge_final_output(bam, results, begin, splits, dir, fraction, pid): """Function that takes as input the final results, and merge reciprocal intervals (this is done to combine the output of different clusters)""" - - bam = ps.AlignmentFile(bam, "rb") os.chdir("temp_files_%s/" % pid) # multiply *2 for reciprocal overlap +1 to check chromosome - norm_fraction = (fraction*2)+1 + norm_fraction = (fraction * 2) + 1 unparsed_bed = bt.BedTool(os.path.basename(results)) + print( + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"), + "Writing final output to disk", + ) + unparsed_pd = unparsed_bed.to_dataframe(names=["chrom", "start", "end", "discordants", "sc", "score"]) - - print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"),"Writting final output to disk") - - - unparsed_pd = unparsed_bed.to_dataframe( - names=['chrom', 'start', 'end', 'discordants', 'sc','score']) - - - - second_merging_round = unparsed_pd.sort_values(['chrom', 'start', 'end']).reset_index() - #merge the output + second_merging_round = unparsed_pd.sort_values(["chrom", "start", "end"]).reset_index() + # merge the output # merge_fraction calculates the degree of overlap between the two genomic intervals - #lt(norm_freaction) looks the ones that surpass the merging threshold (returns 0 if true, 1 if not) # Cumsum calculates the cumulative sum over the output of lt. Which is then used for the grouping. - #If the cumulative sum is the same for two rows, they are merged + # If the cumulative sum is the same for two rows, they are merged final_output = second_merging_round.groupby( - merge_fraction(second_merging_round.chrom.shift(), second_merging_round.start.shift(), - second_merging_round.end.shift(),second_merging_round.chrom,second_merging_round.start,second_merging_round.end).lt(norm_fraction).cumsum()).agg( - {'chrom': 'first', 'start': 'min', 'end': 'max', 'discordants' : 'max', 'sc': 'sum','score':'sum'}) + merge_fraction( + second_merging_round.chrom.shift(), + second_merging_round.start.shift(), + second_merging_round.end.shift(), + second_merging_round.chrom, + second_merging_round.start, + second_merging_round.end, + ) + .lt(norm_fraction) + .cumsum() + ).agg( + { + "chrom": "first", + "start": "min", + "end": "max", + "discordants": "max", + "sc": "sum", + "score": "sum", + } + ) unfiltered_output = bt.BedTool.from_dataframe(final_output) @@ -1285,286 +1327,380 @@ def merge_final_output(bam,results,begin,splits,dir,fraction,pid): filtered = [] for interval in unfiltered_output: - - if (int(interval[4])+int(interval[3])) >= splits: + if (int(interval[4]) + int(interval[3])) >= splits: if int(interval[1]) != 0: - interval[1] = int(interval[1])+1 + interval[1] = int(interval[1]) + 1 filtered.append(interval) filtered_output = bt.BedTool(filtered) os.chdir("%s" % dir) - - print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"),"Finished!") + print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"), "Finished!") end = time.time() total_time = (end - begin) / 60 + print( + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"), + "Circle-Map Realign finished identifying circles in %s \n" % total_time, + ) + print( + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"), + "Circle-Map has identified %s circles\n" % len(filtered_output), + ) - print(datetime.datetime.now().strftime("%Y-%m-%d 
%H:%M:%S:"),"Circle-Map Realign finished indentifying circles in %s \n" % total_time) - print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"),"Circle-Map has identified %s circles\n" % len(filtered_output)) - - - - return(filtered_output) + return filtered_output -def write_to_disk(partial_bed,output,locker,dir,pid): - +def write_to_disk(partial_bed, output, locker, dir, pid): """function that writes to disk the results of every worker thread""" - locker.acquire() - os.chdir("%s/temp_files_%s/" % (dir,pid)) - output_bed = bt.BedTool('%s' % os.path.basename(output)) - writer_bed = output_bed.cat(partial_bed,postmerge=False) - writer_bed.saveas('%s' % os.path.basename(output)) + os.chdir("%s/temp_files_%s/" % (dir, pid)) + output_bed = bt.BedTool("%s" % os.path.basename(output)) + writer_bed = output_bed.cat(partial_bed, postmerge=False) + writer_bed.saveas("%s" % os.path.basename(output)) os.chdir("%s" % dir) locker.release() -def start_realign(circle_bam,output,threads,verbose,pid,clusters): + +def start_realign(circle_bam, output, threads, verbose, pid, clusters): """Function that start the realigner function - - Splits the clusters to cores and removes the from disk the bedtools intermediates""" + - Splits the clusters to cores and removes the from disk the bedtools intermediates""" begin = time.time() - print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"),"Realigning reads using Circle-Map\n") - - print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"),"Clustering structural variant reads\n") + print( + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"), + "Realigning reads using Circle-Map\n", + ) + print( + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"), + "Clustering structural variant reads\n", + ) eccdna_bam = ps.AlignmentFile("%s" % circle_bam, "rb") sp.call("mkdir temp_files_%s" % pid, shell=True) - - sorted_bam,splitted = bam_circ_sv_peaks(eccdna_bam,circle_bam,threads,verbose,pid,clusters) - - - + sorted_bam, splitted = bam_circ_sv_peaks(eccdna_bam, circle_bam, threads, verbose, pid, clusters) # split to cores - print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"),"Splitting clusters to to processors\n") + print( + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"), + "Splitting clusters to to processors\n", + ) os.chdir("temp_files_%s" % pid) sp.call("touch %s" % os.path.basename(output), shell=True) os.chdir("../") - #this releases from tmp file the unmerged and peak file + # this releases from tmp file the unmerged and peak file bt.cleanup() - return(splitted,sorted_bam,begin) + return (splitted, sorted_bam, begin) + def start_simulate(pid): """Function for starting Circle-Map simulate""" print("\nRunning Circle-Map Simulate\n") - sp.call("mkdir temp_files_%s" % pid, shell=True) + return pid - return(pid) -def mutate(genome,pid,indel,snp,java_mem): +def mutate(genome, pid, indel, snp, java_mem): """Function that takes as input the path of the genome,the indel ans substitution rate, and it will create a sinthetic genome introducing random mutations on the fasta sequence and providing a vcf""" - print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"),"Introducing mutations in the fasta genome") + print( + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"), + "Introducing mutations in the fasta genome", + ) print("\t Indel rate: %s" % indel) print("\t Substitution rate: %s" % snp) - sp.call("mutate.sh %s in=%s out=temp_files_%s/mutated.fa subrate=%s indelrate=%s" % (java_mem,genome,pid,snp,indel),shell=True) - - 
print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"),"Simulating reads") + sp.call( + "mutate.sh %s in=%s out=temp_files_%s/mutated.fa subrate=%s indelrate=%s" % (java_mem, genome, pid, snp, indel), + shell=True, + ) - return(None) + print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"), "Simulating reads") + return None - - - -def check_size_and_write(results,only_discortants,output,lock,directory,fraction,pid): +def check_size_and_write(results, only_discortants, output, lock, directory, fraction, pid): """Function that checks if the intervals in memory are to big. And writes them to disk to release memory.""" - if sys.getsizeof(results) < 100000000: - return(False) - + return False else: + partial_bed = iteration_merge(only_discortants, results, fraction) - partial_bed = iteration_merge(only_discortants, results,fraction) + print( + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"), + "Writting %s circular intervals to disk" % len(partial_bed), + ) + write_to_disk(partial_bed, output, lock, directory, pid) - print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S:"),"Writting %s circular intervals to disk" % len(partial_bed)) - write_to_disk(partial_bed,output,lock,directory,pid) + return True - return(True) - -def merge_coverage_bed(results,frac,number): +def merge_coverage_bed(results, frac, number): """Function that takes as bed file containing the coordinates of the double mapped reads and returns the merged bed file containing the information about the clusters""" - fraction = (frac*2)+1 - - unparsed_pd = pd.DataFrame.from_records(results,columns=['chrom', 'start', 'end','item']) - + fraction = (frac * 2) + 1 + unparsed_pd = pd.DataFrame.from_records(results, columns=["chrom", "start", "end", "item"]) - sort = unparsed_pd.sort_values(by=['chrom', 'start', 'end']).reset_index(drop=True) + sort = unparsed_pd.sort_values(by=["chrom", "start", "end"]).reset_index(drop=True) - merging_out = sort.groupby( - merge_fraction(sort.chrom, sort.start, - sort.end,sort.chrom.shift(),sort.start.shift(),sort.end.shift()).lt(fraction).cumsum()).agg( - {'chrom': 'first', 'start': 'min', 'end': 'max','item': 'sum'}) + merging_out = sort.groupby( + merge_fraction( + sort.chrom, + sort.start, + sort.end, + sort.chrom.shift(), + sort.start.shift(), + sort.end.shift(), + ) + .lt(fraction) + .cumsum() + ).agg({"chrom": "first", "start": "min", "end": "max", "item": "sum"}) merging_out = merging_out.drop(merging_out[merging_out.item < number].index) - merging_out = merging_out.sort_values(by=['chrom', 'start', 'end']).reset_index(drop=True) - - - - - - + merging_out = merging_out.sort_values(by=["chrom", "start", "end"]).reset_index(drop=True) final_output = merging_out.groupby( - merge_fraction(merging_out.chrom, merging_out.start, - merging_out.end, merging_out.chrom.shift(), merging_out.start.shift(),merging_out.end.shift()).lt(fraction).cumsum()).agg( - {'chrom': 'first', 'start': 'min', 'end': 'last', 'item': 'sum'}) - + merge_fraction( + merging_out.chrom, + merging_out.start, + merging_out.end, + merging_out.chrom.shift(), + merging_out.start.shift(), + merging_out.end.shift(), + ) + .lt(fraction) + .cumsum() + ).agg({"chrom": "first", "start": "min", "end": "last", "item": "sum"}) bedtool_output = bt.BedTool.from_dataframe(final_output) - return(bedtool_output) + return bedtool_output -def filter_by_ratio(eccdna_bed,cutoff): - """Function that takes as input the eccDNA bed and returns the data filtered by tha change at the start and the end - """ - #circle list is a shared 
memory object +def filter_by_ratio(eccdna_bed, cutoff): + """Function that takes as input the eccDNA bed and returns the data filtered by tha change at the start and the end""" + + # circle list is a shared memory object circle_list = [] unparsed_pd = eccdna_bed.to_dataframe( - names=['chrom', 'start', 'end', 'discordants', 'soft-clipped', 'score', 'mean','std','start_ratio','end_ratio','continuity']) + names=[ + "chrom", + "start", + "end", + "discordants", + "soft-clipped", + "score", + "mean", + "std", + "start_ratio", + "end_ratio", + "continuity", + ] + ) for item, row in unparsed_pd.iterrows(): - - if float(row[8]) > cutoff or float(row[9]) > cutoff: - circle_list.append([row['chrom'],row['start'],row['end'],row['discordants'],row['soft-clipped'], - row['score'],row['mean'],row['std'],row['start_ratio'],row['end_ratio'],row['continuity']]) - - output = pd.DataFrame.from_records(circle_list,columns=['chrom', 'start', 'end', 'discordants', 'soft-clipped', 'score', 'mean','std', - 'start_ratio','end_ratio','continuity']) - - return(output) + circle_list.append( + [ + row["chrom"], + row["start"], + row["end"], + row["discordants"], + row["soft-clipped"], + row["score"], + row["mean"], + row["std"], + row["start_ratio"], + row["end_ratio"], + row["continuity"], + ] + ) + + output = pd.DataFrame.from_records( + circle_list, + columns=[ + "chrom", + "start", + "end", + "discordants", + "soft-clipped", + "score", + "mean", + "std", + "start_ratio", + "end_ratio", + "continuity", + ], + ) + + return output def merge_bed(discordants_pd): """Function that takes as input a bed file and returns a pandas dataframe indicating if files should be merged. This function will merge everything that is overlapping by at least 1bp""" - #check range overlap + # check range overlap overlap = ((discordants_pd.start - discordants_pd.shift().end) - 1).lt(0) - #check chr overlap - chr_overlap = (discordants_pd.chrom == discordants_pd.shift().chrom) - #if both bools are succesful returns a 2 - return ((overlap * 1 + chr_overlap * 1).lt(2).cumsum()) + # check chr overlap + chr_overlap = discordants_pd.chrom == discordants_pd.shift().chrom + # if both bools are succesful returns a 2 + return (overlap * 1 + chr_overlap * 1).lt(2).cumsum() + -def assign_discordants(split_bed,discordant_bed,insert_mean,insert_std): +def assign_discordants(split_bed, discordant_bed, insert_mean, insert_std): """Function that takes as input the the discordant reads supporting an interval and assigns them to the interval if they are close by (using the insert size estimate)""" max_dist = (insert_mean) + (5 * insert_std) - splits = pd.DataFrame.from_records(split_bed, columns=['chrom', 'start', 'end', 'read', 'iteration', - 'score']).sort_values(['chrom', 'start', 'end']) + splits = pd.DataFrame.from_records( + split_bed, columns=["chrom", "start", "end", "read", "iteration", "score"] + ).sort_values(["chrom", "start", "end"]) - splits['score'] = splits['score'].astype(float) + splits["score"] = splits["score"].astype(float) - merged_splits = splits.groupby(['chrom', 'start', 'end', 'iteration']).agg( - {'chrom': 'first', 'start': 'first', 'end': 'max', 'read': 'nunique', 'iteration': 'first', 'score': 'sum'}) + merged_splits = splits.groupby(["chrom", "start", "end", "iteration"]).agg( + { + "chrom": "first", + "start": "first", + "end": "max", + "read": "nunique", + "iteration": "first", + "score": "sum", + } + ) - merged_splits['read'] = merged_splits['read'].astype(int) - discordant_bed = 
pd.DataFrame.from_records(discordant_bed,columns=['chrom', 'start', 'end', 'read']) + merged_splits["read"] = merged_splits["read"].astype(int) + discordant_bed = pd.DataFrame.from_records(discordant_bed, columns=["chrom", "start", "end", "read"]) if len(discordant_bed) > 0: assigned_splits = [] for index, row in merged_splits.iterrows(): - chrom_filt = discordant_bed[(discordant_bed['chrom'] == row['chrom'])] + chrom_filt = discordant_bed[(discordant_bed["chrom"] == row["chrom"])] start_filt = chrom_filt[ - (chrom_filt['start'] > row['start']) & ((chrom_filt['start'] - row['start']) < max_dist)] - end_filt = start_filt[(start_filt['end'] < row['end']) & ((row['end'] - start_filt['end']) < max_dist)] + (chrom_filt["start"] > row["start"]) & ((chrom_filt["start"] - row["start"]) < max_dist) + ] + end_filt = start_filt[(start_filt["end"] < row["end"]) & ((row["end"] - start_filt["end"]) < max_dist)] assigned_splits.append( - [row['chrom'], row['start'], row['end'], row['read'], row['iteration'], float(row['score']), len(end_filt)]) - - return (assigned_splits) + [ + row["chrom"], + row["start"], + row["end"], + row["read"], + row["iteration"], + float(row["score"]), + len(end_filt), + ] + ) + + return assigned_splits else: assigned_splits = [] - for index,row in merged_splits.iterrows(): + for index, row in merged_splits.iterrows(): assigned_splits.append( - [row['chrom'], row['start'], row['end'], row['read'], row['iteration'], float(row['score']), - 0]) + [ + row["chrom"], + row["start"], + row["end"], + row["read"], + row["iteration"], + float(row["score"]), + 0, + ] + ) + + return assigned_splits - return(assigned_splits) -def adaptative_myers_k(sc_len,edit_frac): +def adaptative_myers_k(sc_len, edit_frac): """Calculate the edit distance allowed as a function of the read length""" - return(float(sc_len*edit_frac)) + return float(sc_len * edit_frac) + + @jit(nopython=True) -def non_colinearity(read_start_cigar,read_end_cigar,aln_start,mate_interval_start,mate_interval_end): +def non_colinearity(read_start_cigar, read_end_cigar, aln_start, mate_interval_start, mate_interval_end): """Input a read and the mate interval in the graph. The function checks whether the alignment would be linear (splicing) or colinear. Will return false, in order to not attemp realignment. This is mainly thought for skipping deletions and RNA splicing""" - - #check left soft-clipped + # check left soft-clipped if read_start_cigar == 4: - #graph needs to be upstream or looping to itself + # graph needs to be upstream or looping to itself if int(mate_interval_start) > aln_start: - return (True) + return True elif aln_start < int(mate_interval_end): - #looping to itself - return (True) + # looping to itself + return True else: - return (False) - #check right softclipped + return False + # check right softclipped if read_end_cigar == 4: - # graph needs to be downstream or looping to itself + # graph needs to be downstream or looping to itself if int(mate_interval_end) < aln_start: - return (True) + return True elif aln_start > int(mate_interval_start): - #looping to itself - return (True) + # looping to itself + return True else: - return (False) + return False + @jit(nopython=True) def prob_to_phred(prob): """Function that takes as input a probability and returns a phred-scaled probability. 
Rounded to the nearest decimal""" if prob == 1.0: prob = 0.999999999 - return(int(np.around(-10*np.log10(1-prob)))) + return int(np.around(-10 * np.log10(1 - prob))) + -def realignment_read_to_SA_string(realignment_dict,prob,chrom,soft_clip_start): +def realignment_read_to_SA_string(realignment_dict, prob, chrom, soft_clip_start): """Function that takes as input the realignment dict, the alignment posterior probability and the chromosome and returns an SA string""" - sa_tag = chrom + "," + str(soft_clip_start) + "," + str(realignment_dict['alignments'][1][4]) + "," \ - + realignment_dict['alignments'][1][1] + ","+ str(prob_to_phred(prob)) + "," + str(realignment_dict['alignments'][1][3]) + ";" - return(sa_tag) - -def write_clipped_read(bam,read,mate,no_soft_clipped,no_hard_clipped,mapq_cutoff,own_mapq=False): + sa_tag = ( + chrom + + "," + + str(soft_clip_start) + + "," + + str(realignment_dict["alignments"][1][4]) + + "," + + realignment_dict["alignments"][1][1] + + "," + + str(prob_to_phred(prob)) + + "," + + str(realignment_dict["alignments"][1][3]) + + ";" + ) + return sa_tag + + +def write_clipped_read(bam, read, mate, no_soft_clipped, no_hard_clipped, mapq_cutoff, own_mapq=False): """Function that takes as input a bam file and a read and writes the read to the bam file""" # If mate is unmapped, own mapq is set to true and the read will get its own mapq - if read.has_tag('MQ'): + if read.has_tag("MQ"): if is_soft_clipped(read) == True: - if no_soft_clipped == False: - # gets its on mapq since mate is unmapped if read.mapq >= mapq_cutoff: bam.write(read) @@ -1574,20 +1710,17 @@ def write_clipped_read(bam,read,mate,no_soft_clipped,no_hard_clipped,mapq_cutoff else: if is_hard_clipped(read) == True: if no_hard_clipped == False: - bam.write(read) + bam.write(read) else: - if is_soft_clipped(read) == True: - if no_soft_clipped == False: - # gets its on mapq since mate is unmapped if read.mapq >= mapq_cutoff: if own_mapq == True: - read.tags += [('MQ', read.mapq)] + read.tags += [("MQ", read.mapq)] else: - read.tags += [('MQ', mate.mapq)] + read.tags += [("MQ", mate.mapq)] bam.write(read) @@ -1598,10 +1731,10 @@ def write_clipped_read(bam,read,mate,no_soft_clipped,no_hard_clipped,mapq_cutoff if no_hard_clipped == False: if read.mapq >= mapq_cutoff: if own_mapq == True: - read.tags += [('MQ', read.mapq)] + read.tags += [("MQ", read.mapq)] else: - read.tags += [('MQ', mate.mapq)] + read.tags += [("MQ", mate.mapq)] bam.write(read) - return(None) + return None diff --git a/conf/base.config b/conf/base.config index 830fcffa..adb12597 100644 --- a/conf/base.config +++ b/conf/base.config @@ -24,6 +24,11 @@ process { // If possible, it would be nice to keep the same label naming convention when // adding in your local modules too. // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors + withLabel:process_single { + cpus = { check_max( 1 , 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + } withLabel:process_low { cpus = { check_max( 2 * task.attempt, 'cpus' ) } memory = { check_max( 12.GB * task.attempt, 'memory' ) } diff --git a/conf/modules.config b/conf/modules.config index cb092441..ebd5e978 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -7,16 +7,11 @@ ext.args2 = Second set of arguments appended to command in module (multi-tool modules). ext.args3 = Third set of arguments appended to command in module (multi-tool modules). ext.prefix = File name prefix for output files. 
+ ext.when = When to run the module. ---------------------------------------------------------------------------------------- */ process { - publishDir = [ - path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - withName: 'SAMPLESHEET_CHECK' { publishDir = [ path: { "${params.outdir}/pipeline_info" }, @@ -83,36 +78,34 @@ if(!params.skip_qc) { // // Trimming options // -if (!params.skip_trimming) { - process { - withName: 'TRIMGALORE' { - ext.prefix = { "${meta.id}.trimmed" } - ext.args = [ - '--fastqc', - params.trim_nextseq > 0 ? "--nextseq ${params.trim_nextseq}" : '' - ].join(' ').trim() - publishDir = [ - [ - path: { "${params.outdir}/reports/trimgalore_fastqc" }, - mode: params.publish_dir_mode, - pattern: "*.{html,zip}", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ], - [ - path: { "${params.outdir}/trimgalore" }, - mode: params.publish_dir_mode, - pattern: "*.fq.gz", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: params.save_trimmed - ], - [ - path: { "${params.outdir}/trimgalore" }, - mode: params.publish_dir_mode, - pattern: "*.txt", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] +process { + withName: 'TRIMGALORE' { + ext.args = [ + '--fastqc', + params.trim_nextseq > 0 ? "--nextseq ${params.trim_nextseq}" : '' + ].join(' ').trim() + ext.prefix = { "${meta.id}.trimmed" } + publishDir = [ + [ + path: { "${params.outdir}/reports/trimgalore_fastqc" }, + mode: params.publish_dir_mode, + pattern: "*.{html,zip}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ], + [ + path: { "${params.outdir}/trimgalore" }, + mode: params.publish_dir_mode, + pattern: "*.fq.gz", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: params.save_trimmed + ], + [ + path: { "${params.outdir}/trimgalore" }, + mode: params.publish_dir_mode, + pattern: "*.txt", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - } + ] } } @@ -124,7 +117,7 @@ process { ext.args = "" publishDir = [ path: { "${params.outdir}/bwa" }, - enabled: true, + enabled: false, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, pattern: "*.bam" @@ -145,7 +138,7 @@ process { ext.prefix = { "${meta.id}.sorted" } publishDir = [ path: { "${params.outdir}/bwa" }, - enabled: params.save_sorted_bam, + enabled: true, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, pattern: "*sorted.bam" @@ -153,9 +146,10 @@ process { } } + process { - withName: '.*:BAM_STATS_SAMTOOLS_RAW:.*' { - publishDir = [ + withName: '.*BAM_STATS_SAMTOOLS_RAW.*' { + publishDir = [ path: { "${params.outdir}/reports/samtools_stats/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, @@ -167,168 +161,158 @@ process { // // Picard MarkDuplicates and Filtering options // -if (!params.skip_markduplicates) { - process { - withName: '.*:MARK_DUPLICATES_PICARD:PICARD_MARKDUPLICATES' { - ext.args = 'ASSUME_SORTED=true REMOVE_DUPLICATES=false VALIDATION_STRINGENCY=LENIENT TMP_DIR=tmp' - ext.prefix = { "${meta.id}.md" } - publishDir = [ - [ - path: { "${params.outdir}/reports/markduplicates" }, - mode: params.publish_dir_mode, - pattern: '*metrics.txt', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ], - [ - path: { "${params.outdir}/markduplicates/bam" }, - mode: params.publish_dir_mode, - pattern: '*.md.{bam,bai}', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enable: params.save_markduplicates_bam - ] - ] - } - - withName: '.*:MARK_DUPLICATES_PICARD:SAMTOOLS_INDEX' { - ext.prefix = { "${meta.id}.markdup.sorted" } - publishDir = [ - path: { "${params.outdir}/bwa" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - pattern: '*.{bai,csi}' - ] - } - - withName: '.*:MARK_DUPLICATES_PICARD:BAM_STATS_SAMTOOLS:.*' { - publishDir = [ - path: { "${params.outdir}/reports/samtools_stats/${meta.id}/md/" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - pattern: '*.{stats,flagstat,idxstats}' - ] - } - withName: 'SAMTOOLS_VIEW_FILTER' { - if (params.keep_duplicates) { - ext.args = '' - } else { - ext.args = '-F 0x0400' - } - ext.prefix = { "${meta.id}.md.filtered" } - publishDir = [ - path: { "${params.outdir}/markduplicates/duplicates_removed" }, - mode: params.publish_dir_mode, - pattern: '*filtered.{bai,bam}', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: false - ] - } - withName: 'SAMTOOLS_SORT_FILTERED' { - ext.args = "" - ext.prefix = { "${meta.id}.md.filtered.sorted" } - publishDir = [ - path: { "${params.outdir}/markduplicates/duplicates_removed" }, +process { + withName: '.*BAM_MARKDUPLICATES_PICARD:PICARD_MARKDUPLICATES' { + ext.args = '--ASSUME_SORTED true --REMOVE_DUPLICATES false --VALIDATION_STRINGENCY LENIENT --TMP_DIR tmp' + ext.prefix = { "${meta.id}.md" } + publishDir = [ + [ + path: { "${params.outdir}/reports/markduplicates" }, mode: params.publish_dir_mode, - pattern: '*filtered.sorted.bam', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true - ] - } - - withName: 'SAMTOOLS_INDEX_FILTERED' { - ext.args = "" - ext.prefix = { "${meta.id}.md.filtered.sorted" } - publishDir = [ - path: { "${params.outdir}/markduplicates/duplicates_removed" }, + pattern: '*metrics.txt', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ], + [ + path: { "${params.outdir}/markduplicates/bam" }, mode: params.publish_dir_mode, - pattern: '*filtered.sorted.bai', + pattern: '*.md.{bam,bai}', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true + enable: params.save_markduplicates_bam ] - } + ] + } + withName: '.*BAM_MARKDUPLICATES_PICARD:SAMTOOLS_INDEX' { + ext.prefix = { "${meta.id}.markdup.sorted" } + publishDir = [ + path: { "${params.outdir}/markduplicates/bam" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + pattern: '*.{bai,csi}' + ] } -} -if ("circle_finder" in params.circle_identifier.split(",")) { - process { - withName: 'BEDTOOLS_SORTEDBAM2BED' { - ext.args = "-cigar" - } - withName: 'BEDTOOLS_SPLITBAM2BED' { - ext.args = "-cigar" - } - withName: 'SAMTOOLS_SORT_QNAME_CF' { - ext.prefix = { "${meta.id}.qname.sorted" } - ext.args = "-n" - publishDir = [ - path: { "${params.outdir}/circlefinder/samtools" }, - mode: params.publish_dir_mode, - pattern: '*filtered.sorted.bai', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true - ] - } + withName: '.*BAM_MARKDUPLICATES_PICARD:BAM_STATS_SAMTOOLS.*' { + ext.args = "" + publishDir = [ + path: { "${params.outdir}/reports/samtools_stats/${meta.id}/md/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: '*.{stats,flagstat,idxstats}' + ] + } + withName: 'SAMTOOLS_VIEW_FILTER' { + ext.args = '-h -F 0x0400' + publishDir = [ + path: { "${params.outdir}/markduplicates/duplicates_removed" }, + mode: params.publish_dir_mode, + pattern: '*filtered.{bai,bam}', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: true + ] + ext.prefix = { "${meta.id}.md.filtered" } + } + withName: 'SAMTOOLS_SORT_FILTERED' { + ext.args = "" + ext.prefix = { "${meta.id}.md.filtered.sorted" } + publishDir = [ + path: { "${params.outdir}/markduplicates/duplicates_removed" }, + mode: params.publish_dir_mode, + pattern: '*filtered.sorted.bam', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: true + ] + } + withName: 'SAMTOOLS_INDEX_FILTERED' { + ext.args = "" + ext.prefix = { "${meta.id}.md.filtered.sorted" } + publishDir = [ + path: { "${params.outdir}/markduplicates/duplicates_removed" }, + mode: params.publish_dir_mode, + pattern: '*filtered.sorted.bai', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: true + ] } } -if ("circle_finder" in params.circle_identifier.split(",")) { - process { - withName: 'BEDTOOLS_SORTEDBAM2BED' { - ext.args = "-cigar" - } - withName: 'BEDTOOLS_SPLITBAM2BED' { - ext.args = "-cigar" - } - withName: 'SAMTOOLS_SORT_QNAME_CF' { - ext.prefix = { "${meta.id}.qname.sorted" } - ext.args = "-n" - publishDir = [ - path: { "${params.outdir}/circlefinder/samtools" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: false - ] - } - withName: 'CIRCLEFINDER' { - ext.args = "" - publishDir = [ - path: { "${params.outdir}/circlefinder" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true - ] - } - withName: 'SAMBLASTER' { - ext.args = "--ignoreUnmated" - publishDir = [ - path: { "${params.outdir}/circlefinder/samblaster" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: false - ] - } +// +// Circle_finder Options +// +process { + withName: 'SAMTOOLS_SORT_QNAME_CF' { + ext.args = "-n" + ext.prefix = { "${meta.id}.qname.sorted" } + publishDir = [ + path: { "${params.outdir}/circlefinder/samtools" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + enabled: params.save_circle_finder_intermediate + ] + } + withName: 'SAMBLASTER' { + ext.args = "--ignoreUnmated" + publishDir = [ + path: { "${params.outdir}/circlefinder/samblaster" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: params.save_circle_finder_intermediate, + ] + } + withName: 'BEDTOOLS_SPLITBAM2BED' { + ext.prefix = { "${meta.id}.split" } + ext.args = "-cigar" + publishDir = [ + path: { "${params.outdir}/circlefinder/split" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: params.save_circle_finder_intermediate + ] + } + withName: 'BEDTOOLS_SORTEDBAM2BED' { + ext.args = "-cigar" + ext.prefix = { "${meta.id}.concordant" } + publishDir = [ + path: { "${params.outdir}/circlefinder/concordant" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: params.save_circle_finder_intermediate + ] + } + withName: 'CIRCLEFINDER' { + ext.args = "" + publishDir = [ + path: { "${params.outdir}/circlefinder" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: true + ] } } -if ("circexplorer2" in params.circle_identifier.split(",")) { - process { - withName: 'CIRCEXPLORER2_PARSE' { - ext.args = "-t BWA" - publishDir = [ - path: { "${params.outdir}/circexplorer2" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true - ] - } +// +// CIRCexplorer2 Options +// +process { + withName: 'CIRCEXPLORER2_PARSE' { + ext.args = "-t BWA" + publishDir = [ + path: { "${params.outdir}/circexplorer2" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: true + ] } } +// +// AmpliconArchitect Options +// process { withName: 'CNVKIT_BATCH' { - ext.args = "--method wgs" - publishDir = [ + ext.args = "--method wgs" + publishDir = [ path: { "${params.outdir}/ampliconarchitect/cnvkit" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, @@ -337,8 +321,8 @@ process { } withName: 'CNVKIT_SEGMENT' { - ext.args = "" - publishDir = [ + ext.args = "" + publishDir = [ path: { "${params.outdir}/ampliconarchitect/cnvkit" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, @@ -346,8 +330,8 @@ process { ] } withName: 'COLLECT_SEEDS' { - ext.args = "" - publishDir = [ + ext.args = "" + publishDir = [ path: { "${params.outdir}/ampliconarchitect/cnvkit" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, @@ -355,8 +339,8 @@ process { ] } withName: 'AMPLIFIED_INTERVALS' { - ext.args = "" - publishDir = [ + ext.args = "" + publishDir = [ path: { "${params.outdir}/ampliconarchitect/cnvkit" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, @@ -365,8 +349,8 @@ process { } withName: 'AMPLICONARCHITECT_AMPLICONARCHITECT' { time = '96.h' - ext.args = "" - publishDir = [ + ext.args = "" + publishDir = [ [ path: { "${params.outdir}/ampliconarchitect/ampliconarchitect/sv_view" }, mode: params.publish_dir_mode, @@ -400,154 +384,55 @@ process { ] } - withName: 'AMPLICONARCHITECT_AMPLICONCLASSIFIER' { - ext.args = "--annotate_cycles_file --report_complexity --verbose_classification --plotstyle 'individual'" - publishDir = [ - [ - path: { "${params.outdir}/ampliconarchitect/ampliconclassifier/${meta.id}" }, - mode: params.publish_dir_mode, - pattern: '*.input', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ], - [ - path: { "${params.outdir}/ampliconarchitect/ampliconclassifier/${meta.id}" }, - mode: params.publish_dir_mode, - pattern: '*.log', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ], - [ - path: { "${params.outdir}/ampliconarchitect/ampliconclassifier/${meta.id}" }, - mode: params.publish_dir_mode, - pattern: '*amplicon_classification_profiles.tsv', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ], - [ - path: { "${params.outdir}/ampliconarchitect/ampliconclassifier/${meta.id}" }, - mode: params.publish_dir_mode, - pattern: '*ecDNA_counts.tsv', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ], - [ - path: { "${params.outdir}/ampliconarchitect/ampliconclassifier/${meta.id}" }, - mode: params.publish_dir_mode, - pattern: '*gene_list.tsv', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ], - [ - path: { "${params.outdir}/ampliconarchitect/ampliconclassifier/${meta.id}/bed" }, - mode: params.publish_dir_mode, - pattern: '*.bed', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ], - [ - path: { "${params.outdir}/ampliconarchitect/ampliconclassifier/${meta.id}/annotated_cycles" }, - mode: params.publish_dir_mode, - pattern: '*annotated_cycles.txt', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ], - [ - path: { "${params.outdir}/ampliconarchitect/ampliconclassifier/${meta.id}/radar" }, - mode: params.publish_dir_mode, - pattern: '*class_radar.{png,pdf}', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ], - [ - path: { "${params.outdir}/ampliconarchitect/ampliconclassifier/${meta.id}" }, - mode: params.publish_dir_mode, - pattern: '*_feature_entropy.tsv', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ], - [ - path: { "${params.outdir}/ampliconarchitect/ampliconclassifier/${meta.id}" }, - mode: params.publish_dir_mode, - pattern: '*edge_classification_profiles.tsv', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] + withName: 'AMPLICONCLASSIFIER_AMPLICONCLASSIFIER' { + ext.args = "--report_complexity --verbose_classification --plotstyle 'individual'" + publishDir = [ + path: { "${params.outdir}/ampliconclassifier/ampliconclassifier" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + enabled: true ] } - withName: 'AMPLICONARCHITECT_AMPLICONSIMILARITY' { - ext.args = "" - publishDir = [ + withName: 'AMPLICONCLASSIFIER_AMPLICONSIMILARITY' { + ext.args = "" + publishDir = [ [ - path: { "${params.outdir}/ampliconarchitect/ampliconclassifier/input" }, + path: { "${params.outdir}/ampliconclassifier/ampliconclassifier/input" }, mode: params.publish_dir_mode, pattern: '*.input', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ], [ - path: { "${params.outdir}/ampliconarchitect/ampliconclassifier/log" }, + path: { "${params.outdir}/ampliconclassifier/ampliconsimilarity/log" }, mode: params.publish_dir_mode, pattern: '*.log', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, ], [ - path: { "${params.outdir}/ampliconarchitect/ampliconclassifier/similarity" }, + path: { "${params.outdir}/ampliconclassifier/ampliconsimilarity/similarity" }, mode: params.publish_dir_mode, pattern: '*_similarity_scores.tsv', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, ], ] } - withName: 'SUMMARISE_AA' { - ext.args = "" - publishDir = [ - path: { "${params.outdir}/ampliconarchitect/summary" }, + withName: 'AMPLICONCLASSIFIER_MAKEINPUT' { + ext.args = "" + publishDir = [ + path: { "${params.outdir}/ampliconclassifier/makeinput" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: true ] } -} - -if ("unicycler" in params.circle_identifier.split(",")) { - process { - withName: 'UNICYCLER' { - time = '96.h' - ext.args = "--no_rotate" - publishDir = [ - path: { "${params.outdir}/unicycler" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true - ] - } - withName: 'SEQTK_SEQ' { - ext.args = "" - publishDir = [ - path: { "${params.outdir}/unicycler" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true - ] - } - withName: 'SEQTK_SEQ' { - ext.args = "" - publishDir = [ - path: { "${params.outdir}/unicycler" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: false - ] - } - withName: 'GETCIRCULARREADS' { - ext.args = "" - publishDir = [ - path: { "${params.outdir}/unicycler/getcircularreads" }, - mode: params.publish_dir_mode, - pattern: '*.unicycler.circular.fastq.gz', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true - ] - } - withName: 'MINIMAP2_ALIGN' { - ext.args = "" - publishDir = [ - path: { "${params.outdir}/unicycler/minimap2" }, - mode: params.publish_dir_mode, - pattern: '*.paf', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true - ] - } + withName: 'AMPLICONCLASSIFIER_MAKERESULTSTABLE' { + ext.args = "" + publishDir = [ + path: { "${params.outdir}/ampliconclassifier/resultstable" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + enabled: true + ] } } @@ -556,9 +441,9 @@ if ("unicycler" in params.circle_identifier.split(",")) { // process { withName: 'SAMTOOLS_SORT_QNAME_CM' { - ext.prefix = { "${meta.id}.qname.sorted" } - ext.args = "-n" - publishDir = [ + ext.prefix = { "${meta.id}.qname.sorted" } + ext.args = "-n" + publishDir = [ path: { "${params.outdir}/circlemap/samtools" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, @@ -566,51 +451,101 @@ process { ] } withName: 'CIRCLEMAP_READEXTRACTOR' { - ext.prefix = { "${meta.id}.qname.sorted" } - ext.args = "" - publishDir = [ + ext.args = "" + ext.prefix = { "${meta.id}.circular_read_candidates" } + publishDir = [ path: { "${params.outdir}/circlemap/readextractor" }, mode: params.publish_dir_mode, + pattern: "*.qname.sorted.bam", saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true + enabled: false ] } withName: 'SAMTOOLS_SORT_RE' { - ext.prefix = { "${meta.id}.circular_read_candidates_sorted" } - ext.args = "" - publishDir = [ + ext.prefix = { "${meta.id}.circular_read_candidates.sorted" } + ext.args = "" + publishDir = [ + path: { "${params.outdir}/circlemap/readextractor" }, + mode: params.publish_dir_mode, + pattern: "*.circular_read_candidates.sorted.bam", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: params.save_circle_map_intermediate + ] + } + withName: 'SAMTOOLS_INDEX_RE' { + ext.prefix = { "${meta.id}.circular_read_candidates.sorted" } + ext.args = "" + publishDir = [ path: { "${params.outdir}/circlemap/readextractor" }, mode: params.publish_dir_mode, + pattern: "*.circular_read_candidates.sorted.bam.bai", saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: false + enabled: params.save_circle_map_intermediate ] } - withName: 'SAMTOOLS_INDEX_RE' { - ext.prefix = { "${meta.id}.circular_read_candidates_sorted" } - ext.args = "" - publishDir = [ - path: { "${params.outdir}/circlemap/readextractor" }, + withName: 'CIRCLEMAP_REPEATS' { + ext.args = "" + publishDir = [ + path: { "${params.outdir}/circlemap/repeats" }, mode: params.publish_dir_mode, - pattern: '*.{bam,bai}', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: false, + enabled: true ] } withName: 'CIRCLEMAP_REALIGN' { - ext.args = "" - time = '96.h' - publishDir = [ + time = '96.h' + ext.args = "" + publishDir = [ path: { "${params.outdir}/circlemap/realign" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, enabled: true ] } - withName: 'CIRCLEMAP_REPEATS' { - ext.args = "" - publishDir = [ - path: { "${params.outdir}/circlemap/repeats" }, +} + +// +// Unicycler Options +// +process { + withName: 'UNICYCLER' { + time = '96.h' + ext.args = "--no_rotate" + publishDir = [ + path: { "${params.outdir}/unicycler/unicycler" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: params.save_unicycler_intermediate + ] + } + withName: 'SEQTK_SEQ' { + ext.args = "" + ext.prefix = { "${meta.id}.unicycler" } + publishDir = [ + path: { "${params.outdir}/unicycler/seqtk" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + enabled: params.save_unicycler_intermediate + ] + } + withName: 'GETCIRCULARREADS' { + ext.args = "" + ext.prefix = { "${meta.id}.unicycler.circular" } + publishDir = [ + path: { "${params.outdir}/unicycler/getcircularreads" }, + mode: params.publish_dir_mode, + pattern: '*.unicycler.circular.fastq.gz', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: params.save_unicycler_intermediate + ] + } + withName: 'MINIMAP2_ALIGN' { + ext.args = "" + ext.prefix = { "${meta.id}.unicycler.circular.mapped" } + publishDir = [ + path: { "${params.outdir}/unicycler/minimap2" }, mode: params.publish_dir_mode, + pattern: '*.paf', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, enabled: true ] @@ -622,21 +557,17 @@ if (!params.skip_multiqc) { withName: 'MULTIQC' { ext.args = params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' publishDir = [ - path: { [ - "${params.outdir}/multiqc" - ].join('') }, + path: { "${params.outdir}/multiqc" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + withName: 'CUSTOM_DUMPSOFTWAREVERSIONS' { + publishDir = [ + path: { "${params.outdir}/pipeline_info" }, + mode: params.publish_dir_mode, + pattern: '*_versions.yml' + ] + } } - - withName: CUSTOM_DUMPSOFTWAREVERSIONS { - publishDir = [ - path: { "${params.outdir}/pipeline_info" }, - mode: params.publish_dir_mode, - pattern: '*_versions.yml' - ] - } - } diff --git a/conf/test.config b/conf/test.config index f36d755b..40dfeffa 100644 --- a/conf/test.config +++ b/conf/test.config @@ -10,10 +10,9 @@ ---------------------------------------------------------------------------------------- */ -stubRun = false params { - config_profile_name = 'Test profile' - config_profile_description = 'Minimal test dataset to check pipeline function' + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' // Limit resources so that this can run on GitHub Actions max_cpus = 2 @@ -22,6 +21,7 @@ params { // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/circdna/samplesheet/samplesheet.csv' + input_format = 'FASTQ' // Genome references diff --git a/conf/test_AA.config b/conf/test_AA.config index 03ee6bb0..0acbb061 100644 --- a/conf/test_AA.config +++ b/conf/test_AA.config @@ -33,6 +33,7 @@ params { skip_markduplicates = true igenomes_ignore = true + cnvkit_cnn = "https://raw.githubusercontent.com/nf-core/test-datasets/circdna/cnvkit/dummy_file.cnn" mosek_license_dir = "https://raw.githubusercontent.com/nf-core/test-datasets/circdna/mosek/mosek.lic" aa_data_repo = "data_repo" reference_build = "GRCh38" diff --git a/conf/test_full.config b/conf/test_full.config index 86e9cc96..14a2a4cc 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -10,7 +10,6 @@ ---------------------------------------------------------------------------------------- */ -stubRun = false params { config_profile_name = 'Full test profile' config_profile_description = 'Full test dataset to check pipeline function' diff --git a/docs/usage.md b/docs/usage.md index 4f3046f7..2a4de6da 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -18,7 +18,7 @@ Depending on the branch (circle_identifier) used in the pipeline, different inpu You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. 
It has to be a comma-separated file with either 2 or 3 columns (depending on the input format), and a header row as shown in the examples below. -```console +```bash --input '[path to samplesheet file]' ``` @@ -28,7 +28,7 @@ The two input formats accepted by the pipeline are "FASTQ" and "BAM". If not spe ### FASTQ -```console +```bash sample,fastq_1,fastq_2 circdna_1,circdna_1_R1.fastq.gz,circdna_1_R2.fastq.gz circdna_2,circdna_2_R1.fastq.gz,circdna_2_R2.fastq.gz @@ -45,7 +45,7 @@ An [example samplesheet fastq](../assets/samplesheet.csv) has been provided with ### BAM -```console +```bash sample,bam circdna_1,circdna_1.bam circdna_2,circdna_2.bam @@ -63,7 +63,7 @@ An [example samplesheet bam](../assets/samplesheet_bam.csv) has been provided wi If using FASTQ input, the `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: -```console +```bash sample,fastq_1,fastq_2 CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz @@ -78,11 +78,33 @@ CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. +## Samplesheet input - BAM + +The pipeline can be run directly from BAM files. Here, the samplesheet has to be a comma-separated file with 2 columns and a header row, as shown in the examples below. + +```console +--input '[path to samplesheet file]' +``` + +```console +sample,bam +sample1, sample1.bam +sample2, sample2.bam +sample3, sample3.bam +``` + +| Column | Description | +| -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | +| `bam` | Full path to BAM file for Illumina short reads. File has to be aligned to a reference genome and in bam format with the extension ".bam" | + +An [example samplesheet](../assets/samplesheet_bam.csv) has been provided with the pipeline. + ## Running the pipeline The typical command for running the pipeline is as follows: -```console +```bash nextflow run nf-core/circdna --input samplesheet.csv --outdir --genome GRCh38 -profile docker --circle_identifier ``` @@ -90,28 +112,27 @@ This will launch the pipeline with the `docker` configuration profile. See below Note that the pipeline will create the following files in your working directory: -```console +```bash work # Directory containing the nextflow working files # Finished results in specified location (defined with --outdir) .nextflow_log # Log file from Nextflow # Other nextflow hidden files, eg. history of pipeline runs and old logs. ``` ### Updating the pipeline When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since.
To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: -```console +```bash nextflow pull nf-core/circdna ``` ### Reproducibility It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. -First, go to the [nf-core/circdna releases page](https://github.com/nf-core/circdna/releases) and find the latest version number - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. +First, go to the [nf-core/circdna releases page](https://github.com/nf-core/circdna/releases) and find the latest pipeline version - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. Of course, you can switch to another version by changing the number after the `-r` flag. -This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. +This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. For example, at the bottom of the MultiQC reports. ## Core Nextflow arguments > **NB:** These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). ### `-profile` Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. -Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Conda) - see below. When using Biocontainers, most of these software packaging methods pull Docker containers from quay.io e.g [FastQC](https://quay.io/repository/biocontainers/fastqc) except for Singularity which directly downloads Singularity images via https hosted by the [Galaxy project](https://depot.galaxyproject.org/singularity/) and Conda which downloads and installs software locally from [Bioconda](https://bioconda.github.io/). +Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Conda) - see below. > We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. @@ -130,8 +151,11 @@ The pipeline also dynamically loads configurations from [https://github.com/nf-c Note that multiple profiles can be loaded, for example: `-profile test,docker` - the order of arguments is important! They are loaded in sequence, so later profiles can overwrite earlier profiles. -If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended. +If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended, since it can lead to different results on different machines dependent on the computer environment.
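As a concrete illustration of the profile behaviour described above (the output directory name is only a placeholder, not taken from the diff), stacking the `test` profile with a container profile could look like the sketch below; profiles are applied left to right, so later profiles override earlier ones:

```bash
# Illustrative only: run the bundled test configuration inside Docker containers.
# 'docker' is applied after 'test', so its settings take precedence where they overlap.
nextflow run nf-core/circdna -profile test,docker --outdir results
```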
+- `test` + - A profile with a complete configuration for automated testing + - Includes links to test data so needs no other parameters - `docker` - A generic configuration profile to be used with [Docker](https://docker.com/) - `singularity` @@ -169,7 +193,7 @@ Whilst the default requirements set within the pipeline will hopefully work for For example, if the nf-core/rnaseq pipeline is failing after multiple re-submissions of the `STAR_ALIGN` process due to an exit code of `137` this would indicate that there is an out of memory issue: -```console +```bash [62/149eb0] NOTE: Process `NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)` terminated with an error exit status (137) -- Execution is retried (1) Error executing process > 'NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)' @@ -198,8 +222,14 @@ Work dir: Tip: you can replicate the issue by changing to the process work dir and entering the command `bash .command.run` ``` +#### For beginners + +As a first step to bypass this error, you could try to increase the amount of CPUs, memory, and time for the whole pipeline. Therefore, you can try to increase these resources via the parameters `--max_cpus`, `--max_memory`, and `--max_time`. Based on the error above, you have to increase the amount of memory. Therefore, you can go to the [parameter documentation of rnaseq](https://nf-co.re/rnaseq/3.9/parameters) and scroll down to the `show hidden parameter` button to get the default value for `--max_memory`. In this case 128GB; you can then try to run your pipeline again with `--max_memory 200GB -resume` to skip all processes that were already calculated (a concrete example command is sketched at the end of this section). If you cannot increase the resources for the complete pipeline, you can try to adapt the resources for a single process as mentioned below. + +#### Advanced option on process level + To bypass this error you would need to find exactly which resources are set by the `STAR_ALIGN` process. The quickest way is to search for `process STAR_ALIGN` in the [nf-core/rnaseq Github repo](https://github.com/nf-core/rnaseq/search?q=process+STAR_ALIGN). -We have standardised the structure of Nextflow DSL2 pipelines such that all module files will be present in the `modules/` directory and so, based on the search results, the file we want is `modules/nf-core/software/star/align/main.nf`. +We have standardised the structure of Nextflow DSL2 pipelines such that all module files will be present in the `modules/` directory and so, based on the search results, the file we want is `modules/nf-core/star/align/main.nf`. If you click on the link to that file you will notice that there is a `label` directive at the top of the module that is set to [`label process_high`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/modules/nf-core/software/star/align/main.nf#L9). The [Nextflow `label`](https://www.nextflow.io/docs/latest/process.html#label) directive allows us to organise workflow processes in separate groups which can be referenced in a configuration file to select and configure subset of processes having similar computing requirements. The default values for the `process_high` label are set in the pipeline's [`base.config`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L33-L37) which in this case is defined as 72GB. @@ -218,7 +248,7 @@ process { > > If you get a warning suggesting that the process selector isn't recognised check that the process name has been specified correctly.
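To make the beginner-level fix above concrete, a re-run with a raised pipeline-wide memory cap could look like the following sketch (the pipeline name, samplesheet, output directory, and the `200GB` value are just the examples carried over from the text above, not recommendations):

```bash
# Illustrative only: raise the maximum memory any single task may request and
# resume the previous run so steps that already finished are not recomputed.
nextflow run nf-core/rnaseq -profile docker --input samplesheet.csv --outdir results \
    --max_memory 200GB -resume
```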
-### Updating containers +### Updating containers (advanced users) The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. If for some reason you need to use a different version of a particular tool with the pipeline then you just need to identify the `process` name and override the Nextflow `container` definition for that process using the `withName` declaration. For example, in the [nf-core/viralrecon](https://nf-co.re/viralrecon) pipeline a tool called [Pangolin](https://github.com/cov-lineages/pangolin) has been used during the COVID-19 pandemic to assign lineages to SARS-CoV-2 genome sequenced samples. Given that the lineage assignments change quite frequently it doesn't make sense to re-release the nf-core/viralrecon everytime a new version of Pangolin has been released. However, you can override the default container used by the pipeline by creating a custom config file and passing it as a command-line argument via `-c custom.config`. @@ -266,6 +296,14 @@ See the main [Nextflow documentation](https://www.nextflow.io/docs/latest/config If you have any questions or issues please send us a message on [Slack](https://nf-co.re/join/slack) on the [`#configs` channel](https://nfcore.slack.com/channels/configs). +## Azure Resource Requests + +To be used with the `azurebatch` profile by specifying the `-profile azurebatch`. +We recommend providing a compute `params.vm_type` of `Standard_D16_v3` VMs by default but these options can be changed if required. + +Note that the choice of VM size depends on your quota and the overall workload during the analysis. +For a thorough list, please refer the [Azure Sizes for virtual machines in Azure](https://docs.microsoft.com/en-us/azure/virtual-machines/sizes). + ## Running in the background Nextflow handles job submissions and supervises the running jobs. The Nextflow process must run until the pipeline is finished. @@ -280,6 +318,6 @@ Some HPC setups also allow you to run nextflow within a cluster job submitted yo In some cases, the Nextflow Java virtual machines can start to request a large amount of memory. We recommend adding the following line to your environment to limit this (typically in `~/.bashrc` or `~./bash_profile`): -```console +```bash NXF_OPTS='-Xms1g -Xmx4g' ``` diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy index b3d092f8..33cd4f6e 100755 --- a/lib/NfcoreSchema.groovy +++ b/lib/NfcoreSchema.groovy @@ -46,7 +46,6 @@ class NfcoreSchema { 'quiet', 'syslog', 'v', - 'version', // Options for `nextflow run` command 'ansi', diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy index 2fc0a9b9..25a0a74a 100755 --- a/lib/NfcoreTemplate.groovy +++ b/lib/NfcoreTemplate.groovy @@ -32,6 +32,25 @@ class NfcoreTemplate { } } + // + // Generate version string + // + public static String version(workflow) { + String version_string = "" + + if (workflow.manifest.version) { + def prefix_v = workflow.manifest.version[0] != 'v' ? 
'v' : '' + version_string += "${prefix_v}${workflow.manifest.version}" + } + + if (workflow.commitId) { + def git_shortsha = workflow.commitId.substring(0, 7) + version_string += "-g${git_shortsha}" + } + + return version_string + } + // // Construct and send completion email // @@ -61,7 +80,7 @@ class NfcoreTemplate { misc_fields['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp def email_fields = [:] - email_fields['version'] = workflow.manifest.version + email_fields['version'] = NfcoreTemplate.version(workflow) email_fields['runName'] = workflow.runName email_fields['success'] = workflow.success email_fields['dateComplete'] = workflow.complete @@ -145,6 +164,64 @@ class NfcoreTemplate { output_tf.withWriter { w -> w << email_txt } } + // + // Construct and send a notification to a web server as JSON + // e.g. Microsoft Teams and Slack + // + public static void IM_notification(workflow, params, summary_params, projectDir, log) { + def hook_url = params.hook_url + + def summary = [:] + for (group in summary_params.keySet()) { + summary << summary_params[group] + } + + def misc_fields = [:] + misc_fields['start'] = workflow.start + misc_fields['complete'] = workflow.complete + misc_fields['scriptfile'] = workflow.scriptFile + misc_fields['scriptid'] = workflow.scriptId + if (workflow.repository) misc_fields['repository'] = workflow.repository + if (workflow.commitId) misc_fields['commitid'] = workflow.commitId + if (workflow.revision) misc_fields['revision'] = workflow.revision + misc_fields['nxf_version'] = workflow.nextflow.version + misc_fields['nxf_build'] = workflow.nextflow.build + misc_fields['nxf_timestamp'] = workflow.nextflow.timestamp + + def msg_fields = [:] + msg_fields['version'] = NfcoreTemplate.version(workflow) + msg_fields['runName'] = workflow.runName + msg_fields['success'] = workflow.success + msg_fields['dateComplete'] = workflow.complete + msg_fields['duration'] = workflow.duration + msg_fields['exitStatus'] = workflow.exitStatus + msg_fields['errorMessage'] = (workflow.errorMessage ?: 'None') + msg_fields['errorReport'] = (workflow.errorReport ?: 'None') + msg_fields['commandLine'] = workflow.commandLine.replaceFirst(/ +--hook_url +[^ ]+/, "") + msg_fields['projectDir'] = workflow.projectDir + msg_fields['summary'] = summary << misc_fields + + // Render the JSON template + def engine = new groovy.text.GStringTemplateEngine() + // Different JSON depending on the service provider + // Defaults to "Adaptive Cards" (https://adaptivecards.io), except Slack which has its own format + def json_path = hook_url.contains("hooks.slack.com") ? "slackreport.json" : "adaptivecard.json" + def hf = new File("$projectDir/assets/${json_path}") + def json_template = engine.createTemplate(hf).make(msg_fields) + def json_message = json_template.toString() + + // POST + def post = new URL(hook_url).openConnection(); + post.setRequestMethod("POST") + post.setDoOutput(true) + post.setRequestProperty("Content-Type", "application/json") + post.getOutputStream().write(json_message.getBytes("UTF-8")); + def postRC = post.getResponseCode(); + if (! 
postRC.equals(200)) { + log.warn(post.getErrorStream().getText()); + } + } + // // Print pipeline summary on completion // @@ -154,7 +231,7 @@ class NfcoreTemplate { if (workflow.stats.ignoredCount == 0) { log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Pipeline completed successfully${colors.reset}-" } else { - log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed successfully, but with errored process(es) ${colors.reset}-" + log.info "-${colors.purple}[$workflow.manifest.name]${colors.yellow} Pipeline completed successfully, but with errored process(es) ${colors.reset}-" } } else { log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed with errors${colors.reset}-" @@ -242,6 +319,7 @@ class NfcoreTemplate { // public static String logo(workflow, monochrome_logs) { Map colors = logColours(monochrome_logs) + String workflow_version = NfcoreTemplate.version(workflow) String.format( """\n ${dashedLine(monochrome_logs)} @@ -250,7 +328,7 @@ class NfcoreTemplate { ${colors.blue} |\\ | |__ __ / ` / \\ |__) |__ ${colors.yellow}} {${colors.reset} ${colors.blue} | \\| | \\__, \\__/ | \\ |___ ${colors.green}\\`-._,-`-,${colors.reset} ${colors.green}`._,._,\'${colors.reset} - ${colors.purple} ${workflow.manifest.name} v${workflow.manifest.version}${colors.reset} + ${colors.purple} ${workflow.manifest.name} ${workflow_version}${colors.reset} ${dashedLine(monochrome_logs)} """.stripIndent() ) diff --git a/lib/Utils.groovy b/lib/Utils.groovy old mode 100755 new mode 100644 index 28567bd7..8d030f4e --- a/lib/Utils.groovy +++ b/lib/Utils.groovy @@ -21,19 +21,26 @@ class Utils { } // Check that all channels are present - def required_channels = ['conda-forge', 'bioconda', 'defaults'] - def conda_check_failed = !required_channels.every { ch -> ch in channels } + // This channel list is ordered by required channel priority. 
+ def required_channels_in_order = ['conda-forge', 'bioconda', 'defaults'] + def channels_missing = ((required_channels_in_order as Set) - (channels as Set)) as Boolean // Check that they are in the right order - conda_check_failed |= !(channels.indexOf('conda-forge') < channels.indexOf('bioconda')) - conda_check_failed |= !(channels.indexOf('bioconda') < channels.indexOf('defaults')) + def channel_priority_violation = false + def n = required_channels_in_order.size() + for (int i = 0; i < n - 1; i++) { + channel_priority_violation |= !(channels.indexOf(required_channels_in_order[i]) < channels.indexOf(required_channels_in_order[i+1])) + } - if (conda_check_failed) { + if (channels_missing | channel_priority_violation) { log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " There is a problem with your Conda configuration!\n\n" + " You will need to set-up the conda-forge and bioconda channels correctly.\n" + - " Please refer to https://bioconda.github.io/user/install.html#set-up-channels\n" + - " NB: The order of the channels matters!\n" + + " Please refer to https://bioconda.github.io/\n" + + " The observed channel order is \n" + + " ${channels}\n" + + " but the following channel order is required:\n" + + " ${required_channels_in_order}\n" + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" } } diff --git a/lib/WorkflowCircdna.groovy b/lib/WorkflowCircdna.groovy index 3f00892e..f5ccc9df 100755 --- a/lib/WorkflowCircdna.groovy +++ b/lib/WorkflowCircdna.groovy @@ -2,6 +2,8 @@ // This file holds several functions specific to the workflow/circdna.nf in the nf-core/circdna pipeline // +import groovy.text.SimpleTemplateEngine + class WorkflowCircdna { // @@ -10,6 +12,7 @@ class WorkflowCircdna { public static void initialise(params, log) { genomeExistsError(params, log) + if (!params.fasta) { log.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." System.exit(1) @@ -41,9 +44,24 @@ class WorkflowCircdna { yaml_file_text += "data: |\n" yaml_file_text += "${summary_section}" return yaml_file_text - } + }// - // + public static String methodsDescriptionText(run_workflow, mqc_methods_yaml) { + // Convert to a named map so can be used as with familar NXF ${workflow} variable syntax in the MultiQC YML file + def meta = [:] + meta.workflow = run_workflow.toMap() + meta["manifest_map"] = run_workflow.manifest.toMap() + + meta["doi_text"] = meta.manifest_map.doi ? "(doi: ${meta.manifest_map.doi})" : "" + meta["nodoi_text"] = meta.manifest_map.doi ? "": "
<li>If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used. </li>" + + def methods_text = mqc_methods_yaml.text + + def engine = new SimpleTemplateEngine() + def description_html = engine.createTemplate(methods_text).make(meta) + + return description_html + }// // Exit pipeline if incorrect --genome key provided // private static void genomeExistsError(params, log) { diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 984d6555..48934af2 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -9,9 +9,8 @@ class WorkflowMain { // public static String citation(workflow) { return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + - // TODO nf-core: Add Zenodo DOI for pipeline after first release - //"* The pipeline\n" + - //" https://doi.org/10.5281/zenodo.XXXXXXX\n\n" + + "* The pipeline\n" + + " https://doi.org/10.5281/zenodo.6685250\n\n" + "* The nf-core framework\n" + " https://doi.org/10.1038/s41587-020-0439-x\n\n" + "* Software dependencies\n" + @@ -19,7 +18,7 @@ class WorkflowMain { } // - // Print help to screen if required + // Generate help string // public static String help(workflow, params, log) { def command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --genome GRCh37 -profile docker" @@ -32,7 +31,7 @@ class WorkflowMain { } // - // Print parameter summary log to screen + // Generate parameter summary log string // public static String paramsSummaryLog(workflow, params, log) { def summary_log = '' @@ -53,19 +52,27 @@ class WorkflowMain { System.exit(0) } - // Validate workflow parameters via the JSON schema - if (params.validate_params) { - NfcoreSchema.validateParameters(workflow, params, log) + // Print workflow version and exit on --version + if (params.version) { + String workflow_version = NfcoreTemplate.version(workflow) + log.info "${workflow.manifest.name} ${workflow_version}" + System.exit(0) } // Print parameter summary log to screen + log.info paramsSummaryLog(workflow, params, log) + // Validate workflow parameters via the JSON schema + if (params.validate_params) { + NfcoreSchema.validateParameters(workflow, params, log) + } + // Check that a -profile or Nextflow config has been provided to run the pipeline NfcoreTemplate.checkConfigProvided(workflow, log) // Check that conda channels are set-up correctly - if (params.enable_conda) { + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { Utils.checkCondaChannels(log) } @@ -78,17 +85,15 @@ class WorkflowMain { System.exit(1) } } - // // Get attribute from genome config file e.g.
fasta // - public static String getGenomeAttribute(params, attribute) { - def val = '' + public static Object getGenomeAttribute(params, attribute) { if (params.genomes && params.genome && params.genomes.containsKey(params.genome)) { if (params.genomes[ params.genome ].containsKey(attribute)) { - val = params.genomes[ params.genome ][ attribute ] + return params.genomes[ params.genome ][ attribute ] } } - return val + return null } } diff --git a/main.nf b/main.nf index 2096e436..69cd3ebd 100644 --- a/main.nf +++ b/main.nf @@ -4,6 +4,7 @@ nf-core/circdna ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Github : https://github.com/nf-core/circdna + Website: https://nf-co.re/circdna Slack : https://nfcore.slack.com/channels/circdna ---------------------------------------------------------------------------------------- @@ -55,6 +56,7 @@ workflow NFCORE_CIRCDNA { workflow { NFCORE_CIRCDNA () } + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THE END diff --git a/modules.json b/modules.json index 5306451d..06ba80fb 100644 --- a/modules.json +++ b/modules.json @@ -2,48 +2,94 @@ "name": "nf-core/circdna", "homePage": "https://github.com/nf-core/circdna", "repos": { - "nf-core/modules": { - "bwa/index": { - "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" + "https://github.com/nf-core/modules.git": { + "modules": { + "nf-core": { + "bwa/index": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules"] + }, + "cat/fastq": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules"] + }, + "custom/dumpsoftwareversions": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules"] + }, + "fastqc": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules"] + }, + "minimap2/align": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules"] + }, + "multiqc": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules"] + }, + "picard/markduplicates": { + "branch": "master", + "git_sha": "2f88b26e9804b99e98f7cd08e74c3f88288a3358", + "installed_by": ["modules"] + }, + "samtools/faidx": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules"] + }, + "samtools/flagstat": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules", "bam_stats_samtools"] + }, + "samtools/idxstats": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules", "bam_stats_samtools"] + }, + "samtools/index": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules"] + }, + "samtools/sort": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules"] + }, + "samtools/stats": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules", "bam_stats_samtools"] + }, + "samtools/view": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules"] + }, + "trimgalore": { + "branch": "master", + "git_sha": "72ffbd7128015a1d4b65b95ff8d37be8fee2f981", + "installed_by": ["modules"] + } + } }, - "bwa/mem": 
{ - "git_sha": "4f5274c3de0c9521f5033893ff61057a74c45ba9" - }, - "cat/fastq": { - "git_sha": "9aadd9a6d3f5964476582319b3a1c54a3e3fe7c9" - }, - "custom/dumpsoftwareversions": { - "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" - }, - "fastqc": { - "git_sha": "49b18b1639f4f7104187058866a8fab33332bdfe" - }, - "minimap2/align": { - "git_sha": "1a5a9e7b4009dcf34e6867dd1a5a1d9a718b027b" - }, - "picard/markduplicates": { - "git_sha": "49b18b1639f4f7104187058866a8fab33332bdfe" - }, - "samtools/flagstat": { - "git_sha": "897c33d5da084b61109500ee44c01da2d3e4e773" - }, - "samtools/idxstats": { - "git_sha": "897c33d5da084b61109500ee44c01da2d3e4e773" - }, - "samtools/index": { - "git_sha": "1ad73f1b2abdea9398680d6d20014838135c9a35" - }, - "samtools/sort": { - "git_sha": "1ad73f1b2abdea9398680d6d20014838135c9a35" - }, - "samtools/stats": { - "git_sha": "897c33d5da084b61109500ee44c01da2d3e4e773" - }, - "samtools/view": { - "git_sha": "12afb6b0faf3cabf769c9a2a7dd477e3f066eac0" - }, - "trimgalore": { - "git_sha": "85ec13ff1fc2196c5a507ea497de468101baabed" + "subworkflows": { + "nf-core": { + "bam_stats_samtools": { + "branch": "master", + "git_sha": "92eb5091ae5368a60cda58b3a0ced8b36d715b0f", + "installed_by": ["subworkflows"] + } + } } } } diff --git a/modules/local/ampliconarchitect/ampliconarchitect.nf b/modules/local/ampliconarchitect/ampliconarchitect.nf index b14e4500..6a96c30b 100644 --- a/modules/local/ampliconarchitect/ampliconarchitect.nf +++ b/modules/local/ampliconarchitect/ampliconarchitect.nf @@ -2,7 +2,7 @@ process AMPLICONARCHITECT_AMPLICONARCHITECT { tag "$meta.id" label 'process_low' - conda (params.enable_conda ? "conda-forge::python=2.7 bioconda::pysam=0.15.2 anaconda::flask=1.1.2 anaconda::cython=0.29.14 anaconda::numpy=1.16.6 anaconda::scipy=1.2.1 conda-forge::matplotlib=2.2.5 mosek::mosek=8.0.60 anaconda::future=0.18.2" : null) + conda "conda-forge::python=2.7 bioconda::pysam=0.15.2 anaconda::flask=1.1.2 anaconda::cython=0.29.14 anaconda::numpy=1.16.6 anaconda::scipy=1.2.1 conda-forge::matplotlib=2.2.5 mosek::mosek=8.0.60 anaconda::future=0.18.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/mulled-v2-6eefa51f13933d65b4f8155ca2f8cd81dea474ba:baa777f7c4e89a2ec4d1eab7d424a1f46503bac7-0': 'quay.io/biocontainers/mulled-v2-6eefa51f13933d65b4f8155ca2f8cd81dea474ba:baa777f7c4e89a2ec4d1eab7d424a1f46503bac7-0' }" diff --git a/modules/local/ampliconarchitect/ampliconclassifier.nf b/modules/local/ampliconarchitect/ampliconclassifier.nf deleted file mode 100644 index d9d575dd..00000000 --- a/modules/local/ampliconarchitect/ampliconclassifier.nf +++ /dev/null @@ -1,70 +0,0 @@ -process AMPLICONARCHITECT_AMPLICONCLASSIFIER { - tag "$meta.id" - label 'process_low' - - conda (params.enable_conda ? "bioconda::ampliconclassifier=0.4.5=hdfd78af_1" : null) - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/ampliconclassifier:0.4.5--hdfd78af_1': - 'quay.io/biocontainers/ampliconclassifier:0.4.5--hdfd78af_1' }" - - input: - tuple val(meta), path(cycles), path(graph) - - output: - tuple val(meta), path("*amplicon_classification_profiles.tsv") , emit: class_tsv , optional: true - tuple val(meta), path("*edge_classification_profiles.tsv") , emit: edge_tsv , optional: true - tuple val(meta), path("*gene_list.tsv") , emit: gene_list , optional: true - tuple val(meta), path("*ecDNA_counts.tsv") , emit: ecDNA_counts , optional: true - tuple val(meta), path("*.bed") , emit: bed , optional: true - tuple val(meta), path("*annotated_cycles.txt") , emit: annotated_cycles, optional: true - tuple val(meta), path("*class_radar.{png,pdf}") , emit: radar_plot , optional: true - tuple val(meta), path("*feature_entropy.tsv") , emit: entropy , optional: true - path "*.AmpliconClassifier.input" , emit: input , optional: true - path "*.classifier_stdout.log" , emit: log , optional: true - path "versions.yml" , emit: versions , optional: true - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - - """ - REF=${params.reference_build} - export AA_DATA_REPO=${params.aa_data_repo} - export AA_SRC=${projectDir}/bin - - # Make AmpliconClassifier Input from graph and cycles files - make_AmpliconClassifier_input.sh ./ ${meta.id}.AmpliconClassifier - - amplicon_classifier.py \\ - --ref \$REF \\ - $args \\ - --input ${meta.id}.AmpliconClassifier.input \\ - > ${meta.id}.classifier_stdout.log - - mv ${meta.id}_classification_bed_files/* ./ - mv ${meta.id}_annotated_cycles_files/* ./ - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) - END_VERSIONS - """ - - stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - export AA_DATA_REPO=${params.aa_data_repo} - export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} - export AA_SRC=${projectDir}/bin - REF=${params.reference_build} - - touch "${prefix}.amplicon_classification_profiles.tsv" - touch "${prefix}.classifier_stdout.log" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) - END_VERSIONS - """ -} diff --git a/modules/local/ampliconclassifier/ampliconclassifier.nf b/modules/local/ampliconclassifier/ampliconclassifier.nf new file mode 100644 index 00000000..1290a132 --- /dev/null +++ b/modules/local/ampliconclassifier/ampliconclassifier.nf @@ -0,0 +1,65 @@ +process AMPLICONCLASSIFIER_AMPLICONCLASSIFIER { + tag "AA Amplicons" + label 'process_low' + + conda "bioconda::ampliconclassifier=0.4.14" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/ampliconclassifier:0.4.14--hdfd78af_0': + 'quay.io/biocontainers/ampliconclassifier:0.4.14--hdfd78af_0' }" + + input: + path (input_file) + + output: + path ("*amplicon_classification_profiles.tsv" ), emit: class_tsv , optional: true + path ("*edge_classification_profiles.tsv" ), emit: edge_tsv , optional: true + path ("*gene_list.tsv" ) , emit: gene_list , optional: true + path ("*ecDNA_counts.tsv" ) , emit: ecDNA_counts , optional: true + path ("*.bed" ) , emit: bed , optional: true + path ("*annotated_cycles.txt" ) , emit: annotated_cycles, optional: true + path ("*class_radar.{png,pdf}" ) , emit: radar_plot , optional: true + path ("*feature_entropy.tsv" ) , emit: entropy , optional: true + path ("*feature_basic_properties.tsv" ) , emit: basic_properties, optional: true + path ("*classification_bed_files/*" ) , emit: bed_files , optional: true + path ("*annotated_cycles_files/" ) , emit: cycles_files , optional: true + path ("*.classifier_stdout.log" ) , emit: log , optional: true + path ("*" ) , emit: all , optional: true + path ("versions.yml" ) , emit: versions , optional: true + + script: + def args = task.ext.args ?: '' + + """ + REF=${params.reference_build} + export AA_DATA_REPO=${params.aa_data_repo} + export AA_SRC=${projectDir}/bin + + amplicon_classifier.py \\ + --ref \$REF \\ + $args \\ + --input $input_file \\ + > ampliconclassifier.classifier_stdout.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + """ + export AA_DATA_REPO=${params.aa_data_repo} + export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} + export AA_SRC=${projectDir}/bin + REF=${params.reference_build} + + touch "ampliconclassifier_amplicon_classification_profiles.tsv" + touch "ampliconclassifier_classifier_stdout.log" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) + END_VERSIONS + """ +} diff --git a/modules/local/ampliconarchitect/ampliconsimilarity.nf b/modules/local/ampliconclassifier/ampliconsimilarity.nf similarity index 53% rename from modules/local/ampliconarchitect/ampliconsimilarity.nf rename to modules/local/ampliconclassifier/ampliconsimilarity.nf index 87b207ec..cef9ad56 100644 --- a/modules/local/ampliconarchitect/ampliconsimilarity.nf +++ b/modules/local/ampliconclassifier/ampliconsimilarity.nf @@ -1,58 +1,50 @@ -process AMPLICONARCHITECT_AMPLICONSIMILARITY { - tag "$meta.id" +process AMPLICONCLASSIFIER_AMPLICONSIMILARITY { + tag "AA Amplicons" label 'process_low' - conda (params.enable_conda ? "bioconda::ampliconclassifier=0.4.5=hdfd78af_1" : null) + conda "bioconda::ampliconclassifier=0.4.14" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/ampliconclassifier:0.4.5--hdfd78af_1': - 'quay.io/biocontainers/ampliconclassifier:0.4.5--hdfd78af_1' }" + 'https://depot.galaxyproject.org/singularity/ampliconclassifier:0.4.14--hdfd78af_0': + 'quay.io/biocontainers/ampliconclassifier:0.4.14--hdfd78af_0' }" input: - tuple val(meta), path(cycles), path(graph) + path(input) output: - tuple val(meta), path("*_scores.tsv") , emit: scores + path("*_scores.tsv") , emit: scores path("*") - path "versions.yml" , emit: versions + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when script: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" """ REF=${params.reference_build} export AA_DATA_REPO=${params.aa_data_repo} export AA_SRC=${projectDir}/bin - make_AmpliconClassifier_input.sh ./ ${meta.id}.AmpliconSimilarity - amplicon_similarity.py \\ --ref \$REF \\ $args \\ - --input ${meta.id}.AmpliconSimilarity.input \\ + --input $input cat <<-END_VERSIONS > versions.yml "${task.process}": - python: \$(python --version | sed 's/Python //g') + AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) END_VERSIONS """ stub: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" """ REF=${params.reference_build} export AA_DATA_REPO=${params.aa_data_repo} export AA_SRC=${projectDir}/bin - touch "${prefix}.similarity_scores.tsv" + touch "ampliconclassifier_similarity_scores.tsv" - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS cat <<-END_VERSIONS > versions.yml "${task.process}": AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) diff --git a/modules/local/ampliconclassifier/makeinput.nf b/modules/local/ampliconclassifier/makeinput.nf new file mode 100644 index 00000000..c872589f --- /dev/null +++ b/modules/local/ampliconclassifier/makeinput.nf @@ -0,0 +1,43 @@ +process AMPLICONCLASSIFIER_MAKEINPUT { + tag 'AA Amplicons' + label 'process_low' + + conda "bioconda::ampliconclassifier=0.4.14" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/ampliconclassifier:0.4.14--hdfd78af_0': + 'quay.io/biocontainers/ampliconclassifier:0.4.14--hdfd78af_0' }" + + input: + path(graph) + path(cycles) + + output: + path "*.input" , emit: input + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + + """ + make_input.sh ./ ampliconclassifier + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + """ + touch "ampliconclassifier.input" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) + END_VERSIONS + """ +} diff --git a/modules/local/ampliconclassifier/makeresultstable.nf b/modules/local/ampliconclassifier/makeresultstable.nf new file mode 100644 index 00000000..20a6fafc --- /dev/null +++ b/modules/local/ampliconclassifier/makeresultstable.nf @@ -0,0 +1,58 @@ +process AMPLICONCLASSIFIER_MAKERESULTSTABLE { + tag 'AA Amplicons' + label 'process_low' + + conda "bioconda::ampliconclassifier=0.4.14" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ampliconclassifier:0.4.14--hdfd78af_0': + 'quay.io/biocontainers/ampliconclassifier:0.4.14--hdfd78af_0' }" + + input: + path (input_file) + path (class_file) + path (gene_list) + path (feature_entropy) + path (basic_properties) + path (bed_files) + + output: + path "*result_data.json" , emit: json + path "*result_table.tsv" , emit: tsv + path "index.html" , emit: html + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + # Create subdirectories in working directory + mkdir ampliconclassifier_classification_bed_files + mv $bed_files ampliconclassifier_classification_bed_files/ + + make_results_table.py \\ + $args \\ + --input $input_file \\ + --classification_file $class_file + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + + """ + touch ampliconclasifier_result_data.json + touch ampliconclasifier_result_table.tsv + touch index.html + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) + END_VERSIONS + """ +} diff --git a/modules/local/amplified_intervals.nf b/modules/local/amplified_intervals.nf index 225141bb..56da6cbd 100644 --- a/modules/local/amplified_intervals.nf +++ b/modules/local/amplified_intervals.nf @@ -2,7 +2,7 @@ process AMPLIFIED_INTERVALS { tag "$meta.id" label 'process_low' - conda (params.enable_conda ? 
"conda-forge::python=2.7 bioconda::pysam=0.15.2 anaconda::flask=1.1.2 anaconda::cython=0.29.14 anaconda::numpy=1.16.6 anaconda::scipy=1.2.1 conda-forge::matplotlib=2.2.5 mosek::mosek=8.0.60 anaconda::future=0.18.2" : null) + conda "conda-forge::python=2.7 bioconda::pysam=0.15.2 anaconda::flask=1.1.2 anaconda::cython=0.29.14 anaconda::numpy=1.16.6 anaconda::scipy=1.2.1 conda-forge::matplotlib=2.2.5 mosek::mosek=8.0.60 anaconda::future=0.18.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/mulled-v2-6eefa51f13933d65b4f8155ca2f8cd81dea474ba:baa777f7c4e89a2ec4d1eab7d424a1f46503bac7-0': 'quay.io/biocontainers/mulled-v2-6eefa51f13933d65b4f8155ca2f8cd81dea474ba:baa777f7c4e89a2ec4d1eab7d424a1f46503bac7-0' }" @@ -14,6 +14,9 @@ process AMPLIFIED_INTERVALS { tuple val(meta), path("*CNV_SEEDS.bed"), emit: bed path "versions.yml" , emit: versions + when: + task.ext.when == null || task.ext.when + script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" diff --git a/modules/local/bedtools/coverage.nf b/modules/local/bedtools/coverage.nf deleted file mode 100644 index f84006b4..00000000 --- a/modules/local/bedtools/coverage.nf +++ /dev/null @@ -1,36 +0,0 @@ -process BEDTOOLS_COVERAGE { - tag "$meta.id" - label 'process_medium' - - conda (params.enable_conda ? "bioconda::bedtools=2.30.0" : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/bedtools:2.30.0--h7d7f7ad_2" - } else { - container "quay.io/biocontainers/bedtools:2.30.0--h7d7f7ad_2" - } - - input: - tuple val(meta), path(bed), path(bam) - - output: - tuple val(meta), path("*.filterd.bed"), emit: bed - path "versions.yml" , emit: versions - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - bedtools \\ - coverage \\ - $args \\ - -@ $task.cpus \\ - -o ${prefix}.bam \\ - -T $prefix \\ - $bam - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - bedtools: \$(echo \$(bedtools --version) - END_VERSIONS - """ -} diff --git a/modules/local/bedtools/sortedbam2bed.nf b/modules/local/bedtools/sortedbam2bed.nf index f9eecb94..113a40b3 100644 --- a/modules/local/bedtools/sortedbam2bed.nf +++ b/modules/local/bedtools/sortedbam2bed.nf @@ -2,7 +2,7 @@ process BEDTOOLS_SORTEDBAM2BED { tag "$meta.id" label 'process_medium' - conda (params.enable_conda ? "bioconda::bedtools=2.30.0" : null) + conda "bioconda::bedtools=2.30.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/bedtools:2.30.0--h7d7f7ad_1': 'quay.io/biocontainers/bedtools:2.30.0--h7d7f7ad_2' }" diff --git a/modules/local/bedtools/splitbam2bed.nf b/modules/local/bedtools/splitbam2bed.nf index f801b516..a4b8990b 100644 --- a/modules/local/bedtools/splitbam2bed.nf +++ b/modules/local/bedtools/splitbam2bed.nf @@ -2,12 +2,10 @@ process BEDTOOLS_SPLITBAM2BED { tag "$meta.id" label 'process_medium' - conda (params.enable_conda ? 
"bioconda::bedtools=2.30.0" : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/bedtools:2.30.0--h7d7f7ad_1" - } else { - container "quay.io/biocontainers/bedtools:2.30.0--h7d7f7ad_2" - } + conda "bioconda::bedtools=2.30.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bedtools:2.30.0--h7d7f7ad_1' : + 'quay.io/biocontainers/bedtools:2.30.0--h7d7f7ad_2'}" input: tuple val(meta), path(split_bam) @@ -31,7 +29,7 @@ process BEDTOOLS_SPLITBAM2BED { awk 'BEGIN{FS=OFS=" "} {if ((\$9=="M" && \$NF=="H") || \ (\$9=="M" && \$NF=="S")) {printf ("%s\tfirst\\n",\$0)} else if ((\$9=="S" && \$NF=="M") || \ (\$9=="H" && \$NF=="M")) {printf ("%s\tsecond\\n",\$0)} }' | \ - awk 'BEGIN{FS=OFS="\t"} {gsub(" ", "", \$8)} 1' > '${prefix}.split.txt' + awk 'BEGIN{FS=OFS="\t"} {gsub(" ", "", \$8)} 1' > '${prefix}.txt' # Software Version cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/modules/bwa/mem/main.nf b/modules/local/bwa/mem/main.nf similarity index 59% rename from modules/nf-core/modules/bwa/mem/main.nf rename to modules/local/bwa/mem/main.nf index f55af944..e8a3951e 100644 --- a/modules/nf-core/modules/bwa/mem/main.nf +++ b/modules/local/bwa/mem/main.nf @@ -2,14 +2,14 @@ process BWA_MEM { tag "$meta.id" label 'process_high' - conda (params.enable_conda ? "bioconda::bwa=0.7.17 bioconda::samtools=1.15.1" : null) + conda "bioconda::bwa=0.7.17 bioconda::samtools=1.16.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:8110a70be2bfe7f75a2ea7f2a89cda4cc7732095-0' : - 'quay.io/biocontainers/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:8110a70be2bfe7f75a2ea7f2a89cda4cc7732095-0' }" + 'https://depot.galaxyproject.org/singularity/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:219b6c272b25e7e642ae3ff0bf0c5c81a5135ab4-0' : + 'quay.io/biocontainers/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:219b6c272b25e7e642ae3ff0bf0c5c81a5135ab4-0' }" input: tuple val(meta), path(reads) - path index + tuple val(meta2), path(index) val sort_bam output: @@ -25,7 +25,7 @@ process BWA_MEM { def prefix = task.ext.prefix ?: "${meta.id}" def samtools_command = sort_bam ? 'sort' : 'view' """ - INDEX=`find -L ./ -name "*.amb" | sed 's/.amb//'` + INDEX=`find -L ./ -name "*.amb" | sed 's/\\.amb\$//'` bwa mem \\ $args \\ @@ -40,4 +40,19 @@ process BWA_MEM { samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') END_VERSIONS """ + + stub: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def samtools_command = sort_bam ? 
'sort' : 'view' + """ + touch ${prefix}.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwa: \$(echo \$(bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ } diff --git a/modules/nf-core/modules/bwa/mem/meta.yml b/modules/local/bwa/mem/meta.yml similarity index 90% rename from modules/nf-core/modules/bwa/mem/meta.yml rename to modules/local/bwa/mem/meta.yml index f84c5227..62357bf8 100644 --- a/modules/nf-core/modules/bwa/mem/meta.yml +++ b/modules/local/bwa/mem/meta.yml @@ -28,6 +28,11 @@ input: description: | List of input FastQ files of size 1 and 2 for single-end and paired-end data, respectively. + - meta2: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] - index: type: file description: BWA genome index files diff --git a/modules/local/circlefinder.nf b/modules/local/circlefinder.nf index 2f88c445..0dd0c827 100644 --- a/modules/local/circlefinder.nf +++ b/modules/local/circlefinder.nf @@ -7,7 +7,10 @@ process CIRCLEFINDER { output: tuple val(meta), path("*.microDNA-JT.txt") , optional: true, emit: circdna - tuple val(meta), path("*.circle_finder_exit_log.txt") , optional: true + tuple val(meta), path("*.circle_finder_exit_log.txt") , optional: true, emit: log + + when: + task.ext.when == null || task.ext.when script: def args = task.ext.args ?: '' @@ -96,6 +99,6 @@ process CIRCLEFINDER { else if (\$7=="+" && \$9=="second" && \$2<\$12 && \$22>=\$2 && \$23<=\$13) {printf ("%s\\t%d\\t%d\\n",\$1,\$2,\$13)} \ else if (\$17=="-" && \$19=="second" && \$12<\$2 && \$22>=\$12 && \$23<=\$3) {printf ("%s\\t%d\\t%d\\n",\$1,\$12,\$3)} \ else if (\$7=="-" && \$9=="second" && \$2<\$12 && \$22>=\$2 && \$23<=\$13) {printf ("%s\\t%d\\t%d\\n",\$1,\$2,\$13)} }' | \ - sort | uniq -c | awk '{printf ("%s\\t%d\\t%d\\t%d\\n",\$2,\$3,\$4,\$1)}' > ${prefix}.microDNA-JT.txt + sort | uniq -c | awk '{printf ("%s\\t%d\\t%d\\t%d\\n",\$2,\$3,\$4,\$1)}' > ${prefix}.microDNA-JT.txt """ } diff --git a/modules/local/circlemap/readextractor.nf b/modules/local/circlemap/readextractor.nf index 7d2ecc7c..a9f59455 100644 --- a/modules/local/circlemap/readextractor.nf +++ b/modules/local/circlemap/readextractor.nf @@ -2,7 +2,7 @@ process CIRCLEMAP_READEXTRACTOR { tag "$meta.id" label 'process_low' - conda (params.enable_conda ? "bioconda::circle-map=1.1.4=pyh5e36f6f_2" : null) + conda "bioconda::circle-map=1.1.4=pyh5e36f6f_2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/circle-map:1.1.4--pyh5e36f6f_2': 'quay.io/biocontainers/circle-map:1.1.4--pyh5e36f6f_2' }" diff --git a/modules/local/circlemap/realign.nf b/modules/local/circlemap/realign.nf index 144be937..ba6e9dc4 100644 --- a/modules/local/circlemap/realign.nf +++ b/modules/local/circlemap/realign.nf @@ -1,7 +1,8 @@ process CIRCLEMAP_REALIGN { tag "$meta.id" label 'process_high' - conda (params.enable_conda ? "bioconda::circle-map=1.1.4=pyh5e36f6f_2" : null) + + conda "bioconda::circle-map=1.1.4=pyh5e36f6f_2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/circle-map:1.1.4--pyh5e36f6f_2': 'quay.io/biocontainers/circle-map:1.1.4--pyh5e36f6f_2' }" diff --git a/modules/local/circlemap/repeats.nf b/modules/local/circlemap/repeats.nf index 83923828..fd065eb0 100644 --- a/modules/local/circlemap/repeats.nf +++ b/modules/local/circlemap/repeats.nf @@ -2,7 +2,7 @@ process CIRCLEMAP_REPEATS { tag "$meta.id" label 'process_low' - conda (params.enable_conda ? "bioconda::circle-map=1.1.4=pyh5e36f6f_2" : null) + conda "bioconda::circle-map=1.1.4=pyh5e36f6f_2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/circle-map:1.1.4--pyh5e36f6f_2': 'quay.io/biocontainers/circle-map:1.1.4--pyh5e36f6f_2' }" diff --git a/modules/local/cnvkit/batch/main.nf b/modules/local/cnvkit/batch/main.nf index 13e2eb5f..b096a403 100644 --- a/modules/local/cnvkit/batch/main.nf +++ b/modules/local/cnvkit/batch/main.nf @@ -2,7 +2,7 @@ process CNVKIT_BATCH { tag "$meta.id" label 'process_low' - conda (params.enable_conda ? 'bioconda::cnvkit=0.9.9' : null) + conda 'bioconda::cnvkit=0.9.9' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/cnvkit:0.9.9--pyhdfd78af_0' : 'quay.io/biocontainers/cnvkit:0.9.9--pyhdfd78af_0' }" @@ -10,6 +10,7 @@ process CNVKIT_BATCH { input: tuple val(meta), path(bam), path(bai) path fasta + path cnn output: tuple val(meta), path("*.bed"), emit: bed @@ -23,9 +24,9 @@ process CNVKIT_BATCH { script: def args = task.ext.args ?: '' - def reference = params.aa_data_repo + "/" + params.reference_build + "/" + params.reference_build + "_cnvkit_filtered_ref.cnn" - def fasta_args = reference ? "" : "--fasta $fasta" - def reference_args = reference ? "--reference $reference" : "" + def reference_args = cnn ? "--reference $cnn" : "" + def fasta_args = cnn ? "" : "--fasta $fasta" +"" """ cnvkit.py \\ @@ -45,9 +46,8 @@ process CNVKIT_BATCH { stub: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def reference = params.aa_data_repo + "/" + params.reference_build + "/" + params.reference_build + "_cnvkit_filtered_ref.cnn" - def fasta_args = reference ? "" : "--fasta $fasta" - def reference_args = reference ? "--reference $reference" : "" + def fasta_args = cnn ? "" : "--fasta $fasta" + def reference_args = cnn ? "--reference $cnn" : "" """ touch ${prefix}.bed diff --git a/modules/local/cnvkit/batch/meta.yml b/modules/local/cnvkit/batch/meta.yml deleted file mode 100644 index 474c55f2..00000000 --- a/modules/local/cnvkit/batch/meta.yml +++ /dev/null @@ -1,93 +0,0 @@ -name: cnvkit_batch -description: Copy number variant detection from high-throughput sequencing data -keywords: - - bam - - fasta - - copy number -tools: - - cnvkit: - description: | - CNVkit is a Python library and command-line software toolkit to infer and visualize copy number from high-throughput DNA sequencing data. It is designed for use with hybrid capture, including both whole-exome and custom target panels, and short-read sequencing platforms such as Illumina and Ion Torrent. - homepage: https://cnvkit.readthedocs.io/en/stable/index.html - documentation: https://cnvkit.readthedocs.io/en/stable/index.html - licence: ["Apache-2.0"] -params: - - outdir: - type: string - description: | - The pipeline's output directory. 
By default, the module will - output files into `$params.outdir/` - - publish_dir_mode: - type: string - description: | - Value for the Nextflow `publishDir` mode parameter. - Available: symlink, rellink, link, copy, copyNoFollow, move. - - enable_conda: - type: boolean - description: | - Run the module with Conda using the software specified - via the `conda` directive - - singularity_pull_docker_container: - type: boolean - description: | - Instead of directly downloading Singularity images for use with Singularity, - force the workflow to pull and convert Docker containers instead. -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - tumour: - type: file - description: | - Input tumour sample bam file (or cram) - - normal: - type: file - description: | - Input normal sample bam file (or cram) - - fasta: - type: file - description: | - Input reference genome fasta file - - targetfile: - type: file - description: | - Input target bed file - - reference: - type: file - description: | - Input reference cnn-file (only for germline and tumor-only running) -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - bed: - type: file - description: File containing genomic regions - pattern: "*.{bed}" - - cnn: - type: file - description: File containing coverage information - pattern: "*.{cnn}" - - cnr: - type: file - description: File containing copy number ratio information - pattern: "*.{cnr}" - - cns: - type: file - description: File containing copy number segment information - pattern: "*.{cns}" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@kaurravneet4123" - - "@KevinMenden" - - "@MaxUlysse" - - "@drpatelh" - - "@fbdtemme" - - "@lassefolkersen" diff --git a/modules/local/collect_seeds.nf b/modules/local/collect_seeds.nf index 5cae5033..7654659a 100644 --- a/modules/local/collect_seeds.nf +++ b/modules/local/collect_seeds.nf @@ -2,10 +2,10 @@ process COLLECT_SEEDS { tag "$meta.id" label 'process_low' - conda (params.enable_conda ? "conda-forge::python=3.9.5" : null) - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.9--1' : - 'quay.io/biocontainers/python:3.9--1' }" + conda "conda-forge::python=3.9.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/python:3.9--1' : + 'quay.io/biocontainers/python:3.9--1' }" input: tuple val(meta), path(cns) @@ -14,6 +14,9 @@ process COLLECT_SEEDS { tuple val(meta), path("*.bed"), emit: bed path "versions.yml" , emit: versions + when: + task.ext.when == null || task.ext.when + script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" diff --git a/modules/local/getcircularreads.nf b/modules/local/getcircularreads.nf index cc881c7f..ddb37e55 100644 --- a/modules/local/getcircularreads.nf +++ b/modules/local/getcircularreads.nf @@ -6,6 +6,10 @@ process GETCIRCULARREADS { output: tuple val(meta), path("*unicycler.circular.fastq.gz"), optional: true, emit: fastq + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when script: def args = task.ext.args ?: '' @@ -17,8 +21,13 @@ process GETCIRCULARREADS { cat temp.fastq | grep -A3 "circular=true" | \\ grep -v "^--" | \\ gzip --no-name > \\ - ${prefix}.unicycler.circular.fastq.gz + ${prefix}.fastq.gz fi rm temp.fastq + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS """ } diff --git a/modules/local/multiqc.nf b/modules/local/multiqc.nf index 63758a33..024968d9 100644 --- a/modules/local/multiqc.nf +++ b/modules/local/multiqc.nf @@ -1,10 +1,10 @@ process MULTIQC { label 'process_medium' - conda (params.enable_conda ? 'bioconda::multiqc=1.12' : null) + conda 'bioconda::multiqc=1.13a' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.12--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.12--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.13a--pyhdfd78af_1' : + 'quay.io/biocontainers/multiqc:1.13a--pyhdfd78af_1' }" input: path multiqc_config @@ -28,6 +28,9 @@ process MULTIQC { path "*_plots" , optional:true, emit: plots path "versions.yml" , emit: versions + when: + task.ext.when == null || task.ext.when + script: def args = task.ext.args ?: '' def custom_config = params.multiqc_config ? "--config $multiqc_custom_config" : '' diff --git a/modules/local/samblaster.nf b/modules/local/samblaster.nf index 8e1b1645..51878911 100644 --- a/modules/local/samblaster.nf +++ b/modules/local/samblaster.nf @@ -2,7 +2,7 @@ process SAMBLASTER { tag "$meta.id" label 'process_low' - conda (params.enable_conda ? "bioconda::samblaster=0.1.26 bioconda::samtools=1.14" : null) + conda "bioconda::samblaster=0.1.26 bioconda::samtools=1.14" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/mulled-v2-19fa9f1a5c3966b63a24166365e81da35738c5ab:ba4a02b56f3e524a6e006bcd99fe8cc1d7fe09eb-0' : 'quay.io/biocontainers/mulled-v2-19fa9f1a5c3966b63a24166365e81da35738c5ab:ba4a02b56f3e524a6e006bcd99fe8cc1d7fe09eb-0' }" @@ -14,6 +14,9 @@ process SAMBLASTER { tuple val(meta), path("*.split.bam"), emit: split_bam path "versions.yml" , emit: versions + when: + task.ext.when == null || task.ext.when + script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index 073d1f16..505de488 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -1,5 +1,6 @@ process SAMPLESHEET_CHECK { tag "$samplesheet" + label 'process_low' conda (params.enable_conda ? "conda-forge::python=3.8.3" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? @@ -13,6 +14,9 @@ process SAMPLESHEET_CHECK { path '*.csv' , emit: csv path "versions.yml", emit: versions + when: + task.ext.when == null || task.ext.when + script: // This script is bundled with the pipeline, in nf-core/circdna/bin/ """ check_samplesheet.py \\ diff --git a/modules/local/samtools/flagstat/main.nf b/modules/local/samtools/flagstat/main.nf new file mode 100644 index 00000000..0c8b40c9 --- /dev/null +++ b/modules/local/samtools/flagstat/main.nf @@ -0,0 +1,46 @@ +process SAMTOOLS_FLAGSTAT { + tag "$meta.id" + label 'process_single' + + conda "bioconda::samtools=1.16.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : + 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" + + input: + tuple val(meta), path(bam), path(bai) + + output: + tuple val(meta), path("*.flagstat"), emit: flagstat + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + samtools \\ + flagstat \\ + --threads ${task.cpus} \\ + $bam \\ + > ${prefix}.flagstat + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.flagstat + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/samtools/flagstat/meta.yml b/modules/local/samtools/flagstat/meta.yml similarity index 100% rename from modules/nf-core/modules/samtools/flagstat/meta.yml rename to modules/local/samtools/flagstat/meta.yml diff --git a/modules/local/samtools/idxstats/main.nf b/modules/local/samtools/idxstats/main.nf new file mode 100644 index 00000000..7808f910 --- /dev/null +++ b/modules/local/samtools/idxstats/main.nf @@ -0,0 +1,48 @@ +process SAMTOOLS_IDXSTATS { + tag "$meta.id" + label 'process_single' + + conda "bioconda::samtools=1.16.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : + 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" + + input: + tuple val(meta), path(bam), path(bai) + + output: + tuple val(meta), path("*.idxstats"), emit: idxstats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + samtools \\ + idxstats \\ + --threads ${task.cpus-1} \\ + $bam \\ + > ${prefix}.idxstats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.idxstats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/samtools/idxstats/meta.yml b/modules/local/samtools/idxstats/meta.yml similarity index 100% rename from modules/nf-core/modules/samtools/idxstats/meta.yml rename to modules/local/samtools/idxstats/meta.yml diff --git a/modules/local/seqtk/seq.nf b/modules/local/seqtk/seq.nf index 2a501a0f..d8f422fb 100644 --- a/modules/local/seqtk/seq.nf +++ b/modules/local/seqtk/seq.nf @@ -1,7 +1,8 @@ process SEQTK_SEQ { tag "$meta.id" label 'process_low' - conda (params.enable_conda ? "bioconda::seqtk=1.3" : null) + + conda "bioconda::seqtk=1.3" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/seqtk:1.3--hed695b0_2': 'quay.io/biocontainers/seqtk:1.3--hed695b0_2' }" @@ -23,7 +24,7 @@ process SEQTK_SEQ { $args \\ -F "#" \\ $fasta | \\ - gzip -c > ${prefix}.unicycler.fastq.gz + gzip -c > ${prefix}.fastq.gz cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/summarise_aa.nf b/modules/local/summarise_aa.nf index 40bdaa8d..739b42c1 100644 --- a/modules/local/summarise_aa.nf +++ b/modules/local/summarise_aa.nf @@ -2,7 +2,7 @@ process SUMMARISE_AA { tag "$meta.id" label 'process_low' - conda (params.enable_conda ? "pandas=1.1.5" : null) + conda "pandas=1.1.5" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/pandas:1.1.5' : 'quay.io/biocontainers/pandas:1.1.5' }" @@ -14,6 +14,9 @@ process SUMMARISE_AA { tuple val(meta), path("*aa_results_summary.tsv"), emit: txt path "versions.yml" , emit: versions + when: + task.ext.when == null || task.ext.when + script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" diff --git a/modules/local/unicycler/main.nf b/modules/local/unicycler/main.nf index 7f564afd..cf83079a 100644 --- a/modules/local/unicycler/main.nf +++ b/modules/local/unicycler/main.nf @@ -2,10 +2,10 @@ process UNICYCLER { tag "$meta.id" label 'process_high' - conda (params.enable_conda ? 'bioconda::unicycler=0.5.0=py39h2add14b_1' : null) + conda 'bioconda::unicycler=0.5.0=py39h2add14b_1' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/unicycler:0.5.0--py39h2add14b_1' : - 'quay.io/biocontainers/unicycler:0.5.0--py37h09c1ff4_1' }" + 'quay.io/biocontainers/unicycler:0.5.0--py39h2add14b_1' }" input: tuple val(meta), path(shortreads) diff --git a/modules/local/unicycler/meta.yml b/modules/local/unicycler/meta.yml deleted file mode 100644 index 918faaf8..00000000 --- a/modules/local/unicycler/meta.yml +++ /dev/null @@ -1,60 +0,0 @@ -name: unicycler -description: Assembles bacterial genomes -keywords: - - genome - - assembly - - genome assembler - - small genome -tools: - - unicycler: - description: Hybrid assembly pipeline for bacterial genomes - homepage: https://github.com/rrwick/Unicycler - documentation: https://github.com/rrwick/Unicycler - tool_dev_url: https://github.com/rrwick/Unicycler - doi: 10.1371/journal.pcbi.1005595 - licence: ["GPL v3"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - shortreads: - type: file - description: | - List of input Illumina FastQ files of size 1 and 2 for single-end and paired-end data, - respectively. - - longreads: - type: file - description: | - List of input FastQ files of size 1, PacBio or Nanopore long reads. -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - scaffolds: - type: file - description: Fasta file containing scaffolds - pattern: "*.{scaffolds.fa.gz}" - - gfa: - type: file - description: gfa file containing assembly - pattern: "*.{assembly.gfa.gz}" - - log: - type: file - description: unicycler log file - pattern: "*.{log}" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@JoseEspinosa" - - "@drpatelh" - - "@d4straub" diff --git a/modules/nf-core/modules/bwa/index/main.nf b/modules/nf-core/bwa/index/main.nf similarity index 57% rename from modules/nf-core/modules/bwa/index/main.nf rename to modules/nf-core/bwa/index/main.nf index 3affbf16..7ccf3110 100644 --- a/modules/nf-core/modules/bwa/index/main.nf +++ b/modules/nf-core/bwa/index/main.nf @@ -1,18 +1,18 @@ process BWA_INDEX { tag "$fasta" - label 'process_high' + label 'process_single' - conda (params.enable_conda ? "bioconda::bwa=0.7.17" : null) + conda "bioconda::bwa=0.7.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/bwa:0.7.17--hed695b0_7' : 'quay.io/biocontainers/bwa:0.7.17--hed695b0_7' }" input: - path fasta + tuple val(meta), path(fasta) output: - path "bwa" , emit: index - path "versions.yml", emit: versions + tuple val(meta), path(bwa) , emit: index + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -32,4 +32,20 @@ process BWA_INDEX { bwa: \$(echo \$(bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//') END_VERSIONS """ + + stub: + """ + mkdir bwa + + touch bwa/genome.amb + touch bwa/genome.ann + touch bwa/genome.bwt + touch bwa/genome.pac + touch bwa/genome.sa + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwa: \$(echo \$(bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//') + END_VERSIONS + """ } diff --git a/modules/nf-core/modules/bwa/index/meta.yml b/modules/nf-core/bwa/index/meta.yml similarity index 73% rename from modules/nf-core/modules/bwa/index/meta.yml rename to modules/nf-core/bwa/index/meta.yml index 2bbd81d9..2c6cfcd7 100644 --- a/modules/nf-core/modules/bwa/index/meta.yml +++ b/modules/nf-core/bwa/index/meta.yml @@ -15,10 +15,20 @@ tools: arxiv: arXiv:1303.3997 licence: ["GPL-3.0-or-later"] input: + - meta: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] - fasta: type: file description: Input genome fasta file output: + - meta: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] - index: type: file description: BWA genome index files diff --git a/modules/nf-core/modules/cat/fastq/main.nf b/modules/nf-core/cat/fastq/main.nf similarity index 59% rename from modules/nf-core/modules/cat/fastq/main.nf rename to modules/nf-core/cat/fastq/main.nf index b6854895..8a0b5600 100644 --- a/modules/nf-core/modules/cat/fastq/main.nf +++ b/modules/nf-core/cat/fastq/main.nf @@ -1,8 +1,8 @@ process CAT_FASTQ { tag "$meta.id" - label 'process_low' + label 'process_single' - conda (params.enable_conda ? "conda-forge::sed=4.7" : null) + conda "conda-forge::sed=4.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : 'ubuntu:20.04' }" @@ -20,9 +20,9 @@ process CAT_FASTQ { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def readList = reads.collect{ it.toString() } + def readList = reads instanceof List ? reads.collect{ it.toString() } : [reads.toString()] if (meta.single_end) { - if (readList.size > 1) { + if (readList.size >= 1) { """ cat ${readList.join(' ')} > ${prefix}.merged.fastq.gz @@ -33,7 +33,7 @@ process CAT_FASTQ { """ } } else { - if (readList.size > 2) { + if (readList.size >= 2) { def read1 = [] def read2 = [] readList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v } @@ -48,4 +48,33 @@ process CAT_FASTQ { """ } } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads instanceof List ? 
reads.collect{ it.toString() } : [reads.toString()] + if (meta.single_end) { + if (readList.size > 1) { + """ + touch ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size > 2) { + """ + touch ${prefix}_1.merged.fastq.gz + touch ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } + } diff --git a/modules/nf-core/modules/cat/fastq/meta.yml b/modules/nf-core/cat/fastq/meta.yml similarity index 100% rename from modules/nf-core/modules/cat/fastq/meta.yml rename to modules/nf-core/cat/fastq/meta.yml diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf similarity index 79% rename from modules/nf-core/modules/custom/dumpsoftwareversions/main.nf rename to modules/nf-core/custom/dumpsoftwareversions/main.nf index 327d5100..3df21765 100644 --- a/modules/nf-core/modules/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -1,11 +1,11 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { - label 'process_low' + label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda (params.enable_conda ? "bioconda::multiqc=1.11" : null) + conda "bioconda::multiqc=1.13" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.11--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.11--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : + 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml similarity index 100% rename from modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml rename to modules/nf-core/custom/dumpsoftwareversions/meta.yml diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py new file mode 100755 index 00000000..e55b8d43 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python + + +"""Provide functions to merge multiple versions.yml files.""" + + +import platform +from textwrap import dedent + +import yaml + + +def _make_versions_html(versions): + """Generate a tabular HTML output of all versions for MultiQC.""" + html = [ + dedent( + """\\ + + + + + + + + + + """ + ) + ] + for process, tmp_versions in sorted(versions.items()): + html.append("") + for i, (tool, version) in enumerate(sorted(tmp_versions.items())): + html.append( + dedent( + f"""\\ + + + + + + """ + ) + ) + html.append("") + html.append("
+                        <th> Process Name </th><th> Software </th><th> Version </th>
+                        <td><samp>{process if (i == 0) else ''}</samp></td><td><samp>{tool}</samp></td><td><samp>{version}</samp></td>
    ") + return "\\n".join(html) + + +def main(): + """Load all version files and generate merged output.""" + versions_this_module = {} + versions_this_module["${task.process}"] = { + "python": platform.python_version(), + "yaml": yaml.__version__, + } + + with open("$versions") as f: + versions_by_process = yaml.load(f, Loader=yaml.BaseLoader) | versions_this_module + + # aggregate versions by the module name (derived from fully-qualified process name) + versions_by_module = {} + for process, process_versions in versions_by_process.items(): + module = process.split(":")[-1] + try: + if versions_by_module[module] != process_versions: + raise AssertionError( + "We assume that software versions are the same between all modules. " + "If you see this error-message it means you discovered an edge-case " + "and should open an issue in nf-core/tools. " + ) + except KeyError: + versions_by_module[module] = process_versions + + versions_by_module["Workflow"] = { + "Nextflow": "$workflow.nextflow.version", + "$workflow.manifest.name": "$workflow.manifest.version", + } + + versions_mqc = { + "id": "software_versions", + "section_name": "${workflow.manifest.name} Software Versions", + "section_href": "https://github.com/${workflow.manifest.name}", + "plot_type": "html", + "description": "are collected at run time from the software output.", + "data": _make_versions_html(versions_by_module), + } + + with open("software_versions.yml", "w") as f: + yaml.dump(versions_by_module, f, default_flow_style=False) + with open("software_versions_mqc.yml", "w") as f: + yaml.dump(versions_mqc, f, default_flow_style=False) + + with open("versions.yml", "w") as f: + yaml.dump(versions_this_module, f, default_flow_style=False) + + +if __name__ == "__main__": + main() diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf new file mode 100644 index 00000000..9ae58381 --- /dev/null +++ b/modules/nf-core/fastqc/main.nf @@ -0,0 +1,51 @@ +process FASTQC { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::fastqc=0.11.9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0' : + 'quay.io/biocontainers/fastqc:0.11.9--0' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.html"), emit: html + tuple val(meta), path("*.zip") , emit: zip + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + // Make list of old name and new name pairs to use for renaming in the bash while loop + def old_new_pairs = reads instanceof Path || reads.size() == 1 ? 
[[ reads, "${prefix}.${reads.extension}" ]] : reads.withIndex().collect { entry, index -> [ entry, "${prefix}_${index + 1}.${entry.extension}" ] } + def rename_to = old_new_pairs*.join(' ').join(' ') + def renamed_files = old_new_pairs.collect{ old_name, new_name -> new_name }.join(' ') + """ + printf "%s %s\\n" $rename_to | while read old_name new_name; do + [ -f "\${new_name}" ] || ln -s \$old_name \$new_name + done + fastqc $args --threads $task.cpus $renamed_files + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" ) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.html + touch ${prefix}.zip + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/fastqc/meta.yml b/modules/nf-core/fastqc/meta.yml similarity index 100% rename from modules/nf-core/modules/fastqc/meta.yml rename to modules/nf-core/fastqc/meta.yml diff --git a/modules/nf-core/modules/minimap2/align/main.nf b/modules/nf-core/minimap2/align/main.nf similarity index 86% rename from modules/nf-core/modules/minimap2/align/main.nf rename to modules/nf-core/minimap2/align/main.nf index 08ac6eef..430dbab9 100644 --- a/modules/nf-core/modules/minimap2/align/main.nf +++ b/modules/nf-core/minimap2/align/main.nf @@ -2,7 +2,8 @@ process MINIMAP2_ALIGN { tag "$meta.id" label 'process_medium' - conda (params.enable_conda ? 'bioconda::minimap2=2.21 bioconda::samtools=1.12' : null) + // Note: the versions here need to match the versions used in the mulled container below and minimap2/index + conda "bioconda::minimap2=2.24 bioconda::samtools=1.14" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' : 'quay.io/biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' }" @@ -25,7 +26,6 @@ process MINIMAP2_ALIGN { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def input_reads = meta.single_end ? "$reads" : "${reads[0]} ${reads[1]}" def bam_output = bam_format ? "-a | samtools sort | samtools view -@ ${task.cpus} -b -h -o ${prefix}.bam" : "-o ${prefix}.paf" def cigar_paf = cigar_paf_format && !bam_format ? "-c" : '' def set_cigar_bam = cigar_bam && bam_format ? 
"-L" : '' @@ -33,8 +33,8 @@ process MINIMAP2_ALIGN { minimap2 \\ $args \\ -t $task.cpus \\ - $reference \\ - $input_reads \\ + "${reference ?: reads}" \\ + "$reads" \\ $cigar_paf \\ $set_cigar_bam \\ $bam_output diff --git a/modules/nf-core/modules/minimap2/align/meta.yml b/modules/nf-core/minimap2/align/meta.yml similarity index 100% rename from modules/nf-core/modules/minimap2/align/meta.yml rename to modules/nf-core/minimap2/align/meta.yml diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/modules/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py deleted file mode 100644 index d1390392..00000000 --- a/modules/nf-core/modules/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env python - -import yaml -import platform -from textwrap import dedent - - -def _make_versions_html(versions): - html = [ - dedent( - """\\ - - - - - - - - - - """ - ) - ] - for process, tmp_versions in sorted(versions.items()): - html.append("") - for i, (tool, version) in enumerate(sorted(tmp_versions.items())): - html.append( - dedent( - f"""\\ - - - - - - """ - ) - ) - html.append("") - html.append("
-                        <th> Process Name </th><th> Software </th><th> Version </th>
-                        <td><samp>{process if (i == 0) else ''}</samp></td><td><samp>{tool}</samp></td><td><samp>{version}</samp></td>
    ") - return "\\n".join(html) - - -versions_this_module = {} -versions_this_module["${task.process}"] = { - "python": platform.python_version(), - "yaml": yaml.__version__, -} - -with open("$versions") as f: - versions_by_process = yaml.load(f, Loader=yaml.BaseLoader) | versions_this_module - -# aggregate versions by the module name (derived from fully-qualified process name) -versions_by_module = {} -for process, process_versions in versions_by_process.items(): - module = process.split(":")[-1] - try: - assert versions_by_module[module] == process_versions, ( - "We assume that software versions are the same between all modules. " - "If you see this error-message it means you discovered an edge-case " - "and should open an issue in nf-core/tools. " - ) - except KeyError: - versions_by_module[module] = process_versions - -versions_by_module["Workflow"] = { - "Nextflow": "$workflow.nextflow.version", - "$workflow.manifest.name": "$workflow.manifest.version", -} - -versions_mqc = { - "id": "software_versions", - "section_name": "${workflow.manifest.name} Software Versions", - "section_href": "https://github.com/${workflow.manifest.name}", - "plot_type": "html", - "description": "are collected at run time from the software output.", - "data": _make_versions_html(versions_by_module), -} - -with open("software_versions.yml", "w") as f: - yaml.dump(versions_by_module, f, default_flow_style=False) -with open("software_versions_mqc.yml", "w") as f: - yaml.dump(versions_mqc, f, default_flow_style=False) - -with open("versions.yml", "w") as f: - yaml.dump(versions_this_module, f, default_flow_style=False) diff --git a/modules/nf-core/modules/fastqc/main.nf b/modules/nf-core/modules/fastqc/main.nf deleted file mode 100644 index 05730368..00000000 --- a/modules/nf-core/modules/fastqc/main.nf +++ /dev/null @@ -1,59 +0,0 @@ -process FASTQC { - tag "$meta.id" - label 'process_medium' - - conda (params.enable_conda ? "bioconda::fastqc=0.11.9" : null) - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0' : - 'quay.io/biocontainers/fastqc:0.11.9--0' }" - - input: - tuple val(meta), path(reads) - - output: - tuple val(meta), path("*.html"), emit: html - tuple val(meta), path("*.zip") , emit: zip - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - // Add soft-links to original FastQs for consistent naming in pipeline - def prefix = task.ext.prefix ?: "${meta.id}" - if (meta.single_end) { - """ - [ ! -f ${prefix}.fastq.gz ] && ln -s $reads ${prefix}.fastq.gz - fastqc $args --threads $task.cpus ${prefix}.fastq.gz - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" ) - END_VERSIONS - """ - } else { - """ - [ ! -f ${prefix}_1.fastq.gz ] && ln -s ${reads[0]} ${prefix}_1.fastq.gz - [ ! 
-f ${prefix}_2.fastq.gz ] && ln -s ${reads[1]} ${prefix}_2.fastq.gz - fastqc $args --threads $task.cpus ${prefix}_1.fastq.gz ${prefix}_2.fastq.gz - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" ) - END_VERSIONS - """ - } - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch ${prefix}.html - touch ${prefix}.zip - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" ) - END_VERSIONS - """ -} diff --git a/modules/nf-core/modules/samtools/view/main.nf b/modules/nf-core/modules/samtools/view/main.nf deleted file mode 100644 index 11cfb74b..00000000 --- a/modules/nf-core/modules/samtools/view/main.nf +++ /dev/null @@ -1,44 +0,0 @@ -process SAMTOOLS_VIEW { - tag "$meta.id" - label 'process_medium' - - conda (params.enable_conda ? "bioconda::samtools=1.15.1" : null) - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.15.1--h1170115_0' : - 'quay.io/biocontainers/samtools:1.15.1--h1170115_0' }" - - input: - tuple val(meta), path(input), path(index) - path fasta - - output: - tuple val(meta), path("*.bam") , emit: bam , optional: true - tuple val(meta), path("*.cram"), emit: cram, optional: true - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def args2 = task.ext.args2 ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def reference = fasta ? "--reference ${fasta} -C" : "" - def file_type = input.getExtension() - if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" - """ - samtools \\ - view \\ - --threads ${task.cpus-1} \\ - ${reference} \\ - $args \\ - $input \\ - $args2 \\ - > ${prefix}.${file_type} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf new file mode 100644 index 00000000..68f66bea --- /dev/null +++ b/modules/nf-core/multiqc/main.nf @@ -0,0 +1,53 @@ +process MULTIQC { + label 'process_single' + + conda "bioconda::multiqc=1.13" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : + 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" + + input: + path multiqc_files, stageAs: "?/*" + path(multiqc_config) + path(extra_multiqc_config) + path(multiqc_logo) + + output: + path "*multiqc_report.html", emit: report + path "*_data" , emit: data + path "*_plots" , optional:true, emit: plots + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def config = multiqc_config ? "--config $multiqc_config" : '' + def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' + """ + multiqc \\ + --force \\ + $args \\ + $config \\ + $extra_config \\ + . 
+ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) + END_VERSIONS + """ + + stub: + """ + touch multiqc_data + touch multiqc_plots + touch multiqc_report.html + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml new file mode 100644 index 00000000..ebc29b27 --- /dev/null +++ b/modules/nf-core/multiqc/meta.yml @@ -0,0 +1,55 @@ +name: MultiQC +description: Aggregate results from bioinformatics analyses across many samples into a single report +keywords: + - QC + - bioinformatics tools + - Beautiful stand-alone HTML report +tools: + - multiqc: + description: | + MultiQC searches a given directory for analysis logs and compiles a HTML report. + It's a general use tool, perfect for summarising the output from numerous bioinformatics tools. + homepage: https://multiqc.info/ + documentation: https://multiqc.info/docs/ + licence: ["GPL-3.0-or-later"] + +input: + - multiqc_files: + type: file + description: | + List of reports / files recognised by MultiQC, for example the html and zip output of FastQC + - multiqc_config: + type: file + description: Optional config yml for MultiQC + pattern: "*.{yml,yaml}" + - extra_multiqc_config: + type: file + description: Second optional config yml for MultiQC. Will override common sections in multiqc_config. + pattern: "*.{yml,yaml}" + - multiqc_logo: + type: file + description: Optional logo file for MultiQC + pattern: "*.{png}" + +output: + - report: + type: file + description: MultiQC report file + pattern: "multiqc_report.html" + - data: + type: dir + description: MultiQC data dir + pattern: "multiqc_data" + - plots: + type: file + description: Plots created by MultiQC + pattern: "*_data" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@abhi18av" + - "@bunop" + - "@drpatelh" + - "@jfy133" diff --git a/modules/nf-core/modules/picard/markduplicates/main.nf b/modules/nf-core/picard/markduplicates/main.nf similarity index 80% rename from modules/nf-core/modules/picard/markduplicates/main.nf rename to modules/nf-core/picard/markduplicates/main.nf index e754a587..be243a95 100644 --- a/modules/nf-core/modules/picard/markduplicates/main.nf +++ b/modules/nf-core/picard/markduplicates/main.nf @@ -2,13 +2,15 @@ process PICARD_MARKDUPLICATES { tag "$meta.id" label 'process_medium' - conda (params.enable_conda ? "bioconda::picard=2.26.10" : null) + conda "bioconda::picard=3.0.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/picard:2.26.10--hdfd78af_0' : - 'quay.io/biocontainers/picard:2.26.10--hdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/picard:3.0.0--hdfd78af_1' : + 'quay.io/biocontainers/picard:3.0.0--hdfd78af_1' }" input: tuple val(meta), path(bam) + path fasta + path fai output: tuple val(meta), path("*.bam") , emit: bam @@ -33,9 +35,10 @@ process PICARD_MARKDUPLICATES { -Xmx${avail_mem}g \\ MarkDuplicates \\ $args \\ - I=$bam \\ - O=${prefix}.bam \\ - M=${prefix}.MarkDuplicates.metrics.txt + --INPUT $bam \\ + --OUTPUT ${prefix}.bam \\ + --REFERENCE_SEQUENCE $fasta \\ + --METRICS_FILE ${prefix}.MarkDuplicates.metrics.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/modules/picard/markduplicates/meta.yml b/modules/nf-core/picard/markduplicates/meta.yml similarity index 85% rename from modules/nf-core/modules/picard/markduplicates/meta.yml rename to modules/nf-core/picard/markduplicates/meta.yml index 842817bc..3f2357bb 100644 --- a/modules/nf-core/modules/picard/markduplicates/meta.yml +++ b/modules/nf-core/picard/markduplicates/meta.yml @@ -24,7 +24,15 @@ input: - bam: type: file description: BAM file - pattern: "*.{bam}" + pattern: "*.{bam,cram,sam}" + - fasta: + type: file + description: Reference genome fasta file + pattern: "*.{fasta,fa}" + - fai: + type: file + description: Reference genome fasta index + pattern: "*.{fai}" output: - meta: type: map diff --git a/modules/nf-core/samtools/faidx/main.nf b/modules/nf-core/samtools/faidx/main.nf new file mode 100644 index 00000000..ce6580d2 --- /dev/null +++ b/modules/nf-core/samtools/faidx/main.nf @@ -0,0 +1,44 @@ +process SAMTOOLS_FAIDX { + tag "$fasta" + label 'process_single' + + conda "bioconda::samtools=1.16.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : + 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path ("*.fai"), emit: fai + tuple val(meta), path ("*.gzi"), emit: gzi, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools \\ + faidx \\ + $args \\ + $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + """ + touch ${fasta}.fai + cat <<-END_VERSIONS > versions.yml + + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/faidx/meta.yml b/modules/nf-core/samtools/faidx/meta.yml new file mode 100644 index 00000000..fe2fe9a1 --- /dev/null +++ b/modules/nf-core/samtools/faidx/meta.yml @@ -0,0 +1,47 @@ +name: samtools_faidx +description: Index FASTA file +keywords: + - index + - fasta +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. 
+ homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: FASTA file + pattern: "*.{fa,fasta}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fai: + type: file + description: FASTA index file + pattern: "*.{fai}" + - gzi: + type: file + description: Optional gzip index file for compressed inputs + pattern: "*.gzi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@phue" diff --git a/modules/nf-core/modules/samtools/flagstat/main.nf b/modules/nf-core/samtools/flagstat/main.nf similarity index 74% rename from modules/nf-core/modules/samtools/flagstat/main.nf rename to modules/nf-core/samtools/flagstat/main.nf index b87b2108..2120cd7d 100644 --- a/modules/nf-core/modules/samtools/flagstat/main.nf +++ b/modules/nf-core/samtools/flagstat/main.nf @@ -1,11 +1,11 @@ process SAMTOOLS_FLAGSTAT { tag "$meta.id" - label 'process_low' + label 'process_single' - conda (params.enable_conda ? "bioconda::samtools=1.15.1" : null) + conda "bioconda::samtools=1.16.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.15.1--h1170115_0' : - 'quay.io/biocontainers/samtools:1.15.1--h1170115_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : + 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" input: tuple val(meta), path(bam), path(bai) @@ -19,12 +19,13 @@ process SAMTOOLS_FLAGSTAT { script: def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" """ samtools \\ flagstat \\ - --threads ${task.cpus-1} \\ + --threads ${task.cpus} \\ $bam \\ - > ${bam}.flagstat + > ${prefix}.flagstat cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/modules/samtools/view/meta.yml b/modules/nf-core/samtools/flagstat/meta.yml similarity index 66% rename from modules/nf-core/modules/samtools/view/meta.yml rename to modules/nf-core/samtools/flagstat/meta.yml index a8b43ecc..95269063 100644 --- a/modules/nf-core/modules/samtools/view/meta.yml +++ b/modules/nf-core/samtools/flagstat/meta.yml @@ -1,7 +1,9 @@ -name: samtools_view -description: filter/convert SAM/BAM/CRAM file +name: samtools_flagstat +description: Counts the number of alignments in a BAM/CRAM/SAM file for each FLAG type keywords: - - view + - stats + - mapping + - counts - bam - sam - cram @@ -21,37 +23,27 @@ input: description: | Groovy Map containing sample information e.g. [ id:'test', single_end:false ] - - input: + - bam: type: file description: BAM/CRAM/SAM file pattern: "*.{bam,cram,sam}" - - index: - type: optional file - description: BAM.BAI/CRAM.CRAI file - pattern: "*.{.bai,.crai}" - - fasta: - type: optional file - description: Reference file the CRAM was created with - pattern: "*.{fasta,fa}" + - bai: + type: file + description: Index for BAM/CRAM/SAM file + pattern: "*.{bai,crai,sai}" output: - meta: type: map description: | Groovy Map containing sample information e.g. 
[ id:'test', single_end:false ] - - bam: - type: file - description: filtered/converted BAM/SAM file - pattern: "*.{bam,sam}" - - cram: + - flagstat: type: file - description: filtered/converted CRAM file - pattern: "*.cram" + description: File containing samtools flagstat output + pattern: "*.{flagstat}" - versions: type: file description: File containing software versions pattern: "versions.yml" authors: - "@drpatelh" - - "@joseespinosa" - - "@FriederikeHanssen" diff --git a/modules/nf-core/modules/samtools/idxstats/main.nf b/modules/nf-core/samtools/idxstats/main.nf similarity index 73% rename from modules/nf-core/modules/samtools/idxstats/main.nf rename to modules/nf-core/samtools/idxstats/main.nf index a49ff35f..a7b87d8b 100644 --- a/modules/nf-core/modules/samtools/idxstats/main.nf +++ b/modules/nf-core/samtools/idxstats/main.nf @@ -1,11 +1,11 @@ process SAMTOOLS_IDXSTATS { tag "$meta.id" - label 'process_low' + label 'process_single' - conda (params.enable_conda ? "bioconda::samtools=1.15.1" : null) + conda "bioconda::samtools=1.16.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.15.1--h1170115_0' : - 'quay.io/biocontainers/samtools:1.15.1--h1170115_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : + 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" input: tuple val(meta), path(bam), path(bai) @@ -19,11 +19,14 @@ process SAMTOOLS_IDXSTATS { script: def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ samtools \\ idxstats \\ + --threads ${task.cpus-1} \\ $bam \\ - > ${bam}.idxstats + > ${prefix}.idxstats cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/samtools/idxstats/meta.yml b/modules/nf-core/samtools/idxstats/meta.yml new file mode 100644 index 00000000..3710ab88 --- /dev/null +++ b/modules/nf-core/samtools/idxstats/meta.yml @@ -0,0 +1,50 @@ +name: samtools_idxstats +description: Reports alignment summary statistics for a BAM/CRAM/SAM file +keywords: + - stats + - mapping + - counts + - chromosome + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: hhttp://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - bai: + type: file + description: Index for BAM/CRAM/SAM file + pattern: "*.{bai,crai,sai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - idxstats: + type: file + description: File containing samtools idxstats output + pattern: "*.{idxstats}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" diff --git a/modules/nf-core/modules/samtools/index/main.nf b/modules/nf-core/samtools/index/main.nf similarity index 70% rename from modules/nf-core/modules/samtools/index/main.nf rename to modules/nf-core/samtools/index/main.nf index e41cdcc8..8b95687a 100644 --- a/modules/nf-core/modules/samtools/index/main.nf +++ b/modules/nf-core/samtools/index/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_INDEX { tag "$meta.id" label 'process_low' - conda (params.enable_conda ? "bioconda::samtools=1.15" : null) + conda "bioconda::samtools=1.16.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.15--h1170115_1' : - 'quay.io/biocontainers/samtools:1.15--h1170115_1' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : + 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" input: tuple val(meta), path(input) @@ -33,4 +33,16 @@ process SAMTOOLS_INDEX { samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') END_VERSIONS """ + + stub: + """ + touch ${input}.bai + touch ${input}.crai + touch ${input}.csi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ } diff --git a/modules/nf-core/modules/samtools/index/meta.yml b/modules/nf-core/samtools/index/meta.yml similarity index 100% rename from modules/nf-core/modules/samtools/index/meta.yml rename to modules/nf-core/samtools/index/meta.yml diff --git a/modules/nf-core/modules/samtools/sort/main.nf b/modules/nf-core/samtools/sort/main.nf similarity index 67% rename from modules/nf-core/modules/samtools/sort/main.nf rename to modules/nf-core/samtools/sort/main.nf index 0e2de8ba..84c167cd 100644 --- a/modules/nf-core/modules/samtools/sort/main.nf +++ b/modules/nf-core/samtools/sort/main.nf @@ -2,16 +2,17 @@ process SAMTOOLS_SORT { tag "$meta.id" label 'process_medium' - conda (params.enable_conda ? "bioconda::samtools=1.15" : null) + conda "bioconda::samtools=1.16.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/samtools:1.15--h1170115_1' : - 'quay.io/biocontainers/samtools:1.15--h1170115_1' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : + 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" input: tuple val(meta), path(bam) output: tuple val(meta), path("*.bam"), emit: bam + tuple val(meta), path("*.csi"), emit: csi, optional: true path "versions.yml" , emit: versions when: @@ -28,4 +29,15 @@ process SAMTOOLS_SORT { samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') END_VERSIONS """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ } diff --git a/modules/nf-core/modules/samtools/sort/meta.yml b/modules/nf-core/samtools/sort/meta.yml similarity index 92% rename from modules/nf-core/modules/samtools/sort/meta.yml rename to modules/nf-core/samtools/sort/meta.yml index a820c55a..09289751 100644 --- a/modules/nf-core/modules/samtools/sort/meta.yml +++ b/modules/nf-core/samtools/sort/meta.yml @@ -39,6 +39,10 @@ output: type: file description: File containing software versions pattern: "versions.yml" + - csi: + type: file + description: BAM index file (optional) + pattern: "*.csi" authors: - "@drpatelh" - "@ewels" diff --git a/modules/nf-core/modules/samtools/stats/main.nf b/modules/nf-core/samtools/stats/main.nf similarity index 79% rename from modules/nf-core/modules/samtools/stats/main.nf rename to modules/nf-core/samtools/stats/main.nf index bbdc3240..0a2a3640 100644 --- a/modules/nf-core/modules/samtools/stats/main.nf +++ b/modules/nf-core/samtools/stats/main.nf @@ -1,11 +1,11 @@ process SAMTOOLS_STATS { tag "$meta.id" - label 'process_low' + label 'process_single' - conda (params.enable_conda ? "bioconda::samtools=1.15.1" : null) + conda "bioconda::samtools=1.16.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.15.1--h1170115_0' : - 'quay.io/biocontainers/samtools:1.15.1--h1170115_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : + 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" input: tuple val(meta), path(input), path(input_index) @@ -20,14 +20,15 @@ process SAMTOOLS_STATS { script: def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" def reference = fasta ? 
"--reference ${fasta}" : "" """ samtools \\ stats \\ - --threads ${task.cpus-1} \\ + --threads ${task.cpus} \\ ${reference} \\ ${input} \\ - > ${input}.stats + > ${prefix}.stats cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -38,7 +39,7 @@ process SAMTOOLS_STATS { stub: def prefix = task.ext.prefix ?: "${meta.id}" """ - touch ${input}.stats + touch ${prefix}.stats cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/modules/samtools/stats/meta.yml b/modules/nf-core/samtools/stats/meta.yml similarity index 100% rename from modules/nf-core/modules/samtools/stats/meta.yml rename to modules/nf-core/samtools/stats/meta.yml diff --git a/modules/nf-core/samtools/view/main.nf b/modules/nf-core/samtools/view/main.nf new file mode 100644 index 00000000..729c85e5 --- /dev/null +++ b/modules/nf-core/samtools/view/main.nf @@ -0,0 +1,66 @@ +process SAMTOOLS_VIEW { + tag "$meta.id" + label 'process_low' + + conda "bioconda::samtools=1.16.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : + 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" + + input: + tuple val(meta), path(input), path(index) + path fasta + path qname + + output: + tuple val(meta), path("*.bam"), emit: bam, optional: true + tuple val(meta), path("*.cram"), emit: cram, optional: true + tuple val(meta), path("*.sam"), emit: sam, optional: true + tuple val(meta), path("*.bai"), emit: bai, optional: true + tuple val(meta), path("*.csi"), emit: csi, optional: true + tuple val(meta), path("*.crai"), emit: crai, optional: true + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--reference ${fasta}" : "" + def readnames = qname ? "--qname-file ${qname}": "" + def file_type = args.contains("--output-fmt sam") ? "sam" : + args.contains("--output-fmt bam") ? "bam" : + args.contains("--output-fmt cram") ? "cram" : + input.getExtension() + if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + samtools \\ + view \\ + --threads ${task.cpus-1} \\ + ${reference} \\ + ${readnames} \\ + $args \\ + -o ${prefix}.${file_type} \\ + $input \\ + $args2 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + touch ${prefix}.cram + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/view/meta.yml b/modules/nf-core/samtools/view/meta.yml new file mode 100644 index 00000000..a52e4f8d --- /dev/null +++ b/modules/nf-core/samtools/view/meta.yml @@ -0,0 +1,79 @@ +name: samtools_view +description: filter/convert SAM/BAM/CRAM file +keywords: + - view + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. 
+ homepage: http://www.htslib.org/ + documentation: hhttp://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - index: + type: optional file + description: BAM.BAI/CRAM.CRAI file + pattern: "*.{.bai,.crai}" + - fasta: + type: optional file + description: Reference file the CRAM was created with + pattern: "*.{fasta,fa}" + - qname: + type: file + description: Optional file with read names to output only select alignments + pattern: "*.{txt,list}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: optional filtered/converted BAM file + pattern: "*.{bam}" + - cram: + type: file + description: optional filtered/converted CRAM file + pattern: "*.{cram}" + - sam: + type: file + description: optional filtered/converted SAM file + pattern: "*.{sam}" + # bai, csi, and crai are created with `--write-index` + - bai: + type: file + description: optional BAM file index + pattern: "*.{bai}" + - csi: + type: file + description: optional tabix BAM file index + pattern: "*.{csi}" + - crai: + type: file + description: optional CRAM file index + pattern: "*.{crai}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@joseespinosa" + - "@FriederikeHanssen" + - "@priyanka-surana" diff --git a/modules/nf-core/modules/trimgalore/main.nf b/modules/nf-core/trimgalore/main.nf similarity index 64% rename from modules/nf-core/modules/trimgalore/main.nf rename to modules/nf-core/trimgalore/main.nf index 3a3fca90..37e88f58 100644 --- a/modules/nf-core/modules/trimgalore/main.nf +++ b/modules/nf-core/trimgalore/main.nf @@ -2,7 +2,7 @@ process TRIMGALORE { tag "$meta.id" label 'process_high' - conda (params.enable_conda ? 'bioconda::trim-galore=0.6.7' : null) + conda "bioconda::trim-galore=0.6.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/trim-galore:0.6.7--hdfd78af_0' : 'quay.io/biocontainers/trim-galore:0.6.7--hdfd78af_0' }" @@ -11,13 +11,12 @@ process TRIMGALORE { tuple val(meta), path(reads) output: - tuple val(meta), path("*{trimmed,val}*.fq.gz"), emit: reads - tuple val(meta), path("*report.txt") , emit: log - path "versions.yml" , emit: versions - - tuple val(meta), path("*unpaired*.fq.gz") , emit: unpaired, optional: true - tuple val(meta), path("*.html") , emit: html , optional: true - tuple val(meta), path("*.zip") , emit: zip , optional: true + tuple val(meta), path("*{3prime,5prime,trimmed,val}*.fq.gz"), emit: reads + tuple val(meta), path("*report.txt") , emit: log , optional: true + tuple val(meta), path("*unpaired*.fq.gz") , emit: unpaired, optional: true + tuple val(meta), path("*.html") , emit: html , optional: true + tuple val(meta), path("*.zip") , emit: zip , optional: true + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -32,15 +31,9 @@ process TRIMGALORE { cores = (task.cpus as int) - 4 if (meta.single_end) cores = (task.cpus as int) - 3 if (cores < 1) cores = 1 - if (cores > 4) cores = 4 + if (cores > 8) cores = 8 } - // Clipping presets have to be evaluated in the context of SE/PE - def c_r1 = params.clip_r1 > 0 ? "--clip_r1 ${params.clip_r1}" : '' - def c_r2 = params.clip_r2 > 0 ? "--clip_r2 ${params.clip_r2}" : '' - def tpc_r1 = params.three_prime_clip_r1 > 0 ? "--three_prime_clip_r1 ${params.three_prime_clip_r1}" : '' - def tpc_r2 = params.three_prime_clip_r2 > 0 ? "--three_prime_clip_r2 ${params.three_prime_clip_r2}" : '' - // Added soft-links to original fastqs for consistent naming in MultiQC def prefix = task.ext.prefix ?: "${meta.id}" if (meta.single_end) { @@ -50,8 +43,6 @@ process TRIMGALORE { $args \\ --cores $cores \\ --gzip \\ - $c_r1 \\ - $tpc_r1 \\ ${prefix}.fastq.gz cat <<-END_VERSIONS > versions.yml @@ -69,10 +60,6 @@ process TRIMGALORE { --cores $cores \\ --paired \\ --gzip \\ - $c_r1 \\ - $c_r2 \\ - $tpc_r1 \\ - $tpc_r2 \\ ${prefix}_1.fastq.gz \\ ${prefix}_2.fastq.gz diff --git a/modules/nf-core/modules/trimgalore/meta.yml b/modules/nf-core/trimgalore/meta.yml similarity index 97% rename from modules/nf-core/modules/trimgalore/meta.yml rename to modules/nf-core/trimgalore/meta.yml index 439f566d..f84c4d77 100644 --- a/modules/nf-core/modules/trimgalore/meta.yml +++ b/modules/nf-core/trimgalore/meta.yml @@ -36,7 +36,7 @@ output: description: | List of input adapter trimmed FastQ files of size 1 and 2 for single-end and paired-end data, respectively. 
- pattern: "*.{fq.gz}" + pattern: "*{3prime,5prime,trimmed,val}*.fq.gz" - unpaired: type: file description: | diff --git a/nextflow.config b/nextflow.config index d9fceba4..87899c5a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -9,13 +9,9 @@ // Global default params, used in configs params { // Input options - input = null - input_format = "FASTQ" - bam_sorted = false - bwa_index = null + input = null + input_format = "FASTQ" - // concatenate fastq options - save_merged_fastq = false // References genome = null @@ -23,9 +19,11 @@ params { igenomes_ignore = false // BWA Reference + bwa_index = null save_reference = false // BAM Sorted + bam_sorted = false save_sorted_bam = false // Circular DNA identification options @@ -43,22 +41,31 @@ params { save_trimmed = false skip_trimming = false - // Picard markduplicates + // Picard Markduplicates options skip_markduplicates = false - keep_duplicates = false - save_markduplicates_bam = false + keep_duplicates = true + save_markduplicates_bam = true - // AmpliconArchitect + // AmpliconArchitect options + cnvkit_cnn = null aa_data_repo = null aa_cngain = '4.5' mosek_license_dir = null reference_build = null + // Save Intermediate Files + save_merged_fastq = false + save_circle_map_intermediate = false + save_circle_finder_intermediate = false + save_unicycler_intermediate = false + // MultiQC options - skip_multiqc = false - multiqc_config = null - multiqc_title = null - max_multiqc_email_size = '25.MB' + skip_multiqc = false + multiqc_config = null + multiqc_title = null + multiqc_logo = null + max_multiqc_email_size = '25.MB' + multiqc_methods_description = null // Boilerplate options outdir = null @@ -68,11 +75,12 @@ params { email_on_fail = null plaintext_email = false monochrome_logs = false + hook_url = null help = false + version = false validate_params = true show_hidden_params = false schema_ignore_params = 'genomes' - enable_conda = false // Config options custom_config_version = 'master' @@ -82,11 +90,12 @@ params { config_profile_url = null config_profile_name = null + // Max resource options // Defaults only, expecting to be overwritten - max_memory = '128.GB' - max_cpus = 16 - max_time = '240.h' + max_memory = '128.GB' + max_cpus = 16 + max_time = '240.h' } // Load base.config by default for all pipelines @@ -111,7 +120,16 @@ try { profiles { debug { process.beforeScript = 'echo $HOSTNAME' } conda { - params.enable_conda = true + conda.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + mamba { + conda.enabled = true + conda.useMamba = true docker.enabled = false singularity.enabled = false podman.enabled = false @@ -126,6 +144,9 @@ profiles { shifter.enabled = false charliecloud.enabled = false } + arm { + docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' + } singularity { singularity.enabled = true singularity.autoMounts = true @@ -155,16 +176,17 @@ profiles { podman.enabled = false shifter.enabled = false } - long { - params.config_profile_name = 'Long Run Time' - params.config_profile_description = 'This is a pipeline run with a long maximum time' - params.max_time = '20000.h' + gitpod { + executor.name = 'local' + executor.cpus = 16 + executor.memory = 60.GB } test { includeConfig 'conf/test.config' } test_AA { includeConfig 'conf/test_AA.config' } test_full { includeConfig 'conf/test_full.config' } } + // Load igenomes.config if required if (!params.igenomes_ignore) { includeConfig 
'conf/igenomes.config' @@ -172,6 +194,7 @@ if (!params.igenomes_ignore) { params.genomes = [:] } + // Export these variables to prevent local Python/R libraries from conflicting with those in the container // The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. // See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable. @@ -206,12 +229,13 @@ dag { manifest { name = 'nf-core/circdna' - author = 'Daniel Schreyer' + author = """Daniel Schreyer""" homePage = 'https://github.com/nf-core/circdna' - description = 'Pipeline for the identification of circular DNAs' + description = """Pipeline for the identification of circular DNAs""" mainScript = 'main.nf' - nextflowVersion = '!>=21.10.3' - version = '1.0.1' + nextflowVersion = '!>=22.10.1' + version = '1.0.2' + doi = '' } // Load modules.config for DSL2 module specific options diff --git a/nextflow_schema.json b/nextflow_schema.json index 1c9e678d..8aa2e8d3 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -82,15 +82,6 @@ "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.", "fa_icon": "far fa-file-code" }, - "bwa_index": { - "type": "string", - "format": "file-path", - "mimetype": "text/plain", - "pattern": "^\\S+\\.\\{alt,amb,ann,bwt,pac,sa\\}$", - "description": "Path to BWA Index genome file.", - "help_text": "This parameter is *optional*. If you don't have a BWA index available this will be generated for you automatically.", - "fa_icon": "far fa-file-code" - }, "igenomes_base": { "type": "string", "format": "directory-path", @@ -112,6 +103,15 @@ "description": "Save the index reference fasta in the results directory.", "help_text": "By default, indexed reference genome files will not be saved to the results directory. Specify this flag (or set to true in your config file) to copy these files to the results directory when complete.", "fa_icon": "fas fa-save" + }, + "bwa_index": { + "type": "string", + "format": "file-path", + "mimetype": "text/plain", + "pattern": "^\\S+\\.\\{amb,ann,bwt,pac,sa\\}$", + "description": "Path to BWA Index genome file.", + "help_text": "This parameter is *optional*. If you don't have a BWA index available this will be generated for you automatically.", + "fa_icon": "far fa-file-code" } } }, @@ -144,16 +144,16 @@ }, "keep_duplicates": { "type": "boolean", - "default": false, + "default": true, "fa_icon": "fas fa-save", "help_text": "Set this parameter to skip filtering of duplicates marked by Picard Markduplicates.", "description": "Keep read duplications marked by picard MarkDuplicates." }, "save_markduplicates_bam": { "type": "boolean", - "default": false, + "default": true, "fa_icon": "fas fa-save", - "help_text": "Set this parameter to save bam file with marked duplicate reads..", + "help_text": "Set this parameter to save bam file with marked duplicate reads.", "description": "Store bam with marked duplicate reads." 
}
}
@@ -228,11 +228,57 @@
}
}
},
+ "circle_map_options": {
+ "title": "circle-map options",
+ "type": "object",
+ "fa_icon": "fas fa-circle-notch",
+ "description": "Parameters used to run Circle-Map.",
+ "properties": {
+ "save_circle_map_intermediate": {
+ "type": "boolean",
+ "default": false,
+ "fa_icon": "fas fa-save",
+ "help_text": "Set this parameter to save bam file with reads extracted by circle-map readextractor.",
+ "description": "Store bam file with read candidates for circle-map circular dna calling."
+ }
+ }
+ },
+ "circle_finder_options": {
+ "title": "Circle_finder options",
+ "type": "object",
+ "fa_icon": "fas fa-circle-notch",
+ "description": "Parameters used to run Circle_finder.",
+ "properties": {
+ "save_circle_finder_intermediate": {
+ "type": "boolean",
+ "default": false,
+ "fa_icon": "fas fa-save",
+ "help_text": "Set this parameter to save Circle_finder intermediate files.",
+ "description": "Store bed files created during Circle_finder run."
+ }
+ }
+ },
+ "unicycler_options": {
+ "title": "Unicycler options",
+ "type": "object",
+ "fa_icon": "fas fa-circle-notch",
+ "description": "Parameters used to run Unicycler.",
+ "properties": {
+ "save_unicycler_intermediate": {
+ "type": "boolean",
+ "default": false,
+ "fa_icon": "fas fa-save",
+ "help_text": "Set this parameter to save Unicycler intermediate files.",
+ "description": "Store fastq intermediate files created during Unicycler run."
+ }
+ }
+ },
+
+ "amplicon_architect_options": {
- "title": "AmpliconArchitect options",
+ "title": "ampliconarchitect options",
"type": "object",
"fa_icon": "fas fa-circle-notch",
- "description": "Parameters used to run AmpliconArchitect. The software needs additional data files not included in ",
+ "description": "parameters used to run ampliconarchitect. The software needs additional data files not included in ",
"properties": {
"aa_data_repo": {
"type": "string",
@@ -262,9 +308,16 @@
"type": "string",
"mimetype": "text/plain",
"fa_icon": "fas fa-book",
- "description": "When running AmpliconArchitect, specify reference build ['GRCh37', 'GRCh38]. This is *mandatory* to match fasta and AA reference build!",
+ "description": "When running AmpliconArchitect, specify reference build ['GRCh37', 'GRCh38', 'mm10']. This is *mandatory* to match fasta and AA reference build!",
"help_text": "Specify the reference genome build used for alignment of the WGS reads.",
"default": "GRCh38"
+ },
+ "cnvkit_cnn": {
+ "type": "string",
+ "mimetype": "text/plain",
+ "fa_icon": "fas fa-file-code",
+ "description": "Path to cnn file inside the AmpliconArchitect Data Repository of the respective reference genome. By default it uses the 'aa_data_repo' and the 'reference_build' input to construct the file path.",
+ "help_text": "Specify path to cnvkit cnn file inside AmpliconArchitect Data Repository."
+ }
}
},
@@ -364,6 +417,12 @@
"fa_icon": "fas fa-question-circle",
"hidden": true
},
+ "version": {
+ "type": "boolean",
+ "description": "Display version and exit.",
+ "fa_icon": "fas fa-question-circle",
+ "hidden": true
+ },
"publish_dir_mode": {
"type": "string",
"default": "copy",
@@ -401,12 +460,30 @@
"fa_icon": "fas fa-palette",
"hidden": true
},
+ "hook_url": {
+ "type": "string",
+ "description": "Incoming hook URL for messaging service",
+ "fa_icon": "fas fa-people-group",
+ "help_text": "Incoming hook URL for messaging service. 
Currently, MS Teams and Slack are supported.", + "hidden": true + }, "multiqc_config": { "type": "string", "description": "Custom config file to supply to MultiQC.", "fa_icon": "fas fa-cog", "hidden": true }, + "multiqc_logo": { + "type": "string", + "description": "Custom logo file to supply to MultiQC. File name must also be set in the MultiQC config file", + "fa_icon": "fas fa-image", + "hidden": true + }, + "multiqc_methods_description": { + "type": "string", + "description": "Custom MultiQC yaml file containing HTML including a methods description.", + "fa_icon": "fas fa-cog" + }, "tracedir": { "type": "string", "description": "Directory to keep pipeline Nextflow logs and reports.", @@ -427,12 +504,6 @@ "description": "Show all params when using `--help`", "hidden": true, "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." - }, - "enable_conda": { - "type": "boolean", - "description": "Run this workflow with Conda. You can also use '-profile conda' instead of providing this parameter.", - "hidden": true, - "fa_icon": "fas fa-bacon" } } } @@ -444,6 +515,9 @@ { "$ref": "#/definitions/max_job_request_options" }, { "$ref": "#/definitions/process_skipping_options" }, { "$ref": "#/definitions/circdna_identifier_options" }, + { "$ref": "#/definitions/circle_map_options" }, + { "$ref": "#/definitions/circle_finder_options" }, + { "$ref": "#/definitions/unicycler_options" }, { "$ref": "#/definitions/amplicon_architect_options" }, { "$ref": "#/definitions/read_trimming_options" }, { "$ref": "#/definitions/generic_options" } diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..0d62beb6 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,10 @@ +# Config file for Python. Mostly used to configure linting of bin/check_samplesheet.py with Black. +# Should be kept the same as nf-core/tools to avoid fighting with template synchronisation. 
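+# Illustrative local usage (assumed invocation, not part of the upstream template):
+# running `black bin/check_samplesheet.py` and `isort bin/check_samplesheet.py` from the
+# repository root picks up the settings below.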
+[tool.black] +line-length = 120 +target_version = ["py37", "py38", "py39", "py310"] + +[tool.isort] +profile = "black" +known_first_party = ["nf_core"] +multi_line_output = 3 diff --git a/subworkflows/nf-core/mark_duplicates_picard.nf b/subworkflows/local/bam_markduplicates_picard/main.nf similarity index 73% rename from subworkflows/nf-core/mark_duplicates_picard.nf rename to subworkflows/local/bam_markduplicates_picard/main.nf index dac91223..9cb24cdc 100644 --- a/subworkflows/nf-core/mark_duplicates_picard.nf +++ b/subworkflows/local/bam_markduplicates_picard/main.nf @@ -2,27 +2,24 @@ // Picard MarkDuplicates, index BAM file and run samtools stats, flagstat and idxstats // -include { PICARD_MARKDUPLICATES } from '../../modules/nf-core/modules/picard/markduplicates/main' -include { SAMTOOLS_INDEX } from '../../modules/nf-core/modules/samtools/index/main' -include { BAM_STATS_SAMTOOLS } from './bam_stats_samtools' +include { PICARD_MARKDUPLICATES } from '../../../modules/nf-core/picard/markduplicates/main' +include { SAMTOOLS_INDEX } from '../../../modules/nf-core/samtools/index/main' +include { BAM_STATS_SAMTOOLS } from '../bam_stats_samtools/main' + +workflow BAM_MARKDUPLICATES_PICARD { -workflow MARK_DUPLICATES_PICARD { take: - bam // channel: [ val(meta), [ bam ] ] + ch_bam // channel: [ val(meta), [ bam ] ] + ch_fasta // channel: [ fasta ] + ch_fai // channel: [ fai ] main: ch_versions = Channel.empty() - // - // Picard MarkDuplicates - // - PICARD_MARKDUPLICATES ( bam ) + PICARD_MARKDUPLICATES ( ch_bam, ch_fasta, ch_fai ) ch_versions = ch_versions.mix(PICARD_MARKDUPLICATES.out.versions.first()) - // - // Index BAM file and run samtools stats, flagstat and idxstats - // SAMTOOLS_INDEX ( PICARD_MARKDUPLICATES.out.bam ) ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions.first()) @@ -39,15 +36,15 @@ workflow MARK_DUPLICATES_PICARD { } .set { ch_bam_bai } - BAM_STATS_SAMTOOLS ( ch_bam_bai ) + BAM_STATS_SAMTOOLS ( ch_bam_bai, ch_fasta ) ch_versions = ch_versions.mix(BAM_STATS_SAMTOOLS.out.versions) emit: bam = PICARD_MARKDUPLICATES.out.bam // channel: [ val(meta), [ bam ] ] - metrics = PICARD_MARKDUPLICATES.out.metrics // channel: [ val(meta), [ metrics ] ] - + metrics = PICARD_MARKDUPLICATES.out.metrics // channel: [ val(meta), [ bam ] ] bai = SAMTOOLS_INDEX.out.bai // channel: [ val(meta), [ bai ] ] csi = SAMTOOLS_INDEX.out.csi // channel: [ val(meta), [ csi ] ] + stats = BAM_STATS_SAMTOOLS.out.stats // channel: [ val(meta), [ stats ] ] flagstat = BAM_STATS_SAMTOOLS.out.flagstat // channel: [ val(meta), [ flagstat ] ] idxstats = BAM_STATS_SAMTOOLS.out.idxstats // channel: [ val(meta), [ idxstats ] ] diff --git a/subworkflows/local/bam_markduplicates_picard/meta.yml b/subworkflows/local/bam_markduplicates_picard/meta.yml new file mode 100644 index 00000000..322a169b --- /dev/null +++ b/subworkflows/local/bam_markduplicates_picard/meta.yml @@ -0,0 +1,73 @@ +name: "bam_markduplicates_picard" +description: Picard MarkDuplicates, index BAM file and run samtools stats, flagstat and idxstats +keywords: + - markduplicates + - bam + - sam + - cram + +modules: + - picard/markduplicates + - samtools/index + - samtools/stats + - samtools/idxstats + - samtools/flagstat + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test' ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - fasta: + type: file + description: Reference genome fasta file + pattern: "*.{fasta,fa}" + - fai: + type: file + description: Reference genome fasta index file + pattern: "*.{fai}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - bam: + type: file + description: BAM file with duplicate reads marked/removed + pattern: "*.{bam}" + - bai: + type: file + description: BAM/CRAM/SAM samtools index + pattern: "*.{bai,crai,sai}" + - csi: + type: file + description: CSI samtools index + pattern: "*.csi" + - stats: + type: file + description: File containing samtools stats output + - flagstat: + type: file + description: File containing samtools flagstat output + - idxstats: + type: file + description: File containing samtools idxstats output + pattern: "*.{idxstats}" + - metrics: + type: file + description: Duplicate metrics file generated by picard + pattern: "*.{metrics.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@dmarron" + - "@drpatelh" diff --git a/subworkflows/nf-core/bam_stats_samtools.nf b/subworkflows/local/bam_stats_samtools/main.nf similarity index 53% rename from subworkflows/nf-core/bam_stats_samtools.nf rename to subworkflows/local/bam_stats_samtools/main.nf index 3d505a4d..8b893cbd 100644 --- a/subworkflows/nf-core/bam_stats_samtools.nf +++ b/subworkflows/local/bam_stats_samtools/main.nf @@ -2,25 +2,26 @@ // Run SAMtools stats, flagstat and idxstats // -include { SAMTOOLS_STATS } from '../../modules/nf-core/modules/samtools/stats/main' -include { SAMTOOLS_IDXSTATS } from '../../modules/nf-core/modules/samtools/idxstats/main' -include { SAMTOOLS_FLAGSTAT } from '../../modules/nf-core/modules/samtools/flagstat/main' +include { SAMTOOLS_STATS } from '../../../modules/nf-core/samtools/stats/main' +include { SAMTOOLS_IDXSTATS } from '../../../modules/local/samtools/idxstats/main' +include { SAMTOOLS_FLAGSTAT } from '../../../modules/local/samtools/flagstat/main' workflow BAM_STATS_SAMTOOLS { take: - ch_bam_bai // channel: [ val(meta), [ bam ], [bai/csi] ] + bam_bai // channel: [ val(meta), [ bam/cram ], [bai/csi] ] + fasta // channel: [ fasta ] main: ch_versions = Channel.empty() - SAMTOOLS_STATS ( ch_bam_bai, [] ) - ch_versions = ch_versions.mix(SAMTOOLS_STATS.out.versions.first()) + SAMTOOLS_STATS ( bam_bai, fasta ) + ch_versions = ch_versions.mix(SAMTOOLS_STATS.out.versions) - SAMTOOLS_FLAGSTAT ( ch_bam_bai ) - ch_versions = ch_versions.mix(SAMTOOLS_FLAGSTAT.out.versions.first()) + SAMTOOLS_FLAGSTAT ( bam_bai ) + ch_versions = ch_versions.mix(SAMTOOLS_FLAGSTAT.out.versions) - SAMTOOLS_IDXSTATS ( ch_bam_bai ) - ch_versions = ch_versions.mix(SAMTOOLS_IDXSTATS.out.versions.first()) + SAMTOOLS_IDXSTATS ( bam_bai ) + ch_versions = ch_versions.mix(SAMTOOLS_IDXSTATS.out.versions) emit: stats = SAMTOOLS_STATS.out.stats // channel: [ val(meta), [ stats ] ] diff --git a/subworkflows/local/bam_stats_samtools/meta.yml b/subworkflows/local/bam_stats_samtools/meta.yml new file mode 100644 index 00000000..5252b0e4 --- /dev/null +++ b/subworkflows/local/bam_stats_samtools/meta.yml @@ -0,0 +1,54 @@ +name: bam_stats_samtools +description: Produces comprehensive statistics from SAM/BAM/CRAM file +keywords: + - statistics + - counts + - bam + - sam + - cram +modules: + - samtools/stats + - samtools/idxstats + - samtools/flagstat +input: + - meta: + type: map + 
description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - bai: + type: file + description: Index for BAM/CRAM/SAM file + pattern: "*.{bai,crai,sai}" + - fasta: + type: file + description: Reference genome fasta file + pattern: "*.{fasta,fa}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - stats: + type: file + description: File containing samtools stats output + pattern: "*.{stats}" + - flagstat: + type: file + description: File containing samtools flagstat output + pattern: "*.{flagstat}" + - idxstats: + type: file + description: File containing samtools idxstats output + pattern: "*.{idxstats}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" diff --git a/subworkflows/local/circdna_assembly.nf b/subworkflows/local/circdna_assembly.nf deleted file mode 100644 index 0781bafc..00000000 --- a/subworkflows/local/circdna_assembly.nf +++ /dev/null @@ -1,9 +0,0 @@ - -workflow CIRCDNA_ASSEMBLY { - take: - reads - fasta - - main: - ch_trimmed_reads.map{ meta, file -> [meta, file, []] }.set{ch_unicycler_input} -} diff --git a/subworkflows/nf-core/bam_stats_samtools/main.nf b/subworkflows/nf-core/bam_stats_samtools/main.nf new file mode 100644 index 00000000..cfcc48dd --- /dev/null +++ b/subworkflows/nf-core/bam_stats_samtools/main.nf @@ -0,0 +1,32 @@ +// +// Run SAMtools stats, flagstat and idxstats +// + +include { SAMTOOLS_STATS } from '../../../modules/nf-core/samtools/stats/main' +include { SAMTOOLS_IDXSTATS } from '../../../modules/nf-core/samtools/idxstats/main' +include { SAMTOOLS_FLAGSTAT } from '../../../modules/nf-core/samtools/flagstat/main' + +workflow BAM_STATS_SAMTOOLS { + take: + bam_bai // channel: [ val(meta), [ bam/cram ], [bai/csi] ] + fasta // channel: [ fasta ] + + main: + ch_versions = Channel.empty() + + SAMTOOLS_STATS ( bam_bai, fasta ) + ch_versions = ch_versions.mix(SAMTOOLS_STATS.out.versions) + + SAMTOOLS_FLAGSTAT ( bam_bai ) + ch_versions = ch_versions.mix(SAMTOOLS_FLAGSTAT.out.versions) + + SAMTOOLS_IDXSTATS ( bam_bai ) + ch_versions = ch_versions.mix(SAMTOOLS_IDXSTATS.out.versions) + + emit: + stats = SAMTOOLS_STATS.out.stats // channel: [ val(meta), [ stats ] ] + flagstat = SAMTOOLS_FLAGSTAT.out.flagstat // channel: [ val(meta), [ flagstat ] ] + idxstats = SAMTOOLS_IDXSTATS.out.idxstats // channel: [ val(meta), [ idxstats ] ] + + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/nf-core/bam_stats_samtools/meta.yml b/subworkflows/nf-core/bam_stats_samtools/meta.yml new file mode 100644 index 00000000..5252b0e4 --- /dev/null +++ b/subworkflows/nf-core/bam_stats_samtools/meta.yml @@ -0,0 +1,54 @@ +name: bam_stats_samtools +description: Produces comprehensive statistics from SAM/BAM/CRAM file +keywords: + - statistics + - counts + - bam + - sam + - cram +modules: + - samtools/stats + - samtools/idxstats + - samtools/flagstat +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ]
+ - bam:
+ type: file
+ description: BAM/CRAM/SAM file
+ pattern: "*.{bam,cram,sam}"
+ - bai:
+ type: file
+ description: Index for BAM/CRAM/SAM file
+ pattern: "*.{bai,crai,sai}"
+ - fasta:
+ type: file
+ description: Reference genome fasta file
+ pattern: "*.{fasta,fa}"
+output:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - stats:
+ type: file
+ description: File containing samtools stats output
+ pattern: "*.{stats}"
+ - flagstat:
+ type: file
+ description: File containing samtools flagstat output
+ pattern: "*.{flagstat}"
+ - idxstats:
+ type: file
+ description: File containing samtools idxstats output
+ pattern: "*.{idxstats}"
+ - versions:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+authors:
+ - "@drpatelh"
diff --git a/workflows/circdna.nf b/workflows/circdna.nf
index 9ed49dfe..0d55313d 100644
--- a/workflows/circdna.nf
+++ b/workflows/circdna.nf
@@ -15,8 +15,15 @@ def checkPathParamList = [ params.input, params.multiqc_config, params.fasta ]
for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
// Check mandatory parameters
-if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
-if (params.fasta) { ch_fasta = file(params.fasta) } else { exit 1, 'Fasta reference genome not specified!' }
+if (params.input) { ch_input = Channel.fromPath(params.input) } else { exit 1, 'Input samplesheet not specified!' }
+if (params.fasta) { ch_fasta = Channel.fromPath(params.fasta) } else { exit 1, 'Fasta reference genome not specified!' }
+
+if (!(params.input_format == "FASTQ" | params.input_format == "BAM")) {
+ exit 1, 'Please specify --input_format "FASTQ" or "BAM" in capital letters, depending on the input file format.'
+}
+
+// Modify fasta channel to include meta data
+ch_fasta_meta = ch_fasta.map{ it -> [[id:it[0].baseName], it] }
branch = params.circle_identifier.split(",")
run_circexplorer2 = ("circexplorer2" in branch)
@@ -30,6 +37,10 @@ if (!(run_unicycler | run_circle_map_realign | run_circle_map_repeats | run_circ
exit 1, 'circle_identifier param not valid. Please check!'
}
+if (run_unicycler && !params.input_format == "FASTQ") {
+ exit 1, 'Unicycler needs FastQ input. Please specify input_format == "FASTQ", if possible, or don\'t run unicycler.'
+}
+
if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
// Check if BWA Index is given
@@ -48,10 +59,15 @@ if (run_ampliconarchitect) {
exit 1, "Mosek License Directory is missing! Please specifiy directory containing mosek license using --mosek_license_dir and rename license to 'mosek.lic'."
}
if (!params.aa_data_repo) { exit 1, "AmpliconArchitect Data Repository Missing! Please see https://github.com/jluebeck/AmpliconArchitect for more information and specify --aa_data_repo." }
- if (params.reference_build != "hg19" & params.reference_build != "GRCh38" & params.reference_build != "GRCh37"){
- exit 1, "Reference Build not given! Please specify --reference_build 'hg19', 'GRCh38', or 'GRCh37'."
+ if (params.reference_build != "hg19" & params.reference_build != "GRCh38" & params.reference_build != "GRCh37" & params.reference_build != "mm10"){
+ exit 1, "Reference Build not given! Please specify --reference_build 'mm10', 'hg19', 'GRCh38', or 'GRCh37'."
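+ // NOTE: the value of reference_build is also used below to locate files inside aa_data_repo,
+ // e.g. <aa_data_repo>/<reference_build>/<reference_build>_cnvkit_filtered_ref.cnn (the cnvkit_cnn default).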
+ } + + if (!params.cnvkit_cnn) { + ch_cnvkit_reference = file(params.aa_data_repo + "/" + params.reference_build + "/" + params.reference_build + "_cnvkit_filtered_ref.cnn", checkIfExists: true) + } else { + ch_cnvkit_reference = file(params.cnvkit_cnn) } - ch_cnvkit_reference = Channel.fromPath(params.aa_data_repo + "/" + params.reference_build + "/" + params.reference_build + "_cnvkit_filtered_ref.cnn") } @@ -61,8 +77,10 @@ if (run_ampliconarchitect) { ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -ch_multiqc_config = file("$projectDir/assets/multiqc_config.yaml", checkIfExists: true) -ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config) : Channel.empty() +ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) +ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty() +ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() +ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -81,52 +99,53 @@ include { INPUT_CHECK } from '../subworkflows/local/input_check' ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' // CONCATENATE FASTQ -include { CAT_FASTQ } from '../modules/nf-core/modules/cat/fastq/main' +include { CAT_FASTQ } from '../modules/nf-core/cat/fastq/main' // QUALITY CONTROL -include { FASTQC } from '../modules/nf-core/modules/fastqc/main' +include { FASTQC } from '../modules/nf-core/fastqc/main' // TRIMMING -include { TRIMGALORE } from '../modules/nf-core/modules/trimgalore/main' +include { TRIMGALORE } from '../modules/nf-core/trimgalore/main' // Genome Preparation -include { BWA_INDEX } from '../modules/nf-core/modules/bwa/index/main' +include { BWA_INDEX } from '../modules/nf-core/bwa/index/main' // Alignment -include { BWA_MEM } from '../modules/nf-core/modules/bwa/mem/main' -include { SAMTOOLS_SORT as SAMTOOLS_SORT_BAM } from '../modules/nf-core/modules/samtools/sort/main' -include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_BAM } from '../modules/nf-core/modules/samtools/index/main' +include { BWA_MEM } from '../modules/local/bwa/mem/main' +include { SAMTOOLS_SORT as SAMTOOLS_SORT_BAM } from '../modules/nf-core/samtools/sort/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_BAM } from '../modules/nf-core/samtools/index/main' // PICARD -include { MARK_DUPLICATES_PICARD } from '../subworkflows/nf-core/mark_duplicates_picard' -include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_FILTER } from '../modules/nf-core/modules/samtools/view/main' -include { SAMTOOLS_SORT as SAMTOOLS_SORT_FILTERED } from '../modules/nf-core/modules/samtools/sort/main' -include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_FILTERED } from '../modules/nf-core/modules/samtools/index/main' +include { SAMTOOLS_FAIDX } from '../modules/nf-core/samtools/faidx/main' +include { BAM_MARKDUPLICATES_PICARD } from '../subworkflows/local/bam_markduplicates_picard/main' +include { 
SAMTOOLS_VIEW as SAMTOOLS_VIEW_FILTER } from '../modules/nf-core/samtools/view/main' +include { SAMTOOLS_SORT as SAMTOOLS_SORT_FILTERED } from '../modules/nf-core/samtools/sort/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_FILTERED } from '../modules/nf-core/samtools/index/main' // SAMTOOLS STATISTICS -include { SAMTOOLS_STATS } from '../modules/nf-core/modules/samtools/stats/main' +include { SAMTOOLS_STATS } from '../modules/nf-core/samtools/stats/main' // BAM STATS -include { BAM_STATS_SAMTOOLS as BAM_STATS_SAMTOOLS_RAW } from '../subworkflows/nf-core/bam_stats_samtools.nf' +include { BAM_STATS_SAMTOOLS as BAM_STATS_SAMTOOLS_RAW } from '../subworkflows/local/bam_stats_samtools/main' // CIRCLE-MAP include { CIRCLEMAP_READEXTRACTOR } from '../modules/local/circlemap/readextractor.nf' -include { SAMTOOLS_SORT as SAMTOOLS_SORT_RE } from '../modules/nf-core/modules/samtools/sort/main' -include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_RE } from '../modules/nf-core/modules/samtools/index/main' -include { SAMTOOLS_SORT as SAMTOOLS_SORT_QNAME_CM } from '../modules/nf-core/modules/samtools/sort/main' +include { SAMTOOLS_SORT as SAMTOOLS_SORT_RE } from '../modules/nf-core/samtools/sort/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_RE } from '../modules/nf-core/samtools/index/main' +include { SAMTOOLS_SORT as SAMTOOLS_SORT_QNAME_CM } from '../modules/nf-core/samtools/sort/main' include { CIRCLEMAP_REALIGN } from '../modules/local/circlemap/realign.nf' include { CIRCLEMAP_REPEATS } from '../modules/local/circlemap/repeats.nf' // CIRCLE_FINDER -include { SAMTOOLS_SORT as SAMTOOLS_SORT_QNAME_CF } from '../modules/nf-core/modules/samtools/sort/main' -include { SAMBLASTER } from '../modules/local/samblaster.nf' -include { BEDTOOLS_SORTEDBAM2BED } from '../modules/local/bedtools/sortedbam2bed.nf' -include { BEDTOOLS_SPLITBAM2BED } from '../modules/local/bedtools/splitbam2bed.nf' -include { CIRCLEFINDER } from '../modules/local/circlefinder.nf' +include { SAMTOOLS_SORT as SAMTOOLS_SORT_QNAME_CF } from '../modules/nf-core/samtools/sort/main' +include { SAMBLASTER } from '../modules/local/samblaster.nf' +include { BEDTOOLS_SORTEDBAM2BED } from '../modules/local/bedtools/sortedbam2bed.nf' +include { BEDTOOLS_SPLITBAM2BED } from '../modules/local/bedtools/splitbam2bed.nf' +include { CIRCLEFINDER } from '../modules/local/circlefinder.nf' // CIRCexplorer2 include { CIRCEXPLORER2_PARSE } from '../modules/local/circexplorer2/parse.nf' @@ -137,15 +156,16 @@ include { CNVKIT_SEGMENT } from '../modules/local include { COLLECT_SEEDS } from '../modules/local/collect_seeds.nf' include { AMPLIFIED_INTERVALS } from '../modules/local/amplified_intervals.nf' include { AMPLICONARCHITECT_AMPLICONARCHITECT } from '../modules/local/ampliconarchitect/ampliconarchitect.nf' -include { AMPLICONARCHITECT_AMPLICONCLASSIFIER } from '../modules/local/ampliconarchitect/ampliconclassifier.nf' -include { AMPLICONARCHITECT_AMPLICONSIMILARITY } from '../modules/local/ampliconarchitect/ampliconsimilarity.nf' -include { SUMMARISE_AA } from '../modules/local/summarise_aa.nf' +include { AMPLICONCLASSIFIER_AMPLICONCLASSIFIER } from '../modules/local/ampliconclassifier/ampliconclassifier.nf' +include { AMPLICONCLASSIFIER_AMPLICONSIMILARITY } from '../modules/local/ampliconclassifier/ampliconsimilarity.nf' +include { AMPLICONCLASSIFIER_MAKEINPUT } from '../modules/local/ampliconclassifier/makeinput.nf' +include { AMPLICONCLASSIFIER_MAKERESULTSTABLE } from '../modules/local/ampliconclassifier/makeresultstable.nf' // Unicycler include { UNICYCLER 
} from '../modules/local/unicycler/main.nf' include { SEQTK_SEQ } from '../modules/local/seqtk/seq.nf' include { GETCIRCULARREADS } from '../modules/local/getcircularreads.nf' -include { MINIMAP2_ALIGN } from '../modules/nf-core/modules/minimap2/align/main.nf' +include { MINIMAP2_ALIGN } from '../modules/nf-core/minimap2/align/main.nf' // MULTIQC @@ -242,11 +262,12 @@ workflow CIRCDNA { // MODULE: Run bwa index // if (!bwa_index_exists & (run_ampliconarchitect | run_circexplorer2 | - run_circle_finder | run_circle_map_realign | run_circle_map_repeats)) { + run_circle_finder | run_circle_map_realign | + run_circle_map_repeats)) { BWA_INDEX ( - ch_fasta + ch_fasta_meta ) - ch_bwa_index = BWA_INDEX.out.index + ch_bwa_index = BWA_INDEX.out.index.map{ meta, index -> ["bwa_index", index] }.collect() ch_versions = ch_versions.mix(BWA_INDEX.out.versions) } @@ -256,22 +277,22 @@ workflow CIRCDNA { // if (run_ampliconarchitect | run_circexplorer2 | run_circle_finder | run_circle_map_realign | run_circle_map_repeats) { - BWA_MEM ( - ch_trimmed_reads, - ch_bwa_index, - true - ) - ch_bam_sorted = BWA_MEM.out.bam - ch_bwa_sorted = BWA_MEM.out.bam - ch_versions = ch_versions.mix(BWA_MEM.out.versions) + BWA_MEM ( + ch_trimmed_reads, + ch_bwa_index, + Channel.value(true) + ) + ch_bam_sorted = BWA_MEM.out.bam + ch_full_bam_sorted = BWA_MEM.out.bam + ch_bwa_sorted = BWA_MEM.out.bam + ch_versions = ch_versions.mix(BWA_MEM.out.versions) - // SAMTOOLS INDEX SORTED BAM - SAMTOOLS_INDEX_BAM ( - ch_bam_sorted - ) - ch_versions = ch_versions.mix(SAMTOOLS_INDEX_BAM.out.versions) + // SAMTOOLS INDEX SORTED BAM + SAMTOOLS_INDEX_BAM ( + ch_bam_sorted + ) + ch_versions = ch_versions.mix(SAMTOOLS_INDEX_BAM.out.versions) } - } else if (params.input_format == "BAM") { // Use BAM Files as input INPUT_CHECK ( @@ -281,10 +302,11 @@ workflow CIRCDNA { SAMTOOLS_SORT_BAM ( INPUT_CHECK.out.reads ) - ch_versions = ch_versions.mix(SAMTOOLS_SORT_BAM.out.versions) + ch_versions = ch_versions.mix(SAMTOOLS_SORT_BAM.out.versions) ch_bam_sorted = SAMTOOLS_SORT_BAM.out.bam } else { ch_bam_sorted = INPUT_CHECK.out.reads + ch_full_bam_sorted = INPUT_CHECK.out.reads ch_bwa_sorted = INPUT_CHECK.out.reads } // SAMTOOLS INDEX SORTED BAM @@ -297,61 +319,84 @@ workflow CIRCDNA { ch_trimgalore_multiqc_log = Channel.empty() } + + + // Define Index channel and additional bam sorted channels for Circle_finder - not usable with duplicates removed + ch_bam_sorted_bai = SAMTOOLS_INDEX_BAM.out.bai + ch_full_bam_sorted = ch_bam_sorted + ch_full_bam_sorted_bai = SAMTOOLS_INDEX_BAM.out.bai + if (run_ampliconarchitect | run_circexplorer2 | run_circle_finder | run_circle_map_realign | run_circle_map_repeats) { - ch_bam_sorted_bai = SAMTOOLS_INDEX_BAM.out.bai + ch_fasta = ch_fasta_meta.map{ meta, index -> [index] }.collect() BAM_STATS_SAMTOOLS_RAW ( - ch_bam_sorted.join(ch_bam_sorted_bai) + ch_bam_sorted.join(ch_bam_sorted_bai). 
+ map { meta, bam, bai -> [meta, bam, bai] }, + ch_fasta ) + ch_versions = ch_versions.mix(BAM_STATS_SAMTOOLS_RAW.out.versions) ch_samtools_stats = BAM_STATS_SAMTOOLS_RAW.out.stats ch_samtools_flagstat = BAM_STATS_SAMTOOLS_RAW.out.flagstat ch_samtools_idxstats = BAM_STATS_SAMTOOLS_RAW.out.idxstats // PICARD MARK_DUPLICATES if (!params.skip_markduplicates) { - MARK_DUPLICATES_PICARD ( - ch_bam_sorted - ) - ch_versions = ch_versions.mix(MARK_DUPLICATES_PICARD.out.versions) - ch_bam_sorted = MARK_DUPLICATES_PICARD.out.bam - ch_bam_sorted_bai = MARK_DUPLICATES_PICARD.out.bai - ch_markduplicates_stats = MARK_DUPLICATES_PICARD.out.stats - ch_markduplicates_flagstat = MARK_DUPLICATES_PICARD.out.flagstat - ch_markduplicates_idxstats = MARK_DUPLICATES_PICARD.out.idxstats - ch_markduplicates_multiqc = MARK_DUPLICATES_PICARD.out.metrics - - // FILTER BAM FILES USING SAMTOOLS VIEW - SAMTOOLS_VIEW_FILTER ( - ch_bam_sorted.join(ch_bam_sorted_bai), - ch_fasta - ) - ch_versions = ch_versions.mix(SAMTOOLS_VIEW_FILTER.out.versions) - - SAMTOOLS_SORT_FILTERED ( - SAMTOOLS_VIEW_FILTER.out.bam - ) - ch_versions = ch_versions.mix(SAMTOOLS_SORT_FILTERED.out.versions) - ch_bam_sorted = SAMTOOLS_SORT_FILTERED.out.bam + // Index Fasta File for Markduplicates + SAMTOOLS_FAIDX ( ch_fasta_meta ) + ch_fai = SAMTOOLS_FAIDX.out.fai.map {meta, fai -> fai }.collect() - SAMTOOLS_INDEX_FILTERED ( - ch_bam_sorted + // MARK DUPLICATES IN BAM FILE + BAM_MARKDUPLICATES_PICARD ( + ch_bam_sorted, ch_fasta, ch_fai ) - ch_versions = ch_versions.mix(SAMTOOLS_INDEX_FILTERED.out.versions) - ch_bam_sorted_bai = SAMTOOLS_INDEX_FILTERED.out.bai + // FILTER DUPLICATES IN BAM FILES USING SAMTOOLS VIEW + if (!params.keep_duplicates) { + SAMTOOLS_VIEW_FILTER ( + ch_bam_sorted.join(ch_bam_sorted_bai), + ch_fasta, + [] + ) + ch_versions = ch_versions.mix(SAMTOOLS_VIEW_FILTER.out.versions) + + // SORT FILTERED BAM FILE + SAMTOOLS_SORT_FILTERED ( + SAMTOOLS_VIEW_FILTER.out.bam + ) + ch_versions = ch_versions.mix(SAMTOOLS_SORT_FILTERED.out.versions) + + // INDEX FILTERED BAM FILE + SAMTOOLS_INDEX_FILTERED ( + SAMTOOLS_SORT_FILTERED.out.bam + ) + + ch_bam_sorted = SAMTOOLS_SORT_FILTERED.out.bam + ch_bam_sorted_bai = SAMTOOLS_INDEX_FILTERED.out.bai + ch_versions = ch_versions.mix(SAMTOOLS_INDEX_FILTERED.out.versions) + } + else { + ch_bam_sorted = BAM_MARKDUPLICATES_PICARD.out.bam + ch_bam_sorted_bai = BAM_MARKDUPLICATES_PICARD.out.bai + ch_markduplicates_stats = BAM_MARKDUPLICATES_PICARD.out.stats + ch_markduplicates_flagstat = BAM_MARKDUPLICATES_PICARD.out.flagstat + ch_markduplicates_idxstats = BAM_MARKDUPLICATES_PICARD.out.idxstats + ch_markduplicates_multiqc = BAM_MARKDUPLICATES_PICARD.out.metrics + ch_versions = ch_versions.mix(BAM_MARKDUPLICATES_PICARD.out.versions) + } } else { - ch_markduplicates_stats = Channel.empty() - ch_markduplicates_flagstat = Channel.empty() - ch_markduplicates_idxstats = Channel.empty() - ch_markduplicates_multiqc = Channel.empty() + ch_markduplicates_stats = Channel.empty() + ch_markduplicates_flagstat = Channel.empty() + ch_markduplicates_idxstats = Channel.empty() + ch_markduplicates_multiqc = Channel.empty() } } if (run_ampliconarchitect) { CNVKIT_BATCH ( ch_bam_sorted.join(ch_bam_sorted_bai), - ch_fasta + ch_fasta, + ch_cnvkit_reference ) ch_versions = ch_versions.mix(CNVKIT_BATCH.out.versions) @@ -377,20 +422,44 @@ workflow CIRCDNA { ) ch_versions = ch_versions.mix(AMPLICONARCHITECT_AMPLICONARCHITECT.out.versions) - ch_aa_cycles = AMPLICONARCHITECT_AMPLICONARCHITECT.out.cycles - ch_aa_graphs = 
AMPLICONARCHITECT_AMPLICONARCHITECT.out.graph - AMPLICONARCHITECT_AMPLICONCLASSIFIER ( - ch_aa_cycles.join(ch_aa_graphs) - ) - AMPLICONARCHITECT_AMPLICONSIMILARITY ( - ch_aa_cycles.join(ch_aa_graphs) + ch_aa_cycles = AMPLICONARCHITECT_AMPLICONARCHITECT.out.cycles. + map {meta, path -> [path]} + ch_aa_graphs = AMPLICONARCHITECT_AMPLICONARCHITECT.out.graph. + map {meta, path -> [path]} + + AMPLICONCLASSIFIER_MAKEINPUT ( + ch_aa_graphs.flatten().collect().ifEmpty([]), + ch_aa_cycles.flatten().collect().ifEmpty([]) ) - aa_summary_ch = AMPLICONARCHITECT_AMPLICONARCHITECT.out.summary - ch_versions = ch_versions.mix(AMPLICONARCHITECT_AMPLICONCLASSIFIER.out.versions) - SUMMARISE_AA ( - aa_summary_ch.join(AMPLICONARCHITECT_AMPLICONCLASSIFIER.out.class_tsv) + AMPLICONCLASSIFIER_AMPLICONCLASSIFIER ( + AMPLICONCLASSIFIER_MAKEINPUT.out.input + ) + ac_input_ch = AMPLICONCLASSIFIER_MAKEINPUT.out.input + ch_versions = ch_versions.mix(AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.versions) + AMPLICONCLASSIFIER_AMPLICONSIMILARITY ( + ac_input_ch + ) + ch_versions = ch_versions.mix(AMPLICONCLASSIFIER_AMPLICONSIMILARITY.out.versions) + + ac_input_ch. + map {file -> ["group", file]}. + set {ac_results_input_ch} + AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.class_tsv. + map {file -> ["group", file]}. + set {ac_class_ch} + // ac_results_input_ch.join(ac_class_ch). + // map{group, input_file, class_file -> [input_file, class_file]} + + AMPLICONCLASSIFIER_MAKERESULTSTABLE ( + ac_input_ch, + AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.class_tsv, + AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.gene_list, + AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.entropy, + AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.basic_properties, + AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.bed_files ) + ch_versions = ch_versions.mix(AMPLICONCLASSIFIER_MAKERESULTSTABLE.out.versions) } @@ -399,7 +468,7 @@ workflow CIRCDNA { // if (run_circle_finder) { SAMTOOLS_SORT_QNAME_CF ( - ch_bam_sorted + ch_full_bam_sorted ) ch_versions = ch_versions.mix(SAMTOOLS_SORT_QNAME_CF.out.versions) @@ -414,7 +483,7 @@ workflow CIRCDNA { ch_versions = ch_versions.mix(BEDTOOLS_SPLITBAM2BED.out.versions) BEDTOOLS_SORTEDBAM2BED ( - ch_bam_sorted.join(ch_bam_sorted_bai) + ch_full_bam_sorted.join(ch_full_bam_sorted_bai) ) ch_versions = ch_versions.mix(BEDTOOLS_SORTEDBAM2BED.out.versions) @@ -431,8 +500,6 @@ workflow CIRCDNA { if (run_circle_map_realign || run_circle_map_repeats) { - - SAMTOOLS_SORT_QNAME_CM ( ch_bam_sorted ) @@ -443,7 +510,6 @@ workflow CIRCDNA { ) ch_versions = ch_versions.mix(CIRCLEMAP_READEXTRACTOR.out.versions) - SAMTOOLS_SORT_RE ( CIRCLEMAP_READEXTRACTOR.out.bam ) @@ -521,16 +587,12 @@ workflow CIRCDNA { false ) ch_versions = ch_versions.mix(MINIMAP2_ALIGN.out.versions) - } else if (run_unicycler && !params.input_format == "FASTQ") { - exit 1, 'Unicycler needs FastQ input. Please specify input_format == "FASTQ", if possible, or don`t run unicycler.' } - // // MODULE: Pipeline reporting // CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') - ) // @@ -572,6 +634,9 @@ workflow.onComplete { NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report) } NfcoreTemplate.summary(workflow, params, log) + if (params.hook_url) { + NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log) + } } /*
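    Illustrative invocation (a sketch, not part of the pipeline code; the sample sheet,
    logo file and webhook URL below are placeholder values) combining the notification
    and MultiQC options introduced above:

        nextflow run nf-core/circdna -profile docker \
            --input samplesheet.csv \
            --outdir results \
            --circle_identifier circexplorer2 \
            --multiqc_logo my_logo.png \
            --hook_url 'https://hooks.slack.com/services/...'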