diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml new file mode 100644 index 00000000..5ccf4182 --- /dev/null +++ b/.github/workflows/awstest.yml @@ -0,0 +1,30 @@ +name: nf-core AWS test +# This workflow is triggered on PRs to the master branch. +# It runs the -profile 'test_full' on AWS batch + +on: + push: + branches: + - master + release: + types: [published] + +jobs: + run-awstest: + name: Run AWS test + runs-on: ubuntu-latest + steps: + - name: Setup Miniconda + uses: goanpeca/setup-miniconda@v1.0.2 + with: + auto-update-conda: true + python-version: 3.7 + - name: Install awscli + run: conda install -c conda-forge awscli + - name: Start AWS batch job + env: + AWS_ACCESS_KEY_ID: ${{secrets.AWS_KEY_ID}} + AWS_SECRET_ACCESS_KEY: ${{secrets.AWS_KEY_SECRET}} + TOWER_ACCESS_TOKEN: ${{secrets.TOWER_ACCESS_TOKEN}} + run: | # Submits job to AWS batch using a 'nextflow-big' instance. Setting JVM options to "-XX:+UseG1GC" for more efficient garbage collection when staging remote files. 
+ aws batch submit-job --region eu-west-1 --job-name nf-core-viralrecon --job-queue 'default-8b3836e0-5eda-11ea-96e5-0a2c3f6a2a32' --job-definition nextflow-4GiB --container-overrides '{"command": ["nf-core/viralrecon", "-r '"${GITHUB_SHA}"' -profile test_full --outdir s3://nf-core-awsmegatests/viralrecon/results-'"${GITHUB_SHA}"' -w s3://nf-core-awsmegatests/viralrecon/work-'"${GITHUB_SHA}"' -with-tower"], "environment": [{"name": "TOWER_ACCESS_TOKEN", "value": "'"$TOWER_ACCESS_TOKEN"'"}, {"name": "NXF_OPTS", "value": "-XX:+UseG1GC"}]}' diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 686ef4c5..2f031c0e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,9 +22,73 @@ jobs: - name: Pull docker image run: | docker pull nfcore/viralrecon:dev - docker tag nfcore/viralrecon:dev nfcore/viralrecon:dev + docker tag nfcore/viralrecon:dev nfcore/viralrecon:1.0.0 - name: Run pipeline with test data run: | - # TODO nf-core: You can customise CI pipeline run tests as required - # (eg. 
adding multiple test runs with different parameters) nextflow run ${GITHUB_WORKSPACE} -profile test,docker + + parameters: + env: + NXF_VER: '19.10.0' + NXF_ANSI_LOG: false + runs-on: ubuntu-latest + strategy: + matrix: + parameters: [--skip_adapter_trimming, --skip_markduplicates, --skip_variants, --skip_amplicon_trimming, --skip_kraken2, --skip_assembly] + steps: + - uses: actions/checkout@v2 + - name: Install Nextflow + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + - name: Pull docker image + run: | + docker pull nfcore/viralrecon:dev + docker tag nfcore/viralrecon:dev nfcore/viralrecon:1.0.0 + - name: Run pipeline with test amplicon data with various options + run: | + nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.parameters }} + + test_sra: + env: + NXF_VER: '19.10.0' + NXF_ANSI_LOG: false + runs-on: ubuntu-latest + strategy: + matrix: + parameters: [--skip_sra, ''] + steps: + - uses: actions/checkout@v2 + - name: Install Nextflow + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + - name: Pull docker image + run: | + docker pull nfcore/viralrecon:dev + docker tag nfcore/viralrecon:dev nfcore/viralrecon:1.0.0 + - name: Run pipeline with minimal data via SRA ids and various options + run: | + nextflow run ${GITHUB_WORKSPACE} -profile test_sra,docker ${{ matrix.parameters }} + + test_sispa: + env: + NXF_VER: '19.10.0' + NXF_ANSI_LOG: false + runs-on: ubuntu-latest + strategy: + matrix: + parameters: [--gff false, ''] + steps: + - uses: actions/checkout@v2 + - name: Install Nextflow + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + - name: Pull docker image + run: | + docker pull nfcore/viralrecon:dev + docker tag nfcore/viralrecon:dev nfcore/viralrecon:1.0.0 + - name: Run pipeline with minimal SISPA data and various options + run: | + nextflow run ${GITHUB_WORKSPACE} -profile test_sispa,docker ${{ matrix.parameters }} diff --git a/CHANGELOG.md 
b/CHANGELOG.md index 3e3e8eb0..734d327b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,14 +3,35 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). -## v1.0dev - [date] +## [1.0.0] - 2020-06-01 Initial release of nf-core/viralrecon, created with the [nf-core](http://nf-co.re/) template. -### `Added` +This pipeline is a re-implementation of the [SARS_Cov2_consensus-nf](https://github.com/BU-ISCIII/SARS_Cov2_consensus-nf) and [SARS_Cov2_assembly-nf](https://github.com/BU-ISCIII/SARS_Cov2_assembly-nf) pipelines initially developed by [Sarai Varona](https://github.com/svarona) and [Sara Monzon](https://github.com/saramonzon) from [BU-ISCIII](https://github.com/BU-ISCIII). Porting both of these pipelines to nf-core was an international collaboration between numerous contributors and developers, led by [Harshil Patel](https://github.com/drpatelh) from the [The Bioinformatics & Biostatistics Group](https://www.crick.ac.uk/research/science-technology-platforms/bioinformatics-and-biostatistics/) at [The Francis Crick Institute](https://www.crick.ac.uk/), London. We appreciated the need to have a portable, reproducible and scalable pipeline for the analysis of COVID-19 sequencing samples and so the Avengers Assembled! -### `Fixed` +### Pipeline summary -### `Dependencies` - -### `Deprecated` +1. Download samples via SRA, ENA or GEO ids ([`ENA FTP`](https://ena-docs.readthedocs.io/en/latest/retrieval/file-download.html), [`parallel-fastq-dump`](https://github.com/rvalieris/parallel-fastq-dump); *if required*) +2. Merge re-sequenced FastQ files ([`cat`](http://www.linfo.org/cat.html); *if required*) +3. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) +4. Adapter trimming ([`fastp`](https://github.com/OpenGene/fastp)) +5. Variant calling + 1. 
Read alignment ([`Bowtie 2`](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml)) + 2. Sort and index alignments ([`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/)) + 3. Primer sequence removal ([`iVar`](https://github.com/andersen-lab/ivar); *amplicon data only*) + 4. Duplicate read marking ([`picard`](https://broadinstitute.github.io/picard/); *removal optional*) + 5. Alignment-level QC ([`picard`](https://broadinstitute.github.io/picard/), [`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/)) + 6. Choice of multiple variant calling and consensus sequence generation routes ([`VarScan 2`](http://dkoboldt.github.io/varscan/), [`BCFTools`](http://samtools.github.io/bcftools/bcftools.html), [`BEDTools`](https://github.com/arq5x/bedtools2/) *||* [`iVar variants and consensus`](https://github.com/andersen-lab/ivar) *||* [`BCFTools`](http://samtools.github.io/bcftools/bcftools.html), [`BEDTools`](https://github.com/arq5x/bedtools2/)) + * Variant annotation ([`SnpEff`](http://snpeff.sourceforge.net/SnpEff.html), [`SnpSift`](http://snpeff.sourceforge.net/SnpSift.html)) + * Consensus assessment report ([`QUAST`](http://quast.sourceforge.net/quast)) +6. _De novo_ assembly + 1. Primer trimming ([`Cutadapt`](https://cutadapt.readthedocs.io/en/stable/guide.html); *amplicon data only*) + 2. Removal of host reads ([`Kraken 2`](http://ccb.jhu.edu/software/kraken2/)) + 3. 
Choice of multiple assembly tools ([`SPAdes`](http://cab.spbu.ru/software/spades/) *||* [`metaSPAdes`](http://cab.spbu.ru/software/meta-spades/) *||* [`Unicycler`](https://github.com/rrwick/Unicycler) *||* [`minia`](https://github.com/GATB/minia)) + * Blast to reference genome ([`blastn`](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch)) + * Contiguate assembly ([`ABACAS`](https://www.sanger.ac.uk/science/tools/pagit)) + * Assembly report ([`PlasmidID`](https://github.com/BU-ISCIII/plasmidID)) + * Assembly assessment report ([`QUAST`](http://quast.sourceforge.net/quast)) + * Call variants relative to reference ([`Minimap2`](https://github.com/lh3/minimap2), [`seqwish`](https://github.com/ekg/seqwish), [`vg`](https://github.com/vgteam/vg), [`Bandage`](https://github.com/rrwick/Bandage)) + * Variant annotation ([`SnpEff`](http://snpeff.sourceforge.net/SnpEff.html), [`SnpSift`](http://snpeff.sourceforge.net/SnpSift.html)) +7. Present QC and visualisation for raw read, alignment, assembly and variant calling results ([`MultiQC`](http://multiqc.info/)) diff --git a/CITATIONS.md b/CITATIONS.md new file mode 100644 index 00000000..ef6aa270 --- /dev/null +++ b/CITATIONS.md @@ -0,0 +1,103 @@ +# nf-core/viralrecon: Citations + +## [nf-core](https://www.ncbi.nlm.nih.gov/pubmed/32055031/) + +> Ewels PA, Peltzer A, Fillinger S, Patel H, Alneberg J, Wilm A, Garcia MU, Di Tommaso P, Nahnsen S. The nf-core framework for community-curated bioinformatics pipelines. Nat Biotechnol. 2020 Mar;38(3):276-278. doi: 10.1038/s41587-020-0439-x. PubMed PMID: 32055031. ReadCube: [Full Access Link](https://rdcu.be/b1GjZ). + +## [Nextflow](https://www.ncbi.nlm.nih.gov/pubmed/28398311/) + +> Di Tommaso P, Chatzou M, Floden EW, Barja PP, Palumbo E, Notredame C. Nextflow enables reproducible computational workflows. Nat Biotechnol. 2017 Apr 11;35(4):316-319. doi: 10.1038/nbt.3820. PubMed PMID: 28398311. 
+ +## Pipeline tools + +* [ABACAS](https://www.ncbi.nlm.nih.gov/pubmed/19497936/) + > Assefa S, Keane TM, Otto TD, Newbold C, Berriman M. ABACAS: algorithm-based automatic contiguation of assembled sequences. Bioinformatics. 2009 Aug 1;25(15):1968-9. doi: 10.1093/bioinformatics/btp347. Epub 2009 Jun 3. PubMed PMID: 19497936; PubMed Central PMCID: PMC2712343. + +* [Bandage](https://www.ncbi.nlm.nih.gov/pubmed/26099265) + > Wick R.R., Schultz M.B., Zobel J. & Holt K.E. Bandage: interactive visualisation of de novo genome assemblies. Bioinformatics, 31(20), 3350-3352. doi: 10.1093/bioinformatics/btv383. PubMed PMID: 26099265; PubMed Central PMCID: PMC4595904. + +* [BCFtools](https://www.ncbi.nlm.nih.gov/pubmed/21903627/) + > Li H. A statistical framework for SNP calling, mutation discovery, association mapping and population genetical parameter estimation from sequencing data. Bioinformatics. 2011 Nov 1;27(21):2987-93. doi: 10.1093/bioinformatics/btr509. Epub 2011 Sep 8. PubMed PMID: 21903627; PubMed Central PMCID: PMC3198575. + +* [BEDTools](https://www.ncbi.nlm.nih.gov/pubmed/20110278/) + > Quinlan AR, Hall IM. BEDTools: a flexible suite of utilities for comparing genomic features. Bioinformatics. 2010 Mar 15;26(6):841-2. doi: 10.1093/bioinformatics/btq033. Epub 2010 Jan 28. PubMed PMID: 20110278; PubMed Central PMCID: PMC2832824. + +* [BLAST](https://www.ncbi.nlm.nih.gov/pubmed/20003500/) + > Camacho C, Coulouris G, Avagyan V, Ma N, Papadopoulos J, Bealer K, Madden TL. BLAST+: architecture and applications. BMC Bioinformatics. 2009 Dec 15;10:421. doi: 10.1186/1471-2105-10-421. PubMed PMID: 20003500; PubMed Central PMCID: PMC2803857. + +* [Bowtie 2](https://www.ncbi.nlm.nih.gov/pubmed/22388286/) + > Langmead B, Salzberg SL. Fast gapped-read alignment with Bowtie 2. Nat Methods. 2012 Mar 4;9(4):357-9. doi: 10.1038/nmeth.1923. PubMed PMID: 22388286; PubMed Central PMCID: PMC3322381. + +* [Cutadapt](http://dx.doi.org/10.14806/ej.17.1.200) + > Martin, M. 
Cutadapt removes adapter sequences from high-throughput sequencing reads. EMBnet.journal, [S.l.], v. 17, n. 1, p. pp. 10-12, may 2011. ISSN 2226-6089. doi: 10.14806/ej.17.1.200. + +* [fastp](https://www.ncbi.nlm.nih.gov/pubmed/30423086/) + > Chen S, Zhou Y, Chen Y, Gu J. fastp: an ultra-fast all-in-one FASTQ preprocessor. Bioinformatics. 2018 Sep 1;34(17):i884-i890. doi: 10.1093/bioinformatics/bty560. PubMed PMID: 30423086; PubMed Central PMCID: PMC6129281. + +* [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) + +* [iVar](https://www.ncbi.nlm.nih.gov/pubmed/30621750/) + > Grubaugh ND, Gangavarapu K, Quick J, Matteson NL, De Jesus JG, Main BJ, Tan AL, Paul LM, Brackney DE, Grewal S, Gurfield N, Van Rompay KKA, Isern S, Michael SF, Coffey LL, Loman NJ, Andersen KG. An amplicon-based sequencing framework for accurately measuring intrahost virus diversity using PrimalSeq and iVar. Genome Biol. 2019 Jan 8;20(1):8. doi: 10.1186/s13059-018-1618-7. PubMed PMID: 30621750; PubMed Central PMCID: PMC6325816. + +* [Kraken 2](https://www.ncbi.nlm.nih.gov/pubmed/31779668/) + > Wood DE, Lu J, Langmead B. Improved metagenomic analysis with Kraken 2. Genome Biol. 2019 Nov 28;20(1):257. doi: 10.1186/s13059-019-1891-0. PubMed PMID: 31779668; PubMed Central PMCID: PMC6883579. + +* [minia](https://www.ncbi.nlm.nih.gov/pubmed/24040893/) + > Chikhi R, Rizk G. Space-efficient and exact de Bruijn graph representation based on a Bloom filter. Algorithms Mol Biol. 2013 Sep 16;8(1):22. doi: 10.1186/1748-7188-8-22. PubMed PMID: 24040893; PubMed Central PMCID: PMC3848682. + +* [Minimap2](https://www.ncbi.nlm.nih.gov/pubmed/29750242/) + > Li H. Minimap2: pairwise alignment for nucleotide sequences. Bioinformatics. 2018 Sep 15;34(18):3094-3100. doi: 10.1093/bioinformatics/bty191. PubMed PMID: 29750242; PubMed Central PMCID: PMC6137996. + +* [MultiQC](https://www.ncbi.nlm.nih.gov/pubmed/27312411/) + > Ewels P, Magnusson M, Lundin S, Käller M. 
MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. + +* [parallel-fastq-dump](https://github.com/rvalieris/parallel-fastq-dump) + +* [picard-tools](http://broadinstitute.github.io/picard) + +* [QUAST](https://www.ncbi.nlm.nih.gov/pubmed/23422339/) + > Gurevich A, Saveliev V, Vyahhi N, Tesler G. QUAST: quality assessment tool for genome assemblies. Bioinformatics. 2013 Apr 15;29(8):1072-5. doi: 10.1093/bioinformatics/btt086. Epub 2013 Feb 19. PubMed PMID: 23422339; PubMed Central PMCID: PMC3624806. + +* [R](https://www.R-project.org/) + > R Core Team (2017). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. + +* [SAMtools](https://www.ncbi.nlm.nih.gov/pubmed/19505943/) + > Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009 Aug 15;25(16):2078-9. doi: 10.1093/bioinformatics/btp352. Epub 2009 Jun 8. PubMed PMID: 19505943; PubMed Central PMCID: PMC2723002. + +* [seqwish](https://github.com/ekg/seqwish) + +* [SnpEff](https://www.ncbi.nlm.nih.gov/pubmed/22728672/) + > Cingolani P, Platts A, Wang le L, Coon M, Nguyen T, Wang L, Land SJ, Lu X, Ruden DM. A program for annotating and predicting the effects of single nucleotide polymorphisms, SnpEff: SNPs in the genome of Drosophila melanogaster strain w1118; iso-2; iso-3. Fly (Austin). 2012 Apr-Jun;6(2):80-92. doi: 10.4161/fly.19695. PubMed PMID: 22728672; PubMed Central PMCID: PMC3679285. + +* [SnpSift](https://www.ncbi.nlm.nih.gov/pubmed/22435069/) + > Cingolani P, Patel VM, Coon M, Nguyen T, Land SJ, Ruden DM, Lu X. 
Using Drosophila melanogaster as a Model for Genotoxic Chemical Mutational Studies with a New Program, SnpSift. Front Genet. 2012 Mar 15;3:35. doi: 10.3389/fgene.2012.00035. eCollection 2012. PubMed PMID: 22435069; PubMed Central PMCID: PMC3304048. + +* [SPAdes](https://www.ncbi.nlm.nih.gov/pubmed/24093227/) + > Nurk S, Bankevich A, Antipov D, Gurevich AA, Korobeynikov A, Lapidus A, Prjibelski AD, Pyshkin A, Sirotkin A, Sirotkin Y, Stepanauskas R, Clingenpeel SR, Woyke T, McLean JS, Lasken R, Tesler G, Alekseyev MA, Pevzner PA. Assembling single-cell genomes and mini-metagenomes from chimeric MDA products. J Comput Biol. 2013 Oct;20(10):714-37. doi: 10.1089/cmb.2013.0084. PubMed PMID: 24093227; PubMed Central PMCID: PMC3791033. + +* [SRA Toolkit](http://ncbi.github.io/sra-tools/) + +* [Trimmomatic](https://www.ncbi.nlm.nih.gov/pubmed/24695404/) + > Bolger AM, Lohse M, Usadel B. Trimmomatic: a flexible trimmer for Illumina sequence data. Bioinformatics. 2014 Aug 1;30(15):2114-20. doi: 10.1093/bioinformatics/btu170. Epub 2014 Apr 1. PubMed PMID: 24695404; PubMed Central PMCID: PMC4103590. + +* [Unicycler](https://www.ncbi.nlm.nih.gov/pubmed/28594827/) + > Wick RR, Judd LM, Gorrie CL, Holt KE. Unicycler: Resolving bacterial genome assemblies from short and long sequencing reads. PLoS Comput Biol. 2017 Jun 8;13(6):e1005595. doi: 10.1371/journal.pcbi.1005595. eCollection 2017 Jun. PubMed PMID: 28594827; PubMed Central PMCID: PMC5481147. + +* [VarScan 2](https://www.ncbi.nlm.nih.gov/pubmed/22300766/) + > Koboldt DC, Zhang Q, Larson DE, Shen D, McLellan MD, Lin L, Miller CA, Mardis ER, Ding L, Wilson RK. VarScan 2: somatic mutation and copy number alteration discovery in cancer by exome sequencing. Genome Res. 2012 Mar;22(3):568-76. doi: 10.1101/gr.129684.111. Epub 2012 Feb 2. PubMed PMID: 22300766; PubMed Central PMCID: PMC3290792. 
+ +* [vg](https://www.ncbi.nlm.nih.gov/pubmed/30125266/) + > Garrison E, Sirén J, Novak AM, Hickey G, Eizenga JM, Dawson ET, Jones W, Garg S, Markello C, Lin MF, Paten B, Durbin R. Variation graph toolkit improves read mapping by representing genetic variation in the reference. Nat Biotechnol. 2018 Oct;36(9):875-879. doi: 10.1038/nbt.4227. Epub 2018 Aug 20. PubMed PMID: 30125266; PubMed Central PMCID: PMC6126949. + +## Software packaging/containerisation tools + +* [Bioconda](https://www.ncbi.nlm.nih.gov/pubmed/29967506/) + > Grüning B, Dale R, Sjödin A, Chapman BA, Rowe J, Tomkins-Tinch CH, Valieris R, Köster J; Bioconda Team. Bioconda: sustainable and comprehensive software distribution for the life sciences. Nat Methods. 2018 Jul;15(7):475-476. doi: 10.1038/s41592-018-0046-7. PubMed PMID: 29967506. + +* [Anaconda](https://anaconda.com) + > Anaconda Software Distribution. Computer software. Vers. 2-2.4.0. Anaconda, Nov. 2016. Web. + +* [Singularity](https://www.ncbi.nlm.nih.gov/pubmed/28494014/) + > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675. 
+ +* [Docker](https://dl.acm.org/doi/10.5555/2600239.2600241) diff --git a/Dockerfile b/Dockerfile index 6a47941f..e2ff3e00 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,8 +6,15 @@ LABEL authors="Sarai Varona and Sara Monzon" \ COPY environment.yml / RUN conda env create -f /environment.yml && conda clean -a +# For Bandage: otherwise it complains about missing libGL.so.1 +RUN apt-get install -y libgl1-mesa-glx && apt-get clean -y + # Add conda installation dir to PATH (instead of doing 'conda activate') -ENV PATH /opt/conda/envs/nf-core-viralrecon-1.0dev/bin:$PATH +ENV PATH /opt/conda/envs/nf-core-viralrecon-1.0.0/bin:$PATH # Dump the details of the installed packages to a file for posterity -RUN conda env export --name nf-core-viralrecon-1.0dev > nf-core-viralrecon-1.0dev.yml +RUN conda env export --name nf-core-viralrecon-1.0.0 > nf-core-viralrecon-1.0.0.yml + +# Instruct R processes to use these empty files instead of clashing with a local version +RUN touch .Rprofile +RUN touch .Renviron diff --git a/README.md b/README.md index 5166c012..da53f2f9 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,51 @@ # ![nf-core/viralrecon](docs/images/nf-core-viralrecon_logo.png) -**Assembly and intrahost/low-frequency variant calling for viral samples**. 
- [![GitHub Actions CI Status](https://github.com/nf-core/viralrecon/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/viralrecon/actions) [![GitHub Actions Linting Status](https://github.com/nf-core/viralrecon/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/viralrecon/actions) [![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A519.10.0-brightgreen.svg)](https://www.nextflow.io/) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg)](http://bioconda.github.io/) [![Docker](https://img.shields.io/docker/automated/nfcore/viralrecon.svg)](https://hub.docker.com/r/nfcore/viralrecon) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.XXXXXXX.svg)](https://doi.org/10.5281/zenodo.XXXXXXX) ## Introduction -The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It comes with docker containers making installation trivial and results highly reproducible. +**nf-core/viralrecon** is a bioinformatics analysis pipeline used to perform assembly and intrahost/low-frequency variant calling for viral samples. The pipeline currently supports metagenomics and amplicon sequencing data derived from the Illumina sequencing platform. + +This pipeline is a re-implementation of the [SARS_Cov2_consensus-nf](https://github.com/BU-ISCIII/SARS_Cov2_consensus-nf) and [SARS_Cov2_assembly-nf](https://github.com/BU-ISCIII/SARS_Cov2_assembly-nf) pipelines initially developed by [Sarai Varona](https://github.com/svarona) and [Sara Monzon](https://github.com/saramonzon) from [BU-ISCIII](https://github.com/BU-ISCIII). 
Porting both of these pipelines to nf-core was an international collaboration between numerous contributors and developers, led by [Harshil Patel](https://github.com/drpatelh) from [The Bioinformatics & Biostatistics Group](https://www.crick.ac.uk/research/science-technology-platforms/bioinformatics-and-biostatistics/) at [The Francis Crick Institute](https://www.crick.ac.uk/), London. We appreciated the need to have a portable, reproducible and scalable pipeline for the analysis of COVID-19 sequencing samples and so the Avengers Assembled! Please come and join us and add yourself to the contributor list :) + +We have integrated a number of options in the pipeline to allow you to run specific aspects of the workflow if you so wish. For example, you can skip all of the assembly steps with the `--skip_assembly` parameter. See [usage docs](docs/usage.md) for all of the available options when running the pipeline. + +Please click [here](https://raw.githack.com/nf-core/viralrecon/master/docs/html/multiqc_report.html) to see an example MultiQC report generated using the parameters defined in [this configuration file](https://github.com/nf-core/viralrecon/blob/master/conf/test_full.config) to run the pipeline on [samples](https://zenodo.org/record/3735111) which were prepared from the [ncov-2019 ARTIC Network V1 amplicon set](https://artic.network/ncov-2019) and sequenced on the Illumina MiSeq platform in 301bp paired-end format. + +The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It comes with docker containers making installation trivial and results highly reproducible. Furthermore, automated continuous integration tests to run the pipeline on a full-sized dataset are passing on AWS cloud. + +## Pipeline summary + +1. 
Download samples via SRA, ENA or GEO ids ([`ENA FTP`](https://ena-docs.readthedocs.io/en/latest/retrieval/file-download.html), [`parallel-fastq-dump`](https://github.com/rvalieris/parallel-fastq-dump); *if required*) +2. Merge re-sequenced FastQ files ([`cat`](http://www.linfo.org/cat.html); *if required*) +3. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) +4. Adapter trimming ([`fastp`](https://github.com/OpenGene/fastp)) +5. Variant calling + 1. Read alignment ([`Bowtie 2`](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml)) + 2. Sort and index alignments ([`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/)) + 3. Primer sequence removal ([`iVar`](https://github.com/andersen-lab/ivar); *amplicon data only*) + 4. Duplicate read marking ([`picard`](https://broadinstitute.github.io/picard/); *removal optional*) + 5. Alignment-level QC ([`picard`](https://broadinstitute.github.io/picard/), [`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/)) + 6. Choice of multiple variant calling and consensus sequence generation routes ([`VarScan 2`](http://dkoboldt.github.io/varscan/), [`BCFTools`](http://samtools.github.io/bcftools/bcftools.html), [`BEDTools`](https://github.com/arq5x/bedtools2/) *||* [`iVar variants and consensus`](https://github.com/andersen-lab/ivar) *||* [`BCFTools`](http://samtools.github.io/bcftools/bcftools.html), [`BEDTools`](https://github.com/arq5x/bedtools2/)) + * Variant annotation ([`SnpEff`](http://snpeff.sourceforge.net/SnpEff.html), [`SnpSift`](http://snpeff.sourceforge.net/SnpSift.html)) + * Consensus assessment report ([`QUAST`](http://quast.sourceforge.net/quast)) +6. _De novo_ assembly + 1. Primer trimming ([`Cutadapt`](https://cutadapt.readthedocs.io/en/stable/guide.html); *amplicon data only*) + 2. Removal of host reads ([`Kraken 2`](http://ccb.jhu.edu/software/kraken2/)) + 3. 
Choice of multiple assembly tools ([`SPAdes`](http://cab.spbu.ru/software/spades/) *||* [`metaSPAdes`](http://cab.spbu.ru/software/meta-spades/) *||* [`Unicycler`](https://github.com/rrwick/Unicycler) *||* [`minia`](https://github.com/GATB/minia)) + * Blast to reference genome ([`blastn`](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch)) + * Contiguate assembly ([`ABACAS`](https://www.sanger.ac.uk/science/tools/pagit)) + * Assembly report ([`PlasmidID`](https://github.com/BU-ISCIII/plasmidID)) + * Assembly assessment report ([`QUAST`](http://quast.sourceforge.net/quast)) + * Call variants relative to reference ([`Minimap2`](https://github.com/lh3/minimap2), [`seqwish`](https://github.com/ekg/seqwish), [`vg`](https://github.com/vgteam/vg), [`Bandage`](https://github.com/rrwick/Bandage)) + * Variant annotation ([`SnpEff`](http://snpeff.sourceforge.net/SnpEff.html), [`SnpSift`](http://snpeff.sourceforge.net/SnpSift.html)) +7. Present QC and visualisation for raw read, alignment, assembly and variant calling results ([`MultiQC`](http://multiqc.info/)) ## Quick Start @@ -29,10 +63,8 @@ nextflow run nf-core/viralrecon -profile test, - ```bash -nextflow run nf-core/viralrecon -profile --reads '*_R{1,2}.fastq.gz' --genome GRCh37 +nextflow run nf-core/viralrecon -profile --input samplesheet.csv --genome 'NC_045512.2' -profile docker ``` See [usage docs](docs/usage.md) for all of the available options when running the pipeline. @@ -45,27 +77,47 @@ The nf-core/viralrecon pipeline comes with documentation about the pipeline, fou 2. Pipeline configuration * [Local installation](https://nf-co.re/usage/local_installation) * [Adding your own system config](https://nf-co.re/usage/adding_own_config) - * [Reference genomes](https://nf-co.re/usage/reference_genomes) + * [Reference genomes](docs/usage.md#reference-genomes) 3. [Running the pipeline](docs/usage.md) 4. [Output and how to interpret the results](docs/output.md) 5. 
[Troubleshooting](https://nf-co.re/usage/troubleshooting) - - ## Credits -nf-core/viralrecon was originally written by Sarai Varona and Sara Monzon. +These scripts were originally written by [Sarai Varona](https://github.com/svarona), [Miguel Juliá](https://github.com/MiguelJulia) and [Sara Monzon](https://github.com/saramonzon) from [BU-ISCIII](https://github.com/BU-ISCIII) and co-ordinated by Isabel Cuesta for the [Institute of Health Carlos III](https://eng.isciii.es/eng.isciii.es/Paginas/Inicio.html), Spain. Through collaboration with the nf-core community the pipeline has now been updated substantially to include additional processing steps, to standardise inputs/outputs and to improve pipeline reporting; implemented primarily by [Harshil Patel](https://github.com/drpatelh) from [The Bioinformatics & Biostatistics Group](https://www.crick.ac.uk/research/science-technology-platforms/bioinformatics-and-biostatistics/) at [The Francis Crick Institute](https://www.crick.ac.uk/), London. 
+ +Many thanks to others who have helped out and contributed along the way too, including (but not limited to): + +| Name                                                      | Affiliation                                                                           | +|-----------------------------------------------------------|---------------------------------------------------------------------------------------| +| [Alexander Peltzer](https://github.com/apeltzer)          | [Boehringer Ingelheim, Germany](https://www.boehringer-ingelheim.de/)                 | +| [Alison Meynert](https://github.com/ameynert)             | [University of Edinburgh, Scotland](https://www.ed.ac.uk/)                            | +| [Edgar Garriga Nogales](https://github.com/edgano)        | [Centre for Genomic Regulation, Spain](https://www.crg.eu/)                           | +| [Erik Garrison](https://github.com/ekg)                   | [UCSC, USA](https://www.ucsc.edu/)                                                    | +| [Gisela Gabernet](https://github.com/ggabernet)           | [QBiC, University of Tübingen, Germany](https://portal.qbic.uni-tuebingen.de/portal/) | +| [Joao Curado](https://github.com/jcurado-flomics)         | [Flomics Biotech, Spain](https://www.flomics.com/)                                    | +| [Jose Espinosa-Carrasco](https://github.com/JoseEspinosa) | [Centre for Genomic Regulation, Spain](https://www.crg.eu/)                           | +| [Katrin Sameith](https://github.com/ktrns)                | [DRESDEN-concept Genome Center, Germany](https://genomecenter.tu-dresden.de)          | +| [Lluc Cabus](https://github.com/lcabus-flomics)           | [Flomics Biotech, Spain](https://www.flomics.com/)                                    | +| [Marta Pozuelo](https://github.com/mpozuelo-flomics)      | [Flomics Biotech, Spain](https://www.flomics.com/)                                    | +| [Maxime Garcia](https://github.com/MaxUlysse)             | [SciLifeLab, Sweden](https://www.scilifelab.se/)                                      | +| [Michael Heuer](https://github.com/heuermh)               | [UC Berkeley, USA](https://rise.cs.berkeley.edu)                                      | +| [Phil Ewels](https://github.com/ewels)                    | [SciLifeLab, Sweden](https://www.scilifelab.se/)                                      | +| [Simon Heumos](https://github.com/subwaystation)          | [QBiC, University of Tübingen, Germany](https://portal.qbic.uni-tuebingen.de/portal/) | +| [Stephen Kelly](https://github.com/stevekm)               | [Memorial Sloan Kettering Cancer Center, 
USA](https://www.mskcc.org/) | +| [Thanh Le Viet](https://github.com/thanhleviet) | [Quadram Institute, UK](https://quadram.ac.uk/) | + +> Listed in alphabetical order ## Contributions and Support -If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). +If you would like to contribute to this pipeline, please see the [contributing guidelines](https://github.com/nf-core/viralrecon/blob/master/.github/CONTRIBUTING.md). For further information or help, don't hesitate to get in touch on [Slack](https://nfcore.slack.com/channels/viralrecon) (you can join with [this invite](https://nf-co.re/join/slack)). ## Citation - - +If you use nf-core/viralrecon for your analysis, please cite it using the following doi: [10.5281/zenodo.XXXXXX](https://doi.org/10.5281/zenodo.XXXXXX) You can cite the `nf-core` publication as follows: @@ -73,5 +125,7 @@ You can cite the `nf-core` publication as follows: > > Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. > -> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x). +> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x). > ReadCube: [Full Access Link](https://rdcu.be/b1GjZ) + +An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](https://github.com/nf-core/viralrecon/blob/master/CITATIONS.md) file. 
diff --git a/assets/headers/blast_outfmt6_header.txt b/assets/headers/blast_outfmt6_header.txt new file mode 100644 index 00000000..1bbb1847 --- /dev/null +++ b/assets/headers/blast_outfmt6_header.txt @@ -0,0 +1 @@ +stitle qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore slen qlen qcovs %cgAligned %refCovered diff --git a/assets/headers/ivar_variants_header_mqc.txt b/assets/headers/ivar_variants_header_mqc.txt new file mode 100644 index 00000000..8dcb12ef --- /dev/null +++ b/assets/headers/ivar_variants_header_mqc.txt @@ -0,0 +1,8 @@ +#id: 'ivar_variants' +#section_name: 'VARIANTS: iVar variant counts' +#description: "is calculated from the total number of variants called by +# iVar." +#plot_type: 'bargraph' +#anchor: 'ivar_variants' +#pconfig: +# title: 'iVar variant counts' diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index b9dfaee7..8613adff 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -2,10 +2,275 @@ report_comment: > This report has been generated by the nf-core/viralrecon analysis pipeline. For information about how to interpret these results, please see the documentation. + +data_format: 'yaml' + +run_modules: + - custom_content + - fastqc + - fastp + - bowtie2 + - samtools + - picard + - varscan2 + - ivar + - bcftools + - snpeff + - quast + - cutadapt + - kraken + +exclude_modules: + - 'general_stats' + +module_order: + - fastqc: + name: 'PREPROCESS: FastQC (raw reads)' + anchor: 'fastqc_raw' + info: 'This section of the report shows FastQC results for the raw reads before adapter trimming.' + path_filters: + - './fastqc/*' + - fastp: + name: 'PREPROCESS: fastp (adapter trimming)' + info: 'This section of the report shows fastp results for reads after adapter and quality trimming.' 
+ path_filters: + - './fastp/log/*' + - fastqc: + name: 'PREPROCESS: FastQC (adapter trimming)' + anchor: 'fastqc_fastp' + info: 'This section of the report shows FastQC results for reads after adapter and quality trimming.' + path_filters: + - './fastp/fastqc/*' + - bowtie2: + name: 'VARIANTS: Bowtie 2' + info: 'This section of the report shows Bowtie 2 mapping results for reads after adapter trimming and quality trimming.' + path_filters: + - './bowtie2/log/*' + - samtools: + name: 'VARIANTS: SAMTools (raw)' + anchor: 'samtools_bowtie2' + info: 'This section of the report shows SAMTools counts/statistics after mapping with Bowtie 2.' + path_filters: + - './bowtie2/flagstat/*' + - ivar: + name: 'VARIANTS: iVar trim' + info: 'This section of the report shows counts observed for each amplicon primer per sample as detected by iVar trim.' + path_filters: + - './ivar/trim/log/*' + - samtools: + name: 'VARIANTS: SAMTools (iVar)' + anchor: 'samtools_ivar' + info: 'This section of the report shows SAMTools counts/statistics after primer sequence removal with iVar.' + path_filters: + - './ivar/trim/flagstat/*' + - samtools: + name: 'VARIANTS: SAMTools (MarkDuplicates)' + anchor: 'samtools_markduplicates' + info: 'This section of the report shows SAMTools counts/statistics after duplicate removal with picard MarkDuplicates.' + path_filters: + - './picard/markdup/*' + - picard: + name: 'VARIANTS: Picard Metrics' + info: 'This section of the report shows picard CollectMultipleMetrics and MarkDuplicates results after mapping (if "--protocol amplicon" this will be after primer sequence removal with iVar).' + path_filters: + - './picard/metrics/*' + - varscan2: + name: 'VARIANTS: VarScan 2' + info: 'This section of the report shows total number of variants called by VarScan 2 broken down by those that were reported or not.' 
+ path_filters: + - './varscan2/counts/lowfreq/*' + - bcftools: + name: 'VARIANTS: BCFTools (VarScan 2; high freq)' + anchor: 'bcftools_varscan2' + info: 'This section of the report shows BCFTools stats results for high frequency variants called by VarScan 2. The allele frequency filtering threshold can be set by the --max_allele_freq parameter (Default: 0.8).' + path_filters: + - './varscan2/bcftools/highfreq/*' + - snpeff: + name: 'VARIANTS: SnpEff (VarScan 2; high freq)' + anchor: 'snpeff_varscan2' + info: 'This section of the report shows SnpEff results for high frequency variants called by VarScan 2. The allele frequency filtering threshold can be set by the --max_allele_freq parameter (Default: 0.8).' + path_filters: + - './varscan2/snpeff/highfreq/*' + - quast: + name: 'VARIANTS: QUAST (VarScan 2; high freq)' + anchor: 'quast_varscan2' + info: 'This section of the report shows QUAST results for consensus sequences generated from high frequency variants with VarScan 2. The allele frequency filtering threshold can be set by the --max_allele_freq parameter (Default: 0.8).' + path_filters: + - './varscan2/quast/highfreq/*' + - bcftools: + name: 'VARIANTS: BCFTools (iVar; high freq)' + anchor: 'bcftools_ivar' + info: 'This section of the report shows BCFTools stats results for high frequency variants called by iVar. The allele frequency filtering threshold can be set by the --max_allele_freq parameter (Default: 0.8).' + path_filters: + - './ivar/variants/bcftools/highfreq/*' + - snpeff: + name: 'VARIANTS: SnpEff (iVar; high freq)' + anchor: 'snpeff_ivar' + info: 'This section of the report shows SnpEff results for high frequency variants called by iVar. The allele frequency filtering threshold can be set by the --max_allele_freq parameter (Default: 0.8).' 
+ path_filters: + - './ivar/variants/snpeff/highfreq/*' + - quast: + name: 'VARIANTS: QUAST (iVar; high freq)' + anchor: 'quast_ivar' + info: 'This section of the report shows QUAST results for consensus sequences generated from high frequency variants with iVar. The allele frequency filtering threshold can be set by the --max_allele_freq parameter (Default: 0.8).' + path_filters: + - './ivar/consensus/quast/highfreq/*' + - bcftools: + name: 'VARIANTS: BCFTools (BCFTools)' + anchor: 'bcftools_bcftools' + info: 'This section of the report shows BCFTools stats results for variants called by BCFTools.' + path_filters: + - './bcftools/variants/bcftools/*' + - snpeff: + name: 'VARIANTS: SnpEff (BCFTools)' + anchor: 'snpeff_bcftools' + info: 'This section of the report shows SnpEff results for variants called by BCFTools.' + path_filters: + - './bcftools/variants/snpeff/*' + - quast: + name: 'VARIANTS: QUAST (BCFTools)' + anchor: 'quast_bcftools' + info: 'This section of the report shows QUAST results for consensus sequence generated from BCFTools variants.' + path_filters: + - './bcftools/consensus/quast/*' + - cutadapt: + name: 'ASSEMBLY: Cutadapt (primer trimming)' + info: 'This section of the report shows Cutadapt results for reads after primer sequence trimming.' + path_filters: + - './cutadapt/log/*' + - fastqc: + name: 'ASSEMBLY: FastQC (primer trimming)' + anchor: 'fastqc_cutadapt' + info: 'This section of the report shows FastQC results for reads after primer sequence trimming with Cutadapt.' + path_filters: + - './cutadapt/fastqc/*' + - kraken: + name: 'ASSEMBLY: Kraken 2' + info: 'This section of the report shows Kraken 2 classification results for reads after primer sequence trimming with Cutadapt.' + path_filters: + - './kraken2/*' + - quast: + name: 'ASSEMBLY: QUAST (SPAdes)' + anchor: 'quast_spades' + info: 'This section of the report shows QUAST results from SPAdes de novo assembly.' 
+ path_filters: + - './spades/quast/*' + - bcftools: + name: 'ASSEMBLY: BCFTools (SPAdes)' + anchor: 'bcftools_spades' + info: 'This section of the report shows BCFTools stats results for variants called in the SPAdes assembly relative to the reference.' + path_filters: + - './spades/bcftools/*' + - snpeff: + name: 'ASSEMBLY: SnpEff (SPAdes)' + anchor: 'snpeff_spades' + info: 'This section of the report shows SnpEff results for variants called in the SPAdes assembly relative to the reference.' + path_filters: + - './spades/snpeff/*' + - quast: + name: 'ASSEMBLY: QUAST (MetaSPAdes)' + anchor: 'quast_metaspades' + info: 'This section of the report shows QUAST results from MetaSPAdes de novo assembly.' + path_filters: + - './metaspades/quast/*' + - bcftools: + name: 'ASSEMBLY: BCFTools (MetaSPAdes)' + anchor: 'bcftools_metaspades' + info: 'This section of the report shows BCFTools stats results for variants called in the MetaSPAdes assembly relative to the reference.' + path_filters: + - './metaspades/bcftools/*' + - snpeff: + name: 'ASSEMBLY: SnpEff (MetaSPAdes)' + anchor: 'snpeff_metaspades' + info: 'This section of the report shows SnpEff results for variants called in the MetaSPAdes assembly relative to the reference.' + path_filters: + - './metaspades/snpeff/*' + - quast: + name: 'ASSEMBLY: QUAST (Unicycler)' + anchor: 'quast_unicycler' + info: 'This section of the report shows QUAST results from Unicycler de novo assembly.' + path_filters: + - './unicycler/quast/*' + - bcftools: + name: 'ASSEMBLY: BCFTools (Unicycler)' + anchor: 'bcftools_unicycler' + info: 'This section of the report shows BCFTools stats results for variants called in the Unicycler assembly relative to the reference.' + path_filters: + - './unicycler/bcftools/*' + - snpeff: + name: 'ASSEMBLY: SnpEff (Unicycler)' + anchor: 'snpeff_unicycler' + info: 'This section of the report shows SnpEff results for variants called in the Unicycler assembly relative to the reference.' 
+ path_filters: + - './unicycler/snpeff/*' + - quast: + name: 'ASSEMBLY: QUAST (minia)' + anchor: 'quast_minia' + info: 'This section of the report shows QUAST results from minia de novo assembly.' + path_filters: + - './minia/quast/*' + - bcftools: + name: 'ASSEMBLY: BCFTools (minia)' + anchor: 'bcftools_minia' + info: 'This section of the report shows BCFTools stats results for variants called in the minia assembly relative to the reference.' + path_filters: + - './minia/bcftools/*' + - snpeff: + name: 'ASSEMBLY: SnpEff (minia)' + anchor: 'snpeff_minia' + info: 'This section of the report shows SnpEff results for variants called in the minia assembly relative to the reference.' + path_filters: + - './minia/snpeff/*' + report_section_order: + summary_assembly_metrics: + before: summary_variants_metrics + ivar_variants: + before: quast_varscan2 software_versions: - order: -1000 - nf-core-viralrecon-summary: order: -1001 + nf-core-viralrecon-summary: + order: -1002 + +custom_plot_config: + picard_insert_size: + cpswitch_c_active: False + smooth_points: 1000 + +bcftools: + collapse_complementary_changes: true + +# See https://github.com/ewels/MultiQC_TestData/blob/master/data/custom_content/with_config/table_headerconfig/multiqc_config.yaml +custom_data: + summary_variants_metrics: + section_name: 'Variant calling metrics' + description: 'generated by the nf-core/viralrecon pipeline' + plot_type: 'table' + pconfig: + id: 'summary_variants_metrics_plot' + table_title: 'Variant calling metrics' + namespace: 'Variant calling metrics' + format: '{:.0f}' + summary_assembly_metrics: + section_name: 'De novo assembly metrics' + description: 'generated by the nf-core/viralrecon pipeline' + plot_type: 'table' + pconfig: + id: 'summary_assembly_metrics_plot' + table_title: 'De novo assembly metrics' + namespace: 'De novo assembly metrics' + format: '{:.0f}' -export_plots: true +extra_fn_clean_exts: + - '.trim' + - '.bowtie2' + - '.mkD' + - '.ptrim' + - '.highfreq' + - 
'.lowfreq' + - '.consensus' + - '.snpEff' + - '.scaffolds' + - '.kraken2' + - type: regex + pattern: '.(AF|k)[0-9]+.*' diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py new file mode 100755 index 00000000..55234c47 --- /dev/null +++ b/bin/check_samplesheet.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python + +import os +import sys +import errno +import argparse + +def parse_args(args=None): + Description = 'Reformat nf-core/viralrecon samplesheet file and check its contents.' + Epilog = """Example usage: python check_samplesheet.py """ + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument('FILE_IN', help="Input samplesheet file.") + parser.add_argument('FILE_OUT', help="Output file.") + return parser.parse_args(args) + + +def make_dir(path): + if not len(path) == 0: + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + + +def print_error(error,line): + print("ERROR: Please check samplesheet -> {}\nLine: '{}'".format(error,line.strip())) + sys.exit(1) + + +def check_samplesheet(FileIn,FileOut): + ## Check header + HEADER = ['sample', 'fastq_1', 'fastq_2'] + fin = open(FileIn,'r') + header = fin.readline().strip().split(',') + if header != HEADER: + print("ERROR: Please check samplesheet header -> {} != {}".format(','.join(header),','.join(HEADER))) + sys.exit(1) + + sampleRunDict = {} + while True: + line = fin.readline() + if line: + lspl = [x.strip() for x in line.strip().split(',')] + + ## Check valid number of columns per row + if len(lspl) != len(header): + print_error("Invalid number of columns (minimum = {})!".format(len(header)),line) + + numCols = len([x for x in lspl if x]) + if numCols < 2: + print_error("Invalid number of populated columns (minimum = 2)!",line) + + ## Check sample name entries + sample,fastQFiles = lspl[0],lspl[1:] + if sample: + if sample.find(' ') != -1: + print_error("Sample entry contains spaces!",line) + else: + 
print_error("Sample entry has not been specified!",line) + + ## Check FastQ file extension + for fastq in fastQFiles: + if fastq: + if fastq.find(' ') != -1: + print_error("FastQ file contains spaces!",line) + if fastq[-9:] != '.fastq.gz' and fastq[-6:] != '.fq.gz': + print_error("FastQ file does not have extension '.fastq.gz' or '.fq.gz'!",line) + + ## Auto-detect paired-end/single-end + sample_info = [] ## [single_end, is_sra, is_ftp, fastq_1, fastq_2, md5_1, md5_2] + fastq_1,fastq_2 = fastQFiles + if sample and fastq_1 and fastq_2: ## Paired-end short reads + sample_info = ['0', '0', '0', fastq_1, fastq_2, '', ''] + elif sample and fastq_1 and not fastq_2: ## Single-end short reads + sample_info = ['1', '0', '0', fastq_1, fastq_2, '', ''] + else: + print_error("Invalid combination of columns provided!",line) + + if sample not in sampleRunDict: + sampleRunDict[sample] = [sample_info] + else: + if sample_info in sampleRunDict[sample]: + print_error("Samplesheet contains duplicate rows!",line) + else: + sampleRunDict[sample].append(sample_info) + else: + fin.close() + break + + ## Write validated samplesheet with appropriate columns + if len(sampleRunDict) > 0: + OutDir = os.path.dirname(FileOut) + make_dir(OutDir) + fout = open(FileOut,'w') + fout.write(','.join(['sample_id', 'single_end', 'is_sra', 'is_ftp', 'fastq_1', 'fastq_2', 'md5_1', 'md5_2']) + '\n') + for sample in sorted(sampleRunDict.keys()): + + ## Check that multiple runs of the same sample are of the same datatype + if not all(x[:2] == sampleRunDict[sample][0][:2] for x in sampleRunDict[sample]): + print_error("Multiple runs of a sample must be of the same datatype","Sample: {}".format(sample)) + + for idx,val in enumerate(sampleRunDict[sample]): + fout.write(','.join(["{}_T{}".format(sample,idx+1)] + val) + '\n') + fout.close() + + +def main(args=None): + args = parse_args(args) + check_samplesheet(args.FILE_IN,args.FILE_OUT) + + +if __name__ == '__main__': + sys.exit(main()) diff --git 
a/bin/fetch_sra_runinfo.py b/bin/fetch_sra_runinfo.py new file mode 100755 index 00000000..869e0d5f --- /dev/null +++ b/bin/fetch_sra_runinfo.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python + +import os +import re +import sys +import csv +import errno +import requests +import argparse + + +## Example ids supported by this script +SRA_IDS = ['PRJNA63463', 'SAMN00765663', 'SRA023522', 'SRP003255', 'SRR390278', 'SRS282569', 'SRX111814'] +ENA_IDS = ['ERA2421642', 'ERP120836', 'ERR674736', 'ERS4399631', 'ERX629702', 'PRJEB7743', 'SAMEA3121481'] +GEO_IDS = ['GSE18729', 'GSM465244'] +ID_REGEX = r'^[A-Z]+' +PREFIX_LIST = sorted(list(set([re.search(ID_REGEX,x).group() for x in SRA_IDS + ENA_IDS + GEO_IDS]))) + + +def parse_args(args=None): + Description = 'Download and create a run information metadata file from SRA/ENA/GEO identifiers.' + Epilog = """Example usage: python fetch_sra_runinfo.py """ + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument('FILE_IN', help="File containing database identifiers, one per line.") + parser.add_argument('FILE_OUT', help="Output file in tab-delimited format.") + parser.add_argument('-pl', '--platform', type=str, dest="PLATFORM", default='', help="Comma-separated list of platforms to use for filtering. Accepted values = 'ILLUMINA', 'OXFORD_NANOPORE' (default: '').") + parser.add_argument('-ll', '--library_layout', type=str, dest="LIBRARY_LAYOUT", default='', help="Comma-separated list of library layouts to use for filtering. 
Accepted values = 'SINGLE', 'PAIRED' (default: '').") + return parser.parse_args(args) + + +def validate_csv_param(param,validVals,param_desc): + validList = [] + if param: + userVals = param.split(',') + intersect = list(set(userVals) & set(validVals)) + if len(intersect) == len(userVals): + validList = intersect + else: + print("ERROR: Please provide a valid {} parameter!\nProvided values = {}\nAccepted values = {}".format(param_desc,param,','.join(validVals))) + sys.exit(1) + return validList + + +def make_dir(path): + if not len(path) == 0: + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + + +def fetch_url(url,encoding='utf-8'): + try: + r = requests.get(url) + except requests.exceptions.RequestException as e: + raise SystemExit(e) + if r.status_code != 200: + print("ERROR: Connection failed\nError code '{}'".format(r.status_code)) + sys.exit(1) + return r.content.decode(encoding).splitlines() + + +def id_to_srx(db_id): + ids = [] + url = 'https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?save=efetch&db=sra&rettype=runinfo&term={}'.format(db_id) + for row in csv.DictReader(fetch_url(url), delimiter=','): + ids.append(row['Experiment']) + return ids + + +def id_to_erx(db_id): + ids = [] + url = 'http://www.ebi.ac.uk/ena/data/warehouse/filereport?accession={}&result=read_run'.format(db_id) + for row in csv.DictReader(fetch_url(url), delimiter='\t'): + ids.append(row['experiment_accession']) + return ids + + +def gse_to_srx(db_id): + ids = [] + url = 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={}&targ=gsm&view=data&form=text'.format(db_id) + gsm_ids = [x.split('=')[1].strip() for x in fetch_url(url) if x.find('GSM') != -1] + for gsm_id in gsm_ids: + ids += id_to_srx(gsm_id) + return ids + + +def fetch_sra_runinfo(FileIn,FileOut,platformList=[],libraryLayoutList=[]): + total_out = 0 + seen_ids = []; run_ids = [] + header = [] + make_dir(os.path.dirname(FileOut)) + fin = open(FileIn,'r') + fout = 
open(FileOut,'w') + while True: + line = fin.readline() + if line: + db_id = line.strip() + match = re.search(ID_REGEX, db_id) + if match: + prefix = match.group() + if prefix in PREFIX_LIST: + if not db_id in seen_ids: + + ids = [db_id] + ## Resolve/expand these ids against GEO URL + if prefix in ['GSE']: + ids = gse_to_srx(db_id) + + ## Resolve/expand these ids against SRA URL + elif prefix in ['GSM', 'PRJNA', 'SAMN', 'SRR']: + ids = id_to_srx(db_id) + + ## Resolve/expand these ids against ENA URL + elif prefix in ['ERR']: + ids = id_to_erx(db_id) + + ## Resolve/expand to get run identifier from ENA and write to file + for id in ids: + url = 'http://www.ebi.ac.uk/ena/data/warehouse/filereport?accession={}&result=read_run'.format(id) + csv_dict = csv.DictReader(fetch_url(url), delimiter='\t') + for row in csv_dict: + run_id = row['run_accession'] + if not run_id in run_ids: + + writeID = True + if platformList: + if row['instrument_platform'] not in platformList: + writeID = False + if libraryLayoutList: + if row['library_layout'] not in libraryLayoutList: + writeID = False + + if writeID: + if total_out == 0: + header = sorted(row.keys()) + fout.write('{}\n'.format('\t'.join(sorted(header)))) + else: + if header != sorted(row.keys()): + print("ERROR: Metadata columns do not match for id {}!\nLine: '{}'".format(run_id,line.strip())) + sys.exit(1) + fout.write('{}\n'.format('\t'.join([row[x] for x in header]))) + total_out += 1 + run_ids.append(run_id) + seen_ids.append(db_id) + else: + id_str = ', '.join([x + "*" for x in PREFIX_LIST]) + print("ERROR: Please provide a valid database id starting with {}!\nLine: '{}'".format(id_str,line.strip())) + sys.exit(1) + else: + id_str = ', '.join([x + "*" for x in PREFIX_LIST]) + print("ERROR: Please provide a valid database id starting with {}!\nLine: '{}'".format(id_str,line.strip())) + sys.exit(1) + else: + break + fin.close() + fout.close() + + +def main(args=None): + args = parse_args(args) + platformList = 
validate_csv_param(args.PLATFORM,validVals=['ILLUMINA', 'OXFORD_NANOPORE'],param_desc='--platform') + libraryLayoutList = validate_csv_param(args.LIBRARY_LAYOUT,validVals=['SINGLE', 'PAIRED'],param_desc='--library_layout') + fetch_sra_runinfo(args.FILE_IN,args.FILE_OUT,platformList,libraryLayoutList) + +if __name__ == '__main__': + sys.exit(main()) diff --git a/bin/ivar_variants_to_vcf.py b/bin/ivar_variants_to_vcf.py new file mode 100755 index 00000000..4860266f --- /dev/null +++ b/bin/ivar_variants_to_vcf.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python +import os +import sys +import re +import errno +import argparse + +def parse_args(args=None): + Description = 'Convert iVar variants tsv file to vcf format.' + Epilog = """Example usage: python ivar_variants_to_vcf.py """ + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument('FILE_IN', help="Input tsv file.") + parser.add_argument('FILE_OUT', help="Full path to output vcf file.") + parser.add_argument('-po', '--pass_only', dest="PASS_ONLY", help="Only output variants that PASS all filters.",action='store_true') + parser.add_argument('-ma', '--min_allele_freq', type=float, dest="MIN_ALLELE_FREQ", default=0, help="Only output variants where allele frequency greater than this number (default: 0).") + + return parser.parse_args(args) + +def make_dir(path): + if not len(path) == 0: + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + +def ivar_variants_to_vcf(FileIn,FileOut,passOnly=False,minAF=0): + filename = os.path.splitext(FileIn)[0] + header = ('##fileformat=VCFv4.2\n' + '##source=iVar\n' + '##INFO=\n' + '##FILTER=\n' + '##FILTER= 0.05">\n' + '##FORMAT=\n' + '##FORMAT=\n' + '##FORMAT=\n' + '##FORMAT=\n' + '##FORMAT=\n' + '##FORMAT=\n' + '##FORMAT=\n' + '##FORMAT=\n') + header += '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+filename+'\n' + + varList = [] + varCountDict = {'SNP':0, 'INS':0, 'DEL':0} + OutDir = 
os.path.dirname(FileOut) + make_dir(OutDir) + fout = open(FileOut,'w') + fout.write(header) + with open(FileIn) as f: + for line in f: + if not re.match("REGION",line): + line = re.split("\t", line) + CHROM=line[0] + POS=line[1] + ID='.' + REF=line[2] + ALT=line[3] + var_type = 'SNP' + if ALT[0] == '+': + ALT = REF + ALT[1:] + var_type = 'INS' + elif ALT[0] == '-': + REF += ALT[1:] + ALT = line[2] + var_type = 'DEL' + QUAL='.' + pass_test=line[13] + if pass_test == 'TRUE': + FILTER='PASS' + else: + FILTER='FAIL' + INFO='DP='+line[11] + FORMAT='GT:REF_DP:REF_RV:REF_QUAL:ALT_DP:ALT_RV:ALT_QUAL:ALT_FREQ' + SAMPLE='1:'+line[4]+':'+line[5]+':'+line[6]+':'+line[7]+':'+line[8]+':'+line[9]+':'+line[10] + oline = CHROM+'\t'+POS+'\t'+ID+'\t'+REF+'\t'+ALT+'\t'+QUAL+'\t'+FILTER+'\t'+INFO+'\t'+FORMAT+'\t'+SAMPLE+'\n' + writeLine = True + if passOnly and FILTER != 'PASS': + writeLine = False + if float(line[10]) < minAF: + writeLine = False + if (CHROM,POS,REF,ALT) in varList: + writeLine = False + else: + varList.append((CHROM,POS,REF,ALT)) + if writeLine: + varCountDict[var_type] += 1 + fout.write(oline) + fout.close() + + ## Print variant counts to pass to MultiQC + varCountList = [(k, str(v)) for k, v in sorted(varCountDict.items())] + print('\t'.join(['sample'] + [x[0] for x in varCountList])) + print('\t'.join([filename] + [x[1] for x in varCountList])) + +def main(args=None): + args = parse_args(args) + ivar_variants_to_vcf(args.FILE_IN,args.FILE_OUT,args.PASS_ONLY,args.MIN_ALLELE_FREQ) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/bin/multiqc_to_custom_tsv.py b/bin/multiqc_to_custom_tsv.py new file mode 100755 index 00000000..8afae320 --- /dev/null +++ b/bin/multiqc_to_custom_tsv.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python + +import os +import sys +import errno +import argparse +import yaml + + +def parse_args(args=None): + Description = 'Create custom spreadsheet for pertinent MultiQC metrics generated by the nf-core/viralrecon pipeline.' 
+ Epilog = "Example usage: python multiqc_to_custom_tsv.py" + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument('-md', '--multiqc_data_dir', type=str, dest="MULTIQC_DATA_DIR", default='multiqc_data', help="Full path to directory containing YAML files for each module, as generated by MultiQC. (default: 'multiqc_data').") + parser.add_argument('-op', '--out_prefix', type=str, dest="OUT_PREFIX", default='summary', help="Full path to output prefix (default: 'summary').") + return parser.parse_args(args) + + +def make_dir(path): + if not len(path) == 0: + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + + +# Find key in dictionary created from YAML file recursively +# From https://stackoverflow.com/a/37626981 +def find_tag(d, tag): + if tag in d: + yield d[tag] + for k,v in d.items(): + if isinstance(v, dict): + for i in find_tag(v, tag): + yield i + + +def yaml_fields_to_dict(YAMLFile,AppendDict={},FieldMappingList=[],ValidSampleList=[]): + intFields = ['number_of_SNPs', 'number_of_indels', 'MISSENSE', + '# contigs (>= 0 bp)', '# contigs (>= 5000 bp)', 'Largest contig'] + with open(YAMLFile) as f: + yaml_dict = yaml.safe_load(f) + for k in yaml_dict.keys(): + key = k + if os.path.basename(YAMLFile).startswith('multiqc_picard_insertSize'): + key = k[:-3] + if os.path.basename(YAMLFile).startswith('multiqc_cutadapt'): + names = [x for x in ValidSampleList if key.startswith(x)] + if names != []: + key = names[0] + inclSample = True + if len(ValidSampleList) != 0 and key not in ValidSampleList: + inclSample = False + if inclSample: + if key not in AppendDict: + AppendDict[key] = {} + if FieldMappingList != []: + for i,j in FieldMappingList: + val = list(find_tag(yaml_dict[k], j[0])) + ## Fix for Cutadapt reporting reads/pairs as separate values + if j[0] == 'r_written' and len(val) == 0: + val = [list(find_tag(yaml_dict[k], 'pairs_written'))[0] * 2] + if len(val) != 0: + val 
= val[0] + if len(j) == 2: + val = list(find_tag(val, j[1]))[0] + if j[0] in intFields: + val = int(val) + if i not in AppendDict[key]: + AppendDict[key][i] = val + else: + print('WARNING: {} key already exists in dictionary so will be overwritten. YAML file {}.'.format(i,YAMLFile)) + else: + AppendDict[key] = yaml_dict[k] + return AppendDict + + +def metrics_dict_to_file(FileFieldList,MultiQCDataDir,OutFile,ValidSampleList=[]): + MetricsDict = {} + FieldList = [] + for yamlFile,mappingList in FileFieldList: + yamlFile = os.path.join(MultiQCDataDir,yamlFile) + if os.path.exists(yamlFile): + MetricsDict = yaml_fields_to_dict(YAMLFile=yamlFile,AppendDict=MetricsDict,FieldMappingList=mappingList,ValidSampleList=ValidSampleList) + FieldList += [x[0] for x in mappingList] + else: + print('WARNING: File does not exist: {}'.format(yamlFile)) + + if MetricsDict != {}: + make_dir(os.path.dirname(OutFile)) + fout = open(OutFile,'w') + header = ['Sample'] + FieldList + fout.write('{}\n'.format('\t'.join(header))) + for k in sorted(MetricsDict.keys()): + rowList = [k] + for field in FieldList: + if field in MetricsDict[k]: + rowList.append(MetricsDict[k][field]) + else: + rowList.append('NA') + fout.write('{}\n'.format('\t'.join(map(str,rowList)))) + fout.close() + return MetricsDict + + +def main(args=None): + args = parse_args(args) + + ## File names for MultiQC YAML along with fields to fetch from each file + VariantFileFieldList = [ + ('multiqc_fastp.yaml', [('# Input reads', ['before_filtering','total_reads']), + ('# Trimmed reads (fastp)', ['after_filtering','total_reads'])]), + ('multiqc_samtools_flagstat_samtools_bowtie2.yaml', [('% Mapped reads (viral)', ['mapped_passed_pct'])]), + ('multiqc_samtools_flagstat_samtools_ivar.yaml', [('# Trimmed reads (iVar)', ['flagstat_total'])]), + ('multiqc_samtools_flagstat_samtools_markduplicates.yaml', [('# Duplicate reads', ['duplicates_passed']), + ('# Reads after MarkDuplicates', ['flagstat_total'])]), + 
('multiqc_picard_insertSize.yaml', [('Insert size mean', ['MEAN_INSERT_SIZE']), + ('Insert size std dev', ['STANDARD_DEVIATION'])]), + ('multiqc_picard_wgsmetrics.yaml', [('Coverage mean', ['MEAN_COVERAGE']), + ('Coverage std dev', ['SD_COVERAGE']), + ('% Coverage > 10x', ['PCT_10X'])]), + ('multiqc_bcftools_stats_bcftools_varscan2.yaml', [('# High conf SNPs (VarScan 2)', ['number_of_SNPs']), + ('# High conf INDELs (VarScan 2)', ['number_of_indels'])]), + ('multiqc_bcftools_stats_bcftools_ivar.yaml', [('# High conf SNPs (iVar)', ['number_of_SNPs']), + ('# High conf INDELs (iVar)', ['number_of_indels'])]), + ('multiqc_bcftools_stats_bcftools_bcftools.yaml', [('# High conf SNPs (BCFTools)', ['number_of_SNPs']), + ('# High conf INDELs (BCFTools)', ['number_of_indels'])]), + ('multiqc_snpeff_snpeff_varscan2.yaml', [('# Missense variants (VarScan 2)', ['MISSENSE'])]), + ('multiqc_snpeff_snpeff_ivar.yaml', [('# Missense variants (iVar)', ['MISSENSE'])]), + ('multiqc_snpeff_snpeff_bcftools.yaml', [('# Missense variants (BCFTools)', ['MISSENSE'])]), + ('multiqc_quast_quast_varscan2.yaml', [('# Ns per 100kb consensus (VarScan 2)', ["# N's per 100 kbp"])]), + ('multiqc_quast_quast_ivar.yaml', [('# Ns per 100kb consensus (iVar)', ["# N's per 100 kbp"])]), + ('multiqc_quast_quast_bcftools.yaml', [('# Ns per 100kb consensus (BCFTools)', ["# N's per 100 kbp"])]), + ] + + AssemblyFileFieldList = [ + ('multiqc_fastp.yaml', [('# Input reads', ['before_filtering','total_reads'])]), + ('multiqc_cutadapt.yaml', [('# Trimmed reads (Cutadapt)', ['r_written'])]), + ('multiqc_general_stats.yaml', [('% Non-host reads (Kraken 2)', ['ASSEMBLY: Kraken 2_mqc-generalstats-assembly_kraken_2-Unclassified'])]), + ('multiqc_quast_quast_spades.yaml', [('# Contigs (SPAdes)', ['# contigs (>= 0 bp)']), + ('Largest contig (SPAdes)', ['Largest contig']), + ('% Genome fraction (SPAdes)', ['Genome fraction (%)']), + ('N50 (SPAdes)', ['N50'])]), + ('multiqc_quast_quast_metaspades.yaml', [('# Contigs 
(metaSPAdes)', ['# contigs (>= 0 bp)']), + ('Largest contig (metaSPAdes)', ['Largest contig']), + ('% Genome fraction (metaSPAdes)', ['Genome fraction (%)']), + ('N50 (metaSPAdes)', ['N50'])]), + ('multiqc_quast_quast_unicycler.yaml', [('# Contigs (Unicycler)', ['# contigs (>= 0 bp)']), + ('Largest contig (Unicycler)', ['Largest contig']), + ('% Genome fraction (Unicycler)', ['Genome fraction (%)']), + ('N50 (Unicycler)', ['N50'])]), + ('multiqc_quast_quast_minia.yaml', [('# Contigs (minia)', ['# contigs (>= 0 bp)']), + ('Largest contig (minia)', ['Largest contig']), + ('% Genome fraction (minia)', ['Genome fraction (%)']), + ('N50 (minia)', ['N50'])]), + ('multiqc_bcftools_stats_bcftools_spades.yaml', [('# SNPs (SPAdes)', ['number_of_SNPs']), + ('# INDELs (SPAdes)', ['number_of_indels'])]), + ('multiqc_bcftools_stats_bcftools_metaspades.yaml', [('# SNPs (metaSPAdes)', ['number_of_SNPs']), + ('# INDELs (metaSPAdes)', ['number_of_indels'])]), + ('multiqc_bcftools_stats_bcftools_unicycler.yaml', [('# SNPs (Unicycler)', ['number_of_SNPs']), + ('# INDELs (Unicycler)', ['number_of_indels'])]), + ('multiqc_bcftools_stats_bcftools_minia.yaml', [('# SNPs (minia)', ['number_of_SNPs']), + ('# INDELs (minia)', ['number_of_indels'])]), + ('multiqc_snpeff_snpeff_spades.yaml', [('# Missense variants (SPAdes)', ['MISSENSE'])]), + ('multiqc_snpeff_snpeff_metaspades.yaml', [('# Missense variants (metaSPAdes)', ['MISSENSE'])]), + ('multiqc_snpeff_snpeff_unicycler.yaml', [('# Missense variants (Unicycler)', ['MISSENSE'])]), + ('multiqc_snpeff_snpeff_minia.yaml', [('# Missense variants (minia)', ['MISSENSE'])]) + ] + + ## Dictionary of samples being single-end/paired-end + isPEDict = {} + yamlFile = os.path.join(args.MULTIQC_DATA_DIR,'multiqc_fastp.yaml') + if os.path.exists(yamlFile): + MetricsDict = yaml_fields_to_dict(YAMLFile=yamlFile,AppendDict={},FieldMappingList=[('command', ['command'])],ValidSampleList=[]) + for sample,val in MetricsDict.items(): + if 
MetricsDict[sample]['command'].find('--out2') != -1: + isPEDict[sample] = True + else: + isPEDict[sample] = False + + ## Write variant calling metrics to file + metrics_dict_to_file(FileFieldList=VariantFileFieldList, + MultiQCDataDir=args.MULTIQC_DATA_DIR, + OutFile=args.OUT_PREFIX+'_variants_metrics_mqc.tsv', + ValidSampleList=isPEDict.keys()) + + ## Write de novo assembly metrics to file + metrics_dict_to_file(FileFieldList=AssemblyFileFieldList, + MultiQCDataDir=args.MULTIQC_DATA_DIR, + OutFile=args.OUT_PREFIX+'_assembly_metrics_mqc.tsv', + ValidSampleList=isPEDict.keys()) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index eea61a9a..574e6040 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -3,17 +3,62 @@ from collections import OrderedDict import re -# TODO nf-core: Add additional regexes for new tools in process get_software_versions regexes = { 'nf-core/viralrecon': ['v_pipeline.txt', r"(\S+)"], 'Nextflow': ['v_nextflow.txt', r"(\S+)"], - 'FastQC': ['v_fastqc.txt', r"FastQC v(\S+)"], - 'MultiQC': ['v_multiqc.txt', r"multiqc, version (\S+)"], + 'parallel-fastq-dump': ['v_parallel_fastq_dump.txt', r"parallel-fastq-dump\s:\s(\S+)"], + 'FastQC': ['v_fastqc.txt', r"FastQC\sv(\S+)"], + 'fastp': ['v_fastp.txt', r"fastp\s(\S+)"], + 'Bowtie 2': ['v_bowtie2.txt', r"bowtie2-align-s\sversion\s(\S+)"], + 'Samtools': ['v_samtools.txt', r"samtools\s(\S+)"], + 'BEDTools': ['v_bedtools.txt', r"bedtools\sv(\S+)"], + 'Picard': ['v_picard.txt', r"\n(\S+)"], + 'iVar': ['v_ivar.txt', r"iVar\sversion\s(\S+)"], + 'VarScan 2': ['v_varscan.txt', r"VarScan\sv(\S+)"], + 'SnpEff': ['v_snpeff.txt', r"SnpEff\s(\S+)"], + 'SnpSift': ['v_snpsift.txt', r"SnpSift\sversion\s(\S+)"], + 'BCFTools': ['v_bcftools.txt', r"bcftools\s(\S+)"], + 'Cutadapt': ['v_cutadapt.txt', r"(\S+)"], + 'Kraken2': ['v_kraken2.txt', r"Kraken\sversion\s(\S+)"], + 'SPAdes': ['v_spades.txt', 
r"SPAdes\sgenome\sassembler\sv(\S+)"], + 'Unicycler': ['v_unicycler.txt', r"Unicycler\sv(\S+)"], + 'minia': ['v_minia.txt', r"Minia\sversion\s(\S+)"], + 'Minimap2': ['v_minimap2.txt', r"(\S+)"], + 'vg': ['v_vg.txt', r"vg\sversion\sv(\S+)"], + 'BLAST': ['v_blast.txt', r"blastn:\s(\S+)"], + 'ABACAS': ['v_abacas.txt', r"ABACAS.(\S+)"], + 'QUAST': ['v_quast.txt', r"QUAST\sv(\S+)"], + 'Bandage': ['v_bandage.txt', r"Version:\s(\S+)"], + 'R': ['v_R.txt', r"R\sversion\s(\S+)"], + 'MultiQC': ['v_multiqc.txt', r"multiqc,\sversion\s(\S+)"] } results = OrderedDict() results['nf-core/viralrecon'] = 'N/A' results['Nextflow'] = 'N/A' +results['parallel-fastq-dump'] = 'N/A' results['FastQC'] = 'N/A' +results['fastp'] = 'N/A' +results['Bowtie 2'] = 'N/A' +results['Samtools'] = 'N/A' +results['BEDTools'] = 'N/A' +results['Picard'] = 'N/A' +results['iVar'] = 'N/A' +results['VarScan 2'] = 'N/A' +results['SnpEff'] = 'N/A' +results['SnpSift'] = 'N/A' +results['BCFTools'] = 'N/A' +results['Cutadapt'] = 'N/A' +results['Kraken2'] = 'N/A' +results['SPAdes'] = 'N/A' +results['Unicycler'] = 'N/A' +results['minia'] = 'N/A' +results['Minimap2'] = 'N/A' +results['vg'] = 'N/A' +results['BLAST'] = 'N/A' +results['ABACAS'] = 'N/A' +results['QUAST'] = 'N/A' +results['Bandage'] = 'N/A' +results['R'] = 'N/A' results['MultiQC'] = 'N/A' # Search each file using its regex diff --git a/bin/sra_runinfo_to_samplesheet.py b/bin/sra_runinfo_to_samplesheet.py new file mode 100755 index 00000000..77c9eded --- /dev/null +++ b/bin/sra_runinfo_to_samplesheet.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python + +import os +import sys +import errno +import argparse + +def parse_args(args=None): + Description = "Create valid nf-core/viralrecon samplesheet file from output of 'fetch_sra_runinfo.py' script." 
+ Epilog = """Example usage: python sra_runinfo_to_samplesheet.py """ + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument('FILE_IN', help="Input metadata file created from 'fetch_sra_runinfo.py' script.") + parser.add_argument('FILE_OUT', help="Output file.") + return parser.parse_args(args) + + +def make_dir(path): + if not len(path) == 0: + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + + +def sra_runinfo_to_samplesheet(FileIn,FileOut): + + sampleRunDict = {} + fin = open(FileIn,'r') + header = fin.readline().strip().split('\t') + while True: + line = fin.readline() + if line: + line_dict = dict(zip(header,line.strip().split('\t'))) + run_id = line_dict['run_accession'] + exp_id = line_dict['experiment_accession'] + library = line_dict['library_layout'] + fastq_files = line_dict['fastq_ftp'] + fastq_md5 = line_dict['fastq_md5'] + + db_id = exp_id + sample_info = [] ## [single_end, is_sra, is_ftp, fastq_1, fastq_2, md5_1, md5_2] + if library == 'SINGLE': + if fastq_files: + sample_info = ['1', '1', '1', fastq_files , '', fastq_md5, ''] + else: + db_id = run_id + sample_info = ['1', '1', '0', '', '', '', ''] + elif library == 'PAIRED': + if fastq_files: + fq_files = fastq_files.split(';')[-2:] + if fq_files[0].find('_1.fastq.gz') != -1 and fq_files[1].find('_2.fastq.gz') != -1: + sample_info = ['0', '1', '1'] + fq_files + fastq_md5.split(';')[-2:] + else: + print("Invalid FastQ files found for database id:'{}'!.".format(run_id)) + else: + db_id = run_id + sample_info = ['0', '1', '0', '', '', '', ''] + + if sample_info: + if db_id not in sampleRunDict: + sampleRunDict[db_id] = [sample_info] + else: + if sample_info in sampleRunDict[db_id]: + print("Input run info file contains duplicate rows!\nLine: '{}'".format(line)) + else: + sampleRunDict[db_id].append(sample_info) + else: + break + fin.close() + + ## Write samplesheet with appropriate columns + if 
len(sampleRunDict) != 0: + OutDir = os.path.dirname(FileOut) + make_dir(OutDir) + fout = open(FileOut,'w') + fout.write(','.join(['sample_id', 'single_end', 'is_sra', 'is_ftp', 'fastq_1', 'fastq_2', 'md5_1', 'md5_2']) + '\n') + for db_id in sorted(sampleRunDict.keys()): + for idx,val in enumerate(sampleRunDict[db_id]): + fout.write(','.join(["{}_T{}".format(db_id,idx+1)] + val) + '\n') + fout.close() + + +def main(args=None): + args = parse_args(args) + sra_runinfo_to_samplesheet(args.FILE_IN,args.FILE_OUT) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/conf/base.config b/conf/base.config index 6a19bfab..031dc275 100644 --- a/conf/base.config +++ b/conf/base.config @@ -11,7 +11,6 @@ process { - // TODO nf-core: Check the defaults for all processes cpus = { check_max( 1 * task.attempt, 'cpus' ) } memory = { check_max( 7.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } @@ -20,12 +19,7 @@ process { maxRetries = 1 maxErrors = '-1' - // Process-specific resource requirements - // NOTE - Only one of the labels below are used in the fastqc process in the main script. - // If possible, it would be nice to keep the same label naming convention when - // adding in your processes. - // TODO nf-core: Customise requirements for specific processes. 
- // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors + // Groupable resource requirements for processes withLabel:process_low { cpus = { check_max( 2 * task.attempt, 'cpus' ) } memory = { check_max( 14.GB * task.attempt, 'memory' ) } @@ -34,18 +28,25 @@ process { withLabel:process_medium { cpus = { check_max( 6 * task.attempt, 'cpus' ) } memory = { check_max( 42.GB * task.attempt, 'memory' ) } - time = { check_max( 8.h * task.attempt, 'time' ) } + time = { check_max( 12.h * task.attempt, 'time' ) } } withLabel:process_high { cpus = { check_max( 12 * task.attempt, 'cpus' ) } memory = { check_max( 84.GB * task.attempt, 'memory' ) } - time = { check_max( 10.h * task.attempt, 'time' ) } + time = { check_max( 24.h * task.attempt, 'time' ) } } withLabel:process_long { time = { check_max( 20.h * task.attempt, 'time' ) } } + withLabel:error_retry { + errorStrategy = 'retry' + maxRetries = 2 + } + withLabel:error_ignore { + errorStrategy = 'ignore' + } withName:get_software_versions { cache = false } - + } diff --git a/conf/igenomes.config b/conf/igenomes.config deleted file mode 100644 index 2de92422..00000000 --- a/conf/igenomes.config +++ /dev/null @@ -1,420 +0,0 @@ -/* - * ------------------------------------------------- - * Nextflow config file for iGenomes paths - * ------------------------------------------------- - * Defines reference genomes, using iGenome paths - * Can be used by any config that customises the base - * path using $params.igenomes_base / --igenomes_base - */ - -params { - // illumina iGenomes reference file paths - genomes { - 'GRCh37' { - fasta = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/STARIndex/" - bismark = 
"${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/README.txt" - mito_name = "MT" - macs_gsize = "2.7e9" - blacklist = "${baseDir}/assets/blacklists/GRCh37-blacklist.bed" - } - 'GRCh38' { - fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "2.7e9" - blacklist = "${baseDir}/assets/blacklists/hg38-blacklist.bed" - } - 'GRCm38' { - fasta = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/README.txt" - mito_name = "MT" - macs_gsize = "1.87e9" - blacklist = 
"${baseDir}/assets/blacklists/GRCm38-blacklist.bed" - } - 'TAIR10' { - fasta = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/README.txt" - mito_name = "Mt" - } - 'EB2' { - fasta = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/README.txt" - } - 'UMD3.1' { - fasta = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/Bowtie2Index/" - star = 
"${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/README.txt" - mito_name = "MT" - } - 'WBcel235' { - fasta = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes.bed" - mito_name = "MtDNA" - macs_gsize = "9e7" - } - 'CanFam3.1' { - fasta = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/Genes/genes.bed" - readme = 
"${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/README.txt" - mito_name = "MT" - } - 'GRCz10' { - fasta = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Annotation/Genes/genes.bed" - mito_name = "MT" - } - 'BDGP6' { - fasta = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Annotation/Genes/genes.bed" - mito_name = "M" - macs_gsize = "1.2e8" - } - 'EquCab2' { - fasta = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/STARIndex/" - bismark = 
"${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/README.txt" - mito_name = "MT" - } - 'EB1' { - fasta = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/README.txt" - } - 'Galgal4' { - fasta = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Annotation/Genes/genes.bed" - mito_name = "MT" - } - 'Gm01' { - fasta = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/WholeGenomeFasta/genome.fa" - 
bwa = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/README.txt" - } - 'Mmul_1' { - fasta = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/README.txt" - mito_name = "MT" - } - 'IRGSP-1.0' { - fasta = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Annotation/Genes/genes.gtf" - 
bed12 = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Annotation/Genes/genes.bed" - mito_name = "Mt" - } - 'CHIMP2.1.4' { - fasta = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/README.txt" - mito_name = "MT" - } - 'Rnor_6.0' { - fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/Genes/genes.bed" - mito_name = "MT" - } - 'R64-1-1' { - fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/Bowtie2Index/" - star = 
"${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Annotation/Genes/genes.bed" - mito_name = "MT" - macs_gsize = "1.2e7" - } - 'EF2' { - fasta = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/README.txt" - mito_name = "MT" - macs_gsize = "1.21e7" - } - 'Sbi1' { - fasta = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/Genes/genes.bed" - readme = 
"${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/README.txt" - } - 'Sscrofa10.2' { - fasta = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/README.txt" - mito_name = "MT" - } - 'AGPv3' { - fasta = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Annotation/Genes/genes.bed" - mito_name = "Mt" - } - 'hg38' { - fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/BismarkIndex/" - gtf = 
"${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "2.7e9" - blacklist = "${baseDir}/assets/blacklists/hg38-blacklist.bed" - } - 'hg19' { - fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "2.7e9" - blacklist = "${baseDir}/assets/blacklists/hg19-blacklist.bed" - } - 'mm10' { - fasta = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "1.87e9" - blacklist = "${baseDir}/assets/blacklists/mm10-blacklist.bed" - } - 'bosTau8' { - fasta = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/WholeGenomeFasta/genome.fa" - bwa = 
"${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Annotation/Genes/genes.bed" - mito_name = "chrM" - } - 'ce10' { - fasta = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "9e7" - } - 'canFam3' { - fasta = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/Genes/genes.bed" - 
readme = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/README.txt" - mito_name = "chrM" - } - 'danRer10' { - fasta = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Annotation/Genes/genes.bed" - mito_name = "chrM" - } - 'dm6' { - fasta = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "1.2e8" - } - 'equCab2' { - fasta = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/BismarkIndex/" - gtf = 
"${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/README.txt" - mito_name = "chrM" - } - 'galGal4' { - fasta = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/README.txt" - mito_name = "chrM" - } - 'panTro4' { - fasta = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/README.txt" - mito_name = "chrM" - } - 'rn6' { - fasta = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/BWAIndex/genome.fa" - 
bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Annotation/Genes/genes.bed" - mito_name = "chrM" - } - 'sacCer3' { - fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/BismarkIndex/" - readme = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "1.2e7" - } - 'susScr3' { - fasta = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/README.txt" - mito_name = "chrM" - } - } -} diff --git a/conf/test.config b/conf/test.config index 8f93ad62..146126a4 100644 --- a/conf/test.config +++ b/conf/test.config @@ -10,17 +10,19 @@ 
params { config_profile_name = 'Test profile' config_profile_description = 'Minimal test dataset to check pipeline function' + // Limit resources so that this can run on GitHub Actions max_cpus = 2 max_memory = 6.GB max_time = 48.h - // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - single_end = false - readPaths = [ - ['Testdata', ['https://github.com/nf-core/test-datasets/raw/exoseq/testdata/Testdata_R1.tiny.fastq.gz', 'https://github.com/nf-core/test-datasets/raw/exoseq/testdata/Testdata_R2.tiny.fastq.gz']], - ['SRR389222', ['https://github.com/nf-core/test-datasets/raw/methylseq/testdata/SRR389222_sub1.fastq.gz', 'https://github.com/nf-core/test-datasets/raw/methylseq/testdata/SRR389222_sub2.fastq.gz']] - ] + // Input data to test amplicon analysis + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_amplicon.csv' + protocol = 'amplicon' + amplicon_fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/genome/NC_045512.2/amplicon/nCoV-2019.artic.V1.primer.fasta' + amplicon_bed = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/genome/NC_045512.2/amplicon/nCoV-2019.artic.V1.bed' + + // Genome references + genome = 'NC_045512.2' + kraken2_db = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/genome/kraken2/kraken2_hs22.tar.gz' } diff --git a/conf/test_full.config b/conf/test_full.config new file mode 100644 index 00000000..38a5745c --- /dev/null +++ b/conf/test_full.config @@ -0,0 +1,22 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a full pipeline test. 
Use as follows: + * nextflow run nf-core/viralrecon -profile test_full, + */ + +params { + config_profile_name = 'Full test profile' + config_profile_description = 'Full test dataset to check pipeline function' + + // Input data for full test of amplicon analysis + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_full_amplicon.csv' + protocol = 'amplicon' + amplicon_fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/genome/NC_045512.2/amplicon/nCoV-2019.artic.V1.primer.fasta' + amplicon_bed = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/genome/NC_045512.2/amplicon/nCoV-2019.artic.V1.bed' + + // Genome references + genome = 'NC_045512.2' +} diff --git a/conf/test_full_sispa.config b/conf/test_full_sispa.config new file mode 100644 index 00000000..88f38d92 --- /dev/null +++ b/conf/test_full_sispa.config @@ -0,0 +1,21 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a full pipeline test. 
Use as follows: + * nextflow run nf-core/viralrecon -profile test_full_sispa, + */ + +params { + config_profile_name = 'Full test profile' + config_profile_description = 'Full test dataset to check pipeline function' + + // Input data for full test of SISPA/metagenomics analysis + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_full_sispa.csv' + protocol = 'metagenomic' + + // Genome references + genome = 'NC_045512.2' + +} diff --git a/conf/test_sispa.config b/conf/test_sispa.config new file mode 100644 index 00000000..4f2b01a5 --- /dev/null +++ b/conf/test_sispa.config @@ -0,0 +1,26 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test. Use as follows: + * nextflow run nf-core/viralrecon -profile test_sispa, + */ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h + + // Input data to test SISPA/metagenomics analysis + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_sispa.csv' + protocol = 'metagenomic' + + // Genome references + genome = 'MN908947.3' + kraken2_db = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/genome/kraken2/kraken2_hs22.tar.gz' +} diff --git a/conf/test_sra.config b/conf/test_sra.config new file mode 100644 index 00000000..f11012e3 --- /dev/null +++ b/conf/test_sra.config @@ -0,0 +1,27 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test. 
Use as follows: + * nextflow run nf-core/viralrecon -profile test_sra, + */ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h + + // Input data to test SRA functionality using SISPA/metagenomics data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_sra.csv' + protocol = 'metagenomic' + + // Genome references + genome = 'NC_045512.2' + kraken2_db = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/genome/kraken2/kraken2_hs22.tar.gz' + +} diff --git a/docs/README.md b/docs/README.md index 667cb548..843b40f6 100644 --- a/docs/README.md +++ b/docs/README.md @@ -6,7 +6,7 @@ The nf-core/viralrecon documentation is split into the following files: 2. Pipeline configuration * [Local installation](https://nf-co.re/usage/local_installation) * [Adding your own system config](https://nf-co.re/usage/adding_own_config) - * [Reference genomes](https://nf-co.re/usage/reference_genomes) + * [Reference genomes](usage.md#reference-genomes) 3. [Running the pipeline](usage.md) 4. [Output and how to interpret the results](output.md) 5. [Troubleshooting](https://nf-co.re/usage/troubleshooting) diff --git a/docs/html/multiqc_report.html b/docs/html/multiqc_report.html new file mode 100644 index 00000000..97033fec --- /dev/null +++ b/docs/html/multiqc_report.html @@ -0,0 +1,13316 @@ + + + + + + + + + + + + + +MultiQC Report + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+

+ + + + + + +

+ +

Loading report..

+ +
+ +
+
+ + + +
+ + + + +
+ + + + +
+

+ + Highlight Samples +

+ +

+ + This report has flat image plots that won't be highlighted.
+ See the documentation + for help. +

+ +
+ + + +
+

+ Regex mode off + + +

+
    +
    + + +
    +

    + + Rename Samples +

    + +

    + + This report has flat image plots that won't be renamed.
    + See the documentation + for help. +

    + +
    + + + +
    +

    Click here for bulk input.

    +
    +

    Paste two columns of a tab-delimited table here (eg. from Excel).

    +

    First column should be the old name, second column the new name.

    +
    + + +
    +
    +

    + Regex mode off + + +

    +
      +
      + + +
      +

      + + Show / Hide Samples +

      + +

      + + This report has flat image plots that won't be hidden.
      + See the documentation + for help. +

      + +
      +
      + +
      +
      + +
      +
      + + +
      +
      +

      Warning! This can take a few seconds.

      +

      + Regex mode off + + +

      +
        +
        + + +
        +

        Export Plots

        +
        + +
        +
        +
        +
        +
        + + px +
        +
        +
        +
        + + px +
        +
        +
        +
        +
        + +
        +
        + +
        +
        +
        +
        + +
        +
        +
        + + X +
        +
        +
        +
        + +
        +

        Download the raw data used to create the plots in this report below:

        +
        +
        + +
        +
        + +
        +
        + +

        Note that additional data was saved in multiqc_data when this report was generated.

        + +
        +
        +
        + +
        +
        Choose Plots
        + + +
        + +
        + +

        If you use plots from MultiQC in a publication or presentation, please cite:

        +
        + MultiQC: Summarize analysis results for multiple tools and samples in a single report
        + Philip Ewels, Måns Magnusson, Sverker Lundin and Max Käller
        + Bioinformatics (2016)
        + doi: 10.1093/bioinformatics/btw354
        + PMID: 27312411 +
        +
        +
        + + +
        +

        Save Settings

        +

        You can save the toolbox settings for this report to the browser.

        +
        + + +
        +
        + +

        Load Settings

        +

        Choose a saved report profile from the dropdown box below:

        +
        +
        + +
        +
        + + + + +
        +
        +
        + + +
        +

        About MultiQC

        +

        This report was generated using MultiQC, version 1.9

        +

        You can see a YouTube video describing how to use MultiQC reports here: + https://youtu.be/qPbIlO_KWN0

        +

        For more information about MultiQC, including other videos and + extensive documentation, please visit http://multiqc.info

        +

        You can report bugs, suggest improvements and find the source code for MultiQC on GitHub: + https://github.com/ewels/MultiQC

        +

        MultiQC is published in Bioinformatics:

        +
        + MultiQC: Summarize analysis results for multiple tools and samples in a single report
        + Philip Ewels, Måns Magnusson, Sverker Lundin and Max Käller
        + Bioinformatics (2016)
        + doi: 10.1093/bioinformatics/btw354
        + PMID: 27312411 +
        +
        + +
        + +
        + + +
        + + + +

        + + + + +

        + + + +

        + A modular tool to aggregate results from bioinformatics analyses across many samples into a single report. +

        + + + +
        This report has been generated by the nf-core/viralrecon analysis pipeline. For information about how to interpret these results, please see the documentation. +
        + + + + + + + + + +
        +

        Report + + generated on 2020-05-31, 21:33 + + + based on data in: + + nf-core/viralrecon/work/fd/3c8e92b3222a535ed19d3834f6d258

        + + +
        + + + + + + + +
        + + + + + + + + + + + + + +
        +

        Variant calling metrics

        +

        generated by the nf-core/viralrecon pipeline

        + + + + +
        + + + + +
        + + + + + + + + + Showing 2/2 rows and 23/23 columns. + +
        +
        + +
        Sample# Input reads# Trimmed reads (fastp)% Mapped reads (viral)# Trimmed reads (iVar)# Duplicate reads# Reads after MarkDuplicatesInsert size meanInsert size std devCoverage meanCoverage std dev% Coverage > 10x# High conf SNPs (VarScan 2)# High conf INDELs (VarScan 2)# High conf SNPs (iVar)# High conf INDELs (iVar)# High conf SNPs (BCFTools)# High conf INDELs (BCFTools)# Missense variants (VarScan 2)# Missense variants (iVar)# Missense variants (BCFTools)# Ns per 100kb consensus (VarScan 2)# Ns per 100kb consensus (iVar)# Ns per 100kb consensus (BCFTools)
        sample1
        2755026
        2384570
        100
        2371846
        2216597
        2371846
        523
        215
        1096
        479
        1
        6
        0
        6
        0
        6
        0
        2
        2
        6
        224
        167
        224
        sample2
        2139958
        1913910
        99
        1890837
        1816623
        1890837
        480
        177
        499
        312
        1
        6
        0
        7
        0
        7
        0
        4
        5
        5
        338
        292
        338
        + +
        + + +
        + + +
        +
        + + + +
        +

        De novo assembly metrics

        +

        generated by the nf-core/viralrecon pipeline

        + + + + +
        + + + + +
        + + + + + + + + + Showing 2/2 rows and 31/31 columns. + +
        +
        + +
        Sample# Input reads# Trimmed reads (Cutadapt)% Non-host reads (Kraken 2)# Contigs (SPAdes)Largest contig (SPAdes)% Genome fraction (SPAdes)N50 (SPAdes)# Contigs (metaSPAdes)Largest contig (metaSPAdes)% Genome fraction (metaSPAdes)N50 (metaSPAdes)# Contigs (Unicycler)Largest contig (Unicycler)% Genome fraction (Unicycler)N50 (Unicycler)# Contigs (minia)Largest contig (minia)% Genome fraction (minia)N50 (minia)# SNPs (SPAdes)# INDELs (SPAdes)# SNPs (metaSPAdes)# INDELs (metaSPAdes)# SNPs (Unicycler)# INDELs (Unicycler)# SNPs (minia)# INDELs (minia)# Missense variants (SPAdes)# Missense variants (metaSPAdes)# Missense variants (Unicycler)# Missense variants (minia)
        sample1
        2755026
        2384570
        100
        3576
        5513
        40
        774
        16
        29962
        100
        29962
        166
        5386
        38
        916
        29
        16188
        99
        15793
        435
        174
        6
        0
        3
        0
        8
        0
        344
        2
        1
        4
        sample2
        2139958
        1913910
        99
        800
        5513
        86
        1070
        17
        29919
        100
        29919
        30
        2400
        83
        1478
        29
        18084
        99
        16188
        88
        26
        7
        0
        7
        1
        6
        0
        64
        5
        5
        5
        + +
        + + +
        + + +
        +
        + + + +
        +

        PREPROCESS: FastQC (raw reads)

        +

        PREPROCESS: FastQC (raw reads) This section of the report shows FastQC results for the raw reads before adapter trimming.

        + + + + +
        + +

        + Sequence Counts + + + +

        + +

        Sequence counts for each sample. Duplicate read counts are an estimate only.

        + + +
        +

        This plot show the total number of reads, broken down into unique and duplicate +if possible (only more recent versions of FastQC give duplicate info).

        +

        You can read more about duplicate calculation in the +FastQC documentation. +A small part has been copied here for convenience:

        +

        Only sequences which first appear in the first 100,000 sequences +in each file are analysed. This should be enough to get a good impression +for the duplication levels in the whole file. Each sequence is tracked to +the end of the file to give a representative count of the overall duplication level.

        +

        The duplication detection requires an exact sequence match over the whole length of +the sequence. Any reads over 75bp in length are truncated to 50bp for this analysis.

        +
        + +
        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Sequence Quality Histograms + + + +

        + +

        The mean quality value across each base position in the read.

        + + +
        +

        To enable multiple samples to be plotted on the same graph, only the mean quality +scores are plotted (unlike the box plots seen in FastQC reports).

        +

        Taken from the FastQC help:

        +

        The y-axis on the graph shows the quality scores. The higher the score, the better +the base call. The background of the graph divides the y axis into very good quality +calls (green), calls of reasonable quality (orange), and calls of poor quality (red). +The quality of calls on most platforms will degrade as the run progresses, so it is +common to see base calls falling into the orange area towards the end of a read.

        +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Per Sequence Quality Scores + + + +

        + +

        The number of reads with average quality scores. Shows if a subset of reads has poor quality.

        + + +
        +

        From the FastQC help:

        +

        The per sequence quality score report allows you to see if a subset of your +sequences have universally low quality values. It is often the case that a +subset of sequences will have universally poor quality, however these should +represent only a small percentage of the total sequences.

        +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Per Base Sequence Content + + + +

        + +

        The proportion of each base position for which each of the four normal DNA bases has been called.

        + + +
        +

        To enable multiple samples to be shown in a single plot, the base composition data +is shown as a heatmap. The colours represent the balance between the four bases: +an even distribution should give an even muddy brown colour. Hover over the plot +to see the percentage of the four bases under the cursor.

        +

        To see the data as a line plot, as in the original FastQC graph, click on a sample track.

        +

        From the FastQC help:

        +

        Per Base Sequence Content plots out the proportion of each base position in a +file for which each of the four normal DNA bases has been called.

        +

        In a random library you would expect that there would be little to no difference +between the different bases of a sequence run, so the lines in this plot should +run parallel with each other. The relative amount of each base should reflect +the overall amount of these bases in your genome, but in any case they should +not be hugely imbalanced from each other.

        +

        It's worth noting that some types of library will always produce biased sequence +composition, normally at the start of the read. Libraries produced by priming +using random hexamers (including nearly all RNA-Seq libraries) and those which +were fragmented using transposases inherit an intrinsic bias in the positions +at which reads start. This bias does not concern an absolute sequence, but instead +provides enrichement of a number of different K-mers at the 5' end of the reads. +Whilst this is a true technical bias, it isn't something which can be corrected +by trimming and in most cases doesn't seem to adversely affect the downstream +analysis.

        +
        + +
        +
        +
        + + Click a sample row to see a line plot for that dataset. +
        +
        Rollover for sample name
        + +
        + Position: - +
        %T: -
        +
        %C: -
        +
        %A: -
        +
        %G: -
        +
        +
        +
        + +
        +
        +
        +
        + + +
        +
        + + + + +
        + +

        + Per Sequence GC Content + + + +

        + +

        The average GC content of reads. Normal random library typically have a + roughly normal distribution of GC content.

        + + +
        +

        From the FastQC help:

        +

        This module measures the GC content across the whole length of each sequence +in a file and compares it to a modelled normal distribution of GC content.

        +

        In a normal random library you would expect to see a roughly normal distribution +of GC content where the central peak corresponds to the overall GC content of +the underlying genome. Since we don't know the the GC content of the genome the +modal GC content is calculated from the observed data and used to build a +reference distribution.

        +

        An unusually shaped distribution could indicate a contaminated library or +some other kinds of biased subset. A normal distribution which is shifted +indicates some systematic bias which is independent of base position. If there +is a systematic bias which creates a shifted normal distribution then this won't +be flagged as an error by the module since it doesn't know what your genome's +GC content should be.

        +
        + +
        + + +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Per Base N Content + + + +

        + +

        The percentage of base calls at each position for which an N was called.

        + + +
        +

        From the FastQC help:

        +

        If a sequencer is unable to make a base call with sufficient confidence then it will +normally substitute an N rather than a conventional base call. This graph shows the +percentage of base calls at each position for which an N was called.

        +

        It's not unusual to see a very low proportion of Ns appearing in a sequence, especially +nearer the end of a sequence. However, if this proportion rises above a few percent +it suggests that the analysis pipeline was unable to interpret the data well enough to +make valid base calls.

        +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Sequence Length Distribution + +

        + +

        The distribution of fragment sizes (read lengths) found. + See the FastQC help

        + + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Sequence Duplication Levels + + + +

        + +

        The relative level of duplication found for every sequence.

        + + +
        +

        From the FastQC Help:

        +

        In a diverse library most sequences will occur only once in the final set. +A low level of duplication may indicate a very high level of coverage of the +target sequence, but a high level of duplication is more likely to indicate +some kind of enrichment bias (eg PCR over amplification). This graph shows +the degree of duplication for every sequence in a library: the relative +number of sequences with different degrees of duplication.

        +

        Only sequences which first appear in the first 100,000 sequences +in each file are analysed. This should be enough to get a good impression +for the duplication levels in the whole file. Each sequence is tracked to +the end of the file to give a representative count of the overall duplication level.

        +

        The duplication detection requires an exact sequence match over the whole length of +the sequence. Any reads over 75bp in length are truncated to 50bp for this analysis.

        +

        In a properly diverse library most sequences should fall into the far left of the +plot in both the red and blue lines. A general level of enrichment, indicating broad +oversequencing in the library will tend to flatten the lines, lowering the low end +and generally raising other categories. More specific enrichments of subsets, or +the presence of low complexity contaminants will tend to produce spikes towards the +right of the plot.

        +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Overrepresented sequences + + + +

        + +

        The total amount of overrepresented sequences found in each library.

        + + +
        +

        FastQC calculates and lists overrepresented sequences in FastQ files. It would not be +possible to show this for all samples in a MultiQC report, so instead this plot shows +the number of sequences categorized as over represented.

        +

        Sometimes, a single sequence may account for a large number of reads in a dataset. +To show this, the bars are split into two: the first shows the overrepresented reads +that come from the single most common sequence. The second shows the total count +from all remaining overrepresented sequences.

        +

        From the FastQC Help:

        +

        A normal high-throughput library will contain a diverse set of sequences, with no +individual sequence making up a tiny fraction of the whole. Finding that a single +sequence is very overrepresented in the set either means that it is highly biologically +significant, or indicates that the library is contaminated, or not as diverse as you expected.

        +

        FastQC lists all of the sequences which make up more than 0.1% of the total. +To conserve memory only sequences which appear in the first 100,000 sequences are tracked +to the end of the file. It is therefore possible that a sequence which is overrepresented +but doesn't appear at the start of the file for some reason could be missed by this module.

        +
        + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Adapter Content + + + +

        + +

        The cumulative percentage count of the proportion of your + library which has seen each of the adapter sequences at each position.

        + + +
        +

        Note that only samples with ≥ 0.1% adapter contamination are shown.

        +

        There may be several lines per sample, as one is shown for each adapter +detected in the file.

        +

        From the FastQC Help:

        +

        The plot shows a cumulative percentage count of the proportion +of your library which has seen each of the adapter sequences at each position. +Once a sequence has been seen in a read it is counted as being present +right through to the end of the read so the percentages you see will only +increase as the read length goes on.

        +
        + +
        No samples found with any adapter contamination > 0.1%
        + +
        +
        + + + + +
        + +

        + Status Checks + + + +

        + +

        Status for each FastQC section showing whether results seem entirely normal (green), +slightly abnormal (orange) or very unusual (red).

        + + +
        +

        FastQC assigns a status for each section of the report. +These give a quick evaluation of whether the results of the analysis seem +entirely normal (green), slightly abnormal (orange) or very unusual (red).

        +

        It is important to stress that although the analysis results appear to give a pass/fail result, +these evaluations must be taken in the context of what you expect from your library. +A 'normal' sample as far as FastQC is concerned is random and diverse. +Some experiments may be expected to produce libraries which are biased in particular ways. +You should treat the summary evaluations therefore as pointers to where you should concentrate +your attention and understand why your library may not look random and diverse.

        +

        Specific guidance on how to interpret the output of each module can be found in the relevant +report section, or in the FastQC help.

        +

        In this heatmap, we summarise all of these into a single heatmap for a quick overview. +Note that not all FastQC sections have plots in MultiQC reports, but all status checks +are shown in this heatmap.

        +
        + +
        + +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        PREPROCESS: fastp (adapter trimming)

        +

        PREPROCESS: fastp (adapter trimming) This section of the report shows fastp results for reads after adapter and quality trimming.

        + + + + +
        + +

        + Filtered Reads + +

        + +

        Filtering statistics of sampled reads.

        + + +
        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Duplication Rates + +

        + +

        Duplication rates of sampled reads.

        + + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Insert Sizes + +

        + +

        Insert size estimation of sampled reads.

        + + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Sequence Quality + +

        + +

        Average sequencing quality over each base of all reads.

        + + +
        + + + + +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + GC Content + +

        + +

        Average GC content over each base of all reads.

        + + +
        + + + + +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + N content + +

        + +

        Average N content over each base of all reads.

        + + +
        + + + + +
        + +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        PREPROCESS: FastQC (adapter trimming)

        +

        PREPROCESS: FastQC (adapter trimming) This section of the report shows FastQC results for reads after adapter and quality trimming.

        + + + + +
        + +

        + Sequence Counts + + + +

        + +

        Sequence counts for each sample. Duplicate read counts are an estimate only.

        + + +
        +

        This plot show the total number of reads, broken down into unique and duplicate +if possible (only more recent versions of FastQC give duplicate info).

        +

        You can read more about duplicate calculation in the +FastQC documentation. +A small part has been copied here for convenience:

        +

        Only sequences which first appear in the first 100,000 sequences +in each file are analysed. This should be enough to get a good impression +for the duplication levels in the whole file. Each sequence is tracked to +the end of the file to give a representative count of the overall duplication level.

        +

        The duplication detection requires an exact sequence match over the whole length of +the sequence. Any reads over 75bp in length are truncated to 50bp for this analysis.

        +
        + +
        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Sequence Quality Histograms + + + +

        + +

        The mean quality value across each base position in the read.

        + + +
        +

        To enable multiple samples to be plotted on the same graph, only the mean quality +scores are plotted (unlike the box plots seen in FastQC reports).

        +

        Taken from the FastQC help:

        +

        The y-axis on the graph shows the quality scores. The higher the score, the better +the base call. The background of the graph divides the y axis into very good quality +calls (green), calls of reasonable quality (orange), and calls of poor quality (red). +The quality of calls on most platforms will degrade as the run progresses, so it is +common to see base calls falling into the orange area towards the end of a read.

        +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Per Sequence Quality Scores + + + +

        + +

        The number of reads with average quality scores. Shows if a subset of reads has poor quality.

        + + +
        +

        From the FastQC help:

        +

        The per sequence quality score report allows you to see if a subset of your +sequences have universally low quality values. It is often the case that a +subset of sequences will have universally poor quality, however these should +represent only a small percentage of the total sequences.

        +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Per Base Sequence Content + + + +

        + +

        The proportion of each base position for which each of the four normal DNA bases has been called.

        + + +
        +

        To enable multiple samples to be shown in a single plot, the base composition data +is shown as a heatmap. The colours represent the balance between the four bases: +an even distribution should give an even muddy brown colour. Hover over the plot +to see the percentage of the four bases under the cursor.

        +

        To see the data as a line plot, as in the original FastQC graph, click on a sample track.

        +

        From the FastQC help:

        +

        Per Base Sequence Content plots out the proportion of each base position in a +file for which each of the four normal DNA bases has been called.

        +

        In a random library you would expect that there would be little to no difference +between the different bases of a sequence run, so the lines in this plot should +run parallel with each other. The relative amount of each base should reflect +the overall amount of these bases in your genome, but in any case they should +not be hugely imbalanced from each other.

        +

        It's worth noting that some types of library will always produce biased sequence composition, normally at the start of the read. Libraries produced by priming using random hexamers (including nearly all RNA-Seq libraries) and those which were fragmented using transposases inherit an intrinsic bias in the positions at which reads start. This bias does not concern an absolute sequence, but instead provides enrichment of a number of different K-mers at the 5' end of the reads. Whilst this is a true technical bias, it isn't something which can be corrected by trimming and in most cases doesn't seem to adversely affect the downstream analysis.

        +
        + +
        +
        +
        + + Click a sample row to see a line plot for that dataset. +
        +
        Rollover for sample name
        + +
        + Position: - +
        %T: -
        +
        %C: -
        +
        %A: -
        +
        %G: -
        +
        +
        +
        + +
        +
        +
        +
        + + +
        +
        + + + + +
        + +

        + Per Sequence GC Content + + + +

        + +

        The average GC content of reads. Normal random libraries typically have a roughly normal distribution of GC content.

        + + +
        +

        From the FastQC help:

        +

        This module measures the GC content across the whole length of each sequence +in a file and compares it to a modelled normal distribution of GC content.

        +

        In a normal random library you would expect to see a roughly normal distribution of GC content where the central peak corresponds to the overall GC content of the underlying genome. Since we don't know the GC content of the genome, the modal GC content is calculated from the observed data and used to build a reference distribution.

        +

        An unusually shaped distribution could indicate a contaminated library or +some other kinds of biased subset. A normal distribution which is shifted +indicates some systematic bias which is independent of base position. If there +is a systematic bias which creates a shifted normal distribution then this won't +be flagged as an error by the module since it doesn't know what your genome's +GC content should be.

        +
        + +
        + + +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Per Base N Content + + + +

        + +

        The percentage of base calls at each position for which an N was called.

        + + +
        +

        From the FastQC help:

        +

        If a sequencer is unable to make a base call with sufficient confidence then it will +normally substitute an N rather than a conventional base call. This graph shows the +percentage of base calls at each position for which an N was called.

        +

        It's not unusual to see a very low proportion of Ns appearing in a sequence, especially +nearer the end of a sequence. However, if this proportion rises above a few percent +it suggests that the analysis pipeline was unable to interpret the data well enough to +make valid base calls.

        +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Sequence Length Distribution + +

        + +

        The distribution of fragment sizes (read lengths) found. + See the FastQC help

        + + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Sequence Duplication Levels + + + +

        + +

        The relative level of duplication found for every sequence.

        + + +
        +

        From the FastQC Help:

        +

        In a diverse library most sequences will occur only once in the final set. +A low level of duplication may indicate a very high level of coverage of the +target sequence, but a high level of duplication is more likely to indicate +some kind of enrichment bias (eg PCR over amplification). This graph shows +the degree of duplication for every sequence in a library: the relative +number of sequences with different degrees of duplication.

        +

        Only sequences which first appear in the first 100,000 sequences +in each file are analysed. This should be enough to get a good impression +for the duplication levels in the whole file. Each sequence is tracked to +the end of the file to give a representative count of the overall duplication level.

        +

        The duplication detection requires an exact sequence match over the whole length of +the sequence. Any reads over 75bp in length are truncated to 50bp for this analysis.

        +

        In a properly diverse library most sequences should fall into the far left of the +plot in both the red and blue lines. A general level of enrichment, indicating broad +oversequencing in the library will tend to flatten the lines, lowering the low end +and generally raising other categories. More specific enrichments of subsets, or +the presence of low complexity contaminants will tend to produce spikes towards the +right of the plot.

        +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Overrepresented sequences + + + +

        + +

        The total amount of overrepresented sequences found in each library.

        + + +
        +

        FastQC calculates and lists overrepresented sequences in FastQ files. It would not be +possible to show this for all samples in a MultiQC report, so instead this plot shows +the number of sequences categorized as over represented.

        +

        Sometimes, a single sequence may account for a large number of reads in a dataset. +To show this, the bars are split into two: the first shows the overrepresented reads +that come from the single most common sequence. The second shows the total count +from all remaining overrepresented sequences.

        +

        From the FastQC Help:

        +

        A normal high-throughput library will contain a diverse set of sequences, with no +individual sequence making up a tiny fraction of the whole. Finding that a single +sequence is very overrepresented in the set either means that it is highly biologically +significant, or indicates that the library is contaminated, or not as diverse as you expected.

        +

        FastQC lists all of the sequences which make up more than 0.1% of the total. +To conserve memory only sequences which appear in the first 100,000 sequences are tracked +to the end of the file. It is therefore possible that a sequence which is overrepresented +but doesn't appear at the start of the file for some reason could be missed by this module.

        +
        + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Adapter Content + + + +

        + +

        The cumulative percentage count of the proportion of your + library which has seen each of the adapter sequences at each position.

        + + +
        +

        Note that only samples with ≥ 0.1% adapter contamination are shown.

        +

        There may be several lines per sample, as one is shown for each adapter +detected in the file.

        +

        From the FastQC Help:

        +

        The plot shows a cumulative percentage count of the proportion +of your library which has seen each of the adapter sequences at each position. +Once a sequence has been seen in a read it is counted as being present +right through to the end of the read so the percentages you see will only +increase as the read length goes on.

        +
        + +
        No samples found with any adapter contamination > 0.1%
        + +
        +
        + + + + +
        + +

        + Status Checks + + + +

        + +

        Status for each FastQC section showing whether results seem entirely normal (green), +slightly abnormal (orange) or very unusual (red).

        + + +
        +

        FastQC assigns a status for each section of the report. +These give a quick evaluation of whether the results of the analysis seem +entirely normal (green), slightly abnormal (orange) or very unusual (red).

        +

        It is important to stress that although the analysis results appear to give a pass/fail result, +these evaluations must be taken in the context of what you expect from your library. +A 'normal' sample as far as FastQC is concerned is random and diverse. +Some experiments may be expected to produce libraries which are biased in particular ways. +You should treat the summary evaluations therefore as pointers to where you should concentrate +your attention and understand why your library may not look random and diverse.

        +

        Specific guidance on how to interpret the output of each module can be found in the relevant +report section, or in the FastQC help.

        +

        In this heatmap, we summarise all of these into a single heatmap for a quick overview. +Note that not all FastQC sections have plots in MultiQC reports, but all status checks +are shown in this heatmap.

        +
        + +
        + +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        VARIANTS: Bowtie 2

        +

        This section of the report shows Bowtie 2 mapping results for reads after adapter trimming and quality trimming.

        + + + + +
        + +

        + Paired-end alignments + + + +

        + +

        This plot shows the number of reads aligning to the reference in different ways. +

        Please note that single mate alignment counts are halved to tally with pair counts properly.

        + + +
        +

        There are 6 possible types of alignment:

        +
          +
        • PE mapped uniquely: Pair has only one occurrence in the reference genome.
        • +
        • PE mapped discordantly uniquely: Pair has only one occurrence but not in proper pair.
        • +
        • PE one mate mapped uniquely: One read of a pair has one occurrence.
        • +
        • PE multimapped: Pair has multiple occurrences.
        • +
        • PE one mate multimapped: One read of a pair has multiple occurrences.
        • +
        • PE neither mate aligned: Pair has no occurrence.
        • +
        +
        + +
        + + +
        +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        VARIANTS: SAMTools (raw)

        +

        Samtools This section of the report shows SAMTools counts/statistics after mapping with Bowtie 2.

        + + + + +
        + +

        + Percent Mapped + + + +

        + +

        Alignment metrics from samtools stats; mapped vs. unmapped reads.

        + + +
        +

        For a set of samples that have come from the same multiplexed library, +similar numbers of reads for each sample are expected. Large differences in numbers might +indicate issues during the library preparation process. Whilst large differences in read +numbers may be controlled for in downstream processings (e.g. read count normalisation), +you may wish to consider whether the read depths achieved have fallen below recommended +levels depending on the applications.

        +

        Low alignment rates could indicate contamination of samples (e.g. adapter sequences), +low sequencing quality or other artefacts. These can be further investigated in the +sequence level QC (e.g. from FastQC).

        +
        + +
        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Alignment metrics + +

        + +

        This module parses the output from samtools stats. All numbers in millions.

        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Samtools Flagstat + +

        + +

        This module parses the output from samtools flagstat. All numbers in millions.

        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Mapped reads per contig + +

        + +

        The samtools idxstats tool counts the number of mapped reads per chromosome / contig. Chromosomes with < 0.1% of the total aligned reads are omitted from this plot.

        + + +
        + + +
           
        + + + +
        + +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        VARIANTS: iVar trim

        +

        VARIANTS: iVar trim This section of the report shows counts observed for each amplicon primer per sample as detected by iVar trim.

        + + + + +
        + +

        + iVar Primer Counts + + + +

        + +

        Counts observed for each primer per sample.

        + + +
        +

        This lists the number of times a specific primer was found in the respective sample.

        +
        + +
        + +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        VARIANTS: SAMTools (iVar)

        +

        Samtools This section of the report shows SAMTools counts/statistics after primer sequence removal with iVar.

        + + + + +
        + +

        + Percent Mapped + + + +

        + +

        Alignment metrics from samtools stats; mapped vs. unmapped reads.

        + + +
        +

        For a set of samples that have come from the same multiplexed library, +similar numbers of reads for each sample are expected. Large differences in numbers might +indicate issues during the library preparation process. Whilst large differences in read +numbers may be controlled for in downstream processings (e.g. read count normalisation), +you may wish to consider whether the read depths achieved have fallen below recommended +levels depending on the applications.

        +

        Low alignment rates could indicate contamination of samples (e.g. adapter sequences), +low sequencing quality or other artefacts. These can be further investigated in the +sequence level QC (e.g. from FastQC).

        +
        + +
        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Alignment metrics + +

        + +

        This module parses the output from samtools stats. All numbers in millions.

        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Samtools Flagstat + +

        + +

        This module parses the output from samtools flagstat. All numbers in millions.

        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Mapped reads per contig + +

        + +

        The samtools idxstats tool counts the number of mapped reads per chromosome / contig. Chromosomes with < 0.1% of the total aligned reads are omitted from this plot.

        + + +
        + + +
           
        + + + +
        + +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        VARIANTS: SAMTools (MarkDuplicates)

        +

        Samtools This section of the report shows SAMTools counts/statistics after duplicate removal with picard MarkDuplicates.

        + + + + +
        + +

        + Percent Mapped + + + +

        + +

        Alignment metrics from samtools stats; mapped vs. unmapped reads.

        + + +
        +

        For a set of samples that have come from the same multiplexed library, +similar numbers of reads for each sample are expected. Large differences in numbers might +indicate issues during the library preparation process. Whilst large differences in read +numbers may be controlled for in downstream processings (e.g. read count normalisation), +you may wish to consider whether the read depths achieved have fallen below recommended +levels depending on the applications.

        +

        Low alignment rates could indicate contamination of samples (e.g. adapter sequences), +low sequencing quality or other artefacts. These can be further investigated in the +sequence level QC (e.g. from FastQC).

        +
        + +
        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Alignment metrics + +

        + +

        This module parses the output from samtools stats. All numbers in millions.

        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Samtools Flagstat + +

        + +

        This module parses the output from samtools flagstat. All numbers in millions.

        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Mapped reads per contig + +

        + +

        The samtools idxstats tool counts the number of mapped reads per chromosome / contig. Chromosomes with < 0.1% of the total aligned reads are omitted from this plot.

        + + +
        + + +
           
        + + + +
        + +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        VARIANTS: Picard Metrics

        +

        VARIANTS: Picard Metrics This section of the report shows picard CollectMultipleMetrics and MarkDuplicates results after mapping (if "--protocol amplicon" this will be after primer sequence removal with iVar).

        + + + + +
        + +

        + Alignment Summary + +

        + +

        Please note that Picard's read counts are divided by two for paired-end data.

        + + +
        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Base Distribution + +

        + +

        Plot shows the distribution of bases by cycle.

        + + +
        + + + + + +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Insert Size + +

        + +

        Plot shows the number of reads at a given insert size. Reads with different orientations are summed.

        + + +
        + + +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Mark Duplicates + + + +

        + +

        Number of reads, categorised by duplication state. Pair counts are doubled - see help text for details.

        + + +
        +

        The table in the Picard metrics file contains some columns referring +read pairs and some referring to single reads.

        +

        To make the numbers in this plot sum correctly, values referring to pairs are doubled +according to the scheme below:

        +
          +
        • READS_IN_DUPLICATE_PAIRS = 2 * READ_PAIR_DUPLICATES
        • +
        • READS_IN_UNIQUE_PAIRS = 2 * (READ_PAIRS_EXAMINED - READ_PAIR_DUPLICATES)
        • +
        • READS_IN_UNIQUE_UNPAIRED = UNPAIRED_READS_EXAMINED - UNPAIRED_READ_DUPLICATES
        • +
        • READS_IN_DUPLICATE_PAIRS_OPTICAL = 2 * READ_PAIR_OPTICAL_DUPLICATES
        • +
        • READS_IN_DUPLICATE_PAIRS_NONOPTICAL = READS_IN_DUPLICATE_PAIRS - READS_IN_DUPLICATE_PAIRS_OPTICAL
        • +
        • READS_IN_DUPLICATE_UNPAIRED = UNPAIRED_READ_DUPLICATES
        • +
        • READS_UNMAPPED = UNMAPPED_READS
        • +
        +
        + +
        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Mean Base Quality by Cycle + + + +

        + +

        Plot shows the mean base quality by cycle.

        + + +
        +

        This metric gives an overall snapshot of sequencing machine performance. +For most types of sequencing data, the output is expected to show a slight +reduction in overall base quality scores towards the end of each read.

        +

        Spikes in quality within reads are not expected and may indicate that technical +problems occurred during sequencing.

        +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Base Quality Distribution + +

        + +

        Plot shows the count of each base quality score.

        + + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + WGS Coverage + +

        + +

        The number of bases in the genome territory for each fold coverage. Note that final 1% of data is hidden to prevent very long tails.

        + + +
        + + +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + WGS Filtered Bases + +

        + +

        For more information about the filtered categories, see the Picard documentation.

        + + +
        +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        VARIANTS: VarScan 2

        +

        VARIANTS: VarScan 2 This section of the report shows total number of variants called by VarScan 2 broken down by those that were reported or not.

        + + + + +
        + +

        + Variants detected + +

        + +

        This plot shows the total number of variant positions, broken down by those that were reported or not.

        + + +
        + + +
           
        + + +
        + +
        +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        VARIANTS: BCFTools (VarScan 2; high freq)

        +

        Bcftools This section of the report shows BCFTools stats results for high frequency variants called by VarScan 2. The allele frequency filtering threshold can be set by the --max_allele_freq parameter (Default: 0.8).

        + + + + +
        + +

        + Variant Substitution Types + +

        + + + + +
        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant Quality + +

        + + + + +
        + + + + +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Indel Distribution + +

        + + + + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant depths + +

        + +

        Read depth support distribution for called variants

        + + +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        VARIANTS: SnpEff (VarScan 2; high freq)

        +

        VARIANTS: SnpEff (VarScan 2; high freq) This section of the report shows SnpEff results for high frequency variants called by VarScan 2. The allele frequency filtering threshold can be set by the --max_allele_freq parameter (Default: 0.8).

        + + + + +
        + +

        + Variants by Genomic Region + + + +

        + +

        The stacked bar plot shows locations of detected variants in +the genome and the number of variants for each location.

        + + +
        +

        The upstream and downstream interval size to detect these +genomic regions is 5000bp by default.

        +
        + +
        + + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant Effects by Impact + + + +

        + +

        The stacked bar plot shows the putative impact of detected +variants and the number of variants for each impact.

        + + +
        +

        There are four levels of impacts predicted by SnpEff:

        +
          +
        • High: High impact (like stop codon)
        • +
        • Moderate: Middle impact (like same type of amino acid substitution)
        • +
        • Low: Low impact (i.e. silent mutation)
        • +
        • Modifier: No impact
        • +
        +
        + +
        + + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variants by Effect Types + + + +

        + +

        The stacked bar plot shows the effect of variants at protein +level and the number of variants for each effect type.

        + + +
        +

        This plot shows the effect of variants with respect to +the mRNA.

        +
        + +
        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variants by Functional Class + + + +

        + +

        The stacked bar plot shows the effect of variants and +the number of variants for each effect type.

        + + +
        +

        This plot shows the effect of variants on the translation of +the mRNA as protein. There are three possible cases:

        +
          +
        • Silent: The amino acid does not change.
        • +
        • Missense: The amino acid is different.
        • +
        • Nonsense: The variant generates a stop codon.
        • +
        +
        + +
        + + + +
        +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        VARIANTS: QUAST (VarScan 2; high freq)

        +

        VARIANTS: QUAST (VarScan 2; high freq) This section of the report shows QUAST results for consensus sequences generated from high frequency variants with VarScan 2. The allele frequency filtering threshold can be set by the --max_allele_freq parameter (Default: 0.8).

        + + + + +
        + +

        + Assembly Statistics + +

        + + + + +
        + + + + + + + + + Showing 2/2 rows and 10/10 columns. + +
        +
        + +
        Sample NameN50 (Kbp)N75 (Kbp)L50 (K)L75 (K)Largest contig (Kbp)Length (Mbp)MisassembliesMismatches/100kbpIndels/100kbpGenome Fraction
        sample1
        29.9Kbp
        29.9Kbp
        0.0K
        1.0K
        29.9Kbp
        0.0Mbp
        0.0
        20.11
        0.00
        99.8%
        sample2
        29.9Kbp
        29.9Kbp
        0.0K
        1.0K
        29.9Kbp
        0.0Mbp
        0.0
        20.13
        0.00
        99.7%
        + +
        + +
        +
        + + + + +
        + +

        + Number of Contigs + +

        + +

        This plot shows the number of contigs found for each assembly, broken + down by length.

        + + +
        + + +
        +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        VARIANTS: iVar variant counts

        +

        is calculated from the total number of variants called by iVar.

        + + + + +
        + + + + +
        + + +
        +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        VARIANTS: BCFTools (iVar; high freq)

        +

        Bcftools This section of the report shows BCFTools stats results for high frequency variants called by iVar. The allele frequency filtering threshold can be set by the --max_allele_freq parameter (Default: 0.8).

        + + + + +
        + +

        + Variant Substitution Types + +

        + + + + +
        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant Quality + +

        + + + + +
        + + + + +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Indel Distribution + +

        + + + + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant depths + +

        + +

        Read depth support distribution for called variants

        + + +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        VARIANTS: SnpEff (iVar; high freq)

        +

        VARIANTS: SnpEff (iVar; high freq) This section of the report shows SnpEff results for high frequency variants called by iVar. The allele frequency filtering threshold can be set by the --max_allele_freq parameter (Default: 0.8).

        + + + + +
        + +

        + Variants by Genomic Region + + + +

        + +

        The stacked bar plot shows locations of detected variants in +the genome and the number of variants for each location.

        + + +
        +

        The upstream and downstream interval size to detect these +genomic regions is 5000bp by default.

        +
        + +
        + + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant Effects by Impact + + + +

        + +

        The stacked bar plot shows the putative impact of detected +variants and the number of variants for each impact.

        + + +
        +

        There are four levels of impacts predicted by SnpEff:

        +
          +
        • High: High impact (like stop codon)
        • +
        • Moderate: Middle impact (like same type of amino acid substitution)
        • +
        • Low: Low impact (i.e. silent mutation)
        • +
        • Modifier: No impact
        • +
        +
        + +
        + + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variants by Effect Types + + + +

        + +

        The stacked bar plot shows the effect of variants at protein +level and the number of variants for each effect type.

        + + +
        +

        This plot shows the effect of variants with respect to +the mRNA.

        +
        + +
        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variants by Functional Class + + + +

        + +

        The stacked bar plot shows the effect of variants and +the number of variants for each effect type.

        + + +
        +

        This plot shows the effect of variants on the translation of +the mRNA as protein. There are three possible cases:

        +
          +
        • Silent: The amino acid does not change.
        • +
        • Missense: The amino acid is different.
        • +
        • Nonsense: The variant generates a stop codon.
        • +
        +
        + +
        + + + +
        +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        VARIANTS: QUAST (iVar; high freq)

        +

        VARIANTS: QUAST (iVar; high freq) This section of the report shows QUAST results for consensus sequences generated from high frequency variants with iVar. The allele frequency filtering threshold can be set by the --max_allele_freq parameter (Default: 0.8).

        + + + + +
        + +

        + Assembly Statistics + +

        + + + + +
        + + + + + + + + + Showing 2/2 rows and 10/10 columns. + +
        +
        + +
        Sample NameN50 (Kbp)N75 (Kbp)L50 (K)L75 (K)Largest contig (Kbp)Length (Mbp)MisassembliesMismatches/100kbpIndels/100kbpGenome Fraction
        sample1
        29.9Kbp
        29.9Kbp
        0.0K
        1.0K
        29.9Kbp
        0.0Mbp
        0.0
        20.11
        0.00
        99.8%
        sample2
        29.8Kbp
        29.8Kbp
        0.0K
        1.0K
        29.8Kbp
        0.0Mbp
        0.0
        23.53
        0.00
        99.5%
        + +
        + +
        +
        + + + + +
        + +

        + Number of Contigs + +

        + +

        This plot shows the number of contigs found for each assembly, broken + down by length.

        + + +
        + + +
        +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        VARIANTS: BCFTools (BCFTools)

        +

        Bcftools This section of the report shows BCFTools stats results for variants called by BCFTools.

        + + + + +
        + +

        + Variant Substitution Types + +

        + + + + +
        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant Quality + +

        + + + + +
        + + + + +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Indel Distribution + +

        + + + + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant depths + +

        + +

        Read depth support distribution for called variants

        + + +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        VARIANTS: SnpEff (BCFTools)

        +

        VARIANTS: SnpEff (BCFTools) This section of the report shows SnpEff results for variants called by BCFTools.

        + + + + +
        + +

        + Variants by Genomic Region + + + +

        + +

        The stacked bar plot shows locations of detected variants in +the genome and the number of variants for each location.

        + + +
        +

        The upstream and downstream interval size to detect these +genomic regions is 5000bp by default.

        +
        + +
        + + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant Effects by Impact + + + +

        + +

        The stacked bar plot shows the putative impact of detected +variants and the number of variants for each impact.

        + + +
        +

        There are four levels of impacts predicted by SnpEff:

        +
          +
        • High: High impact (like stop codon)
        • +
        • Moderate: Middle impact (like same type of amino acid substitution)
        • +
        • Low: Low impact (ie silence mutation)
        • +
        • Modifier: No impact
        • +
        +
        + +
        + + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variants by Effect Types + + + +

        + +

        The stacked bar plot shows the effect of variants at protein +level and the number of variants for each effect type.

        + + +
        +

        This plot shows the effect of variants with respect to +the mRNA.

        +
        + +
        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variants by Functional Class + + + +

        + +

        The stacked bar plot shows the effect of variants and +the number of variants for each effect type.

        + + +
        +

        This plot shows the effect of variants on the translation of +the mRNA as protein. There are three possible cases:

        +
          +
        • Silent: The amino acid does not change.
        • +
        • Missense: The amino acid is different.
        • +
        • Nonsense: The variant generates a stop codon.
        • +
        +
        + +
        + + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant Qualities + + + +

        + +

        The line plot shows the quantity as function of the +variant quality score.

        + + +
        +

        The quality score corresponds to the QUAL column of the +VCF file. This score is set by the variant caller.

        +
        + +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        VARIANTS: QUAST (BCFTools)

        +

        VARIANTS: QUAST (BCFTools) This section of the report shows QUAST results for consensus sequence generated from BCFTools variants.

        + + + + +
        + +

        + Assembly Statistics + +

        + + + + +
        + + + + + + + + + Showing 2/2 rows and 10/10 columns. + +
        +
        + +
        Sample NameN50 (Kbp)N75 (Kbp)L50 (K)L75 (K)Largest contig (Kbp)Length (Mbp)MisassembliesMismatches/100kbpIndels/100kbpGenome Fraction
        sample1
        29.9Kbp
        29.9Kbp
        0.0K
        1.0K
        29.9Kbp
        0.0Mbp
        0.0
        20.11
        0.00
        99.8%
        sample2
        29.9Kbp
        29.9Kbp
        0.0K
        1.0K
        29.9Kbp
        0.0Mbp
        0.0
        23.49
        0.00
        99.7%
        + +
        + +
        +
        + + + + +
        + +

        + Number of Contigs + +

        + +

        This plot shows the number of contigs found for each assembly, broken + down by length.

        + + +
        + + +
        +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        ASSEMBLY: Cutadapt (primer trimming)

        +

        ASSEMBLY: Cutadapt (primer trimming) This section of the report shows Cutadapt results for reads after primer sequence trimming.

        + + + + +
        + +

        + Filtered Reads + +

        + +

        This plot shows the number of reads (SE) / pairs (PE) removed by Cutadapt.

        + + +
        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Trimmed Sequence Lengths + + + +

        + +

        This plot shows the number of reads with certain lengths of adapter trimmed.

        + + +
        +

        Obs/Exp shows the raw counts divided by the number expected due to sequencing errors. +A defined peak may be related to adapter length.

        +

        See the cutadapt documentation +for more information on how these numbers are generated.

        +
        + +

        Flat image plot. Toolbox functions such as highlighting / hiding samples will not work (see the docs).

        + + +
        + +
        + + +
        + + +
        +
        + + + +
        +

        ASSEMBLY: FastQC (primer trimming)

        +

        ASSEMBLY: FastQC (primer trimming) This section of the report shows FastQC results for reads after primer sequence trimming with Cutadapt.

        + + + + +
        + +

        + Sequence Counts + + + +

        + +

        Sequence counts for each sample. Duplicate read counts are an estimate only.

        + + +
        +

        This plot show the total number of reads, broken down into unique and duplicate +if possible (only more recent versions of FastQC give duplicate info).

        +

        You can read more about duplicate calculation in the +FastQC documentation. +A small part has been copied here for convenience:

        +

        Only sequences which first appear in the first 100,000 sequences +in each file are analysed. This should be enough to get a good impression +for the duplication levels in the whole file. Each sequence is tracked to +the end of the file to give a representative count of the overall duplication level.

        +

        The duplication detection requires an exact sequence match over the whole length of +the sequence. Any reads over 75bp in length are truncated to 50bp for this analysis.

        +
        + +
        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Sequence Quality Histograms + + + +

        + +

        The mean quality value across each base position in the read.

        + + +
        +

        To enable multiple samples to be plotted on the same graph, only the mean quality +scores are plotted (unlike the box plots seen in FastQC reports).

        +

        Taken from the FastQC help:

        +

        The y-axis on the graph shows the quality scores. The higher the score, the better +the base call. The background of the graph divides the y axis into very good quality +calls (green), calls of reasonable quality (orange), and calls of poor quality (red). +The quality of calls on most platforms will degrade as the run progresses, so it is +common to see base calls falling into the orange area towards the end of a read.

        +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Per Sequence Quality Scores + + + +

        + +

        The number of reads with average quality scores. Shows if a subset of reads has poor quality.

        + + +
        +

        From the FastQC help:

        +

        The per sequence quality score report allows you to see if a subset of your +sequences have universally low quality values. It is often the case that a +subset of sequences will have universally poor quality, however these should +represent only a small percentage of the total sequences.

        +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Per Base Sequence Content + + + +

        + +

        The proportion of each base position for which each of the four normal DNA bases has been called.

        + + +
        +

        To enable multiple samples to be shown in a single plot, the base composition data +is shown as a heatmap. The colours represent the balance between the four bases: +an even distribution should give an even muddy brown colour. Hover over the plot +to see the percentage of the four bases under the cursor.

        +

        To see the data as a line plot, as in the original FastQC graph, click on a sample track.

        +

        From the FastQC help:

        +

        Per Base Sequence Content plots out the proportion of each base position in a +file for which each of the four normal DNA bases has been called.

        +

        In a random library you would expect that there would be little to no difference +between the different bases of a sequence run, so the lines in this plot should +run parallel with each other. The relative amount of each base should reflect +the overall amount of these bases in your genome, but in any case they should +not be hugely imbalanced from each other.

        +

        It's worth noting that some types of library will always produce biased sequence +composition, normally at the start of the read. Libraries produced by priming +using random hexamers (including nearly all RNA-Seq libraries) and those which +were fragmented using transposases inherit an intrinsic bias in the positions +at which reads start. This bias does not concern an absolute sequence, but instead +provides enrichement of a number of different K-mers at the 5' end of the reads. +Whilst this is a true technical bias, it isn't something which can be corrected +by trimming and in most cases doesn't seem to adversely affect the downstream +analysis.

        +
        + +
        +
        +
        + + Click a sample row to see a line plot for that dataset. +
        +
        Rollover for sample name
        + +
        + Position: - +
        %T: -
        +
        %C: -
        +
        %A: -
        +
        %G: -
        +
        +
        +
        + +
        +
        +
        +
        + + +
        +
        + + + + +
        + +

        + Per Sequence GC Content + + + +

        + +

        The average GC content of reads. Normal random library typically have a + roughly normal distribution of GC content.

        + + +
        +

        From the FastQC help:

        +

        This module measures the GC content across the whole length of each sequence +in a file and compares it to a modelled normal distribution of GC content.

        +

        In a normal random library you would expect to see a roughly normal distribution +of GC content where the central peak corresponds to the overall GC content of +the underlying genome. Since we don't know the the GC content of the genome the +modal GC content is calculated from the observed data and used to build a +reference distribution.

        +

        An unusually shaped distribution could indicate a contaminated library or +some other kinds of biased subset. A normal distribution which is shifted +indicates some systematic bias which is independent of base position. If there +is a systematic bias which creates a shifted normal distribution then this won't +be flagged as an error by the module since it doesn't know what your genome's +GC content should be.

        +
        + +
        + + +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Per Base N Content + + + +

        + +

        The percentage of base calls at each position for which an N was called.

        + + +
        +

        From the FastQC help:

        +

        If a sequencer is unable to make a base call with sufficient confidence then it will +normally substitute an N rather than a conventional base call. This graph shows the +percentage of base calls at each position for which an N was called.

        +

        It's not unusual to see a very low proportion of Ns appearing in a sequence, especially +nearer the end of a sequence. However, if this proportion rises above a few percent +it suggests that the analysis pipeline was unable to interpret the data well enough to +make valid base calls.

        +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Sequence Length Distribution + +

        + +

        The distribution of fragment sizes (read lengths) found. + See the FastQC help

        + + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Sequence Duplication Levels + + + +

        + +

        The relative level of duplication found for every sequence.

        + + +
        +

        From the FastQC Help:

        +

        In a diverse library most sequences will occur only once in the final set. +A low level of duplication may indicate a very high level of coverage of the +target sequence, but a high level of duplication is more likely to indicate +some kind of enrichment bias (eg PCR over amplification). This graph shows +the degree of duplication for every sequence in a library: the relative +number of sequences with different degrees of duplication.

        +

        Only sequences which first appear in the first 100,000 sequences +in each file are analysed. This should be enough to get a good impression +for the duplication levels in the whole file. Each sequence is tracked to +the end of the file to give a representative count of the overall duplication level.

        +

        The duplication detection requires an exact sequence match over the whole length of +the sequence. Any reads over 75bp in length are truncated to 50bp for this analysis.

        +

        In a properly diverse library most sequences should fall into the far left of the +plot in both the red and blue lines. A general level of enrichment, indicating broad +oversequencing in the library will tend to flatten the lines, lowering the low end +and generally raising other categories. More specific enrichments of subsets, or +the presence of low complexity contaminants will tend to produce spikes towards the +right of the plot.

        +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Overrepresented sequences + + + +

        + +

        The total amount of overrepresented sequences found in each library.

        + + +
        +

        FastQC calculates and lists overrepresented sequences in FastQ files. It would not be +possible to show this for all samples in a MultiQC report, so instead this plot shows +the number of sequences categorized as over represented.

        +

        Sometimes, a single sequence may account for a large number of reads in a dataset. +To show this, the bars are split into two: the first shows the overrepresented reads +that come from the single most common sequence. The second shows the total count +from all remaining overrepresented sequences.

        +

        From the FastQC Help:

        +

        A normal high-throughput library will contain a diverse set of sequences, with no +individual sequence making up a tiny fraction of the whole. Finding that a single +sequence is very overrepresented in the set either means that it is highly biologically +significant, or indicates that the library is contaminated, or not as diverse as you expected.

        +

        FastQC lists all of the sequences which make up more than 0.1% of the total. +To conserve memory only sequences which appear in the first 100,000 sequences are tracked +to the end of the file. It is therefore possible that a sequence which is overrepresented +but doesn't appear at the start of the file for some reason could be missed by this module.

        +
        + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Adapter Content + + + +

        + +

        The cumulative percentage count of the proportion of your + library which has seen each of the adapter sequences at each position.

        + + +
        +

        Note that only samples with ≥ 0.1% adapter contamination are shown.

        +

        There may be several lines per sample, as one is shown for each adapter +detected in the file.

        +

        From the FastQC Help:

        +

        The plot shows a cumulative percentage count of the proportion +of your library which has seen each of the adapter sequences at each position. +Once a sequence has been seen in a read it is counted as being present +right through to the end of the read so the percentages you see will only +increase as the read length goes on.

        +
        + +
        No samples found with any adapter contamination > 0.1%
        + +
        +
        + + + + +
        + +

        + Status Checks + + + +

        + +

        Status for each FastQC section showing whether results seem entirely normal (green), +slightly abnormal (orange) or very unusual (red).

        + + +
        +

        FastQC assigns a status for each section of the report. +These give a quick evaluation of whether the results of the analysis seem +entirely normal (green), slightly abnormal (orange) or very unusual (red).

        +

        It is important to stress that although the analysis results appear to give a pass/fail result, +these evaluations must be taken in the context of what you expect from your library. +A 'normal' sample as far as FastQC is concerned is random and diverse. +Some experiments may be expected to produce libraries which are biased in particular ways. +You should treat the summary evaluations therefore as pointers to where you should concentrate +your attention and understand why your library may not look random and diverse.

        +

        Specific guidance on how to interpret the output of each module can be found in the relevant +report section, or in the FastQC help.

        +

        In this heatmap, we summarise all of these into a single heatmap for a quick overview. +Note that not all FastQC sections have plots in MultiQC reports, but all status checks +are shown in this heatmap.

        +
        + +
        + +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        ASSEMBLY: Kraken 2

        +

        ASSEMBLY: Kraken 2 This section of the report shows Kraken 2 classification results for reads after primer sequence trimming with Cutadapt.

        + + + + +
        + +

        + Top taxa + + + +

        + +

        The number of reads falling into the top 5 taxa across different ranks.

        + + +
        +

        To make this plot, the percentage of each sample assigned to a given taxa is summed across all samples. +The counts for these top five taxa are then plotted for each of the 9 different taxa ranks. +The unclassified count is always shown across all taxa ranks.

        +

        The total number of reads is approximated by dividing the number of unclassified reads by the percentage of +the library that they account for. +Note that this is only an approximation, and that kraken percentages don't always add to exactly 100%.

        +

        The category "Other" shows the difference between the above total read count and the sum of the read counts +in the top 5 taxa shown + unclassified. This should cover all taxa not in the top 5, +/- any rounding errors.

        +

        Note that any taxon that does not exactly fit a taxon rank (eg. - or G2) is ignored.

        +
        + +
        + + +
           
        + + + + + + + + +
        + +
        +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        ASSEMBLY: QUAST (SPAdes)

        +

        ASSEMBLY: QUAST (SPAdes) This section of the report shows QUAST results from SPAdes de novo assembly.

        + + + + +
        + +

        + Assembly Statistics + +

        + + + + +
        + + + + + + + + + Showing 2/2 rows and 10/10 columns. + +
        +
        + +
        Sample NameN50 (Kbp)N75 (Kbp)L50 (K)L75 (K)Largest contig (Kbp)Length (Mbp)MisassembliesMismatches/100kbpIndels/100kbpGenome Fraction
        sample1
        0.8Kbp
        0.6Kbp
        0.0K
        11.0K
        5.5Kbp
        0.0Mbp
        3.0
        50.60
        0.00
        39.7%
        sample2
        1.1Kbp
        0.9Kbp
        0.0K
        17.0K
        5.5Kbp
        0.0Mbp
        2.0
        39.10
        0.00
        85.5%
        + +
        + +
        +
        + + + + +
        + +

        + Number of Contigs + +

        + +

        This plot shows the number of contigs found for each assembly, broken + down by length.

        + + +
        + + +
        +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        ASSEMBLY: BCFTools (SPAdes)

        +

        Bcftools This section of the report shows BCFTools stats results for variants called in the SPAdes assembly relative to the reference.

        + + + + +
        + +

        + Variant Substitution Types + +

        + + + + +
        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant Quality + +

        + + + + +
        + + + + +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Indel Distribution + +

        + + + + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant depths + +

        + +

        Read depth support distribution for called variants

        + + +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        ASSEMBLY: SnpEff (SPAdes)

        +

        ASSEMBLY: SnpEff (SPAdes) This section of the report shows SnpEff results for variants called in the SPAdes assembly relative to the reference.

        + + + + +
        + +

        + Variants by Genomic Region + + + +

        + +

        The stacked bar plot shows locations of detected variants in +the genome and the number of variants for each location.

        + + +
        +

        The upstream and downstream interval size to detect these +genomic regions is 5000bp by default.

        +
        + +
        + + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant Effects by Impact + + + +

        + +

        The stacked bar plot shows the putative impact of detected +variants and the number of variants for each impact.

        + + +
        +

        There are four levels of impacts predicted by SnpEff:

        +
          +
        • High: High impact (like stop codon)
        • +
        • Moderate: Middle impact (like same type of amino acid substitution)
        • +
        • Low: Low impact (ie silence mutation)
        • +
        • Modifier: No impact
        • +
        +
        + +
        + + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variants by Effect Types + + + +

        + +

        The stacked bar plot shows the effect of variants at protein +level and the number of variants for each effect type.

        + + +
        +

        This plot shows the effect of variants with respect to +the mRNA.

        +
        + +
        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variants by Functional Class + + + +

        + +

        The stacked bar plot shows the effect of variants and +the number of variants for each effect type.

        + + +
        +

        This plot shows the effect of variants on the translation of +the mRNA as protein. There are three possible cases:

        +
          +
        • Silent: The amino acid does not change.
        • +
        • Missense: The amino acid is different.
        • +
        • Nonsense: The variant generates a stop codon.
        • +
        +
        + +
        + + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant Qualities + + + +

        + +

        The line plot shows the quantity as function of the +variant quality score.

        + + +
        +

        The quality score corresponds to the QUAL column of the +VCF file. This score is set by the variant caller.

        +
        + +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        ASSEMBLY: QUAST (MetaSPAdes)

        +

        ASSEMBLY: QUAST (MetaSPAdes) This section of the report shows QUAST results from MetaSPAdes de novo assembly.

        + + + + +
        + +

        + Assembly Statistics + +

        + + + + +
        + + + + + + + + + Showing 2/2 rows and 10/10 columns. + +
        +
        + +
        Sample NameN50 (Kbp)N75 (Kbp)L50 (K)L75 (K)Largest contig (Kbp)Length (Mbp)MisassembliesMismatches/100kbpIndels/100kbpGenome Fraction
        sample1
        30.0Kbp
        30.0Kbp
        0.0K
        1.0K
        30.0Kbp
        0.0Mbp
        0.0
        20.09
        0.00
        99.9%
        sample2
        29.9Kbp
        29.9Kbp
        0.0K
        1.0K
        29.9Kbp
        0.0Mbp
        0.0
        23.48
        0.00
        99.7%
        + +
        + +
        +
        + + + + +
        + +

        + Number of Contigs + +

        + +

        This plot shows the number of contigs found for each assembly, broken + down by length.

        + + +
        + + +
        +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        ASSEMBLY: BCFTools (MetaSPAdes)

        +

        Bcftools This section of the report shows BCFTools stats results for variants called in the MetaSPAdes assembly relative to the reference.

        + + + + +
        + +

        + Variant Substitution Types + +

        + + + + +
        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant Quality + +

        + + + + +
        + + + + +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Indel Distribution + +

        + + + + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant depths + +

        + +

        Read depth support distribution for called variants

        + + +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        ASSEMBLY: SnpEff (MetaSPAdes)

        +

        ASSEMBLY: SnpEff (MetaSPAdes) This section of the report shows SnpEff results for variants called in the MetaSPAdes assembly relative to the reference.

        + + + + +
        + +

        + Variants by Genomic Region + + + +

        + +

        The stacked bar plot shows locations of detected variants in +the genome and the number of variants for each location.

        + + +
        +

        The upstream and downstream interval size to detect these +genomic regions is 5000bp by default.

        +
        + +
        + + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant Effects by Impact + + + +

        + +

        The stacked bar plot shows the putative impact of detected +variants and the number of variants for each impact.

        + + +
        +

        There are four levels of impacts predicted by SnpEff:

        +
          +
        • High: High impact (like stop codon)
        • +
        • Moderate: Middle impact (like same type of amino acid substitution)
        • +
        • Low: Low impact (ie silence mutation)
        • +
        • Modifier: No impact
        • +
        +
        + +
        + + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variants by Effect Types + + + +

        + +

        The stacked bar plot shows the effect of variants at protein +level and the number of variants for each effect type.

        + + +
        +

        This plot shows the effect of variants with respect to +the mRNA.

        +
        + +
        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variants by Functional Class + + + +

        + +

        The stacked bar plot shows the effect of variants and +the number of variants for each effect type.

        + + +
        +

        This plot shows the effect of variants on the translation of +the mRNA as protein. There are three possible cases:

        +
          +
        • Silent: The amino acid does not change.
        • +
        • Missense: The amino acid is different.
        • +
        • Nonsense: The variant generates a stop codon.
        • +
        +
        + +
        + + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant Qualities + + + +

        + +

        The line plot shows the quantity as function of the +variant quality score.

        + + +
        +

        The quality score corresponds to the QUAL column of the +VCF file. This score is set by the variant caller.

        +
        + +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        ASSEMBLY: QUAST (Unicycler)

        +

        ASSEMBLY: QUAST (Unicycler) This section of the report shows QUAST results from Unicycler de novo assembly.

        + + + + +
        + +

        + Assembly Statistics + +

        + + + + +
        + + + + + + + + + Showing 2/2 rows and 10/10 columns. + +
        +
        + +
        Sample NameN50 (Kbp)N75 (Kbp)L50 (K)L75 (K)Largest contig (Kbp)Length (Mbp)MisassembliesMismatches/100kbpIndels/100kbpGenome Fraction
        sample1
        0.9Kbp
        0.7Kbp
        0.0K
        9.0K
        5.4Kbp
        0.0Mbp
        0.0
        17.69
        0.00
        37.8%
        sample2
        1.5Kbp
        1.0Kbp
        0.0K
        12.0K
        2.4Kbp
        0.0Mbp
        0.0
        28.32
        0.00
        82.7%
        + +
        + +
        +
        + + + + +
        + +

        + Number of Contigs + +

        + +

        This plot shows the number of contigs found for each assembly, broken + down by length.

        + + +
        + + +
        +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        ASSEMBLY: BCFTools (Unicycler)

        +

        Bcftools This section of the report shows BCFTools stats results for variants called in the Unicycler assembly relative to the reference.

        + + + + +
        + +

        + Variant Substitution Types + +

        + + + + +
        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant Quality + +

        + + + + +
        + + + + +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Indel Distribution + +

        + + + + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant depths + +

        + +

        Read depth support distribution for called variants

        + + +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        ASSEMBLY: SnpEff (Unicycler)

        +

        ASSEMBLY: SnpEff (Unicycler) This section of the report shows SnpEff results for variants called in the Unicycler assembly relative to the reference.

        + + + + +
        + +

        + Variants by Genomic Region + + + +

        + +

        The stacked bar plot shows locations of detected variants in +the genome and the number of variants for each location.

        + + +
        +

        The upstream and downstream interval size to detect these +genomic regions is 5000bp by default.

        +
        + +
        + + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant Effects by Impact + + + +

        + +

        The stacked bar plot shows the putative impact of detected +variants and the number of variants for each impact.

        + + +
        +

        There are four levels of impacts predicted by SnpEff:

        +
          +
        • High: High impact (like stop codon)
        • +
        • Moderate: Middle impact (like same type of amino acid substitution)
        • +
        • Low: Low impact (ie silence mutation)
        • +
        • Modifier: No impact
        • +
        +
        + +
        + + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variants by Effect Types + + + +

        + +

        The stacked bar plot shows the effect of variants at protein +level and the number of variants for each effect type.

        + + +
        +

        This plot shows the effect of variants with respect to +the mRNA.

        +
        + +
        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variants by Functional Class + + + +

        + +

        The stacked bar plot shows the effect of variants and +the number of variants for each effect type.

        + + +
        +

        This plot shows the effect of variants on the translation of +the mRNA as protein. There are three possible cases:

        +
          +
        • Silent: The amino acid does not change.
        • +
        • Missense: The amino acid is different.
        • +
        • Nonsense: The variant generates a stop codon.
        • +
        +
        + +
        + + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant Qualities + + + +

        + +

        The line plot shows the quantity as function of the +variant quality score.

        + + +
        +

        The quality score corresponds to the QUAL column of the +VCF file. This score is set by the variant caller.

        +
        + +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        ASSEMBLY: QUAST (minia)

        +

        ASSEMBLY: QUAST (minia) This section of the report shows QUAST results from minia de novo assembly.

        + + + + +
        + +

        + Assembly Statistics + +

        + + + + +
        + + + + + + + + + Showing 2/2 rows and 10/10 columns. + +
        +
        + +
        Sample NameN50 (Kbp)N75 (Kbp)L50 (K)L75 (K)Largest contig (Kbp)Length (Mbp)MisassembliesMismatches/100kbpIndels/100kbpGenome Fraction
        sample1
        15.8Kbp
        6.1Kbp
        0.0K
        3.0K
        16.2Kbp
        0.0Mbp
        0.0
        20.21
        0.00
        99.3%
        sample2
        16.2Kbp
        7.6Kbp
        0.0K
        3.0K
        18.1Kbp
        0.0Mbp
        0.0
        20.31
        0.00
        98.8%
        + +
        + +
        +
        + + + + +
        + +

        + Number of Contigs + +

        + +

        This plot shows the number of contigs found for each assembly, broken + down by length.

        + + +
        + + +
        +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        ASSEMBLY: BCFTools (minia)

        +

        Bcftools This section of the report shows BCFTools stats results for variants called in the minia assembly relative to the reference.

        + + + + +
        + +

        + Variant Substitution Types + +

        + + + + +
        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant Quality + +

        + + + + +
        + + + + +
        + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Indel Distribution + +

        + + + + +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant depths + +

        + +

        Read depth support distribution for called variants

        + + +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        ASSEMBLY: SnpEff (minia)

        +

        ASSEMBLY: SnpEff (minia) This section of the report shows SnpEff results for variants called in the minia assembly relative to the reference.

        + + + + +
        + +

        + Variants by Genomic Region + + + +

        + +

        The stacked bar plot shows locations of detected variants in +the genome and the number of variants for each location.

        + + +
        +

        The upstream and downstream interval size to detect these +genomic regions is 5000bp by default.

        +
        + +
        + + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant Effects by Impact + + + +

        + +

        The stacked bar plot shows the putative impact of detected +variants and the number of variants for each impact.

        + + +
        +

        There are four levels of impacts predicted by SnpEff:

        +
          +
        • High: High impact (like stop codon)
        • +
        • Moderate: Middle impact (like same type of amino acid substitution)
        • +
        • Low: Low impact (ie silence mutation)
        • +
        • Modifier: No impact
        • +
        +
        + +
        + + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variants by Effect Types + + + +

        + +

        The stacked bar plot shows the effect of variants at protein +level and the number of variants for each effect type.

        + + +
        +

        This plot shows the effect of variants with respect to +the mRNA.

        +
        + +
        + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variants by Functional Class + + + +

        + +

        The stacked bar plot shows the effect of variants and +the number of variants for each effect type.

        + + +
        +

        This plot shows the effect of variants on the translation of +the mRNA as protein. There are three possible cases:

        +
          +
        • Silent: The amino acid does not change.
        • +
        • Missense: The amino acid is different.
        • +
        • Nonsense: The variant generates a stop codon.
        • +
        +
        + +
        + + + +
        +
        loading..
        +
        + +
        +
        + + + + +
        + +

        + Variant Qualities + + + +

        + +

        The line plot shows the quantity as function of the +variant quality score.

        + + +
        +

        The quality score corresponds to the QUAL column of the +VCF file. This score is set by the variant caller.

        +
        + +
        loading..
        +
        + + +
        + + +
        +
        + + + +
        +

        nf-core/viralrecon Software Versions

        +

        are collected at run time from the software output.

        + + + + +
        + + + + + +
        + +
        nf-core/viralrecon
        v1.0dev
        +
        Nextflow
        v20.01.0
        +
        parallel-fastq-dump
        v0.6.6
        +
        FastQC
        v0.11.9
        +
        fastp
        v0.20.1
        +
        Bowtie 2
        v2.3.5.1
        +
        Samtools
        v1.9
        +
        BEDTools
        v2.29.2
        +
        Picard
        v2.22.8
        +
        iVar
        v1.2.2
        +
        VarScan 2
        v2.4.4
        +
        SnpEff
        v4.5covid19
        +
        SnpSift
        v4.3t
        +
        BCFTools
        v1.9
        +
        Cutadapt
        v2.10
        +
        Kraken2
        v2.0.9-beta
        +
        SPAdes
        v3.14.0
        +
        Unicycler
        v0.4.7
        +
        minia
        v3.2.3
        +
        Minimap2
        v2.17-r941
        +
        vg
        v1.24.0
        +
        BLAST
        v2.9.0+
        +
        ABACAS
        v1.3.1
        +
        QUAST
        v5.0.2
        +
        Bandage
        v0.8.1
        +
        R
        v3.6.2
        +
        MultiQC
        v1.9
        +
        + + +
        + + +
        +
        + + + +
        +

        nf-core/viralrecon Workflow Summary

        +

        - this information is collected when the pipeline is started.

        + + + + +
        + + + + + +
        +
        Run Name
        fervent_majorana
        +
        Samplesheet
        https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_full_amplicon.csv
        +
        Protocol
        amplicon
        +
        Amplicon Fasta File
        https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/genome/NC_045512.2/amplicon/nCoV-2019.artic.V1.primer.fasta
        +
        Amplicon BED File
        https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/genome/NC_045512.2/amplicon/nCoV-2019.artic.V1.bed
        +
        Viral Genome
        NC_045512.2
        +
        Viral Fasta File
        https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/genome/NC_045512.2/GCF_009858895.2_ASM985889v3_genomic.200409.fna.gz
        +
        Viral GFF
        https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/genome/NC_045512.2/GCF_009858895.2_ASM985889v3_genomic.200409.gff.gz
        +
        Host Kraken2 DB
        https://zenodo.org/record/3738199/files/kraken2_human.tar.gz
        +
        Host Kraken2 Name
        human
        +
        Cut Mean Quality
        30
        +
        Qualified Phred
        30
        +
        Unqualified Perc Limit
        10
        +
        Min Trim Length
        50
        +
        Variant Calling Tools
        varscan2,ivar,bcftools
        +
        Min Base Quality
        20
        +
        Min Read Depth
        10
        +
        Max Allele Freq
        0.8
        +
        Assembly Tools
        spades,metaspades,unicycler,minia
        +
        Minia Kmer Size
        31
        +
        Max Resources
        224 GB memory, 32 cpus, 3d time per job
        +
        Container
        singularity - nfcore-viralrecon-dev.img
        +
        Output dir
        ./results
        +
        Publish dir mode
        copy
        +
        Launch dir
        nf-core/viralrecon/test_full
        +
        Working dir
        nf-core/viralrecon/test_full/work
        +
        Script dir
        nf-core/viralrecon
        +
        User
        patelh
        +
        Config Profile
        test_full,crick
        +
        Config Description
        Full test dataset to check pipeline function
        +
        Config Contact
        Harshil Patel (@drpatelh)
        +
        Config URL
        https://www.crick.ac.uk/research/platforms-and-facilities/scientific-computing/technologies
        +
        E-mail Address
        harshil.patel@crick.ac.uk
        +
        E-mail on failure
        N/A
        +
        MultiQC maxsize
        25 MB
        +
        + + +
        + + +
        + + + + +
        + + + + + + + + + + + + + + + + diff --git a/docs/images/mqc_bcftools_stats_plot.png b/docs/images/mqc_bcftools_stats_plot.png new file mode 100755 index 00000000..dcdab9c3 Binary files /dev/null and b/docs/images/mqc_bcftools_stats_plot.png differ diff --git a/docs/images/mqc_bowtie2_plot.png b/docs/images/mqc_bowtie2_plot.png new file mode 100755 index 00000000..edb2f472 Binary files /dev/null and b/docs/images/mqc_bowtie2_plot.png differ diff --git a/docs/images/mqc_cutadapt_plot.png b/docs/images/mqc_cutadapt_plot.png new file mode 100755 index 00000000..1cbdea48 Binary files /dev/null and b/docs/images/mqc_cutadapt_plot.png differ diff --git a/docs/images/mqc_fastp_plot.png b/docs/images/mqc_fastp_plot.png new file mode 100755 index 00000000..798539ca Binary files /dev/null and b/docs/images/mqc_fastp_plot.png differ diff --git a/docs/images/mqc_fastqc_plot.png b/docs/images/mqc_fastqc_plot.png new file mode 100755 index 00000000..25401cff Binary files /dev/null and b/docs/images/mqc_fastqc_plot.png differ diff --git a/docs/images/mqc_ivar_trim_plot.png b/docs/images/mqc_ivar_trim_plot.png new file mode 100755 index 00000000..ca054905 Binary files /dev/null and b/docs/images/mqc_ivar_trim_plot.png differ diff --git a/docs/images/mqc_ivar_variants_plot.png b/docs/images/mqc_ivar_variants_plot.png new file mode 100755 index 00000000..1ff6884f Binary files /dev/null and b/docs/images/mqc_ivar_variants_plot.png differ diff --git a/docs/images/mqc_kraken2_plot.png b/docs/images/mqc_kraken2_plot.png new file mode 100755 index 00000000..6837fb18 Binary files /dev/null and b/docs/images/mqc_kraken2_plot.png differ diff --git a/docs/images/mqc_picard_duplicates_plot.png b/docs/images/mqc_picard_duplicates_plot.png new file mode 100755 index 00000000..57e51e1e Binary files /dev/null and b/docs/images/mqc_picard_duplicates_plot.png differ diff --git a/docs/images/mqc_picard_insert_size_plot.png b/docs/images/mqc_picard_insert_size_plot.png new file mode 100755 
index 00000000..7614aa29 Binary files /dev/null and b/docs/images/mqc_picard_insert_size_plot.png differ diff --git a/docs/images/mqc_picard_wgs_coverage_plot.png b/docs/images/mqc_picard_wgs_coverage_plot.png new file mode 100755 index 00000000..54c4f6af Binary files /dev/null and b/docs/images/mqc_picard_wgs_coverage_plot.png differ diff --git a/docs/images/mqc_quast_plot.png b/docs/images/mqc_quast_plot.png new file mode 100755 index 00000000..afcb6d09 Binary files /dev/null and b/docs/images/mqc_quast_plot.png differ diff --git a/docs/images/mqc_samtools_stats_plot.png b/docs/images/mqc_samtools_stats_plot.png new file mode 100755 index 00000000..3d926e74 Binary files /dev/null and b/docs/images/mqc_samtools_stats_plot.png differ diff --git a/docs/images/mqc_snpeff_plot.png b/docs/images/mqc_snpeff_plot.png new file mode 100755 index 00000000..79313e5c Binary files /dev/null and b/docs/images/mqc_snpeff_plot.png differ diff --git a/docs/images/mqc_varscan2_plot.png b/docs/images/mqc_varscan2_plot.png new file mode 100755 index 00000000..98ed977f Binary files /dev/null and b/docs/images/mqc_varscan2_plot.png differ diff --git a/docs/output.md b/docs/output.md index b23f7f4b..6401dd20 100644 --- a/docs/output.md +++ b/docs/output.md @@ -1,43 +1,526 @@ -# nf-core/viralrecon: Output - -This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. - - - -## Pipeline overview - -The pipeline is built using [Nextflow](https://www.nextflow.io/) -and processes data using the following steps: - -* [FastQC](#fastqc) - read quality control -* [MultiQC](#multiqc) - aggregate report, describing results of the whole pipeline - -## FastQC - -[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your reads. 
It provides information about the quality score distribution across your reads, the per base sequence content (%T/A/G/C). You get information about adapter contamination and other overrepresented sequences. - -For further reading and documentation see the [FastQC help](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). - -> **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, look at the FastQC reports in the `trim_galore` directory. - -**Output directory: `results/fastqc`** - -* `sample_fastqc.html` - * FastQC report, containing quality metrics for your untrimmed raw fastq files -* `zips/sample_fastqc.zip` - * zip file containing the FastQC report, tab-delimited data file and plot images - -## MultiQC - -[MultiQC](http://multiqc.info) is a visualisation tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in within the report data directory. - -The pipeline has special steps which allow the software versions used to be reported in the MultiQC output for future traceability. - -**Output directory: `results/multiqc`** - -* `Project_multiqc_report.html` - * MultiQC report - a standalone HTML file that can be viewed in your web browser -* `Project_multiqc_data/` - * Directory containing parsed statistics from the different tools used in the pipeline - -For more information about how to use MultiQC reports, see [http://multiqc.info](http://multiqc.info) +# ![nf-core/viralrecon](images/nf-core-viralrecon_logo.png) + +This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. 
Please click [here](https://raw.githack.com/nf-core/viralrecon/master/docs/html/multiqc_report.html) to see an example MultiQC report generated using the parameters defined in [this configuration file](https://github.com/nf-core/viralrecon/blob/master/conf/test_full.config) to run the pipeline on [samples](https://zenodo.org/record/3735111) which were prepared from the [ncov-2019 ARTIC Network V1 amplicon set](https://artic.network/ncov-2019) and sequenced on the Illumina MiSeq platform in 301bp paired-end format. + +The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. + +## Pipeline overview + +The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: + +* [Preprocessing](#Preprocessing) + * [parallel-fastq-dump](#parallel-fastq-dump) - Download samples from SRA + * [cat](#cat) - Merge re-sequenced FastQ files + * [FastQC](#fastqc) - Raw read QC + * [fastp](#fastp) - Adapter and quality trimming +* [Variant calling](#variant-calling) + * [Bowtie 2](#bowtie-2) - Read alignment relative to reference genome + * [SAMtools](#samtools) - Sort, index and generate metrics for alignments + * [iVar trim](#ivar-trim) - Primer sequence removal for amplicon data + * [picard MarkDuplicates](#picard-markduplicates) - Duplicate read marking and removal + * [picard CollectMultipleMetrics](#picard-collectmultiplemetrics) - Whole genome coverage and alignment metrics + * [VarScan 2, BCFTools, BEDTools](#varscan-2-bcftools-bedtools) *||* [iVar variants and iVar consensus](#ivar-variants-and-ivar-consensus) *||* [BCFTools and BEDTools](#bcftools-and-bedtools) - Variant calling and consensus sequence generation + * [SnpEff and SnpSift](#snpeff-and-snpsift) - Genetic variant annotation and functional effect prediction + * [QUAST](#quast) - Consensus assessment report +* [De novo assembly](#de-novo-assembly) + * 
[Cutadapt](#cutadapt) - Primer trimming for amplicon data + * [Kraken 2](#kraken-2) - Removal of host reads + * [SPAdes](#spades) *||* [metaSPAdes](#metaspades) *||* [Unicycler](#unicycler) *||* [minia](#minia) - Viral genome assembly + * [BLAST](#blast) - Blast to reference assembly + * [ABACAS](#abacas) - Order contigs according to reference genome + * [PlasmidID](#plasmidid) - Assembly report and visualisation + * [Assembly QUAST](#assembly-quast) - Assembly quality assessment + * [Minimap2, seqwish, vg](#minimap2-seqwish-vg) - Call variants from induced genome variation graph + * [Assembly SnpEff and SnpSift](#assembly-snpeff-and-snpsift) - Genetic variant annotation and functional effect prediction +* [Workflow reporting and genomes](#workflow-reporting-and-genomes) + * [MultiQC](#multiqc) - Present QC for raw reads, alignment, assembly and variant calling + * [Reference genome files](#reference-genome-files) - Saving reference genome indices/files + * [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution + +## Preprocessing + +### parallel-fastq-dump + +Please see the [usage docs](https://github.com/nf-core/viralrecon/blob/master/docs/usage.md#supported-public-repository-ids) for a list of supported public repository identifiers and how to provide them to the pipeline. The final sample information for all identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5sums. If a download link exists, the files will be downloaded by FTP otherwise they will be downloaded using [parallel-fastq-dump](https://github.com/rvalieris/parallel-fastq-dump). + +**Output files:** + +* `preprocess/sra/` + * `sra_run_info.tsv`: Run information file for all samples to be downloaded from the ENA/SRA. + * `*.fastq.gz`: Paired-end/single-end reads downloaded and extracted from the ENA/SRA. 
+* `preprocess/sra/md5/` + * `*.md5`: Files containing `md5` sum for FastQ files downloaded from ENA/SRA. +* `preprocess/sra/log/` + * `*.fastq_dump.log`: Log file generated from stdout whilst running `parallel-fastq-dump`. + +> **NB:** Downloaded FastQ files will only be saved in the results directory if the `--save_sra_fastq` parameter is supplied. + +### cat + +If multiple libraries/runs have been provided for the same sample in the input samplesheet (e.g. to increase sequencing depth) then these will be merged at the very beginning of the pipeline in order to have consistent sample naming throughout the pipeline. Please refer to the [usage docs](https://github.com/nf-core/viralrecon/blob/dev/docs/usage.md#format) to see how to specify these samples in the input samplesheet. + +### FastQC + +[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). + +**Output files:** + +* `preprocess/fastqc/` + * `*_fastqc.html`: FastQC report containing quality metrics. +* `preprocess/fastqc/zips/` + * `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. + +![MultiQC - FastQC per base sequence plot](images/mqc_fastqc_plot.png) + +> **NB:** The FastQC plots in this directory are generated relative to the raw, input reads. They may contain adapter sequence and regions of low quality. To see how your reads look after trimming please refer to the FastQC reports in the `preprocess/fastp/fastqc/` directory. + +### fastp + +[fastp](https://github.com/OpenGene/fastp) is a tool designed to provide fast, all-in-one preprocessing for FastQ files. 
It has been developed in C++ with multithreading support to achieve higher performance. fastp is used in this pipeline for standard adapter trimming and quality filtering. + +**Output files:** + +* `preprocess/fastp/` + * `*.fastp.html`: Trimming report in html format. + * `*.fastp.json`: Trimming report in json format. + * `*.trim.fastq.gz`: Paired-end/single-end trimmed reads. + * `*.trim.fail.gz`: Unpaired trimmed reads (only for paired-end data). +* `preprocess/fastp/log/` + * `*.fastp.log`: Trimming log file. +* `preprocess/fastp/fastqc/`: + * `*.trim_fastqc.html`: FastQC report of the trimmed reads. +* `preprocess/fastp/fastqc/zips/` + * `*.trim_fastqc.zip`: Zip archive containing the FastQC report. + +![MultiQC - fastp filtered reads plot](images/mqc_fastp_plot.png) + +> **NB:** Post-trimmed FastQ files will only be saved in the results directory if the `--save_trimmed` parameter is supplied. + +## Variant calling + +A file called `summary_variants_metrics_mqc.tsv` containing a selection of read and variant calling metrics will be saved in the `variants/` results directory. The same metrics have also been added to the top of the MultiQC report. + +### Bowtie 2 + +[Bowtie 2](http://bio-bwa.sourceforge.net/) is an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences. Bowtie 2 supports gapped, local, and paired-end alignment modes. + +**Output files:** + +* `variants/bam/` + * `.bam`: Original BAM file created by Bowtie 2. Only present if `--save_align_intermeds` parameter is supplied. +* `variants/bam/log/` + * `.bowtie2.log`: Bowtie 2 mapping log file. + +![MultiQC - Bowtie2 alignment score plot](images/mqc_bowtie2_plot.png) + +### SAMtools + +Bowtie 2 BAM files are further processed with [SAMtools](http://samtools.sourceforge.net/) to sort them by coordinate, for indexing, as well as to generate read mapping statistics. 
+ +**Output files:** + +* `variants/bam/` + * `.sorted.bam`: Coordinate sorted BAM file containing read alignment information. + * `.sorted.bam.bai`: Index file for coordinate sorted BAM file. +* `variants/bam/samtools_stats/` + * SAMtools `.sorted.bam.flagstat`, `.sorted.bam.idxstats` and `.sorted.bam.stats` files generated from the alignment files. + +![MultiQC - SAMtools alignment scores plot](images/mqc_samtools_stats_plot.png) + +> **NB:** BAM files and their associated indices will only be saved in the results directory if the `--save_align_intermeds` parameter is supplied. + +### iVar trim + +If the `--protocol amplicon` parameter is provided then [iVar](http://gensoft.pasteur.fr/docs/ivar/1.0/manualpage.html) is used to trim amplicon primer sequences from the aligned reads. iVar uses the primer positions supplied in `--amplicon_bed` to soft clip primer sequences from a coordinate sorted BAM file. + +**Output files:** + +* `variants/bam/` + * `.trim.sorted.bam`: Coordinate sorted BAM file after primer trimming. + * `.trim.sorted.bam.bai`: Index file for coordinate sorted BAM file after primer trimming. +* `variants/bam/samtools_stats/` + * SAMtools `.trim.flagstat`, `.trim.idxstats` and `.trim.stats` files generated from the primer trimmed alignment files. +* `variants/bam/log/` + * `.trim.ivar.log`: iVar trim log file obtained from stdout. + +![MultiQC - iVar trim primer heatmap](images/mqc_ivar_trim_plot.png) + +> **NB:** Post-trimmed BAM files and their associated indices will only be saved in the results directory if the `--save_align_intermeds` parameter is supplied. + +### picard MarkDuplicates + +Unless you are using [UMIs](https://emea.illumina.com/science/sequencing-method-explorer/kits-and-arrays/umi.html) it is not possible to establish whether the fragments you have sequenced from your sample were derived via true biological duplication (i.e. 
sequencing independent template fragments) or as a result of PCR biases introduced during the library preparation. By default, the pipeline uses picard MarkDuplicates to *mark* the duplicate reads identified amongst the alignments to allow you to guage the overall level of duplication in your samples. However, you can also choose to remove any reads identified as duplicates via the `--filter_dups` parameter. + +**Output files:** + +* `variants/bam/` + * `..sorted.bam`: Coordinate sorted BAM file after duplicate marking. + * `..sorted.bam.bai`: Index file for coordinate sorted BAM file after duplicate marking. +* `variants/bam/samtools_stats/` + * SAMtools `..flagstat`, `..idxstats` and `..stats` files generated from the duplicate marked alignment files. +* `variants/bam/picard_metrics/` + * `..MarkDuplicates.metrics.txt`: Metrics file from MarkDuplicates. + +![MultiQC - Picard MarkDuplicates metrics plot](images/mqc_picard_duplicates_plot.png) + +> **NB:** The value of `` in the output file names above will depend on the preceeding steps that were run in the pipeline. If `--protocol amplicon` is specified then this process will be run on the iVar trimmed alignments and the value of `` will be `trim.mkD`. However, if `--protocol metagenomic` is specified then the process will be run on the alignments obtained directly from Bowtie 2 and the value of `` will be `mkD`; where `mkD` is an abbreviation for MarkDuplicates. + +### picard CollectMultipleMetrics + +[picard-tools](https://broadinstitute.github.io/picard/command-line-overview.html) is a set of command-line tools for manipulating high-throughput sequencing data. We use picard-tools in this pipeline to obtain mapping and coverage metrics. + +**Output files:** + +* `variants/bam/picard_metrics/` + * `..CollectMultipleMetrics.*`: Alignment QC files from picard CollectMultipleMetrics in `*_metrics` textual format and plotted in `*.pdf` format. 
+ * `..CollectWgsMetrics.coverage_metrics`: Coverage metrics file from CollectWgsMetrics. + +![MultiQC - Picard whole genome coverage plot](images/mqc_picard_wgs_coverage_plot.png) + +![MultiQC - Picard insert size plot](images/mqc_picard_insert_size_plot.png) + +> **NB:** The value of `` in the output file names above will depend on the preceeding steps that were run in the pipeline. If `--protocol amplicon` is specified then this process will be run on the iVar trimmed alignments and the value of `` will be `trim.mkD`. However, if `--protocol metagenomic` is specified then the process will be run on the alignments obtained directly from Bowtie 2 and the value of `` will be `mkD`; where `mkD` is an abbreviation for MarkDuplicates. + +### VarScan 2, BCFTools, BEDTools + +[VarScan 2](http://dkoboldt.github.io/varscan/) is a platform-independent software tool to detect variants in NGS data. In this pipeline, VarScan 2 is used in conjunction with SAMtools in order to call both high and low frequency variants. + +[BCFtools](http://samtools.github.io/bcftools/bcftools.html) is a set of utilities that manipulate variant calls in [VCF](https://vcftools.github.io/specs.html) and its binary counterpart BCF format. BCFTools is used in the variant calling and *de novo* assembly steps of this pipeline to obtain basic statistics from the VCF output. It is also used in the VarScan 2 variant calling branch of the pipeline to generate a consensus sequence by integrating high frequency variant calls into the reference genome. + +[BEDTools](https://bedtools.readthedocs.io/en/latest/) is a swiss-army knife of tools for a wide-range of genomics analysis tasks. In this pipeline we use `bedtools genomecov` to compute the per-base mapped read coverage in bedGraph format, and `bedtools maskfasta` to mask sequences in a Fasta file based on intervals defined in a feature file. 
This may be useful for creating your own masked genome file based on custom annotations or for masking all but your target regions when aligning sequence data from a targeted capture experiment. + +**Output files:** + +* `variants/varscan2/` + * `.vcf.gz`: Low frequency variants VCF file. + * `.vcf.gz.tbi`: Low frequency variants VCF index file. + * `.AF.vcf.gz`: High frequency variants VCF file. + * `.AF.vcf.gz.tbi`: High frequency variants VCF index file. +* `variants/varscan2/consensus/` + * `.AF.consensus.fa`: Consensus Fasta file generated by integrating the high frequency variants called by VarScan into the reference genome. + * `.AF.consensus.masked.fa`: Masked consensus Fasta file. +* `variants/varscan2/log/` + * `.varscan2.log`: Log file generated from stderr by VarScan 2. +* `variants/varscan2/bcftools_stats/` + * `.bcftools_stats.txt`: Statistics and counts obtained from low frequency variants VCF file. + * `.AF.bcftools_stats.txt`: Statistics and counts obtained from high frequency variants VCF file. +* `variants/bam/mpileup/` + * `..mpileup`: mpileup files summarize all the data from aligned reads at a given genomic position. Each row of the mpileup file gives similar information to a single vertical column of reads as visualised in IGV. + +![MultiQC - VarScan 2 variants called plot](images/mqc_varscan2_plot.png) + +> **NB:** The value of `` in the output file names above is determined by the `--max_allele_freq` parameter (Default: 0.8). +> **NB:** Output mpileup files will only be saved in the directory if the `--save_mpileup` parameter is supplied. The naming convention for these files will depend on the preceding steps that were run in the pipeline as described in the paragraph explaining the value of `` in the section above. + +### iVar variants and iVar consensus + +[iVar](https://github.com/andersen-lab/ivar/blob/master/docs/MANUAL.md) is a computational package that contains functions broadly useful for viral amplicon-based sequencing. 
We use iVar in this pipeline to [trim primer sequences](#ivar-trim) for amplicon input data as well as to call variants and for consensus sequence generation. + +**Output files:** + +* `variants/ivar/` + * `.tsv`: Low frequency variants in TSV format. + * `.vcf.gz`: Low frequency variants VCF file. + * `.vcf.gz.tbi`: Low frequency variants VCF index file. + * `.AF.vcf.gz`: High frequency variants VCF file. + * `.AF.vcf.gz.tbi`: High frequency variants VCF index file. +* `variants/ivar/consensus/` + * `.AF.consensus.fa`: Consensus Fasta file generated by iVar at the frequency threshold set by the `--max_allele_freq` parameter. + * `.AF.consensus.qual.txt`: File with the average quality of each base in the consensus sequence. +* `variants/ivar/log/` + * `.variant.counts.log`: Variant counts for low frequency variants. + * `.AF.variant.counts.log`: Variant counts for high frequency variants. +* `variants/ivar/bcftools_stats/` + * `.bcftools_stats.txt`: Statistics and counts obtained from low frequency variants VCF file. + * `.AF.bcftools_stats.txt`: Statistics and counts obtained from high frequency variants VCF file. + +![MultiQC - iVar variants called plot](images/mqc_ivar_variants_plot.png) + +### BCFTools and BEDTools + +[BCFtools](http://samtools.github.io/bcftools/bcftools.html) can be used to call variants directly from BAM alignment files. The functionality to call variants with BCFTools in this pipeline was inspired by work carried out by [Conor Walker](https://github.com/conorwalker/covid19/blob/3cb26ec399417bedb7e60487415c78a405f517d6/scripts/call_variants.sh). In contrast to VarScan 2 and iVar, the original variant calls obtained by BCFTools are not filtered further by a higher allele frequency. It seems that the default calls obtained by BCFTools appear to be comparable with the high frequency variants generated by VarScan 2 and iVar. + +**Output files:** + +* `variants/bcftools/` + * `.vcf.gz`: Variants VCF file. 
+ * `.vcf.gz.tbi`: Variants VCF index file. +* `variants/bcftools/consensus/` + * `.consensus.fa`: Consensus Fasta file generated by integrating the variants called by BCFTools into the reference genome. + * `.consensus.masked.fa`: Masked consensus Fasta file. +* `variants/bcftools/bcftools_stats/` + * `.bcftools_stats.txt`: Statistics and counts obtained from VCF file. + +![MultiQC - BCFTools variant counts](images/mqc_bcftools_stats_plot.png) + +### SnpEff and SnpSift + +[SnpEff](http://snpeff.sourceforge.net/SnpEff.html) is a genetic variant annotation and functional effect prediction toolbox. It annotates and predicts the effects of genetic variants on genes and proteins (such as amino acid changes). + +[SnpSift](http://snpeff.sourceforge.net/SnpSift.html) annotates genomic variants using databases, filters, and manipulates genomic annotated variants. After annotation with SnpEff, you can use SnpSift to help filter large genomic datasets in order to find the most significant variants. + +**Output files:** + +* `variants//snpeff/` + * `*.snpEff.csv`: Variant annotation csv file. + * `*.snpEff.genes.txt`: Gene table for annotated variants. + * `*.snpEff.summary.html`: Summary html file for variants. + * `*.snpEff.vcf.gz`: VCF file with variant annotations. + * `*.snpEff.vcf.gz.tbi`: Index for VCF file with variant annotations. + * `*.snpSift.table.txt`: SnpSift summary table. + +![MultiQC - SnpEff annotation counts](images/mqc_snpeff_plot.png) + +> **NB:** The value of `` in the output directory name above is determined by the `--callers` parameter (Default: 'varscan2,ivar,bcftools'). If applicable, you will have two sets of files where the file name prefix will be `` for low-frequency variants and `.AF` for high frequency variants. + +### QUAST + +[QUAST](http://bioinf.spbau.ru/quast) is used to generate a single report with which to evaluate the quality of the consensus sequence across all of the samples provided to the pipeline. 
The HTML results can be opened within any browser (we recommend using Google Chrome). Please see the [QUAST output docs](http://quast.sourceforge.net/docs/manual.html#sec3) for more detailed information regarding the output files. + +**Output files:** + +* `variants//quast/AF/` + * `report.html`: Results report in HTML format. Also available in various other file formats i.e. `report.pdf`, `report.tex`, `report.tsv` and `report.txt`. + +> **NB:** The value of `` in the output directory name above is determined by the `--callers` parameter (Default: 'varscan2,ivar,bcftools') and the value of `` is determined by the `--max_allele_freq` parameter (Default: 0.8). + +## De novo assembly + +A file called `summary_assembly_metrics_mqc.tsv` containing a selection of read and *de novo* assembly related metrics will be saved in the `assembly/` results directory. The same metrics have also been added to the top of the MultiQC report. + +### Cutadapt + +In the variant calling branch of the pipeline we are using [iVar trim](#ivar-trim) to remove primer sequences from the aligned BAM files for amplicon data. Since in the *de novo* assembly branch we don't align the reads, we use [Cutadapt](https://cutadapt.readthedocs.io/en/stable/guide.html) as an alternative option to remove and clean the primer sequences directly from FastQ files. + +**Output files:** + +* `assembly/cutadapt/` + * `*.ptrim.fastq.gz`: FastQ files after primer sequence trimming. +* `assembly/cutadapt/log/` + * `*.cutadapt.log`: Cutadapt log file generated from stdout. +* `assembly/cutadapt/fastqc/` + * `*.ptrim_fastqc.html`: FastQC report of the trimmed reads. +* `assembly/cutadapt/fastqc/zips/` + * `*.ptrim_fastqc.zip`: Zip archive containing the FastQC report. + +![MultiQC - Cutadapt filtered reads plot](images/mqc_cutadapt_plot.png) + +> **NB:** Trimmed FastQ files will only be saved in the results directory if the `--save_trimmed` parameter is supplied. 
+ +### Kraken 2 + +[Kraken 2](https://ccb.jhu.edu/software/kraken2/index.shtml?t=manual) is a sequence classifier that assigns taxonomic labels to DNA sequences. Kraken 2 examines the k-mers within a query sequence and uses the information within those k-mers to query a database. That database maps k-mers to the lowest common ancestor (LCA) of all genomes known to contain a given k-mer. + +We used a Kraken 2 database in this workflow to filter out reads specific to the host genome. The remainder of the reads are then passed to numerous *de novo* assembly algorithms in order to reconstruct the viral genome. + +**Output files:** + +* `assembly/kraken2/` + * `*.host*.fastq.gz`: Reads that were classified to the host database. + * `*.viral*.fastq.gz`: Reads that were unclassified to the host database. + * `*.kraken2.report.txt`: Kraken 2 taxonomic report. See [here](https://ccb.jhu.edu/software/kraken2/index.shtml?t=manual#sample-report-output-format) for a detailed description of the format. + +![MultiQC - Kraken 2 classification plot](images/mqc_kraken2_plot.png) + +> **NB:** Output FastQ files will only be saved in the results directory if the `--save_kraken2_fastq` parameter is supplied. + +### SPAdes + +[SPAdes](http://cab.spbu.ru/software/spades/) is an assembly toolkit containing various assembly pipelines. Generically speaking, SPAdes is one of the most popular de Bruijn graph-based assembly algorithms used for bacterial/viral genome reconstruction. + +[Bandage](https://rrwick.github.io/Bandage/) is a program for visualising *de novo* assembly graphs. By displaying connections which are not present in the contigs file, Bandage opens up new possibilities for analysing *de novo* assemblies. + +**Output files:** + +* `assembly/spades/` + * `*.scaffolds.fa`: SPAdes scaffold assembly. + * `*.assembly.gfa`: SPAdes assembly graph in [GFA](https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md) format. 
+* `assembly/spades/bandage/` + * `*.png`: Bandage visualisation for SPAdes assembly graph in PNG format. + * `*.svg`: Bandage visualisation for SPAdes assembly graph in SVG format. + +### metaSPAdes + +[metaSPAdes](http://cab.spbu.ru/software/meta-spades/) is a de Bruijn graph-based assembler that is distributed with SPAdes and executed via the `--meta` option. It can be used for the simultaneous reconstruction of multiple genomes as observed in metagenomics data. + +**Output files:** + +* `assembly/metaspades/` + * `*.scaffolds.fa`: metaSPAdes scaffold assembly. + * `*.assembly.gfa`: metaSPAdes assembly graph in GFA format. +* `assembly/metaspades/bandage/` + * `*.png`: Bandage visualisation for metaSPAdes assembly graph in PNG format. + * `*.svg`: Bandage visualisation for metaSPAdes assembly graph in SVG format. + +### Unicycler + +[Unicycler](https://github.com/rrwick/Unicycler) is an assembly pipeline for bacterial genomes. It can assemble Illumina-only read sets where it functions as a SPAdes-optimiser. + +**Output files:** + +* `assembly/unicycler/` + * `*.scaffolds.fa`: Unicycler scaffold assembly. + * `*.assembly.gfa`: Unicycler assembly graph in GFA format. +* `assembly/unicycler/bandage/` + * `*.png`: Bandage visualisation for Unicycler assembly graph in PNG format. + * `*.svg`: Bandage visualisation for Unicycler assembly graph in SVG format. + +### minia + +[Minia](https://github.com/GATB/minia) is a short-read assembler based on a de Bruijn graph, capable of assembling a human genome on a desktop computer in a day. The output of Minia is a set of contigs. Minia produces results of similar contiguity and accuracy to other de Bruijn assemblers. + +**Output files:** + +* `assembly/minia//` + * `*.scaffolds.fa`: Minia scaffold assembly. + +> **NB:** The value of `` in the output directory name above is determined by the `--minia_kmer` parameter (Default: 31). 
+ +### BLAST + +[blastn](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch) is used to align the assembled contigs against the virus reference genome. + +**Output files:** + +* `assembly//blast/` + * `*.blast.txt`: BLAST results against the target virus. + * `*.blast.filt.header.txt`: Filtered BLAST results. + +> **NB:** The value of `` in the output directory name above is determined by the `--assemblers` parameter (Default: 'spades,metaspades,unicycler,minia'). + +### ABACAS + +[ABACAS](https://www.sanger.ac.uk/science/tools/pagit) was developed to rapidly contiguate (align, order, orientate), visualize and design primers to close gaps on shotgun assembled contigs based on a reference sequence. + +**Output files:** + +* `assembly//abacas/` + * `*.abacas.bin`: Bin file that contains contigs that are not used in ordering. + * `*.abacas.crunch`: Comparison file. + * `*.abacas.fasta`: Ordered and orientated sequence file. + * `*.abacas.gaps`: Gap information. + * `*.abacas.gaps.tab`: Gap information in tab-delimited format. + * `*.abacas.MULTIFASTA.fa`: A list of ordered and orientated contigs in a multi-fasta format. + * `*.abacas.tab`: Feature file + * `*.unused_contigs.out`: Information on contigs that have a mapping information but could not be used in the ordering. +* `assembly//abacas/nucmer/`: Folder containing the files generated by the NUCmer algorithm used by ABACAS. + +> **NB:** The value of `` in the output directory name above is determined by the `--assemblers` parameter (Default: 'spades,metaspades,unicycler,minia'). + +### PlasmidID + +[PlasmidID](https://github.com/BU-ISCIII/plasmidID) was used to graphically represent the alignment of the reference genome relative to a given assembly. This helps to visualize the coverage of the reference genome in the assembly. To find more information about the output files refer to the [documentation](https://github.com/BU-ISCIII/plasmidID/wiki/Understanding-the-image:-track-by-track). 
+ +**Output files:** + +* `assembly//plasmidid//` + * `images/_.png`: PNG file with the visualization of the alignment between the viral assembly and the reference viral genome. + * `data/`: Files used for drawing the circos images. + * `database/`: Annotation files used for drawing the circos images. + * `fasta_files`: Folder with fasta files that correspond to the selection of contigs/scaffolds required to reconstruct the reference genome generated in the `images/` folder. + * `log/`: Log files. + +> **NB:** The value of `` in the output directory name above is determined by the `--assemblers` parameter (Default: 'spades,metaspades,unicycler,minia'). + +### Assembly QUAST + +[QUAST](http://bioinf.spbau.ru/quast) is used to generate a single report with which to evaluate the quality of the *de novo* assemblies across all of the samples provided to the pipeline. The HTML results can be opened within any browser (we recommend using Google Chrome). Please see the [QUAST output docs](http://quast.sourceforge.net/docs/manual.html#sec3) for more detailed information regarding the output files. + +**Output files:** + +* `assembly//quast/` + * `report.html`: Results report in HTML format. Also available in various other file formats i.e. `report.pdf`, `report.tex`, `report.tsv` and `report.txt`. + +![MultiQC - QUAST contig counts](images/mqc_quast_plot.png) + +> **NB:** The value of `` in the output directory name above is determined by the `--assemblers` parameter (Default: 'spades,metaspades,unicycler,minia'). + +### Minimap2, seqwish, vg + +[Minimap2](https://github.com/lh3/minimap2) is a versatile sequence alignment program that aligns DNA or mRNA sequences against a large reference database. Minimap2 was used to generate all-versus-all alignments between scaffold assembly contigs and the reference genome. 
+ +[seqwish](https://github.com/ekg/seqwish) implements a lossless conversion from pairwise alignments between sequences to a variation graph encoding the sequences and their alignments. seqwish was used to induce a genome variation graph from the all-versus-all alignment generated by Minimap2. + +[vg](https://github.com/vgteam/vg) is a collection of tools for working with genome variation graphs. vg was used to call variants from the genome variation graph generated by seqwish. + +[Bandage](https://github.com/rrwick/Bandage), a Bioinformatics Application for Navigating De novo Assembly Graphs Easily, is a GUI program that allows users to interact with the assembly graphs made by de novo assemblers and other graphs in GFA format. Bandage was used to render induced genome variation graphs as static PNG and SVG images. + +**Output files:** + +* `assembly//variants/` + * `*.gfa`: Induced genome variation graph. + * `*.vcf.gz`: VCF file with variant annotations. + * `*.vcf.gz.tbi`: Index for VCF file with variant annotations. +* `assembly//variants/bcftools_stats/` + * `*.bcftools_stats.txt`: Statistics and counts for variants in VCF files. +* `assembly//bandage/` + * `*.png`: Bandage visualisation for induced genome variation graph in PNG format. + * `*.svg`: Bandage visualisation for induced genome variation graph in SVG format. + +> **NB:** The value of `` in the output directory name above is determined by the `--assemblers` parameter (Default: 'spades,metaspades,unicycler,minia'). + +### Assembly SnpEff and SnpSift + +[SnpEff](http://snpeff.sourceforge.net/SnpEff.html) is a genetic variant annotation and functional effect prediction toolbox. It annotates and predicts the effects of genetic variants on genes and proteins (such as amino acid changes). + +[SnpSift](http://snpeff.sourceforge.net/SnpSift.html) annotates genomic variants using databases, filters, and manipulates genomic annotated variants. 
After annotation with SnpEff, you can use SnpSift to help filter large genomic datasets in order to find the most significant variants. + +**Output files:** + +* `assembly//variants/snpeff/` + * `*.snpEff.csv`: Variant annotation csv file. + * `*.snpEff.genes.txt`: Gene table for annotated variants. + * `*.snpEff.summary.html`: Summary html file for variants. + * `*.snpEff.vcf.gz`: VCF file with variant annotations. + * `*.snpEff.vcf.gz.tbi`: Index for VCF file with variant annotations. + * `*.snpSift.table.txt`: SnpSift summary table. + +> **NB:** The value of `` in the output directory name above is determined by the `--assemblers` parameter (Default: 'spades,metaspades,unicycler,minia'). + +## Workflow reporting and genomes + +### MultiQC + +[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarizing all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. + +Results generated by MultiQC collate pipeline QC from FastQC, fastp, Cutadapt, Bowtie 2, Kraken 2, VarScan 2, iVar, samtools flagstat, samtools idxstats, samtools stats, picard CollectMultipleMetrics and CollectWgsMetrics, BCFTools, SnpEff and QUAST. + +The default [`multiqc config file`](https://github.com/nf-core/viralrecon/blob/master/assets/multiqc_config.yaml) has been written in a way in which to structure these QC metrics to make them more interpretable in the final report. + +The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>. 
+ +Please click [here](https://raw.githack.com/nf-core/viralrecon/master/docs/html/multiqc_report.html) to see an example MultiQC report generated using the parameters defined in [this configuration file](https://github.com/nf-core/viralrecon/blob/master/conf/test_full.config) to run the pipeline on [samples](https://zenodo.org/record/3735111) which were prepared from the [ncov-2019 ARTIC Network V1 amplicon set](https://artic.network/ncov-2019) and sequenced on the Illumina MiSeq platform in 301bp paired-end format. + +**Output files:** + +* `multiqc/` + * `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. + * `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. + * `multiqc_plots/`: directory containing static images from the report in various formats. + +### Reference genome files + +A number of genome-specific files are generated by the pipeline because they are required for the downstream processing of the results. If the `--save_reference` parameter is provided then the Bowtie 2 alignment indices, BLAST and Kraken 2 databases downloaded/generated by the pipeline will be saved in the `genome/` directory. It is recommended to use the `--save_reference` parameter if you are using the pipeline to build a Kraken 2 database for the host genome. This can be quite a time-consuming process and it permits their reuse for future runs of the pipeline or for other purposes. + +**Output files:** + +* `genome/` + * `BlastDB/`: BLAST database for viral genome. + * `Bowtie2Index/`: Bowtie 2 index for viral genome. + * `kraken2_/`: Kraken 2 database for host genome. + * `SnpEffDB/`: SnpEff database for viral genome. + * `snpeff.config`: SnpEff config file for viral genome. 
+ * Unzipped genome fasta file for viral genome + * Unzipped genome annotation GFF file for viral genome + +### Pipeline information + +[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. + +**Output files:** + +* `pipeline_info/` + * Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. + * Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.csv`. + * Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`. + * Documentation for interpretation of results in HTML format: `results_description.html`. diff --git a/docs/usage.md b/docs/usage.md index 338252e8..7089eaeb 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -9,12 +9,60 @@ * [Reproducibility](#reproducibility) * [Main arguments](#main-arguments) * [`-profile`](#-profile) - * [`--reads`](#--reads) - * [`--single_end`](#--single_end) + * [`--input`](#--input) + * [`--protocol`](#--protocol) + * [`--amplicon_bed`](#--amplicon_bed) + * [`--amplicon_fasta`](#--amplicon_fasta) +* [SRA download](#sra-download) + * [`--save_sra_fastq`](#--save_sra_fastq) + * [`--skip_sra`](#--skip_sra) * [Reference genomes](#reference-genomes) - * [`--genome` (using iGenomes)](#--genome-using-igenomes) + * [`--genome`](#--genome) * [`--fasta`](#--fasta) - * [`--igenomes_ignore`](#--igenomes_ignore) + * [`--gff`](#--gff) + * [`--save_reference`](#--save_reference) +* [Kraken 2](#kraken-2) + * [`--kraken2_db`](#--kraken2_db) + * [`--kraken2_db_name`](#--kraken2_db_name) + * [`--kraken2_use_ftp`](#--kraken2_use_ftp) + * [`--save_kraken2_fastq`](#--save_kraken2_fastq) 
+ * [`--skip_kraken2`](#--skip_kraken2) +* [Read trimming](#read-trimming) + * [`--cut_mean_quality`](#--cut_mean_quality) + * [`--qualified_quality_phred`](#--qualified_quality_phred) + * [`--unqualified_percent_limit`](#--unqualified_percent_limit) + * [`--min_trim_length`](#--min_trim_length) + * [`--skip_adapter_trimming`](#--skip_adapter_trimming) + * [`--skip_amplicon_trimming`](#--skip_amplicon_trimming) + * [`--save_trimmed`](#--save_trimmed) +* [Variant calling](#variant-calling) + * [`--callers`](#-callers) + * [`--ivar_exclude_reads`](#--ivar_exclude_reads) + * [`--filter_dups`](#--filter_dups) + * [`--filter_unmapped`](#--filter_unmapped) + * [`--min_base_qual`](#--min_base_qual) + * [`--max_allele_freq`](#--max_allele_freq) + * [`--min_coverage`](#--min_coverage) + * [`--save_align_intermeds`](#--save_align_intermeds) + * [`--save_mpileup`](#--save_mpileup) + * [`--skip_markduplicates`](#--skip_markduplicates) + * [`--skip_snpeff`](#--skip_snpeff) + * [`--skip_variants_quast`](#--skip_variants_quast) + * [`--skip_variants`](#--skip_variants) +* [De novo assembly](#de-novo-assembly) + * [`--assemblers`](#--assemblers) + * [`--minia_kmer`](#--minia_kmer) + * [`--skip_blast`](#--skip_blast) + * [`--skip_abacas`](#--skip_abacas) + * [`--skip_plasmidid`](#--skip_plasmidid) + * [`--skip_vg`](#--skip_vg) + * [`--skip_assembly_quast`](#--skip_assembly_quast) + * [`--skip_assembly`](#--skip_assembly) +* [Skipping QC steps](#skipping-qc-steps) + * `--skip_fastqc` + * `--skip_picard_metrics` + * `--skip_multiqc` + * `--skip_qc` * [Job resources](#job-resources) * [Automatic resubmission](#automatic-resubmission) * [Custom resource requests](#custom-resource-requests) @@ -41,7 +89,7 @@ ## Introduction -Nextflow handles job submissions on SLURM or other environments, and supervises running the jobs. Thus the Nextflow process must run until the pipeline is finished. 
We recommend that you put the process running in the background through `screen` / `tmux` or similar tool. Alternatively you can run nextflow within a cluster job submitted your job scheduler. +Nextflow handles job submissions on SLURM or other environments, and supervises running the jobs. Thus the Nextflow process must run until the pipeline is finished. We recommend that you put the process running in the background through `screen` / `tmux` or similar tool. Alternatively you can run nextflow within a cluster job submitted your job scheduler. Finally, you can use nextflow `-bg` flag to execute nextflow in background. It is recommended to limit the Nextflow Java virtual machines memory. We recommend adding the following line to your environment (typically in `~/.bashrc` or `~./bash_profile`): @@ -49,14 +97,12 @@ It is recommended to limit the Nextflow Java virtual machines memory. We recomme NXF_OPTS='-Xms1g -Xmx4g' ``` - - ## Running the pipeline The typical command for running the pipeline is as follows: ```bash -nextflow run nf-core/viralrecon --reads '*_R{1,2}.fastq.gz' -profile docker +nextflow run nf-core/viralrecon --input samplesheet.csv --genome 'NC_045512.2' -profile docker ``` This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. @@ -80,7 +126,7 @@ nextflow pull nf-core/viralrecon ### Reproducibility -It's a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. +It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. 
If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. First, go to the [nf-core/viralrecon releases page](https://github.com/nf-core/viralrecon/releases) and find the latest version number - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. @@ -110,92 +156,300 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof * A generic configuration profile to be used with [Singularity](http://singularity.lbl.gov/) * Pulls software from DockerHub: [`nfcore/viralrecon`](http://hub.docker.com/r/nfcore/viralrecon/) * `conda` - * Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker or Singularity. + * Please only use Conda as a last resort i.e. when it is not possible to run the pipeline with Docker or Singularity. * A generic configuration profile to be used with [Conda](https://conda.io/docs/) * Pulls most software from [Bioconda](https://bioconda.github.io/) * `test` * A profile with a complete configuration for automated testing * Includes links to test data so needs no other parameters - +### `--input` + +You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. + +```bash +--input '[path to samplesheet file]' +``` + +#### Format -### `--reads` +The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once (e.g. to increase sequencing depth). The pipeline will perform the analysis in parallel, and subsequently merge them when required. -Use this to specify the location of your input FastQ files. For example: +A final design file may look something like the one below. 
`SAMPLE_1` was sequenced twice in Illumina PE format, `SAMPLE_2` was sequenced once in Illumina SE format, and `SRR11605097`, `GSM4432381` and `ERX4009132` need to be downloaded from the ENA/SRA before the main pipeline execution. ```bash ---reads 'path/to/data/sample_*_{1,2}.fastq' +sample,fastq_1,fastq_2 +SAMPLE_1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz +SAMPLE_1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz +SAMPLE_2,AEG588A2_S4_L003_R1_001.fastq.gz, +SRR11605097,, +GSM4432381,, +ERX4009132,, ``` -Please note the following requirements: +| Column | Description | +|-----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `sample` | Custom sample name or [database identifier](#supported-public-repository-ids). This entry will be identical for multiple sequencing libraries/runs from the same sample. | +| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -1. The path must be enclosed in quotes -2. The path must have at least one `*` wildcard character -3. When using the pipeline with paired end data, the path must use `{1,2}` notation to specify read pairs. +#### Supported public repository ids -If left unspecified, a default pattern is used: `data/*{1,2}.fastq.gz` +The pipeline has been set-up to automatically download and process the raw FastQ files from public repositories. 
Currently, the following identifiers are supported: -### `--single_end` +| `SRA` | `ENA` | `GEO` | +|--------------|--------------|------------| +| SRR11605097 | ERR4007730 | GSM4432381 | +| SRX8171613 | ERX4009132 | GSE147507 | +| SRS6531847 | ERS4399630 | | +| SAMN14689442 | SAMEA6638373 | | +| SRP256957 | ERP120836 | | +| SRA1068758 | ERA2420837 | | +| PRJNA625551 | PRJEB37513 | | -By default, the pipeline expects paired-end data. If you have single-end data, you need to specify `--single_end` on the command line when you launch the pipeline. A normal glob pattern, enclosed in quotation marks, can then be used for `--reads`. For example: +If `SRR`/`ERR` run ids are provided then these will be resolved back to their appropriate `SRX`/`ERX` ids to be able to merge multiple runs from the same experiment. + +The final sample information for all identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5 sums. If download links exist, the files will be downloaded by FTP otherwise they will be downloaded using [`parallel-fastq-dump`](https://github.com/rvalieris/parallel-fastq-dump). + +### `--protocol` + +Specifies the type of protocol used for sequencing i.e. 'metagenomic' or 'amplicon' (Default: 'metagenomic'). + +### `--amplicon_bed` + +If the `--protocol amplicon` parameter is provided then iVar is used to trim amplicon primer sequences after read alignment and before variant calling. iVar uses the primer positions relative to the viral genome supplied in `--amplicon_bed` to soft clip primer sequences from a coordinate sorted BAM file. 
The file must be in [BED](https://genome.ucsc.edu/FAQ/FAQformat.html#format1) format as highlighted below: ```bash ---single_end --reads '*.fastq' +NC_045512.2 30 54 nCoV-2019_1_LEFT 60 - +NC_045512.2 385 410 nCoV-2019_1_RIGHT 60 + +NC_045512.2 320 342 nCoV-2019_2_LEFT 60 - +NC_045512.2 704 726 nCoV-2019_2_RIGHT 60 + ``` -It is not possible to run a mixture of single-end and paired-end files in one run. +### `--amplicon_fasta` -## Reference genomes +If the `--protocol amplicon` parameter is provided then Cutadapt is used to trim amplicon primer sequences from FastQ files before *de novo* assembly. This file must contain amplicon primer sequences in Fasta format and is mandatory when `--protocol amplicon` is specified. An example is shown below: -The pipeline config files come bundled with paths to the illumina iGenomes reference index files. If running with docker or AWS, the configuration is set up to use the [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) resource. +```bash +>nCoV-2019_1_LEFT +ACCAACCAACTTTCGATCTCTTGT +>nCoV-2019_1_RIGHT +CATCTTTAAGATGTTGACGTGCCTC +>nCoV-2019_2_LEFT +CTGTTTTACAGGTTCGCGACGT +>nCoV-2019_2_RIGHT +TAAGGATCAGTGCCAAGCTCGT +>nCoV-2019_3_LEFT +CGGTAATAAAGGAGCTGGTGGC +>nCoV-2019_3_RIGHT +AAGGTGTCTGCAATTCATAGCTCT +``` + +## SRA download -### `--genome` (using iGenomes) +### `--save_sra_fastq` -There are 31 different species supported in the iGenomes references. To run the pipeline, you must specify which to use with the `--genome` flag. +Save FastQ files created from SRA identifiers in the results directory (Default: false). -You can find the keys to specify the genomes in the [iGenomes config file](../conf/igenomes.config). Common genomes that are supported are: +### `--skip_sra` -* Human - * `--genome GRCh37` -* Mouse - * `--genome GRCm38` -* _Drosophila_ - * `--genome BDGP6` -* _S. cerevisiae_ - * `--genome 'R64-1-1'` +Skip steps involving the download and validation of FastQ files using SRA identifiers (Default: false). 
-> There are numerous others - check the config file for more. +## Reference genomes -Note that you can use the same configuration setup to save sets of reference files for your own use, even if they are not part of the iGenomes resource. See the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for instructions on where to save such a file. +### `--genome` -The syntax for this reference configuration is as follows: +This parameter allows you to provide a key for the viral genome you would like to use with the pipeline. To run the pipeline, you must specify which to use with the `--genome` flag. + +Note that you can use the same configuration setup to save sets of reference files for your own use. See the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for instructions on where to save such a file. - +The syntax for this reference configuration is as follows: ```nextflow params { + // Genome reference file paths genomes { - 'GRCh37' { - fasta = '' // Used if no star index given + 'NC_045512.2' { + fasta = "" + gff = "" + } + 'MN908947.3' { + fasta = "" + gff = "" } // Any number of additional genomes, key is used with --genome } } ``` - +You can find the keys to specify the genomes in the [Genomes config file](https://github.com/nf-core/configs/blob/master/conf/pipeline/viralrecon/genomes.config). ### `--fasta` -If you prefer, you can specify the full path to your reference genome when you run the pipeline: +Full path to Fasta file containing reference genome for the viral species (*mandatory* if `--genome` is not specified). If you don't have a Bowtie2 index available this will be generated for you automatically. Combine with `--save_reference` to save Bowtie2 index for future runs. ```bash --fasta '[path to Fasta reference]' ``` -### `--igenomes_ignore` +### `--gff` + +Full path to viral [GFF](http://www.gmod.org/wiki/GFF3) annotation file (Default: false). 
+ +### `--save_reference` + +If the Bowtie2 index is generated by the pipeline use this parameter to save it to your results folder. These can then be used for future pipeline runs, reducing processing times (Default: false). + +## Kraken 2 + +### `--kraken2_db` + +Full path to Kraken 2 database built from host genome (Default: ''). + +### `--kraken2_db_name` + +Name for host genome as recognised by Kraken 2 when using the `kraken2 build` command (Default: 'human'). + +### `--kraken2_use_ftp` + +Option for Kraken 2 using ftp download instead of rsync (Default: false). + +### `--save_kraken2_fastq` + +Save the host and viral FastQ files in the results directory (Default: false). + +### `--skip_kraken2` + +Skip Kraken 2 process for removing host classified reads (Default: false). + +## Read trimming + +### `--cut_mean_quality` + +The mean quality requirement option shared by fastp cut_front, cut_tail or cut_sliding options. Range: 1~36 (Default: 30 (Q30)). + +### `--qualified_quality_phred` + +The quality value that a base is qualified. Default 30 means phred quality >=Q30 is qualified (Default: 30). + +### `--unqualified_percent_limit` + +Percentage of bases that are allowed to be unqualified (0~100) (Default: 10). + +### `--min_trim_length` + +Reads shorter than this length after trimming will be discarded (Default: 50). + +### `--skip_adapter_trimming` + +Skip the adapter trimming step performed by fastp. Use this if your input FastQ files have already been trimmed outside of the workflow or if you're very confident that there is no adapter contamination in your data (Default: false). + +### `--skip_amplicon_trimming` + +Skip the amplicon trimming step performed by Cutadapt. Use this if your input FastQ files have already been trimmed outside of the workflow or if you're very confident that there is no primer sequence contamination in your data (Default: false). + +### `--save_trimmed` -Do not load `igenomes.config` when running the pipeline. 
You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`. +By default, trimmed FastQ files will not be saved to the results directory. Specify this flag (or set to true in your config file) to copy these files to the results directory when complete (Default: false). + +## Variant calling + +### `--callers` + +Specify which variant calling algorithms you would like to use. Available options are `varscan2`, `ivar` and `bcftools` (Default: 'varscan2,ivar,bcftools'). + +### `--ivar_exclude_reads` + +This option unsets the `-e` parameter in `ivar trim` to discard reads without primers (Default: false). + +### `--filter_dups` + +Remove duplicate reads from alignments as identified by picard MarkDuplicates (Default: false). Note that unless you are using [UMIs](https://emea.illumina.com/science/sequencing-method-explorer/kits-and-arrays/umi.html) it is not possible to establish whether the fragments you have sequenced were derived via true biological duplication (i.e. sequencing independent template fragments) or as a result of PCR biases introduced during the library preparation. + +### `--filter_unmapped` + +Remove unmapped reads from alignments (Default: false). + +### `--min_base_qual` + +When performing variant calling skip bases with baseQ/BAQ smaller than this number (Default: 20). + +### `--min_coverage` + +When performing variant calling skip positions with an overall read depth smaller than this number (Default: 10). + +### `--max_allele_freq` + +Maximum allele frequency threshold for filtering variant calls (Default: 0.8). + +### `--save_align_intermeds` + +By default, intermediate [BAM](https://samtools.github.io/hts-specs/) files will not be saved. The final BAM files created after the appropriate filtering step are always saved to limit storage usage. Set to true to also save other intermediate BAM files (Default: false). + +### `--save_mpileup` + +Save Pileup files in the results directory. 
These tend to be quite large so are not saved by default (Default: false). + +### `--skip_markduplicates` + +Skip picard MarkDuplicates step (Default: false). + +### `--skip_snpeff` + +Skip SnpEff and SnpSift annotation of variants (Default: false). + +### `--skip_variants_quast` + +Skip generation of QUAST aggregated report for consensus sequences (Default: false). + +### `--skip_variants` + +Specify this parameter to skip all of the variant calling and mapping steps in the pipeline (Default: false). + +## De novo assembly + +### `--assemblers` + +Specify which assembly algorithms you would like to use. Available options are `spades`, `metaspades`, `unicycler` and `minia` (Default: 'spades,metaspades,unicycler,minia'). + +### `--minia_kmer` + +Kmer size to use when running minia (Default: 31). + +### `--skip_blast` + +Skip blastn of assemblies relative to reference genome (Default: false). + +### `--skip_abacas` + +Skip ABACAS process for assembly contiguation (Default: false). + +### `--skip_plasmidid` + +Skip assembly report generation by PlasmidID (Default: false). + +### `--skip_vg` + +Skip variant graph creation and variant calling relative to reference genome (Default: false). + +### `--skip_assembly_quast` + +Skip generation of QUAST aggregated report for assemblies (Default: false). + +### `--skip_assembly` + +Specify this parameter to skip all of the de novo assembly steps in the pipeline (Default: false). + +## Skipping QC steps + +The pipeline contains a large number of quality control steps. Sometimes, it may not be desirable to run all of them if time and compute resources are limited. 
The following options make this easy: + +| Step | Description | +|---------------------------|----------------------------------------------------------| +| `--skip_fastqc` | Skip FastQC | +| `--skip_picard_metrics` | Skip Picard CollectMultipleMetrics and CollectWgsMetrics | +| `--skip_multiqc` | Skip MultiQC | +| `--skip_qc` | Skip all QC steps except for MultiQC | ## Job resources @@ -225,14 +479,12 @@ The AWS region in which to run your job. Default is set to `eu-west-1` but can b ### `--awscli` -The [AWS CLI](https://www.nextflow.io/docs/latest/awscloud.html#aws-cli-installation) path in your custom AMI. Default: `/home/ec2-user/miniconda/bin/aws`. +The [AWS CLI](https://www.nextflow.io/docs/latest/awscloud.html#aws-cli-installation) path in your custom AMI (Default: `/home/ec2-user/miniconda/bin/aws`). Please make sure to also set the `-w/--work-dir` and `--outdir` parameters to a S3 storage bucket of your choice - you'll get an error message notifying you if you didn't. ## Other command line parameters - - ### `--outdir` The output directory where the results will be saved. @@ -275,7 +527,7 @@ Note - you can use this to override pipeline defaults. ### `--custom_config_version` -Provide git commit id for custom Institutional configs hosted at `nf-core/configs`. This was implemented for reproducibility purposes. Default: `master`. +Provide git commit id for custom Institutional configs hosted at `nf-core/configs`. This was implemented for reproducibility purposes (Default: `master`). ```bash ## Download and use config file with following git commid id @@ -284,7 +536,7 @@ Provide git commit id for custom Institutional configs hosted at `nf-core/config ### `--custom_config_base` -If you're running offline, nextflow will not be able to fetch the institutional config files +If you're running offline, Nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. 
If you do need them, you should download the files from the repo and tell nextflow where to find them with the `custom_config_base` option. For example: diff --git a/environment.yml b/environment.yml index e01930db..417f1353 100644 --- a/environment.yml +++ b/environment.yml @@ -1,15 +1,51 @@ # You can use this file to create a conda environment for this pipeline: # conda env create -f environment.yml -name: nf-core-viralrecon-1.0dev +name: nf-core-viralrecon-1.0.0 channels: - conda-forge - bioconda - defaults + - hcc dependencies: - - conda-forge::python=3.7.3 - - conda-forge::markdown=3.1.1 - - conda-forge::pymdown-extensions=6.0 - - conda-forge::pygments=2.5.2 - # TODO nf-core: Add required software dependencies here - - bioconda::fastqc=0.11.8 - - bioconda::multiqc=1.7 + ## conda-forge packages + - conda-forge::python=3.6.10 + - conda-forge::markdown=3.2.2 + - conda-forge::pymdown-extensions=7.1 + - conda-forge::pygments=2.6.1 + - conda-forge::pigz=2.3.4 + - conda-forge::r-base=3.6.2 + - conda-forge::bc=1.07.1 + + ## bioconda packages + ## common + - bioconda::fastqc=0.11.9 + - bioconda::parallel-fastq-dump=0.6.6 + - bioconda::sra-tools=2.10.3 + - bioconda::fastp=0.20.1 + - bioconda::samtools=1.9 + - bioconda::bedtools=2.29.2 + - bioconda::multiqc=1.9 + + ## variants + - bioconda::bowtie2=2.3.5.1 + - bioconda::picard=2.22.8 + - bioconda::ivar=1.2.2 + - bioconda::bcftools=1.9 + - bioconda::varscan=2.4.4 + - bioconda::snpeff=4.5covid19 + - bioconda::snpsift=4.3.1t + + ## assembly + - bioconda::cutadapt=2.10 + - bioconda::kraken2=2.0.9beta + - bioconda::spades=3.14.0 + - bioconda::unicycler=0.4.7 + - bioconda::minia=3.2.3 + - bioconda::minimap2=2.17 + - bioconda::seqwish=0.4.1 + - bioconda::vg=1.24.0 + - bioconda::quast=5.0.2 + - bioconda::blast=2.9.0 + - bioconda::plasmidid=1.5.2 + - bioconda::bandage=0.8.1 + - hcc::abacas=1.3.1 diff --git a/main.nf b/main.nf index 3236ac18..c2d48f58 100644 --- a/main.nf +++ b/main.nf @@ -10,7 +10,6 @@ */ def helpMessage() { - // 
TODO nf-core: Add to this help message with new command line parameters log.info nfcoreHeader() log.info""" @@ -18,141 +17,2959 @@ def helpMessage() { The typical command for running the pipeline is as follows: - nextflow run nf-core/viralrecon --reads '*_R{1,2}.fastq.gz' -profile docker + nextflow run nf-core/viralrecon --input samplesheet.csv --genome 'NC_045512.2' -profile docker - Mandatory arguments: - --reads [file] Path to input data (must be surrounded with quotes) - -profile [str] Configuration profile to use. Can use multiple (comma separated) - Available: conda, docker, singularity, test, awsbatch, and more + Mandatory arguments + --input [file] Comma-separated file containing information about the samples in the experiment (see docs/usage.md) + --fasta [file] Path to fasta reference for viral genome. Mandatory when --genome not supplied + --amplicon_bed [file] Path to BED file containing amplicon positions. Mandatory when --protocol 'amplicon' + --amplicon_fasta [file] Path to fasta file containing amplicon sequences. Mandatory when --protocol 'amplicon' + -profile [str] Configuration profile to use. Can use multiple (comma separated) + Available: conda, docker, singularity, test, awsbatch, and more - Options: - --genome [str] Name of iGenomes reference - --single_end [bool] Specifies that the input is single-end reads + Generic + --protocol [str] Specifies the type of protocol used for sequencing i.e. 
'metagenomic' or 'amplicon' (Default: 'metagenomic') - References If not specified in the configuration file or you wish to overwrite any of the references - --fasta [file] Path to fasta reference + SRA download + --save_sra_fastq [bool] Save FastQ files created from SRA identifiers in the results directory (Default: false) + --skip_sra [bool] Skip steps involving the download and validation of FastQ files using SRA identifiers (Default: false) + + References If not specified in the configuration file or you wish to overwrite any of the references + --genome [str] Name of genome reference key for viral genome (Default: '') + --gff [file] Full path to viral gff annotation file (Default: '') + --save_reference [bool] If generated by the pipeline save the Bowtie2 indices in the results directory (Default: false) + + Kraken2 + --kraken2_db [file] Full path to Kraken2 database built from host genome (Default: kraken2_human.tar.gz hosted on Zenodo) + --kraken2_db_name [str] Name of host genome for building Kraken2 database (Default: 'human') + --kraken2_use_ftp [bool] Use FTP instead of rsync when building kraken2 databases (Default: false) + --save_kraken2_fastq [bool] Save the host and viral fastq files in the results directory (Default: false) + --skip_kraken2 [bool] Skip Kraken2 process for removing host classified reads (Default: false) + + Read trimming + --cut_mean_quality [int] The mean quality requirement option shared by fastp cut_front, cut_tail or cut_sliding options. Range: 1~36 (Default: 30 (Q30)) + --qualified_quality_phred [int] The quality value that a base is qualified. 
Default 30 means phred quality >=Q30 is qualified (Default: 30) + --unqualified_percent_limit [int] Percentage of bases that are allowed to be unqualified (0~100) (Default: 10) + --min_trim_length [int] Reads shorter than this length after trimming will be discarded (Default: 50) + --skip_adapter_trimming [bool] Skip the adapter trimming step with fastp (Default: false) + --skip_amplicon_trimming [bool] Skip the amplicon trimming step with Cutadapt (Default: false) + --save_trimmed [bool] Save the trimmed FastQ files in the results directory (Default: false) + + Variant calling + --callers [str] Specify which variant calling algorithms you would like to use (Default: 'varscan2,ivar,bcftools') + --ivar_exclude_reads [bool] Unset -e parameter for iVar trim. Reads with primers are included by default (Default: false) + --filter_dups [bool] Remove duplicate reads from alignments as identified by picard MarkDuplicates (Default: false) + --filter_unmapped [bool] Remove unmapped reads from alignments (Default: false) + --min_base_qual [int] When performing variant calling skip bases with baseQ/BAQ smaller than this number (Default: 20) + --min_coverage [int] When performing variant calling skip positions with an overall read depth smaller than this number (Default: 10) + --max_allele_freq [float] Maximum allele frequency threshold for filtering variant calls (Default: 0.8) + --save_align_intermeds [bool] Save the intermediate BAM files from the alignment steps (Default: false) + --save_mpileup [bool] Save MPileup files generated during variant calling (Default: false) + --skip_markduplicates [bool] Skip picard MarkDuplicates step (Default: false) + --skip_snpeff [bool] Skip SnpEff and SnpSift annotation of variants (Default: false) + --skip_variants_quast [bool] Skip generation of QUAST aggregated report for consensus sequences (Default: false) + --skip_variants [bool] Skip variant calling steps in the pipeline (Default: false) + + De novo assembly + --assemblers [str] 
Specify which assembly algorithms you would like to use (Default: 'spades,metaspades,unicycler,minia') + --minia_kmer [int] Kmer size to use when running minia (Default: 31) + --skip_blast [bool] Skip blastn of assemblies relative to reference genome (Default: false) + --skip_abacas [bool] Skip ABACAS process for assembly contiguation (Default: false) + --skip_plasmidid [bool] Skip assembly report generation by PlasmidID (Default: false) + --skip_vg [bool] Skip variant graph creation and variant calling relative to reference (Default: false) + --skip_assembly_quast [bool] Skip generation of QUAST aggregated report for assemblies (Default: false) + --skip_assembly [bool] Skip assembly steps in the pipeline (Default: false) + + QC + --skip_fastqc [bool] Skip FastQC (Default: false) + --skip_picard_metrics Skip Picard CollectMultipleMetrics and CollectWgsMetrics (Default: false) + --skip_multiqc [bool] Skip MultiQC (Default: false) + --skip_qc [bool] Skip all QC steps apart from MultiQC (Default: false) Other options: - --outdir [file] The output directory where the results will be saved - --email [email] Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits - --email_on_fail [email] Same as --email, except only send mail if the workflow is not successful - --max_multiqc_email_size [str] Theshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) - -name [str] Name for the pipeline run. 
If not specified, Nextflow will automatically generate a random mnemonic + --outdir [file] The output directory where the results will be saved + --email [email] Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits + --email_on_fail [email] Same as --email, except only send mail if the workflow is not successful + --max_multiqc_email_size [str] Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) + -name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic + + AWSBatch options: + --awsqueue [str] The AWSBatch JobQueue that needs to be set when running on AWSBatch + --awsregion [str] The AWS Region for your AWS Batch job to run on + --awscli [str] Path to the AWS CLI tool + """.stripIndent() +} + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +/* -- -- */ +/* -- SET UP CONFIGURATION VARIABLES -- */ +/* -- -- */ +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +// Show help message +if (params.help) { + helpMessage() + exit 0 +} + +// Has the run name been specified by the user? +// this has the bonus effect of catching both -name and --name +custom_runName = params.name +if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { + custom_runName = workflow.runName +} + +//////////////////////////////////////////////////// +/* -- VALIDATE INPUTS -- */ +//////////////////////////////////////////////////// + +if (params.input) { ch_input = file(params.input, checkIfExists: true) } else { exit 1, "Input samplesheet file not specified!" 
} + +if (params.protocol != 'metagenomic' && params.protocol != 'amplicon') { + exit 1, "Invalid protocol option: ${params.protocol}. Valid options: 'metagenomic' or 'amplicon'!" +} + +if (params.protocol == 'amplicon' && !params.skip_assembly && !params.amplicon_fasta) { + exit 1, "To perform de novo assembly in 'amplicon' mode please provide a valid amplicon fasta file!" +} +if (params.amplicon_fasta) { ch_amplicon_fasta = file(params.amplicon_fasta, checkIfExists: true) } + +if (params.protocol == 'amplicon' && !params.skip_variants && !params.amplicon_bed) { + exit 1, "To perform variant calling in 'amplicon' mode please provide a valid amplicon BED file!" +} +if (params.amplicon_bed) { ch_amplicon_bed = file(params.amplicon_bed, checkIfExists: true) } + +callerList = [ 'varscan2', 'ivar', 'bcftools'] +callers = params.callers ? params.callers.split(',').collect{ it.trim().toLowerCase() } : [] +if ((callerList + callers).unique().size() != callerList.size()) { + exit 1, "Invalid variant caller option: ${params.callers}. Valid options: ${callerList.join(', ')}" +} + +assemblerList = [ 'spades', 'metaspades', 'unicycler', 'minia' ] +assemblers = params.assemblers ? params.assemblers.split(',').collect{ it.trim().toLowerCase() } : [] +if ((assemblerList + assemblers).unique().size() != assemblerList.size()) { + exit 1, "Invalid assembler option: ${params.assemblers}. Valid options: ${assemblerList.join(', ')}" +} + +// Viral reference files +if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { + exit 1, "The provided genome '${params.genome}' is not available in the Genome file. Currently the available genomes are ${params.genomes.keySet().join(", ")}" +} +params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false +params.gff = params.genome ? 
params.genomes[ params.genome ].gff ?: false : false + +if (params.fasta) { + file(params.fasta, checkIfExists: true) + + lastPath = params.fasta.lastIndexOf(File.separator) + lastExt = params.fasta.lastIndexOf(".") + fasta_base = params.fasta.substring(lastPath+1) + index_base = params.fasta.substring(lastPath+1,lastExt) + if (params.fasta.endsWith('.gz')) { + fasta_base = params.fasta.substring(lastPath+1,lastExt) + index_base = fasta_base.substring(0,fasta_base.lastIndexOf(".")) + } +} else { + exit 1, "Viral genome fasta file not specified!" +} + +//////////////////////////////////////////////////// +/* -- CONFIG FILES -- */ +//////////////////////////////////////////////////// + +ch_multiqc_config = file("$baseDir/assets/multiqc_config.yaml", checkIfExists: true) +ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() +ch_output_docs = file("$baseDir/docs/output.md", checkIfExists: true) +ch_output_docs_images = file("$baseDir/docs/images/", checkIfExists: true) + +//////////////////////////////////////////////////// +/* -- HEADER FILES -- */ +//////////////////////////////////////////////////// + +ch_blast_outfmt6_header = file("$baseDir/assets/headers/blast_outfmt6_header.txt", checkIfExists: true) +ch_ivar_variants_header_mqc = file("$baseDir/assets/headers/ivar_variants_header_mqc.txt", checkIfExists: true) + +//////////////////////////////////////////////////// +/* -- AWS -- */ +//////////////////////////////////////////////////// + +// Check AWS batch settings +if (workflow.profile.contains('awsbatch')) { + // AWSBatch sanity checking + if (!params.awsqueue || !params.awsregion) exit 1, "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" 
+ // Check outdir paths to be S3 buckets if running on AWSBatch + // related: https://github.com/nextflow-io/nextflow/issues/813 + if (!params.outdir.startsWith('s3:')) exit 1, "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!" + // Prevent trace files to be stored on S3 since S3 does not support rolling files. + if (params.tracedir.startsWith('s3:')) exit 1, "Specify a local tracedir or run without trace! S3 cannot be used for tracefiles." +} + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +/* -- -- */ +/* -- HEADER LOG INFO -- */ +/* -- -- */ +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +// Header log info +log.info nfcoreHeader() +def summary = [:] +if (workflow.revision) summary['Pipeline Release'] = workflow.revision +summary['Run Name'] = custom_runName ?: workflow.runName +summary['Samplesheet'] = params.input +summary['Protocol'] = params.protocol +if (params.protocol == 'amplicon') summary['Amplicon Fasta File'] = params.amplicon_fasta +if (params.protocol == 'amplicon') summary['Amplicon BED File'] = params.amplicon_bed +summary['Viral Genome'] = params.genome ?: 'Not supplied' +summary['Viral Fasta File'] = params.fasta +if (params.gff) summary['Viral GFF'] = params.gff +if (params.save_reference) summary['Save Genome Indices'] = 'Yes' +if (params.save_sra_fastq) summary['Save SRA FastQ'] = params.save_sra_fastq +if (params.skip_sra) summary['Skip SRA Download'] = params.skip_sra +if (!params.skip_kraken2) { + if (params.kraken2_db) summary['Host Kraken2 DB'] = params.kraken2_db + if (params.kraken2_db_name) summary['Host Kraken2 Name'] = params.kraken2_db_name + if (params.kraken2_use_ftp) summary['Kraken2 Use FTP'] = params.kraken2_use_ftp + if (params.save_kraken2_fastq) summary['Save Kraken2 FastQ'] = 
params.save_kraken2_fastq +} else { + summary['Skip Kraken2'] = 'Yes' +} +if (!params.skip_adapter_trimming) { + if (params.cut_mean_quality) summary['Cut Mean Quality'] = params.cut_mean_quality + if (params.qualified_quality_phred) summary['Qualified Phred'] = params.qualified_quality_phred + if (params.unqualified_percent_limit) summary['Unqualified Perc Limit'] = params.unqualified_percent_limit + if (params.min_trim_length) summary['Min Trim Length'] = params.min_trim_length +} else { + summary['Skip Adapter Trimming'] = 'Yes' +} +if (params.skip_amplicon_trimming) summary['Skip Amplicon Trimming'] = 'Yes' +if (params.save_trimmed) summary['Save Trimmed'] = 'Yes' +if (!params.skip_variants) { + summary['Variant Calling Tools'] = params.callers + if (params.ivar_exclude_reads) summary['iVar Trim Exclude'] = 'Yes' + if (params.filter_dups) summary['Remove Duplicate Reads'] = 'Yes' + if (params.filter_unmapped) summary['Remove Unmapped Reads'] = 'Yes' + summary['Min Base Quality'] = params.min_base_qual + summary['Min Read Depth'] = params.min_coverage + summary['Max Allele Freq'] = params.max_allele_freq + if (params.save_align_intermeds) summary['Save Align Intermeds'] = 'Yes' + if (params.save_mpileup) summary['Save MPileup'] = 'Yes' + if (params.skip_markduplicates) summary['Skip MarkDuplicates'] = 'Yes' + if (params.skip_snpeff) summary['Skip SnpEff'] = 'Yes' + if (params.skip_variants_quast) summary['Skip Variants QUAST'] = 'Yes' +} else { + summary['Skip Variant Calling'] = 'Yes' +} +if (!params.skip_assembly) { + summary['Assembly Tools'] = params.assemblers + summary['Minia Kmer Size'] = params.minia_kmer + if (params.skip_vg) summary['Skip Variant Graph'] = 'Yes' + if (params.skip_blast) summary['Skip BLAST'] = 'Yes' + if (params.skip_abacas) summary['Skip ABACAS'] = 'Yes' + if (params.skip_plasmidid) summary['Skip PlasmidID'] = 'Yes' + if (params.skip_assembly_quast) summary['Skip Assembly QUAST'] = 'Yes' +} else { + summary['Skip Assembly'] = 'Yes' +} 
+if (!params.skip_qc) { + if (params.skip_fastqc) summary['Skip FastQC'] = 'Yes' + if (params.skip_picard_metrics) summary['Skip Picard Metrics'] = 'Yes' +} else { + summary['Skip QC'] = 'Yes' +} +if (params.skip_multiqc) summary['Skip MultiQC'] = 'Yes' +summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" +if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" +summary['Output dir'] = params.outdir +summary['Publish dir mode'] = params.publish_dir_mode +summary['Launch dir'] = workflow.launchDir +summary['Working dir'] = workflow.workDir +summary['Script dir'] = workflow.projectDir +summary['User'] = workflow.userName +if (workflow.profile.contains('awsbatch')) { + summary['AWS Region'] = params.awsregion + summary['AWS Queue'] = params.awsqueue + summary['AWS CLI'] = params.awscli +} +summary['Config Profile'] = workflow.profile +if (params.config_profile_description) summary['Config Description'] = params.config_profile_description +if (params.config_profile_contact) summary['Config Contact'] = params.config_profile_contact +if (params.config_profile_url) summary['Config URL'] = params.config_profile_url +if (params.email || params.email_on_fail) { + summary['E-mail Address'] = params.email + summary['E-mail on failure'] = params.email_on_fail + summary['MultiQC maxsize'] = params.max_multiqc_email_size +} +log.info summary.collect { k,v -> "${k.padRight(21)}: $v" }.join("\n") +log.info "-\033[2m--------------------------------------------------\033[0m-" + +// Check the hostnames against configured profiles +checkHostname() + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +/* -- -- */ +/* -- UNZIP/UNTAR REFERENCE FILES -- */ +/* -- -- */ +/////////////////////////////////////////////////////////////////////////////// 
+/////////////////////////////////////////////////////////////////////////////// + +/* + * PREPROCESSING: Uncompress genome fasta file + */ +if (params.fasta.endsWith('.gz')) { + process GUNZIP_FASTA { + label 'error_retry' + if (params.save_reference) { + publishDir "${params.outdir}/genome", mode: params.publish_dir_mode + } + + input: + path fasta from params.fasta + + output: + path "$unzip" into ch_fasta + + script: + unzip = fasta.toString() - '.gz' + """ + pigz -f -d -p $task.cpus $fasta + """ + } +} else { + ch_fasta = file(params.fasta) +} + +// Print warning if viral genome fasta has more than one sequence +def count = 0 +ch_fasta.withReader { reader -> + while (line = reader.readLine()) { + if (line.contains('>')) { + count++ + if (count > 1) { + log.info "[nf-core/viralrecon] WARNING: This pipeline does not support multi-fasta genome files. Please amend the '--fasta' parameter." + break + } + } + } +} + +/* + * PREPROCESSING: Uncompress gff annotation file + */ +if (params.gff) { + file(params.gff, checkIfExists: true) + if (params.gff.endsWith('.gz')) { + process GUNZIP_GFF { + label 'error_retry' + if (params.save_reference) { + publishDir "${params.outdir}/genome", mode: params.publish_dir_mode + } + + input: + path gff from params.gff + + output: + path "$unzip" into ch_gff + + script: + unzip = gff.toString() - '.gz' + """ + pigz -f -d -p $task.cpus $gff + """ + } + } else { + ch_gff = file(params.gff) + } +} else { + //See: https://nextflow-io.github.io/patterns/index.html#_optional_input + ch_gff = file('NO_FILE') +} + +/* + * PREPROCESSING: Uncompress Kraken2 database + */ +if (!params.skip_kraken2 && params.kraken2_db) { + file(params.kraken2_db, checkIfExists: true) + if (params.kraken2_db.endsWith('.tar.gz')) { + process UNTAR_KRAKEN2_DB { + label 'error_retry' + if (params.save_reference) { + publishDir "${params.outdir}/genome", mode: params.publish_dir_mode + } + + input: + path db from params.kraken2_db + + output: + path "$untar" into 
ch_kraken2_db + + script: + untar = db.toString() - '.tar.gz' + """ + tar -xvf $db + """ + } + } else { + ch_kraken2_db = file(params.kraken2_db) + } +} + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +/* -- -- */ +/* -- PARSE DESIGN FILE -- */ +/* -- -- */ +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +/* + * PREPROCESSING: Reformat samplesheet and check validity + */ +process CHECK_SAMPLESHEET { + tag "$samplesheet" + publishDir "${params.outdir}/", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.endsWith(".tsv")) "preprocess/sra/$filename" + else "pipeline_info/$filename" + } + + input: + path samplesheet from ch_input + + output: + path "samplesheet.valid.csv" into ch_samplesheet_reformat + path "sra_run_info.tsv" optional true + + script: // These scripts are bundled with the pipeline, in nf-core/viralrecon/bin/ + run_sra = !params.skip_sra && !isOffline() + """ + awk -F, '{if(\$1 != "" && \$2 != "") {print \$0}}' $samplesheet > nonsra_id.csv + check_samplesheet.py nonsra_id.csv nonsra.samplesheet.csv + + awk -F, '{if(\$1 != "" && \$2 == "" && \$3 == "") {print \$1}}' $samplesheet > sra_id.list + if $run_sra && [ -s sra_id.list ] + then + fetch_sra_runinfo.py sra_id.list sra_run_info.tsv --platform ILLUMINA --library_layout SINGLE,PAIRED + sra_runinfo_to_samplesheet.py sra_run_info.tsv sra.samplesheet.csv + fi + + if [ -f nonsra.samplesheet.csv ] + then + head -n 1 nonsra.samplesheet.csv > samplesheet.valid.csv + else + head -n 1 sra.samplesheet.csv > samplesheet.valid.csv + fi + tail -n +2 -q *sra.samplesheet.csv >> samplesheet.valid.csv + """ +} + +// Function to get list of [ sample, single_end?, is_sra?, is_ftp?, [ fastq_1, fastq_2 ], [ md5_1, md5_2] ] +def validate_input(LinkedHashMap sample) { + def 
sample_id = sample.sample_id + def single_end = sample.single_end.toBoolean() + def is_sra = sample.is_sra.toBoolean() + def is_ftp = sample.is_ftp.toBoolean() + def fastq_1 = sample.fastq_1 + def fastq_2 = sample.fastq_2 + def md5_1 = sample.md5_1 + def md5_2 = sample.md5_2 + + def array = [] + if (!is_sra) { + if (single_end) { + array = [ sample_id, single_end, is_sra, is_ftp, [ file(fastq_1, checkIfExists: true) ] ] + } else { + array = [ sample_id, single_end, is_sra, is_ftp, [ file(fastq_1, checkIfExists: true), file(fastq_2, checkIfExists: true) ] ] + } + } else { + array = [ sample_id, single_end, is_sra, is_ftp, [ fastq_1, fastq_2 ], [ md5_1, md5_2 ] ] + } + + return array +} + +/* + * Create channels for input fastq files + */ +ch_samplesheet_reformat + .splitCsv(header:true, sep:',') + .map { validate_input(it) } + .into { ch_reads_all + ch_reads_sra } + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +/* -- -- */ +/* -- DOWNLOAD SRA FILES -- */ +/* -- -- */ +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +/* + * STEP 1: Download and check SRA data + */ +if (!params.skip_sra || !isOffline()) { + ch_reads_sra + .filter { it[2] } + .into { ch_reads_sra_ftp + ch_reads_sra_dump } + + process SRA_FASTQ_FTP { + tag "$sample" + label 'process_medium' + label 'error_retry' + publishDir "${params.outdir}/preprocess/sra", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.endsWith(".md5")) "md5/$filename" + else params.save_sra_fastq ? 
filename : null + } + + when: + is_ftp + + input: + tuple val(sample), val(single_end), val(is_sra), val(is_ftp), val(fastq), val(md5) from ch_reads_sra_ftp + + output: + tuple val(sample), val(single_end), val(is_sra), val(is_ftp), path("*.fastq.gz") into ch_sra_fastq_ftp + path "*.md5" + + script: + if (single_end) { + """ + curl -L ${fastq[0]} -o ${sample}.fastq.gz + echo "${md5[0]} ${sample}.fastq.gz" > ${sample}.fastq.gz.md5 + md5sum -c ${sample}.fastq.gz.md5 + """ + } else { + """ + curl -L ${fastq[0]} -o ${sample}_1.fastq.gz + echo "${md5[0]} ${sample}_1.fastq.gz" > ${sample}_1.fastq.gz.md5 + md5sum -c ${sample}_1.fastq.gz.md5 + + curl -L ${fastq[1]} -o ${sample}_2.fastq.gz + echo "${md5[1]} ${sample}_2.fastq.gz" > ${sample}_2.fastq.gz.md5 + md5sum -c ${sample}_2.fastq.gz.md5 + """ + } + } + + process SRA_FASTQ_DUMP { + tag "$sample" + label 'process_medium' + label 'error_retry' + publishDir "${params.outdir}/preprocess/sra", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.endsWith(".log")) "log/$filename" + else params.save_sra_fastq ? filename : null + } + + when: + !is_ftp + + input: + tuple val(sample), val(single_end), val(is_sra), val(is_ftp) from ch_reads_sra_dump.map { it[0..3] } + + output: + tuple val(sample), val(single_end), val(is_sra), val(is_ftp), path("*.fastq.gz") into ch_sra_fastq_dump + path "*.log" + + script: + prefix = "${sample.split('_')[0..-2].join('_')}" + pe = single_end ? "" : "--readids --split-e" + rm_orphan = single_end ? 
"" : "[ -f ${prefix}.fastq.gz ] && rm ${prefix}.fastq.gz" + """ + parallel-fastq-dump \\ + --sra-id $prefix \\ + --threads $task.cpus \\ + --outdir ./ \\ + --tmpdir ./ \\ + --gzip \\ + $pe \\ + > ${prefix}.fastq_dump.log + + $rm_orphan + """ + } + + ch_reads_all + .filter { !it[2] } + .concat(ch_sra_fastq_ftp, ch_sra_fastq_dump) + .set { ch_reads_all } +} + +ch_reads_all + .map { [ it[0].split('_')[0..-2].join('_'), it[1], it[4] ] } + .groupTuple(by: [0, 1]) + .map { [ it[0], it[1], it[2].flatten() ] } + .set { ch_reads_all } + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +/* -- -- */ +/* -- MERGE RESEQUENCED FASTQ -- */ +/* -- -- */ +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +/* + * STEP 2: Merge FastQ files with the same sample identifier + */ +process CAT_FASTQ { + tag "$sample" + + input: + tuple val(sample), val(single_end), path(reads) from ch_reads_all + + output: + tuple val(sample), val(single_end), path("*.merged.fastq.gz") into ch_cat_fastqc, + ch_cat_fastp + + script: + readList = reads.collect{it.toString()} + if (!single_end) { + if (readList.size > 2) { + def read1 = [] + def read2 = [] + readList.eachWithIndex{ v, ix -> ( ix & 1 ? 
read2 : read1 ) << v } + """ + cat ${read1.sort().join(' ')} > ${sample}_1.merged.fastq.gz + cat ${read2.sort().join(' ')} > ${sample}_2.merged.fastq.gz + """ + } else { + """ + ln -s ${reads[0]} ${sample}_1.merged.fastq.gz + ln -s ${reads[1]} ${sample}_2.merged.fastq.gz + """ + } + } else { + if (readList.size > 1) { + """ + cat ${readList.sort().join(' ')} > ${sample}.merged.fastq.gz + """ + } else { + """ + ln -s $reads ${sample}.merged.fastq.gz + """ + } + } +} + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +/* -- -- */ +/* -- FASTQ QC -- */ +/* -- -- */ +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +/* + * STEP 3: FastQC on input reads after merging libraries from the same sample + */ +process FASTQC { + tag "$sample" + label 'process_medium' + publishDir "${params.outdir}/preprocess/fastqc", mode: params.publish_dir_mode, + saveAs: { filename -> + filename.endsWith(".zip") ? 
"zips/$filename" : filename + } + + when: + !params.skip_fastqc && !params.skip_qc + + input: + tuple val(sample), val(single_end), path(reads) from ch_cat_fastqc + + output: + path "*.{zip,html}" into ch_fastqc_raw_reports_mqc + + script: + """ + fastqc --quiet --threads $task.cpus *.fastq.gz + """ +} + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +/* -- -- */ +/* -- ADAPTER TRIMMING -- */ +/* -- -- */ +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +/* + * STEP 4: Fastp adapter trimming and quality filtering + */ +if (!params.skip_adapter_trimming) { + process FASTP { + tag "$sample" + label 'process_medium' + publishDir "${params.outdir}/preprocess/fastp", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.endsWith(".json")) filename + else if (filename.endsWith(".fastp.html")) filename + else if (filename.endsWith("_fastqc.html")) "fastqc/$filename" + else if (filename.endsWith(".zip")) "fastqc/zips/$filename" + else if (filename.endsWith(".log")) "log/$filename" + else params.save_trimmed ? filename : null + } + + when: + !params.skip_variants || !params.skip_assembly + + input: + tuple val(sample), val(single_end), path(reads) from ch_cat_fastp + + output: + tuple val(sample), val(single_end), path("*.trim.fastq.gz") into ch_fastp_bowtie2, + ch_fastp_cutadapt, + ch_fastp_kraken2 + path "*.{log,fastp.html,json}" into ch_fastp_mqc + path "*_fastqc.{zip,html}" into ch_fastp_fastqc_mqc + path "*.fail.fastq.gz" + + script: + // Added soft-links to original fastqs for consistent naming in MultiQC + autodetect = single_end ? "" : "--detect_adapter_for_pe" + """ + IN_READS='--in1 ${sample}.fastq.gz' + OUT_READS='--out1 ${sample}.trim.fastq.gz --failed_out ${sample}.fail.fastq.gz' + if $single_end; then + [ ! 
-f ${sample}.fastq.gz ] && ln -s $reads ${sample}.fastq.gz + else + [ ! -f ${sample}_1.fastq.gz ] && ln -s ${reads[0]} ${sample}_1.fastq.gz + [ ! -f ${sample}_2.fastq.gz ] && ln -s ${reads[1]} ${sample}_2.fastq.gz + IN_READS='--in1 ${sample}_1.fastq.gz --in2 ${sample}_2.fastq.gz' + OUT_READS='--out1 ${sample}_1.trim.fastq.gz --out2 ${sample}_2.trim.fastq.gz --unpaired1 ${sample}_1.fail.fastq.gz --unpaired2 ${sample}_2.fail.fastq.gz' + fi + + fastp \\ + \$IN_READS \\ + \$OUT_READS \\ + $autodetect \\ + --cut_front \\ + --cut_tail \\ + --cut_mean_quality $params.cut_mean_quality \\ + --qualified_quality_phred $params.qualified_quality_phred \\ + --unqualified_percent_limit $params.unqualified_percent_limit \\ + --length_required $params.min_trim_length \\ + --trim_poly_x \\ + --thread $task.cpus \\ + --json ${sample}.fastp.json \\ + --html ${sample}.fastp.html \\ + 2> ${sample}.fastp.log + + fastqc --quiet --threads $task.cpus *.trim.fastq.gz + """ + } +} else { + ch_cat_fastp + .into { ch_fastp_bowtie2 + ch_fastp_cutadapt + ch_fastp_kraken2 } + ch_fastp_mqc = Channel.empty() + ch_fastp_fastqc_mqc = Channel.empty() +} + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +/* -- -- */ +/* -- VARIANT CALLING PROCESSES -- */ +/* -- -- */ +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +/* + * PREPROCESSING: Build Bowtie2 index for viral genome + */ +process BOWTIE2_INDEX { + tag "$fasta" + label 'process_medium' + if (params.save_reference) { + publishDir "${params.outdir}/genome", mode: params.publish_dir_mode + } + + when: + !params.skip_variants + + input: + path fasta from ch_fasta + + output: + path "Bowtie2Index" into ch_index + + script: + """ + bowtie2-build \\ + --seed 1 \\ + --threads $task.cpus \\ + $fasta \\ + $index_base + mkdir 
Bowtie2Index && mv ${index_base}* Bowtie2Index + """ +} + +/* + * PREPROCESSING: Build SnpEff database for viral genome + */ +process MAKE_SNPEFF_DB { + tag "${index_base}.fa" + label 'process_low' + if (params.save_reference) { + publishDir "${params.outdir}/genome", mode: params.publish_dir_mode + } + + when: + (!params.skip_variants || !params.skip_assembly) && params.gff && !params.skip_snpeff + + input: + path ("SnpEffDB/genomes/${index_base}.fa") from ch_fasta + path ("SnpEffDB/${index_base}/genes.gff") from ch_gff + + output: + tuple path("SnpEffDB"), path("*.config") into ch_snpeff_db_varscan2, + ch_snpeff_db_ivar, + ch_snpeff_db_bcftools, + ch_snpeff_db_spades, + ch_snpeff_db_metaspades, + ch_snpeff_db_unicycler, + ch_snpeff_db_minia + + script: + """ + echo "${index_base}.genome : ${index_base}" > snpeff.config + snpEff build -config snpeff.config -dataDir ./SnpEffDB -gff3 -v ${index_base} + """ +} + +/* + * STEP 5.1: Map read(s) with Bowtie 2 + */ +process BOWTIE2 { + tag "$sample" + label 'process_medium' + publishDir "${params.outdir}/variants/bam", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.endsWith(".log")) "log/$filename" + else params.save_align_intermeds ? filename : null + } + + when: + !params.skip_variants + + input: + tuple val(sample), val(single_end), path(reads) from ch_fastp_bowtie2 + path index from ch_index + + output: + tuple val(sample), val(single_end), path("*.bam") into ch_bowtie2_bam + path "*.log" into ch_bowtie2_mqc + + script: + input_reads = single_end ? "-U $reads" : "-1 ${reads[0]} -2 ${reads[1]}" + filter = params.filter_unmapped ? 
"-F4" : "" + """ + bowtie2 \\ + --threads $task.cpus \\ + --local \\ + --very-sensitive-local \\ + -x ${index}/${index_base} \\ + $input_reads \\ + 2> ${sample}.bowtie2.log \\ + | samtools view -@ $task.cpus -b -h -O BAM -o ${sample}.bam $filter - + """ +} + +/* + * STEP 5.2: Convert BAM to coordinate sorted BAM + */ +process SORT_BAM { + tag "$sample" + label 'process_medium' + publishDir "${params.outdir}/variants/bam", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.endsWith(".flagstat")) "samtools_stats/$filename" + else if (filename.endsWith(".idxstats")) "samtools_stats/$filename" + else if (filename.endsWith(".stats")) "samtools_stats/$filename" + else (params.protocol != 'amplicon' && params.skip_markduplicates) || params.save_align_intermeds ? filename : null + } + + when: + !params.skip_variants + + input: + tuple val(sample), val(single_end), path(bam) from ch_bowtie2_bam + + output: + tuple val(sample), val(single_end), path("*.sorted.{bam,bam.bai}") into ch_sort_bam + path "*.{flagstat,idxstats,stats}" into ch_sort_bam_flagstat_mqc + + script: + """ + samtools sort -@ $task.cpus -o ${sample}.sorted.bam -T $sample $bam + samtools index ${sample}.sorted.bam + samtools flagstat ${sample}.sorted.bam > ${sample}.sorted.bam.flagstat + samtools idxstats ${sample}.sorted.bam > ${sample}.sorted.bam.idxstats + samtools stats ${sample}.sorted.bam > ${sample}.sorted.bam.stats + """ +} + +/* + * STEP 5.3: Trim amplicon sequences with iVar + */ +if (params.protocol != 'amplicon') { + ch_sort_bam + .set { ch_ivar_trim_bam } + ch_ivar_trim_flagstat_mqc = Channel.empty() + ch_ivar_trim_log_mqc = Channel.empty() +} else { + process IVAR_TRIM { + tag "$sample" + label 'process_medium' + publishDir "${params.outdir}/variants/bam", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.endsWith(".flagstat")) "samtools_stats/$filename" + else if (filename.endsWith(".idxstats")) "samtools_stats/$filename" + else if 
(filename.endsWith(".stats")) "samtools_stats/$filename" + else if (filename.endsWith(".log")) "log/$filename" + else params.skip_markduplicates || params.save_align_intermeds ? filename : null + } + + when: + !params.skip_variants + + input: + tuple val(sample), val(single_end), path(bam) from ch_sort_bam + path bed from ch_amplicon_bed + + output: + tuple val(sample), val(single_end), path("*.sorted.{bam,bam.bai}") into ch_ivar_trim_bam + path "*.{flagstat,idxstats,stats}" into ch_ivar_trim_flagstat_mqc + path "*.log" into ch_ivar_trim_log_mqc + + script: + exclude_reads = params.ivar_exclude_reads ? "" : "-e" + prefix = "${sample}.trim" + """ + samtools view -b -F 4 ${bam[0]} > ${sample}.mapped.bam + samtools index ${sample}.mapped.bam + + ivar trim \\ + -i ${sample}.mapped.bam \\ + $exclude_reads \\ + -b $bed \\ + -p $prefix > ${prefix}.ivar.log + + samtools sort -@ $task.cpus -o ${prefix}.sorted.bam -T $prefix ${prefix}.bam + samtools index ${prefix}.sorted.bam + samtools flagstat ${prefix}.sorted.bam > ${prefix}.sorted.bam.flagstat + samtools idxstats ${prefix}.sorted.bam > ${prefix}.sorted.bam.idxstats + samtools stats ${prefix}.sorted.bam > ${prefix}.sorted.bam.stats + """ + } +} + +/* + * STEP 5.4: Picard MarkDuplicates + */ +if (params.skip_markduplicates) { + ch_ivar_trim_bam + .into { ch_markdup_bam_metrics + ch_markdup_bam_mpileup + ch_markdup_bam_varscan2_consensus + ch_markdup_bam_bcftools + ch_markdup_bam_bcftools_consensus } + ch_markdup_bam_flagstat_mqc = Channel.empty() + ch_markdup_bam_metrics_mqc = Channel.empty() +} else { + process PICARD_MARKDUPLICATES { + tag "$sample" + label 'process_medium' + publishDir "${params.outdir}/variants/bam", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.endsWith(".flagstat")) "samtools_stats/$filename" + else if (filename.endsWith(".idxstats")) "samtools_stats/$filename" + else if (filename.endsWith(".stats")) "samtools_stats/$filename" + else if (filename.endsWith(".metrics.txt")) 
"picard_metrics/$filename" + else filename + } + + when: + !params.skip_variants + + input: + tuple val(sample), val(single_end), path(bam) from ch_ivar_trim_bam + path fasta from ch_fasta + + output: + tuple val(sample), val(single_end), path("*.sorted.{bam,bam.bai}") into ch_markdup_bam_metrics, + ch_markdup_bam_mpileup, + ch_markdup_bam_varscan2_consensus, + ch_markdup_bam_bcftools, + ch_markdup_bam_bcftools_consensus + path "*.{flagstat,idxstats,stats}" into ch_markdup_bam_flagstat_mqc + path "*.txt" into ch_markdup_bam_metrics_mqc + + script: + def avail_mem = 3 + if (!task.memory) { + log.info "[Picard MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this." + } else { + avail_mem = task.memory.toGiga() + } + prefix = params.protocol == 'amplicon' ? "${sample}.trim.mkD" : "${sample}.mkD" + keep_dup = params.filter_dups ? "true" : "false" + """ + picard -Xmx${avail_mem}g MarkDuplicates \\ + INPUT=${bam[0]} \\ + OUTPUT=${prefix}.sorted.bam \\ + ASSUME_SORTED=true \\ + REMOVE_DUPLICATES=$keep_dup \\ + METRICS_FILE=${prefix}.MarkDuplicates.metrics.txt \\ + VALIDATION_STRINGENCY=LENIENT \\ + TMP_DIR=tmp + samtools index ${prefix}.sorted.bam + samtools idxstats ${prefix}.sorted.bam > ${prefix}.sorted.bam.idxstats + samtools flagstat ${prefix}.sorted.bam > ${prefix}.sorted.bam.flagstat + samtools stats ${prefix}.sorted.bam > ${prefix}.sorted.bam.stats + """ + } +} + +/* + * STEP 5.5: Picard CollectMultipleMetrics and CollectWgsMetrics + */ +process PICARD_METRICS { + tag "$sample" + label 'process_medium' + publishDir "${params.outdir}/variants/bam/picard_metrics", mode: params.publish_dir_mode + + when: + !params.skip_variants && !params.skip_picard_metrics && !params.skip_qc + + input: + tuple val(sample), val(single_end), path(bam) from ch_markdup_bam_metrics + path fasta from ch_fasta + + output: + path "*metrics" into ch_picard_metrics_mqc + path "*.pdf" + + script: + def avail_mem = 3 + if 
(!task.memory) { + log.info "[Picard CollectMultipleMetrics] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this." + } else { + avail_mem = task.memory.toGiga() + } + suffix = params.skip_markduplicates ? "" : ".mkD" + prefix = params.protocol == 'amplicon' ? "${sample}.trim${suffix}" : "${sample}${suffix}" + """ + picard -Xmx${avail_mem}g CollectMultipleMetrics \\ + INPUT=${bam[0]} \\ + OUTPUT=${prefix}.CollectMultipleMetrics \\ + REFERENCE_SEQUENCE=$fasta \\ + VALIDATION_STRINGENCY=LENIENT \\ + TMP_DIR=tmp + + picard -Xmx${avail_mem}g CollectWgsMetrics \\ + COVERAGE_CAP=1000000 \\ + INPUT=${bam[0]} \\ + OUTPUT=${prefix}.CollectWgsMetrics.coverage_metrics \\ + REFERENCE_SEQUENCE=$fasta \\ + VALIDATION_STRINGENCY=LENIENT \\ + TMP_DIR=tmp + """ +} + +//////////////////////////////////////////////////// +/* -- VARSCAN2 -- */ +//////////////////////////////////////////////////// + +/* + * STEP 5.6: Create mpileup file for all variant callers + */ +process SAMTOOLS_MPILEUP { + tag "$sample" + label 'process_medium' + if (params.save_mpileup) { + publishDir "${params.outdir}/variants/bam/mpileup", mode: params.publish_dir_mode + } + + when: + !params.skip_variants + + input: + tuple val(sample), val(single_end), path(bam) from ch_markdup_bam_mpileup + path fasta from ch_fasta + + output: + tuple val(sample), val(single_end), path("*.mpileup") into ch_mpileup_varscan2, + ch_mpileup_ivar_variants, + ch_mpileup_ivar_consensus, + ch_mpileup_ivar_bcftools + + script: + suffix = params.skip_markduplicates ? "" : ".mkD" + prefix = params.protocol == 'amplicon' ? 
"${sample}.trim${suffix}" : "${sample}${suffix}" + """ + samtools mpileup \\ + --count-orphans \\ + --no-BAQ \\ + --max-depth 50000 \\ + --fasta-ref $fasta \\ + --min-BQ $params.min_base_qual \\ + --output ${prefix}.mpileup \\ + ${bam[0]} + """ +} + +/* + * STEP 5.6.1: Variant calling with VarScan 2 + */ +process VARSCAN2 { + tag "$sample" + label 'process_medium' + publishDir "${params.outdir}/variants/varscan2", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.endsWith(".log")) "log/$filename" + else if (filename.endsWith(".txt")) "bcftools_stats/$filename" + else filename + } + + when: + !params.skip_variants && 'varscan2' in callers + + input: + tuple val(sample), val(single_end), path(mpileup) from ch_mpileup_varscan2 + path fasta from ch_fasta + + output: + tuple val(sample), val(single_end), path("${prefix}.vcf.gz*") into ch_varscan2_highfreq_consensus, + ch_varscan2_highfreq_snpeff + tuple val(sample), val(single_end), path("${sample}.vcf.gz*") into ch_varscan2_lowfreq_snpeff + path "${prefix}.bcftools_stats.txt" into ch_varscan2_bcftools_highfreq_mqc + path "*.varscan2.log" into ch_varscan2_log_mqc + path "${sample}.bcftools_stats.txt" + + script: + prefix = "${sample}.AF${params.max_allele_freq}" + """ + echo "$sample" > sample_name.list + varscan mpileup2cns \\ + $mpileup \\ + --min-coverage $params.min_coverage \\ + --min-reads2 5 \\ + --min-avg-qual $params.min_base_qual \\ + --min-var-freq 0.03 \\ + --p-value 0.99 \\ + --output-vcf 1 \\ + --vcf-sample-list sample_name.list \\ + --variants \\ + 2> ${sample}.varscan2.log \\ + | bgzip -c > ${sample}.vcf.gz + tabix -p vcf -f ${sample}.vcf.gz + bcftools stats ${sample}.vcf.gz > ${sample}.bcftools_stats.txt + sed -i.bak '/LC_ALL/d' ${sample}.varscan2.log + + bcftools filter \\ + -i 'FORMAT/AD / (FORMAT/AD + FORMAT/RD) >= $params.max_allele_freq' \\ + --output-type z \\ + --output ${prefix}.vcf.gz \\ + ${sample}.vcf.gz + tabix -p vcf -f ${prefix}.vcf.gz + bcftools stats ${prefix}.vcf.gz 
> ${prefix}.bcftools_stats.txt + """ +} + +/* + * STEP 5.6.1.1: Genome consensus generation with BCFtools and masked with BEDTools + */ +process VARSCAN2_CONSENSUS { + tag "$sample" + label 'process_medium' + publishDir "${params.outdir}/variants/varscan2/consensus", mode: params.publish_dir_mode + + when: + !params.skip_variants && 'varscan2' in callers + + input: + tuple val(sample), val(single_end), path(bam), path(vcf) from ch_markdup_bam_varscan2_consensus.join(ch_varscan2_highfreq_consensus, by: [0,1]) + path fasta from ch_fasta + + output: + tuple val(sample), val(single_end), path("*consensus.masked.fa") into ch_varscan2_consensus + path "*consensus.fa" + + script: + prefix = "${sample}.AF${params.max_allele_freq}" + """ + cat $fasta | bcftools consensus ${vcf[0]} > ${prefix}.consensus.fa + + bedtools genomecov \\ + -bga \\ + -ibam ${bam[0]} \\ + -g $fasta \\ + | awk '\$4 < $params.min_coverage' | bedtools merge > ${prefix}.mask.bed + + bedtools maskfasta \\ + -fi ${prefix}.consensus.fa \\ + -bed ${prefix}.mask.bed \\ + -fo ${prefix}.consensus.masked.fa + header=\$(head -n 1 ${prefix}.consensus.masked.fa | sed 's/>//g') + sed -i "s/\${header}/${sample}/g" ${prefix}.consensus.masked.fa + """ +} + +/* + * STEP 5.6.1.2: VarScan 2 variant calling annotation with SnpEff and SnpSift + */ +process VARSCAN2_SNPEFF { + tag "$sample" + label 'process_medium' + publishDir "${params.outdir}/variants/varscan2/snpeff", mode: params.publish_dir_mode + + when: + !params.skip_variants && 'varscan2' in callers && params.gff && !params.skip_snpeff + + input: + tuple val(sample), val(single_end), path(highfreq_vcf), path(lowfreq_vcf) from ch_varscan2_highfreq_snpeff.join(ch_varscan2_lowfreq_snpeff, by: [0,1]) + tuple file(db), file(config) from ch_snpeff_db_varscan2 + + output: + path "${prefix}.snpEff.csv" into ch_varscan2_snpeff_highfreq_mqc + path "${sample}.snpEff.csv" + path "*.vcf.gz*" + path "*.{txt,html}" + + script: + prefix = "${sample}.AF${params.max_allele_freq}" + 
""" + snpEff ${index_base} \\ + -config $config \\ + -dataDir $db \\ + ${lowfreq_vcf[0]} \\ + -csvStats ${sample}.snpEff.csv \\ + | bgzip -c > ${sample}.snpEff.vcf.gz + tabix -p vcf -f ${sample}.snpEff.vcf.gz + mv snpEff_summary.html ${sample}.snpEff.summary.html + + SnpSift extractFields -s "," \\ + -e "." \\ + ${sample}.snpEff.vcf.gz \\ + CHROM POS REF ALT \\ + "ANN[*].GENE" "ANN[*].GENEID" \\ + "ANN[*].IMPACT" "ANN[*].EFFECT" \\ + "ANN[*].FEATURE" "ANN[*].FEATUREID" \\ + "ANN[*].BIOTYPE" "ANN[*].RANK" "ANN[*].HGVS_C" \\ + "ANN[*].HGVS_P" "ANN[*].CDNA_POS" "ANN[*].CDNA_LEN" \\ + "ANN[*].CDS_POS" "ANN[*].CDS_LEN" "ANN[*].AA_POS" \\ + "ANN[*].AA_LEN" "ANN[*].DISTANCE" "EFF[*].EFFECT" \\ + "EFF[*].FUNCLASS" "EFF[*].CODON" "EFF[*].AA" "EFF[*].AA_LEN" \\ + > ${sample}.snpSift.table.txt + + snpEff ${index_base} \\ + -config $config \\ + -dataDir $db \\ + ${highfreq_vcf[0]} \\ + -csvStats ${prefix}.snpEff.csv \\ + | bgzip -c > ${prefix}.snpEff.vcf.gz + tabix -p vcf -f ${prefix}.snpEff.vcf.gz + mv snpEff_summary.html ${prefix}.snpEff.summary.html + + SnpSift extractFields -s "," \\ + -e "." 
\\ + ${prefix}.snpEff.vcf.gz \\ + CHROM POS REF ALT \\ + "ANN[*].GENE" "ANN[*].GENEID" \\ + "ANN[*].IMPACT" "ANN[*].EFFECT" \\ + "ANN[*].FEATURE" "ANN[*].FEATUREID" \\ + "ANN[*].BIOTYPE" "ANN[*].RANK" "ANN[*].HGVS_C" \\ + "ANN[*].HGVS_P" "ANN[*].CDNA_POS" "ANN[*].CDNA_LEN" \\ + "ANN[*].CDS_POS" "ANN[*].CDS_LEN" "ANN[*].AA_POS" \\ + "ANN[*].AA_LEN" "ANN[*].DISTANCE" "EFF[*].EFFECT" \\ + "EFF[*].FUNCLASS" "EFF[*].CODON" "EFF[*].AA" "EFF[*].AA_LEN" \\ + > ${prefix}.snpSift.table.txt + """ +} + +/* + * STEP 5.6.1.3: VarScan 2 consensus sequence report with QUAST + */ +process VARSCAN2_QUAST { + label 'process_medium' + publishDir "${params.outdir}/variants/varscan2/quast", mode: params.publish_dir_mode + + when: + !params.skip_variants && 'varscan2' in callers && !params.skip_variants_quast + + input: + path consensus from ch_varscan2_consensus.collect{ it[2] } + path fasta from ch_fasta + path gff from ch_gff + + output: + path "AF${params.max_allele_freq}" into ch_varscan2_quast_mqc + + script: + features = params.gff ? 
"--features $gff" : "" + """ + quast.py \\ + --output-dir AF${params.max_allele_freq} \\ + -r $fasta \\ + $features \\ + --threads $task.cpus \\ + ${consensus.join(' ')} + """ +} + +//////////////////////////////////////////////////// +/* -- IVAR -- */ +//////////////////////////////////////////////////// + +/* + * STEP 5.6.2: Variant calling with iVar + */ +process IVAR_VARIANTS { + tag "$sample" + label 'process_medium' + publishDir "${params.outdir}/variants/ivar", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.endsWith(".bcftools_stats.txt")) "bcftools_stats/$filename" + else if (filename.endsWith(".log")) "log/$filename" + else if (filename.endsWith("_mqc.tsv")) null + else filename + } + + when: + !params.skip_variants && 'ivar' in callers + + input: + tuple val(sample), val(single_end), path(mpileup) from ch_mpileup_ivar_variants + path header from ch_ivar_variants_header_mqc + path fasta from ch_fasta + path gff from ch_gff + + output: + tuple val(sample), val(single_end), path("${prefix}.vcf.gz*") into ch_ivar_highfreq_snpeff + tuple val(sample), val(single_end), path("${sample}.vcf.gz*") into ch_ivar_lowfreq_snpeff + path "${prefix}.bcftools_stats.txt" into ch_ivar_bcftools_highfreq_mqc + path "${sample}.variant.counts_mqc.tsv" into ch_ivar_count_mqc + path "${sample}.bcftools_stats.txt" + path "${sample}.tsv" + path "*.log" + + script: + features = params.gff ? 
"-g $gff" : "" + prefix = "${sample}.AF${params.max_allele_freq}" + """ + cat $mpileup | ivar variants -q $params.min_base_qual -t 0.03 -m $params.min_coverage -r $fasta -p $sample $features + + ivar_variants_to_vcf.py ${sample}.tsv ${sample}.vcf > ${sample}.variant.counts.log + bgzip -c ${sample}.vcf > ${sample}.vcf.gz + tabix -p vcf -f ${sample}.vcf.gz + bcftools stats ${sample}.vcf.gz > ${sample}.bcftools_stats.txt + cat $header ${sample}.variant.counts.log > ${sample}.variant.counts_mqc.tsv + + ivar_variants_to_vcf.py ${sample}.tsv ${prefix}.vcf --pass_only --min_allele_freq $params.max_allele_freq > ${prefix}.variant.counts.log + bgzip -c ${prefix}.vcf > ${prefix}.vcf.gz + tabix -p vcf -f ${prefix}.vcf.gz + bcftools stats ${prefix}.vcf.gz > ${prefix}.bcftools_stats.txt + """ +} + +/* + * STEP 5.6.2.1: Generate consensus sequence with iVar + */ +process IVAR_CONSENSUS { + tag "$sample" + label 'process_medium' + publishDir "${params.outdir}/variants/ivar/consensus", mode: params.publish_dir_mode + + when: + !params.skip_variants && 'ivar' in callers + + input: + tuple val(sample), val(single_end), path(mpileup) from ch_mpileup_ivar_consensus + path fasta from ch_fasta + + output: + tuple val(sample), val(single_end), path("*.fa") into ch_ivar_consensus + path "*.txt" + + script: + prefix = "${sample}.AF${params.max_allele_freq}" + """ + cat $mpileup | ivar consensus -q $params.min_base_qual -t $params.max_allele_freq -m $params.min_coverage -n N -p ${prefix}.consensus + header=\$(head -n1 ${prefix}.consensus.fa | sed 's/>//g') + sed -i "s/\${header}/${sample}/g" ${prefix}.consensus.fa + """ +} + +/* + * STEP 5.6.2.2: iVar variant calling annotation with SnpEff and SnpSift + */ +process IVAR_SNPEFF { + tag "$sample" + label 'process_medium' + publishDir "${params.outdir}/variants/ivar/snpeff", mode: params.publish_dir_mode + + when: + !params.skip_variants && 'ivar' in callers && params.gff && !params.skip_snpeff + + input: + tuple val(sample), val(single_end), 
path(highfreq_vcf), path(lowfreq_vcf) from ch_ivar_highfreq_snpeff.join(ch_ivar_lowfreq_snpeff, by: [0,1]) + tuple file(db), file(config) from ch_snpeff_db_ivar + + output: + path "${prefix}.snpEff.csv" into ch_ivar_snpeff_highfreq_mqc + path "${sample}.snpEff.csv" + path "*.vcf.gz*" + path "*.{txt,html}" + + script: + prefix = "${sample}.AF${params.max_allele_freq}" + """ + snpEff ${index_base} \\ + -config $config \\ + -dataDir $db \\ + ${lowfreq_vcf[0]} \\ + -csvStats ${sample}.snpEff.csv \\ + | bgzip -c > ${sample}.snpEff.vcf.gz + tabix -p vcf -f ${sample}.snpEff.vcf.gz + mv snpEff_summary.html ${sample}.snpEff.summary.html + + SnpSift extractFields -s "," \\ + -e "." \\ + ${sample}.snpEff.vcf.gz \\ + CHROM POS REF ALT \\ + "ANN[*].GENE" "ANN[*].GENEID" \\ + "ANN[*].IMPACT" "ANN[*].EFFECT" \\ + "ANN[*].FEATURE" "ANN[*].FEATUREID" \\ + "ANN[*].BIOTYPE" "ANN[*].RANK" "ANN[*].HGVS_C" \\ + "ANN[*].HGVS_P" "ANN[*].CDNA_POS" "ANN[*].CDNA_LEN" \\ + "ANN[*].CDS_POS" "ANN[*].CDS_LEN" "ANN[*].AA_POS" \\ + "ANN[*].AA_LEN" "ANN[*].DISTANCE" "EFF[*].EFFECT" \\ + "EFF[*].FUNCLASS" "EFF[*].CODON" "EFF[*].AA" "EFF[*].AA_LEN" \\ + > ${sample}.snpSift.table.txt + + snpEff ${index_base} \\ + -config $config \\ + -dataDir $db \\ + ${highfreq_vcf[0]} \\ + -csvStats ${prefix}.snpEff.csv \\ + | bgzip -c > ${prefix}.snpEff.vcf.gz + tabix -p vcf -f ${prefix}.snpEff.vcf.gz + mv snpEff_summary.html ${prefix}.snpEff.summary.html + + SnpSift extractFields -s "," \\ + -e "." 
\\ + ${prefix}.snpEff.vcf.gz \\ + CHROM POS REF ALT \\ + "ANN[*].GENE" "ANN[*].GENEID" \\ + "ANN[*].IMPACT" "ANN[*].EFFECT" \\ + "ANN[*].FEATURE" "ANN[*].FEATUREID" \\ + "ANN[*].BIOTYPE" "ANN[*].RANK" "ANN[*].HGVS_C" \\ + "ANN[*].HGVS_P" "ANN[*].CDNA_POS" "ANN[*].CDNA_LEN" \\ + "ANN[*].CDS_POS" "ANN[*].CDS_LEN" "ANN[*].AA_POS" \\ + "ANN[*].AA_LEN" "ANN[*].DISTANCE" "EFF[*].EFFECT" \\ + "EFF[*].FUNCLASS" "EFF[*].CODON" "EFF[*].AA" "EFF[*].AA_LEN" \\ + > ${prefix}.snpSift.table.txt + """ +} + +/* + * STEP 5.6.2.3: iVar consensus sequence report with QUAST + */ +process IVAR_QUAST { + label 'process_medium' + publishDir "${params.outdir}/variants/ivar/quast", mode: params.publish_dir_mode + + when: + !params.skip_variants && 'ivar' in callers && !params.skip_variants_quast + + input: + path consensus from ch_ivar_consensus.collect{ it[2] } + path fasta from ch_fasta + path gff from ch_gff + + output: + path "AF${params.max_allele_freq}" into ch_ivar_quast_mqc + + script: + features = params.gff ? 
"--features $gff" : "" + """ + quast.py \\ + --output-dir AF${params.max_allele_freq} \\ + -r $fasta \\ + $features \\ + --threads $task.cpus \\ + ${consensus.join(' ')} + """ +} + +//////////////////////////////////////////////////// +/* -- BCFTOOLS -- */ +//////////////////////////////////////////////////// + +/* + * STEP 5.6.3: Variant calling with BCFTools + */ +process BCFTOOLS_VARIANTS { + tag "$sample" + label 'process_medium' + publishDir "${params.outdir}/variants/bcftools", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.endsWith(".txt")) "bcftools_stats/$filename" + else filename + } + + when: + !params.skip_variants && 'bcftools' in callers + + input: + tuple val(sample), val(single_end), path(bam) from ch_markdup_bam_bcftools + path fasta from ch_fasta + + output: + tuple val(sample), val(single_end), path("*.vcf.gz*") into ch_bcftools_variants_consensus, + ch_bcftools_variants_snpeff + path "*.bcftools_stats.txt" into ch_bcftools_variants_mqc + + script: + """ + echo "$sample" > sample_name.list + bcftools mpileup \\ + --count-orphans \\ + --no-BAQ \\ + --max-depth 50000 \\ + --fasta-ref $fasta \\ + --min-BQ $params.min_base_qual \\ + --annotate FORMAT/AD,FORMAT/ADF,FORMAT/ADR,FORMAT/DP,FORMAT/SP,INFO/AD,INFO/ADF,INFO/ADR \\ + ${bam[0]} \\ + | bcftools call --output-type v --ploidy 1 --keep-alts --keep-masked-ref --multiallelic-caller --variants-only \\ + | bcftools reheader --samples sample_name.list \\ + | bcftools view --output-file ${sample}.vcf.gz --output-type z --include 'INFO/DP>=$params.min_coverage' + tabix -p vcf -f ${sample}.vcf.gz + bcftools stats ${sample}.vcf.gz > ${sample}.bcftools_stats.txt + """ +} + +/* + * STEP 5.6.3.1: Genome consensus generation with BCFtools and masked with BEDTools + */ +process BCFTOOLS_CONSENSUS { + tag "$sample" + label 'process_medium' + publishDir "${params.outdir}/variants/bcftools/consensus", mode: params.publish_dir_mode + + when: + !params.skip_variants && 'bcftools' in callers + 
+ input: + tuple val(sample), val(single_end), path(bam), path(vcf) from ch_markdup_bam_bcftools_consensus.join(ch_bcftools_variants_consensus, by: [0,1]) + path fasta from ch_fasta + + output: + tuple val(sample), val(single_end), path("*consensus.masked.fa") into ch_bcftools_consensus_masked + path "*consensus.fa" + + script: + """ + cat $fasta | bcftools consensus ${vcf[0]} > ${sample}.consensus.fa + + bedtools genomecov \\ + -bga \\ + -ibam ${bam[0]} \\ + -g $fasta \\ + | awk '\$4 < $params.min_coverage' | bedtools merge > ${sample}.mask.bed + + bedtools maskfasta \\ + -fi ${sample}.consensus.fa \\ + -bed ${sample}.mask.bed \\ + -fo ${sample}.consensus.masked.fa + sed -i 's/${index_base}/${sample}/g' ${sample}.consensus.masked.fa + header=\$(head -n1 ${sample}.consensus.masked.fa | sed 's/>//g') + sed -i "s/\${header}/${sample}/g" ${sample}.consensus.masked.fa + """ +} + +/* + * STEP 5.6.3.2: BCFTools variant calling annotation with SnpEff and SnpSift + */ +process BCFTOOLS_SNPEFF { + tag "$sample" + label 'process_medium' + publishDir "${params.outdir}/variants/bcftools/snpeff", mode: params.publish_dir_mode + + when: + !params.skip_variants && 'bcftools' in callers && params.gff && !params.skip_snpeff + + input: + tuple val(sample), val(single_end), path(vcf) from ch_bcftools_variants_snpeff + tuple file(db), file(config) from ch_snpeff_db_bcftools + + output: + path "*.snpEff.csv" into ch_bcftools_snpeff_mqc + path "*.vcf.gz*" + path "*.{txt,html}" + + script: + """ + snpEff ${index_base} \\ + -config $config \\ + -dataDir $db \\ + ${vcf[0]} \\ + -csvStats ${sample}.snpEff.csv \\ + | bgzip -c > ${sample}.snpEff.vcf.gz + tabix -p vcf -f ${sample}.snpEff.vcf.gz + mv snpEff_summary.html ${sample}.snpEff.summary.html + + SnpSift extractFields -s "," \\ + -e "." 
\\ + ${sample}.snpEff.vcf.gz \\ + CHROM POS REF ALT \\ + "ANN[*].GENE" "ANN[*].GENEID" \\ + "ANN[*].IMPACT" "ANN[*].EFFECT" \\ + "ANN[*].FEATURE" "ANN[*].FEATUREID" \\ + "ANN[*].BIOTYPE" "ANN[*].RANK" "ANN[*].HGVS_C" \\ + "ANN[*].HGVS_P" "ANN[*].CDNA_POS" "ANN[*].CDNA_LEN" \\ + "ANN[*].CDS_POS" "ANN[*].CDS_LEN" "ANN[*].AA_POS" \\ + "ANN[*].AA_LEN" "ANN[*].DISTANCE" "EFF[*].EFFECT" \\ + "EFF[*].FUNCLASS" "EFF[*].CODON" "EFF[*].AA" "EFF[*].AA_LEN" \\ + > ${sample}.snpSift.table.txt + """ +} + +/* + * STEP 5.6.3.3: BCFTools consensus sequence report with QUAST + */ +process BCFTOOLS_QUAST { + label 'process_medium' + publishDir "${params.outdir}/variants/bcftools", mode: params.publish_dir_mode + + when: + !params.skip_variants && 'bcftools' in callers && !params.skip_variants_quast + + input: + path consensus from ch_bcftools_consensus_masked.collect{ it[2] } + path fasta from ch_fasta + path gff from ch_gff + + output: + path "quast" into ch_bcftools_quast_mqc + + script: + features = params.gff ? 
"--features $gff" : "" + """ + quast.py \\ + --output-dir quast \\ + -r $fasta \\ + $features \\ + --threads $task.cpus \\ + ${consensus.join(' ')} + """ +} + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +/* -- -- */ +/* -- DENOVO ASSEMBLY PROCESSES -- */ +/* -- -- */ +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +/* + * PREPROCESSING: Build Blast database for viral genome + */ +process MAKE_BLAST_DB { + tag "$fasta" + label 'process_medium' + if (params.save_reference) { + publishDir "${params.outdir}/genome", mode: params.publish_dir_mode + } + + when: + !params.skip_assembly && !params.skip_blast + + input: + path fasta from ch_fasta + + output: + path "BlastDB" into ch_blast_db + + script: + """ + makeblastdb \\ + -in $fasta \\ + -parse_seqids \\ + -dbtype nucl + mkdir BlastDB && mv ${fasta}* BlastDB + """ +} + +/* + * PREPROCESSING: Build Kraken2 database for host genome + */ +if (!isOffline()) { + if (!params.skip_kraken2 && !params.kraken2_db) { + if (!params.kraken2_db_name) { exit 1, "Please specify a valid name to build Kraken2 database for host e.g. 'human'!" } + + process KRAKEN2_BUILD { + tag "$db" + label 'process_high' + if (params.save_reference) { + publishDir "${params.outdir}/genome", mode: params.publish_dir_mode + } + + when: + !params.skip_assembly + + output: + path "$db" into ch_kraken2_db + + script: + db = "kraken2_${params.kraken2_db_name}" + ftp = params.kraken2_use_ftp ? 
"--use-ftp" : "" + """ + kraken2-build --db $db --threads $task.cpus $ftp --download-taxonomy + kraken2-build --db $db --threads $task.cpus $ftp --download-library $params.kraken2_db_name + kraken2-build --db $db --threads $task.cpus $ftp --build + """ + } + } +} else { + exit 1, "NXF_OFFLINE=true or -offline has been set so cannot download files required to build Kraken2 database!" +} + +/* + * STEP 6.1: Amplicon trimming with Cutadapt + */ +if (params.protocol == 'amplicon' && !params.skip_amplicon_trimming) { + process CUTADAPT { + tag "$sample" + label 'process_medium' + publishDir "${params.outdir}/assembly/cutadapt", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.endsWith(".html")) "fastqc/$filename" + else if (filename.endsWith(".zip")) "fastqc/zips/$filename" + else if (filename.endsWith(".log")) "log/$filename" + else params.save_trimmed ? filename : null + } + + when: + !params.skip_assembly + + input: + tuple val(sample), val(single_end), path(reads) from ch_fastp_cutadapt + path amplicons from ch_amplicon_fasta + + output: + tuple val(sample), val(single_end), path("*.ptrim.fastq.gz") into ch_cutadapt_kraken2 + path "*.{zip,html}" into ch_cutadapt_fastqc_mqc + path "*.log" into ch_cutadapt_mqc + + script: + adapters = single_end ? "-a file:primers.fasta" : "-a file:primers.fasta -A file:primers.fasta" + out_reads = single_end ? 
"-o ${sample}.ptrim.fastq.gz" : "-o ${sample}_1.ptrim.fastq.gz -p ${sample}_2.ptrim.fastq.gz" + """ + sed -r '/^[ACTGactg]+\$/ s/\$/X/g' $amplicons > primers.fasta + + cutadapt \\ + --cores $task.cpus \\ + --overlap 5 \\ + --minimum-length 30 \\ + --error-rate 0.1 \\ + $adapters \\ + $out_reads \\ + $reads \\ + > ${sample}.cutadapt.log + + fastqc --quiet --threads $task.cpus *.ptrim.fastq.gz + """ + } + ch_fastp_kraken2 = ch_cutadapt_kraken2 + +} else { + ch_cutadapt_mqc = Channel.empty() + ch_cutadapt_fastqc_mqc = Channel.empty() +} + +/* + * STEP 6.2: Filter reads with Kraken2 + */ +if (!params.skip_kraken2) { + process KRAKEN2 { + tag "$db" + label 'process_high' + publishDir "${params.outdir}/assembly/kraken2", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.endsWith(".txt")) filename + else params.save_kraken2_fastq ? filename : null + } + + when: + !params.skip_assembly + + input: + tuple val(sample), val(single_end), path(reads) from ch_fastp_kraken2 + path db from ch_kraken2_db + + output: + tuple val(sample), val(single_end), path("*.viral*") into ch_kraken2_spades, + ch_kraken2_metaspades, + ch_kraken2_unicycler, + ch_kraken2_minia + path "*.report.txt" into ch_kraken2_report_mqc + path "*.host*" + + + script: + pe = single_end ? "" : "--paired" + classified = single_end ? "${sample}.host.fastq" : "${sample}.host#.fastq" + unclassified = single_end ? 
"${sample}.viral.fastq" : "${sample}.viral#.fastq" + """ + kraken2 \\ + --db $db \\ + --threads $task.cpus \\ + --unclassified-out $unclassified \\ + --classified-out $classified \\ + --report ${sample}.kraken2.report.txt \\ + --report-zero-counts \\ + $pe \\ + --gzip-compressed \\ + $reads + pigz -p $task.cpus *.fastq + """ + } +} else { + ch_fastp_kraken2 + .into { ch_kraken2_spades + ch_kraken2_metaspades + ch_kraken2_unicycler + ch_kraken2_minia } + ch_kraken2_report_mqc = Channel.empty() +} + +//////////////////////////////////////////////////// +/* -- SPADES -- */ +//////////////////////////////////////////////////// + +/* + * STEP 6.3: De novo assembly with SPAdes + */ +process SPADES { + tag "$sample" + label 'process_high' + label 'error_ignore' + publishDir "${params.outdir}/assembly/spades", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.endsWith(".png")) "bandage/$filename" + else if (filename.endsWith(".svg")) "bandage/$filename" + else filename + } + + when: + !params.skip_assembly && 'spades' in assemblers + + input: + tuple val(sample), val(single_end), path(reads) from ch_kraken2_spades + + output: + tuple val(sample), val(single_end), path("*scaffolds.fa") into ch_spades_blast, + ch_spades_abacas, + ch_spades_plasmidid, + ch_spades_quast, + ch_spades_vg + path "*assembly.{gfa,png,svg}" + + + script: + input_reads = single_end ? 
"-s $reads" : "-1 ${reads[0]} -2 ${reads[1]}" + """ + spades.py \\ + --threads $task.cpus \\ + $input_reads \\ + -o ./ + mv scaffolds.fasta ${sample}.scaffolds.fa + mv assembly_graph_with_scaffolds.gfa ${sample}.assembly.gfa + + if [ -s ${sample}.assembly.gfa ] + then + Bandage image ${sample}.assembly.gfa ${sample}.assembly.png --height 1000 + Bandage image ${sample}.assembly.gfa ${sample}.assembly.svg --height 1000 + fi + """ +} + +/* + * STEP 6.3.1: Run Blast on SPAdes de novo assembly + */ +process SPADES_BLAST { + tag "$sample" + label 'process_medium' + label 'error_ignore' + publishDir "${params.outdir}/assembly/spades/blast", mode: params.publish_dir_mode + + when: + !params.skip_assembly && 'spades' in assemblers && !params.skip_blast + + input: + tuple val(sample), val(single_end), path(scaffold) from ch_spades_blast + path db from ch_blast_db + path header from ch_blast_outfmt6_header + + output: + path "*.blast*" + + script: + """ + blastn \\ + -num_threads $task.cpus \\ + -db $db/$fasta_base \\ + -query $scaffold \\ + -outfmt \'6 stitle std slen qlen qcovs\' \\ + -out ${sample}.blast.txt + + awk 'BEGIN{OFS=\"\\t\";FS=\"\\t\"}{print \$0,\$5/\$15,\$5/\$14}' ${sample}.blast.txt | awk 'BEGIN{OFS=\"\\t\";FS=\"\\t\"} \$15 > 200 && \$17 > 0.7 && \$1 !~ /phage/ {print \$0}' > ${sample}.blast.filt.txt + cat $header ${sample}.blast.filt.txt > ${sample}.blast.filt.header.txt + """ +} + +/* + * STEP 6.3.2: Run ABACAS on SPAdes de novo assembly + */ +process SPADES_ABACAS { + tag "$sample" + label 'process_medium' + label 'error_ignore' + publishDir "${params.outdir}/assembly/spades/abacas", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.indexOf("nucmer") > 0) "nucmer/$filename" + else filename + } + + when: + !params.skip_assembly && 'spades' in assemblers && !params.skip_abacas + + input: + tuple val(sample), val(single_end), path(scaffold) from ch_spades_abacas + path fasta from ch_fasta + + output: + path "*.abacas*" + + script: + """ + 
abacas.pl -r $fasta -q $scaffold -m -p nucmer -o ${sample}.abacas + mv nucmer.delta ${sample}.abacas.nucmer.delta + mv nucmer.filtered.delta ${sample}.abacas.nucmer.filtered.delta + mv nucmer.tiling ${sample}.abacas.nucmer.tiling + mv unused_contigs.out ${sample}.abacas.unused.contigs.out + """ +} + +/* + * STEP 6.3.3: Run PlasmidID on SPAdes de novo assembly + */ +process SPADES_PLASMIDID { + tag "$sample" + label 'process_medium' + label 'error_ignore' + publishDir "${params.outdir}/assembly/spades/plasmidid", mode: params.publish_dir_mode + + when: + !params.skip_assembly && 'spades' in assemblers && !params.skip_plasmidid + + input: + tuple val(sample), val(single_end), path(scaffold) from ch_spades_plasmidid.filter { it.size() > 0 } + path fasta from ch_fasta + + output: + path "$sample" + + script: + """ + plasmidID -d $fasta -s $sample -c $scaffold --only-reconstruct -C 47 -S 47 -i 60 --no-trim -o . + mv NO_GROUP/$sample ./$sample + """ +} + +/* + * STEP 6.3.4: Run Quast on SPAdes de novo assembly + */ +process SPADES_QUAST { + label 'process_medium' + label 'error_ignore' + publishDir "${params.outdir}/assembly/spades", mode: params.publish_dir_mode + + when: + !params.skip_assembly && 'spades' in assemblers && !params.skip_assembly_quast + + input: + path scaffolds from ch_spades_quast.collect{ it[2] } + path fasta from ch_fasta + path gff from ch_gff + + output: + path "quast" into ch_quast_spades_mqc + + script: + features = params.gff ? 
"--features $gff" : "" + """ + quast.py \\ + --output-dir quast \\ + -r $fasta \\ + $features \\ + --threads $task.cpus \\ + ${scaffolds.join(' ')} + """ +} + +/* + * STEP 6.3.5: Overlap scaffolds with Minimap2, induce and polish assembly, and call variants with seqwish and vg + */ +process SPADES_VG { + tag "$sample" + label 'process_medium' + label 'error_ignore' + publishDir "${params.outdir}/assembly/spades/variants", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.endsWith(".txt")) "bcftools_stats/$filename" + else if (filename.endsWith(".png")) "bandage/$filename" + else if (filename.endsWith(".svg")) "bandage/$filename" + else filename + } + + when: + !params.skip_assembly && 'spades' in assemblers && !params.skip_vg + + input: + tuple val(sample), val(single_end), path(scaffolds) from ch_spades_vg + path fasta from ch_fasta + + output: + tuple val(sample), val(single_end), path("${sample}.vcf.gz*") into ch_spades_vg_vcf + path "*.bcftools_stats.txt" into ch_spades_vg_bcftools_mqc + path "*.{gfa,png,svg}" + + script: + """ + minimap2 -c -t $task.cpus -x asm20 $fasta $scaffolds > ${sample}.paf + + cat $scaffolds $fasta > ${sample}.withRef.fasta + seqwish --paf-alns ${sample}.paf --seqs ${sample}.withRef.fasta --gfa ${sample}.gfa --threads $task.cpus + + vg view -Fv ${sample}.gfa --threads $task.cpus > ${sample}.vg + vg convert -x ${sample}.vg > ${sample}.xg + + samtools faidx $fasta + vg snarls ${sample}.xg > ${sample}.snarls + for chrom in `cat ${fasta}.fai | cut -f1` + do + vg deconstruct -p \$chrom ${sample}.xg -r ${sample}.snarls --threads $task.cpus \\ + | bcftools sort -O v -T ./ \\ + | bgzip -c > ${sample}.\$chrom.vcf.gz + done + bcftools concat --output-type z --output ${sample}.vcf.gz *.vcf.gz + tabix -p vcf -f ${sample}.vcf.gz + bcftools stats ${sample}.vcf.gz > ${sample}.bcftools_stats.txt + + if [ -s ${sample}.gfa ] + then + Bandage image ${sample}.gfa ${sample}.png --height 1000 + Bandage image ${sample}.gfa ${sample}.svg 
--height 1000 + fi + """ +} + +/* + * STEP 6.3.6: Variant annotation with SnpEff and SnpSift + */ +process SPADES_SNPEFF { + tag "$sample" + label 'process_medium' + label 'error_ignore' + publishDir "${params.outdir}/assembly/spades/variants/snpeff", mode: params.publish_dir_mode + + when: + !params.skip_assembly && 'spades' in assemblers && !params.skip_vg && params.gff && !params.skip_snpeff + + input: + tuple val(sample), val(single_end), path(vcf) from ch_spades_vg_vcf + tuple file(db), file(config) from ch_snpeff_db_spades + + output: + path "*.snpEff.csv" into ch_spades_snpeff_mqc + path "*.vcf.gz*" + path "*.{txt,html}" + + script: + """ + snpEff ${index_base} \\ + -config $config \\ + -dataDir $db \\ + ${vcf[0]} \\ + -csvStats ${sample}.snpEff.csv \\ + | bgzip -c > ${sample}.snpEff.vcf.gz + tabix -p vcf -f ${sample}.snpEff.vcf.gz + mv snpEff_summary.html ${sample}.snpEff.summary.html + + SnpSift extractFields -s "," \\ + -e "." \\ + ${sample}.snpEff.vcf.gz \\ + CHROM POS REF ALT \\ + "ANN[*].GENE" "ANN[*].GENEID" \\ + "ANN[*].IMPACT" "ANN[*].EFFECT" \\ + "ANN[*].FEATURE" "ANN[*].FEATUREID" \\ + "ANN[*].BIOTYPE" "ANN[*].RANK" "ANN[*].HGVS_C" \\ + "ANN[*].HGVS_P" "ANN[*].CDNA_POS" "ANN[*].CDNA_LEN" \\ + "ANN[*].CDS_POS" "ANN[*].CDS_LEN" "ANN[*].AA_POS" \\ + "ANN[*].AA_LEN" "ANN[*].DISTANCE" "EFF[*].EFFECT" \\ + "EFF[*].FUNCLASS" "EFF[*].CODON" "EFF[*].AA" "EFF[*].AA_LEN" \\ + > ${sample}.snpSift.table.txt + """ +} + +//////////////////////////////////////////////////// +/* -- METASPADES -- */ +//////////////////////////////////////////////////// + +/* + * STEP 6.3: De novo assembly with MetaSPAdes + */ +process METASPADES { + tag "$sample" + label 'process_high' + label 'error_ignore' + publishDir "${params.outdir}/assembly/metaspades", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.endsWith(".png")) "bandage/$filename" + else if (filename.endsWith(".svg")) "bandage/$filename" + else filename + } + + when: + !params.skip_assembly && 
'metaspades' in assemblers && !single_end + + input: + tuple val(sample), val(single_end), path(reads) from ch_kraken2_metaspades + + output: + tuple val(sample), val(single_end), path("*scaffolds.fa") into ch_metaspades_blast, + ch_metaspades_abacas, + ch_metaspades_plasmidid, + ch_metaspades_quast, + ch_metaspades_vg + path "*assembly.{gfa,png,svg}" + + + script: + """ + spades.py \\ + --meta \\ + --threads $task.cpus \\ + -1 ${reads[0]} \\ + -2 ${reads[1]} \\ + -o ./ + mv scaffolds.fasta ${sample}.scaffolds.fa + mv assembly_graph_with_scaffolds.gfa ${sample}.assembly.gfa + + if [ -s ${sample}.assembly.gfa ] + then + Bandage image ${sample}.assembly.gfa ${sample}.assembly.png --height 1000 + Bandage image ${sample}.assembly.gfa ${sample}.assembly.svg --height 1000 + fi + """ +} + +/* + * STEP 6.3.1: Run Blast on MetaSPAdes de novo assembly + */ +process METASPADES_BLAST { + tag "$sample" + label 'process_medium' + label 'error_ignore' + publishDir "${params.outdir}/assembly/metaspades/blast", mode: params.publish_dir_mode + + when: + !params.skip_assembly && 'metaspades' in assemblers && !single_end && !params.skip_blast + + input: + tuple val(sample), val(single_end), path(scaffold) from ch_metaspades_blast + path db from ch_blast_db + path header from ch_blast_outfmt6_header + + output: + path "*.blast*" + + script: + """ + blastn \\ + -num_threads $task.cpus \\ + -db $db/$fasta_base \\ + -query $scaffold \\ + -outfmt \'6 stitle std slen qlen qcovs\' \\ + -out ${sample}.blast.txt + + awk 'BEGIN{OFS=\"\\t\";FS=\"\\t\"}{print \$0,\$5/\$15,\$5/\$14}' ${sample}.blast.txt | awk 'BEGIN{OFS=\"\\t\";FS=\"\\t\"} \$15 > 200 && \$17 > 0.7 && \$1 !~ /phage/ {print \$0}' > ${sample}.blast.filt.txt + cat $header ${sample}.blast.filt.txt > ${sample}.blast.filt.header.txt + """ +} + +/* + * STEP 6.3.2: Run ABACAS on MetaSPAdes de novo assembly + */ +process METASPADES_ABACAS { + tag "$sample" + label 'process_medium' + label 'error_ignore' + publishDir 
"${params.outdir}/assembly/metaspades/abacas", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.indexOf("nucmer") > 0) "nucmer/$filename" + else filename + } + + when: + !params.skip_assembly && 'metaspades' in assemblers && !single_end && !params.skip_abacas + + input: + tuple val(sample), val(single_end), path(scaffold) from ch_metaspades_abacas + path fasta from ch_fasta + + output: + path "*.abacas*" + + script: + """ + abacas.pl -r $fasta -q $scaffold -m -p nucmer -o ${sample}.abacas + mv nucmer.delta ${sample}.abacas.nucmer.delta + mv nucmer.filtered.delta ${sample}.abacas.nucmer.filtered.delta + mv nucmer.tiling ${sample}.abacas.nucmer.tiling + mv unused_contigs.out ${sample}.abacas.unused.contigs.out + """ +} + +/* + * STEP 6.3.3: Run PlasmidID on MetaSPAdes de novo assembly + */ +process METASPADES_PLASMIDID { + tag "$sample" + label 'process_medium' + label 'error_ignore' + publishDir "${params.outdir}/assembly/metaspades/plasmidid", mode: params.publish_dir_mode + + when: + !params.skip_assembly && 'metaspades' in assemblers && !single_end && !params.skip_plasmidid + + input: + tuple val(sample), val(single_end), path(scaffold) from ch_metaspades_plasmidid.filter { it.size() > 0 } + path fasta from ch_fasta + + output: + path "$sample" + + script: + """ + plasmidID -d $fasta -s $sample -c $scaffold --only-reconstruct -C 47 -S 47 -i 60 --no-trim -o . + mv NO_GROUP/$sample ./$sample + """ +} + +/* + * STEP 6.3.4: Run Quast on MetaSPAdes de novo assembly + */ +process METASPADES_QUAST { + label 'process_medium' + label 'error_ignore' + publishDir "${params.outdir}/assembly/metaspades", mode: params.publish_dir_mode + + when: + !params.skip_assembly && 'metaspades' in assemblers && !single_end && !params.skip_assembly_quast + + input: + path scaffolds from ch_metaspades_quast.collect{ it[2] } + path fasta from ch_fasta + path gff from ch_gff + + output: + path "quast" into ch_quast_metaspades_mqc + + script: + features = params.gff ? 
"--features $gff" : "" + """ + quast.py \\ + --output-dir quast \\ + -r $fasta \\ + $features \\ + --threads $task.cpus \\ + ${scaffolds.join(' ')} + """ +} + +/* + * STEP 6.3.5: Overlap scaffolds with Minimap2, induce and polish assembly, and call variants with seqwish and vg + */ +process METASPADES_VG { + tag "$sample" + label 'process_medium' + label 'error_ignore' + publishDir "${params.outdir}/assembly/metaspades/variants", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.endsWith(".txt")) "bcftools_stats/$filename" + else if (filename.endsWith(".png")) "bandage/$filename" + else if (filename.endsWith(".svg")) "bandage/$filename" + else filename + } + + when: + !params.skip_assembly && 'metaspades' in assemblers && !single_end && !params.skip_vg + + input: + tuple val(sample), val(single_end), path(scaffolds) from ch_metaspades_vg + path fasta from ch_fasta + + output: + tuple val(sample), val(single_end), path("${sample}.vcf.gz*") into ch_metaspades_vg_vcf + path "*.bcftools_stats.txt" into ch_metaspades_vg_bcftools_mqc + path "*.{gfa,png,svg}" + + script: + """ + minimap2 -c -t $task.cpus -x asm20 $fasta $scaffolds > ${sample}.paf + + cat $scaffolds $fasta > ${sample}.withRef.fasta + seqwish --paf-alns ${sample}.paf --seqs ${sample}.withRef.fasta --gfa ${sample}.gfa --threads $task.cpus + + vg view -Fv ${sample}.gfa --threads $task.cpus > ${sample}.vg + vg convert -x ${sample}.vg > ${sample}.xg + + samtools faidx $fasta + vg snarls ${sample}.xg > ${sample}.snarls + for chrom in `cat ${fasta}.fai | cut -f1` + do + vg deconstruct -p \$chrom ${sample}.xg -r ${sample}.snarls --threads $task.cpus \\ + | bcftools sort -O v -T ./ \\ + | bgzip -c > ${sample}.\$chrom.vcf.gz + done + bcftools concat --output-type z --output ${sample}.vcf.gz *.vcf.gz + tabix -p vcf -f ${sample}.vcf.gz + bcftools stats ${sample}.vcf.gz > ${sample}.bcftools_stats.txt + + if [ -s ${sample}.gfa ] + then + Bandage image ${sample}.gfa ${sample}.png --height 1000 + 
Bandage image ${sample}.gfa ${sample}.svg --height 1000 + fi + """ +} + +/* + * STEP 6.3.6: Variant annotation with SnpEff and SnpSift + */ +process METASPADES_SNPEFF { + tag "$sample" + label 'process_medium' + label 'error_ignore' + publishDir "${params.outdir}/assembly/metaspades/variants/snpeff", mode: params.publish_dir_mode + + when: + !params.skip_assembly && 'metaspades' in assemblers && !single_end && !params.skip_vg && params.gff && !params.skip_snpeff + + input: + tuple val(sample), val(single_end), path(vcf) from ch_metaspades_vg_vcf + tuple file(db), file(config) from ch_snpeff_db_metaspades + + output: + path "*.snpEff.csv" into ch_metaspades_snpeff_mqc + path "*.vcf.gz*" + path "*.{txt,html}" - AWSBatch options: - --awsqueue [str] The AWSBatch JobQueue that needs to be set when running on AWSBatch - --awsregion [str] The AWS Region for your AWS Batch job to run on - --awscli [str] Path to the AWS CLI tool - """.stripIndent() + script: + """ + snpEff ${index_base} \\ + -config $config \\ + -dataDir $db \\ + ${vcf[0]} \\ + -csvStats ${sample}.snpEff.csv \\ + | bgzip -c > ${sample}.snpEff.vcf.gz + tabix -p vcf -f ${sample}.snpEff.vcf.gz + mv snpEff_summary.html ${sample}.snpEff.summary.html + + SnpSift extractFields -s "," \\ + -e "." 
\\ + ${sample}.snpEff.vcf.gz \\ + CHROM POS REF ALT \\ + "ANN[*].GENE" "ANN[*].GENEID" \\ + "ANN[*].IMPACT" "ANN[*].EFFECT" \\ + "ANN[*].FEATURE" "ANN[*].FEATUREID" \\ + "ANN[*].BIOTYPE" "ANN[*].RANK" "ANN[*].HGVS_C" \\ + "ANN[*].HGVS_P" "ANN[*].CDNA_POS" "ANN[*].CDNA_LEN" \\ + "ANN[*].CDS_POS" "ANN[*].CDS_LEN" "ANN[*].AA_POS" \\ + "ANN[*].AA_LEN" "ANN[*].DISTANCE" "EFF[*].EFFECT" \\ + "EFF[*].FUNCLASS" "EFF[*].CODON" "EFF[*].AA" "EFF[*].AA_LEN" \\ + > ${sample}.snpSift.table.txt + """ } -// Show help message -if (params.help) { - helpMessage() - exit 0 +//////////////////////////////////////////////////// +/* -- UNICYCLER -- */ +//////////////////////////////////////////////////// + +/* + * STEP 6.3: De novo assembly with Unicycler + */ +process UNICYCLER { + tag "$sample" + label 'process_high' + label 'error_ignore' + publishDir "${params.outdir}/assembly/unicycler", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.endsWith(".png")) "bandage/$filename" + else if (filename.endsWith(".svg")) "bandage/$filename" + else filename + } + + when: + !params.skip_assembly && 'unicycler' in assemblers + + input: + tuple val(sample), val(single_end), path(reads) from ch_kraken2_unicycler + + output: + tuple val(sample), val(single_end), path("*scaffolds.fa") into ch_unicycler_blast, + ch_unicycler_abacas, + ch_unicycler_plasmidid, + ch_unicycler_quast, + ch_unicycler_vg + path "*assembly.{gfa,png,svg}" + + script: + input_reads = single_end ? 
"-s $reads" : "-1 ${reads[0]} -2 ${reads[1]}" + """ + unicycler \\ + --threads $task.cpus \\ + $input_reads \\ + --out ./ + mv assembly.fasta ${sample}.scaffolds.fa + mv assembly.gfa ${sample}.assembly.gfa + + if [ -s ${sample}.assembly.gfa ] + then + Bandage image ${sample}.assembly.gfa ${sample}.assembly.png --height 1000 + Bandage image ${sample}.assembly.gfa ${sample}.assembly.svg --height 1000 + fi + """ } /* - * SET UP CONFIGURATION VARIABLES + * STEP 6.3.1: Run Blast on MetaSPAdes de novo assembly */ +process UNICYCLER_BLAST { + tag "$sample" + label 'process_medium' + label 'error_ignore' + publishDir "${params.outdir}/assembly/unicycler/blast", mode: params.publish_dir_mode -// Check if genome exists in the config file -if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { - exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}" + when: + !params.skip_assembly && 'unicycler' in assemblers && !params.skip_blast + + input: + tuple val(sample), val(single_end), path(scaffold) from ch_unicycler_blast + path db from ch_blast_db + path header from ch_blast_outfmt6_header + + output: + path "*.blast*" + + script: + """ + blastn \\ + -num_threads $task.cpus \\ + -db $db/$fasta_base \\ + -query $scaffold \\ + -outfmt \'6 stitle std slen qlen qcovs\' \\ + -out ${sample}.blast.txt + + awk 'BEGIN{OFS=\"\\t\";FS=\"\\t\"}{print \$0,\$5/\$15,\$5/\$14}' ${sample}.blast.txt | awk 'BEGIN{OFS=\"\\t\";FS=\"\\t\"} \$15 > 200 && \$17 > 0.7 && \$1 !~ /phage/ {print \$0}' > ${sample}.blast.filt.txt + cat $header ${sample}.blast.filt.txt > ${sample}.blast.filt.header.txt + """ } -// TODO nf-core: Add any reference files that are needed -// Configurable reference genomes -// -// NOTE - THIS IS NOT USED IN THIS PIPELINE, EXAMPLE ONLY -// If you want to use the channel below in a process, define the following: -// input: -// file fasta from 
ch_fasta -// -params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false -if (params.fasta) { ch_fasta = file(params.fasta, checkIfExists: true) } +/* + * STEP 6.3.2: Run ABACAS on Unicycler de novo assembly + */ +process UNICYCLER_ABACAS { + tag "$sample" + label 'process_medium' + label 'error_ignore' + publishDir "${params.outdir}/assembly/unicycler/abacas", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.indexOf("nucmer") > 0) "nucmer/$filename" + else filename + } -// Has the run name been specified by the user? -// this has the bonus effect of catching both -name and --name -custom_runName = params.name -if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { - custom_runName = workflow.runName + when: + !params.skip_assembly && 'unicycler' in assemblers && !params.skip_abacas + + input: + tuple val(sample), val(single_end), path(scaffold) from ch_unicycler_abacas + path fasta from ch_fasta + + output: + path "*.abacas*" + + script: + """ + abacas.pl -r $fasta -q $scaffold -m -p nucmer -o ${sample}.abacas + mv nucmer.delta ${sample}.abacas.nucmer.delta + mv nucmer.filtered.delta ${sample}.abacas.nucmer.filtered.delta + mv nucmer.tiling ${sample}.abacas.nucmer.tiling + mv unused_contigs.out ${sample}.abacas.unused.contigs.out + """ } -if (workflow.profile.contains('awsbatch')) { - // AWSBatch sanity checking - if (!params.awsqueue || !params.awsregion) exit 1, "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" - // Check outdir paths to be S3 buckets if running on AWSBatch - // related: https://github.com/nextflow-io/nextflow/issues/813 - if (!params.outdir.startsWith('s3:')) exit 1, "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!" - // Prevent trace files to be stored on S3 since S3 does not support rolling files. - if (params.tracedir.startsWith('s3:')) exit 1, "Specify a local tracedir or run without trace! S3 cannot be used for tracefiles." 
+/* + * STEP 6.3.3: Run PlasmidID on Unicycler de novo assembly + */ +process UNICYCLER_PLASMIDID { + tag "$sample" + label 'process_medium' + label 'error_ignore' + publishDir "${params.outdir}/assembly/unicycler/plasmidid", mode: params.publish_dir_mode + + when: + !params.skip_assembly && 'unicycler' in assemblers && !params.skip_plasmidid + + input: + tuple val(sample), val(single_end), path(scaffold) from ch_unicycler_plasmidid.filter { it.size() > 0 } + path fasta from ch_fasta + + output: + path "$sample" + + script: + """ + plasmidID -d $fasta -s $sample -c $scaffold --only-reconstruct -C 47 -S 47 -i 60 --no-trim -o . + mv NO_GROUP/$sample ./$sample + """ } -// Stage config files -ch_multiqc_config = file("$baseDir/assets/multiqc_config.yaml", checkIfExists: true) -ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() -ch_output_docs = file("$baseDir/docs/output.md", checkIfExists: true) +/* + * STEP 6.3.4: Run Quast on Unicycler de novo assembly + */ +process UNICYCLER_QUAST { + label 'process_medium' + label 'error_ignore' + publishDir "${params.outdir}/assembly/unicycler", mode: params.publish_dir_mode + + when: + !params.skip_assembly && 'unicycler' in assemblers && !params.skip_assembly_quast + + input: + path scaffolds from ch_unicycler_quast.collect{ it[2] } + path fasta from ch_fasta + path gff from ch_gff + + output: + path "quast" into ch_quast_unicycler_mqc + + script: + features = params.gff ? 
"--features $gff" : "" + """ + quast.py \\ + --output-dir quast \\ + -r $fasta \\ + $features \\ + --threads $task.cpus \\ + ${scaffolds.join(' ')} + """ +} /* - * Create a channel for input read files + * STEP 6.3.5: Overlap scaffolds with Minimap2, induce and polish assembly, and call variants with seqwish and vg */ -if (params.readPaths) { - if (params.single_end) { - Channel - .from(params.readPaths) - .map { row -> [ row[0], [ file(row[1][0], checkIfExists: true) ] ] } - .ifEmpty { exit 1, "params.readPaths was empty - no input files supplied" } - .into { ch_read_files_fastqc; ch_read_files_trimming } - } else { - Channel - .from(params.readPaths) - .map { row -> [ row[0], [ file(row[1][0], checkIfExists: true), file(row[1][1], checkIfExists: true) ] ] } - .ifEmpty { exit 1, "params.readPaths was empty - no input files supplied" } - .into { ch_read_files_fastqc; ch_read_files_trimming } - } -} else { - Channel - .fromFilePairs(params.reads, size: params.single_end ? 1 : 2) - .ifEmpty { exit 1, "Cannot find any reads matching: ${params.reads}\nNB: Path needs to be enclosed in quotes!\nIf this is single-end data, please specify --single_end on the command line." 
} - .into { ch_read_files_fastqc; ch_read_files_trimming } +process UNICYCLER_VG { + tag "$sample" + label 'process_medium' + label 'error_ignore' + publishDir "${params.outdir}/assembly/unicycler/variants", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.endsWith(".txt")) "bcftools_stats/$filename" + else if (filename.endsWith(".png")) "bandage/$filename" + else if (filename.endsWith(".svg")) "bandage/$filename" + else filename + } + + when: + !params.skip_assembly && 'unicycler' in assemblers && !params.skip_vg + + input: + tuple val(sample), val(single_end), path(scaffolds) from ch_unicycler_vg + path fasta from ch_fasta + + output: + tuple val(sample), val(single_end), path("${sample}.vcf.gz*") into ch_unicycler_vg_vcf + path "*.bcftools_stats.txt" into ch_unicycler_vg_bcftools_mqc + path "*.{gfa,png,svg}" + + script: + """ + minimap2 -c -t $task.cpus -x asm20 $fasta $scaffolds > ${sample}.paf + + cat $scaffolds $fasta > ${sample}.withRef.fasta + seqwish --paf-alns ${sample}.paf --seqs ${sample}.withRef.fasta --gfa ${sample}.gfa --threads $task.cpus + + vg view -Fv ${sample}.gfa --threads $task.cpus > ${sample}.vg + vg convert -x ${sample}.vg > ${sample}.xg + + samtools faidx $fasta + vg snarls ${sample}.xg > ${sample}.snarls + for chrom in `cat ${fasta}.fai | cut -f1` + do + vg deconstruct -p \$chrom ${sample}.xg -r ${sample}.snarls --threads $task.cpus \\ + | bcftools sort -O v -T ./ \\ + | bgzip -c > ${sample}.\$chrom.vcf.gz + done + bcftools concat --output-type z --output ${sample}.vcf.gz *.vcf.gz + tabix -p vcf -f ${sample}.vcf.gz + bcftools stats ${sample}.vcf.gz > ${sample}.bcftools_stats.txt + + if [ -s ${sample}.gfa ] + then + Bandage image ${sample}.gfa ${sample}.png --height 1000 + Bandage image ${sample}.gfa ${sample}.svg --height 1000 + fi + """ } -// Header log info -log.info nfcoreHeader() -def summary = [:] -if (workflow.revision) summary['Pipeline Release'] = workflow.revision -summary['Run Name'] = custom_runName ?: 
workflow.runName -// TODO nf-core: Report custom parameters here -summary['Reads'] = params.reads -summary['Fasta Ref'] = params.fasta -summary['Data Type'] = params.single_end ? 'Single-End' : 'Paired-End' -summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" -if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" -summary['Output dir'] = params.outdir -summary['Launch dir'] = workflow.launchDir -summary['Working dir'] = workflow.workDir -summary['Script dir'] = workflow.projectDir -summary['User'] = workflow.userName -if (workflow.profile.contains('awsbatch')) { - summary['AWS Region'] = params.awsregion - summary['AWS Queue'] = params.awsqueue - summary['AWS CLI'] = params.awscli +/* + * STEP 6.3.6: Variant annotation with SnpEff and SnpSift + */ +process UNICYCLER_SNPEFF { + tag "$sample" + label 'process_medium' + label 'error_ignore' + publishDir "${params.outdir}/assembly/unicycler/variants/snpeff", mode: params.publish_dir_mode + + when: + !params.skip_assembly && 'unicycler' in assemblers && !params.skip_vg && params.gff && !params.skip_snpeff + + input: + tuple val(sample), val(single_end), path(vcf) from ch_unicycler_vg_vcf + tuple file(db), file(config) from ch_snpeff_db_unicycler + + output: + path "*.snpEff.csv" into ch_unicycler_snpeff_mqc + path "*.vcf.gz*" + path "*.{txt,html}" + + script: + """ + snpEff ${index_base} \\ + -config $config \\ + -dataDir $db \\ + ${vcf[0]} \\ + -csvStats ${sample}.snpEff.csv \\ + | bgzip -c > ${sample}.snpEff.vcf.gz + tabix -p vcf -f ${sample}.snpEff.vcf.gz + mv snpEff_summary.html ${sample}.snpEff.summary.html + + SnpSift extractFields -s "," \\ + -e "." 
\\ + ${sample}.snpEff.vcf.gz \\ + CHROM POS REF ALT \\ + "ANN[*].GENE" "ANN[*].GENEID" \\ + "ANN[*].IMPACT" "ANN[*].EFFECT" \\ + "ANN[*].FEATURE" "ANN[*].FEATUREID" \\ + "ANN[*].BIOTYPE" "ANN[*].RANK" "ANN[*].HGVS_C" \\ + "ANN[*].HGVS_P" "ANN[*].CDNA_POS" "ANN[*].CDNA_LEN" \\ + "ANN[*].CDS_POS" "ANN[*].CDS_LEN" "ANN[*].AA_POS" \\ + "ANN[*].AA_LEN" "ANN[*].DISTANCE" "EFF[*].EFFECT" \\ + "EFF[*].FUNCLASS" "EFF[*].CODON" "EFF[*].AA" "EFF[*].AA_LEN" \\ + > ${sample}.snpSift.table.txt + """ } -summary['Config Profile'] = workflow.profile -if (params.config_profile_description) summary['Config Description'] = params.config_profile_description -if (params.config_profile_contact) summary['Config Contact'] = params.config_profile_contact -if (params.config_profile_url) summary['Config URL'] = params.config_profile_url -if (params.email || params.email_on_fail) { - summary['E-mail Address'] = params.email - summary['E-mail on failure'] = params.email_on_fail - summary['MultiQC maxsize'] = params.max_multiqc_email_size + +//////////////////////////////////////////////////// +/* -- MINIA -- */ +//////////////////////////////////////////////////// + +/* + * STEP 6.3: De novo assembly with minia + */ +process MINIA { + tag "$sample" + label 'process_high' + label 'error_ignore' + publishDir "${params.outdir}/assembly/minia/${params.minia_kmer}", mode: params.publish_dir_mode + + when: + !params.skip_assembly && 'minia' in assemblers + + input: + tuple val(sample), val(single_end), path(reads) from ch_kraken2_minia + + output: + tuple val(sample), val(single_end), path("*scaffolds.fa") into ch_minia_vg, + ch_minia_blast, + ch_minia_abacas, + ch_minia_plasmidid, + ch_minia_quast + + script: + """ + echo "${reads.join("\n")}" > input_files.txt + minia \\ + -kmer-size $params.minia_kmer \\ + -abundance-min 20 \\ + -nb-cores $task.cpus \\ + -in input_files.txt \\ + -out ${sample}.k${params.minia_kmer}.a20 + mv ${sample}.k${params.minia_kmer}.a20.contigs.fa 
${sample}.k${params.minia_kmer}.scaffolds.fa + """ } -log.info summary.collect { k,v -> "${k.padRight(18)}: $v" }.join("\n") -log.info "-\033[2m--------------------------------------------------\033[0m-" -// Check the hostnames against configured profiles -checkHostname() +/* + * STEP 6.3.1: Run Blast on minia de novo assembly + */ +process MINIA_BLAST { + tag "$sample" + label 'process_medium' + label 'error_ignore' + publishDir "${params.outdir}/assembly/minia/${params.minia_kmer}/blast", mode: params.publish_dir_mode + + when: + !params.skip_assembly && 'minia' in assemblers && !params.skip_blast + + input: + tuple val(sample), val(single_end), path(scaffold) from ch_minia_blast + path db from ch_blast_db + path header from ch_blast_outfmt6_header + + output: + path "*.blast*" + + script: + """ + blastn \\ + -num_threads $task.cpus \\ + -db $db/$fasta_base \\ + -query $scaffold \\ + -outfmt \'6 stitle std slen qlen qcovs\' \\ + -out ${sample}.blast.txt + + awk 'BEGIN{OFS=\"\\t\";FS=\"\\t\"}{print \$0,\$5/\$15,\$5/\$14}' ${sample}.blast.txt | awk 'BEGIN{OFS=\"\\t\";FS=\"\\t\"} \$15 > 200 && \$17 > 0.7 && \$1 !~ /phage/ {print \$0}' > ${sample}.blast.filt.txt + cat $header ${sample}.blast.filt.txt > ${sample}.blast.filt.header.txt + """ +} + +/* + * STEP 6.3.2: Run ABACAS on minia de novo assembly + */ +process MINIA_ABACAS { + tag "$sample" + label 'process_medium' + label 'error_ignore' + publishDir "${params.outdir}/assembly/minia/${params.minia_kmer}/abacas", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.indexOf("nucmer") > 0) "nucmer/$filename" + else filename + } + + when: + !params.skip_assembly && 'minia' in assemblers && !params.skip_abacas + + input: + tuple val(sample), val(single_end), path(scaffold) from ch_minia_abacas + path fasta from ch_fasta + + output: + path "*.abacas*" + + script: + """ + abacas.pl -r $fasta -q $scaffold -m -p nucmer -o ${sample}.abacas + mv nucmer.delta ${sample}.abacas.nucmer.delta + mv 
nucmer.filtered.delta ${sample}.abacas.nucmer.filtered.delta + mv nucmer.tiling ${sample}.abacas.nucmer.tiling + mv unused_contigs.out ${sample}.abacas.unused.contigs.out + """ +} + +/* + * STEP 6.3.3: Run PlasmidID on minia de novo assembly + */ +process MINIA_PLASMIDID { + tag "$sample" + label 'process_medium' + label 'error_ignore' + publishDir "${params.outdir}/assembly/minia/${params.minia_kmer}/plasmidid", mode: params.publish_dir_mode + + when: + !params.skip_assembly && 'minia' in assemblers && !params.skip_plasmidid + + input: + tuple val(sample), val(single_end), path(scaffold) from ch_minia_plasmidid.filter { it.size() > 0 } + path fasta from ch_fasta + + output: + path "$sample" + + script: + """ + plasmidID -d $fasta -s $sample -c $scaffold --only-reconstruct -C 47 -S 47 -i 60 --no-trim -o . + mv NO_GROUP/$sample ./$sample + """ +} + +/* + * STEP 6.3.4: Run Quast on minia de novo assembly + */ +process MINIA_QUAST { + label 'process_medium' + label 'error_ignore' + publishDir "${params.outdir}/assembly/minia/${params.minia_kmer}", mode: params.publish_dir_mode + + when: + !params.skip_assembly && 'minia' in assemblers && !params.skip_assembly_quast + + input: + path scaffolds from ch_minia_quast.collect{ it[2] } + path fasta from ch_fasta + path gff from ch_gff + + output: + path "quast" into ch_quast_minia_mqc + + script: + features = params.gff ? 
"--features $gff" : "" + """ + quast.py \\ + --output-dir quast \\ + -r $fasta \\ + $features \\ + --threads $task.cpus \\ + ${scaffolds.join(' ')} + """ +} + +/* + * STEP 6.3.5: Overlap scaffolds with Minimap2, induce and polish assembly, and call variants with seqwish and vg + */ +process MINIA_VG { + tag "$sample" + label 'process_medium' + label 'error_ignore' + publishDir "${params.outdir}/assembly/minia/${params.minia_kmer}/variants", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.endsWith(".txt")) "bcftools_stats/$filename" + else if (filename.endsWith(".png")) "bandage/$filename" + else if (filename.endsWith(".svg")) "bandage/$filename" + else filename + } + + when: + !params.skip_assembly && 'minia' in assemblers && !params.skip_vg + + input: + tuple val(sample), val(single_end), path(scaffolds) from ch_minia_vg + path fasta from ch_fasta + + output: + tuple val(sample), val(single_end), path("${sample}.vcf.gz*") into ch_minia_vg_vcf + path "*.bcftools_stats.txt" into ch_minia_vg_bcftools_mqc + path "*.{gfa,png,svg}" + + script: + """ + minimap2 -c -t $task.cpus -x asm20 $fasta $scaffolds > ${sample}.paf + + cat $scaffolds $fasta > ${sample}.withRef.fasta + seqwish --paf-alns ${sample}.paf --seqs ${sample}.withRef.fasta --gfa ${sample}.gfa --threads $task.cpus + + vg view -Fv ${sample}.gfa --threads $task.cpus > ${sample}.vg + vg convert -x ${sample}.vg > ${sample}.xg + + samtools faidx $fasta + vg snarls ${sample}.xg > ${sample}.snarls + for chrom in `cat ${fasta}.fai | cut -f1` + do + vg deconstruct -p \$chrom ${sample}.xg -r ${sample}.snarls --threads $task.cpus \\ + | bcftools sort -O v -T ./ \\ + | bgzip -c > ${sample}.\$chrom.vcf.gz + done + bcftools concat --output-type z --output ${sample}.vcf.gz *.vcf.gz + tabix -p vcf -f ${sample}.vcf.gz + bcftools stats ${sample}.vcf.gz > ${sample}.bcftools_stats.txt + + if [ -s ${sample}.gfa ] + then + Bandage image ${sample}.gfa ${sample}.png --height 1000 + Bandage image ${sample}.gfa 
${sample}.svg --height 1000 + fi + """ +} + +/* + * STEP 6.3.6: Variant annotation with SnpEff and SnpSift + */ +process MINIA_SNPEFF { + tag "$sample" + label 'process_medium' + label 'error_ignore' + publishDir "${params.outdir}/assembly/minia/${params.minia_kmer}/variants/snpeff", mode: params.publish_dir_mode + + when: + !params.skip_assembly && 'minia' in assemblers && !params.skip_vg && params.gff && !params.skip_snpeff + + input: + tuple val(sample), val(single_end), path(vcf) from ch_minia_vg_vcf + tuple file(db), file(config) from ch_snpeff_db_minia + + output: + path "*.snpEff.csv" into ch_minia_snpeff_mqc + path "*.vcf.gz*" + path "*.{txt,html}" + + script: + """ + snpEff ${index_base} \\ + -config $config \\ + -dataDir $db \\ + ${vcf[0]} \\ + -csvStats ${sample}.snpEff.csv \\ + | bgzip -c > ${sample}.snpEff.vcf.gz + tabix -p vcf -f ${sample}.snpEff.vcf.gz + mv snpEff_summary.html ${sample}.snpEff.summary.html + + SnpSift extractFields -s "," \\ + -e "." \\ + ${sample}.snpEff.vcf.gz \\ + CHROM POS REF ALT \\ + "ANN[*].GENE" "ANN[*].GENEID" \\ + "ANN[*].IMPACT" "ANN[*].EFFECT" \\ + "ANN[*].FEATURE" "ANN[*].FEATUREID" \\ + "ANN[*].BIOTYPE" "ANN[*].RANK" "ANN[*].HGVS_C" \\ + "ANN[*].HGVS_P" "ANN[*].CDNA_POS" "ANN[*].CDNA_LEN" \\ + "ANN[*].CDS_POS" "ANN[*].CDS_LEN" "ANN[*].AA_POS" \\ + "ANN[*].AA_LEN" "ANN[*].DISTANCE" "EFF[*].EFFECT" \\ + "EFF[*].FUNCLASS" "EFF[*].CODON" "EFF[*].AA" "EFF[*].AA_LEN" \\ + > ${sample}.snpSift.table.txt + """ +} + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +/* -- -- */ +/* -- MULTIQC -- */ +/* -- -- */ +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// Channel.from(summary.collect{ [it.key, it.value] }) .map { k,v -> "
        $k
        ${v ?: 'N/A'}
        " } @@ -174,90 +2991,130 @@ Channel.from(summary.collect{ [it.key, it.value] }) * Parse software version numbers */ process get_software_versions { - publishDir "${params.outdir}/pipeline_info", mode: 'copy', + publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode, saveAs: { filename -> - if (filename.indexOf(".csv") > 0) filename + if (filename.endsWith(".csv")) filename else null } output: - file 'software_versions_mqc.yaml' into ch_software_versions_yaml - file "software_versions.csv" + path 'software_versions_mqc.yaml' into ch_software_versions_yaml + path "software_versions.csv" script: - // TODO nf-core: Get all tools to print their version number here """ echo $workflow.manifest.version > v_pipeline.txt echo $workflow.nextflow.version > v_nextflow.txt + parallel-fastq-dump --version > v_parallel_fastq_dump.txt fastqc --version > v_fastqc.txt + fastp --version 2> v_fastp.txt + bowtie2 --version > v_bowtie2.txt + samtools --version > v_samtools.txt + bedtools --version > v_bedtools.txt + picard CollectMultipleMetrics --version &> v_picard.txt || true + ivar -v > v_ivar.txt + echo \$(varscan 2>&1) > v_varscan.txt + snpEff -version > v_snpeff.txt + echo \$(SnpSift 2>&1) > v_snpsift.txt + bcftools -v > v_bcftools.txt + cutadapt --version > v_cutadapt.txt + kraken2 --version > v_kraken2.txt + spades.py --version > v_spades.txt + unicycler --version > v_unicycler.txt + minia --version > v_minia.txt + minimap2 --version > v_minimap2.txt + vg version > v_vg.txt + blastn -version > v_blast.txt + abacas.pl -v &> v_abacas.txt || true + quast.py --version > v_quast.txt + Bandage --version > v_bandage.txt + echo \$(R --version 2>&1) > v_R.txt multiqc --version > v_multiqc.txt scrape_software_versions.py &> software_versions_mqc.yaml """ } /* - * STEP 1 - FastQC + * STEP 7: MultiQC */ -process fastqc { - tag "$name" - label 'process_medium' - publishDir "${params.outdir}/fastqc", mode: 'copy', +process MULTIQC { + publishDir "${params.outdir}", 
mode: params.publish_dir_mode, saveAs: { filename -> - filename.indexOf(".zip") > 0 ? "zips/$filename" : "$filename" + if (filename.endsWith("assembly_metrics_mqc.tsv")) "assembly/$filename" + else if (filename.endsWith("variants_metrics_mqc.tsv")) "variants/$filename" + else "multiqc/$filename" } - input: - set val(name), file(reads) from ch_read_files_fastqc - - output: - file "*_fastqc.{zip,html}" into ch_fastqc_results - - script: - """ - fastqc --quiet --threads $task.cpus $reads - """ -} - -/* - * STEP 2 - MultiQC - */ -process multiqc { - publishDir "${params.outdir}/MultiQC", mode: 'copy' + when: + !params.skip_multiqc input: - file (multiqc_config) from ch_multiqc_config - file (mqc_custom_config) from ch_multiqc_custom_config.collect().ifEmpty([]) - // TODO nf-core: Add in log files from your new processes for MultiQC to find! - file ('fastqc/*') from ch_fastqc_results.collect().ifEmpty([]) - file ('software_versions/*') from ch_software_versions_yaml.collect() - file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml") + path (multiqc_config) from ch_multiqc_config + path (mqc_custom_config) from ch_multiqc_custom_config.collect().ifEmpty([]) + path ('fastqc/*') from ch_fastqc_raw_reports_mqc.collect().ifEmpty([]) + path ('fastp/log/*') from ch_fastp_mqc.collect().ifEmpty([]) + path ('fastp/fastqc/*') from ch_fastp_fastqc_mqc.collect().ifEmpty([]) + path ('bowtie2/log/*') from ch_bowtie2_mqc.collect().ifEmpty([]) + path ('bowtie2/flagstat/*') from ch_sort_bam_flagstat_mqc.collect().ifEmpty([]) + path ('ivar/trim/flagstat/*') from ch_ivar_trim_flagstat_mqc.collect().ifEmpty([]) + path ('ivar/trim/log/*') from ch_ivar_trim_log_mqc.collect().ifEmpty([]) + path ('picard/markdup/*') from ch_markdup_bam_flagstat_mqc.collect().ifEmpty([]) + path ('picard/metrics/*') from ch_markdup_bam_metrics_mqc.collect().ifEmpty([]) + path ('picard/metrics/*') from ch_picard_metrics_mqc.collect().ifEmpty([]) + path 
('varscan2/counts/lowfreq/*') from ch_varscan2_log_mqc.collect().ifEmpty([]) + path ('varscan2/bcftools/highfreq/*') from ch_varscan2_bcftools_highfreq_mqc.collect().ifEmpty([]) + path ('varscan2/snpeff/highfreq/*') from ch_varscan2_snpeff_highfreq_mqc.collect().ifEmpty([]) + path ('varscan2/quast/highfreq/*') from ch_varscan2_quast_mqc.collect().ifEmpty([]) + path ('ivar/variants/counts/lowfreq/*') from ch_ivar_count_mqc.collect().ifEmpty([]) + path ('ivar/variants/bcftools/highfreq/*') from ch_ivar_bcftools_highfreq_mqc.collect().ifEmpty([]) + path ('ivar/variants/snpeff/highfreq/*') from ch_ivar_snpeff_highfreq_mqc.collect().ifEmpty([]) + path ('ivar/consensus/quast/highfreq/*') from ch_ivar_quast_mqc.collect().ifEmpty([]) + path ('bcftools/variants/bcftools/*') from ch_bcftools_variants_mqc.collect().ifEmpty([]) + path ('bcftools/variants/snpeff/*') from ch_bcftools_snpeff_mqc.collect().ifEmpty([]) + path ('bcftools/consensus/quast/*') from ch_bcftools_quast_mqc.collect().ifEmpty([]) + path ('cutadapt/log/*') from ch_cutadapt_mqc.collect().ifEmpty([]) + path ('cutadapt/fastqc/*') from ch_cutadapt_fastqc_mqc.collect().ifEmpty([]) + path ('kraken2/*') from ch_kraken2_report_mqc.collect().ifEmpty([]) + path ('spades/bcftools/*') from ch_spades_vg_bcftools_mqc.collect().ifEmpty([]) + path ('spades/snpeff/*') from ch_spades_snpeff_mqc.collect().ifEmpty([]) + path ('spades/quast/*') from ch_quast_spades_mqc.collect().ifEmpty([]) + path ('metaspades/bcftools/*') from ch_metaspades_vg_bcftools_mqc.collect().ifEmpty([]) + path ('metaspades/snpeff/*') from ch_metaspades_snpeff_mqc.collect().ifEmpty([]) + path ('metaspades/quast/*') from ch_quast_metaspades_mqc.collect().ifEmpty([]) + path ('unicycler/bcftools/*') from ch_unicycler_vg_bcftools_mqc.collect().ifEmpty([]) + path ('unicycler/snpeff/*') from ch_unicycler_snpeff_mqc.collect().ifEmpty([]) + path ('unicycler/quast/*') from ch_quast_unicycler_mqc.collect().ifEmpty([]) + path ('minia/bcftools/*') from 
ch_minia_vg_bcftools_mqc.collect().ifEmpty([]) + path ('minia/snpeff/*') from ch_minia_snpeff_mqc.collect().ifEmpty([]) + path ('minia/quast/*') from ch_quast_minia_mqc.collect().ifEmpty([]) + path ('software_versions/*') from ch_software_versions_yaml.collect() + path workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml") output: - file "*multiqc_report.html" into ch_multiqc_report - file "*_data" - file "multiqc_plots" + path "*multiqc_report.html" into ch_multiqc_report + path "*_data" + path "*.tsv" script: rtitle = custom_runName ? "--title \"$custom_runName\"" : '' rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' custom_config_file = params.multiqc_config ? "--config $mqc_custom_config" : '' - // TODO nf-core: Specify which MultiQC modules to use with -m for a faster run time """ - multiqc -f $rtitle $rfilename $custom_config_file . + multiqc . -f $rtitle $rfilename $custom_config_file + multiqc_to_custom_tsv.py + multiqc . 
-f $rtitle $rfilename $custom_config_file """ } -/* - * STEP 3 - Output Description HTML - */ process output_documentation { - publishDir "${params.outdir}/pipeline_info", mode: 'copy' + publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode input: - file output_docs from ch_output_docs + path output_docs from ch_output_docs + path images from ch_output_docs_images output: - file "results_description.html" + path "results_description.html" script: """ @@ -298,7 +3155,6 @@ workflow.onComplete { email_fields['summary']['Nextflow Build'] = workflow.nextflow.build email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp - // TODO nf-core: If not using MultiQC, strip out this code (including params.max_multiqc_email_size) // On success try attach the multiqc report def mqc_report = null try { @@ -380,7 +3236,6 @@ workflow.onComplete { } - def nfcoreHeader() { // Log colors ANSI codes c_black = params.monochrome_logs ? '' : "\033[0;30m"; @@ -424,3 +3279,13 @@ def checkHostname() { } } } + +// Function to check if running offline +def isOffline() { + try { + return NXF_OFFLINE as Boolean + } + catch( Exception e ) { + return false + } +} diff --git a/nextflow.config b/nextflow.config index d13de739..b0432401 100644 --- a/nextflow.config +++ b/nextflow.config @@ -8,14 +8,70 @@ // Global default params, used in configs params { - // Workflow flags - // TODO nf-core: Specify your pipeline's command line flags + // Options: Generic + input = './samplesheet.csv' + protocol = 'metagenomic' + amplicon_fasta = false + amplicon_bed =false + + // Options: SRA download + save_sra_fastq = false + skip_sra = false + + // Options: Reference genomes genome = false - reads = "data/*{1,2}.fastq.gz" - single_end = false - outdir = './results' + save_reference = false + + // Options: Kraken2 + kraken2_db = 'https://zenodo.org/record/3738199/files/kraken2_human.tar.gz' + kraken2_db_name = 'human' + kraken2_use_ftp = false + save_kraken2_fastq = 
false + skip_kraken2 = false + + // Options: Read Trimming + cut_mean_quality = 30 + qualified_quality_phred = 30 + unqualified_percent_limit = 10 + min_trim_length = 50 + skip_adapter_trimming = false + skip_amplicon_trimming = false + save_trimmed = false + + // Options: Variant calling + callers = 'varscan2,ivar,bcftools' + ivar_exclude_reads = false + filter_dups = false + filter_unmapped = false + min_base_qual = 20 + min_coverage = 10 + max_allele_freq = 0.8 + save_align_intermeds = false + save_mpileup = false + skip_markduplicates = false + skip_snpeff = false + skip_variants_quast = false + skip_variants = false + + // Options: De novo assembly + assemblers = 'spades,metaspades,unicycler,minia' + minia_kmer = 31 + skip_blast = false + skip_abacas = false + skip_plasmidid = false + skip_vg = false + skip_assembly_quast = false + skip_assembly = false + + // Options: QC + skip_fastqc = false + skip_picard_metrics = false + skip_multiqc = false + skip_qc = false // Boilerplate options + outdir = './results' + publish_dir_mode = 'copy' name = false multiqc_config = false email = false @@ -24,9 +80,7 @@ params { plaintext_email = false monochrome_logs = false help = false - igenomes_base = 's3://ngi-igenomes/igenomes/' tracedir = "${params.outdir}/pipeline_info" - igenomes_ignore = false custom_config_version = 'master' custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" hostnames = false @@ -43,7 +97,7 @@ params { // Container slug. Stable releases should specify release tag! 
// Developmental code should specify :dev -process.container = 'nfcore/viralrecon:dev' +process.container = 'nfcore/viralrecon:1.0.0' // Load base.config by default for all pipelines includeConfig 'conf/base.config' @@ -55,6 +109,13 @@ try { System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config") } +// Load nf-core/viralrecon custom config +try { + includeConfig "${params.custom_config_base}/pipeline/viralrecon.config" +} catch (Exception e) { + System.err.println("WARNING: Could not load nf-core/config/viralrecon profiles: ${params.custom_config_base}/pipeline/viralrecon.config") +} + profiles { conda { process.conda = "$baseDir/environment.yml" } debug { process.beforeScript = 'echo $HOSTNAME' } @@ -70,17 +131,18 @@ profiles { singularity.enabled = true singularity.autoMounts = true } - test { includeConfig 'conf/test.config' } -} - -// Load igenomes.config if required -if (!params.igenomes_ignore) { - includeConfig 'conf/igenomes.config' + test { includeConfig 'conf/test.config' } + test_sra { includeConfig 'conf/test_sra.config' } + test_sispa { includeConfig 'conf/test_sispa.config' } + test_full { includeConfig 'conf/test_full.config' } + test_full_sispa { includeConfig 'conf/test_full_sispa.config' } } -// Export this variable to prevent local Python libraries from conflicting with those in the container +// Export these variables to prevent local Python/R libraries from conflicting with those in the container env { PYTHONNOUSERSITE = 1 + R_PROFILE_USER = "/.Rprofile" + R_ENVIRON_USER = "/.Renviron" } // Capture exit codes from upstream processes when piping @@ -110,7 +172,7 @@ manifest { description = 'Assembly and intrahost/low-frequency variant calling for viral samples' mainScript = 'main.nf' nextflowVersion = '>=19.10.0' - version = '1.0dev' + version = '1.0.0' } // Function to ensure that resource requirements don't go beyond