diff --git a/CHANGELOG.md b/CHANGELOG.md index 3db051c1..244fd1fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,15 +3,27 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v0.9.1dev - [date] +## [[0.10.0](https://github.com/sanger-tol/genomeassembly/releases/tag/0.10.0)] - Hideous Zippleback - [2024-04-16] -### `Added` +### Enhancements & fixes + +- OATK module is added into the ORGANELLES subworkflow +- ORGANELLES subworkflow is now called once in the main workflow and runs MITOHIFI in read and assembly mode along with OATK +- ORGANELLES module is now tested in github CI +- NCBI API secret introduced to run MITOHIFI_FINDMITOREFERENCE module +- hifiasm haplotigs are not purged anymore +- Longranger container version is updated + +### Software dependencies -### `Fixed` +Note, since the pipeline is using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference. -### `Dependencies` +| Dependency | Old version | New version | +| ---------- | ----------- | ----------- | +| mitohifi | 3.0.0 | 3.1.1 | +| oatk | | 1.0 | -### `Deprecated` +**NB:** Dependency has been **added** if just the new version information is present. 
## [[0.9.0](https://github.com/sanger-tol/genomeassembly/releases/tag/0.9.0)] - Night Fury - [2023-12-15] @@ -60,6 +72,7 @@ Note, since the pipeline is using Nextflow DSL2, each process will be run with i | fastk | | f18a4e6d2207539f7b84461daebc54530a9559b0 | | freebayes | | 1.3.6 | | gatk4 | | 4.4.0.0 | +| genescope | | 380815c420f50171f9234a0fd1ff426b39829b91 | | gfastats | | 1.3.5 | | GNU Awk | | 5.1.0 | | hifiasm | | 0.19.3-r572 | diff --git a/README.md b/README.md index b1a0e925..33c5d629 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,8 @@ While the steps are described in a sequential order, many of them can be execute 1. Illumina 10X reads to the joined primary and alt contigs. 2. polish initial assembly based on the aligment produced in [9i]. Set polished primary contigs as the primary assembly and polished haplotigs as the haplotig assembly. 3. produce numerical stats, BUSCO score and QV, completeness metrics, and kmer spectra for [9ii]. -10. Run organelles subworkflow on the joined primary and haplotigs contigs. +10. If organelles_on + 1. Run organelles subworkflow on the raw HiFi read data and the joined primary and haplotigs contigs. 11. Map HiC data onto primary contigs. 12. Run scaffolding for primary contigs. 13. Produce numerical stats, BUSCO score and QV, completeness metrics, and kmer spectra for [12]. 
diff --git a/assets/schema_input.json b/assets/schema_input.json index 36f31f17..9d5ca366 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -44,6 +44,40 @@ "errorMessage": "busco lineage to run" } } + }, + "mito": { + "type": "object", + "properties": { + "species": { + "type": "string", + "errorMessage": "Latin name" + }, + "min_length": { + "type": "string", + "errorMessage": "Minimal allowed length of the mito reference" + }, + "email": { + "type": "string", + "errorMessage": "email to query NCBI" + }, + "code": { + "type": "string", + "errorMessage": "Mitochondrial code" + }, + "fam": { + "type": "string", + "errorMessage": "Path to mitochondrial HMM for OATK" + } + } + }, + "plastid": { + "type": "object", + "properties": { + "fam": { + "type": "string", + "errorMessage": "Path to plastid HMM for OATK" + } + } } }, "required": ["dataset", "busco"] diff --git a/assets/test.yaml b/assets/test.yaml index d58fbb56..783f980c 100644 --- a/assets/test.yaml +++ b/assets/test.yaml @@ -16,3 +16,5 @@ mito: min_length: 15000 code: 5 fam: /lustre/scratch124/tol/projects/darwin/users/cz3/organelle_asm/hmm_db/insecta_mito.fam +plastid: + fam: /lustre/scratch124/tol/projects/darwin/users/cz3/organelle_asm/hmm_db/acrogymnospermae_pltd.fam diff --git a/conf/modules.config b/conf/modules.config index 399594cc..3673f5d1 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -218,7 +218,7 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] } - withName: '.*OATK' { + withName: '.*ORGANELLES:OATK' { ext.args = "-k1001 -c90 -Ttmp" publishDir = [ path: { "${params.outdir}/${meta.id}.${params.hifiasm}/oatk" }, diff --git a/conf/test.config b/conf/test.config index 4c6c4a7e..09b6896a 100644 --- a/conf/test.config +++ b/conf/test.config @@ -26,11 +26,3 @@ params { polishing_on = true hifiasm_hic_on = true } - -process { - // Set up of the scffolding pipeline - withName: 'YAHS' { - // Skip the initial assembly error correction step - ext.args = '-r 1000,2000,5000' - } -} diff --git a/docs/images/v1/organelles.drawio b/docs/images/v1/organelles.drawio index 1c4ca31d..98f46f68 100644 --- a/docs/images/v1/organelles.drawio +++ b/docs/images/v1/organelles.drawio @@ -1,31 +1,13 @@ - + - + - - - - - - - - - - - - - + - - - - - - - + @@ -34,16 +16,7 @@ - - - - - - - - - - + @@ -53,39 +26,93 @@ - + - + - + - + - + - - - - + - - + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + diff --git a/docs/images/v1/organelles.png b/docs/images/v1/organelles.png index 36fa4dc7..1b75a03d 100644 Binary files a/docs/images/v1/organelles.png and b/docs/images/v1/organelles.png differ diff --git a/docs/output.md b/docs/output.md index a858863b..5a92b0b9 100644 --- a/docs/output.md +++ b/docs/output.md @@ -176,10 +176,19 @@ This subworkflow is used to evaluate the quality of sequences. 
It is performed a - \*.hifiasm.\*/mito..\*/contigs_stats.tsv - summary of mitochondrial findings - output also includes other output files produced by MitoHiFi +- \*.hifiasm.\*/oatk/.\*mito.ctg.fasta + - mitochondrion assembly +- \*.hifiasm.\*/oatk/.\*mito.gfa + - assembly graph for the mitochondrion assembly +- \*.hifiasm.\*/oatk/.\*pltd.ctg.fasta + - plastid assembly +- \*.hifiasm.\*/oatk/.\*pltd.gfa + - assembly graph for the plastid assembly +- output also includes other output files produced by oatk -This subworkflow implements assembly of organelles. In the main pipeline it is called twice - for assembling mitochondrion from HiFi reads and as an alternative it runs identification of the mitochondrion for the genome assembly

+This subworkflow implements assembly of organelles. First it identifies a reference mitochondrion assembly by querying NCBI; then MitoHiFi is called on the raw HiFi reads and, separately, on the assembled contigs using the queried reference. OATK is also called separately on the raw reads. For plants, an optional path to a plastid HMM can be provided in the YAML; OATK will then be tried for both types of organelles.

![Organelles subworkflow](images/v1/organelles.png) diff --git a/nextflow.config b/nextflow.config index 06f099e4..fa065707 100644 --- a/nextflow.config +++ b/nextflow.config @@ -219,7 +219,7 @@ manifest { description = """A bioinformatics best-practice analysis pipeline for genome assembly from PacBio CCS and HiC reads""" mainScript = 'main.nf' nextflowVersion = '!>=22.10.1' - version = '0.9.1dev' + version = '0.9.1' doi = '10.5281/zenodo.10391851' } diff --git a/subworkflows/local/organelles.nf b/subworkflows/local/organelles.nf index c23593a9..4b5eb96e 100644 --- a/subworkflows/local/organelles.nf +++ b/subworkflows/local/organelles.nf @@ -10,6 +10,7 @@ workflow ORGANELLES { reads_input // channel: [ val(meta), datafile ] contigs_input // channel: [ val(meta), datafile ] mito_info // channel: [ val(species), val(min_length), val(code), val(email), val(fam) ] + plastid_info // channel: [ val(fam) ] main: ch_versions = Channel.empty() @@ -54,11 +55,17 @@ workflow ORGANELLES { file(fam.toString()+'.h3i', checkIfExists: true), file(fam.toString()+'.h3m', checkIfExists: true), file(fam.toString()+'.h3p', checkIfExists: true) ]} - .set { hmm_input } + .set { mito_hmm_input } + plastid_info.map{ fam -> fam ? 
[ file(fam.toString(), checkIfExists: true), + file(fam.toString()+'.h3f', checkIfExists: true), + file(fam.toString()+'.h3i', checkIfExists: true), + file(fam.toString()+'.h3m', checkIfExists: true), + file(fam.toString()+'.h3p', checkIfExists: true) ] : [[],[],[],[],[]]} + .set { plastid_hmm_input } // // MODULE: RUN OATK TO IDENTIFY MITO // - OATK(reads_input, hmm_input, [[],[],[],[],[]]) + OATK(reads_input, mito_hmm_input, plastid_hmm_input) ch_versions = ch_versions.mix(OATK.out.versions.first()) emit: diff --git a/subworkflows/local/prepare_input.nf b/subworkflows/local/prepare_input.nf index 53e660d8..3b3848cc 100644 --- a/subworkflows/local/prepare_input.nf +++ b/subworkflows/local/prepare_input.nf @@ -27,6 +27,7 @@ workflow PREPARE_INPUT { dataset : (data.dataset ? data.dataset : []) busco : (data.busco ? data.busco : []) mito: ( data.mito ? ['\"'+data.mito.species+'\"', data.mito.min_length, data.mito.code, data.mito.email ? data.mito.email : "\"\"", data.mito.fam ? data.mito.fam : "\"\"" ] : []) + plastid : ( data.plastid ? ( data.plastid.fam ? data.plastid.fam : "\"\"" ) : []) hic_motif : (data.hic_motif ? data.hic_motif : []) } .set{ ch_yml_data } @@ -74,6 +75,7 @@ workflow PREPARE_INPUT { illumina_10X = dataset_ch.illumina_10X_ch busco = busco_ch mito = ch_yml_data.mito + plastid = ch_yml_data.plastid versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] } diff --git a/workflows/genomeassembly.nf b/workflows/genomeassembly.nf index 77936038..5d6d26c9 100644 --- a/workflows/genomeassembly.nf +++ b/workflows/genomeassembly.nf @@ -152,7 +152,9 @@ workflow GENOMEASSEMBLY { // // SUBWORKFLOW: INDETIFY MITO IN THE RAW READS AND ASSEMBLY CONTIGS // - ORGANELLES(CAT_CAT_MITOHIFI_READS.out.file_out, merged_pri_alt_raw, PREPARE_INPUT.out.mito) + ORGANELLES(CAT_CAT_MITOHIFI_READS.out.file_out, merged_pri_alt_raw, + PREPARE_INPUT.out.mito, PREPARE_INPUT.out.plastid) + ch_versions = ch_versions.mix(ORGANELLES.out.versions) } //