diff --git a/bin/compare_busco.py b/bin/compare_busco.py new file mode 100755 index 0000000..d49f983 --- /dev/null +++ b/bin/compare_busco.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 + +import argparse +import json +from typing import Tuple + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "json_files", + metavar="FILE", + type=str, + nargs="+", + help="Paths to one or more BUSCO JSON files" + ) + return parser.parse_args() + +def get_complete_buscos(json_file) -> Tuple[str, float]: + """ + Get the assembler name and complete BUSCO % from a json + """ + with open(json_file, 'r') as f: + j = json.load(f) + assembler = j['parameters']['out'] + # get assembler only from e.g. barcode01_consensus_busco + assembler = assembler.split('_')[1] + assert assembler in ['flye', 'unicycler', 'consensus'] + complete_busco = j['results']['Complete percentage'] + return assembler, complete_busco + +if __name__ == "__main__": + args = parse_args() + results = [get_complete_buscos(file) for file in args.json_files] + highest_busco = max(results, key=lambda x: x[1]) + print(highest_busco[0], end="") # don't print newline diff --git a/main.nf b/main.nf index 4e98c84..9ad7bed 100755 --- a/main.nf +++ b/main.nf @@ -6,6 +6,7 @@ nextflow.enable.dsl=2 // Import processes or subworkflows to be run in the workflow include { check_input } from './modules/check_input' include { check_samplesheet } from './modules/check_samplesheet' +include { concat_fastas } from './modules/concat_fa' include { concat_fastqs } from './modules/run_pigz' include { porechop } from './modules/run_porechop' include { pycoqc_summary } from './modules/run_pycoqc' @@ -26,22 +27,17 @@ include { select_assembly } from './modules/select_assembly' include { trycycler_msa } from './modules/run_trycycler_msa' include { trycycler_partition } from './modules/run_trycycler_partition' include { trycycler_consensus} from './modules/run_trycycler_consensus' +include { medaka_polish_denovo } from './modules/run_medaka_polish_denovo' include { medaka_polish_consensus } from './modules/run_medaka_polish_consensus' -include { medaka_polish_flye } from './modules/run_medaka_polish_flye' include { plassembler } from './modules/run_plassembler' include { bakta_annotation_plasmids } from './modules/run_bakta_annotation_plasmids' include { busco_annotation_plasmids } from './modules/run_busco_annotation_plasmids' include { quast_qc_chromosomes } from './modules/run_quast_qc_chromosomes' -include { quast_qc_flye_chromosomes } from './modules/run_quast_qc_flye_chromosomes' include { bakta_annotation_chromosomes } from './modules/run_bakta_annotation_chromosomes' -include { bakta_annotation_flye_chromosomes } from './modules/run_bakta_annotation_flye_chromosomes' -include { busco_annotation_chromosomes } from './modules/run_busco_annotation_chromosomes' -include { busco_annotation_flye_chromosomes } from './modules/run_busco_annotation_flye_chromosomes' +include { busco_qc_chromosomes } from './modules/run_busco_qc_chromosomes' include { abricateVFDB_annotation_chromosomes } from './modules/run_abricateVFDB_annotation_chromosomes' -include { abricateVFDB_annotation_flye_chromosomes } from './modules/run_abricateVFDB_annotation_flye_chromosomes' include { abricateVFDB_annotation_reference } from './modules/run_abricateVFDB_annotation_reference' include { amrfinderplus_annotation_chromosomes } from './modules/run_amrfinderplus_annotation_chromosomes' -include { amrfinderplus_annotation_flye_chromosomes } from 
'./modules/run_amrfinderplus_annotation_flye_chromosomes' include { create_samplesheet_for_processed } from './modules/create_samplesheet_for_processed' include { create_phylogeny_tree_related_files } from './modules/create_phylogeny_tree_related_files' include { run_orthofinder } from './modules/run_orthofinder' @@ -187,10 +183,8 @@ if ( params.help || (!params.input_directory && !params.samplesheet) || !params. // SCREEN FOR CONTAMINANTS kraken2(porechop.out.trimmed_fq, kraken2_db) - // ASSEMBLE GENOME WITH FLYE + // DE NOVO GENOME ASSEMBLIES flye_assembly(porechop.out.trimmed_fq) - - // ASSEMBLE GENOME WITH UNICYCLER unicycler_assembly(porechop.out.trimmed_fq) // DETECT PLASMIDS AND OTHER MOBILE ELEMENTS @@ -200,296 +194,290 @@ if ( params.help || (!params.input_directory && !params.samplesheet) || !params. plassembler(plassembler_in, get_plassembler.out.plassembler_db) - // CLUSTER CONTIGS WITH TRYCYCLER - combined_assemblies = unicycler_assembly.out.unicycler_assembly - .join(flye_assembly.out.flye_assembly, by:0) - .join(porechop.out.trimmed_fq, by:0) - .map { barcode, unicycler_assembly, flye_assembly, trimmed_fq -> - tuple(barcode, unicycler_assembly, flye_assembly, trimmed_fq)} - - trycycler_cluster(combined_assemblies) - - /* - * Building a tree requires >2 contigs. - * Use `contigs.phylip` to check the number of contigs. The number of lines - * in a `.phylip` indicates the number of contigs/tips. - */ - assemblies_with_trycycler_clusters = trycycler_cluster.out.trycycler_cluster - .map { barcode, assemblies, cluster_phylip -> - def phylip_lines = cluster_phylip.text.readLines().size() - 1 // Exclude header - return [barcode, assemblies, phylip_lines] + /* + * CONSENSUS ASSEMBLY PRE-PROCESSING + * + * Trycycler requires >= 3 contigs to run. Use the in-built .countFasta() + * operator to count the total number of contigs assembled for each barcode. + * + * If additional assemblers/read subsets/replicates are added, ensure it + * is added here. + */ + num_contigs_per_barcode = + unicycler_assembly.out.unicycler_assembly + .mix(flye_assembly.out.flye_assembly) + .map { barcode, assembly_dir -> + // Count num contigs per assembly + def fa = assembly_dir + "/assembly.fasta" + def ncontigs = fa.countFasta() + return [barcode, ncontigs] } - .branch { barcode, assemblies, phylip_lines -> - run_trycycler: phylip_lines >= 3 // Enough clusters/contigs - skip_trycycler: phylip_lines < 3 + .groupTuple() + .map { barcode, ncontigs -> + // Add total contigs across assemblies + def total_contigs = ncontigs[0].toInteger() + ncontigs[1].toInteger() + return [barcode, total_contigs] + } + + denovo_assemblies = + unicycler_assembly.out.unicycler_assembly + .join(flye_assembly.out.flye_assembly, by:0) + .join(porechop.out.trimmed_fq, by:0) + .join(num_contigs_per_barcode, by:0) + + denovo_assembly_contigs = + denovo_assemblies + .branch { barcode, unicycler, flye, trimmed_fq, num_contigs -> + run_trycycler: num_contigs >= 3 + skip_trycycler: num_contigs < 3 } trycycler_skipped_barcodes = // barcodes with insufficient contigs for trycycler - assemblies_with_trycycler_clusters.skip_trycycler - .map { barcode, assemblies, phylip_lines -> barcode } + // TODO: Remove this when select_assembly is revised. + // denovo_assemblies can be passed directly when ready. 
+ denovo_assembly_contigs.skip_trycycler + .map { + barcode, unicycler_assembly, flye_assembly, trimmed_fq, num_contigs -> barcode + } .collect() // RUN TRYCYCLER (CONSENSUS ASSEMBLY) IF SUFFICIENT # CONTIGS - // CLASSIFY CONTIGS WITH TRYCYCLER - classify_trycycler(assemblies_with_trycycler_clusters.run_trycycler) - - // RECONCILE CONTIGS WITH TRYCYCLER - contigs_to_reconcile = classify_trycycler.out.reconcile_contigs - .join(porechop.out.trimmed_fq, by: 0) - .flatMap { barcode, reconcile_contigs, trimmed_fq -> - if (reconcile_contigs instanceof List) { - reconcile_contigs.collect { cluster_dir -> - tuple(barcode, cluster_dir, trimmed_fq) - } - } else { - [tuple(barcode, reconcile_contigs, trimmed_fq)] - } - } - - trycycler_reconcile(contigs_to_reconcile) - - /* - * SELECT "BEST" ASSEMBLY (CONSENSUS (TRYCYCLER) OR FLYE) - * TODO: Revise reference-free approach - * TODO: Include unicycler assembly too - */ - trycycler_reconciled = - // Channel for successful trycycler assemblies - trycycler_reconcile.out.reconciled_seqs - .groupTuple(by:[0]) - - select_in = trycycler_reconcile.out.reconciled_seqs - .groupTuple(by:[0]) - .join(flye_assembly.out.flye_assembly, by:0) - .join(kraken2.out.kraken2_screen, by:0) - .map { barcode, reconciled, flye_assembly, k2_report -> - tuple(barcode, reconciled, flye_assembly, k2_report)} - - select_assembly(select_in, get_ncbi.out.ncbi_lookup) - - // IF CONSENSUS ASSEMBLY IS BEST... - - // TRYCYCLER MULTIPLE SEQUENCE ALIGNMENT - // CONSENSUS APPROACH : Check if file Consensus.txt exists - msa_in_consensus = select_assembly.out.consensus_good - .filter { it[1].exists() } // Ensure the correct path is checked for existence - .flatMap { tuple -> - def barcode = tuple[0] - def consensus_file = tuple[1] - def final_path = tuple[2] - if (final_path instanceof List) { - // If final_path is a list, return a stream of separate tuples - final_path.collect { path -> - [barcode, consensus_file, path] - } - } else { - // If final_path is not a list, return a single-element list with the original tuple - [[barcode, consensus_file, final_path]] - } - } - - trycycler_msa(msa_in_consensus) - trycycler_msa_out = trycycler_msa.out.three_msa - - // TRYCYCLER PARTITIONING READS - partition_in = select_assembly.out.consensus_good - .filter { it[1].exists() } // Ensure the correct path is checked for existence - .join(porechop.out.trimmed_fq, by: 0) - .map { barcode, consensus_file, final_path, trimmed_fq -> - tuple(barcode, consensus_file, final_path, trimmed_fq) - } - - trycycler_partition(partition_in) - - // TRYCYCLER CONSENSUS - partition_out_raw = trycycler_partition.out.four_reads - - partition_out = partition_out_raw.flatMap { barcode, path -> - def directory = new File(path.toString()) - def files = directory.listFiles().findAll { it.name.endsWith('_reconciled') } - files.collect { file -> [barcode, "${barcode}_${file.name}", file.path] } - } - -// TODO MAKE NAMING CONSISTENT WITH OTHER CHANNELS - consensus_in = trycycler_msa_out - .join(partition_out, by:1) - .map { row -> - [row[1], row[0], row[2], row[4]] - } - - trycycler_consensus(consensus_in) - - // MEDAKA POLISH CONSENSUS ASSEMBLY - // TODO MAKE NAMING CONSISTENT WITH OTHER CHANNELS - consensus_polish_in = partition_out - .join(trycycler_consensus.out.consensus_consensus, by:1) - .map { row ->[row[1], row[0], row[2], row[4]]} - - medaka_polish_consensus(consensus_polish_in) - - // ANNOTATE VARIOUS CONSENSUS-CHROMOSOME FEATURES - // TODO MAKE NAMING CONSISTENT WITH OTHER CHANNELS - polish_grouped_by_barcode = 
medaka_polish_consensus.out.consensus_polished - .groupTuple(by:[0]) - .map { row -> [row[0], row[2]]} - - // QUAST QC CONSENSUS-ASSEMBLY - quast_qc_chromosomes(polish_grouped_by_barcode) - - // BAKTA ANNOTATE GENE FEATURES - bakta_annotation_chromosomes(polish_grouped_by_barcode,get_bakta.out.bakta_db) - - // BUSCO ANNOTATE CONSENSUS-CHROMOSOME FEATURES - busco_annotation_chromosomes(bakta_annotation_chromosomes.out.bakta_annotations, get_busco.out.busco_db) - - // AMRFINDERPLUS ANNOTATE CONSENSUS-CHROMOSOME AMR-GENES - amrfinderplus_annotation_chromosomes(bakta_annotation_chromosomes.out.bakta_annotations, - get_amrfinderplus.out.amrfinderplus_db) - - // ABRICATE ANNOTATE CONSENSUS-CHROMOSOME WITH VFDB-GENES - abricateVFDB_annotation_chromosomes(polish_grouped_by_barcode) - - consensus_processed_samples=amrfinderplus_annotation_chromosomes.out.map { it[0] }.collect() + // TRYCYCLER: Cluster contigs + trycycler_cluster(denovo_assembly_contigs.run_trycycler) + + barcode_cluster_sizes = + // trycycler_cluster filters out small contigs (default < 5000nt) and can + // result in < 2 contigs here again. Use the same branching logic to avoid + // < 2 contig errors. + trycycler_cluster.out.phylip + .map { barcode, contigs_phylip -> + def phylip_lines = contigs_phylip.text.readLines().size - 1 // exclude header + return [barcode, phylip_lines] + } + .branch { barcode, phylip_lines -> + run_trycycler: phylip_lines >= 3 + skip_trycycler: phylip_lines < 3 + // create a channel from skip_trycycler if failed barcodes need to be reported/used + } - // IF FLYE ASSEMBLY IS BEST... + // TRYCYCLER: Classify clusters + clusters_to_classify = + // Add path to clusters with sufficient contigs + trycycler_cluster.out.clusters + .join(barcode_cluster_sizes.run_trycycler) - /* - * Channel that has the ids for all samples that either: - * 1. Has too few contigs for trycycler/consensus assembly - * 2. Enough contigs but flye > consensus assembly - * - * flye_better_barcode channel emits each sample id separately for joining - * with the paths to the flye assembly and trimmed fqs later - * - */ - flye_better_barcodes = select_assembly.out.consensus_discard - .map { barcode, consensus_file, filtered_flye_contigs -> barcode } - .mix(trycycler_skipped_barcodes) - .flatMap() + classify_trycycler(clusters_to_classify) - /* - * MEDAKA-POLISH FLYE ASSEMBLY - * - * The current select_assembly process discards any non-chromosomal - * contigs based on a reference-based NCBI lookup. - * - * The following channel passes in ALL contigs assembled by flye for - * polishing as select_assembly is revised. i.e. not just the putatively - * chromosmal ones. - * - */ - flye_polish_in = - flye_better_barcodes - .join(flye_assembly.out.flye_assembly, by: 0) + // TRYCYCLER: Reconcile contigs + clusters_to_reconcile_flat = + classify_trycycler.out.clusters_to_reconcile .join(porechop.out.trimmed_fq, by: 0) + // from [barcode, [pathA, pathB], ...] + // to [barcode, pathA, ...]; [barcode, pathB, ...] 
+ .transpose() + + trycycler_reconcile(clusters_to_reconcile_flat) + + reconciled_cluster_dirs = + trycycler_reconcile.out.reconciled_seqs // successfully reconciled + .map { barcode, seq -> + // drops the last part of the path (reconciled_seqs file) as trycycler + // searches the parent dir for it + Path cluster_dir = seq.getParent() + return [barcode, cluster_dir] + } + + // TRYCYCLER: Align + trycycler_msa(reconciled_cluster_dirs) + + // TRYCYCLER: Partitioning reads + clusters_to_partition = + trycycler_msa.out.results_dir + .groupTuple() + .join(porechop.out.trimmed_fq) + + trycycler_partition(clusters_to_partition) + + // TRYCYCLER: Generate consensus assemblies + clusters_for_consensus = + trycycler_partition.out.partitioned_reads + .transpose() // "ungroup" tuple + .map { barcode, reads -> + // Get directories of successfully partitioned reads + Path cluster_dir = reads.getParent() + return [barcode, cluster_dir] + } - medaka_polish_flye(flye_polish_in) - - // QUAST QC FLYE-ONLY ASSEMBLY - quast_qc_flye_chromosomes(medaka_polish_flye.out.flye_polished) - - // BAKTA ANNOTATE FLYE-ONLY CHROMOSOME FEATURES - bakta_annotation_flye_chromosomes(medaka_polish_flye.out.flye_polished,get_bakta.out.bakta_db) - - // BUSCO ANNOTATE FLYE-ONLY-CHROMOSOME FEATURES - busco_annotation_flye_chromosomes(bakta_annotation_flye_chromosomes.out.bakta_annotations, get_busco.out.busco_db) - - // AMRFINDERPLUS ANNOTATE FLYE-ONLY AMR-GENES - amrfinderplus_annotation_flye_chromosomes(bakta_annotation_flye_chromosomes.out.bakta_annotations, - get_amrfinderplus.out.amrfinderplus_db) + trycycler_consensus(clusters_for_consensus) - flye_only_processed_samples=amrfinderplus_annotation_flye_chromosomes.out - .map { it[0] }.collect() + // MEDAKA: Polish consensus assembly + consensus_dir = + // Get parent dir for assembly + trycycler_consensus.out.cluster_assembly + .map { barcode, assembly -> + Path assembly_dir = assembly.getParent() + return [barcode, assembly_dir] + } - // ABRICATE ANNOTATE FLYE-CHROMOSOME WITH VFDB-GENES - abricateVFDB_annotation_flye_chromosomes(medaka_polish_flye.out.flye_polished) + medaka_polish_consensus(consensus_dir) + + // CAT: Combine polished clusters consensus assemblies into a single fasta + polished_clusters = + medaka_polish_consensus.out.cluster_assembly + .groupTuple() + .map { barcode, fastas -> + // Need to avoid filename collisions with consensus.fasta files + // by renaming them + def indexed_fas = + fastas.withIndex() + .collect { fa, idx -> + def new_fa = fa.getParent() / "consensus_${idx}.fasta" + fa.copyTo(new_fa) + return new_fa + } + return [barcode, indexed_fas] + } - kraken_input_to_create_phylogeny_tree = kraken2.out - .map { [it[1]] } - .collect() - - consensus_bakta=bakta_annotation_chromosomes.out.bakta_annotations - .map { [it[1]] }.collect() + concat_fastas(polished_clusters) + + polished_consensus_per_barcode = concat_fastas.out + + // MEDAKA: POLISH DE NOVO ASSEMBLIES + unpolished_denovo_assemblies = + unicycler_assembly.out.unicycler_assembly + .mix(flye_assembly.out.flye_assembly) + .map { barcode, assembly_dir -> + // Get the assembler name by parsing the publishDir path. + // A better way to do this is output i.e. 
val(assembler_name) in the + // assembly processes but will require more changes + String assembler_name = assembly_dir.toString().tokenize("_")[-2] + return [barcode, assembler_name, assembly_dir] + } + .combine(porechop.out.trimmed_fq, by: 0) + + medaka_polish_denovo(unpolished_denovo_assemblies) + + // ASSEMBLY QC + all_polished = + polished_consensus_per_barcode + .map { barcode, consensus_fa -> + // technically should be "trycycler" but want to separate it out from + // the denovo assemblies clearly + String assembler = "consensus" + return [barcode, assembler, consensus_fa] + } + .mix(medaka_polish_denovo.out.assembly) - flye_only_bakta=bakta_annotation_flye_chromosomes.out.bakta_annotations - .map { [it[1]] }.collect() + // TODO: probably better to collect all per-barcode assemblies in one quast + // run to avoid a parsing/merging step + quast_qc_chromosomes(all_polished) + busco_qc_chromosomes(all_polished, get_busco.out.busco_db) + + // SELECT "BEST" ASSEMBLY + // TODO: Discuss better approaches. Currently selects the best assembly per + // barcode by most Complete BUSCO % + barcode_busco_jsons = + // Gather all jsons for each barcode + busco_qc_chromosomes.out.json_summary + .groupTuple() + + select_assembly(barcode_busco_jsons) + + best_chr_assembly = + select_assembly.out + .map { barcode, txt -> + // best assembly stored in txt file for pipeline caching + String best = txt.splitText()[0].strip() + return [barcode, best] + } + .join(all_polished, by: [0, 1]) + + // ANNOTATE THE BEST CHROMOSOMAL ASSEMBLY PER-BARCODE + // BAKTA: Annotate gene features + bakta_annotation_chromosomes(best_chr_assembly, get_bakta.out.bakta_db) + + // ABRICATE: Annotate VFDB genes + abricateVFDB_annotation_chromosomes(best_chr_assembly) + + // AMRFINDERPLUS: Annotate AMR genes + amrfinderplus_annotation_chromosomes( + bakta_annotation_chromosomes.out.faa, + get_amrfinderplus.out.amrfinderplus_db + ) + + // CREATE FILES FOR PHYLOGENETIC TREE BUILDING + // Collect all the output (annotation) reports required for tree building + kraken2_reports = + // Collect all k2 reports and drop barcodes + kraken2.out + .map { barcode, k2_report -> k2_report } + .collect() -// CREATE FILES FOR PHYLOGENETIC TREE BUILDING -// Check if flye_only_bakta is empty, and use only consensus_bakta if it is all_bakta_input_to_create_phylogeny_tree = flye_only_bakta - .ifEmpty([]) // If flye_only_bakta is empty, provide an empty list - .merge(consensus_bakta) // Merge with consensus_bakta //.view() + bakta_results_dirs = + // TODO: Currently takes dir as input, amend the phylo building .py file + // to take direct inputs + bakta_annotation_chromosomes.out.txt + .map { barcode, assembler, txt -> + Path dir = txt.getParent() + return dir + } + .collect() -create_phylogeny_tree_related_files( + create_phylogeny_tree_related_files( get_ncbi.out.assembly_summary_refseq, - kraken_input_to_create_phylogeny_tree, - all_bakta_input_to_create_phylogeny_tree -) - - // ORTHOFINDER PHYLOGENETIC ORTHOLOGY INFERENCE - run_orthofinder(create_phylogeny_tree_related_files.out.phylogeny_folder) - - - // AMRIFINDER ANNOTATE REFERENCE STRAINS FOR AMR GENES - amrfinderplus_annotation_reference(create_phylogeny_tree_related_files.out.phylogeny_folder, - get_amrfinderplus.out.amrfinderplus_db) - - // ABRICATE ANNOTATE REFERENCE STRAINS FOR AMR GENES - abricateVFDB_annotation_reference(create_phylogeny_tree_related_files.out.phylogeny_folder) - - // GENERATE AMRFINDERPLUS gene matrix (FOR PHYLOGENY-AMR HEATMAP) -
consensus_amrfinderplus_output=amrfinderplus_annotation_chromosomes.out - .map { it[1] } - .collect() - - flye_only_amrfinderplus_output=amrfinderplus_annotation_flye_chromosomes.out - .map { it[1] } - .collect() - -// Check if flye_only_amrfinderplus_output is empty, and use only consensus_amrfinderplus_output if it is - all_samples_amrfinderplus_output = flye_only_amrfinderplus_output - .ifEmpty([]) // If flye_only_amrfinderplus_output is empty, provide an empty list - .merge(consensus_amrfinderplus_output) // Merge with consensus_amrfinderplus_output + kraken2_reports, + bakta_results_dirs + ) + + // ORTHOFINDER: Infer phylogeny using orthologous genes + phylogeny_in = create_phylogeny_tree_related_files.out.phylogeny_folder + run_orthofinder(phylogeny_in) + + // ANNOTATE REFERENCE STRAINS + // AMRFINDERPLUS: Annotate AMR genes + amrfinderplus_annotation_reference( + phylogeny_in, + get_amrfinderplus.out.amrfinderplus_db + ) + + // ABRICATE: Annotate VFDB genes + abricateVFDB_annotation_reference(phylogeny_in) + + // PYTHON: Generate gene matrix for phylogeny-AMR heatmap (AMRFINDERPLUS) + amrfinderplus_chr_reports = + amrfinderplus_annotation_chromosomes.out.report + .map { barcode, assembler, report -> report } + .collect() - all_references_amrfinderplus_output = amrfinderplus_annotation_reference.out.amrfinderplus_annotations - barcode_species_table = create_phylogeny_tree_related_files.out.barcode_species_table - generate_amrfinderplus_gene_matrix(all_samples_amrfinderplus_output, - all_references_amrfinderplus_output, - barcode_species_table) - - // GENERATE ABRICATE-VFDB gene matrix (FOR PHYLOGENY-AMR HEATMAP) - consensus_abricate_output=abricateVFDB_annotation_chromosomes.out - .map { it[1] } - .collect() - - flye_only_abricate_output=abricateVFDB_annotation_flye_chromosomes.out - .map { it[1] } - .collect() - -// all_samples_abricate_output = consensus_abricate_output -// .merge(flye_only_abricate_output) + generate_amrfinderplus_gene_matrix( + amrfinderplus_chr_reports, + amrfinderplus_annotation_reference.out.amrfinderplus_annotations, + barcode_species_table + ) -// Check if flye_only_abricate_output is empty, and use only consensus_abricate_output if it is -all_samples_abricate_output = flye_only_abricate_output - .ifEmpty([]) // If flye_only_abricate_output is empty, provide an empty list - .merge(consensus_abricate_output) // Merge with consensus_abricate_output - - all_references_abricate_output = abricateVFDB_annotation_reference.out.abricate_annotations + // PYTHON: Generate gene matrix for phylogeny-AMR heatmap (ABRICATE) + abricate_chr_reports = + abricateVFDB_annotation_chromosomes.out.report + .map { barcode, report -> report } + .collect() - generate_abricate_gene_matrix(all_samples_abricate_output, - all_references_abricate_output, - barcode_species_table) + generate_abricate_gene_matrix( + abricate_chr_reports, + abricateVFDB_annotation_reference.out.abricate_annotations, + barcode_species_table + ) - // CREATE PHYLOGENETIC TREE + HEATMAP IMAGE - create_phylogeny_And_Heatmap_image(run_orthofinder.out.phylogeny_tree, - generate_amrfinderplus_gene_matrix.out.amrfinderplus_gene_matrix, - generate_abricate_gene_matrix.out.abricate_gene_matrix) + // R: Plot phylogeny and heatmap + create_phylogeny_And_Heatmap_image( + run_orthofinder.out.phylogeny_tree, + generate_amrfinderplus_gene_matrix.out.amrfinderplus_gene_matrix, + generate_abricate_gene_matrix.out.abricate_gene_matrix + ) - // ANNOTATE PLASMID FEATURES (BATKA) + // BAKTA: Annotate plasmid gene features 
bakta_annotation_plasmids(plassembler.out.plassembler_fasta, get_bakta.out.bakta_db) // SUMMARISE RUN WITH MULTIQC REPORT @@ -497,53 +485,41 @@ all_samples_abricate_output = flye_only_abricate_output nanoplot_required_for_multiqc = nanoplot_summary.out.nanoplot_summary.ifEmpty([]) pycoqc_required_for_multiqc = pycoqc_summary.out.pycoqc_json.ifEmpty([]) -kraken2_required_for_multiqc = kraken2.out.kraken2_screen + kraken2_required_for_multiqc = + kraken2.out.kraken2_screen .map { it[1] } .collect() .ifEmpty([]) -quast_required_for_multiqc = quast_qc_chromosomes.out.quast_qc_multiqc - .map { it[1] } + quast_required_for_multiqc = + quast_qc_chromosomes.out.tsv + .map { barcode, assembler, tsv -> tsv } .collect() - .merge( - quast_qc_flye_chromosomes.out.quast_qc_multiqc - .map { it[1] } - .collect() - .ifEmpty([]) - ) - -bakta_required_for_multiqc = bakta_annotation_chromosomes.out.bakta_annotations_multiqc - .map { it[1] } + + bakta_required_for_multiqc = + bakta_annotation_chromosomes.out.txt + .map { barcode, assembler, txt -> txt } .collect() - .merge( - bakta_annotation_flye_chromosomes.out.bakta_annotations_multiqc - .map { it[1] } - .collect() - .ifEmpty([]) - ) - -busco_required_for_multiqc = busco_annotation_chromosomes.out.busco_annotations - .map { it[1] } + + busco_required_for_multiqc = + busco_qc_chromosomes.out.txt_summary + .map { barcode, assembler, txt -> txt } .collect() - .merge( - busco_annotation_flye_chromosomes.out.busco_annotations - .map { it[1] } - .collect() - .ifEmpty([]) - ) - -bakta_plasmids_required_for_multiqc = bakta_annotation_plasmids.out.bakta_annotations + + bakta_plasmids_required_for_multiqc = + bakta_annotation_plasmids.out.bakta_annotations .map { it[1] } .collect() .ifEmpty([]) -phylogeny_heatmap_plot_required_for_multiqc = create_phylogeny_And_Heatmap_image.out.combined_plot_mqc + phylogeny_heatmap_plot_required_for_multiqc = + create_phylogeny_And_Heatmap_image.out.combined_plot_mqc .ifEmpty([]) -multiqc_config = params.multiqc_config + multiqc_config = params.multiqc_config -// Run MultiQC with the gathered inputs -multiqc_report( + // Run MultiQC with the gathered inputs + multiqc_report( pycoqc_required_for_multiqc, nanoplot_required_for_multiqc, multiqc_config, @@ -553,7 +529,7 @@ multiqc_report( bakta_plasmids_required_for_multiqc, busco_required_for_multiqc, phylogeny_heatmap_plot_required_for_multiqc -) + ) } // Print workflow execution summary diff --git a/modules/concat_fa.nf b/modules/concat_fa.nf new file mode 100755 index 0000000..c98a9d0 --- /dev/null +++ b/modules/concat_fa.nf @@ -0,0 +1,14 @@ +process concat_fastas { + tag "${barcode}" + + input: + tuple val(barcode), path(fasta_files) + + output: + tuple val(barcode), path("${barcode}_combined.fasta") + + script: + """ + cat $fasta_files > ${barcode}_combined.fasta + """ +} diff --git a/modules/create_phylogeny_And_Heatmap_image.nf b/modules/create_phylogeny_And_Heatmap_image.nf index 611c4e5..6918615 100644 --- a/modules/create_phylogeny_And_Heatmap_image.nf +++ b/modules/create_phylogeny_And_Heatmap_image.nf @@ -1,7 +1,7 @@ process create_phylogeny_And_Heatmap_image { tag "PLOTTING PHYLOGENETIC TREE WITH AMR AND VIRULENCE GENES" container 'oras://community.wave.seqera.io/library/bioconductor-ggtree_r-ape_r-phytools_r-tidyverse:bd310f2405bed388' - publishDir "${params.outdir}/taxonomy", mode: 'symlink' + publishDir "${params.outdir}/taxonomy", mode: 'copy' input: path(phylogeny_tree_base_path) diff --git a/modules/create_phylogeny_tree_related_files.nf 
b/modules/create_phylogeny_tree_related_files.nf index 632c0ad..e2b601c 100755 --- a/modules/create_phylogeny_tree_related_files.nf +++ b/modules/create_phylogeny_tree_related_files.nf @@ -1,16 +1,15 @@ process create_phylogeny_tree_related_files { - tag "PREPARING INPUTS FOR PHYLOGENETIC TREE" container 'python:3.8' - publishDir "${params.outdir}/taxonomy", mode: 'symlink' + publishDir "${params.outdir}/taxonomy", mode: 'copy' input: path(assembly_summary_refseq) path(kraken2_reports) - path(bakta_results) + path(bakta_results) output: - path("phylogeny"), emit: phylogeny_folder - path("barcode_species_table_mqc.txt"), emit: barcode_species_table + path("phylogeny"), emit: phylogeny_folder + path("barcode_species_table_mqc.txt"), emit: barcode_species_table script: """ diff --git a/modules/generate_abricate_gene_matrix.nf b/modules/generate_abricate_gene_matrix.nf index c328dd7..9b57a82 100755 --- a/modules/generate_abricate_gene_matrix.nf +++ b/modules/generate_abricate_gene_matrix.nf @@ -1,7 +1,7 @@ process generate_abricate_gene_matrix { tag "GENERATING ABRICATE GENE MATRIX FOR ALL SAMPLES: ${params.input}" container 'python:3.8' - publishDir "${params.outdir}/taxonomy", mode: 'symlink' + publishDir "${params.outdir}/taxonomy", mode: 'copy' input: path(abricate_output_all_samples) diff --git a/modules/generate_amrfinderplus_gene_matrix.nf b/modules/generate_amrfinderplus_gene_matrix.nf index 0c98cf1..e9f2d76 100755 --- a/modules/generate_amrfinderplus_gene_matrix.nf +++ b/modules/generate_amrfinderplus_gene_matrix.nf @@ -1,7 +1,7 @@ process generate_amrfinderplus_gene_matrix { tag "GENERATING AMRFINDERPLUS GENE MATRIX FOR ALL SAMPLES: ${params.input}" container 'python:3.8' - publishDir "${params.outdir}/taxonomy", mode: 'symlink' + publishDir "${params.outdir}/taxonomy", mode: 'copy' input: path(amrfinderplus_output_all_samples) diff --git a/modules/run_abricateVFDB_annotation_chromosomes.nf b/modules/run_abricateVFDB_annotation_chromosomes.nf index 59f225e..7ed04b9 100644 --- a/modules/run_abricateVFDB_annotation_chromosomes.nf +++ b/modules/run_abricateVFDB_annotation_chromosomes.nf @@ -1,26 +1,23 @@ process abricateVFDB_annotation_chromosomes { - tag "ANNOTATING CONSENSUS ASSEMBLY WITH ABRICATE VFDB: ${barcode}" + tag "VFDB GENES: ${barcode}: ${assembler}" container 'quay.io/biocontainers/abricate:1.0.1--ha8f3691_2' - publishDir "${params.outdir}/annotations/${barcode}", mode: 'symlink' + publishDir "${params.outdir}/annotations/${barcode}/abricate", mode: 'copy' -input: - tuple val(barcode), path(medaka_consensus) + input: + tuple val(barcode), val(assembler), path(polished_fasta) -output: - tuple val(barcode), path("abricate/*"), emit: abricate_annotations, optional: true + output: + tuple val(barcode), path("*.txt"), emit: report -script: + script: + prefix = "${barcode}_${assembler}_chr" + db_name = 'vfdb' """ - for dir in ${medaka_consensus}; do - cat \${dir}/consensus.fasta >> concatenated_consensus.fasta - done - - db_name='vfdb' - mkdir -p abricate - - abricate concatenated_consensus.fasta \\ - -db \${db_name} > abricate/${barcode}.txt - + abricate \\ + $polished_fasta \\ + -db ${db_name} \\ + --threads $task.cpus \\ + > ${prefix}.txt """ } diff --git a/modules/run_abricateVFDB_annotation_flye_chromosomes.nf b/modules/run_abricateVFDB_annotation_flye_chromosomes.nf deleted file mode 100644 index e1ddc4d..0000000 --- a/modules/run_abricateVFDB_annotation_flye_chromosomes.nf +++ /dev/null @@ -1,21 +0,0 @@ -process abricateVFDB_annotation_flye_chromosomes { - tag "ANNOTATING 
FLYE ASSEMBLY WITH ABRICATE VFDB: ${barcode}" - container 'quay.io/biocontainers/abricate:1.0.1--ha8f3691_2' - publishDir "${params.outdir}/annotations/${barcode}", mode: 'symlink' - -input: - tuple val(barcode), path(medaka_flye) - -output: - tuple val(barcode), path("abricate/*"), emit: abricate_annotations, optional: true - -script: - """ - db_name='vfdb' - mkdir -p abricate - - abricate ${medaka_flye}/consensus.fasta \\ - -db \${db_name} > abricate/${barcode}.txt - """ - -} diff --git a/modules/run_amrfinderplus_annotation_chromosomes.nf b/modules/run_amrfinderplus_annotation_chromosomes.nf index 06e62d6..9c96517 100644 --- a/modules/run_amrfinderplus_annotation_chromosomes.nf +++ b/modules/run_amrfinderplus_annotation_chromosomes.nf @@ -1,22 +1,22 @@ process amrfinderplus_annotation_chromosomes { - tag "DETECTING AMR GENES: ${barcode}" + tag "${barcode}: ${assembler}" container 'quay.io/biocontainers/ncbi-amrfinderplus:3.12.8--h283d18e_0' - publishDir "${params.outdir}/annotations/${barcode}", mode: 'symlink' + publishDir "${params.outdir}/annotations/${barcode}/amrfinderplus", mode: 'copy' -input: - tuple val(barcode), path(bakta_annotations) + input: + tuple val(barcode), val(assembler), path(annotated_faa) path(amrfinderplus_db) -output: - tuple val(barcode), path("amrfinderplus/*"), emit: amrfinderplus_annotations + output: + tuple val(barcode), val(assembler), path("${prefix}.tsv"), emit: report -script: + script: + prefix = "${barcode}_${assembler}_chr" """ - mkdir -p amrfinderplus - - amrfinder -p ${barcode}_bakta/${barcode}_chr.faa \\ - -d ${amrfinderplus_db}/latest > amrfinderplus/${barcode}.txt - + amrfinder \\ + -p $annotated_faa \\ + -d ${amrfinderplus_db}/latest \\ + --threads $task.cpus > ${prefix}.tsv """ } diff --git a/modules/run_amrfinderplus_annotation_flye_chromosomes.nf b/modules/run_amrfinderplus_annotation_flye_chromosomes.nf deleted file mode 100644 index 04bb4b9..0000000 --- a/modules/run_amrfinderplus_annotation_flye_chromosomes.nf +++ /dev/null @@ -1,22 +0,0 @@ -process amrfinderplus_annotation_flye_chromosomes { - tag "DETECTING AMR GENES: ${barcode}" - container 'quay.io/biocontainers/ncbi-amrfinderplus:3.12.8--h283d18e_0' - publishDir "${params.outdir}/annotations/${barcode}", mode: 'symlink' - -input: - tuple val(barcode), path(bakta_annotations) - path(amrfinderplus_db) - -output: - tuple val(barcode), path("amrfinderplus/*"), emit: amrfinderplus_annotations, optional: true - -script: - """ - mkdir -p amrfinderplus - - amrfinder -p ${barcode}_bakta/${barcode}_chr.faa \\ - -d ${amrfinderplus_db}/latest > amrfinderplus/${barcode}.txt - - """ - -} diff --git a/modules/run_bakta_annotation_chromosomes.nf b/modules/run_bakta_annotation_chromosomes.nf index f89d9ff..1b7a077 100644 --- a/modules/run_bakta_annotation_chromosomes.nf +++ b/modules/run_bakta_annotation_chromosomes.nf @@ -1,29 +1,24 @@ process bakta_annotation_chromosomes { - tag "ANNOTATING CONSENSUS ASSEMBLY WITH BATKA: ${barcode}" + tag "GENE FEATURES: ${barcode}: ${assembler}" container 'quay.io/biocontainers/bakta:1.9.2--pyhdfd78af_0' - publishDir "${params.outdir}/annotations/${barcode}", mode: 'symlink' + publishDir "${params.outdir}/annotations/${barcode}/bakta", mode: 'copy' -input: - tuple val(barcode), path(medaka_consensus) + input: + tuple val(barcode), val(assembler), path(polished_fasta) path(bakta_db) -output: - tuple val(barcode), path("${barcode}_bakta"), emit: bakta_annotations - tuple val(barcode), path("${barcode}_bakta/${barcode}_chr.txt"), emit: bakta_annotations_multiqc + 
output: + tuple val(barcode), val(assembler), path("${prefix}.faa"), emit: faa + tuple val(barcode), val(assembler), path("${prefix}.txt"), emit: txt -script: + script: + prefix = "${barcode}_${assembler}_chr" """ - for dir in ${medaka_consensus}; do - cat \${dir}/consensus.fasta >> concatenated_consensus.fasta - done - bakta \\ - concatenated_consensus.fasta \\ - --db ${bakta_db} \\ - --output ${barcode}_bakta \\ - --prefix ${barcode}_chr \\ + $polished_fasta \\ + --db $bakta_db \\ + --prefix $prefix \\ --force \\ --threads ${task.cpus} - """ } diff --git a/modules/run_bakta_annotation_flye_chromosomes.nf b/modules/run_bakta_annotation_flye_chromosomes.nf deleted file mode 100644 index a760063..0000000 --- a/modules/run_bakta_annotation_flye_chromosomes.nf +++ /dev/null @@ -1,25 +0,0 @@ -process bakta_annotation_flye_chromosomes { - tag "ANNOTATING CONSENSUS ASSEMBLY WITH BATKA: ${barcode}" - container 'quay.io/biocontainers/bakta:1.9.2--pyhdfd78af_0' - publishDir "${params.outdir}/annotations/${barcode}", mode: 'symlink' - -input: - tuple val(barcode), path(medaka_flye) - path(bakta_db) - -output: - tuple val(barcode), path("${barcode}_bakta"), emit: bakta_annotations - tuple val(barcode), path("${barcode}_bakta/${barcode}_chr.txt"), emit: bakta_annotations_multiqc - -script: - """ - bakta \\ - ${medaka_flye}/consensus.fasta \\ - --db ${bakta_db} \\ - --output ${barcode}_bakta \\ - --prefix ${barcode}_chr \\ - --force \\ - --threads ${task.cpus} - """ - -} diff --git a/modules/run_bakta_annotation_plasmids.nf b/modules/run_bakta_annotation_plasmids.nf index dc6b283..6fcafb5 100644 --- a/modules/run_bakta_annotation_plasmids.nf +++ b/modules/run_bakta_annotation_plasmids.nf @@ -1,7 +1,7 @@ process bakta_annotation_plasmids { tag "ANNOTATING PLASMIDS: ${barcode}" container 'quay.io/biocontainers/bakta:1.9.2--pyhdfd78af_0' - publishDir "${params.outdir}/annotations/${barcode}/plasmids", mode: 'symlink' + publishDir "${params.outdir}/annotations/${barcode}/plasmids", mode: 'copy' input: tuple val(barcode), path(plassembler_plasmids) diff --git a/modules/run_busco_annotation_chromosomes.nf b/modules/run_busco_annotation_chromosomes.nf deleted file mode 100644 index aa07300..0000000 --- a/modules/run_busco_annotation_chromosomes.nf +++ /dev/null @@ -1,21 +0,0 @@ -process busco_annotation_chromosomes { - tag "EVALUATING GENOME COMPLETENESS: ${barcode}" - container 'quay.io/biocontainers/busco:5.6.1--pyhdfd78af_0' - publishDir "${params.outdir}/annotations/${barcode}", mode: 'symlink' - -input: - tuple val(barcode), path(bakta_annotations) - path(busco_db) - -output: - tuple val(barcode), path("${barcode}_busco/short_summary.specific.*.txt"), emit: busco_annotations - -script: - """ - busco \\ - -f -i ${bakta_annotations}/${barcode}_chr.faa \\ - -m proteins \\ - --lineage_dataset busco_downloads/lineages/bacteria_odb10 \\ - --out ${barcode}_busco --offline - """ -} diff --git a/modules/run_busco_annotation_flye_chromosomes.nf b/modules/run_busco_annotation_flye_chromosomes.nf deleted file mode 100644 index 316378b..0000000 --- a/modules/run_busco_annotation_flye_chromosomes.nf +++ /dev/null @@ -1,21 +0,0 @@ -process busco_annotation_flye_chromosomes { - tag "EVALUATING GENOME COMPLETENESS: ${barcode}" - container 'quay.io/biocontainers/busco:5.6.1--pyhdfd78af_0' - publishDir "${params.outdir}/annotations/${barcode}", mode: 'symlink' - -input: - tuple val(barcode), path(bakta_annotations) - path(busco_db) - -output: - tuple val(barcode), path("${barcode}_busco/short_summary.specific.*.txt"), 
emit: busco_annotations - -script: - """ - busco \\ - -f -i ${bakta_annotations}/${barcode}_chr.faa \\ - -m proteins --lineage_dataset busco_downloads/lineages/bacteria_odb10 \\ - --out ${barcode}_busco --offline - """ - -} diff --git a/modules/run_busco_annotation_plasmids.nf b/modules/run_busco_annotation_plasmids.nf index 926a04a..b215850 100644 --- a/modules/run_busco_annotation_plasmids.nf +++ b/modules/run_busco_annotation_plasmids.nf @@ -1,7 +1,7 @@ process busco_annotation_plasmids { tag "EVALUATING PLASMID COMPLETENESS: ${barcode}" container 'quay.io/biocontainers/busco:5.6.1--pyhdfd78af_0' - publishDir "${params.outdir}/assemblies/${barcode}/plasmids", mode: 'symlink' + publishDir "${params.outdir}/assemblies/${barcode}/plasmids", mode: 'copy' input: tuple val(barcode), path(bakta_annotations) diff --git a/modules/run_busco_qc_chromosomes.nf b/modules/run_busco_qc_chromosomes.nf new file mode 100644 index 0000000..d3c5703 --- /dev/null +++ b/modules/run_busco_qc_chromosomes.nf @@ -0,0 +1,27 @@ +process busco_qc_chromosomes { + tag "EVALUATING GENOME COMPLETENESS: ${barcode}: ${assembler}" + container 'quay.io/biocontainers/busco:5.6.1--pyhdfd78af_0' + publishDir "${params.outdir}/quality_control/${barcode}", mode: 'copy' + + input: + tuple val(barcode), val(assembler), path(polished_assembly) + path(busco_db) + + output: + tuple val(barcode), val(assembler), path("${prefix}"), emit: results + tuple val(barcode), val(assembler), path("${prefix}/short_summary.*.txt"), emit: txt_summary + tuple val(barcode), path("${prefix}/short_summary.*.json"), emit: json_summary + + script: + prefix = "${barcode}_${assembler}_busco" + """ + busco \\ + --cpu $task.cpus \\ + --in $polished_assembly \\ + --out ${prefix} \\ + --mode genome \\ + --lineage_dataset busco_downloads/lineages/bacteria_odb10 \\ + --force \\ + --offline + """ +} diff --git a/modules/run_kraken2.nf b/modules/run_kraken2.nf index 48bcf11..a805de7 100755 --- a/modules/run_kraken2.nf +++ b/modules/run_kraken2.nf @@ -1,7 +1,7 @@ process kraken2 { tag "DETECTING POSSIBLE CONTAMINATION: ${barcode}" container 'quay.io/biocontainers/kraken2:2.1.3--pl5321hdcf5f25_0' - publishDir "${params.outdir}/quality_control/${barcode}_kraken2", mode: 'symlink' + publishDir "${params.outdir}/quality_control/${barcode}_kraken2", mode: 'copy' input: tuple val(barcode), path(trimmed_fq) diff --git a/modules/run_medaka_polish_consensus.nf b/modules/run_medaka_polish_consensus.nf index 2ec72d5..99dbb16 100644 --- a/modules/run_medaka_polish_consensus.nf +++ b/modules/run_medaka_polish_consensus.nf @@ -1,20 +1,20 @@ process medaka_polish_consensus { - tag "POLISHING CONSENSUS ASSEMBLY: ${barcode}" + tag "${barcode}: ${cluster_dir}" container 'quay.io/biocontainers/medaka:1.11.3--py39h05d5c5e_0' - publishDir "${params.outdir}/assemblies/${barcode}_consensus", mode: 'symlink' + publishDir "${params.outdir}/assemblies/${barcode}_consensus", mode: 'copy' input: - tuple val(barcode), val(cluster_id), path(consensus_partition), path(consensus_consensus) + tuple val(barcode), path(cluster_dir) output: - tuple val(barcode), val(cluster_id), path("${cluster_id}_polished"), emit: consensus_polished, optional: true + tuple val(barcode), path("consensus.fasta"), emit: cluster_assembly script: """ medaka_consensus \\ - -i ${consensus_partition}/4_reads.fastq \\ - -d ${consensus_consensus}/7_final_consensus.fasta \\ - -o ${cluster_id}_polished \\ - -t ${task.cpus} + -i ${cluster_dir}/4_reads.fastq \\ + -d ${cluster_dir}/7_final_consensus.fasta \\ + -o ./ \\ + -t 
${task.cpus} """ } diff --git a/modules/run_medaka_polish_denovo.nf b/modules/run_medaka_polish_denovo.nf new file mode 100644 index 0000000..aa4720b --- /dev/null +++ b/modules/run_medaka_polish_denovo.nf @@ -0,0 +1,21 @@ +process medaka_polish_denovo { + tag "${assembler_name}: ${barcode}" + container 'quay.io/biocontainers/medaka:1.11.3--py39h05d5c5e_0' + publishDir "${params.outdir}/assemblies/${barcode}_${assembler_name}", mode: 'copy' + + input: + tuple val(barcode), val(assembler_name), path(assembly), path(trimmed_fq) + + output: + tuple val(barcode), val(assembler_name), path("consensus.fasta"), emit: assembly + + script: + """ + # ideally the assembly file is passed in directly + medaka_consensus \\ + -i ${trimmed_fq} \\ + -d ${assembly}/assembly.fasta \\ + -o ./ \\ + -t ${task.cpus} + """ +} diff --git a/modules/run_medaka_polish_flye.nf b/modules/run_medaka_polish_flye.nf deleted file mode 100644 index ff20401..0000000 --- a/modules/run_medaka_polish_flye.nf +++ /dev/null @@ -1,20 +0,0 @@ -process medaka_polish_flye { - tag "POLISHING FLYE ASSEMBLY: ${barcode}" - container 'quay.io/biocontainers/medaka:1.11.3--py39h05d5c5e_0' - publishDir "${params.outdir}/assemblies/${barcode}_flye", mode: 'symlink' - - input: - tuple val(barcode), path(flye_assembly), path(trimmed_fq) - - output: - tuple val(barcode), path("*"), emit: flye_polished, optional: true - - script: - """ - medaka_consensus \\ - -i ${trimmed_fq} \\ - -d ${flye_assembly}/assembly.fasta \\ - -o ${barcode}_polished \\ - -t ${task.cpus} - """ -} diff --git a/modules/run_multiqc.nf b/modules/run_multiqc.nf index 0c27daf..7758018 100644 --- a/modules/run_multiqc.nf +++ b/modules/run_multiqc.nf @@ -1,7 +1,7 @@ process multiqc_report { tag "GENERATING SUMMARY REPORT" container 'quay.io/biocontainers/multiqc:1.21--pyhdfd78af_0' - publishDir "${params.outdir}/report", mode: 'symlink' + publishDir "${params.outdir}/report", mode: 'copy' input: path(pycoqc) diff --git a/modules/run_nanoplot.nf b/modules/run_nanoplot.nf index 8099498..ef2f179 100755 --- a/modules/run_nanoplot.nf +++ b/modules/run_nanoplot.nf @@ -1,7 +1,7 @@ process nanoplot_summary { tag "SUMMARISING RAW OUTPUT FROM ONT RUN: ${sequencing_summary.fileName}" container 'quay.io/biocontainers/nanoplot:1.42.0--pyhdfd78af_0' - publishDir "${params.outdir}/quality_control", mode: 'symlink' + publishDir "${params.outdir}/quality_control", mode: 'copy' input: path(sequencing_summary) diff --git a/modules/run_orthofinder.nf b/modules/run_orthofinder.nf index a412368..3c4d9ce 100644 --- a/modules/run_orthofinder.nf +++ b/modules/run_orthofinder.nf @@ -1,5 +1,5 @@ process run_orthofinder{ - tag "RUN ORTHOFINDER to generate Phylogeny tree" + tag "GENERATE PHYLOGENY" container 'quay.io/biocontainers/orthofinder:2.5.5--hdfd78af_2' input: diff --git a/modules/run_plassembler.nf b/modules/run_plassembler.nf index 4e3ce3c..88e5e41 100644 --- a/modules/run_plassembler.nf +++ b/modules/run_plassembler.nf @@ -1,7 +1,7 @@ process plassembler { tag "DETECTING PLASMIDS AND OTHER MOBILE ELEMENTS: ${barcode}" container 'quay.io/gbouras13/plassembler:1.6.2' - publishDir "${params.outdir}/assemblies", mode: 'symlink' + publishDir "${params.outdir}/assemblies", mode: 'copy' input: tuple val(barcode), path(trimmed_fq), path(flye_assembly) diff --git a/modules/run_pycoqc.nf b/modules/run_pycoqc.nf index 7405d0b..cd3308e 100755 --- a/modules/run_pycoqc.nf +++ b/modules/run_pycoqc.nf @@ -1,7 +1,7 @@ process pycoqc_summary { tag "SUMMARISING RAW OUTPUT FROM ONT RUN: 
${sequencing_summary.fileName}" container 'quay.io/biocontainers/pycoqc:2.5.2--py_0' - publishDir "${params.outdir}/quality_control", mode: 'symlink' + publishDir "${params.outdir}/quality_control", mode: 'copy' input: path(sequencing_summary) diff --git a/modules/run_quast_qc_chromosomes.nf b/modules/run_quast_qc_chromosomes.nf index d711638..62c2d2e 100644 --- a/modules/run_quast_qc_chromosomes.nf +++ b/modules/run_quast_qc_chromosomes.nf @@ -1,27 +1,25 @@ process quast_qc_chromosomes { - tag "EVALUATING GENOME QUALITY: ${barcode}" + tag "EVALUATING GENOME QUALITY: ${barcode}: ${assembler}" container 'quay.io/biocontainers/quast:5.2.0--py39pl5321h2add14b_1' - publishDir "${params.outdir}/quality_control", mode: 'symlink' + publishDir "${params.outdir}/quality_control/${barcode}", mode: 'copy' -input: - tuple val(barcode), path(medaka_consensus) + input: + tuple val(barcode), val(assembler), path(polished_assembly) -output: - tuple val(barcode), path("${barcode}_quast"), emit: quast_qc - tuple val(barcode), path("${barcode}_quast/${barcode}.tsv"), emit: quast_qc_multiqc + output: + tuple val(barcode), val(assembler), path("${prefix}"), emit: results + tuple val(barcode), val(assembler), path("${prefix}.tsv"), emit: tsv -script: + script: + prefix = "${barcode}_${assembler}" """ - for dir in ${medaka_consensus}; do - cat \${dir}/consensus.fasta >> concatenated_consensus.fasta - done - quast.py \\ - --output-dir ${barcode}_quast \\ - -l ${barcode} \\ - concatenated_consensus.fasta + --output-dir $prefix \\ + $polished_assembly \\ + --threads $task.cpus - mv ${barcode}_quast/report.tsv ${barcode}_quast/${barcode}.tsv + # multiqc requires tsv to be named with sample + ln -s ${prefix}/report.tsv ${prefix}.tsv """ } diff --git a/modules/run_quast_qc_flye_chromosomes.nf b/modules/run_quast_qc_flye_chromosomes.nf deleted file mode 100644 index 1bb6c6a..0000000 --- a/modules/run_quast_qc_flye_chromosomes.nf +++ /dev/null @@ -1,24 +0,0 @@ -process quast_qc_flye_chromosomes { - tag "EVALUATING GENOME QUALITY: ${barcode}" - container 'quay.io/biocontainers/quast:5.2.0--py39pl5321h2add14b_1' - publishDir "${params.outdir}/quality_control", mode: 'symlink' - -input: - tuple val(barcode), path(medaka_flye) - -output: - tuple val(barcode), path("${barcode}_quast"), emit: quast_qc - tuple val(barcode), path("${barcode}_quast/${barcode}.tsv"), emit: quast_qc_multiqc - -script: - """ - - quast.py \\ - --output-dir ${barcode}_quast \\ - -l ${barcode} \\ - ${medaka_flye}/consensus.fasta - - mv ${barcode}_quast/report.tsv ${barcode}_quast/${barcode}.tsv - """ - -} diff --git a/modules/run_trycycler_classify.nf b/modules/run_trycycler_classify.nf index 573fff9..2214aff 100755 --- a/modules/run_trycycler_classify.nf +++ b/modules/run_trycycler_classify.nf @@ -6,12 +6,11 @@ process classify_trycycler { tuple val(barcode), path(trycycler_cluster), val(num_contigs) output: - tuple val(barcode), path("${barcode}_discarded/*"), emit: discard_contigs, optional: true - tuple val(barcode), path("${barcode}_for_reconciliation/*"), emit: reconcile_contigs + tuple val(barcode), path("${barcode}_discarded/*"), emit: clusters_to_discard, optional: true + tuple val(barcode), path("${barcode}_for_reconciliation/*"), emit: clusters_to_reconcile script: """ - classify_trycycler_clusters.py \\ - ${barcode} + classify_trycycler_clusters.py ${barcode} """ } diff --git a/modules/run_trycycler_cluster.nf b/modules/run_trycycler_cluster.nf index 842e8dd..16ac675 100755 --- a/modules/run_trycycler_cluster.nf +++ 
b/modules/run_trycycler_cluster.nf @@ -2,21 +2,23 @@ process trycycler_cluster { tag "CLUSTERING CONTIGS: ${barcode}" container 'quay.io/biocontainers/trycycler:0.5.4--pyhdfd78af_0' - errorStrategy 'ignore' // Ignore the error status + // Contigs can be filtered out to be < 2 within the process and segfault + errorStrategy { task.exitStatus == 1 ? 'ignore' : 'terminate' } input: - tuple val(barcode), path(unicycler_assembly), path(flye_assembly), path(trimmed_fq) + tuple val(barcode), path(unicycler_assembly), path(flye_assembly), path(trimmed_fq), val(num_contigs) output: - tuple val(barcode), path("*"), path("${barcode}_cluster/contigs.phylip"), emit: trycycler_cluster + tuple val(barcode), path("${barcode}_cluster/"), emit: clusters + tuple val(barcode), path("${barcode}_cluster/contigs.phylip"), emit: phylip - script: + script: """ trycycler cluster \\ - --assemblies ${barcode}_unicycler_assembly/assembly.fasta ${barcode}_flye_assembly/assembly.fasta \\ - --reads ${barcode}_trimmed.fastq.gz \\ + --assemblies ${unicycler_assembly}/assembly.fasta ${flye_assembly}/assembly.fasta \\ + --reads $trimmed_fq \\ --out_dir ${barcode}_cluster \\ --min_contig_len ${params.trycycler_min_contig_length} \\ - --threads ${task.cpus} || true + --threads ${task.cpus} """ } diff --git a/modules/run_trycycler_consensus.nf b/modules/run_trycycler_consensus.nf index 15b66b7..4d57d44 100644 --- a/modules/run_trycycler_consensus.nf +++ b/modules/run_trycycler_consensus.nf @@ -1,38 +1,17 @@ process trycycler_consensus { - tag "GENERATING CONSENSUS ASSEMBLY: ${barcode}" + tag "GENERATING CONSENSUS ASSEMBLY: ${barcode}: ${cluster_dir}" container 'quay.io/biocontainers/trycycler:0.5.4--pyhdfd78af_0' input: - tuple val(barcode), val(cluster_id), path(msa), path(twoseq_and_partition) + tuple val(barcode), path(cluster_dir) output: - tuple val(barcode), val("${cluster_id}"), path("consensus_out"), emit: consensus_consensus - + tuple val(barcode), path("**/7_final_consensus.fasta"), emit: cluster_assembly script: - """ - # Make a faux input directory to meet trycycler's expectations - mkdir -p consensus_in - - # Capture path to 2_all_seqs.fasta - cp ${twoseq_and_partition}/2_all_seqs.fasta consensus_in/ - - # Capture path to 3_msa.fasta - cp ${msa}/3_msa.fasta consensus_in/ - - # Capture path to 4_reads.fastq - cp ${twoseq_and_partition}/4_reads.fastq consensus_in/ - - # Run trycycler reconcile step trycycler consensus \\ - --cluster_dir consensus_in \\ - --threads ${task.cpus} - - # Save output to new directory - mkdir -p consensus_out - cp consensus_in/5_chunked_sequence.gfa consensus_out/5_chunked_sequence.gfa - cp consensus_in/6_initial_consensus.fasta consensus_out/6_initial_consensus.fasta - cp consensus_in/7_final_consensus.fasta consensus_out/7_final_consensus.fasta + --cluster_dir $cluster_dir \\ + --threads ${task.cpus} """ } diff --git a/modules/run_trycycler_msa.nf b/modules/run_trycycler_msa.nf index 4aba759..74bff0a 100644 --- a/modules/run_trycycler_msa.nf +++ b/modules/run_trycycler_msa.nf @@ -1,30 +1,18 @@ process trycycler_msa { - tag "MULTI SEQUENCE ALIGNMENT FOR SUCCESSFUL TRYCYCLER ASSEMBLY: ${barcode}" + tag "ALIGNING RECONCILED TRYCYCLER SEQS: ${barcode}: ${cluster_dir}" container 'quay.io/biocontainers/trycycler:0.5.4--pyhdfd78af_0' input: - tuple val(barcode), path(consensus_file), path(reconciled_dir) - - //when: - //consensus_good.exists() + tuple val(barcode), path(cluster_dir) output: - tuple val(barcode), val("${barcode}_${reconciled_dir}"), path("${barcode}_${reconciled_dir}_msa"), emit: 
three_msa + tuple val(barcode), path("${cluster_dir}/"), emit: results_dir + tuple val(barcode), path("${cluster_dir}/3_msa.fasta"), emit: aligned_seqs script: """ - # Capture path to reconciled directory - #reconciled_dir=\$(ls -d */* | grep 'barcode01_final/cluster_[0-9]*_reconciled') - #echo \$reconciled_dir - - # Run trycycler MSA step: https://github.com/rrwick/Trycycler/wiki/Multiple-sequence-alignment trycycler msa \\ - --cluster_dir $reconciled_dir \\ + --cluster_dir $cluster_dir \\ --threads ${task.cpus} - - # Move the 3_msa.fasta to out directory - mkdir -p ${barcode}_${reconciled_dir}_msa - cp ${reconciled_dir}/3_msa.fasta ${barcode}_${reconciled_dir}_msa/3_msa.fasta - """ } diff --git a/modules/run_trycycler_partition.nf b/modules/run_trycycler_partition.nf index a1a2a78..be669a0 100644 --- a/modules/run_trycycler_partition.nf +++ b/modules/run_trycycler_partition.nf @@ -3,35 +3,17 @@ process trycycler_partition { container 'quay.io/biocontainers/trycycler:0.5.4--pyhdfd78af_0' input: - tuple val(barcode), path(consensus_file), path(consensus_good), path(trimmed_fq) + tuple val(barcode), path(cluster_dirs), path(trimmed_fq) - //when: - //consensus_good.exists() - output: - tuple val(barcode), path("${barcode}_partitioned"), emit: four_reads + tuple val(barcode), path("**/4_reads.fastq"), emit: partitioned_reads script: """ - # Capture path to reconciled directory - reconciled_dir=\$(find -L . -type d -name 'cluster_*_reconciled') - echo \$reconciled_dir - - echo $consensus_good - - # Run trycycler partition step: https://github.com/rrwick/Trycycler/wiki/Partitioning-reads trycycler partition \\ - --reads ${barcode}_trimmed.fastq.gz \\ - --cluster_dirs \$reconciled_dir - - # Move files to out directory - mkdir -p ${barcode}_partitioned - for dir in \$reconciled_dir; do - mkdir -p ${barcode}_partitioned/\${dir} - cp \${dir}/2_all_seqs.fasta ${barcode}_partitioned/\${dir}/2_all_seqs.fasta - cp \${dir}/4_reads.fastq ${barcode}_partitioned/\${dir}/4_reads.fastq - - done + --reads $trimmed_fq \\ + --cluster_dirs $cluster_dirs \\ + --threads ${task.cpus} """ } diff --git a/modules/run_trycycler_reconcile.nf b/modules/run_trycycler_reconcile.nf index 0f7536d..7350d2d 100755 --- a/modules/run_trycycler_reconcile.nf +++ b/modules/run_trycycler_reconcile.nf @@ -1,35 +1,26 @@ process trycycler_reconcile { - tag "RECONCILING CONTIGS: ${barcode}:${reconcile_contigs}" + tag "RECONCILING CONTIGS: ${barcode}: ${cluster_dir}" container 'quay.io/biocontainers/trycycler:0.5.4--pyhdfd78af_0' input: - tuple val(barcode), path(reconcile_contigs), path(trimmed_fq) + tuple val(barcode), path(cluster_dir), path(trimmed_fq) output: - tuple val(barcode), path("${reconcile_contigs}_reconciled/"), emit: reconciled_seqs, optional: true - + tuple val(barcode), path("${cluster_dir}/"), emit: results_dir + tuple val(barcode), path("${cluster_dir}/2_all_seqs.fasta"), emit: reconciled_seqs, optional: true + /* + * optional:true as reconciliation may be unsuccessful (true negative) + * i.e.
if there is too much of a length difference between contigs in the + * one cluster + */ + script: """ - echo "Running trycycler reconcile for barcode: ${barcode}" - echo "Input directory: ${reconcile_contigs}" - echo "Output directory: ${reconcile_contigs}_reconciled/" - echo "Trimmed reads: ${barcode}_trimmed.fastq.gz" - trycycler reconcile \\ - --reads ${barcode}_trimmed.fastq.gz \\ - --cluster_dir ${reconcile_contigs} \\ + --reads $trimmed_fq \\ + --cluster_dir $cluster_dir \\ --threads ${task.cpus} \\ --min_1kbp_identity 10 \\ --max_add_seq 2500 - - if [ -f "${reconcile_contigs}/2_all_seqs.fasta" ]; then - echo "File 2_all_seqs.fasta found, moving to output directory." - # Set up the output directory - mkdir -p ${reconcile_contigs}_reconciled/ - # Move output from input directory to _reconciled output directory - mv ${reconcile_contigs}/2_all_seqs.fasta ${reconcile_contigs}_reconciled/2_all_seqs.fasta - else - echo "File 2_all_seqs.fasta not found. Please check the trycycler reconcile command and input files." - fi """ } diff --git a/modules/select_assembly.nf b/modules/select_assembly.nf index e53e489..dbd2c5b 100755 --- a/modules/select_assembly.nf +++ b/modules/select_assembly.nf @@ -1,23 +1,17 @@ process select_assembly { - tag "EVALUATING CONSENSUS ASSEMBLY QUALITY: ${barcode}" + tag "${barcode}" container 'python:3.8' input: - tuple val(barcode), path(reconciled), path(flye_assembly), path(kraken2_report) - path(ncbi_lookup) + tuple val(barcode), path(busco_jsons) output: - tuple val(barcode), path("Consensus.txt"), path("${barcode}_final/*"), emit: consensus_good, optional: true - tuple val(barcode), path("FlyeOnly.txt"), path("${barcode}_flye_assembly/Chr_contigs/"), emit: consensus_discard, optional: true + tuple val(barcode), path("best_assembly.txt") script: - //TODO double check this is capturing all reconciled clusters + // Save to file (instead of print to stdout) for nextflow caching """ - select_assembly.py \\ - ${barcode} \\ - ${reconciled.join(' ')} \\ - ${flye_assembly} \\ - ${kraken2_report} \\ - ${ncbi_lookup} + compare_busco.py $busco_jsons > best_assembly.txt """ + } diff --git a/nextflow.config b/nextflow.config index c613a75..4d3ed58 100755 --- a/nextflow.config +++ b/nextflow.config @@ -210,7 +210,7 @@ process { executor = 'pbspro' queue = 'normal' cpus = 1 - time = '10h' + time = '10h' memory = 3.Gb } @@ -238,12 +238,12 @@ process { memory = 20.Gb } - withName: 'medaka_polish_flye' { + withName: 'medaka_polish_denovo' { executor = 'pbspro' queue = 'normal' cpus = 2 // medaka itself says more than 2 cpus is a waste time = '10h' - memory = 40.Gb + memory = 20.Gb } withName: 'plassembler' { @@ -270,14 +270,6 @@ process { memory = 10.Gb } - withName: 'bakta_annotation_flye_chromosomes' { - executor = 'pbspro' - queue = 'normal' - cpus = 1 - time = '10h' - memory = 10.Gb - } - withName: 'quast_qc_chromosomes' { executor = 'pbspro' queue = 'normal' @@ -286,14 +278,6 @@ process { memory = 5.Gb } - withName: 'quast_qc_flye_chromosomes' { - executor = 'pbspro' - queue = 'normal' - cpus = 1 - time = '1h' - memory = 5.Gb - } - withName: 'abricateVFDB_annotation_chromosomes' { executor = 'pbspro' queue = 'normal' @@ -302,14 +286,6 @@ process { memory = 2.Gb } - withName: 'abricateVFDB_annotation_flye_chromosomes' { - executor = 'pbspro' - queue = 'normal' - cpus = 1 - time = '1h' - memory = 2.Gb - } - withName: 'abricateVFDB_annotation_reference' { executor = 'pbspro' queue = 'normal' @@ -326,14 +302,6 @@ process { memory = 2.Gb } - withName: 
'amrfinderplus_annotation_flye_chromosomes' { - executor = 'pbspro' - queue = 'normal' - cpus = 1 - time = '1h' - memory = 2.Gb - } - withName: 'amrfinderplus_annotation_reference' { executor = 'pbspro' queue = 'normal'
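
The new bin/compare_busco.py helper introduced at the top of this patch is driven by select_assembly, which passes it one BUSCO JSON per assembler and keeps whatever assembler name the script prints. The sketch below is illustrative only and is not part of the patch: the file names and percentages are fabricated, and the only assumption made about the real BUSCO short-summary JSONs is that they expose the two fields the script actually reads, parameters.out (named "<barcode>_<assembler>_busco" by busco_qc_chromosomes) and results["Complete percentage"].

#!/usr/bin/env python3
# Illustrative sketch only -- exercises the same JSON fields compare_busco.py reads.
import json
import tempfile
from pathlib import Path

# Fabricated short-summary JSONs containing only the keys the script uses.
fake_runs = {
    "barcode01_flye_busco": 97.5,
    "barcode01_consensus_busco": 99.1,
}

tmpdir = Path(tempfile.mkdtemp())
json_files = []
for out_name, pct in fake_runs.items():
    payload = {
        "parameters": {"out": out_name},
        "results": {"Complete percentage": pct},
    }
    jf = tmpdir / f"{out_name}.json"
    jf.write_text(json.dumps(payload))
    json_files.append(jf)

# Same selection logic as compare_busco.py: the assembler name is the middle
# token of the BUSCO output name, and the winner has the highest Complete %.
results = []
for jf in json_files:
    j = json.loads(jf.read_text())
    assembler = j["parameters"]["out"].split("_")[1]
    results.append((assembler, j["results"]["Complete percentage"]))

best = max(results, key=lambda r: r[1])
print(best[0])  # prints "consensus" for the fabricated numbers above

Because the script prints the winner without a trailing newline, select_assembly redirects stdout to best_assembly.txt and main.nf reads it back with splitText()[0].strip(), so the value used as a join key stays stable for Nextflow caching across resumed runs.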
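
The contig-counting gate added to main.nf (countFasta() on each de novo assembly, then a branch that only sends barcodes with at least three contigs in total to Trycycler) can also be read as a small stand-alone check. The Python sketch below re-implements that decision outside the pipeline; the file layout ("<barcode>_<assembler>_assembly/assembly.fasta") is assumed for illustration.

from pathlib import Path
from typing import Iterable

def count_fasta_records(fasta: Path) -> int:
    # Mirrors Nextflow's countFasta(): one record per '>' header line.
    with fasta.open() as fh:
        return sum(1 for line in fh if line.startswith(">"))

def should_run_trycycler(assemblies: Iterable[Path], min_contigs: int = 3) -> bool:
    # Per the workflow comment, Trycycler needs >= 3 contigs in total
    # across the per-barcode assemblies before clustering is attempted.
    total = sum(count_fasta_records(fa) for fa in assemblies if fa.exists())
    return total >= min_contigs

# Hypothetical per-barcode layout; adjust to wherever the assemblies are staged.
assemblies = [
    Path("barcode01_unicycler_assembly/assembly.fasta"),
    Path("barcode01_flye_assembly/assembly.fasta"),
]
print("run trycycler" if should_run_trycycler(assemblies) else "skip trycycler")

As the workflow comments note, trycycler cluster can itself drop contigs shorter than --min_contig_len, which is why main.nf re-checks the contigs.phylip line count after clustering and applies the same branching logic again before reconciliation.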