From c18de39a419659720e2482df14df21affdc30f47 Mon Sep 17 00:00:00 2001 From: Maxime U Garcia Date: Thu, 6 Feb 2025 19:34:23 +0100 Subject: [PATCH] add subworfklow + schema for references (#7412) * add subworfklow + schema for references * s3 -> https * no function tests * better function name * restore fake workflow for tests * add DISCLAIMER * better tests * use format: path --- .../nf-core/utils_references/README.md | 13 + subworkflows/nf-core/utils_references/main.nf | 60 +++ .../nf-core/utils_references/meta.yml | 14 + .../utils_references/schema_references.json | 376 ++++++++++++++++++ .../utils_references/tests/main.nf.test | 27 ++ .../utils_references/tests/main.nf.test.snap | 45 +++ 6 files changed, 535 insertions(+) create mode 100644 subworkflows/nf-core/utils_references/README.md create mode 100644 subworkflows/nf-core/utils_references/main.nf create mode 100644 subworkflows/nf-core/utils_references/meta.yml create mode 100644 subworkflows/nf-core/utils_references/schema_references.json create mode 100644 subworkflows/nf-core/utils_references/tests/main.nf.test create mode 100644 subworkflows/nf-core/utils_references/tests/main.nf.test.snap diff --git a/subworkflows/nf-core/utils_references/README.md b/subworkflows/nf-core/utils_references/README.md new file mode 100644 index 00000000000..03da289f335 --- /dev/null +++ b/subworkflows/nf-core/utils_references/README.md @@ -0,0 +1,13 @@ +# Disclaimer + +This `utils_references/` folder currently contains two functions and a schema. +It is a proof of concept (POC) and should not be installed by anyone except @maxulysse. +It lives here because that was the easiest way to share the functions and the schema between three different pipelines while still showcasing the logic. +This might evolve in the future, possibly towards a proper plugin.
+ +If you do so, please be aware that: + +- @maxulysse has hacked the `main.nf` to test the functions and the schema +- This is really meant to evolve in the future and can be deleted at any moment without prior notice. + +That being said, if you still want to use it or want to know more about it, please check the `#references` channel on the nf-core slack. diff --git a/subworkflows/nf-core/utils_references/main.nf b/subworkflows/nf-core/utils_references/main.nf new file mode 100644 index 00000000000..c557acc0a6e --- /dev/null +++ b/subworkflows/nf-core/utils_references/main.nf @@ -0,0 +1,60 @@ +// DISCLAIMER: +// This subworkflow is just to test the functions and the schema +// It should not be used in any pipeline + +// This include statement can also be deleted +include { samplesheetToList } from 'plugin/nf-schema' + +workflow UTILS_REFERENCES { + take: + yaml_reference + param_file + param_value + attribute_file + attribute_value + basepath + + main: + references = Channel.fromList(samplesheetToList(yaml_reference, "${projectDir}/subworkflows/nf-core/utils_references/schema_references.json")) + + // GIVING up writing a test for the functions, so writing a subworkflow to test it + references_file = get_references_file(references, param_file, attribute_file, basepath) + references_value = get_references_value(references, param_value, attribute_value) + + emit: + references_file + references_value +} +// You can delete everything before this line (including this line) + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + FUNCTIONS TO EXTRACT REFERENCES FILES OR VALUES FROM THE REFERENCES YAML OR PARAMS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +def get_references_file(references, param, attribute, basepath) { + return references + .map { meta, _readme -> + if (param || meta[attribute]) { + [meta.subMap(['id']), file(param ?: 
meta[attribute].replace('${params.igenomes_base}', basepath), checkIfExists: true)] + } + else { + null + } + } + .collect() +} + +def get_references_value(references, param, attribute) { + return references + .map { meta, _readme -> + if (param || meta[attribute]) { + [meta.subMap(['id']), param ?: meta[attribute]] + } + else { + null + } + } + .collect() +} diff --git a/subworkflows/nf-core/utils_references/meta.yml b/subworkflows/nf-core/utils_references/meta.yml new file mode 100644 index 00000000000..491c79c969e --- /dev/null +++ b/subworkflows/nf-core/utils_references/meta.yml @@ -0,0 +1,14 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "utils_references" +description: Functionality for dealing with references that may be useful for any Nextflow pipeline +keywords: + - utility + - pipeline + - references +components: [] +input: [] +output: [] +authors: + - "@maxulysse" +maintainers: + - "@maxulysse" diff --git a/subworkflows/nf-core/utils_references/schema_references.json b/subworkflows/nf-core/utils_references/schema_references.json new file mode 100644 index 00000000000..56829a1086b --- /dev/null +++ b/subworkflows/nf-core/utils_references/schema_references.json @@ -0,0 +1,376 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com/nf-core/references/master/assets/schema_asset.json", + "title": "nf-core/references pipeline - params.asset schema", + "description": "Schema for the file provided with params.asset", + "type": "array", + "items": { + "type": "object", + "properties": { + "genome": { + "meta": ["genome", "id"], + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Genome name must be provided, cannot contain spaces" + }, + "site": { + "meta": ["site"], + "type": "string", + "pattern": "^\\S+$", + "default": "unknown", + "errorMessage": "Website of origin of the reference, cannot contain spaces" + }, + 
"source": { + "meta": ["source"], + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Source of genome must be provided, cannot contain spaces" + }, + "source_version": { + "meta": ["source_version"], + "type": "string", + "pattern": "^\\S+$", + "default": "unknown", + "errorMessage": "Source version used to create annotation files (gff/gtf related files), cannot contain spaces" + }, + "species": { + "meta": ["species"], + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Species of the reference, cannot contain spaces" + }, + "ascat_alleles": { + "meta": ["ascat_alleles"], + "type": "string", + "format": "path", + "pattern": "^\\S+$", + "errorMessage": "TODO" + }, + "ascat_loci": { + "meta": ["ascat_loci"], + "type": "string", + "format": "path", + "pattern": "^\\S+$", + "errorMessage": "TODO" + }, + "ascat_loci_gc": { + "meta": ["ascat_loci_gc"], + "type": "string", + "format": "path", + "pattern": "^\\S+$", + "errorMessage": "TODO" + }, + "bed12": { + "meta": ["bed12"], + "type": "string", + "format": "path", + "pattern": "^\\S+$", + "errorMessage": "TODO" + }, + "bowtie1_index": { + "meta": ["bowtie1_index"], + "type": "string", + "format": "path", + "pattern": "^\\S+$", + "errorMessage": "Bowtie1 index, cannot contain spaces" + }, + "bowtie2_index": { + "meta": ["bowtie2_index"], + "type": "string", + "format": "path", + "pattern": "^\\S+$", + "errorMessage": "Bowtie2 index, cannot contain spaces" + }, + "bwamem1_index": { + "meta": ["bwamem1_index"], + "type": "string", + "format": "path", + "pattern": "^\\S+$", + "errorMessage": "BWA-MEM index, cannot contain spaces" + }, + "bwamem2_index": { + "meta": ["bwamem2_index"], + "type": "string", + "format": "path", + "pattern": "^\\S+$", + "errorMessage": "BWA-MEM2 index, cannot contain spaces" + }, + "dragmap_hashtable": { + "meta": ["dragmap_hashtable"], + "type": "string", + "format": "path", + "pattern": "^\\S+$", + "errorMessage": "DRAGMAP hashtable, cannot contain spaces" + }, + 
"chr_dir": { + "meta": ["chr_dir"], + "type": "string", + "format": "path", + "pattern": "^\\S+$", + "errorMessage": "TODO" + }, + "fasta": { + "meta": ["fasta"], + "type": "string", + "format": "path", + "pattern": "^\\S+\\.f(ast|n)?a(\\.gz)?$", + "errorMessage": "Fasta file [required when creating a reference], cannot contain spaces" + }, + "fasta_dict": { + "meta": ["fasta_dict"], + "type": "string", + "format": "path", + "pattern": "^\\S+\\.dict(\\.gz)?$", + "errorMessage": "Fasta dictionary, cannot contain spaces" + }, + "fasta_fai": { + "meta": ["fasta_fai"], + "type": "string", + "format": "path", + "pattern": "^\\S+\\.f(ast|n)?a\\.fai(\\.gz)?$", + "errorMessage": "Fasta index, cannot contain spaces" + }, + "fasta_sizes": { + "meta": ["fasta_sizes"], + "type": "string", + "format": "path", + "pattern": "^\\S+\\.f(ast|n)?a\\.sizes(\\.gz)?$", + "errorMessage": "Fasta sizes, cannot contain spaces" + }, + "gff": { + "meta": ["gff"], + "type": "string", + "format": "path", + "pattern": "^\\S+\\.gff(\\.gz)?$", + "errorMessage": "GFF3 file, required when no GTF is provided and wanting to build a reference needing such genes annotation, cannot contain spaces" + }, + "gtf": { + "meta": ["gtf"], + "type": "string", + "format": "path", + "pattern": "^\\S+\\.gtf(\\.gz)?$", + "errorMessage": "GTF file, required when no GFF3 is provided and wanting to build a reference needing such genes annotation, cannot contain spaces" + }, + "hisat2_index": { + "meta": ["hisat2_index"], + "type": "string", + "format": "path", + "pattern": "^\\S+$", + "errorMessage": "HISAT2 index, cannot contain spaces" + }, + "intervals_bed": { + "meta": ["intervals_bed"], + "type": "string", + "format": "path", + "pattern": "^\\S+\\.bed$", + "errorMessage": "Fasta intervals bed, cannot contain spaces " + }, + "kallisto_index": { + "meta": ["kallisto_index"], + "type": "string", + "format": "path", + "pattern": "^\\S+$", + "errorMessage": "Kallisto index, cannot contain spaces" + }, + "macs_gsize": { 
+ "meta": ["macs_gsize"], + "type": "number", + "errorMessage": "TODO" + }, + "mito_name": { + "meta": ["mito_name"], + "type": "string", + "format": "path", + "pattern": "^\\S+$", + "errorMessage": "TODO" + }, + "msisensorpro_list": { + "meta": ["msisensorpro_list"], + "type": "string", + "format": "path", + "pattern": "^\\S+$", + "errorMessage": "MSIsensor-pro list, cannot contain spaces" + }, + "ngscheckmate_bed": { + "meta": ["ngscheckmate_bed"], + "type": "string", + "format": "path", + "pattern": "^\\S+\\.bed$", + "errorMessage": "ngscheckmate bed, cannot contain spaces " + }, + "readme": { + "type": "string", + "format": "path", + "pattern": "^\\S+$", + "errorMessage": "README file describing the reference, cannot contain spaces" + }, + "rsem_index": { + "meta": ["rsem_index"], + "type": "string", + "format": "path", + "pattern": "^\\S+$", + "errorMessage": "RSEM index, cannot contain spaces" + }, + "salmon_index": { + "meta": ["salmon_index"], + "type": "string", + "format": "path", + "pattern": "^\\S+$", + "errorMessage": "Salmon index, cannot contain spaces" + }, + "splice_sites": { + "meta": ["splice_sites"], + "type": "string", + "format": "path", + "pattern": "^\\S+(\\.splice_sites)(\\.txt)?$", + "errorMessage": "Splice sites [can be generated with HISAT2], cannot contain spaces" + }, + "star_index": { + "meta": ["star_index"], + "type": "string", + "format": "path", + "pattern": "^\\S+$", + "errorMessage": "STAR index, cannot contain spaces" + }, + "snpeff_db": { + "meta": ["snpeff_db"], + "type": "string", + "format": "path", + "pattern": "^\\S+$", + "errorMessage": "SnpEff database, cannot contain spaces" + }, + "transcript_fasta": { + "meta": ["transcript_fasta"], + "type": "string", + "format": "path", + "pattern": "^\\S+\\.f(ast|n)?a(\\.gz)?$", + "errorMessage": "Transcript fasta [can be generated with RSEM], cannot contain spaces" + }, + "vep_cache_version": { + "meta": ["vep_cache_version"], + "type": "string", + "pattern": "^\\S+$", + 
"errorMessage": "VEP cache version, cannot contain spaces" + }, + "vep_genome": { + "meta": ["vep_genome"], + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "VEP genome, cannot contain spaces" + }, + "vep_species": { + "meta": ["vep_species"], + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "VEP species, cannot contain spaces" + }, + "vcf_dbsnp_vcf": { + "meta": ["vcf_dbsnp_vcf"], + "type": "string", + "format": "path", + "pattern": "^\\S+\\.vcf(\\.gz)?$", + "errorMessage": "VCF file (can be bgzipped), cannot contain spaces" + }, + "vcf_dbsnp_vcf_tbi": { + "meta": ["vcf_dbsnp_vcf_tbi"], + "type": "string", + "format": "path", + "pattern": "^\\S+\\.vcf\\.gz\\.tbi?$", + "errorMessage": "VCF tabix index, cannot contain spaces" + }, + "vcf_dbsnp_vcf_vqsr": { + "meta": ["vcf_dbsnp_vcf_vqsr"], + "type": "string", + "errorMessage": "VCF VQSR input, can contain spaces" + }, + "vcf_dbsnp_vcf_source": { + "meta": ["vcf_dbsnp_vcf_source"], + "type": "string", + "format": "path", + "pattern": "^\\S+$", + "errorMessage": "Source of dbsnp, cannot contain spaces" + }, + "vcf_germline_resource_vcf": { + "meta": ["vcf_germline_resource_vcf"], + "type": "string", + "format": "path", + "pattern": "^\\S+\\.vcf(\\.gz)?$", + "errorMessage": "VCF file (can be bgzipped), cannot contain spaces" + }, + "vcf_germline_resource_vcf_tbi": { + "meta": ["vcf_germline_resource_vcf_tbi"], + "type": "string", + "format": "path", + "pattern": "^\\S+\\.vcf\\.gz\\.tbi?$", + "errorMessage": "VCF tabix index, cannot contain spaces" + }, + "vcf_germline_resource_vcf_source": { + "meta": ["vcf_germline_resource_vcf_source"], + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Source of germline_resource, cannot contain spaces" + }, + "vcf_known_indels_vcf": { + "meta": ["vcf_known_indels_vcf"], + "type": "string", + "format": "path", + "pattern": "^\\S+\\.vcf(\\.gz)?$", + "errorMessage": "VCF file (can be bgzipped), cannot contain spaces" + }, + 
"vcf_known_indels_vcf_tbi": { + "meta": ["vcf_known_indels_vcf_tbi"], + "type": "string", + "format": "path", + "pattern": "^\\S+\\.vcf\\.gz\\.tbi?$", + "errorMessage": "VCF tabix index, cannot contain spaces" + }, + "vcf_known_indels_vcf_source": { + "meta": ["vcf_known_indels_vcf_source"], + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Source of known_indels, cannot contain spaces" + }, + "vcf_known_snps_vcf": { + "meta": ["vcf_known_snps_vcf"], + "type": "string", + "format": "path", + "pattern": "^\\S+\\.vcf(\\.gz)?$", + "errorMessage": "VCF file (can be bgzipped), cannot contain spaces" + }, + "vcf_known_snps_vcf_tbi": { + "meta": ["vcf_known_snps_vcf_tbi"], + "type": "string", + "format": "path", + "pattern": "^\\S+\\.vcf\\.gz\\.tbi?$", + "errorMessage": "VCF tabix index, cannot contain spaces" + }, + "vcf_known_snps_vcf_source": { + "meta": ["vcf_known_snps_vcf_source"], + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Source of known_snps, cannot contain spaces" + }, + "vcf_pon_vcf": { + "meta": ["vcf_pon_vcf"], + "type": "string", + "format": "path", + "pattern": "^\\S+\\.vcf(\\.gz)?$", + "errorMessage": "VCF file (can be bgzipped), cannot contain spaces" + }, + "vcf_pon_vcf_tbi": { + "meta": ["vcf_pon_vcf_tbi"], + "type": "string", + "format": "path", + "pattern": "^\\S+\\.vcf\\.gz\\.tbi?$", + "errorMessage": "VCF tabix index, cannot contain spaces" + }, + "vcf_pon_vcf_source": { + "meta": ["vcf_pon_vcf_source"], + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Source of pon, cannot contain spaces" + } + }, + "required": ["genome"] + } +} diff --git a/subworkflows/nf-core/utils_references/tests/main.nf.test b/subworkflows/nf-core/utils_references/tests/main.nf.test new file mode 100644 index 00000000000..d16de16e686 --- /dev/null +++ b/subworkflows/nf-core/utils_references/tests/main.nf.test @@ -0,0 +1,27 @@ +nextflow_workflow { + + name "Test Workflow UTILS_REFERENCES" + script "../main.nf" + workflow 
"UTILS_REFERENCES" + + test("references_file with params - references_value without params") { + + when { + workflow { + """ + input[0] = 'https://raw.githubusercontent.com/nf-core/references-assets/main/genomes/Homo_sapiens/test/GRCh38_chr22.yml' + input[1] = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/genome/genome.fasta' + input[2] = null + input[3] = 'fasta' + input[4] = 'species' + input[5] = 'https://raw.githubusercontent.com/nf-core/' + """ + } + } + + then { + assert workflow.success + assert snapshot(workflow.out).match() + } + } +} diff --git a/subworkflows/nf-core/utils_references/tests/main.nf.test.snap b/subworkflows/nf-core/utils_references/tests/main.nf.test.snap new file mode 100644 index 00000000000..f5bf3bb34ba --- /dev/null +++ b/subworkflows/nf-core/utils_references/tests/main.nf.test.snap @@ -0,0 +1,45 @@ +{ + "references_file with params - references_value without params": { + "content": [ + { + "0": [ + [ + { + "id": "GRCh38_chr22" + }, + "/nf-core/test-datasets/modules/data/genomics/sarscov2/genome/genome.fasta" + ] + ], + "1": [ + [ + { + "id": "GRCh38_chr22" + }, + "Homo_sapiens" + ] + ], + "references_file": [ + [ + { + "id": "GRCh38_chr22" + }, + "/nf-core/test-datasets/modules/data/genomics/sarscov2/genome/genome.fasta" + ] + ], + "references_value": [ + [ + { + "id": "GRCh38_chr22" + }, + "Homo_sapiens" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.4" + }, + "timestamp": "2025-02-03T18:21:58.076068554" + } +} \ No newline at end of file