diff --git a/CHANGELOG.md b/CHANGELOG.md index 9cbd4974..de529890 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -89,6 +89,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#489](https://github.com/genomic-medicine-sweden/nallo/pull/489) - Updated nf-core template to 3.0.2 - [#493](https://github.com/genomic-medicine-sweden/nallo/pull/493) - Refactored `nallo.nf` to remove many nested ifs and easier to follow logic - [#493](https://github.com/genomic-medicine-sweden/nallo/pull/493) - Updated rank_variants dependencies with sv_annotation +- [#502](https://github.com/genomic-medicine-sweden/nallo/pull/502) - Changed to annotating and ranking SNVs per family instead of per project +- [#502](https://github.com/genomic-medicine-sweden/nallo/pull/502) - Changed output documentation and structure to match `sample` and `family` for all variants ### `Removed` @@ -97,6 +99,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#379](https://github.com/genomic-medicine-sweden/nallo/pull/379) - Removed VEP Plugins from testdata ([genomic-medicine-sweden/test-datasets#16](https://github.com/genomic-medicine-sweden/test-datasets/pull/16)) - [#388](https://github.com/genomic-medicine-sweden/nallo/pull/388) - Removed support for co-phasing SVs with HiPhase, as the officially supported caller (pbsv) is not in the pipeline - [#412](https://github.com/genomic-medicine-sweden/nallo/pull/412) - Removed `bcftools/index`, as indexing is handled by other modules and no references remained. 
([#377](https://github.com/genomic-medicine-sweden/nallo/issues/377)) +- [#502](https://github.com/genomic-medicine-sweden/nallo/pull/502) - Removed support for automatically creating an echvar database with SNVs and INDELs ### `Fixed` @@ -107,6 +110,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#402](https://github.com/genomic-medicine-sweden/nallo/pull/402) - Fixed double sample names in HiFiCNV output - [#438](https://github.com/genomic-medicine-sweden/nallo/pull/438) - Fixed missing/malformed software versions in `ADD_FOUND_IN_TAG`, `ADD_MOST_SEVERE_CSQ`, `ADD_MOST_SEVERE_PLI`, `SAMPLESHEET_PED`, `SOMALIER_PED` and `TRGT` - [#444](https://github.com/genomic-medicine-sweden/nallo/pull/444) - Fixed genmod assigning wrong models on chromosome X when named `chrX` ([#343](https://github.com/genomic-medicine-sweden/nallo/issues/343)) +- [#502](https://github.com/genomic-medicine-sweden/nallo/pull/502) - Fixed genmod only scoring compounds in one family [#501](https://github.com/genomic-medicine-sweden/nallo/issues/501) ### Parameters diff --git a/conf/modules/call_svs.config b/conf/modules/call_svs.config index 8781e3fd..2969b1ff 100644 --- a/conf/modules/call_svs.config +++ b/conf/modules/call_svs.config @@ -75,7 +75,7 @@ process { '--write-index=tbi' ].join(' ') publishDir = [ - path: { "${params.outdir}/svs/single_sample/${meta.id}" }, + path: { "${params.outdir}/svs/sample/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] diff --git a/conf/modules/general.config b/conf/modules/general.config index bcec5299..5560368e 100644 --- a/conf/modules/general.config +++ b/conf/modules/general.config @@ -27,7 +27,7 @@ process { withName: '.*:NALLO:BCFTOOLS_STATS' { ext.prefix = { "${vcf}" } publishDir = [ - path: { "${params.outdir}/snvs/stats/single_sample" }, + path: { "${params.outdir}/snvs/stats/sample" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -51,7 +51,7 @@ process { '--write-index=tbi' ].join(' ') } publishDir = [ - path: { "${params.outdir}/snvs/multi_sample/${meta.id}" }, + path: { "${params.outdir}/snvs/family/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -64,7 +64,7 @@ process { '--write-index=tbi' ].join(' ') publishDir = [ - path: { "${params.outdir}/snvs/single_sample/" }, + path: { "${params.outdir}/snvs/sample/" }, mode: params.publish_dir_mode, // Can't use prefix as it would come from the original file saveAs: { filename -> @@ -88,7 +88,7 @@ process { '--write-index=tbi' ].join(' ') publishDir = [ - path: { "${params.outdir}/svs/single_sample/" }, + path: { "${params.outdir}/svs/sample/" }, mode: params.publish_dir_mode, // Can't use prefix as it would come from the original file saveAs: { filename -> @@ -111,13 +111,6 @@ process { ] } - withName: '.*:NALLO:SOMALIER_PED' { - publishDir = [ - path: { "${params.outdir}/pedigree/project/" }, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - ] - } - withName: '.*:NALLO:SOMALIER_PED_FAMILY' { publishDir = [ path: { "${params.outdir}/pedigree/family/" }, @@ -198,14 +191,6 @@ process { ] } - withName: '.*:NALLO:ECHTVAR_ENCODE' { - publishDir = [ - path: { "${params.outdir}/databases/echtvar/encode/${meta.id}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Summary diff --git a/docs/output.md b/docs/output.md index a3bb523a..a72973a4 100644 --- a/docs/output.md +++ b/docs/output.md @@ -1,5 +1,7 @@ # genomic-medicine-sweden/nallo: Output +In general, annotated variant calls are output per family, while unannotated calls are output per sample. + ## Aligned reads [Minimap2](https://github.com/lh3/minimap2) is used to map the reads to a reference genome. The aligned reads are sorted, (merged) and indexed using [samtools](https://github.com/samtools/samtools). @@ -121,17 +123,12 @@ If the pipeline is run with phasing, the aligned reads will be happlotagged usin [somalier](https://github.com/brentp/somalier) checks relatedness and sex. -| Path | Description | -| ----------------------------------------------------- | ------------------------------------------------------- | -| `pedigree/{project}/{project}.ped` | PED file updated with somalier-inferred sex per project | -| `pedigree/{family}/{family).ped` | PED file updated with somalier-inferred sex per family | -| `qc/somalier/relate/{project}/{project}.html` | HTML report | -| `qc/somalier/relate/{project}/{project}.pairs.tsv` | Information about sample pairs | -| `/qc/somalier/relate/{project}/{project}.samples.tsv` | Information about individual samples | - -| Path | Description | -| ------------------------- | ------------------------------------------- | -| `predigree/{project}.ped` | PED file updated with somalier-inferred sex | +| Path | Description | +| ---------------------------------------------------- | ------------------------------------------------------ | +| `pedigree/family/{family}.ped` | PED file updated with somalier-inferred sex per family | +| `qc/somalier/relate/{project}/{project}.html` | HTML report | +| `qc/somalier/relate/{project}/{project}.pairs.tsv` | Information about sample pairs | +| 
`qc/somalier/relate/{project}/{project}.samples.tsv` | Information about individual samples | ### DeepVariant @@ -147,13 +144,13 @@ If the pipeline is run with phasing, the aligned reads will be happlotagged usin [Paraphase](https://github.com/PacificBiosciences/paraphase) is used to call paralogous genes. -| Path | Description | -| ----------------------------------------------------------- | --------------------------------------- | -| `paraphase/{sample}/*.bam` | BAM file with haplotypes grouped by HP | -| `paraphase/{sample}/*.bai` | Index of the BAM file | -| `paraphase/{sample}/*.json` | Summary of haplotypes and variant calls | -| `paraphase/{sample}_paraphase_vcfs/{sample}_{gene}_vcf` | VCF file per gene | -| `paraphase/{sample}_paraphase_vcfs/{sample}_{gene}_vcf.tbi` | Index of the VCF file | +| Path | Description | +| -------------------------------------------------------------------- | --------------------------------------- | +| `paraphase/{sample}/*.bam` | BAM file with haplotypes grouped by HP | +| `paraphase/{sample}/*.bai` | Index of the BAM file | +| `paraphase/{sample}/*.json` | Summary of haplotypes and variant calls | +| `paraphase/{sample}/{sample}_paraphase_vcfs/{sample}_{gene}_vcf` | VCF file per gene | +| `paraphase/{sample}/{sample}_paraphase_vcfs/{sample}_{gene}_vcf.tbi` | Index of the VCF file | ### Repeats @@ -163,21 +160,21 @@ If the pipeline is run with phasing, the aligned reads will be happlotagged usin Merged variants per family are only output without annotation if `--skip_repeat_annotation` is true. Variants per sample are always output without annotation. 
-| Path | Description | -| -------------------------------------------------------- | ----------------------------------------- | -| `repeats/{family}/{family}_repeat_expansions.vcf.gz` | Merged VCF file per family | -| `repeats/{family}/{family}_repeat_expansions.vcf.gz.tbi` | Index of the VCF file | -| `repeats/sample/{sample}/{sample}_sorted.vcf.gz` | VCF file with called repeats for a sample | -| `repeats/sample/{sample}/{sample}_sorted.vcf.gz.tbi` | Index of the VCF file | -| `repeats/sample/{sample}/{sample}_spanning_sorted.bam` | BAM file with sorted spanning reads | -| `repeats/sample/{sample}/{sample}_spanning_sorted.bai` | Index of the BAM file | +| Path | Description | +| --------------------------------------------------------------- | ----------------------------------------- | +| `repeats/family/{family}/{family}_repeat_expansions.vcf.gz` | Merged VCF file per family | +| `repeats/family/{family}/{family}_repeat_expansions.vcf.gz.tbi` | Index of the VCF file | +| `repeats/sample/{sample}/{sample}_sorted.vcf.gz` | VCF file with called repeats for a sample | +| `repeats/sample/{sample}/{sample}_sorted.vcf.gz.tbi` | Index of the VCF file | +| `repeats/sample/{sample}/{sample}_spanning_sorted.bam` | BAM file with sorted spanning reads | +| `repeats/sample/{sample}/{sample}_spanning_sorted.bai` | Index of the BAM file | [Stranger](https://github.com/Clinical-Genomics/stranger) is used to annotate repeats. 
-| Path | Description | -| ---------------------------------------------------------------------------- | ------------------------------------- | -| `repeat_expansions/{family}/{family}_repeat_expansions_annotated.vcf.gz` | Merged, annotated VCF file per family | -| `repeat_expansions/{family}/{family}_repeat_expansions_annotated.vcf.gz.tbi` | Index of the VCF file | +| Path | Description | +| ----------------------------------------------------------------------------------- | ------------------------------------- | +| `repeat_expansions/family/{family}/{family}_repeat_expansions_annotated.vcf.gz` | Merged, annotated VCF file per family | +| `repeat_expansions/family/{family}/{family}_repeat_expansions_annotated.vcf.gz.tbi` | Index of the VCF file | ### SNVs @@ -189,11 +186,11 @@ If the pipeline is run with phasing, the aligned reads will be happlotagged usin | Path | Description | | --------------------------------------------------------------------- | --------------------------------------------------------------------------- | -| `snvs/single_sample/{sample}/{sample}_snv.vcf.gz` | VCF file containing called variants with alternative genotypes for a sample | -| `snvs/single_sample/{sample}/{sample}_snv.vcf.gz.tbi` | Index of the corresponding VCF file | -| `snvs/multi_sample/{project}/{project}_snv.vcf.gz` | VCF file containing called variants for all samples | -| `snvs/multi_sample/{project}/{project}_snv.vcf.gz.tbi` | Index of the corresponding VCF file | -| `snvs/stats/single_sample/*.stats.txt` | Variant statistics | +| `snvs/sample/{sample}/{sample}_snv.vcf.gz` | VCF file containing called variants with alternative genotypes for a sample | +| `snvs/sample/{sample}/{sample}_snv.vcf.gz.tbi` | Index of the corresponding VCF file | +| `snvs/family/{family}/{family}_snv.vcf.gz` | VCF file containing called variants for all samples | +| `snvs/family/{family}/{family}_snv.vcf.gz.tbi` | Index of the corresponding VCF file | +| `snvs/stats/sample/*.stats.txt` | 
Variant statistics | | `qc/deepvariant_vcfstatsreport/{sample}/${sample}.visual_report.html` | Visual report of SNV calls from DeepVariant | [echtvar](https://github.com/brentp/echtvar) and [VEP](https://www.ensembl.org/vep) are used for annotating SNVs, while [CADD](https://cadd.gs.washington.edu/) is used to annotate INDELs with CADD scores. @@ -202,22 +199,21 @@ If the pipeline is run with phasing, the aligned reads will be happlotagged usin Variants are only output without ranking if that subworkflows are turned off. -| Path | Description | -| ---------------------------------------------------------------- | ------------------------------------------------------------------------------ | -| `databases/echtvar/encode/{project}/*.zip` | Database with allele frequency (AF) and allele count (AC) for all samples | -| `snvs/single_sample/{sample}/{sample}_snv_annotated.vcf.gz` | VCF file containing annotated variants with alternative genotypes for a sample | -| `snvs/single_sample/{sample}/{sample}_snv_annotated.vcf.gz.tbi` | Index of the annotated VCF file | -| `snvs/multi_sample/{project}/{project}_snv_annotated.vcf.gz` | VCF file containing annotated variants for all samples | -| `snvs/multi_sample/{project}/{project}_snv_annotated.vcf.gz.tbi` | Index of the annotated VCF file | +| Path | Description | +| -------------------------------------------------------- | ------------------------------------------------------------------------------ | +| `snvs/sample/{sample}/{sample}_snv_annotated.vcf.gz` | VCF file containing annotated variants with alternative genotypes for a sample | +| `snvs/sample/{sample}/{sample}_snv_annotated.vcf.gz.tbi` | Index of the annotated VCF file | +| `snvs/family/{family}/{family}_snv_annotated.vcf.gz` | VCF file containing annotated variants per family | +| `snvs/family/{family}/{family}_snv_annotated.vcf.gz.tbi` | Index of the annotated VCF file | [GENMOD](https://github.com/Clinical-Genomics/genmod) is used to rank the annotated SNVs 
and INDELs. -| Path | Description | -| ----------------------------------------------------------------------- | ----------------------------------------------------------- | -| `snvs/single_sample/{sample}/{sample}_snv_annotated_ranked.vcf.gz` | VCF file with annotated and ranked variants for a sample | -| `snvs/single_sample/{sample}/{sample}_snv_annotated_ranked.vcf.gz.tbi` | Index of the ranked VCF file | -| `snvs/multi_sample/{project}/{project}_snv_annotated_ranked.vcf.gz` | VCF file with annotated and ranked variants for all samples | -| `snvs/multi_sample/{project}/{project}_snv_annotated_ranked.vcf.gz.tbi` | Index of the ranked VCF file | +| Path | Description | +| --------------------------------------------------------------- | -------------------------------------------------------- | +| `snvs/sample/{sample}/{sample}_snv_annotated_ranked.vcf.gz` | VCF file with annotated and ranked variants for a sample | +| `snvs/sample/{sample}/{sample}_snv_annotated_ranked.vcf.gz.tbi` | Index of the ranked VCF file | +| `snvs/family/{family}/{family}_snv_annotated_ranked.vcf.gz` | VCF file with annotated and ranked variants per family | +| `snvs/family/{family}/{family}_snv_annotated_ranked.vcf.gz.tbi` | Index of the ranked VCF file | ### SVs (and CNVs) @@ -242,10 +238,10 @@ If the pipeline is run with phasing, the aligned reads will be happlotagged usin | `svs/family/{family_id}/{family_id}_cnvs_svs_merged.vcf.gz.tbi` | Index of the merged VCF file | | `svs/family/{family_id}/{family_id}_svs_merged.vcf.gz` | VCF file with merged SVs per family (output if CNV-calling is off) | | `svs/family/{family_id}/{family_id}_svs_merged.vcf.gz.tbi` | Index of the merged VCF file | -| `svs/single_sample/{sample}/{sample}_cnvs.vcf.gz` | VCF file with CNVs per sample | -| `svs/single_sample/{sample}/{sample}_cnvs.vcf.gz.tbi` | VCF file with CNVs per sample | -| `svs/single_sample/{sample}/{sample}_svs.vcf.gz` | VCF file with SVs per sample | -| 
`svs/single_sample/{sample}/{sample}_svs.vcf.gz.tbi` | VCF file with SVs per sample | +| `svs/sample/{sample}/{sample}_cnvs.vcf.gz` | VCF file with CNVs per sample | +| `svs/sample/{sample}/{sample}_cnvs.vcf.gz.tbi` | VCF file with CNVs per sample | +| `svs/sample/{sample}/{sample}_svs.vcf.gz` | VCF file with SVs per sample | +| `svs/sample/{sample}/{sample}_svs.vcf.gz.tbi` | VCF file with SVs per sample | [SVDB](https://github.com/J35P312/SVDB) and [VEP](https://www.ensembl.org/vep) are used to annotate structural variants. @@ -269,8 +265,8 @@ If the pipeline is run with phasing, the aligned reads will be happlotagged usin [HiFiCNV](https://github.com/PacificBiosciences/HiFiCNV) is used to call CNVs, but it also produces copy number, depth, and MAF tracks that can be visualized in for example IGV. -| Path | Description | -| --------------------------------------------------- | ----------------------------------------- | -|  `visualization_tracks/{sample}/*.copynum.bedgraph` | Copy number in bedgraph format | -| `visualization_tracks/{sample}/*.depth.bw` | Depth track in BigWig format | -| `visualization_tracks/{sample}/*.maf.bw` | Minor allele frequencies in BigWig format | +| Path | Description | +| -------------------------------------------------- | ----------------------------------------- | +| `visualization_tracks/{sample}/*.copynum.bedgraph` | Copy number in bedgraph format | +| `visualization_tracks/{sample}/*.depth.bw` | Depth track in BigWig format | +| `visualization_tracks/{sample}/*.maf.bw` | Minor allele frequencies in BigWig format | diff --git a/samplesheet_multisample_bam_ont.csv b/samplesheet_multisample_bam_ont.csv new file mode 100644 index 00000000..ef6d8488 --- /dev/null +++ b/samplesheet_multisample_bam_ont.csv @@ -0,0 +1,4 @@ +project,sample,file,family_id,paternal_id,maternal_id,sex,phenotype +test,HG002_ONT_A,https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/nallo/testdata/HG002_ONT.bam,FAM1,0,0,1,2 
+test,HG002_ONT_B,https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/nallo/testdata/HG002_ONT.bam,FAM2,0,0,2,2 +test,HG002_ONT_B,https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/nallo/testdata/HG002_ONT_copy.bam,FAM2,0,0,1,2 diff --git a/subworkflows/local/rank_variants/tests/main.nf.test b/subworkflows/local/rank_variants/tests/main.nf.test index 4be209d1..aba1c085 100644 --- a/subworkflows/local/rank_variants/tests/main.nf.test +++ b/subworkflows/local/rank_variants/tests/main.nf.test @@ -60,7 +60,7 @@ nextflow_workflow { script "../../snv_annotation/main.nf" process { """ - input[0] = SHORT_VARIANT_CALLING.out.combined_bcf + input[0] = SHORT_VARIANT_CALLING.out.family_bcf input[1] = [ file(params.pipelines_testdata_base_path + 'reference/cadd.v1.6.hg38.test_data.zip', checkIfExists: true) ] diff --git a/subworkflows/local/short_variant_calling/main.nf b/subworkflows/local/short_variant_calling/main.nf index d7183dca..d23d2887 100644 --- a/subworkflows/local/short_variant_calling/main.nf +++ b/subworkflows/local/short_variant_calling/main.nf @@ -61,22 +61,12 @@ workflow SHORT_VARIANT_CALLING { BCFTOOLS_NORM_SINGLESAMPLE ( BCFTOOLS_CONCAT.out.vcf.map { meta, vcf -> [ meta, vcf, [] ] }, ch_fasta ) ch_versions = ch_versions.mix(BCFTOOLS_NORM_SINGLESAMPLE.out.versions) - // This creates a multisample VCF, with regions from ONE bed file + // This creates one multisample VCF per family, with regions from ONE bed file DEEPVARIANT_RUNDEEPVARIANT.out.gvcf .map { meta, gvcf -> - [ meta.region.name, meta.project, meta.phenotype == 2, gvcf ] - } - .groupTuple() // Group all files together per region - // If any of the samples in the VCF have an affected phenotype (2) - // add this to the meta of the multisample VCF to know if we should run RANK_VARIANTS or not - .map { meta, project, affected, gvcfs -> - new_meta = [ - 'id': meta, - 'project': project.first(), // Works only because only one project per run is allowed - 
'contains_affected': affected.any(), - ] - [ new_meta, gvcfs ] + [ [ id:meta.region.name, family_id:meta.family_id ], gvcf ] } + .groupTuple() // Group files from the same family together per region .set{ glnexus_in } GLNEXUS( glnexus_in, ch_bed ) @@ -107,8 +97,8 @@ workflow SHORT_VARIANT_CALLING { emit: snp_calls_vcf = BCFTOOLS_NORM_SINGLESAMPLE.out.vcf // channel: [ val(meta), path(vcf) ] snp_calls_tbi = BCFTOOLS_NORM_SINGLESAMPLE.out.tbi // channel: [ val(meta), path(tbi) ] - combined_bcf = BCFTOOLS_NORM_MULTISAMPLE.out.vcf // channel: [ val(meta), path(bcf) ] - combined_csi = BCFTOOLS_NORM_MULTISAMPLE.out.csi // channel: [ val(meta), path(csi) ] + family_bcf = BCFTOOLS_NORM_MULTISAMPLE.out.vcf // channel: [ val(meta), path(bcf) ] + family_csi = BCFTOOLS_NORM_MULTISAMPLE.out.csi // channel: [ val(meta), path(csi) ] vcfstatsreport = DEEPVARIANT_VCFSTATSREPORT.out.report // channel: [ val(meta), path(html) ] versions = ch_versions // channel: [ path(versions.yml) ] } diff --git a/subworkflows/local/snv_annotation/tests/main.nf.test b/subworkflows/local/snv_annotation/tests/main.nf.test index dd5ee5ef..6cf9e390 100644 --- a/subworkflows/local/snv_annotation/tests/main.nf.test +++ b/subworkflows/local/snv_annotation/tests/main.nf.test @@ -82,7 +82,7 @@ nextflow_workflow { when { workflow { """ - input[0] = SHORT_VARIANT_CALLING.out.combined_bcf + input[0] = SHORT_VARIANT_CALLING.out.family_bcf input[1] = [ file(params.pipelines_testdata_base_path + 'reference/cadd.v1.6.hg38.test_data.zip', checkIfExists: true) ] @@ -124,7 +124,7 @@ nextflow_workflow { } workflow { """ - input[0] = SHORT_VARIANT_CALLING.out.combined_bcf + input[0] = SHORT_VARIANT_CALLING.out.family_bcf input[1] = [ file(params.pipelines_testdata_base_path + 'reference/cadd.v1.6.hg38.test_data.zip', checkIfExists: true) ] diff --git a/subworkflows/local/utils_nfcore_nallo_pipeline/main.nf b/subworkflows/local/utils_nfcore_nallo_pipeline/main.nf index 0c4308ec..2ac3b504 100644 --- 
a/subworkflows/local/utils_nfcore_nallo_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_nallo_pipeline/main.nf @@ -161,6 +161,7 @@ workflow PIPELINE_INITIALISATION { UTILS_NFCORE_PIPELINE ( nextflow_cli_args ) + // // Custom validation for pipeline parameters // @@ -190,25 +191,11 @@ workflow PIPELINE_INITIALISATION { } .set { ch_samplesheet } - // Check that there's samples with affected phenotype if we are ranking variants - ch_samplesheet - .filter { meta, reads -> meta.phenotype == 2 } - .ifEmpty { - if(!params.skip_rank_variants) { - error("No samples in samplesheet has affected phenotype (=2), --skip_rank_variants has to be active.") - } - } + // Check that all families have at least one sample with affected phenotype if ranking is active + validateAllFamiliesHasAffectedSamples(ch_samplesheet, params) // Check that there's no more than one project - // TODO: Try to do this in nf-schema - ch_samplesheet - .map { meta, reads -> meta.project } - .unique() - .collect() - .filter{ it.size() == 1 } - .ifEmpty { - error("Only one project may be specified per run") - } + validateSingleProjectPerRun(ch_samplesheet) emit: samplesheet = ch_samplesheet @@ -658,3 +645,40 @@ def validatePacBioLicense() { error "ERROR: The HiPhase license only permits analysis of data from PacBio." } } + +// Genmod within RANK_VARIANTS requires affected individuals in the samplesheet. +// This is a convenience function to fail early if there are families without affected individuals. 
+def validateAllFamiliesHasAffectedSamples(ch_samplesheet, params) { + + if (params.skip_rank_variants) { + return + } + + def familiesWithPhenotypes = ch_samplesheet + .map { meta, reads -> [ meta.family_id, meta.phenotype ] } + .groupTuple() + + def familiesWithoutAffected = familiesWithPhenotypes + .filter { family, phenotype -> !phenotype.contains(2) } + + familiesWithoutAffected + .map { family, phenotype -> family } + .collect() + .subscribe { familyList -> + if (familyList) { + error("ERROR: No samples in families: ${familyList.join(", ")} have affected phenotype (=2); --skip_rank_variants has to be active.") + } + } +} + +def validateSingleProjectPerRun(ch_samplesheet) { + // Fail unless exactly one project is present; channel contents are unknown when this function is called, so the check must be a channel operator + ch_samplesheet + .map { meta, reads -> meta.project } + .unique() + .collect() + .filter{ it.size() == 1 } + .ifEmpty { + error("Only one project may be specified per run") + } +} diff --git a/tests/samplesheet.nf.test.snap b/tests/samplesheet.nf.test.snap index e1c5ce90..0060fe6c 100644 --- a/tests/samplesheet.nf.test.snap +++ b/tests/samplesheet.nf.test.snap @@ -1,7 +1,7 @@ { "test profile": { "content": [ - 113, + 111, { "ADD_FOUND_IN_TAG": { "bcftools": 1.2, @@ -81,9 +81,6 @@ "ECHTVAR_ANNO": { "echtvar": "0.2.0" }, - "ECHTVAR_ENCODE": { - "echtvar": "0.2.0" - }, "ENSEMBLVEP_SNV": { "ensemblvep": 110.0 }, @@ -184,10 +181,6 @@ "SOMALIER_EXTRACT": { "somalier": "0.2.18" }, - "SOMALIER_PED": { - "create_pedigree_file": 1.0, - "python": "3.8.3" - }, "SOMALIER_PED_FAMILY": { "create_pedigree_file": 1.0, "python": "3.8.3" @@ -276,11 +269,6 @@ "assembly_variant_calling/dipcall/HG002_Revio/HG002_Revio.hap2.sam.gz", "assembly_variant_calling/dipcall/HG002_Revio/HG002_Revio.hap2.var.gz", "assembly_variant_calling/dipcall/HG002_Revio/HG002_Revio.pair.vcf.gz", - "databases", - "databases/echtvar", - "databases/echtvar/encode", - "databases/echtvar/encode/test", - "databases/echtvar/encode/test/test.zip", "methylation", "methylation/modkit", "methylation/modkit/pileup", @@ 
-427,8 +415,6 @@ "pedigree", "pedigree/family", "pedigree/family/FAM.ped", - "pedigree/project", - "pedigree/project/test.ped", "phased_variants", "phased_variants/HG002_Revio", "phased_variants/HG002_Revio/HG002_Revio_phased.vcf.gz", @@ -481,28 +467,30 @@ "repeats/sample/HG002_Revio/HG002_Revio_spanning_sorted.bam", "repeats/sample/HG002_Revio/HG002_Revio_spanning_sorted.bam.bai", "snvs", - "snvs/multi_sample", - "snvs/multi_sample/test", - "snvs/multi_sample/test/test_snv_annotated_ranked.vcf.gz", - "snvs/multi_sample/test/test_snv_annotated_ranked.vcf.gz.tbi", - "snvs/single_sample", - "snvs/single_sample/HG002_Revio", - "snvs/single_sample/HG002_Revio/HG002_Revio_snv_annotated_ranked.vcf.gz", - "snvs/single_sample/HG002_Revio/HG002_Revio_snv_annotated_ranked.vcf.gz.tbi", + "snvs/family", + "snvs/family/FAM", + "snvs/family/FAM/FAM_snv_annotated_ranked.vcf.gz", + "snvs/family/FAM/FAM_snv_annotated_ranked.vcf.gz.tbi", + "snvs/sample", + "snvs/sample/HG002_Revio", + "snvs/sample/HG002_Revio/HG002_Revio_snv_annotated_ranked.vcf.gz", + "snvs/sample/HG002_Revio/HG002_Revio_snv_annotated_ranked.vcf.gz.tbi", "snvs/stats", - "snvs/stats/single_sample", - "snvs/stats/single_sample/HG002_Revio.vcf.gz.bcftools_stats.txt", + "snvs/stats/sample", + "snvs/stats/sample/HG002_Revio.vcf.gz.bcftools_stats.txt", "svs", "svs/family", "svs/family/FAM", "svs/family/FAM/FAM_svs_cnvs_merged_annotated_ranked.vcf.gz", "svs/family/FAM/FAM_svs_cnvs_merged_annotated_ranked.vcf.gz.tbi", + "svs/sample", + "svs/sample/HG002_Revio", + "svs/sample/HG002_Revio/HG002_Revio_svs.vcf.gz", + "svs/sample/HG002_Revio/HG002_Revio_svs.vcf.gz.tbi", "svs/single_sample", "svs/single_sample/HG002_Revio", "svs/single_sample/HG002_Revio/HG002_Revio_cnvs.vcf.gz", "svs/single_sample/HG002_Revio/HG002_Revio_cnvs.vcf.gz.tbi", - "svs/single_sample/HG002_Revio/HG002_Revio_svs.vcf.gz", - "svs/single_sample/HG002_Revio/HG002_Revio_svs.vcf.gz.tbi", "visualization_tracks", "visualization_tracks/HG002_Revio", 
"visualization_tracks/HG002_Revio/HG002_Revio_hificnv.copynum.bedgraph", @@ -529,7 +517,6 @@ "HG002_Revio.hap2.sam.gz:md5,6d512a060c74428f7758aa51a99ae8c8", "HG002_Revio.hap2.var.gz:md5,9ba9303b30730e419138e177c7d1e0c2", "HG002_Revio.pair.vcf.gz:md5,1909dffa43850282fe7e5ae2b2d273a7", - "test.zip:md5,a0abe28a72b11b68126040dd38fa8e37", "HG002_Revio_modkit_pileup_1.bed.gz:md5,ac9bfc455d0b697a4bd565b510c155e0", "HG002_Revio_modkit_pileup_1.bed.gz.tbi:md5,5a9ac375a0ce33af002e0073635a3425", "HG002_Revio_modkit_pileup_2.bed.gz:md5,b2c6c3fc27d34b7c588a4a4b3e9611b0", @@ -563,7 +550,6 @@ "somalier_stats.txt:md5,150fa6b9c197c539947168c11d2115f9", "whatshap-stats-table.txt:md5,c89df1fe6abe77dcbfce8e443c1cecde", "FAM.ped:md5,bd5cec27ba7337a85cf98e787131e2b5", - "test.ped:md5,bd5cec27ba7337a85cf98e787131e2b5", "HG002_Revio_cramino_aligned_phased.arrow:md5,72df2934ff8aa7e1bf8cf8a4881a0d2a", "HG002_Revio_cramino_aligned.arrow:md5,72df2934ff8aa7e1bf8cf8a4881a0d2a", "HG002_Revio.mosdepth.global.dist.txt:md5,6186315d4d65eda85553af82a98829d1", @@ -625,7 +611,7 @@ "VcfFile [chromosomes=[chr20], sampleCount=1, variantCount=1, phased=false, phasedAutodetect=false]" ], [ - "test_snv_annotated_ranked.vcf.gz", + "FAM_snv_annotated_ranked.vcf.gz", "VcfFile [chromosomes=[chrX, chr16], sampleCount=1, variantCount=103, phased=false, phasedAutodetect=false]" ], [ @@ -636,13 +622,13 @@ "FAM_svs_cnvs_merged_annotated_ranked.vcf.gz", "VcfFile [chromosomes=[chrX, chr16], sampleCount=1, variantCount=87, phased=false, phasedAutodetect=false]" ], - [ - "HG002_Revio_cnvs.vcf.gz", - "VcfFile [chromosomes=[chrX, chr16, chr20], sampleCount=1, variantCount=36, phased=false, phasedAutodetect=false]" - ], [ "HG002_Revio_svs.vcf.gz", "VcfFile [chromosomes=[chrX, chr16], sampleCount=1, variantCount=55, phased=false, phasedAutodetect=false]" + ], + [ + "HG002_Revio_cnvs.vcf.gz", + "VcfFile [chromosomes=[chrX, chr16, chr20], sampleCount=1, variantCount=36, phased=false, phasedAutodetect=false]" ] ], [ @@ -668,6 
+654,6 @@ "nf-test": "0.9.0", "nextflow": "24.04.4" }, - "timestamp": "2024-11-07T12:46:21.477060114" + "timestamp": "2024-11-12T16:46:53.528609548" } } \ No newline at end of file diff --git a/tests/samplesheet_multisample_bam.nf.test.snap b/tests/samplesheet_multisample_bam.nf.test.snap index 0df96a6b..ebbae97f 100644 --- a/tests/samplesheet_multisample_bam.nf.test.snap +++ b/tests/samplesheet_multisample_bam.nf.test.snap @@ -1,7 +1,7 @@ { "samplesheet_multisample_bam | --phaser hiphase": { "content": [ - 156, + 154, { "ADD_FOUND_IN_TAG": { "bcftools": 1.2, @@ -81,9 +81,6 @@ "ECHTVAR_ANNO": { "echtvar": "0.2.0" }, - "ECHTVAR_ENCODE": { - "echtvar": "0.2.0" - }, "ENSEMBLVEP_SNV": { "ensemblvep": 110.0 }, @@ -181,10 +178,6 @@ "SOMALIER_EXTRACT": { "somalier": "0.2.18" }, - "SOMALIER_PED": { - "create_pedigree_file": 1.0, - "python": "3.8.3" - }, "SOMALIER_PED_FAMILY": { "create_pedigree_file": 1.0, "python": "3.8.3" @@ -294,11 +287,6 @@ "assembly_variant_calling/dipcall/HG002_Revio_B/HG002_Revio_B.hap2.sam.gz", "assembly_variant_calling/dipcall/HG002_Revio_B/HG002_Revio_B.hap2.var.gz", "assembly_variant_calling/dipcall/HG002_Revio_B/HG002_Revio_B.pair.vcf.gz", - "databases", - "databases/echtvar", - "databases/echtvar/encode", - "databases/echtvar/encode/test", - "databases/echtvar/encode/test/test.zip", "methylation", "methylation/modkit", "methylation/modkit/pileup", @@ -471,8 +459,6 @@ "pedigree", "pedigree/family", "pedigree/family/FAM.ped", - "pedigree/project", - "pedigree/project/test.ped", "phased_variants", "phased_variants/HG002_Revio_A", "phased_variants/HG002_Revio_A/HG002_Revio_A_phased.blocks.tsv", @@ -559,37 +545,40 @@ "repeats/sample/HG002_Revio_B/HG002_Revio_B_spanning_sorted.bam", "repeats/sample/HG002_Revio_B/HG002_Revio_B_spanning_sorted.bam.bai", "snvs", - "snvs/multi_sample", - "snvs/multi_sample/test", - "snvs/multi_sample/test/test_snv_annotated_ranked.vcf.gz", - "snvs/multi_sample/test/test_snv_annotated_ranked.vcf.gz.tbi", - 
"snvs/single_sample", - "snvs/single_sample/HG002_Revio_A", - "snvs/single_sample/HG002_Revio_A/HG002_Revio_A_snv_annotated_ranked.vcf.gz", - "snvs/single_sample/HG002_Revio_A/HG002_Revio_A_snv_annotated_ranked.vcf.gz.tbi", - "snvs/single_sample/HG002_Revio_B", - "snvs/single_sample/HG002_Revio_B/HG002_Revio_B_snv_annotated_ranked.vcf.gz", - "snvs/single_sample/HG002_Revio_B/HG002_Revio_B_snv_annotated_ranked.vcf.gz.tbi", + "snvs/family", + "snvs/family/FAM", + "snvs/family/FAM/FAM_snv_annotated_ranked.vcf.gz", + "snvs/family/FAM/FAM_snv_annotated_ranked.vcf.gz.tbi", + "snvs/sample", + "snvs/sample/HG002_Revio_A", + "snvs/sample/HG002_Revio_A/HG002_Revio_A_snv_annotated_ranked.vcf.gz", + "snvs/sample/HG002_Revio_A/HG002_Revio_A_snv_annotated_ranked.vcf.gz.tbi", + "snvs/sample/HG002_Revio_B", + "snvs/sample/HG002_Revio_B/HG002_Revio_B_snv_annotated_ranked.vcf.gz", + "snvs/sample/HG002_Revio_B/HG002_Revio_B_snv_annotated_ranked.vcf.gz.tbi", "snvs/stats", - "snvs/stats/single_sample", - "snvs/stats/single_sample/HG002_Revio_A.vcf.gz.bcftools_stats.txt", - "snvs/stats/single_sample/HG002_Revio_B.vcf.gz.bcftools_stats.txt", + "snvs/stats/sample", + "snvs/stats/sample/HG002_Revio_A.vcf.gz.bcftools_stats.txt", + "snvs/stats/sample/HG002_Revio_B.vcf.gz.bcftools_stats.txt", "svs", "svs/family", "svs/family/FAM", "svs/family/FAM/FAM_svs_cnvs_merged_annotated_ranked.vcf.gz", "svs/family/FAM/FAM_svs_cnvs_merged_annotated_ranked.vcf.gz.tbi", + "svs/sample", + "svs/sample/HG002_Revio_A", + "svs/sample/HG002_Revio_A/HG002_Revio_A_svs.vcf.gz", + "svs/sample/HG002_Revio_A/HG002_Revio_A_svs.vcf.gz.tbi", + "svs/sample/HG002_Revio_B", + "svs/sample/HG002_Revio_B/HG002_Revio_B_svs.vcf.gz", + "svs/sample/HG002_Revio_B/HG002_Revio_B_svs.vcf.gz.tbi", "svs/single_sample", "svs/single_sample/HG002_Revio_A", "svs/single_sample/HG002_Revio_A/HG002_Revio_A_cnvs.vcf.gz", "svs/single_sample/HG002_Revio_A/HG002_Revio_A_cnvs.vcf.gz.tbi", - 
"svs/single_sample/HG002_Revio_A/HG002_Revio_A_svs.vcf.gz", - "svs/single_sample/HG002_Revio_A/HG002_Revio_A_svs.vcf.gz.tbi", "svs/single_sample/HG002_Revio_B", "svs/single_sample/HG002_Revio_B/HG002_Revio_B_cnvs.vcf.gz", "svs/single_sample/HG002_Revio_B/HG002_Revio_B_cnvs.vcf.gz.tbi", - "svs/single_sample/HG002_Revio_B/HG002_Revio_B_svs.vcf.gz", - "svs/single_sample/HG002_Revio_B/HG002_Revio_B_svs.vcf.gz.tbi", "visualization_tracks", "visualization_tracks/HG002_Revio_A", "visualization_tracks/HG002_Revio_A/HG002_Revio_A_hificnv.copynum.bedgraph", @@ -639,7 +628,6 @@ "HG002_Revio_B.hap2.sam.gz:md5,ecdf9b9d50b54776d0555c71a194af7e", "HG002_Revio_B.hap2.var.gz:md5,f09b01fcb43bf64f92ca7ec8a2380c25", "HG002_Revio_B.pair.vcf.gz:md5,676492865a9d4765835638d52651af73", - "test.zip:md5,d3fc0769a4c707953cdb3fade9450660", "HG002_Revio_A_modkit_pileup_1.bed.gz:md5,7af3b6246d0c007aec686714b96a0f7f", "HG002_Revio_A_modkit_pileup_1.bed.gz.tbi:md5,b7083ebf6ba176ed5a472ad653c5be27", "HG002_Revio_A_modkit_pileup_2.bed.gz:md5,a2e1291b468361412899cf811022fe72", @@ -680,7 +668,6 @@ "somalier_stats.txt:md5,5f523bdaa8f6b6bf1581694564521891", "whatshap-stats-table.txt:md5,862f90185133703811c955c45f5750fa", "FAM.ped:md5,24d8694d580f782ed77d4d1b5c6f6fb4", - "test.ped:md5,24d8694d580f782ed77d4d1b5c6f6fb4", "HG002_Revio_A_phased.blocks.tsv:md5,27ce044ba581da15ef838cbb343a64cf", "HG002_Revio_A_phased.stats.tsv:md5,58ae1b01d01b922d022ad025a81f1026", "HG002_Revio_A_phased.summary.tsv:md5,714496292cba53669081ac3be3078c28", @@ -800,7 +787,7 @@ "VcfFile [chromosomes=[chr20], sampleCount=1, variantCount=1, phased=false, phasedAutodetect=false]" ], [ - "test_snv_annotated_ranked.vcf.gz", + "FAM_snv_annotated_ranked.vcf.gz", "VcfFile [chromosomes=[chrX, chr16], sampleCount=2, variantCount=104, phased=false, phasedAutodetect=false]" ], [ @@ -816,20 +803,20 @@ "VcfFile [chromosomes=[chrX, chr16], sampleCount=2, variantCount=87, phased=false, phasedAutodetect=false]" ], [ - "HG002_Revio_A_cnvs.vcf.gz", - 
"VcfFile [chromosomes=[chrX, chr16, chr20], sampleCount=1, variantCount=36, phased=false, phasedAutodetect=false]" + "HG002_Revio_A_svs.vcf.gz", + "VcfFile [chromosomes=[chrX, chr16], sampleCount=1, variantCount=55, phased=false, phasedAutodetect=false]" ], [ - "HG002_Revio_A_svs.vcf.gz", + "HG002_Revio_B_svs.vcf.gz", "VcfFile [chromosomes=[chrX, chr16], sampleCount=1, variantCount=55, phased=false, phasedAutodetect=false]" ], [ - "HG002_Revio_B_cnvs.vcf.gz", + "HG002_Revio_A_cnvs.vcf.gz", "VcfFile [chromosomes=[chrX, chr16, chr20], sampleCount=1, variantCount=36, phased=false, phasedAutodetect=false]" ], [ - "HG002_Revio_B_svs.vcf.gz", - "VcfFile [chromosomes=[chrX, chr16], sampleCount=1, variantCount=55, phased=false, phasedAutodetect=false]" + "HG002_Revio_B_cnvs.vcf.gz", + "VcfFile [chromosomes=[chrX, chr16, chr20], sampleCount=1, variantCount=36, phased=false, phasedAutodetect=false]" ] ], [ @@ -867,6 +854,6 @@ "nf-test": "0.9.0", "nextflow": "24.04.4" }, - "timestamp": "2024-11-07T12:48:14.873817687" + "timestamp": "2024-11-12T16:48:50.801407496" } } \ No newline at end of file diff --git a/tests/samplesheet_multisample_ont_bam.nf.test.snap b/tests/samplesheet_multisample_ont_bam.nf.test.snap index 099f1b81..e5fca20e 100644 --- a/tests/samplesheet_multisample_ont_bam.nf.test.snap +++ b/tests/samplesheet_multisample_ont_bam.nf.test.snap @@ -1,7 +1,7 @@ { "samplesheet_multisample_ont_bam | --preset ONT_R10 --phaser whatshap --parallel_alignments 1 --parallel_snv 1": { "content": [ - 105, + 103, { "ADD_FOUND_IN_TAG": { "bcftools": 1.2, @@ -69,9 +69,6 @@ "ECHTVAR_ANNO": { "echtvar": "0.2.0" }, - "ECHTVAR_ENCODE": { - "echtvar": "0.2.0" - }, "ENSEMBLVEP_SNV": { "ensemblvep": 110.0 }, @@ -142,10 +139,6 @@ "SOMALIER_EXTRACT": { "somalier": "0.2.18" }, - "SOMALIER_PED": { - "create_pedigree_file": 1.0, - "python": "3.8.3" - }, "SOMALIER_PED_FAMILY": { "create_pedigree_file": 1.0, "python": "3.8.3" @@ -208,11 +201,6 @@ "aligned_reads/HG002_ONT_B", 
"aligned_reads/HG002_ONT_B/HG002_ONT_B_haplotagged.bam", "aligned_reads/HG002_ONT_B/HG002_ONT_B_haplotagged.bam.bai", - "databases", - "databases/echtvar", - "databases/echtvar/encode", - "databases/echtvar/encode/test", - "databases/echtvar/encode/test/test.zip", "methylation", "methylation/modkit", "methylation/modkit/pileup", @@ -385,8 +373,6 @@ "pedigree", "pedigree/family", "pedigree/family/FAM.ped", - "pedigree/project", - "pedigree/project/test.ped", "phased_variants", "phased_variants/HG002_ONT_A", "phased_variants/HG002_ONT_A/HG002_ONT_A_phased.vcf.gz", @@ -451,37 +437,40 @@ "qc/somalier/relate/test/test.pairs.tsv", "qc/somalier/relate/test/test.samples.tsv", "snvs", - "snvs/multi_sample", - "snvs/multi_sample/test", - "snvs/multi_sample/test/test_snv_annotated_ranked.vcf.gz", - "snvs/multi_sample/test/test_snv_annotated_ranked.vcf.gz.tbi", - "snvs/single_sample", - "snvs/single_sample/HG002_ONT_A", - "snvs/single_sample/HG002_ONT_A/HG002_ONT_A_snv_annotated_ranked.vcf.gz", - "snvs/single_sample/HG002_ONT_A/HG002_ONT_A_snv_annotated_ranked.vcf.gz.tbi", - "snvs/single_sample/HG002_ONT_B", - "snvs/single_sample/HG002_ONT_B/HG002_ONT_B_snv_annotated_ranked.vcf.gz", - "snvs/single_sample/HG002_ONT_B/HG002_ONT_B_snv_annotated_ranked.vcf.gz.tbi", + "snvs/family", + "snvs/family/FAM", + "snvs/family/FAM/FAM_snv_annotated_ranked.vcf.gz", + "snvs/family/FAM/FAM_snv_annotated_ranked.vcf.gz.tbi", + "snvs/sample", + "snvs/sample/HG002_ONT_A", + "snvs/sample/HG002_ONT_A/HG002_ONT_A_snv_annotated_ranked.vcf.gz", + "snvs/sample/HG002_ONT_A/HG002_ONT_A_snv_annotated_ranked.vcf.gz.tbi", + "snvs/sample/HG002_ONT_B", + "snvs/sample/HG002_ONT_B/HG002_ONT_B_snv_annotated_ranked.vcf.gz", + "snvs/sample/HG002_ONT_B/HG002_ONT_B_snv_annotated_ranked.vcf.gz.tbi", "snvs/stats", - "snvs/stats/single_sample", - "snvs/stats/single_sample/HG002_ONT_A.vcf.gz.bcftools_stats.txt", - "snvs/stats/single_sample/HG002_ONT_B.vcf.gz.bcftools_stats.txt", + "snvs/stats/sample", + 
"snvs/stats/sample/HG002_ONT_A.vcf.gz.bcftools_stats.txt", + "snvs/stats/sample/HG002_ONT_B.vcf.gz.bcftools_stats.txt", "svs", "svs/family", "svs/family/FAM", "svs/family/FAM/FAM_svs_cnvs_merged_annotated_ranked.vcf.gz", "svs/family/FAM/FAM_svs_cnvs_merged_annotated_ranked.vcf.gz.tbi", + "svs/sample", + "svs/sample/HG002_ONT_A", + "svs/sample/HG002_ONT_A/HG002_ONT_A_svs.vcf.gz", + "svs/sample/HG002_ONT_A/HG002_ONT_A_svs.vcf.gz.tbi", + "svs/sample/HG002_ONT_B", + "svs/sample/HG002_ONT_B/HG002_ONT_B_svs.vcf.gz", + "svs/sample/HG002_ONT_B/HG002_ONT_B_svs.vcf.gz.tbi", "svs/single_sample", "svs/single_sample/HG002_ONT_A", "svs/single_sample/HG002_ONT_A/HG002_ONT_A_cnvs.vcf.gz", "svs/single_sample/HG002_ONT_A/HG002_ONT_A_cnvs.vcf.gz.tbi", - "svs/single_sample/HG002_ONT_A/HG002_ONT_A_svs.vcf.gz", - "svs/single_sample/HG002_ONT_A/HG002_ONT_A_svs.vcf.gz.tbi", "svs/single_sample/HG002_ONT_B", "svs/single_sample/HG002_ONT_B/HG002_ONT_B_cnvs.vcf.gz", "svs/single_sample/HG002_ONT_B/HG002_ONT_B_cnvs.vcf.gz.tbi", - "svs/single_sample/HG002_ONT_B/HG002_ONT_B_svs.vcf.gz", - "svs/single_sample/HG002_ONT_B/HG002_ONT_B_svs.vcf.gz.tbi", "visualization_tracks", "visualization_tracks/HG002_ONT_A", "visualization_tracks/HG002_ONT_A/HG002_ONT_A_hificnv.copynum.bedgraph", @@ -493,7 +482,6 @@ "visualization_tracks/HG002_ONT_B/HG002_ONT_B_hificnv.maf.bw" ], [ - "test.zip:md5,779efd29ec2cc66bef05c173c44e4168", "HG002_ONT_A_modkit_pileup_1.bed.gz:md5,2be86a42ac0384ac6d27fdf6e65304b7", "HG002_ONT_A_modkit_pileup_1.bed.gz.tbi:md5,a7955eabc34456dce834a2c966071eaa", "HG002_ONT_A_modkit_pileup_2.bed.gz:md5,ec565bd66fc73d380b7f9eb768369a65", @@ -534,7 +522,6 @@ "somalier_stats.txt:md5,fec840d4fe202c0e3ff7ce94b2a3bf92", "whatshap-stats-table.txt:md5,d552daaaef1c92c3351ec2f4de64e6ad", "FAM.ped:md5,deb1ee6bd38d6e8f7cb92801d8a12f12", - "test.ped:md5,deb1ee6bd38d6e8f7cb92801d8a12f12", "HG002_ONT_A_cramino_aligned_phased.arrow:md5,d2a5c81595fa34925ab8f03078487d81", 
"HG002_ONT_B_cramino_aligned_phased.arrow:md5,61af72539e105cec79db7c9b78eb15a7", "HG002_ONT_A_cramino_aligned.arrow:md5,d2a5c81595fa34925ab8f03078487d81", @@ -596,7 +583,7 @@ "VcfFile [chromosomes=[chrX, chr16, chr20], sampleCount=1, variantCount=236, phased=false, phasedAutodetect=false]" ], [ - "test_snv_annotated_ranked.vcf.gz", + "FAM_snv_annotated_ranked.vcf.gz", "VcfFile [chromosomes=[chrX, chr16], sampleCount=2, variantCount=105, phased=false, phasedAutodetect=false]" ], [ @@ -612,20 +599,20 @@ "VcfFile [chromosomes=[chrX, chr16], sampleCount=2, variantCount=98, phased=false, phasedAutodetect=false]" ], [ - "HG002_ONT_A_cnvs.vcf.gz", - "VcfFile [chromosomes=[chrX, chr16, chr20], sampleCount=1, variantCount=30, phased=false, phasedAutodetect=false]" + "HG002_ONT_A_svs.vcf.gz", + "VcfFile [chromosomes=[chrX, chr16], sampleCount=1, variantCount=68, phased=false, phasedAutodetect=false]" ], [ - "HG002_ONT_A_svs.vcf.gz", + "HG002_ONT_B_svs.vcf.gz", "VcfFile [chromosomes=[chrX, chr16], sampleCount=1, variantCount=68, phased=false, phasedAutodetect=false]" ], [ - "HG002_ONT_B_cnvs.vcf.gz", - "VcfFile [chromosomes=[chrX, chr16, chr20], sampleCount=1, variantCount=31, phased=false, phasedAutodetect=false]" + "HG002_ONT_A_cnvs.vcf.gz", + "VcfFile [chromosomes=[chrX, chr16, chr20], sampleCount=1, variantCount=30, phased=false, phasedAutodetect=false]" ], [ - "HG002_ONT_B_svs.vcf.gz", - "VcfFile [chromosomes=[chrX, chr16], sampleCount=1, variantCount=68, phased=false, phasedAutodetect=false]" + "HG002_ONT_B_cnvs.vcf.gz", + "VcfFile [chromosomes=[chrX, chr16, chr20], sampleCount=1, variantCount=31, phased=false, phasedAutodetect=false]" ] ], [ @@ -636,6 +623,6 @@ "nf-test": "0.9.0", "nextflow": "24.04.4" }, - "timestamp": "2024-11-07T12:51:02.903461641" + "timestamp": "2024-11-12T16:50:40.322312965" } } \ No newline at end of file diff --git a/workflows/nallo.nf b/workflows/nallo.nf index 50061e57..0c551e54 100644 --- a/workflows/nallo.nf +++ b/workflows/nallo.nf @@ 
-41,7 +41,6 @@ include { SNV_ANNOTATION } from '../subworkflows/local/ include { CREATE_PEDIGREE_FILE as SAMPLESHEET_PED } from '../modules/local/create_pedigree_file/main' include { CREATE_PEDIGREE_FILE as SOMALIER_PED } from '../modules/local/create_pedigree_file/main' include { CREATE_PEDIGREE_FILE as SOMALIER_PED_FAMILY } from '../modules/local/create_pedigree_file/main' -include { ECHTVAR_ENCODE } from '../modules/local/echtvar/encode/main' include { SAMTOOLS_MERGE } from '../modules/nf-core/samtools/merge/main' // nf-core @@ -303,8 +302,8 @@ workflow NALLO { SHORT_VARIANT_CALLING( ch_snv_calling_in, fasta, fai, SCATTER_GENOME.out.bed, ch_par ) ch_versions = ch_versions.mix(SHORT_VARIANT_CALLING.out.versions) - SHORT_VARIANT_CALLING.out.combined_bcf - .join( SHORT_VARIANT_CALLING.out.combined_csi ) + SHORT_VARIANT_CALLING.out.family_bcf + .join( SHORT_VARIANT_CALLING.out.family_csi ) .set { ch_vcf_tbi_per_region } } @@ -313,9 +312,9 @@ workflow NALLO { // if(!params.skip_snv_annotation) { - // Annotates one multisample VCF per variant call region + // Annotates family VCFs per variant call region SNV_ANNOTATION( - SHORT_VARIANT_CALLING.out.combined_bcf, + SHORT_VARIANT_CALLING.out.family_bcf, ch_databases.map { meta, databases -> databases }.collect(), fasta, fai.map { name, fai -> [ [ id: name ], fai ] }, @@ -342,33 +341,33 @@ workflow NALLO { } // - // Ranks one multisample VCF per variant call region + // Ranks family VCFs per variant call region // Can only run if samplesheet has affected samples // if(!params.skip_rank_variants) { - // Create PED with updated sex per project (should perhaps be per SNV-calling region) - SOMALIER_PED ( + // Create PED with updated sex - per family + SOMALIER_PED_FAMILY ( bam - .map { meta, files -> [ [ id: meta.project ], meta ] } + .map { meta, files -> [ [ id: meta.family_id ], meta ] } .groupTuple() ) - ch_versions = ch_versions.mix(SOMALIER_PED.out.versions) - - SOMALIER_PED.out.ped - .collect() - .set { 
ch_updated_pedfile } + ch_versions = ch_versions.mix(SOMALIER_PED_FAMILY.out.versions) - // Give pedfile meta from variants + // Give PED file SNV meta so they can be joined later in the subworkflow. + // Since we don't always have matching number of ped files and call regions + // we need to combine and filter instead of join ANN_CSQ_PLI_SNV.out.vcf - .combine(ch_updated_pedfile.map { meta, ped -> ped } ) - .map { meta, vcf, ped -> [ meta, ped ] } - .set { rank_snvs_ped_in } + .map { meta, vcf -> [ [ id:meta.family_id ], meta ] } + .combine ( SOMALIER_PED_FAMILY.out.ped ) + .filter { family_id_snv, meta, family_id_ped, ped -> family_id_snv == family_id_ped } + .map { family_id_snv, meta, family_id_ped, ped -> [ meta, ped ] } + .set { snv_ranking_ped_file } // Only run if we have affected individuals RANK_VARIANTS_SNV ( ANN_CSQ_PLI_SNV.out.vcf, - rank_snvs_ped_in, + snv_ranking_ped_file, ch_reduced_penetrance, ch_score_config_snv ) @@ -385,11 +384,11 @@ workflow NALLO { if(!params.skip_short_variant_calling) { ch_vcf_tbi_per_region - .map { meta, vcf, tbi -> [ [ id: meta.project ], vcf, tbi ] } + .map { meta, vcf, tbi -> [ [ id: meta.family_id ], vcf, tbi ] } .groupTuple() .set { ch_bcftools_concat_in } - // Concat into a multisample VCF with all regions + // Concat into family VCFs per family with all regions BCFTOOLS_CONCAT ( ch_bcftools_concat_in ) ch_versions = ch_versions.mix(BCFTOOLS_CONCAT.out.versions) @@ -397,11 +396,7 @@ workflow NALLO { BCFTOOLS_SORT ( BCFTOOLS_CONCAT.out.vcf ) ch_versions = ch_versions.mix(BCFTOOLS_SORT.out.versions) - // Make an echtvar database of all samples - ECHTVAR_ENCODE ( BCFTOOLS_SORT.out.vcf ) - ch_versions = ch_versions.mix(ECHTVAR_ENCODE.out.versions) - - // Split multisample VCF to also publish a VCF per sample + // Split family VCFs to also publish a VCF per sample BCFTOOLS_PLUGINSPLIT_SNVS ( BCFTOOLS_SORT.out.vcf.join(BCFTOOLS_SORT.out.tbi ), [], [], [], [] ) ch_versions = 
ch_versions.mix(BCFTOOLS_PLUGINSPLIT_SNVS.out.versions) @@ -510,14 +505,6 @@ workflow NALLO { // if (!params.skip_rank_variants) { - // Create PED with updated sex - per family - SOMALIER_PED_FAMILY ( - bam - .map { meta, files -> [ [ id: meta.family_id ], meta ] } - .groupTuple() - ) - ch_versions = ch_versions.mix(SOMALIER_PED_FAMILY.out.versions) - RANK_VARIANTS_SVS ( ANN_CSQ_PLI_SVS.out.vcf, SOMALIER_PED_FAMILY.out.ped,