From 070a9cbefc1acbb22738bdcd2d89ae821efdcc78 Mon Sep 17 00:00:00 2001
From: Nicolas Vannieuwkerke <nicolas.vannieuwkerke@ugent.be>
Date: Mon, 23 Sep 2024 13:50:31 +0200
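Subject: [PATCH] fix validation workflow: per-family samples and truth join

- take the sample list from meta.family_samples directly instead of
  combining with a separate ch_family_samples channel
- remove duplicate_count from the meta used as the group key when
  grouping the truth variants
- join (instead of combine) the final VCFs with ch_truths, failing on
  mismatches and duplicates
- declare one_vcf, one_tbi and one_bed with def
- remove the Channel.empty() placeholder declarations from
  VCF_VALIDATE_SMALL_VARIANTS
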
---
 .../local/vcf_validate_small_variants/main.nf | 32 -------------------
 workflows/germline.nf                         | 21 ++++--------
 2 files changed, 7 insertions(+), 46 deletions(-)

diff --git a/subworkflows/local/vcf_validate_small_variants/main.nf b/subworkflows/local/vcf_validate_small_variants/main.nf
index eaa17579..63ca83fd 100644
--- a/subworkflows/local/vcf_validate_small_variants/main.nf
+++ b/subworkflows/local/vcf_validate_small_variants/main.nf
@@ -14,38 +14,6 @@ workflow VCF_VALIDATE_SMALL_VARIANTS {
 
     ch_versions                             = Channel.empty()
 
-    happy_vcf                               = Channel.empty()
-    happy_tbi                               = Channel.empty()
-    happy_indel_roc                         = Channel.empty()
-    happy_indel_roc_pass                    = Channel.empty()
-    happy_snp_roc                           = Channel.empty()
-    happy_snp_roc_pass                      = Channel.empty()
-    happy_roc                               = Channel.empty()
-    happy_summary                           = Channel.empty()
-    happy_extended_csv                      = Channel.empty()
-
-    vcfeval_true_positive_vcf               = Channel.empty()
-    vcfeval_true_positive_vcf_tbi           = Channel.empty()
-    vcfeval_false_negative_vcf              = Channel.empty()
-    vcfeval_false_negative_vcf_tbi          = Channel.empty()
-    vcfeval_false_positive_vcf              = Channel.empty()
-    vcfeval_false_positive_vcf_tbi          = Channel.empty()
-    vcfeval_true_positive_baseline_vcf      = Channel.empty()
-    vcfeval_true_positive_baseline_vcf_tbi  = Channel.empty()
-    vcfeval_summary                         = Channel.empty()
-    vcfeval_phasing                         = Channel.empty()
-    vcfeval_snp_roc                         = Channel.empty()
-    vcfeval_non_snp_roc                     = Channel.empty()
-    vcfeval_weighted_roc                    = Channel.empty()
-
-    rtgtools_snp_png_rocplot                = Channel.empty()
-    rtgtools_non_snp_png_rocplot            = Channel.empty()
-    rtgtools_weighted_png_rocplot           = Channel.empty()
-
-    rtgtools_snp_svg_rocplot                = Channel.empty()
-    rtgtools_non_snp_svg_rocplot            = Channel.empty()
-    rtgtools_weighted_svg_rocplot           = Channel.empty()
-
     ch_input = ch_vcf.join(ch_beds, failOnDuplicate: true, failOnMismatch: true)
 
     RTGTOOLS_VCFEVAL(
diff --git a/workflows/germline.nf b/workflows/germline.nf
index ec5987a7..fe3e04b6 100644
--- a/workflows/germline.nf
+++ b/workflows/germline.nf
@@ -342,12 +342,9 @@ workflow GERMLINE {
             gvcf:           [new_meta, gvcf, tbi] // Optional channel containing the GVCFs and their optional indices
             cram:           [new_meta, cram, crai]  // Mandatory channel containing the CRAM files and their optional indices
             roi:            [new_meta, roi_file] // Optional channel containing the ROI BED files for WES samples
-            family_samples: [meta.family, meta.family_samples.tokenize(",")] // A channel containing the samples per family
         }
         .set { ch_input }
 
-    ch_family_samples = ch_input.family_samples.distinct()
-
     //
     // Create the GVCF index if it's missing
     //
@@ -595,14 +592,15 @@ workflow GERMLINE {
 
             ch_input.truth_variants
                 .map { meta, vcf, tbi, bed ->
-                    [ groupKey(meta, meta.duplicate_count), vcf, tbi, bed ]
+                    def new_meta = meta - meta.subMap("duplicate_count")
+                    [ groupKey(new_meta, meta.duplicate_count), vcf, tbi, bed ]
                 }
                 .groupTuple()
                 .map { meta, vcf, tbi, bed ->
                     // Get only one VCF for samples that were given multiple times
-                    one_vcf = vcf.find { vcf_file -> vcf_file != [] } ?: []
-                    one_tbi = tbi.find { tbi_file -> tbi_file != [] } ?: []
-                    one_bed = bed.find { bed_file -> bed_file != [] } ?: []
+                    def one_vcf = vcf.find { vcf_file -> vcf_file != [] } ?: []
+                    def one_tbi = tbi.find { tbi_file -> tbi_file != [] } ?: []
+                    def one_bed = bed.find { bed_file -> bed_file != [] } ?: []
                     [ meta, one_vcf, one_tbi, one_bed ]
                 }
                 .branch { meta, vcf, tbi, bed ->
@@ -637,12 +635,7 @@ workflow GERMLINE {
             ch_final_vcfs
                 .map { meta, vcf, tbi ->
                     def new_meta = meta - meta.subMap("family_samples")
-                    [ meta.family, new_meta, vcf, tbi ]
-                }
-                .combine(ch_family_samples, by:0)
-                .map { family, meta, vcf, tbi, samples ->
-                    def sample = meta.sample ? [meta.sample] : samples
-                    [ meta, vcf, tbi, sample ]
+                    [ new_meta, vcf, tbi, meta.family_samples.tokenize(",") ]
                 }
                 .transpose(by: 3)
                 .map { meta, vcf, tbi, sample ->
@@ -654,7 +647,7 @@ workflow GERMLINE {
                     ]
                     [ new_meta, vcf, tbi ]
                 }
-                .combine(ch_truths, by:0)
+                .join(ch_truths, failOnMismatch:true, failOnDuplicate:true)
                 .filter { meta, vcf, tbi, truth_vcf, truth_tbi, truth_bed ->
                     // Filter out all samples that have no truth VCF
                     truth_vcf != []