Skip to content

Commit

Permalink
allow tailoring the z thresholds
Browse files Browse the repository at this point in the history
  • Loading branch information
maxozo committed Apr 2, 2024
1 parent ba93796 commit d40b264
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 20 deletions.
29 changes: 18 additions & 11 deletions bin/gtcheck_assign_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,25 @@
import pandas as pd

VERBOSE = True


nargs = len(sys.argv)
if nargs < 3:
sys.exit("usage: {:s} <summary output> <assignment output panel 1> [<assignment output panel 2> ....]"
.format(sys.argv[0]))

oufn = sys.argv[1]
ZSCORE_THRESH = float(sys.argv[2]) # Default is 8 - cardinal has been run with this threshold
ZSCORE_DIST_THRESH = float(sys.argv[3]) # Default is 8 - cardinal has been run with this threshold
infns = sys.argv[4:]

sys.stdout.write("compare scores from {:d} files and write overall assignment to file {:s}\n".
format(len(infns), oufn))


SMVAL = float(1e-9)
ZSCORE_THRESH = float(8)
ZSCORE_DIST_THRESH = float(8)
# ZSCORE_THRESH = float(3)
# ZSCORE_DIST_THRESH = float(3)

DONOR_CELLINE_MATCHSTR = re.compile("^celline_(\S+)$")
FNAM_MATCHSTR = re.compile("^pool_(\S+)_panel_(\S+)_gtcheck_donor_assignments")
Expand Down Expand Up @@ -230,16 +246,7 @@ def assign_donors(self, oufh):
return pd.DataFrame(df).T

if __name__ == '__main__':
nargs = len(sys.argv)
if nargs < 3:
sys.exit("usage: {:s} <summary output> <assignment output panel 1> [<assignment output panel 2> ....]"
.format(sys.argv[0]))

oufn = sys.argv[1]
infns = sys.argv[2:]

sys.stdout.write("compare scores from {:d} files and write overall assignment to file {:s}\n".
format(len(infns), oufn))
asstab = AssignmentTables()
fctr = asstab.parse_assignment_output_files(infns)
if VERBOSE:
Expand Down
2 changes: 2 additions & 0 deletions conf/base.conf
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ params{
subset_vireo_genotypes = true // # This is a switch that determines whether we want to provide full genotype file in the vireo as an input or subset it down to the expected donors. NOTE: you may want to provide already merged shards for this, otherwise pipeline will merge this for you.
posterior_assignment = false // #if this is set to true, we will perform the genotype donor matching after the deconvolution is performed.
tsv_donor_panel_vcfs = "" // #tlist of vcf/bcf inputs 1) Can be a single file 2) Can be multiple cohorts (in cases where we dont want to merge the genotypes together) 3) Can be sharded inputs (for example per chromosome)
ZSCORE_THRESH = 8 //# Minimum z0 threshold required to call the gt assignment confident.
ZSCORE_DIST_THRESH = 8 //# Minimum difference between z0 and z1 required to call the assignment confident.
}

cellsnp {
Expand Down
18 changes: 10 additions & 8 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -288,20 +288,22 @@ workflow REPORT_UPDATE{
CREATE_ARTIFICIAL_BAM_CHANNEL(input_channel)
bam_split_channel = CREATE_ARTIFICIAL_BAM_CHANNEL.out.ch_experiment_bam_bai_barcodes
ch_poolid_csv_donor_assignments = CREATE_ARTIFICIAL_BAM_CHANNEL.out.ch_poolid_csv_donor_assignments

out_ch = params.outdir
? Channel.fromPath(params.outdir, checkIfExists:true)
: Channel.fromPath("${launchDir}/${outdir}")
// // 1) The pihat values were implemented posthoc, hence we are running this on each of the independent tranches.
// myFileChannel = Channel.fromPath( "${params.outdir}/deconvolution/vireo/*/GT_donors.vireo.vcf.gz" )
// myFileChannel.map{row -> tuple(row[-2], row)}.set{vireo_out_sample_donor_vcf}
// match_genotypes(vireo_out_sample_donor_vcf)
// match_genotypes.out.out_finish_val.set{o1}
// // Updating the metadata if something new has been fetched.
// // UKBB is sending us samples once a week and sometimes the sample mappings may be present at a later date, hence we update previously run samples accordingly.
Channel.from([["${params.RUN}","${params.outdir}"]]).set{update_input_channel}
// // We sometimes also change the approach in the data fetch and we need to add in some extra metadata
// metadata_posthoc(update_input_channel)
// metadata_posthoc.out.dummy_out.set{o3}
replace_donors_posthoc(update_input_channel)
replace_donors_posthoc.out.dummy_out.set{o2}
// Channel.from([["${params.RUN}",out_ch]]).set{update_input_channel}
// // // We sometimes also change the approach in the data fetch and we need to add in some extra metadata
// // metadata_posthoc(update_input_channel)
// // metadata_posthoc.out.dummy_out.set{o3}
// replace_donors_posthoc(update_input_channel)
// replace_donors_posthoc.out.dummy_out.set{o2}
// o1.mix(o2).last().set{o3}
o3 = Channel.of('dummys')
// Once everything is updated we need to make sure that the data on the website and in the cardinal analysis folder is accurate and up to date, hence we rerun the data_handover scripts.
Expand All @@ -312,7 +314,7 @@ workflow REPORT_UPDATE{



data_handover(params.outdir,
data_handover(out_ch,
input_channel,
o3,
ch_poolid_csv_donor_assignments,
Expand Down
2 changes: 1 addition & 1 deletion modules/nf-core/modules/genotypes/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ process ASSIGN_DONOR_OVERALL
donor_assignment_file = "${pool_id}_gt_donor_assignments.csv"
stats_assignment_table_out = "stats_${pool_id}_gt_donor_assignments.csv"
"""
gtcheck_assign_summary.py ${donor_assignment_file} ${gtcheck_assign_files}
gtcheck_assign_summary.py ${donor_assignment_file} ${params.genotype_input.ZSCORE_THRESH} ${params.genotype_input.ZSCORE_DIST_THRESH} ${gtcheck_assign_files}
"""
}

Expand Down

0 comments on commit d40b264

Please sign in to comment.