From d40b264f8c34d4625524ff4625ca31821975d193 Mon Sep 17 00:00:00 2001
From: maxozo
Date: Tue, 2 Apr 2024 14:27:03 +0100
Subject: [PATCH] allow tailoring the z thresholds

---
 bin/gtcheck_assign_summary.py             | 29 ++++++++++++++---
 conf/base.conf                            |  2 ++
 main.nf                                   | 18 +++++++-------
 modules/nf-core/modules/genotypes/main.nf |  2 +-
 4 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/bin/gtcheck_assign_summary.py b/bin/gtcheck_assign_summary.py
index a2378793..7e91ce21 100755
--- a/bin/gtcheck_assign_summary.py
+++ b/bin/gtcheck_assign_summary.py
@@ -9,9 +9,25 @@ import pandas as pd
 
 VERBOSE = True
 
+
+nargs = len(sys.argv)
+if nargs < 5:
+    sys.exit("usage: {:s} <output_file> <zscore_thresh> <zscore_dist_thresh> <input_file> [<input_file> ....]"
+             .format(sys.argv[0]))
+
+oufn = sys.argv[1]
+ZSCORE_THRESH = float(sys.argv[2])  # Default is 8 - cardinal has been run with this threshold
+ZSCORE_DIST_THRESH = float(sys.argv[3])  # Default is 8 - cardinal has been run with this threshold
+infns = sys.argv[4:]
+
+sys.stdout.write("compare scores from {:d} files and write overall assignment to file {:s}\n".
+                 format(len(infns), oufn))
+
+
 SMVAL = float(1e-9)
-ZSCORE_THRESH = float(8)
-ZSCORE_DIST_THRESH = float(8)
+# ZSCORE_THRESH = float(3)
+# ZSCORE_DIST_THRESH = float(3)
 
 DONOR_CELLINE_MATCHSTR = re.compile("^celline_(\S+)$")
 FNAM_MATCHSTR = re.compile("^pool_(\S+)_panel_(\S+)_gtcheck_donor_assignments")
@@ -230,16 +246,7 @@ def assign_donors(self, oufh):
         return pd.DataFrame(df).T
 
 if __name__ == '__main__':
-    nargs = len(sys.argv)
-    if nargs < 3:
-        sys.exit("usage: {:s} <output_file> <input_file> [<input_file> ....]"
-            .format(sys.argv[0]))
-
-    oufn = sys.argv[1]
-    infns = sys.argv[2:]
-    sys.stdout.write("compare scores from {:d} files and write overall assignment to file {:s}\n".
-        format(len(infns), oufn))
 
     asstab = AssignmentTables()
     fctr = asstab.parse_assignment_output_files(infns)
     if VERBOSE:
diff --git a/conf/base.conf b/conf/base.conf
index af01aeff..b28c18bc 100755
--- a/conf/base.conf
+++ b/conf/base.conf
@@ -85,6 +85,8 @@ params{
         subset_vireo_genotypes = true // # This is a switch that determines whether we want to provide full genotype file in the vireo as an input or subset it down to the expected donors. NOTE: you may want to provide already merged shards for this, otherwise pipeline will merge this for you.
         posterior_assignment = false // #if this is set to true, we will perform the genotype donor matching after the deconvolution is performed.
         tsv_donor_panel_vcfs = "" // #tlist of vcf/bcf inputs 1) Can be a single file 2) Can be multiple cohorts (in cases where we dont want to merge the genotypes together) 3) Can be sharded inputs (for example per chromosome)
+        ZSCORE_THRESH = 8 //# Minimum z0 threshold required to call the gt assignment confident.
+        ZSCORE_DIST_THRESH = 8 //# Minimum difference between z0 and z1 to call the assignment confident.
     }
 
     cellsnp {
diff --git a/main.nf b/main.nf
index dd2da88d..93fc1f0c 100755
--- a/main.nf
+++ b/main.nf
@@ -288,7 +288,9 @@ workflow REPORT_UPDATE{
     CREATE_ARTIFICIAL_BAM_CHANNEL(input_channel)
     bam_split_channel = CREATE_ARTIFICIAL_BAM_CHANNEL.out.ch_experiment_bam_bai_barcodes
     ch_poolid_csv_donor_assignments = CREATE_ARTIFICIAL_BAM_CHANNEL.out.ch_poolid_csv_donor_assignments
-
+    out_ch = params.outdir
+        ? Channel.fromPath(params.outdir, checkIfExists:true)
+        : Channel.fromPath("${launchDir}/${outdir}")
     // // 1) The pihat values were impemented posthoc, hence we are runing this on each of the independent tranches.
     // myFileChannel = Channel.fromPath( "${params.outdir}/deconvolution/vireo/*/GT_donors.vireo.vcf.gz" )
     // myFileChannel.map{row -> tuple(row[-2], row)}.set{vireo_out_sample_donor_vcf}
@@ -296,12 +298,12 @@ workflow REPORT_UPDATE{
     // match_genotypes.out.out_finish_val.set{o1}
     // // updating the Metadata if something new has been fetched,
     // // UKBB is sending us samples once a week and sometimes the sample mappings may be present at a later date, hence we update previously run samples accordingly.
-    Channel.from([["${params.RUN}","${params.outdir}"]]).set{update_input_channel}
-    // // We sometimes aslo chnge apporach in the data fetch and we need to add in some extra metadata
-    // metadata_posthoc(update_input_channel)
-    // metadata_posthoc.out.dummy_out.set{o3}
-    replace_donors_posthoc(update_input_channel)
-    replace_donors_posthoc.out.dummy_out.set{o2}
+    // Channel.from([["${params.RUN}",out_ch]]).set{update_input_channel}
+    // // // We sometimes aslo chnge apporach in the data fetch and we need to add in some extra metadata
+    // // metadata_posthoc(update_input_channel)
+    // // metadata_posthoc.out.dummy_out.set{o3}
+    // replace_donors_posthoc(update_input_channel)
+    // replace_donors_posthoc.out.dummy_out.set{o2}
     // o1.mix(o2).last().set{o3}
     o3 = Channel.of('dummys')
     // Once everything is updated we need to make sure that the dataon the website and in the cardinal analysis foder is accurate and up to date, hence we rerun the data_handover scripts.
@@ -312,7 +314,7 @@ workflow REPORT_UPDATE{
 
 
 
-    data_handover(params.outdir,
+    data_handover(out_ch,
                   input_channel,
                   o3,
                   ch_poolid_csv_donor_assignments,
diff --git a/modules/nf-core/modules/genotypes/main.nf b/modules/nf-core/modules/genotypes/main.nf
index 62e5da6b..54f53a16 100755
--- a/modules/nf-core/modules/genotypes/main.nf
+++ b/modules/nf-core/modules/genotypes/main.nf
@@ -358,7 +358,7 @@ process ASSIGN_DONOR_OVERALL
     donor_assignment_file = "${pool_id}_gt_donor_assignments.csv"
     stats_assignment_table_out = "stats_${pool_id}_gt_donor_assignments.csv"
     """
-        gtcheck_assign_summary.py ${donor_assignment_file} ${gtcheck_assign_files}
+        gtcheck_assign_summary.py ${donor_assignment_file} ${params.genotype_input.ZSCORE_THRESH} ${params.genotype_input.ZSCORE_DIST_THRESH} ${gtcheck_assign_files}
     """
 
 }
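
The two new parameters are documented in conf/base.conf as the minimum z0 score and the
minimum z0 - z1 separation required before a genotype assignment is called confident.
A minimal sketch of that rule, assuming exactly the behaviour those comments describe
(the names is_confident, z0 and z1 below are illustrative and not taken from the patched script):

    # Hypothetical sketch of the confidence rule described in conf/base.conf:
    # z0 is the best-matching donor's z-score, z1 the runner-up's.
    ZSCORE_THRESH = 8.0       # minimum z0 required to call the assignment confident
    ZSCORE_DIST_THRESH = 8.0  # minimum z0 - z1 gap required to call it confident

    def is_confident(z0: float, z1: float) -> bool:
        return z0 >= ZSCORE_THRESH and (z0 - z1) >= ZSCORE_DIST_THRESH

With the patch applied, ASSIGN_DONOR_OVERALL invokes the summary script as
gtcheck_assign_summary.py <output_csv> <zscore_thresh> <zscore_dist_thresh> <assignment file> [...],
and both thresholds default to 8 in conf/base.conf, matching the values previously
hard-coded in bin/gtcheck_assign_summary.py.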