Skip to content

Commit

Permalink
allow tailoring the z thresholds
Browse files Browse the repository at this point in the history
  • Loading branch information
maxozo committed Apr 2, 2024
1 parent ba93796 commit d40b264
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 20 deletions.
29 changes: 18 additions & 11 deletions bin/gtcheck_assign_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,25 @@
import pandas as pd

VERBOSE = True


nargs = len(sys.argv)
if nargs < 3:
sys.exit("usage: {:s} <summary output> <assignment output panel 1> [<assignment output panel 2> ....]"
.format(sys.argv[0]))

oufn = sys.argv[1]
ZSCORE_THRESH = float(sys.argv[2]) # Default is 8 - cardinal has been run with this threshold
ZSCORE_DIST_THRESH = float(sys.argv[3]) # Default is 8 - cardinal has been run with this threshold
infns = sys.argv[4:]

sys.stdout.write("compare scores from {:d} files and write overall assignment to file {:s}\n".
format(len(infns), oufn))


SMVAL = float(1e-9)
ZSCORE_THRESH = float(8)
ZSCORE_DIST_THRESH = float(8)
# ZSCORE_THRESH = float(3)
# ZSCORE_DIST_THRESH = float(3)

DONOR_CELLINE_MATCHSTR = re.compile("^celline_(\S+)$")
FNAM_MATCHSTR = re.compile("^pool_(\S+)_panel_(\S+)_gtcheck_donor_assignments")
Expand Down Expand Up @@ -230,16 +246,7 @@ def assign_donors(self, oufh):
return pd.DataFrame(df).T

if __name__ == '__main__':
nargs = len(sys.argv)
if nargs < 3:
sys.exit("usage: {:s} <summary output> <assignment output panel 1> [<assignment output panel 2> ....]"
.format(sys.argv[0]))

oufn = sys.argv[1]
infns = sys.argv[2:]

sys.stdout.write("compare scores from {:d} files and write overall assignment to file {:s}\n".
format(len(infns), oufn))
asstab = AssignmentTables()
fctr = asstab.parse_assignment_output_files(infns)
if VERBOSE:
Expand Down
2 changes: 2 additions & 0 deletions conf/base.conf
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ params{
subset_vireo_genotypes = true // # This is a switch that determines whether we want to provide full genotype file in the vireo as an input or subset it down to the expected donors. NOTE: you may want to provide already merged shards for this, otherwise pipeline will merge this for you.
posterior_assignment = false // #if this is set to true, we will perform the genotype donor matching after the deconvolution is performed.
tsv_donor_panel_vcfs = "" // #tlist of vcf/bcf inputs 1) Can be a single file 2) Can be multiple cohorts (in cases where we dont want to merge the genotypes together) 3) Can be sharded inputs (for example per chromosome)
ZSCORE_THRESH = 8 //# Minimum z0 threshold required to call the gt assignment confident.
ZSCORE_DIST_THRESH = 8 //# Minimum difference between z0 and z1 required to call the assignment confident.
}

cellsnp {
Expand Down
18 changes: 10 additions & 8 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -288,20 +288,22 @@ workflow REPORT_UPDATE{
CREATE_ARTIFICIAL_BAM_CHANNEL(input_channel)
bam_split_channel = CREATE_ARTIFICIAL_BAM_CHANNEL.out.ch_experiment_bam_bai_barcodes
ch_poolid_csv_donor_assignments = CREATE_ARTIFICIAL_BAM_CHANNEL.out.ch_poolid_csv_donor_assignments

out_ch = params.outdir
? Channel.fromPath(params.outdir, checkIfExists:true)
: Channel.fromPath("${launchDir}/${outdir}")
// // 1) The pihat values were implemented posthoc, hence we are running this on each of the independent tranches.
// myFileChannel = Channel.fromPath( "${params.outdir}/deconvolution/vireo/*/GT_donors.vireo.vcf.gz" )
// myFileChannel.map{row -> tuple(row[-2], row)}.set{vireo_out_sample_donor_vcf}
// match_genotypes(vireo_out_sample_donor_vcf)
// match_genotypes.out.out_finish_val.set{o1}
// // Updating the metadata if something new has been fetched.
// // UKBB is sending us samples once a week and sometimes the sample mappings may be present at a later date, hence we update previously run samples accordingly.
Channel.from([["${params.RUN}","${params.outdir}"]]).set{update_input_channel}
// // We sometimes also change the approach in the data fetch and we need to add in some extra metadata
// metadata_posthoc(update_input_channel)
// metadata_posthoc.out.dummy_out.set{o3}
replace_donors_posthoc(update_input_channel)
replace_donors_posthoc.out.dummy_out.set{o2}
// Channel.from([["${params.RUN}",out_ch]]).set{update_input_channel}
// // // We sometimes also change the approach in the data fetch and we need to add in some extra metadata
// // metadata_posthoc(update_input_channel)
// // metadata_posthoc.out.dummy_out.set{o3}
// replace_donors_posthoc(update_input_channel)
// replace_donors_posthoc.out.dummy_out.set{o2}
// o1.mix(o2).last().set{o3}
o3 = Channel.of('dummys')
// Once everything is updated we need to make sure that the data on the website and in the cardinal analysis folder is accurate and up to date, hence we rerun the data_handover scripts.
Expand All @@ -312,7 +314,7 @@ workflow REPORT_UPDATE{



data_handover(params.outdir,
data_handover(out_ch,
input_channel,
o3,
ch_poolid_csv_donor_assignments,
Expand Down
2 changes: 1 addition & 1 deletion modules/nf-core/modules/genotypes/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ process ASSIGN_DONOR_OVERALL
donor_assignment_file = "${pool_id}_gt_donor_assignments.csv"
stats_assignment_table_out = "stats_${pool_id}_gt_donor_assignments.csv"
"""
gtcheck_assign_summary.py ${donor_assignment_file} ${gtcheck_assign_files}
gtcheck_assign_summary.py ${donor_assignment_file} ${params.genotype_input.ZSCORE_THRESH} ${params.genotype_input.ZSCORE_DIST_THRESH} ${gtcheck_assign_files}
"""
}

Expand Down

0 comments on commit d40b264

Please sign in to comment.