diff --git a/assets/deploy_scripts/module_exacutables/yascp b/assets/deploy_scripts/module_exacutables/yascp index 174b8ac..e555c03 100755 --- a/assets/deploy_scripts/module_exacutables/yascp +++ b/assets/deploy_scripts/module_exacutables/yascp @@ -5,6 +5,7 @@ bold=$(tput bold) normal=$(tput sgr0) + if [[ "$QUEUE" == '' ]]; then export QUEUE='long' diff --git a/bin/4.add_vdj.R b/bin/4.add_vdj.R index 4a0ed4d..0af69d2 100755 --- a/bin/4.add_vdj.R +++ b/bin/4.add_vdj.R @@ -34,8 +34,7 @@ data_dir <- './' args = commandArgs(trailingOnly=TRUE) wnn_integrated_file = args[1] -n2 = strsplit(as.character(wnn_integrated_file), split="__all_samples")[[1]][1] -outname = paste0(n2,'__all_samples_integrated.vdj.RDS') + myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral"))) @@ -155,8 +154,14 @@ vdj_dir <- paste0('./vdj/') dir.create(vdj_dir,showWarnings = F) names(immdata_TCR$data) <- tcr_names names(immdata_BCR$data) <- bcr_names -saveRDS(immdata_BCR, file=paste0(vdj_dir,'/slemap_BCR.rds')) -saveRDS(immdata_TCR, paste0(vdj_dir,'/slemap_TCR.rds')) + +n2 = strsplit(as.character(wnn_integrated_file), split="__all_samples")[[1]][1] +outname = paste0(n2,'__all_samples_integrated.vdj.RDS') +outname_BCR = paste0(n2,'__all_samples_integrated.BCR.RDS') +outname_TCR = paste0(n2,'__all_samples_integrated.TCR.RDS') + +saveRDS(immdata_BCR, file=outname_BCR) +saveRDS(immdata_TCR, file=outname_TCR) saveRDS(all_samples, file=outname) diff --git a/bin/concordance_calculations.py b/bin/concordance_calculations.py index 0aac3a4..5b61c83 100755 --- a/bin/concordance_calculations.py +++ b/bin/concordance_calculations.py @@ -35,8 +35,11 @@ mode = 'no_het' #We can not determine whether hets are concordant or discordant in terms of reads. -class Concordances: + + +class Concordances: + def reset(self): self.cell_concordance_table ={} @@ -55,1069 +58,1095 @@ def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_ self.uninformative_sites = uninformative_sites self.record_dict={} - def norm_genotypes(self,expected_vars): - expected_vars = pd.DataFrame(expected_vars) - if len(expected_vars) > 0: - split_str=expected_vars[0].str.split("_") - expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] - expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] - expected_vars['vars'] = split_str.str[4] - expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) - expected_vars = expected_vars[expected_vars['vars']!='./.'] - expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' - expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] - return expected_vars - - - - def get_strict_discordance(self, expected_vars, cell_vars): - ''' - take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant - sites and relaxed concordant sites and list of discordant sites1 - 1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype - 2) if you have a 0/0 you can not get a 1/1 or 0/1 - 3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 1/1 - So - each obversed cellsnp allele must be in the array SNP gtype - ''' - snp_gtypes = expected_vars[0] - cellsnp_gtypes = cell_vars[0] - true_discordant = 0 - relaxed_concordant = 0 - relaxed_concordant_informative = 0 - relaxed_concordant_informative_ids = [] - relaxed_concordant_uninformative_ids = [] - true_discordant_uninformative_ids =[] - true_discordant_informative_ids=[] - relaxed_concordant_uninformative = 0 - true_discordant_informative = 0 - true_discordant_uninformative = 0 - discordant_vars = [] - concordant_vars = [] - subset_informative_concordant = 0 - subset_informative_discordant = 0 - - #print(self.uninformative_sites) - #print(self.informative_sites) + def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match, donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data,donor_assignments_table): - #create sets of the ids (chrom, pos, ref, alt) in each set of genotypes. Filter to the ids present in both - #then filter to informative and uninformative. If uninformative >0 then create a subset of informative - # with the same number of vars (at random) - split_snp_gts=snp_gtypes.str.split("_") - snp_gtypes_ids = set(split_snp_gts.str[0]+'_'+split_snp_gts.str[1]+'_'+split_snp_gts.str[2]+'_'+split_snp_gts.str[3]) + Concordant_Sites, \ + Discordant_sites, \ + Total_Overlapping_sites, \ + discordant_sites, \ + cell_vars_norm, \ + true_discordant_count, \ + relaxed_concordant_count, \ + relaxed_concordant_informative_count, \ + relaxed_concordant_uninformative_count, \ + true_discordant_informative_count, \ + true_discordant_uninformative_count, \ + total_sites, \ + informative_sites, \ + uninformative_sites, \ + total_reads, \ + total_reads_informative, \ + total_reads_uninformative, \ + discordant_reads, \ + discordant_reads_informative, \ + discordant_reads_uninformative, \ + discordant_vars, \ + concordant_vars, \ + discordant_read_fraction_in_concordant_sites, \ + discordant_read_fraction_in_discordant_sites, \ + discordant_reads_uninformative_fraction, \ + discordant_reads_informative_fraction, discordant_reads_informative_subset = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) + + total_concordant_sites = len(Concordant_Sites) + relaxed_concordant_count + dds = self.donor_distinct_sites[donor_gt_match] + Nr_donor_distinct_sites = len(dds) + Nr_Concordant = len(Concordant_Sites) + Nr_Relaxed_concordant = relaxed_concordant_count + Nr_Discordant = len(Discordant_sites) + Nr_Total_Overlapping_sites = len(Total_Overlapping_sites) + Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Discordant_sites))) + Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) + #Quantify donor variation in other donors + discordant_vars_in_pool = [] + donor_table_of_concordances = [] + total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown = {} + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort']={} + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort']={} + + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown_totalReads = {} + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown_totalReads['Whithin_Cohort']={} + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown_totalReads['Out_of_Cohort']={} + + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown = {} + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort']={} + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort']={} + + + informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() + total_cordant_sites_that_are_concordant_with_other_donors_in_pool = set() + donor_gt_match_cohort = donor_cohorts[donor_gt_match] + donors_contributing_to_out_of_cohort= [] + donors_contributing_to_Whithin_Cohort=[] + sites_contributing_to_out_of_cohort= [] + sites_contributing_to_Whithin_Cohort=[] + for donor in set(donor_assignments_table['donor_gt']): + try: + expected_vars_norm_of_other_donor = all_donor_data[donor] + except: + continue + try: + donor_cohort = donor_cohorts[donor] + donor_vars = vars_per_donor_gt[donor] + except: + continue + + Concordant_Sites_otherDonor, \ + Discordant_sites_otherDonor, \ + Total_Overlapping_sites_otherDonor, \ + discordant_sites_otherDonor, \ + cell_vars_norm_otherDonor, \ + true_discordant_count_otherDonor, \ + relaxed_concordant_count_otherDonor, \ + relaxed_concordant_informative_count_otherDonor, \ + relaxed_concordant_uninformative_count_otherDonor, \ + true_discordant_informative_count_otherDonor, \ + true_discordant_uninformative_count_otherDonor, \ + total_sites_otherDonor, \ + informative_sites_otherDonor, \ + uninformative_sites_otherDonor, \ + total_reads_otherDonor, \ + total_reads_informative_otherDonor, \ + total_reads_uninformative_otherDonor, \ + discordant_reads_otherDonor, \ + discordant_reads_informative_otherDonor, \ + discordant_reads_uninformative_otherDonor, \ + discordant_vars_otherDonor, \ + concordant_vars_otherDonor, \ + discordant_read_fraction_in_concordant_sites_otherDonor, \ + discordant_read_fraction_in_discordant_sites_otherDonor, \ + discordant_reads_uninformative_fraction_otherDonor, \ + discordant_reads_informative_fraction_otherDonor, discordant_reads_informative_subset_otherDonor = self.retrieve_concordant_discordant_sites(expected_vars_norm_of_other_donor,cell_vars,donor_cohort=donor_cohort) + + if total_sites_otherDonor==0: + total_sites_otherDonor=0.000000000001 + total_concordant_sites_otherDonor = relaxed_concordant_count_otherDonor + concordant_percent_in_other_donor= total_concordant_sites_otherDonor/total_sites_otherDonor*100 + discordant_percent_in_other_donor= true_discordant_count_otherDonor/total_sites_otherDonor*100 + DonorDiscordant_Sites_that_are_atributed_to_other_donor = set(discordant_vars).intersection(set(concordant_vars_otherDonor)) + TotalSites_that_are_atributed_to_other_donor = set(discordant_vars).union(set(concordant_vars)).intersection(set(concordant_vars_otherDonor)) - split_cellsnp_gts=cellsnp_gtypes.str.split("_") - cellsnp_gtypes_ids = set(split_cellsnp_gts.str[0]+'_'+split_cellsnp_gts.str[1]+'_'+split_cellsnp_gts.str[2]+'_'+split_cellsnp_gts.str[3]) + Informative__DonorDiscordant_Sites_that_are_atributed_to_other_donor = set(true_discordant_informative_count).intersection(set(relaxed_concordant_informative_count_otherDonor)) + DonorCordant_Sites_that_are_atributed_to_other_donor = set(concordant_vars).intersection(set(concordant_vars_otherDonor)) + + total_reads_for_discordant_sites_that_are_concordant_with_other_donor,discordant_reads_for_discordant_sites_that_are_concordant_with_other_donor,concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor,\ + _,_,_= self.read_extraction(DonorDiscordant_Sites_that_are_atributed_to_other_donor,expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) - shared_gts = snp_gtypes_ids.intersection(cellsnp_gtypes_ids) + if not donor == donor_gt_match: + # We want to kow how many of these discordant site + if 'U937' in donor: + continue + if 'THP1' in donor: + continue + + if donor_gt_match_cohort == donor_cohort: + coh = 'Whithin_Cohort' + sites_contributing_to_Whithin_Cohort.extend(list(TotalSites_that_are_atributed_to_other_donor)) + if len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)>0: + donors_contributing_to_Whithin_Cohort.append(donor) + + else: + coh = 'Out_of_Cohort' + sites_contributing_to_out_of_cohort.extend(list(TotalSites_that_are_atributed_to_other_donor)) + if len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)>0: + donors_contributing_to_out_of_cohort.append(donor) + + + total_discordant_sites_that_are_concordant_with_other_donors_in_pool = total_discordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(DonorDiscordant_Sites_that_are_atributed_to_other_donor)) + # now we addit for a cohort since the biggest issue comes from cohort cross-contamination + # for each of these sites now we calculate the number of reads that it accounts: + # tree level set: cohort: site: counts + for site in DonorDiscordant_Sites_that_are_atributed_to_other_donor: + total_reads_for_site,discordant_reads_for_site,concordant_for_site,\ + total_reads_noHet,discordant_reads_noHet,concordant_reads_noHet= self.read_extraction([site],expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) + if concordant_for_site==0: + pass + try: + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_reads_noHet) - shared_informative = shared_gts.intersection(self.informative_sites) - shared_uninformative = shared_gts.intersection(self.uninformative_sites) - # print("shared informative " + str(len(shared_informative))) - # print("shared uninformative " + str(len(shared_uninformative))) + except: + try: + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_reads_noHet) + + except: + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh]={} + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_reads_noHet) - #store the numbers of informative and uninformative sites shared between cellSNP and gt data as these - #are the sites used for concordance - self.informative_covered = len(shared_informative) - self.uninformative_covered = len(shared_uninformative) + for site in DonorDiscordant_Sites_that_are_atributed_to_other_donor: + total_reads_for_site,discordant_reads_for_site,concordant_for_site,\ + _,_,_= self.read_extraction([site],expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) + # if discordant_reads_for_site>0: + # print('here') + if concordant_for_site==0: + pass + try: + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site) + except: + try: + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site) + + except: + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh]={} + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site) + + # to get the total reads that can be atributed to the other donor i have to check if site is already covered in the total_discordant_sites_that_are_concordant_with_other_donors_in_pool. + # the ones that havent, i have to add the reads up for them. + informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(Informative__DonorDiscordant_Sites_that_are_atributed_to_other_donor)) + + total_cordant_sites_that_are_concordant_with_other_donors_in_pool = total_cordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(DonorCordant_Sites_that_are_atributed_to_other_donor)) + - if len(shared_uninformative) > 0: - #print(len(shared_uninformative)) - # print(len(shared_informative)) - if len(shared_uninformative) <= len(shared_informative): - informative_subset = set(random.sample(shared_informative, len(shared_uninformative))) + common_vars = list(set(discordant_vars) & set(donor_vars)) + common_var_count = str(len(common_vars)) + donor_cohort_common = donor + ":" + donor_cohort + ":" + common_var_count + discordant_vars_in_pool.append(donor_cohort_common) + + # Here we want to calculate the number of discordant sites in other donors and see if in terms of concordance the same donor is picked as per GT assignment. + # We do this to investigate the potential of a cell coming from this other donor. else: - informative_subset = set()#if there are more shared uninformative than shared informative we will not subset - # print(informative_subset) - # exit(0) - else: - informative_subset = set() - - # print(informative_subset) - self.informative_subset = informative_subset - - snp_gtypes_set = set(snp_gtypes) - snp_gtypes_set = sorted(snp_gtypes_set) - - cellsnp_gtypes_set = set(cellsnp_gtypes) - cellsnp_gtypes_set = sorted(cellsnp_gtypes_set) - - #for i in range(0, len(snp_gtypes)): - # cellsnp_gtypes_str = pd.DataFrame(cellsnp_gtypes_set,columns=['cellsnp_gtypes']) - # cellsnp_gtypes_str['snp_gtypes'] = pd.DataFrame(snp_gtypes_set,columns=['snp_gtypes'])['snp_gtypes'] - for i in range(0, len(snp_gtypes_set)): - discordant = False - # snp_data = snp_gtypes[i].split('_') - # cellsnp_data = cellsnp_gtypes[i].split('_') - snp_data = snp_gtypes_set[i].split('_') - cellsnp_data = cellsnp_gtypes_set[i].split('_') - - snp_alleles_set = set([snp_data[4][0], snp_data[4][2]]) - cellsnp_alleles_set = set([cellsnp_data[4][0], cellsnp_data[4][2]]) - - snp_var = ('_').join(snp_data[0:4]) - cellsnp_var = ('_').join(cellsnp_data[0:4]) - - if not cellsnp_var == snp_var: - print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i]) - exit(1) - else: - for allele in cellsnp_alleles_set: - if not allele in snp_alleles_set:#if a cellSNP allele is found that is not in the array data this is discordant - discordant = True + _='check' + try: + percent_relaxed_concordant = Nr_Relaxed_concordant/(Nr_Relaxed_concordant+true_discordant_count)*100 + except: + percent_relaxed_concordant = 0 + # g2 = total_concordant_sites_otherDonor/total_sites_otherDonor*100 + if not (percent_relaxed_concordant == concordant_percent_in_other_donor): + _='something not right' + break - if discordant == True: - true_discordant+=1 - discordant_vars.append(cellsnp_var) - if snp_var in self.uninformative_sites: - true_discordant_uninformative+=1 - true_discordant_uninformative_ids.append(snp_var) - elif snp_var in self.informative_sites: - true_discordant_informative+=1 - true_discordant_informative_ids.append(snp_var) - else: - relaxed_concordant+=1 - concordant_vars.append(cellsnp_var) - if snp_var in self.uninformative_sites: - relaxed_concordant_uninformative+=1 - relaxed_concordant_uninformative_ids.append(snp_var) - elif snp_var in self.informative_sites: - relaxed_concordant_informative+=1 - relaxed_concordant_informative_ids.append(snp_var) - if len(shared_uninformative) > 0: - if snp_var in informative_subset: - if discordant == True: - subset_informative_discordant+=1 - else: - subset_informative_concordant+=1 - # true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count - cell_vars2 = cell_vars.set_index('ids') - disc = pd.DataFrame(set(cell_vars2.loc[discordant_vars]['combo']),columns=['combo_x']) - df_cd = pd.merge(cell_vars, expected_vars, how='inner', on = 'pos') - disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') - disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] - disc_sites_string = ';'.join(disc2['expected_retrieved']) - return true_discordant, relaxed_concordant, relaxed_concordant_informative_ids, relaxed_concordant_uninformative_ids, true_discordant_informative_ids, true_discordant_uninformative_ids, discordant_vars, concordant_vars, disc_sites_string + donor_table_of_concordances.append({'donor':donor, 'cell':cell1, 'donor_cohort':donor_cohort, \ + 'gt matched donor':donor == donor_gt_match, \ + 'DonorCordant_Sites_that_are_atributed_to_other_donor':len(DonorCordant_Sites_that_are_atributed_to_other_donor), \ + 'DonorCordant_Sites_that_are_atributed_to_other_donor/total':f"{len(DonorCordant_Sites_that_are_atributed_to_other_donor)}/{len(concordant_vars)}", \ + 'DonorDiscordant_Sites_that_are_atributed_to_other_donor':len(DonorDiscordant_Sites_that_are_atributed_to_other_donor), \ + 'DonorDiscordant_Sites_that_are_atributed_to_other_donor/total':f"{len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)}/{len(discordant_vars)}", \ + 'concordant_percent_in_other_donor':concordant_percent_in_other_donor, \ + 'discordant_percent_in_other_donor':discordant_percent_in_other_donor, \ + 'discordant_reads_otherDonor':discordant_reads_otherDonor, \ + 'discordant_sites_otherDonor':len(discordant_vars_otherDonor), \ + 'concordant_sites_otherDonor':len(concordant_vars_otherDonor), \ + 'total_sites_otherDonor':total_sites_otherDonor, \ + 'discordant_reads_otherDonor':discordant_reads_otherDonor, \ + 'total_reads_otherDonor':total_reads_otherDonor, \ + # 'discordant_read_fraction_in_concordant_sites_otherDonor':discordant_read_fraction_in_concordant_sites_otherDonor, \ + # 'discordant_read_fraction_in_discordant_sites_otherDonor':discordant_read_fraction_in_discordant_sites_otherDonor, \ + 'concordant_reads_For_discordant_sites_that_are_Concordant_with_other_donor':concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor + }) + + + #here now we want to see overall how many reads potentially come from different cohorts. + # cohort_specific_site_quant_string="" + # cohort_specific_read_quant_string="" + + Whithin_Cohort__total_number_of_potential_contaminent_reads=0 + Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool=0 + Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool=0 + try: + Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool = len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) + # for k1 in total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys(): + # Whithin_Cohort__total_number_of_potential_contaminent_reads+= max(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'][k1]) + except: + _='Doesnt Exist' + + Out_of_Cohort__total_number_of_potential_contaminent_reads=0 + try: + Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool = len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) + # for k1 in total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys(): + # Out_of_Cohort__total_number_of_potential_contaminent_reads+= max(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'][k1]) + except: + _='Doesnt Exist' - def read_concordance_calc(self,expected_vars,cell_vars): - - # This is a wrapper to add up the discordant reads in the cellsnp file. + + + + + + + + # No het total calcs + + # Since we can not tell just from the cellsnp file if the discordant read at concordant site is actually concordant with another donor in a pool + # we assume the worse case scenario and add idscordant reads at concordant sites to the total. + # Number of reads on the sites that are concordant, but has discordant reads with another donor + Whithin_Cohort__total_number_of_potential_contaminent_reads=0 + try: + concordant_sites_atributed_to_other_donor = list(set(sites_contributing_to_Whithin_Cohort)) + if not total_sites >= len(concordant_sites_atributed_to_other_donor): + raise Exception("Sorry, the number of sites atributed can not be larger than total sites") - # expected genotype 0/0 - expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] - hom_ref_sites = set(expected_hom_ref['ids']) - cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)] - ad_hom_ref = cell_vars2['AD'].sum() - oth_hom_ref = cell_vars2['OTH'].sum() - discordant_hom_ref = ad_hom_ref + oth_hom_ref + _,_,_,\ + _,Whithin_Cohort__total_number_of_potential_contaminent_reads, \ + _= self.read_extraction(concordant_sites_atributed_to_other_donor,expected_vars_norm,cell_vars_norm) - # expected genotype 0/1 or 1/0 - hets = ['0/1', '1/0'] - expected_het = expected_vars[expected_vars['vars'].isin(hets)] - het_sites = set(expected_het['ids']) - cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] - discordant_het = cell_vars3['OTH'].sum() + if not discordant_reads >= len(Whithin_Cohort__total_number_of_potential_contaminent_reads): + raise Exception("Total discordant reads can not be less than within cohort discordant reads.") - # expected genotype 1/1 - expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] - hom_alt_sites = set(expected_hom_alt['ids']) - cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] - # DP + OTH - AD - ad_hom_alt = cell_vars4['AD'].sum() - dp_hom_alt = cell_vars4['DP'].sum() - oth_hom_alt = cell_vars4['OTH'].sum() - discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt - # Total analysis - discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt - total_dp = cell_vars['DP'].sum() - total_oth = cell_vars['OTH'].sum() - total_reads = total_dp + total_oth - discordantRead_ratio = discordant_reads/total_reads + except: + _='Doesnt Exist' + + + Out_of_Cohort__total_number_of_potential_contaminent_reads=0 + try: + concordant_sites_atributed_to_other_donor = list(set(sites_contributing_to_out_of_cohort)) + if not total_sites >= len(concordant_sites_atributed_to_other_donor): + raise Exception("Sorry, the number of sites atributed can not be larger than total sites") - # No het analysis - they are not discordant or concordant - we cannot make any call at those sites - discordant_reads_noHet = discordant_hom_ref + discordant_hom_alt - total_reads_noHet = total_reads - discordant_het - discordantRead_ratio_noHet = discordant_reads_noHet/total_reads_noHet + _,_,_,_,Out_of_Cohort__total_number_of_potential_contaminent_reads, _= self.read_extraction(concordant_sites_atributed_to_other_donor,expected_vars_norm,cell_vars_norm) - return total_reads,total_dp,total_oth,discordant_reads,discordantRead_ratio,discordant_reads_noHet,total_reads_noHet,discordantRead_ratio_noHet - - def read_condordance(self, expected_vars, cell_vars,discordant_vars, concordant_vars): - ''' - get read level concordance using DP, AD and OTH format fields - ##FORMAT= - ##FORMAT= - ##FORMAT= - ''' - if not len(expected_vars) == len(cell_vars): - print("length mismatch between expected vars and cell vars") - exit(1) + if not discordant_reads >= len(Out_of_Cohort__total_number_of_potential_contaminent_reads): + raise Exception("Total discordant reads can not be less than out of cohort discordant reads.") - total_sites = len(expected_vars) - #add cols for DP, AD< OTH - cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) - cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) - cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) + except: + _='Doesnt Exist' + + + try: + Out_of_Cohort__sites = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) + except: + Out_of_Cohort__sites = set() + + try: + Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) - set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) + shared_sites_between_out_of_cohort_and_within_cohort = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()).intersection(set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) ) + except: + Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() + shared_sites_between_out_of_cohort_and_within_cohort = set() + + try: + Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) - set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) + except: + Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) + Out_of_Cohort__unique_sites_total_reads,_,_,_,_,_ = self.read_extraction(Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool,expected_vars_norm,cell_vars_norm) + + try: + Whithin_Cohort__sites = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) + except: + Whithin_Cohort__sites = set() + + + + Total__discordant_sites_that_are_concordant_with_other_donors_in_pool = Whithin_Cohort__sites.union(Out_of_Cohort__sites) + concordant_vars_in_pool_str = (";").join(concordant_vars) + DF = pd.DataFrame(donor_table_of_concordances) - # Total - total_reads,total_dp,total_oth,discordant_reads, \ - discordantRead_ratio,discordant_reads_noHet,total_reads_noHet,discordantRead_ratio_noHet = self.read_concordance_calc(expected_vars,cell_vars) - - # uninformative - cell_vars_uninformative = cell_vars[cell_vars['ids'].isin(self.uninformative_sites)] - total_reads_uninformative,total_dp_uninformative,total_oth_uninformative,discordant_reads_uninformative, \ - discordantRead_ratio_uninformative,discordant_reads_uninformative_noHet,total_reads_uninformative_noHet,discordantRead_ratio_uninformative_noHet= self.read_concordance_calc(expected_vars,cell_vars_uninformative) - - # informative - cell_vars_informative = cell_vars[cell_vars['ids'].isin(self.informative_sites)] - total_reads_informative,total_dp_informative,total_oth_informative,discordant_reads_informative, \ - discordantRead_ratio_informative,discordant_reads_informative_noHet,total_reads_informative_noHet,discordantRead_ratio_informative_noHet = self.read_concordance_calc(expected_vars,cell_vars_informative) - - # Split into concordant and discordant sites - # concordant - concordant_sites = cell_vars[cell_vars['ids'].isin(set(concordant_vars))] - total_reads_for_concordant_sites,total_dp_for_concordant_sites,total_oth_for_concordant_sites,discordant_reads_for_concordant_sites, \ - discordantRead_ratio_for_concordant_sites,discordant_reads_for_concordant_sites_noHet,total_reads_for_concordant_sites_noHet,discordantRead_ratio_for_concordant_sites_noHet = self.read_concordance_calc(expected_vars,concordant_sites) + Donor_With_Lowest_DisConcordance = ';'.join(DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['donor'].values) + Lowest_Disconcordance_value_in_all_donors= DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['discordant_percent_in_other_donor'].values[0] - # discordant - discordant_sites = cell_vars[cell_vars['ids'].isin(set(discordant_vars))] - total_reads_for_discconcordant_sites,total_dp_for_discconcordant_sites,total_oth_for_discconcordant_sites,discordant_reads_for_discconcordant_sites, \ - discordantRead_ratio_for_discconcordant_sites,discordant_reads_for_discconcordant_sites_noHet,total_reads_for_discconcordant_sites_noHet,discordantRead_ratio_for_discconcordant_sites_noHet = self.read_concordance_calc(expected_vars,discordant_sites) - - # Subset analysis - cell_vars_informative_subset = cell_vars[cell_vars['ids'].isin(self.informative_subset)] - total_reads_informative_subset,total_dp_informative_subset,total_oth_informative_subset,discordant_reads_informative_subset, \ - discordantRead_ratio_informative_subset,discordant_reads_informative_subset_noHet,total_reads_informative_subset_noHet,discordantRead_ratio_informative_subset_noHet = self.read_concordance_calc(expected_vars,cell_vars_informative_subset) - - if mode == 'no_het': - total_reads = total_reads_noHet - discordant_reads = discordant_reads_noHet - total_reads_informative = total_reads_informative_noHet - discordant_reads_informative = discordant_reads_informative_noHet - total_reads_uninformative =total_reads_uninformative_noHet - discordant_reads_uninformative = discordant_reads_uninformative_noHet - total_reads_informative_subset = total_reads_informative_subset_noHet - discordant_reads_informative_subset = discordant_reads_informative_subset_noHet - total_reads_for_concordant_sites = total_reads_for_concordant_sites_noHet - discordant_reads_for_concordant_sites = discordant_reads_for_concordant_sites_noHet - total_reads_for_discconcordant_sites = total_reads_for_discconcordant_sites_noHet - discordant_reads_for_discconcordant_sites = discordant_reads_for_discconcordant_sites_noHet - - return total_sites, \ - self.informative_covered, \ - self.uninformative_covered, \ - total_reads, \ - discordant_reads, \ - total_reads_informative, \ - discordant_reads_informative, \ - total_reads_uninformative, \ - discordant_reads_uninformative, \ - total_reads_informative_subset, \ - discordant_reads_informative_subset, \ - total_reads_for_concordant_sites, \ - discordant_reads_for_concordant_sites, \ - total_reads_for_discconcordant_sites, \ - discordant_reads_for_discconcordant_sites - - - - def get_discordance(self,expected_vars2,cell_vars2): - Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) - Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) - disc = pd.DataFrame(Discordant_sites,columns=['combo_x']) - df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') - disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') - disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] - disc_sites = ';'.join(disc2['expected_retrieved']) - return Concordant_Sites,Discordant_sites,disc_sites - - - def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars,donor_cohort=False): - # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. - # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. - # Author: M.Ozols - - cell_vars_norm = self.norm_genotypes(cell_vars) - - if len(cell_vars_norm) > 0: - - if mode == 'no_het': - hets = ['0/1', '1/0'] - expected_vars_norm = expected_vars_norm[~expected_vars_norm['vars'].isin(hets)] + Donor_With_Highest_Concordance = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['donor'].values) + Highest_Concordance_value_in_all_donors= DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['concordant_percent_in_other_donor'].values[0] + Total_sites_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_sites_otherDonor'].astype(str).values) + Total_reads_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_reads_otherDonor'].astype(str).values) + + return [{ + 'cell1':cell1, + 'donor_gt_match':donor_gt_match, + 'Nr_Concordant':Nr_Concordant, + 'Nr_Discordant':Nr_Discordant, + 'Nr_Relaxed_concordant':Nr_Relaxed_concordant, + 'true_discordant_count':true_discordant_count, + 'relaxed_concordant_informative_count':relaxed_concordant_informative_count, + 'relaxed_concordant_uninformative_count':relaxed_concordant_uninformative_count, + 'true_discordant_informative_count':true_discordant_informative_count, + 'true_discordant_uninformative_count':true_discordant_uninformative_count, + 'Nr_Total_Overlapping_sites':Nr_Total_Overlapping_sites, + 'Number_of_sites_that_are_donor_concordant_and_exclusive':Number_of_sites_that_are_donor_concordant_and_exclusive, + 'Nr_donor_distinct_sites':Nr_donor_distinct_sites, + 'count':count, + 'discordant_sites':discordant_sites, + 'total_sites':total_sites, + 'informative_sites':informative_sites, + 'uninformative_sites':uninformative_sites, + 'total_reads_informative':total_reads_informative, + 'total_reads_uninformative':total_reads_uninformative, + 'discordant_reads_informative':discordant_reads_informative, + 'discordant_reads_uninformative':discordant_reads_uninformative, + 'Discordant_sites_in_pool': discordant_vars, + 'Lowest_Disconcordance_value_in_all_donors':Lowest_Disconcordance_value_in_all_donors, + 'Donor_With_Lowest_DisConcordance':Donor_With_Lowest_DisConcordance, + 'Concordant_Site_Identities':concordant_vars_in_pool_str, + 'Donor_With_Highest_Concordance':Donor_With_Highest_Concordance, + 'Highest_Concordance_value_in_all_donors':Highest_Concordance_value_in_all_donors, + 'Total_sites_other_donor':Total_sites_other_donor, + 'Total_reads_other_donor':Total_reads_other_donor, + 'total_discordant_sites_that_are_concordant_with_other_donors_in_pool':f"{len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool)}/{len(discordant_vars)}", + 'informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool':f"{len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool)}/{len(true_discordant_informative_count)}", + 'total_reads':total_reads, + 'discordant_reads':discordant_reads, + 'discordant_reads_informative_subset':discordant_reads_informative_subset, \ + 'discordant_read_fraction_in_concordant_sites':discordant_read_fraction_in_concordant_sites, \ + 'discordant_read_fraction_in_discordant_sites':discordant_read_fraction_in_discordant_sites, \ + 'Whithin_Cohort__total_number_of_potential_contaminent_reads':Whithin_Cohort__total_number_of_potential_contaminent_reads, \ + 'Out_of_Cohort__total_number_of_potential_contaminent_reads':Out_of_Cohort__total_number_of_potential_contaminent_reads, \ + 'NrDonors_contributing_to_out_of_cohort':len(set(donors_contributing_to_out_of_cohort)), \ + 'NrDonors_contributing_to_Whithin_Cohort':len(set(donors_contributing_to_Whithin_Cohort)), \ - Total_Overlapping_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) - expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] - cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] - cell_vars2 = cell_vars2.drop_duplicates(subset=['ids']) - expected_vars2 = expected_vars2.drop_duplicates(subset=['ids']) - if len(cell_vars2)!=len(expected_vars2): - print('failed to select comon variants') - exit(1) - - # Find exact discordant sites - Concordant_Sites, Discordant_sites, _ = self.get_discordance(expected_vars2, cell_vars2) - #find truly discordant sites - true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, discordant_vars, concordant_vars, disc_sites_string = self.get_strict_discordance(expected_vars2, cell_vars2) - #find discordant reads - total_sites, \ - informative_sites, \ - uninformative_sites, \ - total_reads, \ - discordant_reads, \ - total_reads_informative, \ - discordant_reads_informative, \ - total_reads_uninformative, \ - discordant_reads_uninformative, \ - total_reads_informative_subset, \ - discordant_reads_informative_subset, \ - total_reads_for_concordant_sites, \ - discordant_reads_for_concordant_sites, \ - total_reads_for_discconcordant_sites, \ - discordant_reads_for_discconcordant_sites = self.read_condordance(expected_vars2, cell_vars2, discordant_vars, concordant_vars) - - discordant_read_fraction_in_concordant_sites = f"{discordant_reads_for_concordant_sites}/{total_reads_for_concordant_sites}" - discordant_read_fraction_in_discordant_sites = f"{discordant_reads_for_discconcordant_sites}/{total_reads_for_discconcordant_sites}" - discordant_reads_uninformative_fraction = f"{discordant_reads_uninformative}/{total_reads_uninformative}" - discordant_reads_informative_fraction = f"{discordant_reads_informative}/{total_reads_informative}" - - # sanity checks - if total_reads != total_reads_for_concordant_sites+total_reads_for_discconcordant_sites: - print("Error: total reads dont add up ") - exit(1) - if discordant_reads != discordant_reads_for_concordant_sites+discordant_reads_for_discconcordant_sites: - print("Error: discordant reads dont add up ") - exit(1) - - - else: - Total_Overlapping_sites = set() - Concordant_Sites = set() - Discordant_sites = set() - disc_sites = '' - true_discordant_count = 0 - relaxed_concordant_count = 0 - total_sites = 0 - relaxed_concordant_informative_count = 0 - relaxed_concordant_uninformative_count = 0 - true_discordant_informative_count = 0 - true_discordant_uninformative_count = 0 - informative_sites = 0 - disc_sites_string = '' - discordant_reads = 0 - uninformative_sites = 0 - total_reads = 0 - total_reads_informative =0 - total_reads_uninformative=0 - discordant_reads=0 - discordant_reads_informative=0 - discordant_reads_uninformative=0 - discordant_vars=0 - concordant_vars=0 - discordant_read_fraction_in_concordant_sites=0 - discordant_read_fraction_in_discordant_sites=0 - discordant_reads_uninformative_fraction=0 - discordant_reads_informative_fraction=0 - discordant_reads_informative_subset=0 - - return Concordant_Sites, \ - Discordant_sites, \ - Total_Overlapping_sites, \ - disc_sites_string, \ - cell_vars_norm, \ - true_discordant_count, \ - relaxed_concordant_count, \ - relaxed_concordant_informative_count, \ - relaxed_concordant_uninformative_count, \ - true_discordant_informative_count, \ - true_discordant_uninformative_count, \ - total_sites, \ - informative_sites, \ - uninformative_sites, \ - total_reads, \ - total_reads_informative, \ - total_reads_uninformative, \ - discordant_reads, \ - discordant_reads_informative, \ - discordant_reads_uninformative, \ - discordant_vars, \ - concordant_vars, \ - discordant_read_fraction_in_concordant_sites, \ - discordant_read_fraction_in_discordant_sites, \ - discordant_reads_uninformative_fraction, \ - discordant_reads_informative_fraction, \ - discordant_reads_informative_subset - + 'Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool, \ + 'Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool, \ + 'shared_sites_between_Out_of_Cohort_and_Within_Cohort':len(shared_sites_between_out_of_cohort_and_within_cohort), \ + 'Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool), \ + 'Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool), \ + 'Out_of_Cohort__unique_sites_total_reads':Out_of_Cohort__unique_sites_total_reads, \ + 'Total__discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Total__discordant_sites_that_are_concordant_with_other_donors_in_pool) + }, donor_table_of_concordances] - - def set_results(self,to_set,id): - # Recod to disk to save the loading mmeory time. - hash = random.getrandbits(128) - with open(f'tmp_{id}_{hash}.pkl', 'wb') as f: - pickle.dump(to_set, f) - self.record_dict[id]=f'tmp_{id}_{hash}.pkl' - return - # def append_results_cell_concordances(self,result): - def append_results_cell_concordances(self,result,cell_concordance_table,other_donor_concordances,other_donor_concordance_table): - other_donor_concordance_table = other_donor_concordance_table + other_donor_concordances - count=result['count'] - try: - percent_concordant = result['Nr_Concordant']/(result['Nr_Discordant']+result['Nr_Concordant'])*100 - except: - percent_concordant = 0 - - try: - percent_discordant = result['Nr_Discordant']/(result['Nr_Discordant']+result['Nr_Concordant'])*100 - except: - percent_discordant = 0 - try: - percent_relaxed_concordant = result['Nr_Relaxed_concordant']/(result['Nr_Relaxed_concordant']+result['true_discordant_count'])*100 - except: - percent_relaxed_concordant = 0 - - try: - percent_strict_discordant = result['true_discordant_count']/(result['Nr_Relaxed_concordant']+result['true_discordant_count'])*100 - except: - percent_strict_discordant = 0 + def norm_genotypes(self,expected_vars): + expected_vars = pd.DataFrame(expected_vars) + if len(expected_vars) > 0: + split_str=expected_vars[0].str.split("_") + expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] + expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] + expected_vars['vars'] = split_str.str[4] + expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) + expected_vars = expected_vars[expected_vars['vars']!='./.'] + expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' + expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] + return expected_vars - try: - read_discordance = result['discordant_reads']/result['total_sites'] - except: - read_discordance = 0 - cohort = 'UNKNOWN' - donor_split = result['donor_gt_match'].split("_") - if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): - cohort = 'UKB' - elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): - cohort = 'ELGH' - same_as_asigned_donor = result['donor_gt_match'] in result['Donor_With_Highest_Concordance'] - if not same_as_asigned_donor: - same_as_asigned_donor = result['donor_gt_match'] in result['Donor_With_Lowest_DisConcordance'] - - cell_concordance_table[f"{result['cell1']} --- {result['donor_gt_match']}"] = {'GT 1':result['cell1'], - 'GT 2':result['donor_gt_match'], - 'cohort': cohort, - - # 'Nr_Concordant':result['Nr_Concordant'], - # 'Nr_Discordant':result['Nr_Discordant'], - 'Nr_Relaxed_concordant':result['Nr_Relaxed_concordant'], - 'Nr_strict_discordant':result['true_discordant_count'], - # 'Percent Concordant':percent_concordant, - # 'Percent Discordant':percent_discordant, - 'Percent_relaxed_concordant': percent_relaxed_concordant, - 'Percent_strict_discordant': percent_strict_discordant, - 'Nr_concordant_informative': len(result['relaxed_concordant_informative_count']), - 'Nr_concordant_uninformative': len(result['relaxed_concordant_uninformative_count']), - 'Nr_discordant_informative': len(result['true_discordant_informative_count']), - 'Nr_discordant_uninformative': len(result['true_discordant_uninformative_count']), - 'NrTotal_Overlapping_sites_between_two_genotypes':result['Nr_Total_Overlapping_sites'], - 'Nr_donor_distinct_sites_within_pool_individuals':result['Nr_donor_distinct_sites'], - 'Number_of_sites_that_are_donor_concordant_and_exclusive':result['Number_of_sites_that_are_donor_concordant_and_exclusive'], - 'Total_sites': result['total_sites'], - 'Total_informative_sites': result['informative_sites'], - 'Total_uninformative_sites': result['uninformative_sites'], - 'Total_reads': result['total_reads'], - 'Total_reads_informative': result['total_reads_informative'], - 'Total_reads_uninformative': result['total_reads_uninformative'], - 'Discordant_reads': result['discordant_reads'], - 'Discordant_reads_informtive': result['discordant_reads_informative'], - 'Discordant_reads_uninformtive': result['discordant_reads_uninformative'], - 'discordant_reads_informative_subset':result['discordant_reads_informative_subset'], - 'Discordant_reads_by_n_sites': read_discordance, - 'Discordant_sites_in_pool': len(result['Discordant_sites_in_pool']), - 'Lowest_Disconcordance_value_in_all_donors':result['Lowest_Disconcordance_value_in_all_donors'], - 'Donor_With_Lowest_DisConcordance':result['Donor_With_Lowest_DisConcordance'], - 'Concordant_Site_Identities':result['Concordant_Site_Identities'], - 'Donor_With_Highest_Concordance':result['Donor_With_Highest_Concordance'], - 'Highest_Concordance_value_in_all_donors':result['Highest_Concordance_value_in_all_donors'], - 'same_as_asigned_donor':same_as_asigned_donor, - 'Total_sites_other_donor (if same_as_asigned_donor=False)':result['Total_sites_other_donor'], - 'Total_reads_other_donor (if same_as_asigned_donor=False)':result['Total_reads_other_donor'], - 'total_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['total_discordant_sites_that_are_concordant_with_other_donors_in_pool'], - 'discordant_read_fraction_in_concordant_site':result['discordant_read_fraction_in_concordant_sites'], - 'discordant_read_fraction_in_discordant_sites':result['discordant_read_fraction_in_discordant_sites'], - 'Whithin_Cohort__total_number_of_potential_contaminent_reads':result['Whithin_Cohort__total_number_of_potential_contaminent_reads'], - 'Out_of_Cohort__total_number_of_potential_contaminent_reads':result['Out_of_Cohort__total_number_of_potential_contaminent_reads'], - 'NrDonors_contributing_to_out_of_cohort':result['NrDonors_contributing_to_out_of_cohort'], - 'NrDonors_contributing_to_Whithin_Cohort':result['NrDonors_contributing_to_Whithin_Cohort'], - - 'Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool'], - 'Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool'], - 'shared_sites_between_Out_of_Cohort_and_Within_Cohort':result['shared_sites_between_Out_of_Cohort_and_Within_Cohort'], - 'Within_Cohort__unique_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool'], - 'Out_of_Cohort__unique_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool'], - 'Out_of_Cohort__unique_sites_total_reads':result['Out_of_Cohort__unique_sites_total_reads'], - 'Total__discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Total__discordant_sites_that_are_concordant_with_other_donors_in_pool'] - } - - return [cell_concordance_table,other_donor_concordance_table] - - # def combine_written_files(self):#this one is for concordance class - # to_export = self.cell_concordance_table - # for val1 in self.record_dict.values(): - # # here remove the int files. - # print(f"merging temp file: {val1}") - # with open(val1, 'rb') as f: - # loaded_dict = pickle.load(f) - # for k1 in loaded_dict.keys(): - # to_export[k1]=loaded_dict[k1] - # os.remove(val1) - # return to_export - - - def combine_written_lists(self,exclusive_donor_variants,record_dict):#this is for VCF loader class - to_export = exclusive_donor_variants - for val1 in record_dict.values(): - # here remove the int files. - print(f"merging temp file: {val1}") - with open(val1, 'rb') as f: - loaded_dict = pickle.load(f) - self.other_donor_comp = self.other_donor_comp+ loaded_dict - os.remove(val1) - return self.other_donor_comp - - def combine_written_files(self,exclusive_donor_variants,record_dict):#this is for VCF loader class - to_export = exclusive_donor_variants - for val1 in record_dict.values(): - # here remove the int files. - print(f"merging temp file: {val1}") - with open(val1, 'rb') as f: - loaded_dict = pickle.load(f) - for k1 in loaded_dict.keys(): - try: - to_export[k1]=to_export[k1].union(loaded_dict[k1]) - except: - to_export[k1]=set() - to_export[k1]=to_export[k1].union(loaded_dict[k1]) - os.remove(val1) - return to_export - - def set_results(self,to_set,id): - # Recod to disk to save the loading mmeory time. - hash = random.getrandbits(128) - with open(f'tmp_{id}_{hash}.pkl', 'wb') as f: - pickle.dump(to_set, f) - self.record_dict[id]=f'tmp_{id}_{hash}.pkl' - - def analyse_donor(self,analyse_donor,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm,donor_assignments_table): - donor_concordance_table = {} - other_donor_concordance_table = [] - for cell1 in Cells_to_keep_pre: - count+=1 + def get_strict_discordance(self, expected_vars, cell_vars): + ''' + take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant + sites and relaxed concordant sites and list of discordant sites1 + 1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype + 2) if you have a 0/0 you can not get a 1/1 or 0/1 + 3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 1/1 + So - each obversed cellsnp allele must be in the array SNP gtype + ''' + snp_gtypes = expected_vars[0] + cellsnp_gtypes = cell_vars[0] + true_discordant = 0 + relaxed_concordant = 0 + relaxed_concordant_informative = 0 + relaxed_concordant_informative_ids = [] + relaxed_concordant_uninformative_ids = [] + true_discordant_uninformative_ids =[] + true_discordant_informative_ids=[] + relaxed_concordant_uninformative = 0 + true_discordant_informative = 0 + true_discordant_uninformative = 0 + discordant_vars = [] + concordant_vars = [] + subset_informative_concordant = 0 + subset_informative_discordant = 0 - cell_vars = exclusive_cell_variants[cell1] - try: - result1, other_donor_concordances = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data,donor_assignments_table) - cell_concordance_table,other_donor_concordance_table = self.append_results_cell_concordances(result1,donor_concordance_table,other_donor_concordances,other_donor_concordance_table) - except: - continue - # if count>800: - # break - # here we should write these independently to the files - if (count % 50 == 0): - self.set_results(other_donor_concordance_table,f"{count}--{donor_gt_match}") - other_donor_concordance_table = [] - + #print(self.uninformative_sites) + #print(self.informative_sites) - self.set_results(other_donor_concordance_table,f"{count}--{donor_gt_match}") - output2 = self.combine_written_lists(self.other_donor_comp,self.record_dict) - pd.DataFrame(output2).sort_values(by=['cell']).to_csv(f'{donor_gt_match}-{analyse_donor}--each_cells_comparison_with_other_donor.tsv',sep='\t',index=False) - # del output2 - self.record_dict={} - return donor_concordance_table + #create sets of the ids (chrom, pos, ref, alt) in each set of genotypes. Filter to the ids present in both + #then filter to informative and uninformative. If uninformative >0 then create a subset of informative + # with the same number of vars (at random) + split_snp_gts=snp_gtypes.str.split("_") + snp_gtypes_ids = set(split_snp_gts.str[0]+'_'+split_snp_gts.str[1]+'_'+split_snp_gts.str[2]+'_'+split_snp_gts.str[3]) - def combine_concordances(self,result): - - self.cell_concordance_table = {**self.cell_concordance_table, **result} - + split_cellsnp_gts=cellsnp_gtypes.str.split("_") + cellsnp_gtypes_ids = set(split_cellsnp_gts.str[0]+'_'+split_cellsnp_gts.str[1]+'_'+split_cellsnp_gts.str[2]+'_'+split_cellsnp_gts.str[3]) - def conc_table(self): - donor_assignments_table=self.donor_assignments_table - cell_assignments_table=self.cell_assignments_table - exclusive_don_variants=self.exclusive_don_variants - exclusive_cell_variants= self.exclusive_cell_variants - donor_list = exclusive_don_variants.keys() - # cpus = 1 - pool = mp.Pool(cpus) - count = 0 - - - #create a list of variants that are on each donor genotype file - vars_per_donor_gt = {} - for don_id in donor_list: - donor_gt_vars = list(exclusive_don_variants[don_id]) - donor_gt_vars = pd.DataFrame(donor_gt_vars) - donor_gt_vars = self.norm_genotypes(donor_gt_vars) - donor_gt_vars = donor_gt_vars[donor_gt_vars['vars'] != '0/0'] - donor_gt_varids = list(donor_gt_vars['ids']) - vars_per_donor_gt[don_id] = donor_gt_varids - - #work out what cohort each donor belongs to - donor_cohorts = {} - for don_id in donor_list: - cohort = 'UNKNOWN' - donor_split = don_id.split("_") - if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): - cohort = 'UKB' - elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): - cohort = 'ELGH' - donor_cohorts[don_id] = cohort + shared_gts = snp_gtypes_ids.intersection(cellsnp_gtypes_ids) - all_donor_data={} - # here we calvculate all the expected donor datasets - for row1 in exclusive_don_variants.keys(): - # donor_in_question = row1['donor_query'] - donor_gt_match = row1 - expected_vars_of_other_donor = self.exclusive_don_variants[donor_gt_match] - expected_vars_norm_of_other_donor = self.norm_genotypes(expected_vars_of_other_donor) - all_donor_data[donor_gt_match]=expected_vars_norm_of_other_donor + shared_informative = shared_gts.intersection(self.informative_sites) + shared_uninformative = shared_gts.intersection(self.uninformative_sites) + # print("shared informative " + str(len(shared_informative))) + # print("shared uninformative " + str(len(shared_uninformative))) - results = [] - for i,row1 in donor_assignments_table.iterrows(): - donor_in_question = row1['donor_query'] - donor_gt_match = row1['donor_gt'] - print(donor_gt_match) - # if i>4: - # continue - if (donor_gt_match=='NONE'): - continue - try: - donor_gt_match_cohort = donor_cohorts[donor_gt_match] - except: - continue - Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell'])) - expected_vars = exclusive_don_variants[donor_gt_match] - expected_vars_norm = self.norm_genotypes(expected_vars) - try: - # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor. - dds = self.donor_distinct_sites[donor_gt_match] - except: - continue - if cpus==1: - result = self.analyse_donor(donor_in_question,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm,donor_assignments_table) - self.combine_concordances(result) + #store the numbers of informative and uninformative sites shared between cellSNP and gt data as these + #are the sites used for concordance + self.informative_covered = len(shared_informative) + self.uninformative_covered = len(shared_uninformative) + + if len(shared_uninformative) > 0: + #print(len(shared_uninformative)) + # print(len(shared_informative)) + if len(shared_uninformative) <= len(shared_informative): + informative_subset = set(random.sample(shared_informative, len(shared_uninformative))) else: + informative_subset = set()#if there are more shared uninformative than shared informative we will not subset + # print(informative_subset) + # exit(0) + else: + informative_subset = set() - result = pool.apply_async(self.analyse_donor, args=([donor_in_question,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm,donor_assignments_table]),callback=self.combine_concordances) - results.append(result) - try: - [result.wait() for result in results] - except: - print('done') - print('Done') - pool.close() - pool.join() - - - - # output = self.combine_written_files(self.cell_concordance_table,self.record_dict) - - return self.cell_concordance_table + # print(informative_subset) + self.informative_subset = informative_subset - + snp_gtypes_set = set(snp_gtypes) + snp_gtypes_set = sorted(snp_gtypes_set) - def read_extraction(self,DonorDiscordant_Sites_that_are_atributed_to_other_donor,expected_vars_norm,cell_vars_norm): - # we need this function wrapper to calculate the concordant, discordant read - # counts for each of the discordant sites that are concordant with another donor. - - Total_Overlapping_sites = set(DonorDiscordant_Sites_that_are_atributed_to_other_donor) - expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] - cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] - - cell_vars2 = cell_vars2.drop_duplicates(subset=['ids']) - expected_vars2 = expected_vars2.drop_duplicates(subset=['ids']) - cell_vars2['DP'] = cell_vars2[0].str.split("_").str[5].astype(int) - cell_vars2['AD'] = cell_vars2[0].str.split("_").str[6].astype(int) - cell_vars2['OTH'] = cell_vars2[0].str.split("_").str[7].astype(int) - total_reads,_,_,discordant_reads, \ - discordantRead_ratio,discordant_reads_noHet,total_reads_noHet,discordantRead_ratio_noHet= self.read_concordance_calc(expected_vars2,cell_vars2) - concordant_reads = total_reads - discordant_reads - concordant_reads_noHet = total_reads_noHet-discordant_reads_noHet - return total_reads,discordant_reads,concordant_reads,\ - total_reads_noHet,discordant_reads_noHet,concordant_reads_noHet - - def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match, donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data,donor_assignments_table): + cellsnp_gtypes_set = set(cellsnp_gtypes) + cellsnp_gtypes_set = sorted(cellsnp_gtypes_set) - Concordant_Sites, \ - Discordant_sites, \ - Total_Overlapping_sites, \ - discordant_sites, \ - cell_vars_norm, \ - true_discordant_count, \ - relaxed_concordant_count, \ - relaxed_concordant_informative_count, \ - relaxed_concordant_uninformative_count, \ - true_discordant_informative_count, \ - true_discordant_uninformative_count, \ - total_sites, \ - informative_sites, \ - uninformative_sites, \ - total_reads, \ - total_reads_informative, \ - total_reads_uninformative, \ - discordant_reads, \ - discordant_reads_informative, \ - discordant_reads_uninformative, \ - discordant_vars, \ - concordant_vars, \ - discordant_read_fraction_in_concordant_sites, \ - discordant_read_fraction_in_discordant_sites, \ - discordant_reads_uninformative_fraction, \ - discordant_reads_informative_fraction, discordant_reads_informative_subset = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) + #for i in range(0, len(snp_gtypes)): + # cellsnp_gtypes_str = pd.DataFrame(cellsnp_gtypes_set,columns=['cellsnp_gtypes']) + # cellsnp_gtypes_str['snp_gtypes'] = pd.DataFrame(snp_gtypes_set,columns=['snp_gtypes'])['snp_gtypes'] + for i in range(0, len(snp_gtypes_set)): + discordant = False + # snp_data = snp_gtypes[i].split('_') + # cellsnp_data = cellsnp_gtypes[i].split('_') + snp_data = snp_gtypes_set[i].split('_') + cellsnp_data = cellsnp_gtypes_set[i].split('_') + + snp_alleles_set = set([snp_data[4][0], snp_data[4][2]]) + cellsnp_alleles_set = set([cellsnp_data[4][0], cellsnp_data[4][2]]) + + snp_var = ('_').join(snp_data[0:4]) + cellsnp_var = ('_').join(cellsnp_data[0:4]) + + if not cellsnp_var == snp_var: + print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i]) + exit(1) + else: + for allele in cellsnp_alleles_set: + if not allele in snp_alleles_set:#if a cellSNP allele is found that is not in the array data this is discordant + discordant = True + + if discordant == True: + true_discordant+=1 + discordant_vars.append(cellsnp_var) + if snp_var in self.uninformative_sites: + true_discordant_uninformative+=1 + true_discordant_uninformative_ids.append(snp_var) + elif snp_var in self.informative_sites: + true_discordant_informative+=1 + true_discordant_informative_ids.append(snp_var) + else: + relaxed_concordant+=1 + concordant_vars.append(cellsnp_var) + if snp_var in self.uninformative_sites: + relaxed_concordant_uninformative+=1 + relaxed_concordant_uninformative_ids.append(snp_var) + elif snp_var in self.informative_sites: + relaxed_concordant_informative+=1 + relaxed_concordant_informative_ids.append(snp_var) + if len(shared_uninformative) > 0: + if snp_var in informative_subset: + if discordant == True: + subset_informative_discordant+=1 + else: + subset_informative_concordant+=1 + # true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count + cell_vars2 = cell_vars.set_index('ids') + disc = pd.DataFrame(set(cell_vars2.loc[discordant_vars]['combo']),columns=['combo_x']) + df_cd = pd.merge(cell_vars, expected_vars, how='inner', on = 'pos') + disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') + disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] + disc_sites_string = ';'.join(disc2['expected_retrieved']) + return true_discordant, relaxed_concordant, relaxed_concordant_informative_ids, relaxed_concordant_uninformative_ids, true_discordant_informative_ids, true_discordant_uninformative_ids, discordant_vars, concordant_vars, disc_sites_string + + def read_concordance_calc(self,expected_vars,cell_vars): + + # This is a wrapper to add up the discordant reads in the cellsnp file. + + # expected genotype 0/0 + expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] + hom_ref_sites = set(expected_hom_ref['ids']) + cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)] + ad_hom_ref = cell_vars2['AD'].sum() + oth_hom_ref = cell_vars2['OTH'].sum() + discordant_hom_ref = ad_hom_ref + oth_hom_ref + + # expected genotype 0/1 or 1/0 + hets = ['0/1', '1/0'] + expected_het = expected_vars[expected_vars['vars'].isin(hets)] + het_sites = set(expected_het['ids']) + cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] + discordant_het = cell_vars3['OTH'].sum() + + # expected genotype 1/1 + expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] + hom_alt_sites = set(expected_hom_alt['ids']) + cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] + # DP + OTH - AD + ad_hom_alt = cell_vars4['AD'].sum() + dp_hom_alt = cell_vars4['DP'].sum() + oth_hom_alt = cell_vars4['OTH'].sum() + discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt - total_concordant_sites = len(Concordant_Sites) + relaxed_concordant_count - dds = self.donor_distinct_sites[donor_gt_match] - Nr_donor_distinct_sites = len(dds) - Nr_Concordant = len(Concordant_Sites) - Nr_Relaxed_concordant = relaxed_concordant_count - Nr_Discordant = len(Discordant_sites) - Nr_Total_Overlapping_sites = len(Total_Overlapping_sites) - Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Discordant_sites))) - Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) - #Quantify donor variation in other donors - discordant_vars_in_pool = [] - donor_table_of_concordances = [] - total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() - total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown = {} - total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort']={} - total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort']={} - - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown_totalReads = {} - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown_totalReads['Whithin_Cohort']={} - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown_totalReads['Out_of_Cohort']={} - - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown = {} - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort']={} - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort']={} - - - informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() - total_cordant_sites_that_are_concordant_with_other_donors_in_pool = set() - donor_gt_match_cohort = donor_cohorts[donor_gt_match] - donors_contributing_to_out_of_cohort= [] - donors_contributing_to_Whithin_Cohort=[] - sites_contributing_to_out_of_cohort= [] - sites_contributing_to_Whithin_Cohort=[] - for donor in set(donor_assignments_table['donor_gt']): - try: - expected_vars_norm_of_other_donor = all_donor_data[donor] - except: - continue - try: - donor_cohort = donor_cohorts[donor] - donor_vars = vars_per_donor_gt[donor] - except: - continue + # Total analysis + discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt + total_dp = cell_vars['DP'].sum() + total_oth = cell_vars['OTH'].sum() + total_reads = total_dp + total_oth + discordantRead_ratio = discordant_reads/total_reads - Concordant_Sites_otherDonor, \ - Discordant_sites_otherDonor, \ - Total_Overlapping_sites_otherDonor, \ - discordant_sites_otherDonor, \ - cell_vars_norm_otherDonor, \ - true_discordant_count_otherDonor, \ - relaxed_concordant_count_otherDonor, \ - relaxed_concordant_informative_count_otherDonor, \ - relaxed_concordant_uninformative_count_otherDonor, \ - true_discordant_informative_count_otherDonor, \ - true_discordant_uninformative_count_otherDonor, \ - total_sites_otherDonor, \ - informative_sites_otherDonor, \ - uninformative_sites_otherDonor, \ - total_reads_otherDonor, \ - total_reads_informative_otherDonor, \ - total_reads_uninformative_otherDonor, \ - discordant_reads_otherDonor, \ - discordant_reads_informative_otherDonor, \ - discordant_reads_uninformative_otherDonor, \ - discordant_vars_otherDonor, \ - concordant_vars_otherDonor, \ - discordant_read_fraction_in_concordant_sites_otherDonor, \ - discordant_read_fraction_in_discordant_sites_otherDonor, \ - discordant_reads_uninformative_fraction_otherDonor, \ - discordant_reads_informative_fraction_otherDonor, discordant_reads_informative_subset_otherDonor = self.retrieve_concordant_discordant_sites(expected_vars_norm_of_other_donor,cell_vars,donor_cohort=donor_cohort) + # No het analysis - they are not discordant or concordant - we cannot make any call at those sites + discordant_reads_noHet = discordant_hom_ref + discordant_hom_alt + total_reads_noHet = total_reads - discordant_het + discordantRead_ratio_noHet = discordant_reads_noHet/total_reads_noHet - if total_sites_otherDonor==0: - total_sites_otherDonor=0.000000000001 - total_concordant_sites_otherDonor = relaxed_concordant_count_otherDonor - concordant_percent_in_other_donor= total_concordant_sites_otherDonor/total_sites_otherDonor*100 - discordant_percent_in_other_donor= true_discordant_count_otherDonor/total_sites_otherDonor*100 - DonorDiscordant_Sites_that_are_atributed_to_other_donor = set(discordant_vars).intersection(set(concordant_vars_otherDonor)) - TotalSites_that_are_atributed_to_other_donor = set(discordant_vars).union(set(concordant_vars)).intersection(set(concordant_vars_otherDonor)) + return total_reads,total_dp,total_oth,discordant_reads,discordantRead_ratio,discordant_reads_noHet,total_reads_noHet,discordantRead_ratio_noHet - Informative__DonorDiscordant_Sites_that_are_atributed_to_other_donor = set(true_discordant_informative_count).intersection(set(relaxed_concordant_informative_count_otherDonor)) - DonorCordant_Sites_that_are_atributed_to_other_donor = set(concordant_vars).intersection(set(concordant_vars_otherDonor)) - - total_reads_for_discordant_sites_that_are_concordant_with_other_donor,discordant_reads_for_discordant_sites_that_are_concordant_with_other_donor,concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor,\ - _,_,_= self.read_extraction(DonorDiscordant_Sites_that_are_atributed_to_other_donor,expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) + def read_condordance(self, expected_vars, cell_vars,discordant_vars, concordant_vars): + ''' + get read level concordance using DP, AD and OTH format fields + ##FORMAT= + ##FORMAT= + ##FORMAT= + ''' + if not len(expected_vars) == len(cell_vars): + print("length mismatch between expected vars and cell vars") + exit(1) - if not donor == donor_gt_match: - # We want to kow how many of these discordant site - if 'U937' in donor: - continue - if 'THP1' in donor: - continue - - if donor_gt_match_cohort == donor_cohort: - coh = 'Whithin_Cohort' - sites_contributing_to_Whithin_Cohort.extend(list(TotalSites_that_are_atributed_to_other_donor)) - if len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)>0: - donors_contributing_to_Whithin_Cohort.append(donor) - - else: - coh = 'Out_of_Cohort' - sites_contributing_to_out_of_cohort.extend(list(TotalSites_that_are_atributed_to_other_donor)) - if len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)>0: - donors_contributing_to_out_of_cohort.append(donor) - - - total_discordant_sites_that_are_concordant_with_other_donors_in_pool = total_discordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(DonorDiscordant_Sites_that_are_atributed_to_other_donor)) - # now we addit for a cohort since the biggest issue comes from cohort cross-contamination - # for each of these sites now we calculate the number of reads that it accounts: - # tree level set: cohort: site: counts - for site in DonorDiscordant_Sites_that_are_atributed_to_other_donor: - total_reads_for_site,discordant_reads_for_site,concordant_for_site,\ - total_reads_noHet,discordant_reads_noHet,concordant_reads_noHet= self.read_extraction([site],expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) - if concordant_for_site==0: - pass - try: - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_reads_noHet) + total_sites = len(expected_vars) + #add cols for DP, AD< OTH + cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) + cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) + cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) + - except: - try: - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_reads_noHet) - - except: - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh]={} - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_reads_noHet) + + # Total + total_reads,total_dp,total_oth,discordant_reads, \ + discordantRead_ratio,discordant_reads_noHet,total_reads_noHet,discordantRead_ratio_noHet = self.read_concordance_calc(expected_vars,cell_vars) + + # uninformative + cell_vars_uninformative = cell_vars[cell_vars['ids'].isin(self.uninformative_sites)] + total_reads_uninformative,total_dp_uninformative,total_oth_uninformative,discordant_reads_uninformative, \ + discordantRead_ratio_uninformative,discordant_reads_uninformative_noHet,total_reads_uninformative_noHet,discordantRead_ratio_uninformative_noHet= self.read_concordance_calc(expected_vars,cell_vars_uninformative) + + # informative + cell_vars_informative = cell_vars[cell_vars['ids'].isin(self.informative_sites)] + total_reads_informative,total_dp_informative,total_oth_informative,discordant_reads_informative, \ + discordantRead_ratio_informative,discordant_reads_informative_noHet,total_reads_informative_noHet,discordantRead_ratio_informative_noHet = self.read_concordance_calc(expected_vars,cell_vars_informative) + + # Split into concordant and discordant sites + # concordant + concordant_sites = cell_vars[cell_vars['ids'].isin(set(concordant_vars))] + total_reads_for_concordant_sites,total_dp_for_concordant_sites,total_oth_for_concordant_sites,discordant_reads_for_concordant_sites, \ + discordantRead_ratio_for_concordant_sites,discordant_reads_for_concordant_sites_noHet,total_reads_for_concordant_sites_noHet,discordantRead_ratio_for_concordant_sites_noHet = self.read_concordance_calc(expected_vars,concordant_sites) + + # discordant + discordant_sites = cell_vars[cell_vars['ids'].isin(set(discordant_vars))] + total_reads_for_discconcordant_sites,total_dp_for_discconcordant_sites,total_oth_for_discconcordant_sites,discordant_reads_for_discconcordant_sites, \ + discordantRead_ratio_for_discconcordant_sites,discordant_reads_for_discconcordant_sites_noHet,total_reads_for_discconcordant_sites_noHet,discordantRead_ratio_for_discconcordant_sites_noHet = self.read_concordance_calc(expected_vars,discordant_sites) + + # Subset analysis + cell_vars_informative_subset = cell_vars[cell_vars['ids'].isin(self.informative_subset)] + total_reads_informative_subset,total_dp_informative_subset,total_oth_informative_subset,discordant_reads_informative_subset, \ + discordantRead_ratio_informative_subset,discordant_reads_informative_subset_noHet,total_reads_informative_subset_noHet,discordantRead_ratio_informative_subset_noHet = self.read_concordance_calc(expected_vars,cell_vars_informative_subset) - for site in DonorDiscordant_Sites_that_are_atributed_to_other_donor: - total_reads_for_site,discordant_reads_for_site,concordant_for_site,\ - _,_,_= self.read_extraction([site],expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) - # if discordant_reads_for_site>0: - # print('here') - if concordant_for_site==0: - pass - try: - total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site) - except: - try: - total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] - total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site) - - except: - total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh]={} - total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] - total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site) + if mode == 'no_het': + total_reads = total_reads_noHet + discordant_reads = discordant_reads_noHet + total_reads_informative = total_reads_informative_noHet + discordant_reads_informative = discordant_reads_informative_noHet + total_reads_uninformative =total_reads_uninformative_noHet + discordant_reads_uninformative = discordant_reads_uninformative_noHet + total_reads_informative_subset = total_reads_informative_subset_noHet + discordant_reads_informative_subset = discordant_reads_informative_subset_noHet + total_reads_for_concordant_sites = total_reads_for_concordant_sites_noHet + discordant_reads_for_concordant_sites = discordant_reads_for_concordant_sites_noHet + total_reads_for_discconcordant_sites = total_reads_for_discconcordant_sites_noHet + discordant_reads_for_discconcordant_sites = discordant_reads_for_discconcordant_sites_noHet + + return total_sites, \ + self.informative_covered, \ + self.uninformative_covered, \ + total_reads, \ + discordant_reads, \ + total_reads_informative, \ + discordant_reads_informative, \ + total_reads_uninformative, \ + discordant_reads_uninformative, \ + total_reads_informative_subset, \ + discordant_reads_informative_subset, \ + total_reads_for_concordant_sites, \ + discordant_reads_for_concordant_sites, \ + total_reads_for_discconcordant_sites, \ + discordant_reads_for_discconcordant_sites + - # to get the total reads that can be atributed to the other donor i have to check if site is already covered in the total_discordant_sites_that_are_concordant_with_other_donors_in_pool. - # the ones that havent, i have to add the reads up for them. - informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(Informative__DonorDiscordant_Sites_that_are_atributed_to_other_donor)) - - total_cordant_sites_that_are_concordant_with_other_donors_in_pool = total_cordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(DonorCordant_Sites_that_are_atributed_to_other_donor)) + + def get_discordance(self,expected_vars2,cell_vars2): + Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) + Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) + disc = pd.DataFrame(Discordant_sites,columns=['combo_x']) + df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') + disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') + disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] + disc_sites = ';'.join(disc2['expected_retrieved']) + return Concordant_Sites,Discordant_sites,disc_sites + + + def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars,donor_cohort=False): + # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. + # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. + # Author: M.Ozols + + cell_vars_norm = self.norm_genotypes(cell_vars) + + if len(cell_vars_norm) > 0: + + if mode == 'no_het': + hets = ['0/1', '1/0'] + expected_vars_norm = expected_vars_norm[~expected_vars_norm['vars'].isin(hets)] + Total_Overlapping_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) + expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2 = cell_vars2.drop_duplicates(subset=['ids']) + expected_vars2 = expected_vars2.drop_duplicates(subset=['ids']) + if len(cell_vars2)!=len(expected_vars2): + print('failed to select comon variants') + exit(1) + + # Find exact discordant sites + Concordant_Sites, Discordant_sites, _ = self.get_discordance(expected_vars2, cell_vars2) + #find truly discordant sites + true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, discordant_vars, concordant_vars, disc_sites_string = self.get_strict_discordance(expected_vars2, cell_vars2) + #find discordant reads + total_sites, \ + informative_sites, \ + uninformative_sites, \ + total_reads, \ + discordant_reads, \ + total_reads_informative, \ + discordant_reads_informative, \ + total_reads_uninformative, \ + discordant_reads_uninformative, \ + total_reads_informative_subset, \ + discordant_reads_informative_subset, \ + total_reads_for_concordant_sites, \ + discordant_reads_for_concordant_sites, \ + total_reads_for_discconcordant_sites, \ + discordant_reads_for_discconcordant_sites = self.read_condordance(expected_vars2, cell_vars2, discordant_vars, concordant_vars) + + discordant_read_fraction_in_concordant_sites = f"{discordant_reads_for_concordant_sites}/{total_reads_for_concordant_sites}" + discordant_read_fraction_in_discordant_sites = f"{discordant_reads_for_discconcordant_sites}/{total_reads_for_discconcordant_sites}" + discordant_reads_uninformative_fraction = f"{discordant_reads_uninformative}/{total_reads_uninformative}" + discordant_reads_informative_fraction = f"{discordant_reads_informative}/{total_reads_informative}" + + # sanity checks + if total_reads != total_reads_for_concordant_sites+total_reads_for_discconcordant_sites: + print("Error: total reads dont add up ") + exit(1) + if discordant_reads != discordant_reads_for_concordant_sites+discordant_reads_for_discconcordant_sites: + print("Error: discordant reads dont add up ") + exit(1) + - common_vars = list(set(discordant_vars) & set(donor_vars)) - common_var_count = str(len(common_vars)) - donor_cohort_common = donor + ":" + donor_cohort + ":" + common_var_count - discordant_vars_in_pool.append(donor_cohort_common) - - # Here we want to calculate the number of discordant sites in other donors and see if in terms of concordance the same donor is picked as per GT assignment. - # We do this to investigate the potential of a cell coming from this other donor. - else: - _='check' - try: - percent_relaxed_concordant = Nr_Relaxed_concordant/(Nr_Relaxed_concordant+true_discordant_count)*100 - except: - percent_relaxed_concordant = 0 - # g2 = total_concordant_sites_otherDonor/total_sites_otherDonor*100 - if not (percent_relaxed_concordant == concordant_percent_in_other_donor): - _='something not right' - break + else: + Total_Overlapping_sites = set() + Concordant_Sites = set() + Discordant_sites = set() + disc_sites = '' + true_discordant_count = 0 + relaxed_concordant_count = 0 + total_sites = 0 + relaxed_concordant_informative_count = 0 + relaxed_concordant_uninformative_count = 0 + true_discordant_informative_count = 0 + true_discordant_uninformative_count = 0 + informative_sites = 0 + disc_sites_string = '' + discordant_reads = 0 + uninformative_sites = 0 + total_reads = 0 + total_reads_informative =0 + total_reads_uninformative=0 + discordant_reads=0 + discordant_reads_informative=0 + discordant_reads_uninformative=0 + discordant_vars=0 + concordant_vars=0 + discordant_read_fraction_in_concordant_sites=0 + discordant_read_fraction_in_discordant_sites=0 + discordant_reads_uninformative_fraction=0 + discordant_reads_informative_fraction=0 + discordant_reads_informative_subset=0 + + return Concordant_Sites, \ + Discordant_sites, \ + Total_Overlapping_sites, \ + disc_sites_string, \ + cell_vars_norm, \ + true_discordant_count, \ + relaxed_concordant_count, \ + relaxed_concordant_informative_count, \ + relaxed_concordant_uninformative_count, \ + true_discordant_informative_count, \ + true_discordant_uninformative_count, \ + total_sites, \ + informative_sites, \ + uninformative_sites, \ + total_reads, \ + total_reads_informative, \ + total_reads_uninformative, \ + discordant_reads, \ + discordant_reads_informative, \ + discordant_reads_uninformative, \ + discordant_vars, \ + concordant_vars, \ + discordant_read_fraction_in_concordant_sites, \ + discordant_read_fraction_in_discordant_sites, \ + discordant_reads_uninformative_fraction, \ + discordant_reads_informative_fraction, \ + discordant_reads_informative_subset - donor_table_of_concordances.append({'donor':donor, 'cell':cell1, 'donor_cohort':donor_cohort, \ - 'gt matched donor':donor == donor_gt_match, \ - 'DonorCordant_Sites_that_are_atributed_to_other_donor':len(DonorCordant_Sites_that_are_atributed_to_other_donor), \ - 'DonorCordant_Sites_that_are_atributed_to_other_donor/total':f"{len(DonorCordant_Sites_that_are_atributed_to_other_donor)}/{len(concordant_vars)}", \ - 'DonorDiscordant_Sites_that_are_atributed_to_other_donor':len(DonorDiscordant_Sites_that_are_atributed_to_other_donor), \ - 'DonorDiscordant_Sites_that_are_atributed_to_other_donor/total':f"{len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)}/{len(discordant_vars)}", \ - 'concordant_percent_in_other_donor':concordant_percent_in_other_donor, \ - 'discordant_percent_in_other_donor':discordant_percent_in_other_donor, \ - 'discordant_reads_otherDonor':discordant_reads_otherDonor, \ - 'discordant_sites_otherDonor':len(discordant_vars_otherDonor), \ - 'concordant_sites_otherDonor':len(concordant_vars_otherDonor), \ - 'total_sites_otherDonor':total_sites_otherDonor, \ - 'discordant_reads_otherDonor':discordant_reads_otherDonor, \ - 'total_reads_otherDonor':total_reads_otherDonor, \ - # 'discordant_read_fraction_in_concordant_sites_otherDonor':discordant_read_fraction_in_concordant_sites_otherDonor, \ - # 'discordant_read_fraction_in_discordant_sites_otherDonor':discordant_read_fraction_in_discordant_sites_otherDonor, \ - 'concordant_reads_For_discordant_sites_that_are_Concordant_with_other_donor':concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor - }) - - - #here now we want to see overall how many reads potentially come from different cohorts. - # cohort_specific_site_quant_string="" - # cohort_specific_read_quant_string="" + + + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. + hash = random.getrandbits(128) + with open(f'tmp_{id}_{hash}.pkl', 'wb') as f: + pickle.dump(to_set, f) + self.record_dict[id]=f'tmp_{id}_{hash}.pkl' + return + + # def append_results_cell_concordances(self,result): + def append_results_cell_concordances(self,result,cell_concordance_table,other_donor_concordances,other_donor_concordance_table): + other_donor_concordance_table = other_donor_concordance_table + other_donor_concordances + # other_donor_concordance_table = {**other_donor_concordance_table, **other_donor_concordances} + count=result['count'] + try: + percent_concordant = result['Nr_Concordant']/(result['Nr_Discordant']+result['Nr_Concordant'])*100 + except: + percent_concordant = 0 - Whithin_Cohort__total_number_of_potential_contaminent_reads=0 - Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool=0 - Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool=0 try: - Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool = len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) - # for k1 in total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys(): - # Whithin_Cohort__total_number_of_potential_contaminent_reads+= max(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'][k1]) + percent_discordant = result['Nr_Discordant']/(result['Nr_Discordant']+result['Nr_Concordant'])*100 except: - _='Doesnt Exist' + percent_discordant = 0 + + try: + percent_relaxed_concordant = result['Nr_Relaxed_concordant']/(result['Nr_Relaxed_concordant']+result['true_discordant_count'])*100 + except: + percent_relaxed_concordant = 0 - Out_of_Cohort__total_number_of_potential_contaminent_reads=0 try: - Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool = len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) - # for k1 in total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys(): - # Out_of_Cohort__total_number_of_potential_contaminent_reads+= max(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'][k1]) + percent_strict_discordant = result['true_discordant_count']/(result['Nr_Relaxed_concordant']+result['true_discordant_count'])*100 except: - _='Doesnt Exist' - - - - + percent_strict_discordant = 0 + + try: + read_discordance = result['discordant_reads']/result['total_sites'] + except: + read_discordance = 0 + + cohort = 'UNKNOWN' + donor_split = result['donor_gt_match'].split("_") + if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): + cohort = 'UKB' + elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): + cohort = 'ELGH' + + same_as_asigned_donor = result['donor_gt_match'] in result['Donor_With_Highest_Concordance'] + if not same_as_asigned_donor: + same_as_asigned_donor = result['donor_gt_match'] in result['Donor_With_Lowest_DisConcordance'] + + cell_concordance_table[f"{result['cell1']} --- {result['donor_gt_match']}"] = {'GT 1':result['cell1'], + 'GT 2':result['donor_gt_match'], + 'cohort': cohort, + + # 'Nr_Concordant':result['Nr_Concordant'], + # 'Nr_Discordant':result['Nr_Discordant'], + 'Nr_Relaxed_concordant':result['Nr_Relaxed_concordant'], + 'Nr_strict_discordant':result['true_discordant_count'], + # 'Percent Concordant':percent_concordant, + # 'Percent Discordant':percent_discordant, + 'Percent_relaxed_concordant': percent_relaxed_concordant, + 'Percent_strict_discordant': percent_strict_discordant, + 'Nr_concordant_informative': len(result['relaxed_concordant_informative_count']), + 'Nr_concordant_uninformative': len(result['relaxed_concordant_uninformative_count']), + 'Nr_discordant_informative': len(result['true_discordant_informative_count']), + 'Nr_discordant_uninformative': len(result['true_discordant_uninformative_count']), + 'NrTotal_Overlapping_sites_between_two_genotypes':result['Nr_Total_Overlapping_sites'], + 'Nr_donor_distinct_sites_within_pool_individuals':result['Nr_donor_distinct_sites'], + 'Number_of_sites_that_are_donor_concordant_and_exclusive':result['Number_of_sites_that_are_donor_concordant_and_exclusive'], + 'Total_sites': result['total_sites'], + 'Total_informative_sites': result['informative_sites'], + 'Total_uninformative_sites': result['uninformative_sites'], + 'Total_reads': result['total_reads'], + 'Total_reads_informative': result['total_reads_informative'], + 'Total_reads_uninformative': result['total_reads_uninformative'], + 'Discordant_reads': result['discordant_reads'], + 'Discordant_reads_informtive': result['discordant_reads_informative'], + 'Discordant_reads_uninformtive': result['discordant_reads_uninformative'], + 'discordant_reads_informative_subset':result['discordant_reads_informative_subset'], + 'Discordant_reads_by_n_sites': read_discordance, + 'Discordant_sites_in_pool': len(result['Discordant_sites_in_pool']), + 'Lowest_Disconcordance_value_in_all_donors':result['Lowest_Disconcordance_value_in_all_donors'], + 'Donor_With_Lowest_DisConcordance':result['Donor_With_Lowest_DisConcordance'], + 'Concordant_Site_Identities':result['Concordant_Site_Identities'], + 'Donor_With_Highest_Concordance':result['Donor_With_Highest_Concordance'], + 'Highest_Concordance_value_in_all_donors':result['Highest_Concordance_value_in_all_donors'], + 'same_as_asigned_donor':same_as_asigned_donor, + 'Total_sites_other_donor (if same_as_asigned_donor=False)':result['Total_sites_other_donor'], + 'Total_reads_other_donor (if same_as_asigned_donor=False)':result['Total_reads_other_donor'], + 'total_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['total_discordant_sites_that_are_concordant_with_other_donors_in_pool'], + 'discordant_read_fraction_in_concordant_site':result['discordant_read_fraction_in_concordant_sites'], + 'discordant_read_fraction_in_discordant_sites':result['discordant_read_fraction_in_discordant_sites'], + 'Whithin_Cohort__total_number_of_potential_contaminent_reads':result['Whithin_Cohort__total_number_of_potential_contaminent_reads'], + 'Out_of_Cohort__total_number_of_potential_contaminent_reads':result['Out_of_Cohort__total_number_of_potential_contaminent_reads'], + 'NrDonors_contributing_to_out_of_cohort':result['NrDonors_contributing_to_out_of_cohort'], + 'NrDonors_contributing_to_Whithin_Cohort':result['NrDonors_contributing_to_Whithin_Cohort'], + + 'Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool'], + 'Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool'], + 'shared_sites_between_Out_of_Cohort_and_Within_Cohort':result['shared_sites_between_Out_of_Cohort_and_Within_Cohort'], + 'Within_Cohort__unique_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool'], + 'Out_of_Cohort__unique_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool'], + 'Out_of_Cohort__unique_sites_total_reads':result['Out_of_Cohort__unique_sites_total_reads'], + 'Total__discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Total__discordant_sites_that_are_concordant_with_other_donors_in_pool'] + } + + return [cell_concordance_table,other_donor_concordance_table] + def combine_written_lists(self,exclusive_donor_variants,record_dict):#this is for VCF loader class + to_export = exclusive_donor_variants + for val1 in record_dict.values(): + # here remove the int files. + print(f"merging temp file: {val1}") + with open(val1, 'rb') as f: + loaded_dict = pickle.load(f) + self.other_donor_comp = self.other_donor_comp+ loaded_dict + os.remove(val1) + return self.other_donor_comp + def combine_written_files(self,exclusive_donor_variants,record_dict):#this is for VCF loader class + to_export = exclusive_donor_variants + for val1 in record_dict.values(): + # here remove the int files. + print(f"merging temp file: {val1}") + with open(val1, 'rb') as f: + loaded_dict = pickle.load(f) + for k1 in loaded_dict.keys(): + try: + to_export[k1]=to_export[k1].union(loaded_dict[k1]) + except: + to_export[k1]=set() + to_export[k1]=to_export[k1].union(loaded_dict[k1]) + os.remove(val1) + return to_export + + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. + hash = random.getrandbits(128) + with open(f'tmp_{id}_{hash}.pkl', 'wb') as f: + pickle.dump(to_set, f) + self.record_dict[id]=f'tmp_{id}_{hash}.pkl' - # No het total calcs + + def combine_concordances(self,result,other_donor_concordance,donor_gt_match,analyse_donor): + pd.DataFrame(other_donor_concordance).sort_values(by=['cell']).to_csv(f'{donor_gt_match}-{analyse_donor}--each_cells_comparison_with_other_donor.tsv',sep='\t',index=False) + self.cell_concordance_table = {**self.cell_concordance_table, **result} + + def combine_dict(self,cell_concordance_table,result): + cell_concordance_table = {**cell_concordance_table, **result} + return cell_concordance_table + + def conc_table(self): + donor_assignments_table=self.donor_assignments_table + cell_assignments_table=self.cell_assignments_table + exclusive_don_variants=self.exclusive_don_variants + exclusive_cell_variants= self.exclusive_cell_variants + donor_list = exclusive_don_variants.keys() + # cpus = 1 - # Since we can not tell just from the cellsnp file if the discordant read at concordant site is actually concordant with another donor in a pool - # we assume the worse case scenario and add idscordant reads at concordant sites to the total. - # Number of reads on the sites that are concordant, but has discordant reads with another donor - Whithin_Cohort__total_number_of_potential_contaminent_reads=0 - try: - concordant_sites_atributed_to_other_donor = list(set(sites_contributing_to_Whithin_Cohort)) - if not total_sites >= len(concordant_sites_atributed_to_other_donor): - raise Exception("Sorry, the number of sites atributed can not be larger than total sites") - - _,_,_,\ - _,Whithin_Cohort__total_number_of_potential_contaminent_reads, \ - _= self.read_extraction(concordant_sites_atributed_to_other_donor,expected_vars_norm,cell_vars_norm) - - if not discordant_reads >= len(Whithin_Cohort__total_number_of_potential_contaminent_reads): - raise Exception("Total discordant reads can not be less than within cohort discordant reads.") - - - except: - _='Doesnt Exist' + count = 0 + + + #create a list of variants that are on each donor genotype file + vars_per_donor_gt = {} + for don_id in donor_list: + donor_gt_vars = list(exclusive_don_variants[don_id]) + donor_gt_vars = pd.DataFrame(donor_gt_vars) + donor_gt_vars = self.norm_genotypes(donor_gt_vars) + donor_gt_vars = donor_gt_vars[donor_gt_vars['vars'] != '0/0'] + donor_gt_varids = list(donor_gt_vars['ids']) + vars_per_donor_gt[don_id] = donor_gt_varids + + #work out what cohort each donor belongs to + donor_cohorts = {} + for don_id in donor_list: + cohort = 'UNKNOWN' + donor_split = don_id.split("_") + if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): + cohort = 'UKB' + elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): + cohort = 'ELGH' + donor_cohorts[don_id] = cohort + all_donor_data={} + # here we calvculate all the expected donor datasets + for row1 in exclusive_don_variants.keys(): + # donor_in_question = row1['donor_query'] + donor_gt_match = row1 + expected_vars_of_other_donor = self.exclusive_don_variants[donor_gt_match] + expected_vars_norm_of_other_donor = self.norm_genotypes(expected_vars_of_other_donor) + all_donor_data[donor_gt_match]=expected_vars_norm_of_other_donor - Out_of_Cohort__total_number_of_potential_contaminent_reads=0 - try: - concordant_sites_atributed_to_other_donor = list(set(sites_contributing_to_out_of_cohort)) - if not total_sites >= len(concordant_sites_atributed_to_other_donor): - raise Exception("Sorry, the number of sites atributed can not be larger than total sites") - - _,_,_,_,Out_of_Cohort__total_number_of_potential_contaminent_reads, _= self.read_extraction(concordant_sites_atributed_to_other_donor,expected_vars_norm,cell_vars_norm) - - if not discordant_reads >= len(Out_of_Cohort__total_number_of_potential_contaminent_reads): - raise Exception("Total discordant reads can not be less than out of cohort discordant reads.") + results = [] + for i,row1 in donor_assignments_table.iterrows(): + donor_in_question = row1['donor_query'] + donor_gt_match = row1['donor_gt'] + print(donor_gt_match) + # if i>4: + # continue + if (donor_gt_match=='NONE'): + continue + try: + donor_gt_match_cohort = donor_cohorts[donor_gt_match] + except: + continue + Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell'])) + expected_vars = exclusive_don_variants[donor_gt_match] + expected_vars_norm = self.norm_genotypes(expected_vars) + try: + # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor. + dds = self.donor_distinct_sites[donor_gt_match] + except: + continue + # if cpus==1: + result,other_donor_concordance,donor_gt_match,donor_id = Donor(donor_in_question,self.informative_sites,self.uninformative_sites,self.donor_assignments_table,self.cell_assignments_table,self.exclusive_cell_variants,self.donor_distinct_sites,cpus).analyse_donor(Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,all_donor_data,expected_vars_norm,donor_assignments_table,exclusive_cell_variants) + self.combine_concordances(result,other_donor_concordance,donor_gt_match,donor_id) + + + + return self.cell_concordance_table - except: - _='Doesnt Exist' - + + def read_extraction(self,DonorDiscordant_Sites_that_are_atributed_to_other_donor,expected_vars_norm,cell_vars_norm): + # we need this function wrapper to calculate the concordant, discordant read + # counts for each of the discordant sites that are concordant with another donor. - try: - Out_of_Cohort__sites = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) - except: - Out_of_Cohort__sites = set() - - try: - Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) - set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) - shared_sites_between_out_of_cohort_and_within_cohort = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()).intersection(set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) ) - except: - Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() - shared_sites_between_out_of_cohort_and_within_cohort = set() - - try: - Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) - set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) - except: - Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) + Total_Overlapping_sites = set(DonorDiscordant_Sites_that_are_atributed_to_other_donor) + expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] + + cell_vars2 = cell_vars2.drop_duplicates(subset=['ids']) + expected_vars2 = expected_vars2.drop_duplicates(subset=['ids']) + cell_vars2['DP'] = cell_vars2[0].str.split("_").str[5].astype(int) + cell_vars2['AD'] = cell_vars2[0].str.split("_").str[6].astype(int) + cell_vars2['OTH'] = cell_vars2[0].str.split("_").str[7].astype(int) + total_reads,_,_,discordant_reads, \ + discordantRead_ratio,discordant_reads_noHet,total_reads_noHet,discordantRead_ratio_noHet= self.read_concordance_calc(expected_vars2,cell_vars2) + concordant_reads = total_reads - discordant_reads + concordant_reads_noHet = total_reads_noHet-discordant_reads_noHet + return total_reads,discordant_reads,concordant_reads,\ + total_reads_noHet,discordant_reads_noHet,concordant_reads_noHet + - Out_of_Cohort__unique_sites_total_reads,_,_,_,_,_ = self.read_extraction(Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool,expected_vars_norm,cell_vars_norm) - - try: - Whithin_Cohort__sites = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) - except: - Whithin_Cohort__sites = set() - +class Donor(Concordances): + def __init__(self, donor_id,informative_sites,uninformative_sites,donor_assignments_table,cell_assignments_table,exclusive_cell_variants,donor_distinct_sites,pool): + self.donor_concordance_table = {} + self.other_donor_concordance = [] + self.pool = pool + self.donor_id = donor_id + self.informative_sites = informative_sites + self.uninformative_sites = uninformative_sites + self.donor_assignments_table=donor_assignments_table + self.cell_assignments_table=cell_assignments_table + self.exclusive_don_variants=exclusive_don_variants + self.exclusive_cell_variants=exclusive_cell_variants + self.donor_distinct_sites=donor_distinct_sites - - Total__discordant_sites_that_are_concordant_with_other_donors_in_pool = Whithin_Cohort__sites.union(Out_of_Cohort__sites) - concordant_vars_in_pool_str = (";").join(concordant_vars) - DF = pd.DataFrame(donor_table_of_concordances) + def combine_results(self,result): + donor_concordance_table=result[0] + other_donor_concordance_table=result[1] + # other_donor_concordance_table + other_donor_concordances + self.other_donor_concordance = self.other_donor_concordance + other_donor_concordance_table + self.donor_concordance_table = {**self.donor_concordance_table, **donor_concordance_table} - Donor_With_Lowest_DisConcordance = ';'.join(DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['donor'].values) - Lowest_Disconcordance_value_in_all_donors= DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['discordant_percent_in_other_donor'].values[0] - - Donor_With_Highest_Concordance = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['donor'].values) - Highest_Concordance_value_in_all_donors= DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['concordant_percent_in_other_donor'].values[0] - Total_sites_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_sites_otherDonor'].astype(str).values) - Total_reads_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_reads_otherDonor'].astype(str).values) - - return [{ - 'cell1':cell1, - 'donor_gt_match':donor_gt_match, - 'Nr_Concordant':Nr_Concordant, - 'Nr_Discordant':Nr_Discordant, - 'Nr_Relaxed_concordant':Nr_Relaxed_concordant, - 'true_discordant_count':true_discordant_count, - 'relaxed_concordant_informative_count':relaxed_concordant_informative_count, - 'relaxed_concordant_uninformative_count':relaxed_concordant_uninformative_count, - 'true_discordant_informative_count':true_discordant_informative_count, - 'true_discordant_uninformative_count':true_discordant_uninformative_count, - 'Nr_Total_Overlapping_sites':Nr_Total_Overlapping_sites, - 'Number_of_sites_that_are_donor_concordant_and_exclusive':Number_of_sites_that_are_donor_concordant_and_exclusive, - 'Nr_donor_distinct_sites':Nr_donor_distinct_sites, - 'count':count, - 'discordant_sites':discordant_sites, - 'total_sites':total_sites, - 'informative_sites':informative_sites, - 'uninformative_sites':uninformative_sites, - 'total_reads_informative':total_reads_informative, - 'total_reads_uninformative':total_reads_uninformative, - 'discordant_reads_informative':discordant_reads_informative, - 'discordant_reads_uninformative':discordant_reads_uninformative, - 'Discordant_sites_in_pool': discordant_vars, - 'Lowest_Disconcordance_value_in_all_donors':Lowest_Disconcordance_value_in_all_donors, - 'Donor_With_Lowest_DisConcordance':Donor_With_Lowest_DisConcordance, - 'Concordant_Site_Identities':concordant_vars_in_pool_str, - 'Donor_With_Highest_Concordance':Donor_With_Highest_Concordance, - 'Highest_Concordance_value_in_all_donors':Highest_Concordance_value_in_all_donors, - 'Total_sites_other_donor':Total_sites_other_donor, - 'Total_reads_other_donor':Total_reads_other_donor, - 'total_discordant_sites_that_are_concordant_with_other_donors_in_pool':f"{len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool)}/{len(discordant_vars)}", - 'informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool':f"{len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool)}/{len(true_discordant_informative_count)}", - 'total_reads':total_reads, - 'discordant_reads':discordant_reads, - 'discordant_reads_informative_subset':discordant_reads_informative_subset, \ - 'discordant_read_fraction_in_concordant_sites':discordant_read_fraction_in_concordant_sites, \ - 'discordant_read_fraction_in_discordant_sites':discordant_read_fraction_in_discordant_sites, \ - 'Whithin_Cohort__total_number_of_potential_contaminent_reads':Whithin_Cohort__total_number_of_potential_contaminent_reads, \ - 'Out_of_Cohort__total_number_of_potential_contaminent_reads':Out_of_Cohort__total_number_of_potential_contaminent_reads, \ - 'NrDonors_contributing_to_out_of_cohort':len(set(donors_contributing_to_out_of_cohort)), \ - 'NrDonors_contributing_to_Whithin_Cohort':len(set(donors_contributing_to_Whithin_Cohort)), \ - - 'Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool, \ - 'Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool, \ - 'shared_sites_between_Out_of_Cohort_and_Within_Cohort':len(shared_sites_between_out_of_cohort_and_within_cohort), \ - 'Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool), \ - 'Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool), \ - 'Out_of_Cohort__unique_sites_total_reads':Out_of_Cohort__unique_sites_total_reads, \ - 'Total__discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Total__discordant_sites_that_are_concordant_with_other_donors_in_pool) - }, donor_table_of_concordances] + def analyse_cells(self,chunk_df,exclusive_cell_variants,expected_vars_norm,donor_gt_match,vars_per_donor_gt,donor_cohorts,all_donor_data,donor_assignments_table,donor_gt_match_cohort): + donor_concordance_table = {} + other_donor_concordance_table = [] + count=0 + for cell1 in chunk_df: + count+=1 + cell_vars = exclusive_cell_variants[cell1] + # try: + result1, other_donor_concordances = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data,donor_assignments_table) + donor_concordance_table,other_donor_concordance_table = self.append_results_cell_concordances(result1,donor_concordance_table,other_donor_concordances,other_donor_concordance_table) + # except: + # continue + return [donor_concordance_table, other_donor_concordance_table] + def analyse_donor(self,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,all_donor_data,expected_vars_norm,donor_assignments_table,exclusive_cell_variants): + n=200 + print(f'-- Using {cpus} cpus ---') + pool = mp.Pool(cpus) + # Cells_to_keep_pre = Cells_to_keep_pre[:13] + list_df = [Cells_to_keep_pre[i:i+n] for i in range(0,len(Cells_to_keep_pre),n)] + # results = [] + for chunk_df in list_df: + pool.apply_async(self.analyse_cells,args =([chunk_df,exclusive_cell_variants,expected_vars_norm,donor_gt_match,vars_per_donor_gt,donor_cohorts,all_donor_data,donor_assignments_table,donor_gt_match_cohort]),callback=self.combine_results) + # results.append(result) + # while True: + # time.sleep(1) + # # catch exception if results are not ready yet + # try: + # ready = [result.ready() for result in results] + # successful = [result.successful() for result in results] + # except Exception: + # continue + # # exit loop if all tasks returned success + # if all(successful): + # break + # # raise exception reporting exceptions received from workers + # if all(ready) and not all(successful): + # raise Exception(f'Workers raised following exceptions {[result._value for result in results if not result.successful()]}') + # self.pool.join() + # try: + # [result.wait() for result in results] + # except: + # print('done') + # r1,r2 = self.analyse_cells(chunk_df,exclusive_cell_variants,expected_vars_norm,donor_gt_match,vars_per_donor_gt,donor_cohorts,all_donor_data,donor_assignments_table,donor_gt_match_cohort) + # self.combine_results(r1,r2) + print('Done') + pool.close() + pool.join() + return self.donor_concordance_table,self.other_donor_concordance,donor_gt_match,self.donor_id class VCF_Loader: diff --git a/bin/gather_minimal_dataset.py b/bin/gather_minimal_dataset.py index 3a6cd6b..4349485 100755 --- a/bin/gather_minimal_dataset.py +++ b/bin/gather_minimal_dataset.py @@ -413,6 +413,9 @@ def gather_donor(donor_id, ad, ad_lane_raw, azimuth_annot, qc_obs, columns_outpu ad.obs = ad.obs.loc[:,~ad.obs.columns.duplicated()] if write_h5: path1=os.path.join(outdir, oufnam + '.h5ad') + ad.obs['qc.filter.pass.AZ:L0'] = ad.obs['qc.filter.pass.AZ:L0'].astype('bool') + ad.obs['cell_passes_hard_filters'] = ad.obs['cell_passes_hard_filters'].astype('bool') + ad.obs['qc.filter.pass'] = ad.obs['qc.filter.pass'].astype('bool') ad.write(path1,compression='gzip') return { diff --git a/conf/base.conf b/conf/base.conf index 249dc9f..98bb86d 100755 --- a/conf/base.conf +++ b/conf/base.conf @@ -9,7 +9,7 @@ */ params{ - input = 'cellbender' + input = 'cellbender' //# cellbender|cellranger rsync_to_web_file = "${launchDir}/yascp/bin/rsync_to_web.sh" profile = 'normal_run' citeseq = false @@ -59,11 +59,12 @@ params{ cluster_validate_resolution_keras = true input_tables_column_delimiter = '\t' outdir= "${launchDir}/results" + tracedir = "${params.outdir}/pipeline_info" do_deconvolution = true split_bam = false run_multiplet = true utilise_gpu = true - split_ad_per_bach = false + split_ad_per_bach = true cellbender_resolution_to_use='0pt1' reference_assembly_fasta_dir = "https://yascp.cog.sanger.ac.uk/public/10x_reference_assembly" webtransfer = false @@ -239,246 +240,31 @@ process { withLabel:process_medium_memory { memory = { 30.GB * task.attempt } } - - withName: SCRUBLET{ - maxRetries = 3 - errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } - } - - - withName: DOUBLET_DECON{ - maxRetries = 3 - errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } - memory = { 100.GB * task.attempt } - } - - withName: AZIMUTH{ - maxRetries = 3 - errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } - } - withName: CELLTYPIST{ - maxRetries = 3 - errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } - } - withName: cellbender__remove_background{ - maxRetries = 2 - //# errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } - } - - withLabel:error_ignore { - errorStrategy = 'ignore' - } - withLabel:error_retry { - errorStrategy = 'retry' - maxRetries = 2 - } - - withName:cluster_validate_resolution_keras{ - maxForks=4 + withName: ASSESS_CALL_RATE{ maxRetries = 3 - memory = { 60.GB * task.attempt } - time = { 12.h * task.attempt } + memory = { 10.GB * task.attempt } errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } } - withName: CELLTYPIST{ - maxForks=7 - } - - withName: CELLTYPE_FILE_MERGE{ - memory = { 60.GB * task.attempt } - } - - withName: NORMALISE_AND_PCA{ - maxForks=7 - errorStrategy = 'retry' - memory = { 50.GB * task.attempt} - maxRetries = 8 - cpus = 4 - } - - withName: LISI{ - maxForks=7 - memory =300.GB - } - - withName: RESOLVE_POOL_VCFS{ - cpus = 1 - memory = { 1.GB * task.attempt } - time = { 12.h * task.attempt } - maxRetries = 3 - } - - withName: SPLIT_BATCH_H5AD{ - cpus = 2 - memory = { 25.GB * task.attempt * 0.5} - time = { 12.h * task.attempt } - maxRetries = 3 - } - - withName: SUBSET_GENOTYPE2{ - cpus = 2 - memory = { 1.GB * task.attempt } - time = { 12.h * task.attempt } - maxRetries = 3 - } - - withName: JOIN_STUDIES_MERGE{ - cpus = 1 - memory = { 20.GB * task.attempt } - time = { 12.h * task.attempt } - maxRetries = 3 - } - - withName: FREEBAYES{ - cpus = 1 - time = { 12.h * task.attempt } - maxRetries = 2 - } - - withName: VIREO_ADD_SAMPLE_PREFIX{ - cpus = 1 - memory = { 2.GB * task.attempt } - time = { 12.h * task.attempt } - maxRetries = 3 - } - - - withName: REPLACE_GT_DONOR_ID2{ - cpus = 1 - memory = { 1.GB * task.attempt } - time = 12.h - maxRetries = 3 - } - - withName: JOIN_CHROMOSOMES{ - cpus = 1 - memory = { 2.GB * task.attempt } - time = 12.h - maxRetries = 3 - } - - withName: serialize_known_markers{ - cpus = 1 - memory = { 1.GB * task.attempt } - time = { 12.h * task.attempt } - maxRetries = 3 - } - - withName: OUTLIER_FILTER{ - errorStrategy = 'retry' - memory = { 50.GB * task.attempt} - maxRetries = 8 - } - - - - withName: cluster{ - cpus = { 3 * task.attempt } - } - - withName: LISI{ - maxForks=7 - errorStrategy = 'retry' - maxRetries = 8 - memory = { 200.GB * task.attempt} - } - - withName: VIREO_GT_FIX_HEADER{ - errorStrategy = 'retry' - maxRetries = 4 - cpus = 1 - memory = { 1.GB * task.attempt } - } - - - withName: JOIN_CHROMOSOMES{ - errorStrategy = 'retry' - maxRetries = 4 - } - - withName: cluster{ - errorStrategy = 'retry' - maxRetries = 4 - } - - withName: SPLIT_BAM_BY_CELL_BARCODES { - cpus = 1 - memory = { 8.GB * task.attempt} - time = 4.h - } - - withName: CONCORDANCE_CALCLULATIONS{ - cpus = 5 - time = { 12.h * task.attempt } - memory = { 70.GB * task.attempt } - } - - withName: OTHER_DONOR_CONCORDANCE_CALCLULATIONS{ - cpus = 3 - time = { 6.h * task.attempt } - memory = { 20.GB * task.attempt } - } - - - withName: CELLSNP{ - memory = { 5.GB * task.attempt } - } - - withName: DYNAMIC_DONOR_EXCLUSIVE_SNP_SELECTION{ - cpus = 5 - time = { 12.h * task.attempt } - memory = { 20.GB * task.attempt } - } - - withName: prep_collectmetadata{ - memory = { 150.MB * task.attempt } - } - - withName: VIREO{ - //# maxForks=7 - cpus = 5 - time = { 12.h * task.attempt } - memory = { 70.GB * task.attempt } - } - withName: DSB_INTEGRATE{ - memory = { 200.GB * task.attempt } - cpus = { 4 * task.attempt } - maxRetries = 3 - } - - withName: MULTIMODAL_INTEGRATION{ - memory = { 200.GB * task.attempt } - cpus = { 4 * task.attempt } - maxRetries = 3 - } - - withName: umap_gather{ - memory = { 100.GB * task.attempt } - errorStrategy = 'retry' - maxRetries = 3 - } - - withName: DOUBLET_FINDER{ - memory = { 100.GB * task.attempt } - } - - - - withName: GT_MATCH_POOL_AGAINST_PANEL{ - time = { 24.h * task.attempt } - } - - withName: plot_predicted_sex{ - memory = { 50.GB * task.attempt } - maxRetries = 5 - cpus = 2 - - } - - +} +def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') +timeline { + enabled = true + file = "${params.tracedir}/execution_timeline_${trace_timestamp}.html" +} +report { + enabled = true + file = "${params.tracedir}/execution_report_${trace_timestamp}.html" +} +trace { + enabled = true + file = "${params.tracedir}/execution_trace_${trace_timestamp}.txt" +} +dag { + enabled = true + file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.svg" } singularity { diff --git a/conf/cellbender.conf b/conf/cellbender.conf index 636dd16..a8449ec 100755 --- a/conf/cellbender.conf +++ b/conf/cellbender.conf @@ -59,8 +59,6 @@ params { value{ expected_nemptydroplets_umi_cutoff = 0 method_estimate_ncells = 'dropletutils::barcoderanks::inflection' - //method_estimate_ncells = 'cellrangerv2::expected' //this method feeds in the cellranger estimate ncells - //method_estimate_ncells = 'expected' lower_bound_umis_estimate_ncells = 1000 method_estimate_nemptydroplets = 'dropletutils::barcoderanks::inflection,dropletutils::barcoderanks::knee,0.33' lower_bound_umis_estimate_nemptydroplets = 10 diff --git a/conf/modules.conf b/conf/modules.conf index 1885d64..0df8fa4 100755 --- a/conf/modules.conf +++ b/conf/modules.conf @@ -52,4 +52,250 @@ params { } } + + withName: SCRUBLET{ + maxRetries = 3 + errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } + } + + + withName: DOUBLET_DECON{ + maxRetries = 3 + errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } + memory = { 100.GB * task.attempt } + } + + withName: AZIMUTH{ + maxRetries = 3 + errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } + } + withName: CELLTYPIST{ + maxRetries = 3 + errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } + } + withName: cellbender__remove_background{ + maxRetries = 2 + //# errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } + } + + withLabel:error_ignore { + errorStrategy = 'ignore' + } + withLabel:error_retry { + errorStrategy = 'retry' + maxRetries = 2 + } + + withName:cluster_validate_resolution_keras{ + maxForks=4 + maxRetries = 3 + memory = { 60.GB * task.attempt } + time = { 12.h * task.attempt } + errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } + } + + withName: CELLTYPIST{ + maxForks=7 + } + + withName: ASSESS_CALL_RATE{ + maxRetries = 3 + memory = { 10.GB * task.attempt } + errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } + } + + + withName: CELLTYPE_FILE_MERGE{ + memory = { 60.GB * task.attempt } + } + + withName: NORMALISE_AND_PCA{ + maxForks=7 + errorStrategy = 'retry' + memory = { 50.GB * task.attempt} + maxRetries = 8 + cpus = 4 + } + + withName: LISI{ + maxForks=7 + memory =300.GB + } + + withName: RESOLVE_POOL_VCFS{ + cpus = 1 + memory = { 1.GB * task.attempt } + time = { 12.h * task.attempt } + maxRetries = 3 + } + + withName: SPLIT_BATCH_H5AD{ + cpus = 2 + memory = { 25.GB * task.attempt * 0.5} + time = { 12.h * task.attempt } + maxRetries = 3 + } + + withName: SUBSET_GENOTYPE2{ + cpus = 2 + memory = { 1.GB * task.attempt } + time = { 12.h * task.attempt } + maxRetries = 3 + } + + withName: JOIN_STUDIES_MERGE{ + cpus = 1 + memory = { 20.GB * task.attempt } + time = { 12.h * task.attempt } + maxRetries = 3 + } + + withName: FREEBAYES{ + cpus = 1 + time = { 12.h * task.attempt } + maxRetries = 2 + } + + withName: VIREO_ADD_SAMPLE_PREFIX{ + cpus = 1 + memory = { 2.GB * task.attempt } + time = { 12.h * task.attempt } + maxRetries = 3 + } + + + withName: REPLACE_GT_DONOR_ID2{ + cpus = 1 + memory = { 1.GB * task.attempt } + time = 12.h + maxRetries = 3 + } + + withName: JOIN_CHROMOSOMES{ + cpus = 1 + memory = { 2.GB * task.attempt } + time = 12.h + maxRetries = 3 + } + + withName: serialize_known_markers{ + cpus = 1 + memory = { 1.GB * task.attempt } + time = { 12.h * task.attempt } + maxRetries = 3 + } + + withName: OUTLIER_FILTER{ + errorStrategy = 'retry' + memory = { 50.GB * task.attempt} + maxRetries = 8 + } + + + + withName: cluster{ + cpus = { 3 * task.attempt } + } + + withName: LISI{ + maxForks=7 + errorStrategy = 'retry' + maxRetries = 8 + memory = { 200.GB * task.attempt} + } + + withName: VIREO_GT_FIX_HEADER{ + errorStrategy = 'retry' + maxRetries = 4 + cpus = 1 + memory = { 1.GB * task.attempt } + } + + + withName: JOIN_CHROMOSOMES{ + errorStrategy = 'retry' + maxRetries = 4 + } + + withName: cluster{ + errorStrategy = 'retry' + maxRetries = 4 + } + + withName: SPLIT_BAM_BY_CELL_BARCODES { + cpus = 1 + memory = { 8.GB * task.attempt} + time = 4.h + } + + withName: CONCORDANCE_CALCLULATIONS{ + cpus = 20 + time = { 24.h * task.attempt } + memory = { 100.GB * task.attempt } + } + + withName: OTHER_DONOR_CONCORDANCE_CALCLULATIONS{ + cpus = 3 + time = { 6.h * task.attempt } + memory = { 20.GB * task.attempt } + } + + + withName: CELLSNP{ + memory = { 5.GB * task.attempt } + } + + withName: DYNAMIC_DONOR_EXCLUSIVE_SNP_SELECTION{ + cpus = 5 + time = { 12.h * task.attempt } + memory = { 20.GB * task.attempt } + } + + withName: prep_collectmetadata{ + memory = { 150.MB * task.attempt } + } + + withName: VIREO{ + //# maxForks=7 + cpus = 5 + time = { 12.h * task.attempt } + memory = { 70.GB * task.attempt } + } + withName: DSB_INTEGRATE{ + memory = { 200.GB * task.attempt } + cpus = { 4 * task.attempt } + maxRetries = 3 + } + + withName: MULTIMODAL_INTEGRATION{ + memory = { 200.GB * task.attempt } + cpus = { 4 * task.attempt } + maxRetries = 3 + } + + withName: umap_gather{ + memory = { 100.GB * task.attempt } + errorStrategy = 'retry' + maxRetries = 3 + } + + withName: DOUBLET_FINDER{ + memory = { 100.GB * task.attempt } + } + + + + withName: GT_MATCH_POOL_AGAINST_PANEL{ + time = { 24.h * task.attempt } + } + + withName: plot_predicted_sex{ + memory = { 50.GB * task.attempt } + maxRetries = 5 + cpus = 2 + + } + + + } diff --git a/conf/test.conf b/conf/test.conf index 6b947ed..185ca42 100755 --- a/conf/test.conf +++ b/conf/test.conf @@ -23,7 +23,10 @@ params { posterior_assignment = false //if this is set to true, we will perform the genotype donor matching after the deconvolution is performed. subset_genotypes = false tsv_donor_panel_vcfs = "https://yascp.cog.sanger.ac.uk/public/test_datasets/full_test_dataset/vcf_inputs_v2.tsv" //this is a panel of vcf files that we want to compar the genotypes with + ZSCORE_THRESH = 8 //# Minimum z0 threshold required to call the gt assignment confident. + ZSCORE_DIST_THRESH = 8 //# Minimum bifference between z0 and z1 to call the assignment confident, } + hard_filters_file = "${projectDir}/sample_input/sample_qc.yml" //this file defilnes what hard filters we want to use to flag/drop the cells //# For the training purposes we reduce the cellbender epochs and learning rate as this step takes a long time to compute. diff --git a/conf/test_full.conf b/conf/test_full.conf index 6b947ed..9b6150a 100755 --- a/conf/test_full.conf +++ b/conf/test_full.conf @@ -23,6 +23,9 @@ params { posterior_assignment = false //if this is set to true, we will perform the genotype donor matching after the deconvolution is performed. subset_genotypes = false tsv_donor_panel_vcfs = "https://yascp.cog.sanger.ac.uk/public/test_datasets/full_test_dataset/vcf_inputs_v2.tsv" //this is a panel of vcf files that we want to compar the genotypes with + ZSCORE_THRESH = 8 //# Minimum z0 threshold required to call the gt assignment confident. + ZSCORE_DIST_THRESH = 8 //# Minimum bifference between z0 and z1 to call the assignment confident, + } hard_filters_file = "${projectDir}/sample_input/sample_qc.yml" //this file defilnes what hard filters we want to use to flag/drop the cells diff --git a/main.nf b/main.nf index 93fc1f0..3cecc35 100755 --- a/main.nf +++ b/main.nf @@ -38,7 +38,7 @@ include {collect_file} from "$projectDir/modules/nf-core/modules/collect_file/ma include { CELLSNP;capture_cellsnp_files } from "$projectDir/modules/nf-core/modules/cellsnp/main" - +// Channel.of(params.outdir).mkdirs() ////// WORKFLOW: Run main nf-core/yascp analysis pipeline // This is the default entry point, we have others to update ceirtain parts of the results. diff --git a/modules/nf-core/modules/citeseq/main.nf b/modules/nf-core/modules/citeseq/main.nf index 70edf6e..272029a 100755 --- a/modules/nf-core/modules/citeseq/main.nf +++ b/modules/nf-core/modules/citeseq/main.nf @@ -26,7 +26,7 @@ process SPLIT_CITESEQ_GEX { tuple val(sample_name), path("${sample_name}__Gene_Expression"), emit:gex_data tuple val(sample_name), path("antibody-${sample_name}.h5ad"), emit: ab_data2 optional true tuple val(sample_name), path("Gene_Expression-${sample_name}.h5ad"), emit: gex_h5ad optional true - tuple val(sample_name), path("${sample_name}__*"), emit: ab_data + tuple val(sample_name), path("${sample_name}__*"), emit: ab_data optional true tuple val(sample_name), path("${sample_name}__Gene_Expression/barcodes.tsv.gz"), path("${sample_name}__Gene_Expression/features.tsv.gz"), path("${sample_name}__Gene_Expression/matrix.mtx.gz"), emit: channel__file_paths_10x script: @@ -166,6 +166,8 @@ process VDJ_INTEGRATION{ output: path("*all_samples_integrated.vdj.RDS"), emit: all_data_integrated_vdj_rds + path("*all_samples_integrated.BCR.RDS"), emit: all_data_integrated_BCR_rds + path("*all_samples_integrated.TCR.RDS"), emit: all_data_integrated_TCR_rds input: path(all_cellranger_samples) diff --git a/modules/nf-core/modules/doubletdecon/main.nf b/modules/nf-core/modules/doubletdecon/main.nf index 0e9ef94..b50bc01 100644 --- a/modules/nf-core/modules/doubletdecon/main.nf +++ b/modules/nf-core/modules/doubletdecon/main.nf @@ -1,4 +1,4 @@ -process DOUBLET_DECON { +process DOUBLET_DECON{ tag "${experiment_id}" label 'process_medium' diff --git a/modules/nf-core/modules/prepere_yascp_inputs/main.nf b/modules/nf-core/modules/prepere_yascp_inputs/main.nf index a83c72c..ebabfe1 100644 --- a/modules/nf-core/modules/prepere_yascp_inputs/main.nf +++ b/modules/nf-core/modules/prepere_yascp_inputs/main.nf @@ -16,6 +16,7 @@ process YASCP_INPUTS { input: path(input_file) + // the output for this is a correct format input files as per cb 6.1 output: path("input_file_corectly_formatted.tsv"), emit: input_file_corectly_formatted diff --git a/nextflow.config b/nextflow.config index 572aa8b..2378616 100755 --- a/nextflow.config +++ b/nextflow.config @@ -25,8 +25,8 @@ params { max_multiqc_email_size = '25.MB' // Boilerplate options - outdir = output_dir = "${launchDir}/results" - tracedir = "${params.outdir}/pipeline_info" + outdir = "${launchDir}/results" + publish_dir_mode = 'copy' email = null email_on_fail = null @@ -38,7 +38,8 @@ params { schema_ignore_params = 'genomes,modules' enable_conda = false singularity_pull_docker_container = false - + tracedir = "${params.outdir}/pipeline_info" + // Config options custom_config_version = 'master' custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" @@ -78,13 +79,6 @@ includeConfig 'conf/qc.conf' includeConfig 'conf/modules.conf' -// Load igenomes.config if required -if (!params.igenomes_ignore) { - includeConfig 'conf/igenomes.conf' -} else { - params.genomes = [:] -} - profiles { debug { process.beforeScript = 'echo $HOSTNAME' } conda { diff --git a/subworkflows/celltype.nf b/subworkflows/celltype.nf index 7966b7a..4e611ad 100755 --- a/subworkflows/celltype.nf +++ b/subworkflows/celltype.nf @@ -70,9 +70,9 @@ workflow celltype{ }else{ sc_out = Channel.of() } - all_extra_fields = all_extra_fields.mix(sc_out) + all_extra_fields2 = all_extra_fields.mix(sc_out) - CELLTYPE_FILE_MERGE(az_out,ct_out,all_extra_fields,SPLIT_BATCH_H5AD.out.keras_outfile.collect()) + CELLTYPE_FILE_MERGE(az_out,ct_out,all_extra_fields2,SPLIT_BATCH_H5AD.out.keras_outfile.collect()) file__anndata_merged2=CELLTYPE_FILE_MERGE.out.file__anndata_merged2 file__anndata_merged2.view() diff --git a/subworkflows/local/retrieve_recourses.nf b/subworkflows/local/retrieve_recourses.nf index 6a48228..505b1c3 100755 --- a/subworkflows/local/retrieve_recourses.nf +++ b/subworkflows/local/retrieve_recourses.nf @@ -64,4 +64,22 @@ process RETRIEVE_RECOURSES_TEST_DATASET{ sed -i 's#/path/to/replace/pointing/to/downloaded/files#${params.outdir}/recourses/full_test_dataset#' input_test_data_file.tsv touch Done > Done.tmp """ +} + + +process STAGE_FILE{ + label 'process_tiny' + // In nf there is a function collectFile - however if you provide a symlinked file directory tusing nf function will overwrite the source instead of replacing the file + // This snipped is a replication of the function but as a nf module and hence the problem is avoided. + // publishDir "${params.outdir}/recourses", mode: "${params.copy_mode}", overwrite: true + input: + path(file) + output: + path(file) + + script: + + """ + echo 'staged' + """ } \ No newline at end of file diff --git a/subworkflows/main_deconvolution.nf b/subworkflows/main_deconvolution.nf index 08fd451..25828bf 100755 --- a/subworkflows/main_deconvolution.nf +++ b/subworkflows/main_deconvolution.nf @@ -17,7 +17,7 @@ include { match_genotypes } from './match_genotypes' include {ENHANCE_STATS_GT_MATCH } from "$projectDir/modules/nf-core/modules/genotypes/main" include {SUBSET_WORKF} from "$projectDir/modules/nf-core/modules/subset_genotype/main" include {REPLACE_GT_DONOR_ID2; REPLACE_GT_DONOR_ID2 as REPLACE_GT_DONOR_ID_SUBS } from "$projectDir/modules/nf-core/modules/genotypes/main" -include { RETRIEVE_RECOURSES } from "$projectDir/subworkflows/local/retrieve_recourses" +include { RETRIEVE_RECOURSES; STAGE_FILE } from "$projectDir/subworkflows/local/retrieve_recourses" include {GT_MATCH_POOL_IBD } from "$projectDir/modules/nf-core/modules/genotypes/main" include { MATCH_GT_VIREO } from "$projectDir/modules/nf-core/modules/genotypes/main" @@ -52,7 +52,7 @@ workflow main_deconvolution { }else{ genome = "${params.reference_assembly_fasta_dir}" } - + vcf_candidate_snps = STAGE_FILE(params.cellsnp.vcf_candidate_snps) // genome.subscribe { println "genome: $it" } if (params.genotype_input.run_with_genotype_input) { @@ -73,7 +73,7 @@ workflow main_deconvolution { // This will subsequently result in a joint vcf file per cohort per donors listed for each of the pools that can be used in VIREO and/or GT matching algorythm. SUBSET_WORKF(ch_ref_vcf,donors_in_pools,'AllExpectedGT',genome) merged_expected_genotypes = SUBSET_WORKF.out.merged_expected_genotypes - merged_expected_genotypes2 = merged_expected_genotypes.combine(Channel.fromPath(params.cellsnp.vcf_candidate_snps)) + merged_expected_genotypes2 = merged_expected_genotypes.combine(vcf_candidate_snps) // merged_expected_genotypes2.subscribe { println "merged_expected_genotypes2: $it" } GT_MATCH_POOL_IBD(SUBSET_WORKF.out.samplename_subsetvcf_ibd,'Withing_expected','Expected') @@ -105,10 +105,10 @@ workflow main_deconvolution { if (params.provide_within_pool_donor_specific_sites_for_pilup){ cellsnp_with_npooled2 = cellsnp_with_npooled.combine(cellsnp_panels, by: 0) }else{ - cellsnp_with_npooled2 = cellsnp_with_npooled.combine(Channel.fromPath(params.cellsnp.vcf_candidate_snps)) + cellsnp_with_npooled2 = cellsnp_with_npooled.combine(vcf_candidate_snps) } }else{ - cellsnp_with_npooled2 = cellsnp_with_npooled.combine(Channel.fromPath(params.cellsnp.vcf_candidate_snps)) + cellsnp_with_npooled2 = cellsnp_with_npooled.combine(vcf_candidate_snps) } log.info('Capturing some of the existing CELLSNP files') diff --git a/subworkflows/prepare_inputs.nf b/subworkflows/prepare_inputs.nf index 69c438c..be61c90 100755 --- a/subworkflows/prepare_inputs.nf +++ b/subworkflows/prepare_inputs.nf @@ -15,6 +15,7 @@ workflow prepare_inputs { YASCP_INPUTS(channel_input_data_table) channel_input_data_table = YASCP_INPUTS.out.input_file_corectly_formatted + channel_input_data_table .splitCsv(header: true, sep: params.input_tables_column_delimiter) diff --git a/workflows/yascp.nf b/workflows/yascp.nf index 85492f5..22774af 100755 --- a/workflows/yascp.nf +++ b/workflows/yascp.nf @@ -57,7 +57,7 @@ workflow YASCP { bam_split_channel = Channel.of() out_ch = params.outdir ? Channel.fromPath(params.outdir, checkIfExists:true) - : Channel.fromPath("${launchDir}/${params.outdir}") + : Channel.from("${launchDir}/${params.outdir}") // out_ch.map{row->"${row[0]}/possorted_genome_bam.bam" } prepare_inputs(input_channel)