diff --git a/assets/deploy_scripts/module_exacutables/yascp b/assets/deploy_scripts/module_exacutables/yascp
index 174b8ac..e555c03 100755
--- a/assets/deploy_scripts/module_exacutables/yascp
+++ b/assets/deploy_scripts/module_exacutables/yascp
@@ -5,6 +5,7 @@ bold=$(tput bold)
 normal=$(tput sgr0)
 
 
+
 if [[ "$QUEUE" == '' ]];
     then
         export QUEUE='long'
diff --git a/bin/4.add_vdj.R b/bin/4.add_vdj.R
index 4a0ed4d..0af69d2 100755
--- a/bin/4.add_vdj.R
+++ b/bin/4.add_vdj.R
@@ -34,8 +34,7 @@ data_dir <- './'
 args = commandArgs(trailingOnly=TRUE)
 
 wnn_integrated_file = args[1]
-n2 = strsplit(as.character(wnn_integrated_file), split="__all_samples")[[1]][1]
-outname = paste0(n2,'__all_samples_integrated.vdj.RDS')
+
 
 myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral")))
 
@@ -155,8 +154,14 @@ vdj_dir <- paste0('./vdj/')
 dir.create(vdj_dir,showWarnings = F)
 names(immdata_TCR$data) <- tcr_names
 names(immdata_BCR$data) <- bcr_names
-saveRDS(immdata_BCR, file=paste0(vdj_dir,'/slemap_BCR.rds'))
-saveRDS(immdata_TCR, paste0(vdj_dir,'/slemap_TCR.rds'))
+
+n2 = strsplit(as.character(wnn_integrated_file), split="__all_samples")[[1]][1]
+outname = paste0(n2,'__all_samples_integrated.vdj.RDS')
+outname_BCR = paste0(n2,'__all_samples_integrated.BCR.RDS')
+outname_TCR = paste0(n2,'__all_samples_integrated.TCR.RDS')
+
+saveRDS(immdata_BCR, file=outname_BCR)
+saveRDS(immdata_TCR, file=outname_TCR)
 
 saveRDS(all_samples, file=outname)
 
diff --git a/bin/concordance_calculations.py b/bin/concordance_calculations.py
index 0aac3a4..5b61c83 100755
--- a/bin/concordance_calculations.py
+++ b/bin/concordance_calculations.py
@@ -35,8 +35,11 @@
 mode = 'no_het'  #We can not determine whether hets are concordant or discordant in terms of reads. 
 
 
-class Concordances:
     
+    
+
+class Concordances:
+
     def reset(self):
         self.cell_concordance_table ={}
 
@@ -55,1069 +58,1095 @@ def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_
         self.uninformative_sites = uninformative_sites
         self.record_dict={}
 
-    def norm_genotypes(self,expected_vars):
-        expected_vars = pd.DataFrame(expected_vars)
-        if len(expected_vars) > 0:
-            split_str=expected_vars[0].str.split("_")
-            expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3]
-            expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1]
-            expected_vars['vars'] = split_str.str[4]
-            expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False)
-            expected_vars = expected_vars[expected_vars['vars']!='./.']
-            expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0'
-            expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars']
-        return expected_vars
-
-
-
-    def get_strict_discordance(self, expected_vars, cell_vars):
-        '''
-        take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant 
-        sites and relaxed concordant sites and list of discordant sites1
-        1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype
-        2) if you have a 0/0 you can not get a 1/1 or 0/1
-        3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 1/1
-        So - each obversed cellsnp allele must be in the array SNP gtype
-        '''
-        snp_gtypes = expected_vars[0]
-        cellsnp_gtypes = cell_vars[0]
-        true_discordant = 0
-        relaxed_concordant = 0
-        relaxed_concordant_informative = 0
-        relaxed_concordant_informative_ids = []
-        relaxed_concordant_uninformative_ids = []
-        true_discordant_uninformative_ids =[]
-        true_discordant_informative_ids=[]
-        relaxed_concordant_uninformative = 0
-        true_discordant_informative = 0
-        true_discordant_uninformative = 0
-        discordant_vars = []
-        concordant_vars = []
-        subset_informative_concordant = 0
-        subset_informative_discordant = 0
-
-        #print(self.uninformative_sites)
-        #print(self.informative_sites)
+    def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match, donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data,donor_assignments_table):
 
-        #create sets of the ids (chrom, pos, ref, alt) in each set of genotypes. Filter to the ids present in both 
-        #then filter to informative and uninformative. If uninformative >0 then create a subset of informative
-        # with the same number of vars (at random)
-        split_snp_gts=snp_gtypes.str.split("_")
-        snp_gtypes_ids = set(split_snp_gts.str[0]+'_'+split_snp_gts.str[1]+'_'+split_snp_gts.str[2]+'_'+split_snp_gts.str[3])
+        Concordant_Sites, \
+            Discordant_sites, \
+            Total_Overlapping_sites, \
+            discordant_sites, \
+            cell_vars_norm, \
+            true_discordant_count, \
+            relaxed_concordant_count, \
+            relaxed_concordant_informative_count, \
+            relaxed_concordant_uninformative_count, \
+            true_discordant_informative_count, \
+            true_discordant_uninformative_count, \
+            total_sites, \
+            informative_sites, \
+            uninformative_sites, \
+            total_reads, \
+            total_reads_informative, \
+            total_reads_uninformative, \
+            discordant_reads, \
+            discordant_reads_informative, \
+            discordant_reads_uninformative, \
+            discordant_vars, \
+            concordant_vars, \
+            discordant_read_fraction_in_concordant_sites, \
+            discordant_read_fraction_in_discordant_sites, \
+            discordant_reads_uninformative_fraction, \
+            discordant_reads_informative_fraction, discordant_reads_informative_subset  = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars)
+            
+        total_concordant_sites = len(Concordant_Sites) + relaxed_concordant_count
+        dds = self.donor_distinct_sites[donor_gt_match]
+        Nr_donor_distinct_sites = len(dds)
+        Nr_Concordant = len(Concordant_Sites)
+        Nr_Relaxed_concordant = relaxed_concordant_count
+        Nr_Discordant = len(Discordant_sites)
+        Nr_Total_Overlapping_sites = len(Total_Overlapping_sites)
+        Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Discordant_sites)))
+        Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos'])
+        #Quantify donor variation in other donors
+        discordant_vars_in_pool = []
+        donor_table_of_concordances = []
+        total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set()
+        total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown = {}
+        total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort']={}
+        total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort']={}
+      
+        total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown_totalReads = {}
+        total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown_totalReads['Whithin_Cohort']={}
+        total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown_totalReads['Out_of_Cohort']={}      
+      
+        total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown = {}
+        total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort']={}
+        total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort']={}      
+      
+        
+        informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set()
+        total_cordant_sites_that_are_concordant_with_other_donors_in_pool = set()
+        donor_gt_match_cohort = donor_cohorts[donor_gt_match]
+        donors_contributing_to_out_of_cohort= []
+        donors_contributing_to_Whithin_Cohort=[]
+        sites_contributing_to_out_of_cohort= []
+        sites_contributing_to_Whithin_Cohort=[]
+        for donor in set(donor_assignments_table['donor_gt']):
+            try:
+                expected_vars_norm_of_other_donor = all_donor_data[donor]
+            except:
+                continue
+            try:
+                donor_cohort = donor_cohorts[donor]
+                donor_vars = vars_per_donor_gt[donor]
+            except:
+                continue        
+            
+            Concordant_Sites_otherDonor, \
+                Discordant_sites_otherDonor, \
+                Total_Overlapping_sites_otherDonor, \
+                discordant_sites_otherDonor, \
+                cell_vars_norm_otherDonor, \
+                true_discordant_count_otherDonor, \
+                relaxed_concordant_count_otherDonor, \
+                relaxed_concordant_informative_count_otherDonor, \
+                relaxed_concordant_uninformative_count_otherDonor, \
+                true_discordant_informative_count_otherDonor, \
+                true_discordant_uninformative_count_otherDonor, \
+                total_sites_otherDonor, \
+                informative_sites_otherDonor, \
+                uninformative_sites_otherDonor, \
+                total_reads_otherDonor, \
+                total_reads_informative_otherDonor, \
+                total_reads_uninformative_otherDonor, \
+                discordant_reads_otherDonor, \
+                discordant_reads_informative_otherDonor, \
+                discordant_reads_uninformative_otherDonor, \
+                discordant_vars_otherDonor, \
+                concordant_vars_otherDonor, \
+                discordant_read_fraction_in_concordant_sites_otherDonor, \
+                discordant_read_fraction_in_discordant_sites_otherDonor, \
+                discordant_reads_uninformative_fraction_otherDonor, \
+                discordant_reads_informative_fraction_otherDonor, discordant_reads_informative_subset_otherDonor  = self.retrieve_concordant_discordant_sites(expected_vars_norm_of_other_donor,cell_vars,donor_cohort=donor_cohort)
+            
+            if total_sites_otherDonor==0:
+                total_sites_otherDonor=0.000000000001
+            total_concordant_sites_otherDonor = relaxed_concordant_count_otherDonor
+            concordant_percent_in_other_donor= total_concordant_sites_otherDonor/total_sites_otherDonor*100
+            discordant_percent_in_other_donor= true_discordant_count_otherDonor/total_sites_otherDonor*100
+            DonorDiscordant_Sites_that_are_atributed_to_other_donor = set(discordant_vars).intersection(set(concordant_vars_otherDonor))
+            TotalSites_that_are_atributed_to_other_donor = set(discordant_vars).union(set(concordant_vars)).intersection(set(concordant_vars_otherDonor))
 
-        split_cellsnp_gts=cellsnp_gtypes.str.split("_")
-        cellsnp_gtypes_ids = set(split_cellsnp_gts.str[0]+'_'+split_cellsnp_gts.str[1]+'_'+split_cellsnp_gts.str[2]+'_'+split_cellsnp_gts.str[3])
+            Informative__DonorDiscordant_Sites_that_are_atributed_to_other_donor = set(true_discordant_informative_count).intersection(set(relaxed_concordant_informative_count_otherDonor))
+            DonorCordant_Sites_that_are_atributed_to_other_donor = set(concordant_vars).intersection(set(concordant_vars_otherDonor))
+           
+            total_reads_for_discordant_sites_that_are_concordant_with_other_donor,discordant_reads_for_discordant_sites_that_are_concordant_with_other_donor,concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor,\
+                _,_,_= self.read_extraction(DonorDiscordant_Sites_that_are_atributed_to_other_donor,expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor)
 
-        shared_gts = snp_gtypes_ids.intersection(cellsnp_gtypes_ids)
+            if not donor == donor_gt_match:
+                # We want to kow how many of these discordant site
+                if 'U937' in donor:
+                    continue
+                if 'THP1' in donor:
+                    continue
+                
+                if donor_gt_match_cohort == donor_cohort:
+                    coh = 'Whithin_Cohort'
+                    sites_contributing_to_Whithin_Cohort.extend(list(TotalSites_that_are_atributed_to_other_donor))
+                    if len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)>0:
+                        donors_contributing_to_Whithin_Cohort.append(donor)
+                        
+                else:
+                    coh = 'Out_of_Cohort'
+                    sites_contributing_to_out_of_cohort.extend(list(TotalSites_that_are_atributed_to_other_donor))
+                    if len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)>0:
+                        donors_contributing_to_out_of_cohort.append(donor)
+                        
+                
+                total_discordant_sites_that_are_concordant_with_other_donors_in_pool = total_discordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(DonorDiscordant_Sites_that_are_atributed_to_other_donor))
+                # now we addit for a cohort since the biggest issue comes from cohort cross-contamination
+                # for each of these sites now we calculate the number of reads that it accounts:
+                # tree level set: cohort: site: counts
+                for site in DonorDiscordant_Sites_that_are_atributed_to_other_donor:
+                    total_reads_for_site,discordant_reads_for_site,concordant_for_site,\
+                        total_reads_noHet,discordant_reads_noHet,concordant_reads_noHet= self.read_extraction([site],expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor)
+                    if concordant_for_site==0:
+                        pass
+                    try:
+                        total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_reads_noHet)
 
-        shared_informative = shared_gts.intersection(self.informative_sites)
-        shared_uninformative = shared_gts.intersection(self.uninformative_sites)
-        # print("shared informative " + str(len(shared_informative)))
-        # print("shared uninformative " + str(len(shared_uninformative)))
+                    except:
+                        try:
+                            total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[]
+                            total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_reads_noHet)
+                            
+                        except:
+                            total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh]={}
+                            total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[]
+                            total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_reads_noHet)
 
-        #store the numbers of informative and uninformative sites shared between cellSNP and gt data as these
-        #are the sites used for concordance
-        self.informative_covered = len(shared_informative)
-        self.uninformative_covered = len(shared_uninformative)
+                for site in DonorDiscordant_Sites_that_are_atributed_to_other_donor:
+                    total_reads_for_site,discordant_reads_for_site,concordant_for_site,\
+                        _,_,_= self.read_extraction([site],expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor)
+                    # if discordant_reads_for_site>0:
+                    #     print('here')
+                    if concordant_for_site==0:
+                        pass
+                    try:
+                        total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site)
+                    except:
+                        try:
+                            total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[]
+                            total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site)
+                        
+                        except:
+                            total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh]={}
+                            total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[]
+                            total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site)
+                                                    
+                # to get the total reads that can be atributed to the other donor i have to check if site is already covered in the total_discordant_sites_that_are_concordant_with_other_donors_in_pool.
+                # the ones that havent, i have to add the reads up for them.
+                informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(Informative__DonorDiscordant_Sites_that_are_atributed_to_other_donor))
+                
+                total_cordant_sites_that_are_concordant_with_other_donors_in_pool = total_cordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(DonorCordant_Sites_that_are_atributed_to_other_donor))
+                
 
-        if len(shared_uninformative) > 0:
-            #print(len(shared_uninformative))
-            # print(len(shared_informative))
-            if len(shared_uninformative) <= len(shared_informative):
-                informative_subset = set(random.sample(shared_informative, len(shared_uninformative)))
+                common_vars = list(set(discordant_vars) & set(donor_vars))
+                common_var_count = str(len(common_vars))
+                donor_cohort_common = donor + ":" + donor_cohort + ":" + common_var_count
+                discordant_vars_in_pool.append(donor_cohort_common)
+                
+                # Here we want to calculate the number of discordant sites in other donors and see if in terms of concordance the same donor is picked as per GT assignment.
+                # We do this to investigate the potential of a cell coming from this other donor.
             else:
-                informative_subset = set()#if there are more shared uninformative than shared informative we will not subset
-            # print(informative_subset)
-            # exit(0)
-        else:
-            informative_subset = set()
-
-        # print(informative_subset)
-        self.informative_subset = informative_subset
-
-        snp_gtypes_set = set(snp_gtypes)
-        snp_gtypes_set = sorted(snp_gtypes_set)
-
-        cellsnp_gtypes_set = set(cellsnp_gtypes)
-        cellsnp_gtypes_set = sorted(cellsnp_gtypes_set)
-
-        #for i in range(0, len(snp_gtypes)):
-        # cellsnp_gtypes_str = pd.DataFrame(cellsnp_gtypes_set,columns=['cellsnp_gtypes'])
-        # cellsnp_gtypes_str['snp_gtypes'] = pd.DataFrame(snp_gtypes_set,columns=['snp_gtypes'])['snp_gtypes']
-        for i in range(0, len(snp_gtypes_set)):
-            discordant = False
-            # snp_data = snp_gtypes[i].split('_')
-            # cellsnp_data = cellsnp_gtypes[i].split('_')
-            snp_data = snp_gtypes_set[i].split('_')
-            cellsnp_data = cellsnp_gtypes_set[i].split('_')
-
-            snp_alleles_set = set([snp_data[4][0], snp_data[4][2]])
-            cellsnp_alleles_set = set([cellsnp_data[4][0], cellsnp_data[4][2]])
-            
-            snp_var = ('_').join(snp_data[0:4])
-            cellsnp_var = ('_').join(cellsnp_data[0:4])
-
-            if not cellsnp_var == snp_var:
-                print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i])
-                exit(1)
-            else:
-                for allele in cellsnp_alleles_set:
-                    if not allele in snp_alleles_set:#if a cellSNP allele is found that is not in the array data this is discordant
-                        discordant = True
+                _='check'
+                try:
+                    percent_relaxed_concordant = Nr_Relaxed_concordant/(Nr_Relaxed_concordant+true_discordant_count)*100
+                except:
+                    percent_relaxed_concordant = 0
+                # g2 = total_concordant_sites_otherDonor/total_sites_otherDonor*100
+                if not (percent_relaxed_concordant == concordant_percent_in_other_donor):
+                    _='something not right'
+                    break
             
-            if discordant == True:
-                true_discordant+=1
-                discordant_vars.append(cellsnp_var)
-                if snp_var in self.uninformative_sites:
-                    true_discordant_uninformative+=1
-                    true_discordant_uninformative_ids.append(snp_var)
-                elif snp_var in self.informative_sites:
-                    true_discordant_informative+=1
-                    true_discordant_informative_ids.append(snp_var)
-            else:
-                relaxed_concordant+=1
-                concordant_vars.append(cellsnp_var)
-                if snp_var in self.uninformative_sites:
-                    relaxed_concordant_uninformative+=1
-                    relaxed_concordant_uninformative_ids.append(snp_var)
-                elif snp_var in self.informative_sites:
-                    relaxed_concordant_informative+=1
-                    relaxed_concordant_informative_ids.append(snp_var)
-            if len(shared_uninformative) > 0:
-                if snp_var in informative_subset:
-                    if discordant == True:
-                        subset_informative_discordant+=1
-                    else:
-                        subset_informative_concordant+=1
-            #    true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count 
-        cell_vars2 = cell_vars.set_index('ids') 
-        disc = pd.DataFrame(set(cell_vars2.loc[discordant_vars]['combo']),columns=['combo_x'])
-        df_cd = pd.merge(cell_vars, expected_vars, how='inner', on = 'pos')
-        disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x')
-        disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y']
-        disc_sites_string = ';'.join(disc2['expected_retrieved'])
-        return true_discordant, relaxed_concordant, relaxed_concordant_informative_ids, relaxed_concordant_uninformative_ids, true_discordant_informative_ids, true_discordant_uninformative_ids, discordant_vars, concordant_vars, disc_sites_string
+            donor_table_of_concordances.append({'donor':donor, 'cell':cell1, 'donor_cohort':donor_cohort, \
+                                                'gt matched donor':donor == donor_gt_match, \
+                                                'DonorCordant_Sites_that_are_atributed_to_other_donor':len(DonorCordant_Sites_that_are_atributed_to_other_donor), \
+                                                'DonorCordant_Sites_that_are_atributed_to_other_donor/total':f"{len(DonorCordant_Sites_that_are_atributed_to_other_donor)}/{len(concordant_vars)}", \
+                                                'DonorDiscordant_Sites_that_are_atributed_to_other_donor':len(DonorDiscordant_Sites_that_are_atributed_to_other_donor), \
+                                                'DonorDiscordant_Sites_that_are_atributed_to_other_donor/total':f"{len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)}/{len(discordant_vars)}", \
+                                                'concordant_percent_in_other_donor':concordant_percent_in_other_donor, \
+                                                'discordant_percent_in_other_donor':discordant_percent_in_other_donor, \
+                                                'discordant_reads_otherDonor':discordant_reads_otherDonor, \
+                                                'discordant_sites_otherDonor':len(discordant_vars_otherDonor), \
+                                                'concordant_sites_otherDonor':len(concordant_vars_otherDonor), \
+                                                'total_sites_otherDonor':total_sites_otherDonor, \
+                                                'discordant_reads_otherDonor':discordant_reads_otherDonor, \
+                                                'total_reads_otherDonor':total_reads_otherDonor, \
+                                                # 'discordant_read_fraction_in_concordant_sites_otherDonor':discordant_read_fraction_in_concordant_sites_otherDonor, \
+                                                # 'discordant_read_fraction_in_discordant_sites_otherDonor':discordant_read_fraction_in_discordant_sites_otherDonor, \
+                                                'concordant_reads_For_discordant_sites_that_are_Concordant_with_other_donor':concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor
+                                                })
+        
+        
+        #here now we want to see overall how many reads potentially come from different cohorts.
+        # cohort_specific_site_quant_string=""
+        # cohort_specific_read_quant_string=""
+        
+        Whithin_Cohort__total_number_of_potential_contaminent_reads=0
+        Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool=0
+        Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool=0
+        try:
+            Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool = len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys())
+            # for k1 in total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys():
+            #     Whithin_Cohort__total_number_of_potential_contaminent_reads+= max(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'][k1])
+        except:
+            _='Doesnt Exist'
+        
+        Out_of_Cohort__total_number_of_potential_contaminent_reads=0
+        try:
+            Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool = len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys())
+            # for k1 in total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys():
+            #     Out_of_Cohort__total_number_of_potential_contaminent_reads+= max(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'][k1])
+        except:
+            _='Doesnt Exist'
     
-    def read_concordance_calc(self,expected_vars,cell_vars):
-            
-            # This is a wrapper to add up the discordant reads in the cellsnp file.
+    
+    
+    
+    
+    
+    
+    
+        # No het total calcs
+        
+        # Since we can not tell just from the cellsnp file if the discordant read at concordant site is actually concordant with another donor in a pool
+        # we assume the worse case scenario and add idscordant reads at concordant sites to the total.
+        # Number of reads on the sites that are concordant, but has discordant reads with another donor
+        Whithin_Cohort__total_number_of_potential_contaminent_reads=0
+        try:
+            concordant_sites_atributed_to_other_donor = list(set(sites_contributing_to_Whithin_Cohort))
+            if not total_sites >= len(concordant_sites_atributed_to_other_donor):
+                raise Exception("Sorry, the number of sites atributed can not be larger than total sites")
             
-            # expected genotype 0/0
-            expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0']
-            hom_ref_sites = set(expected_hom_ref['ids'])
-            cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)]
-            ad_hom_ref = cell_vars2['AD'].sum()
-            oth_hom_ref = cell_vars2['OTH'].sum() 
-            discordant_hom_ref = ad_hom_ref + oth_hom_ref
+            _,_,_,\
+                _,Whithin_Cohort__total_number_of_potential_contaminent_reads, \
+                    _= self.read_extraction(concordant_sites_atributed_to_other_donor,expected_vars_norm,cell_vars_norm)
                     
-            # expected genotype 0/1 or 1/0
-            hets = ['0/1', '1/0']
-            expected_het = expected_vars[expected_vars['vars'].isin(hets)]
-            het_sites = set(expected_het['ids'])
-            cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)]
-            discordant_het = cell_vars3['OTH'].sum()
+            if not discordant_reads >= len(Whithin_Cohort__total_number_of_potential_contaminent_reads):
+                raise Exception("Total discordant reads can not be less than within cohort discordant reads.")            
             
-            # expected genotype 1/1
-            expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1']
-            hom_alt_sites = set(expected_hom_alt['ids'])
-            cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)]
-            # DP + OTH - AD
-            ad_hom_alt = cell_vars4['AD'].sum()
-            dp_hom_alt = cell_vars4['DP'].sum()
-            oth_hom_alt = cell_vars4['OTH'].sum()
-            discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt
             
-            # Total analysis
-            discordant_reads =  discordant_hom_ref + discordant_het + discordant_hom_alt
-            total_dp = cell_vars['DP'].sum()
-            total_oth = cell_vars['OTH'].sum()
-            total_reads = total_dp + total_oth
-            discordantRead_ratio = discordant_reads/total_reads
+        except:
+            _='Doesnt Exist'
+
+
+        Out_of_Cohort__total_number_of_potential_contaminent_reads=0
+        try:
+            concordant_sites_atributed_to_other_donor =  list(set(sites_contributing_to_out_of_cohort))
+            if not total_sites >= len(concordant_sites_atributed_to_other_donor):
+                raise Exception("Sorry, the number of sites atributed can not be larger than total sites")
             
-            # No het analysis - they are not discordant or concordant - we cannot make any call at those sites
-            discordant_reads_noHet =  discordant_hom_ref + discordant_hom_alt
-            total_reads_noHet = total_reads - discordant_het
-            discordantRead_ratio_noHet = discordant_reads_noHet/total_reads_noHet
+            _,_,_,_,Out_of_Cohort__total_number_of_potential_contaminent_reads, _= self.read_extraction(concordant_sites_atributed_to_other_donor,expected_vars_norm,cell_vars_norm)
             
-            return total_reads,total_dp,total_oth,discordant_reads,discordantRead_ratio,discordant_reads_noHet,total_reads_noHet,discordantRead_ratio_noHet
-
-    def read_condordance(self, expected_vars, cell_vars,discordant_vars, concordant_vars):
-        '''
-        get read level concordance using DP, AD and OTH format fields
-        ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="total counts for ALT and REF">
-        ##FORMAT=<ID=AD,Number=1,Type=Integer,Description="total counts for ALT">
-        ##FORMAT=<ID=OTH,Number=1,Type=Integer,Description="total counts for other bases from REF and ALT">
-        '''
-        if not len(expected_vars) == len(cell_vars):
-            print("length mismatch between expected vars and cell vars")
-            exit(1)
+            if not discordant_reads >= len(Out_of_Cohort__total_number_of_potential_contaminent_reads):
+                raise Exception("Total discordant reads can not be less than out of cohort discordant reads.")                       
 
-        total_sites = len(expected_vars)
-        #add cols for DP, AD< OTH
-        cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int)
-        cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int)
-        cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int)
+        except:
+            _='Doesnt Exist'
+                
+    
         
+        try:
+            Out_of_Cohort__sites = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys())
+        except:
+            Out_of_Cohort__sites = set()
+            
+        try:
+            Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) - set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) 
+            shared_sites_between_out_of_cohort_and_within_cohort = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()).intersection(set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) )
+        except:
+            Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set()
+            shared_sites_between_out_of_cohort_and_within_cohort = set()
+            
+        try:
+            Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) - set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) 
+        except:
+            Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys())
 
+        Out_of_Cohort__unique_sites_total_reads,_,_,_,_,_ = self.read_extraction(Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool,expected_vars_norm,cell_vars_norm)
+            
+        try:
+            Whithin_Cohort__sites = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys())
+        except:
+            Whithin_Cohort__sites = set()  
+            
+
+          
+        Total__discordant_sites_that_are_concordant_with_other_donors_in_pool = Whithin_Cohort__sites.union(Out_of_Cohort__sites)
+        concordant_vars_in_pool_str = (";").join(concordant_vars)
+        DF = pd.DataFrame(donor_table_of_concordances)
         
-        # Total
-        total_reads,total_dp,total_oth,discordant_reads, \
-            discordantRead_ratio,discordant_reads_noHet,total_reads_noHet,discordantRead_ratio_noHet = self.read_concordance_calc(expected_vars,cell_vars)
-        
-        # uninformative
-        cell_vars_uninformative = cell_vars[cell_vars['ids'].isin(self.uninformative_sites)]
-        total_reads_uninformative,total_dp_uninformative,total_oth_uninformative,discordant_reads_uninformative, \
-            discordantRead_ratio_uninformative,discordant_reads_uninformative_noHet,total_reads_uninformative_noHet,discordantRead_ratio_uninformative_noHet= self.read_concordance_calc(expected_vars,cell_vars_uninformative)
-        
-        # informative 
-        cell_vars_informative = cell_vars[cell_vars['ids'].isin(self.informative_sites)]
-        total_reads_informative,total_dp_informative,total_oth_informative,discordant_reads_informative, \
-            discordantRead_ratio_informative,discordant_reads_informative_noHet,total_reads_informative_noHet,discordantRead_ratio_informative_noHet = self.read_concordance_calc(expected_vars,cell_vars_informative)
-        
-        # Split into concordant and discordant sites
-        # concordant
-        concordant_sites = cell_vars[cell_vars['ids'].isin(set(concordant_vars))]
-        total_reads_for_concordant_sites,total_dp_for_concordant_sites,total_oth_for_concordant_sites,discordant_reads_for_concordant_sites, \
-            discordantRead_ratio_for_concordant_sites,discordant_reads_for_concordant_sites_noHet,total_reads_for_concordant_sites_noHet,discordantRead_ratio_for_concordant_sites_noHet = self.read_concordance_calc(expected_vars,concordant_sites)
+        Donor_With_Lowest_DisConcordance = ';'.join(DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['donor'].values)
+        Lowest_Disconcordance_value_in_all_donors= DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['discordant_percent_in_other_donor'].values[0]
         
-        # discordant
-        discordant_sites = cell_vars[cell_vars['ids'].isin(set(discordant_vars))]
-        total_reads_for_discconcordant_sites,total_dp_for_discconcordant_sites,total_oth_for_discconcordant_sites,discordant_reads_for_discconcordant_sites, \
-            discordantRead_ratio_for_discconcordant_sites,discordant_reads_for_discconcordant_sites_noHet,total_reads_for_discconcordant_sites_noHet,discordantRead_ratio_for_discconcordant_sites_noHet = self.read_concordance_calc(expected_vars,discordant_sites)
-              
-        # Subset analysis
-        cell_vars_informative_subset = cell_vars[cell_vars['ids'].isin(self.informative_subset)]
-        total_reads_informative_subset,total_dp_informative_subset,total_oth_informative_subset,discordant_reads_informative_subset, \
-            discordantRead_ratio_informative_subset,discordant_reads_informative_subset_noHet,total_reads_informative_subset_noHet,discordantRead_ratio_informative_subset_noHet = self.read_concordance_calc(expected_vars,cell_vars_informative_subset)
-
-        if mode == 'no_het':
-            total_reads = total_reads_noHet
-            discordant_reads = discordant_reads_noHet
-            total_reads_informative = total_reads_informative_noHet
-            discordant_reads_informative = discordant_reads_informative_noHet
-            total_reads_uninformative =total_reads_uninformative_noHet
-            discordant_reads_uninformative = discordant_reads_uninformative_noHet
-            total_reads_informative_subset = total_reads_informative_subset_noHet
-            discordant_reads_informative_subset = discordant_reads_informative_subset_noHet
-            total_reads_for_concordant_sites = total_reads_for_concordant_sites_noHet
-            discordant_reads_for_concordant_sites = discordant_reads_for_concordant_sites_noHet
-            total_reads_for_discconcordant_sites = total_reads_for_discconcordant_sites_noHet
-            discordant_reads_for_discconcordant_sites = discordant_reads_for_discconcordant_sites_noHet
-        
-        return total_sites, \
-            self.informative_covered, \
-                self.uninformative_covered, \
-                    total_reads, \
-                        discordant_reads, \
-                            total_reads_informative, \
-                                discordant_reads_informative, \
-                                    total_reads_uninformative, \
-                                        discordant_reads_uninformative, \
-                                            total_reads_informative_subset, \
-                                                    discordant_reads_informative_subset, \
-                                                        total_reads_for_concordant_sites, \
-                                                            discordant_reads_for_concordant_sites, \
-                                                                total_reads_for_discconcordant_sites, \
-                                                                    discordant_reads_for_discconcordant_sites
-                                                                    
-                                                    
-    
-    def get_discordance(self,expected_vars2,cell_vars2):
-        Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo']))
-        Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo'])  
-        disc = pd.DataFrame(Discordant_sites,columns=['combo_x'])
-        df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos')
-        disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x')
-        disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y']
-        disc_sites = ';'.join(disc2['expected_retrieved'])
-        return Concordant_Sites,Discordant_sites,disc_sites
-    
-    
-    def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars,donor_cohort=False):
-        # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations.
-        # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline.
-        # Author: M.Ozols
-        
-        cell_vars_norm = self.norm_genotypes(cell_vars)
-
-        if len(cell_vars_norm) > 0:
-            
-            if mode == 'no_het':
-                hets = ['0/1', '1/0']
-                expected_vars_norm = expected_vars_norm[~expected_vars_norm['vars'].isin(hets)]
+        Donor_With_Highest_Concordance = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['donor'].values)
+        Highest_Concordance_value_in_all_donors= DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['concordant_percent_in_other_donor'].values[0]
+        Total_sites_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_sites_otherDonor'].astype(str).values)
+        Total_reads_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_reads_otherDonor'].astype(str).values)
+                                                       
+        return [{
+            'cell1':cell1,
+            'donor_gt_match':donor_gt_match,
+            'Nr_Concordant':Nr_Concordant,
+            'Nr_Discordant':Nr_Discordant,
+            'Nr_Relaxed_concordant':Nr_Relaxed_concordant,
+            'true_discordant_count':true_discordant_count,
+            'relaxed_concordant_informative_count':relaxed_concordant_informative_count,
+            'relaxed_concordant_uninformative_count':relaxed_concordant_uninformative_count,
+            'true_discordant_informative_count':true_discordant_informative_count,
+            'true_discordant_uninformative_count':true_discordant_uninformative_count,
+            'Nr_Total_Overlapping_sites':Nr_Total_Overlapping_sites,
+            'Number_of_sites_that_are_donor_concordant_and_exclusive':Number_of_sites_that_are_donor_concordant_and_exclusive,
+            'Nr_donor_distinct_sites':Nr_donor_distinct_sites,
+            'count':count,
+            'discordant_sites':discordant_sites,
+            'total_sites':total_sites,
+            'informative_sites':informative_sites,
+            'uninformative_sites':uninformative_sites,
+            'total_reads_informative':total_reads_informative,
+            'total_reads_uninformative':total_reads_uninformative,
+            'discordant_reads_informative':discordant_reads_informative,
+            'discordant_reads_uninformative':discordant_reads_uninformative,
+            'Discordant_sites_in_pool': discordant_vars,
+            'Lowest_Disconcordance_value_in_all_donors':Lowest_Disconcordance_value_in_all_donors,
+            'Donor_With_Lowest_DisConcordance':Donor_With_Lowest_DisConcordance,
+            'Concordant_Site_Identities':concordant_vars_in_pool_str,
+            'Donor_With_Highest_Concordance':Donor_With_Highest_Concordance,
+            'Highest_Concordance_value_in_all_donors':Highest_Concordance_value_in_all_donors,
+            'Total_sites_other_donor':Total_sites_other_donor,
+            'Total_reads_other_donor':Total_reads_other_donor,
+            'total_discordant_sites_that_are_concordant_with_other_donors_in_pool':f"{len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool)}/{len(discordant_vars)}",
+            'informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool':f"{len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool)}/{len(true_discordant_informative_count)}",
+            'total_reads':total_reads,
+            'discordant_reads':discordant_reads,
+            'discordant_reads_informative_subset':discordant_reads_informative_subset, \
+            'discordant_read_fraction_in_concordant_sites':discordant_read_fraction_in_concordant_sites, \
+            'discordant_read_fraction_in_discordant_sites':discordant_read_fraction_in_discordant_sites, \
+            'Whithin_Cohort__total_number_of_potential_contaminent_reads':Whithin_Cohort__total_number_of_potential_contaminent_reads, \
+            'Out_of_Cohort__total_number_of_potential_contaminent_reads':Out_of_Cohort__total_number_of_potential_contaminent_reads, \
+            'NrDonors_contributing_to_out_of_cohort':len(set(donors_contributing_to_out_of_cohort)), \
+            'NrDonors_contributing_to_Whithin_Cohort':len(set(donors_contributing_to_Whithin_Cohort)), \
                 
-            Total_Overlapping_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids']))
-            expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)]
-            cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)]
-            cell_vars2 = cell_vars2.drop_duplicates(subset=['ids'])
-            expected_vars2 = expected_vars2.drop_duplicates(subset=['ids'])
-            if len(cell_vars2)!=len(expected_vars2):
-                print('failed to select comon variants')
-                exit(1)
-            
-            # Find exact discordant sites
-            Concordant_Sites, Discordant_sites, _ = self.get_discordance(expected_vars2, cell_vars2)
-            #find truly discordant sites
-            true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, discordant_vars, concordant_vars, disc_sites_string = self.get_strict_discordance(expected_vars2, cell_vars2)
-            #find discordant reads
-            total_sites, \
-                informative_sites, \
-                    uninformative_sites, \
-                        total_reads, \
-                            discordant_reads, \
-                                total_reads_informative, \
-                                    discordant_reads_informative, \
-                                        total_reads_uninformative, \
-                                            discordant_reads_uninformative, \
-                                                    total_reads_informative_subset, \
-                                                        discordant_reads_informative_subset, \
-                                                            total_reads_for_concordant_sites, \
-                                                                discordant_reads_for_concordant_sites, \
-                                                                    total_reads_for_discconcordant_sites, \
-                                                                        discordant_reads_for_discconcordant_sites   = self.read_condordance(expected_vars2, cell_vars2, discordant_vars, concordant_vars)
-                                                                        
-            discordant_read_fraction_in_concordant_sites = f"{discordant_reads_for_concordant_sites}/{total_reads_for_concordant_sites}"
-            discordant_read_fraction_in_discordant_sites = f"{discordant_reads_for_discconcordant_sites}/{total_reads_for_discconcordant_sites}"
-            discordant_reads_uninformative_fraction = f"{discordant_reads_uninformative}/{total_reads_uninformative}"
-            discordant_reads_informative_fraction = f"{discordant_reads_informative}/{total_reads_informative}"
-            
-            # sanity checks
-            if total_reads != total_reads_for_concordant_sites+total_reads_for_discconcordant_sites:
-                print("Error: total reads dont add up ")
-                exit(1)
-            if discordant_reads != discordant_reads_for_concordant_sites+discordant_reads_for_discconcordant_sites:
-                print("Error: discordant reads dont add up ")
-                exit(1)
-            
-
-        else:
-            Total_Overlapping_sites = set()
-            Concordant_Sites = set()
-            Discordant_sites = set()
-            disc_sites = ''
-            true_discordant_count = 0
-            relaxed_concordant_count = 0
-            total_sites = 0
-            relaxed_concordant_informative_count = 0
-            relaxed_concordant_uninformative_count = 0
-            true_discordant_informative_count = 0
-            true_discordant_uninformative_count = 0
-            informative_sites = 0
-            disc_sites_string = ''
-            discordant_reads = 0
-            uninformative_sites = 0
-            total_reads = 0
-            total_reads_informative =0
-            total_reads_uninformative=0
-            discordant_reads=0
-            discordant_reads_informative=0
-            discordant_reads_uninformative=0
-            discordant_vars=0
-            concordant_vars=0
-            discordant_read_fraction_in_concordant_sites=0
-            discordant_read_fraction_in_discordant_sites=0
-            discordant_reads_uninformative_fraction=0
-            discordant_reads_informative_fraction=0
-            discordant_reads_informative_subset=0
-
-        return Concordant_Sites, \
-            Discordant_sites, \
-            Total_Overlapping_sites, \
-            disc_sites_string, \
-            cell_vars_norm, \
-            true_discordant_count, \
-            relaxed_concordant_count, \
-            relaxed_concordant_informative_count, \
-            relaxed_concordant_uninformative_count, \
-            true_discordant_informative_count, \
-            true_discordant_uninformative_count, \
-            total_sites, \
-            informative_sites, \
-            uninformative_sites, \
-            total_reads, \
-            total_reads_informative, \
-            total_reads_uninformative, \
-            discordant_reads, \
-            discordant_reads_informative, \
-            discordant_reads_uninformative,  \
-            discordant_vars, \
-            concordant_vars, \
-            discordant_read_fraction_in_concordant_sites, \
-            discordant_read_fraction_in_discordant_sites, \
-            discordant_reads_uninformative_fraction, \
-            discordant_reads_informative_fraction, \
-            discordant_reads_informative_subset
-            
+            'Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool, \
+            'Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool, \
+            'shared_sites_between_Out_of_Cohort_and_Within_Cohort':len(shared_sites_between_out_of_cohort_and_within_cohort), \
+            'Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool), \
+            'Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool), \
+            'Out_of_Cohort__unique_sites_total_reads':Out_of_Cohort__unique_sites_total_reads, \
+            'Total__discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Total__discordant_sites_that_are_concordant_with_other_donors_in_pool)
+        }, donor_table_of_concordances]
     
-
-    def set_results(self,to_set,id):
-        # Recod to disk to save the loading mmeory time.
-        hash = random.getrandbits(128)
-        with open(f'tmp_{id}_{hash}.pkl', 'wb') as f:
-            pickle.dump(to_set, f)
-        self.record_dict[id]=f'tmp_{id}_{hash}.pkl'
-        return 
     
-    # def append_results_cell_concordances(self,result):
-    def append_results_cell_concordances(self,result,cell_concordance_table,other_donor_concordances,other_donor_concordance_table):
-        other_donor_concordance_table = other_donor_concordance_table + other_donor_concordances
-        count=result['count']
-        try:
-            percent_concordant = result['Nr_Concordant']/(result['Nr_Discordant']+result['Nr_Concordant'])*100
-        except:
-            percent_concordant = 0
-        
-        try:
-            percent_discordant = result['Nr_Discordant']/(result['Nr_Discordant']+result['Nr_Concordant'])*100
-        except:
-            percent_discordant = 0
 
-        try:
-            percent_relaxed_concordant = result['Nr_Relaxed_concordant']/(result['Nr_Relaxed_concordant']+result['true_discordant_count'])*100
-        except:
-            percent_relaxed_concordant = 0
-        
-        try:
-            percent_strict_discordant = result['true_discordant_count']/(result['Nr_Relaxed_concordant']+result['true_discordant_count'])*100
-        except:
-            percent_strict_discordant = 0
+    def norm_genotypes(self,expected_vars):
+        expected_vars = pd.DataFrame(expected_vars)
+        if len(expected_vars) > 0:
+            split_str=expected_vars[0].str.split("_")
+            expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3]
+            expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1]
+            expected_vars['vars'] = split_str.str[4]
+            expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False)
+            expected_vars = expected_vars[expected_vars['vars']!='./.']
+            expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0'
+            expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars']
+        return expected_vars
 
-        try:
-            read_discordance = result['discordant_reads']/result['total_sites']
-        except:
-            read_discordance = 0
 
-        cohort = 'UNKNOWN'
-        donor_split = result['donor_gt_match'].split("_")
-        if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]):
-            cohort = 'UKB'
-        elif (len(donor_split) == 3) and (len(donor_split[0]) == 14):
-            cohort = 'ELGH'
 
-        same_as_asigned_donor = result['donor_gt_match'] in result['Donor_With_Highest_Concordance']
-        if not same_as_asigned_donor:
-            same_as_asigned_donor = result['donor_gt_match'] in result['Donor_With_Lowest_DisConcordance']
-            
-        cell_concordance_table[f"{result['cell1']} --- {result['donor_gt_match']}"] = {'GT 1':result['cell1'],
-                                                                'GT 2':result['donor_gt_match'],
-                                                                'cohort': cohort,
-                                                                
-                                                                # 'Nr_Concordant':result['Nr_Concordant'],
-                                                                # 'Nr_Discordant':result['Nr_Discordant'],
-                                                                'Nr_Relaxed_concordant':result['Nr_Relaxed_concordant'],
-                                                                'Nr_strict_discordant':result['true_discordant_count'],
-                                                                # 'Percent Concordant':percent_concordant,
-                                                                # 'Percent Discordant':percent_discordant,
-                                                                'Percent_relaxed_concordant': percent_relaxed_concordant,
-                                                                'Percent_strict_discordant': percent_strict_discordant,
-                                                                'Nr_concordant_informative': len(result['relaxed_concordant_informative_count']),
-                                                                'Nr_concordant_uninformative': len(result['relaxed_concordant_uninformative_count']),
-                                                                'Nr_discordant_informative': len(result['true_discordant_informative_count']),
-                                                                'Nr_discordant_uninformative': len(result['true_discordant_uninformative_count']),
-                                                                'NrTotal_Overlapping_sites_between_two_genotypes':result['Nr_Total_Overlapping_sites'],
-                                                                'Nr_donor_distinct_sites_within_pool_individuals':result['Nr_donor_distinct_sites'],
-                                                                'Number_of_sites_that_are_donor_concordant_and_exclusive':result['Number_of_sites_that_are_donor_concordant_and_exclusive'],
-                                                                'Total_sites': result['total_sites'],
-                                                                'Total_informative_sites': result['informative_sites'],
-                                                                'Total_uninformative_sites': result['uninformative_sites'],
-                                                                'Total_reads': result['total_reads'],
-                                                                'Total_reads_informative': result['total_reads_informative'],
-                                                                'Total_reads_uninformative': result['total_reads_uninformative'],
-                                                                'Discordant_reads': result['discordant_reads'],
-                                                                'Discordant_reads_informtive': result['discordant_reads_informative'],
-                                                                'Discordant_reads_uninformtive': result['discordant_reads_uninformative'],
-                                                                'discordant_reads_informative_subset':result['discordant_reads_informative_subset'],
-                                                                'Discordant_reads_by_n_sites': read_discordance,
-                                                                'Discordant_sites_in_pool': len(result['Discordant_sites_in_pool']),
-                                                                'Lowest_Disconcordance_value_in_all_donors':result['Lowest_Disconcordance_value_in_all_donors'],
-                                                                'Donor_With_Lowest_DisConcordance':result['Donor_With_Lowest_DisConcordance'],
-                                                                'Concordant_Site_Identities':result['Concordant_Site_Identities'],
-                                                                'Donor_With_Highest_Concordance':result['Donor_With_Highest_Concordance'],
-                                                                'Highest_Concordance_value_in_all_donors':result['Highest_Concordance_value_in_all_donors'],
-                                                                'same_as_asigned_donor':same_as_asigned_donor,
-                                                                'Total_sites_other_donor (if same_as_asigned_donor=False)':result['Total_sites_other_donor'],
-                                                                'Total_reads_other_donor (if same_as_asigned_donor=False)':result['Total_reads_other_donor'],
-                                                                'total_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['total_discordant_sites_that_are_concordant_with_other_donors_in_pool'],
-                                                                'discordant_read_fraction_in_concordant_site':result['discordant_read_fraction_in_concordant_sites'], 
-                                                                'discordant_read_fraction_in_discordant_sites':result['discordant_read_fraction_in_discordant_sites'], 
-                                                                'Whithin_Cohort__total_number_of_potential_contaminent_reads':result['Whithin_Cohort__total_number_of_potential_contaminent_reads'],
-                                                                'Out_of_Cohort__total_number_of_potential_contaminent_reads':result['Out_of_Cohort__total_number_of_potential_contaminent_reads'], 
-                                                                'NrDonors_contributing_to_out_of_cohort':result['NrDonors_contributing_to_out_of_cohort'], 
-                                                                'NrDonors_contributing_to_Whithin_Cohort':result['NrDonors_contributing_to_Whithin_Cohort'],
-                                                                
-                                                                'Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool'],
-                                                                'Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool'],
-                                                                'shared_sites_between_Out_of_Cohort_and_Within_Cohort':result['shared_sites_between_Out_of_Cohort_and_Within_Cohort'],
-                                                                'Within_Cohort__unique_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool'],
-                                                                'Out_of_Cohort__unique_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool'],
-                                                                'Out_of_Cohort__unique_sites_total_reads':result['Out_of_Cohort__unique_sites_total_reads'],
-                                                                'Total__discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Total__discordant_sites_that_are_concordant_with_other_donors_in_pool']
-                                                                } 
-        
-        return [cell_concordance_table,other_donor_concordance_table]
-    
-    # def combine_written_files(self):#this one is for concordance class
-    #     to_export = self.cell_concordance_table
-    #     for val1 in self.record_dict.values():
-    #         # here remove the int files.
-    #         print(f"merging temp file: {val1}")
-    #         with open(val1, 'rb') as f:
-    #             loaded_dict = pickle.load(f)
-    #             for k1 in loaded_dict.keys():
-    #                 to_export[k1]=loaded_dict[k1]
-    #         os.remove(val1)
-    #     return to_export
-    
-    
-    def combine_written_lists(self,exclusive_donor_variants,record_dict):#this is for VCF loader class
-        to_export = exclusive_donor_variants
-        for val1 in record_dict.values():
-            # here remove the int files.
-            print(f"merging temp file: {val1}")
-            with open(val1, 'rb') as f:
-                loaded_dict = pickle.load(f)
-                self.other_donor_comp = self.other_donor_comp+ loaded_dict
-            os.remove(val1)
-        return self.other_donor_comp    
-    
-    def combine_written_files(self,exclusive_donor_variants,record_dict):#this is for VCF loader class
-        to_export = exclusive_donor_variants
-        for val1 in record_dict.values():
-            # here remove the int files.
-            print(f"merging temp file: {val1}")
-            with open(val1, 'rb') as f:
-                loaded_dict = pickle.load(f)
-                for k1 in loaded_dict.keys():
-                    try:
-                        to_export[k1]=to_export[k1].union(loaded_dict[k1])
-                    except:
-                        to_export[k1]=set()
-                        to_export[k1]=to_export[k1].union(loaded_dict[k1])
-            os.remove(val1)
-        return to_export
-   
-    def set_results(self,to_set,id):
-        # Recod to disk to save the loading mmeory time.
-        hash = random.getrandbits(128)
-        with open(f'tmp_{id}_{hash}.pkl', 'wb') as f:
-            pickle.dump(to_set, f)
-        self.record_dict[id]=f'tmp_{id}_{hash}.pkl' 
-    
-    def analyse_donor(self,analyse_donor,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm,donor_assignments_table):
-        donor_concordance_table = {}
-        other_donor_concordance_table = []
-        for cell1 in Cells_to_keep_pre:
-            count+=1
+    def get_strict_discordance(self, expected_vars, cell_vars):
+        '''
+        take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant 
+        sites and relaxed concordant sites and list of discordant sites1
+        1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype
+        2) if you have a 0/0 you can not get a 1/1 or 0/1
+        3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 1/1
+        So - each obversed cellsnp allele must be in the array SNP gtype
+        '''
+        snp_gtypes = expected_vars[0]
+        cellsnp_gtypes = cell_vars[0]
+        true_discordant = 0
+        relaxed_concordant = 0
+        relaxed_concordant_informative = 0
+        relaxed_concordant_informative_ids = []
+        relaxed_concordant_uninformative_ids = []
+        true_discordant_uninformative_ids =[]
+        true_discordant_informative_ids=[]
+        relaxed_concordant_uninformative = 0
+        true_discordant_informative = 0
+        true_discordant_uninformative = 0
+        discordant_vars = []
+        concordant_vars = []
+        subset_informative_concordant = 0
+        subset_informative_discordant = 0
 
-            cell_vars = exclusive_cell_variants[cell1]
-            try:
-                result1, other_donor_concordances = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data,donor_assignments_table)
-                cell_concordance_table,other_donor_concordance_table = self.append_results_cell_concordances(result1,donor_concordance_table,other_donor_concordances,other_donor_concordance_table)
-            except:
-                continue
-            # if count>800:
-            #     break
-            # here we should write these independently to the files
-            if (count % 50 == 0):
-                self.set_results(other_donor_concordance_table,f"{count}--{donor_gt_match}")
-                other_donor_concordance_table = []
-                
+        #print(self.uninformative_sites)
+        #print(self.informative_sites)
 
-        self.set_results(other_donor_concordance_table,f"{count}--{donor_gt_match}")
-        output2 = self.combine_written_lists(self.other_donor_comp,self.record_dict)
-        pd.DataFrame(output2).sort_values(by=['cell']).to_csv(f'{donor_gt_match}-{analyse_donor}--each_cells_comparison_with_other_donor.tsv',sep='\t',index=False)
-        # del output2
-        self.record_dict={}
-        return donor_concordance_table
+        #create sets of the ids (chrom, pos, ref, alt) in each set of genotypes. Filter to the ids present in both 
+        #then filter to informative and uninformative. If uninformative >0 then create a subset of informative
+        # with the same number of vars (at random)
+        split_snp_gts=snp_gtypes.str.split("_")
+        snp_gtypes_ids = set(split_snp_gts.str[0]+'_'+split_snp_gts.str[1]+'_'+split_snp_gts.str[2]+'_'+split_snp_gts.str[3])
 
-    def combine_concordances(self,result):
-        
-        self.cell_concordance_table = {**self.cell_concordance_table, **result}
-        
+        split_cellsnp_gts=cellsnp_gtypes.str.split("_")
+        cellsnp_gtypes_ids = set(split_cellsnp_gts.str[0]+'_'+split_cellsnp_gts.str[1]+'_'+split_cellsnp_gts.str[2]+'_'+split_cellsnp_gts.str[3])
 
-    def conc_table(self):
-        donor_assignments_table=self.donor_assignments_table
-        cell_assignments_table=self.cell_assignments_table
-        exclusive_don_variants=self.exclusive_don_variants
-        exclusive_cell_variants= self.exclusive_cell_variants
-        donor_list = exclusive_don_variants.keys()
-        # cpus = 1
-        pool = mp.Pool(cpus)
-        count = 0
-        
-        
-        #create a list of variants that are on each donor genotype file
-        vars_per_donor_gt = {}
-        for don_id in donor_list:
-            donor_gt_vars = list(exclusive_don_variants[don_id])
-            donor_gt_vars = pd.DataFrame(donor_gt_vars)
-            donor_gt_vars = self.norm_genotypes(donor_gt_vars)
-            donor_gt_vars = donor_gt_vars[donor_gt_vars['vars'] != '0/0']
-            donor_gt_varids = list(donor_gt_vars['ids'])
-            vars_per_donor_gt[don_id] = donor_gt_varids
-        
-        #work out what cohort each donor belongs to
-        donor_cohorts = {}
-        for don_id in donor_list:
-            cohort = 'UNKNOWN'
-            donor_split = don_id.split("_")
-            if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]):
-                cohort = 'UKB'
-            elif (len(donor_split) == 3) and (len(donor_split[0]) == 14):
-                cohort = 'ELGH'
-            donor_cohorts[don_id] = cohort
+        shared_gts = snp_gtypes_ids.intersection(cellsnp_gtypes_ids)
 
-        all_donor_data={}
-        # here we calvculate all the expected donor datasets
-        for row1 in exclusive_don_variants.keys():
-            # donor_in_question = row1['donor_query']
-            donor_gt_match = row1
-            expected_vars_of_other_donor = self.exclusive_don_variants[donor_gt_match]
-            expected_vars_norm_of_other_donor = self.norm_genotypes(expected_vars_of_other_donor)
-            all_donor_data[donor_gt_match]=expected_vars_norm_of_other_donor
+        shared_informative = shared_gts.intersection(self.informative_sites)
+        shared_uninformative = shared_gts.intersection(self.uninformative_sites)
+        # print("shared informative " + str(len(shared_informative)))
+        # print("shared uninformative " + str(len(shared_uninformative)))
 
-        results = []
-        for i,row1 in donor_assignments_table.iterrows():
-            donor_in_question = row1['donor_query']
-            donor_gt_match = row1['donor_gt']
-            print(donor_gt_match)
-            # if i>4:
-            #     continue
-            if (donor_gt_match=='NONE'):
-                continue
-            try:
-                donor_gt_match_cohort = donor_cohorts[donor_gt_match]
-            except:
-                continue
-            Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell']))
-            expected_vars = exclusive_don_variants[donor_gt_match]
-            expected_vars_norm = self.norm_genotypes(expected_vars)
-            try:
-                # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor.
-                dds = self.donor_distinct_sites[donor_gt_match]
-            except:
-                continue
-            if cpus==1:
-                result = self.analyse_donor(donor_in_question,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm,donor_assignments_table)
-                self.combine_concordances(result)
+        #store the numbers of informative and uninformative sites shared between cellSNP and gt data as these
+        #are the sites used for concordance
+        self.informative_covered = len(shared_informative)
+        self.uninformative_covered = len(shared_uninformative)
+
+        if len(shared_uninformative) > 0:
+            #print(len(shared_uninformative))
+            # print(len(shared_informative))
+            if len(shared_uninformative) <= len(shared_informative):
+                informative_subset = set(random.sample(shared_informative, len(shared_uninformative)))
             else:
+                informative_subset = set()#if there are more shared uninformative than shared informative we will not subset
+            # print(informative_subset)
+            # exit(0)
+        else:
+            informative_subset = set()
 
-                result = pool.apply_async(self.analyse_donor, args=([donor_in_question,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm,donor_assignments_table]),callback=self.combine_concordances)          
-                results.append(result)
-        try:
-            [result.wait() for result in results]        
-        except:
-            print('done')  
-        print('Done') 
-        pool.close()
-        pool.join() 
-        
-        
-        
-        # output = self.combine_written_files(self.cell_concordance_table,self.record_dict)
-        
-        return self.cell_concordance_table
+        # print(informative_subset)
+        self.informative_subset = informative_subset
 
-    
+        snp_gtypes_set = set(snp_gtypes)
+        snp_gtypes_set = sorted(snp_gtypes_set)
 
-    def read_extraction(self,DonorDiscordant_Sites_that_are_atributed_to_other_donor,expected_vars_norm,cell_vars_norm):
-        # we need this function wrapper to calculate the concordant, discordant read 
-        # counts for each of the discordant sites that are concordant with another donor.
-        
-        Total_Overlapping_sites = set(DonorDiscordant_Sites_that_are_atributed_to_other_donor)
-        expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)]
-        cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)]   
-        
-        cell_vars2 = cell_vars2.drop_duplicates(subset=['ids'])
-        expected_vars2 = expected_vars2.drop_duplicates(subset=['ids'])
-        cell_vars2['DP'] = cell_vars2[0].str.split("_").str[5].astype(int)
-        cell_vars2['AD'] = cell_vars2[0].str.split("_").str[6].astype(int)
-        cell_vars2['OTH'] = cell_vars2[0].str.split("_").str[7].astype(int)
-        total_reads,_,_,discordant_reads, \
-            discordantRead_ratio,discordant_reads_noHet,total_reads_noHet,discordantRead_ratio_noHet= self.read_concordance_calc(expected_vars2,cell_vars2)
-        concordant_reads = total_reads - discordant_reads
-        concordant_reads_noHet = total_reads_noHet-discordant_reads_noHet
-        return total_reads,discordant_reads,concordant_reads,\
-            total_reads_noHet,discordant_reads_noHet,concordant_reads_noHet
-    
-    def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match, donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data,donor_assignments_table):
+        cellsnp_gtypes_set = set(cellsnp_gtypes)
+        cellsnp_gtypes_set = sorted(cellsnp_gtypes_set)
 
-        Concordant_Sites, \
-            Discordant_sites, \
-            Total_Overlapping_sites, \
-            discordant_sites, \
-            cell_vars_norm, \
-            true_discordant_count, \
-            relaxed_concordant_count, \
-            relaxed_concordant_informative_count, \
-            relaxed_concordant_uninformative_count, \
-            true_discordant_informative_count, \
-            true_discordant_uninformative_count, \
-            total_sites, \
-            informative_sites, \
-            uninformative_sites, \
-            total_reads, \
-            total_reads_informative, \
-            total_reads_uninformative, \
-            discordant_reads, \
-            discordant_reads_informative, \
-            discordant_reads_uninformative, \
-            discordant_vars, \
-            concordant_vars, \
-            discordant_read_fraction_in_concordant_sites, \
-            discordant_read_fraction_in_discordant_sites, \
-            discordant_reads_uninformative_fraction, \
-            discordant_reads_informative_fraction, discordant_reads_informative_subset  = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars)
+        #for i in range(0, len(snp_gtypes)):
+        # cellsnp_gtypes_str = pd.DataFrame(cellsnp_gtypes_set,columns=['cellsnp_gtypes'])
+        # cellsnp_gtypes_str['snp_gtypes'] = pd.DataFrame(snp_gtypes_set,columns=['snp_gtypes'])['snp_gtypes']
+        for i in range(0, len(snp_gtypes_set)):
+            discordant = False
+            # snp_data = snp_gtypes[i].split('_')
+            # cellsnp_data = cellsnp_gtypes[i].split('_')
+            snp_data = snp_gtypes_set[i].split('_')
+            cellsnp_data = cellsnp_gtypes_set[i].split('_')
+
+            snp_alleles_set = set([snp_data[4][0], snp_data[4][2]])
+            cellsnp_alleles_set = set([cellsnp_data[4][0], cellsnp_data[4][2]])
+            
+            snp_var = ('_').join(snp_data[0:4])
+            cellsnp_var = ('_').join(cellsnp_data[0:4])
+
+            if not cellsnp_var == snp_var:
+                print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i])
+                exit(1)
+            else:
+                for allele in cellsnp_alleles_set:
+                    if not allele in snp_alleles_set:#if a cellSNP allele is found that is not in the array data this is discordant
+                        discordant = True
+            
+            if discordant == True:
+                true_discordant+=1
+                discordant_vars.append(cellsnp_var)
+                if snp_var in self.uninformative_sites:
+                    true_discordant_uninformative+=1
+                    true_discordant_uninformative_ids.append(snp_var)
+                elif snp_var in self.informative_sites:
+                    true_discordant_informative+=1
+                    true_discordant_informative_ids.append(snp_var)
+            else:
+                relaxed_concordant+=1
+                concordant_vars.append(cellsnp_var)
+                if snp_var in self.uninformative_sites:
+                    relaxed_concordant_uninformative+=1
+                    relaxed_concordant_uninformative_ids.append(snp_var)
+                elif snp_var in self.informative_sites:
+                    relaxed_concordant_informative+=1
+                    relaxed_concordant_informative_ids.append(snp_var)
+            if len(shared_uninformative) > 0:
+                if snp_var in informative_subset:
+                    if discordant == True:
+                        subset_informative_discordant+=1
+                    else:
+                        subset_informative_concordant+=1
+            #    true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count 
+        cell_vars2 = cell_vars.set_index('ids') 
+        disc = pd.DataFrame(set(cell_vars2.loc[discordant_vars]['combo']),columns=['combo_x'])
+        df_cd = pd.merge(cell_vars, expected_vars, how='inner', on = 'pos')
+        disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x')
+        disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y']
+        disc_sites_string = ';'.join(disc2['expected_retrieved'])
+        return true_discordant, relaxed_concordant, relaxed_concordant_informative_ids, relaxed_concordant_uninformative_ids, true_discordant_informative_ids, true_discordant_uninformative_ids, discordant_vars, concordant_vars, disc_sites_string
+    
+    def read_concordance_calc(self,expected_vars,cell_vars):
+            
+            # This is a wrapper to add up the discordant reads in the cellsnp file.
+            
+            # expected genotype 0/0
+            expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0']
+            hom_ref_sites = set(expected_hom_ref['ids'])
+            cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)]
+            ad_hom_ref = cell_vars2['AD'].sum()
+            oth_hom_ref = cell_vars2['OTH'].sum() 
+            discordant_hom_ref = ad_hom_ref + oth_hom_ref
+                    
+            # expected genotype 0/1 or 1/0
+            hets = ['0/1', '1/0']
+            expected_het = expected_vars[expected_vars['vars'].isin(hets)]
+            het_sites = set(expected_het['ids'])
+            cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)]
+            discordant_het = cell_vars3['OTH'].sum()
+            
+            # expected genotype 1/1
+            expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1']
+            hom_alt_sites = set(expected_hom_alt['ids'])
+            cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)]
+            # DP + OTH - AD
+            ad_hom_alt = cell_vars4['AD'].sum()
+            dp_hom_alt = cell_vars4['DP'].sum()
+            oth_hom_alt = cell_vars4['OTH'].sum()
+            discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt
             
-        total_concordant_sites = len(Concordant_Sites) + relaxed_concordant_count
-        dds = self.donor_distinct_sites[donor_gt_match]
-        Nr_donor_distinct_sites = len(dds)
-        Nr_Concordant = len(Concordant_Sites)
-        Nr_Relaxed_concordant = relaxed_concordant_count
-        Nr_Discordant = len(Discordant_sites)
-        Nr_Total_Overlapping_sites = len(Total_Overlapping_sites)
-        Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Discordant_sites)))
-        Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos'])
-        #Quantify donor variation in other donors
-        discordant_vars_in_pool = []
-        donor_table_of_concordances = []
-        total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set()
-        total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown = {}
-        total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort']={}
-        total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort']={}
-      
-        total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown_totalReads = {}
-        total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown_totalReads['Whithin_Cohort']={}
-        total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown_totalReads['Out_of_Cohort']={}      
-      
-        total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown = {}
-        total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort']={}
-        total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort']={}      
-      
-        
-        informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set()
-        total_cordant_sites_that_are_concordant_with_other_donors_in_pool = set()
-        donor_gt_match_cohort = donor_cohorts[donor_gt_match]
-        donors_contributing_to_out_of_cohort= []
-        donors_contributing_to_Whithin_Cohort=[]
-        sites_contributing_to_out_of_cohort= []
-        sites_contributing_to_Whithin_Cohort=[]
-        for donor in set(donor_assignments_table['donor_gt']):
-            try:
-                expected_vars_norm_of_other_donor = all_donor_data[donor]
-            except:
-                continue
-            try:
-                donor_cohort = donor_cohorts[donor]
-                donor_vars = vars_per_donor_gt[donor]
-            except:
-                continue        
+            # Total analysis
+            discordant_reads =  discordant_hom_ref + discordant_het + discordant_hom_alt
+            total_dp = cell_vars['DP'].sum()
+            total_oth = cell_vars['OTH'].sum()
+            total_reads = total_dp + total_oth
+            discordantRead_ratio = discordant_reads/total_reads
             
-            Concordant_Sites_otherDonor, \
-                Discordant_sites_otherDonor, \
-                Total_Overlapping_sites_otherDonor, \
-                discordant_sites_otherDonor, \
-                cell_vars_norm_otherDonor, \
-                true_discordant_count_otherDonor, \
-                relaxed_concordant_count_otherDonor, \
-                relaxed_concordant_informative_count_otherDonor, \
-                relaxed_concordant_uninformative_count_otherDonor, \
-                true_discordant_informative_count_otherDonor, \
-                true_discordant_uninformative_count_otherDonor, \
-                total_sites_otherDonor, \
-                informative_sites_otherDonor, \
-                uninformative_sites_otherDonor, \
-                total_reads_otherDonor, \
-                total_reads_informative_otherDonor, \
-                total_reads_uninformative_otherDonor, \
-                discordant_reads_otherDonor, \
-                discordant_reads_informative_otherDonor, \
-                discordant_reads_uninformative_otherDonor, \
-                discordant_vars_otherDonor, \
-                concordant_vars_otherDonor, \
-                discordant_read_fraction_in_concordant_sites_otherDonor, \
-                discordant_read_fraction_in_discordant_sites_otherDonor, \
-                discordant_reads_uninformative_fraction_otherDonor, \
-                discordant_reads_informative_fraction_otherDonor, discordant_reads_informative_subset_otherDonor  = self.retrieve_concordant_discordant_sites(expected_vars_norm_of_other_donor,cell_vars,donor_cohort=donor_cohort)
+            # No het analysis - they are not discordant or concordant - we cannot make any call at those sites
+            discordant_reads_noHet =  discordant_hom_ref + discordant_hom_alt
+            total_reads_noHet = total_reads - discordant_het
+            discordantRead_ratio_noHet = discordant_reads_noHet/total_reads_noHet
             
-            if total_sites_otherDonor==0:
-                total_sites_otherDonor=0.000000000001
-            total_concordant_sites_otherDonor = relaxed_concordant_count_otherDonor
-            concordant_percent_in_other_donor= total_concordant_sites_otherDonor/total_sites_otherDonor*100
-            discordant_percent_in_other_donor= true_discordant_count_otherDonor/total_sites_otherDonor*100
-            DonorDiscordant_Sites_that_are_atributed_to_other_donor = set(discordant_vars).intersection(set(concordant_vars_otherDonor))
-            TotalSites_that_are_atributed_to_other_donor = set(discordant_vars).union(set(concordant_vars)).intersection(set(concordant_vars_otherDonor))
+            return total_reads,total_dp,total_oth,discordant_reads,discordantRead_ratio,discordant_reads_noHet,total_reads_noHet,discordantRead_ratio_noHet
 
-            Informative__DonorDiscordant_Sites_that_are_atributed_to_other_donor = set(true_discordant_informative_count).intersection(set(relaxed_concordant_informative_count_otherDonor))
-            DonorCordant_Sites_that_are_atributed_to_other_donor = set(concordant_vars).intersection(set(concordant_vars_otherDonor))
-           
-            total_reads_for_discordant_sites_that_are_concordant_with_other_donor,discordant_reads_for_discordant_sites_that_are_concordant_with_other_donor,concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor,\
-                _,_,_= self.read_extraction(DonorDiscordant_Sites_that_are_atributed_to_other_donor,expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor)
+    def read_condordance(self, expected_vars, cell_vars,discordant_vars, concordant_vars):
+        '''
+        get read level concordance using DP, AD and OTH format fields
+        ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="total counts for ALT and REF">
+        ##FORMAT=<ID=AD,Number=1,Type=Integer,Description="total counts for ALT">
+        ##FORMAT=<ID=OTH,Number=1,Type=Integer,Description="total counts for other bases from REF and ALT">
+        '''
+        if not len(expected_vars) == len(cell_vars):
+            print("length mismatch between expected vars and cell vars")
+            exit(1)
 
-            if not donor == donor_gt_match:
-                # We want to kow how many of these discordant site
-                if 'U937' in donor:
-                    continue
-                if 'THP1' in donor:
-                    continue
-                
-                if donor_gt_match_cohort == donor_cohort:
-                    coh = 'Whithin_Cohort'
-                    sites_contributing_to_Whithin_Cohort.extend(list(TotalSites_that_are_atributed_to_other_donor))
-                    if len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)>0:
-                        donors_contributing_to_Whithin_Cohort.append(donor)
-                        
-                else:
-                    coh = 'Out_of_Cohort'
-                    sites_contributing_to_out_of_cohort.extend(list(TotalSites_that_are_atributed_to_other_donor))
-                    if len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)>0:
-                        donors_contributing_to_out_of_cohort.append(donor)
-                        
-                
-                total_discordant_sites_that_are_concordant_with_other_donors_in_pool = total_discordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(DonorDiscordant_Sites_that_are_atributed_to_other_donor))
-                # now we addit for a cohort since the biggest issue comes from cohort cross-contamination
-                # for each of these sites now we calculate the number of reads that it accounts:
-                # tree level set: cohort: site: counts
-                for site in DonorDiscordant_Sites_that_are_atributed_to_other_donor:
-                    total_reads_for_site,discordant_reads_for_site,concordant_for_site,\
-                        total_reads_noHet,discordant_reads_noHet,concordant_reads_noHet= self.read_extraction([site],expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor)
-                    if concordant_for_site==0:
-                        pass
-                    try:
-                        total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_reads_noHet)
+        total_sites = len(expected_vars)
+        #add cols for DP, AD< OTH
+        cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int)
+        cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int)
+        cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int)
+        
 
-                    except:
-                        try:
-                            total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[]
-                            total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_reads_noHet)
-                            
-                        except:
-                            total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh]={}
-                            total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[]
-                            total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_reads_noHet)
+        
+        # Total
+        total_reads,total_dp,total_oth,discordant_reads, \
+            discordantRead_ratio,discordant_reads_noHet,total_reads_noHet,discordantRead_ratio_noHet = self.read_concordance_calc(expected_vars,cell_vars)
+        
+        # uninformative
+        cell_vars_uninformative = cell_vars[cell_vars['ids'].isin(self.uninformative_sites)]
+        total_reads_uninformative,total_dp_uninformative,total_oth_uninformative,discordant_reads_uninformative, \
+            discordantRead_ratio_uninformative,discordant_reads_uninformative_noHet,total_reads_uninformative_noHet,discordantRead_ratio_uninformative_noHet= self.read_concordance_calc(expected_vars,cell_vars_uninformative)
+        
+        # informative 
+        cell_vars_informative = cell_vars[cell_vars['ids'].isin(self.informative_sites)]
+        total_reads_informative,total_dp_informative,total_oth_informative,discordant_reads_informative, \
+            discordantRead_ratio_informative,discordant_reads_informative_noHet,total_reads_informative_noHet,discordantRead_ratio_informative_noHet = self.read_concordance_calc(expected_vars,cell_vars_informative)
+        
+        # Split into concordant and discordant sites
+        # concordant
+        concordant_sites = cell_vars[cell_vars['ids'].isin(set(concordant_vars))]
+        total_reads_for_concordant_sites,total_dp_for_concordant_sites,total_oth_for_concordant_sites,discordant_reads_for_concordant_sites, \
+            discordantRead_ratio_for_concordant_sites,discordant_reads_for_concordant_sites_noHet,total_reads_for_concordant_sites_noHet,discordantRead_ratio_for_concordant_sites_noHet = self.read_concordance_calc(expected_vars,concordant_sites)
+        
+        # discordant
+        discordant_sites = cell_vars[cell_vars['ids'].isin(set(discordant_vars))]
+        total_reads_for_discconcordant_sites,total_dp_for_discconcordant_sites,total_oth_for_discconcordant_sites,discordant_reads_for_discconcordant_sites, \
+            discordantRead_ratio_for_discconcordant_sites,discordant_reads_for_discconcordant_sites_noHet,total_reads_for_discconcordant_sites_noHet,discordantRead_ratio_for_discconcordant_sites_noHet = self.read_concordance_calc(expected_vars,discordant_sites)
+              
+        # Subset analysis
+        cell_vars_informative_subset = cell_vars[cell_vars['ids'].isin(self.informative_subset)]
+        total_reads_informative_subset,total_dp_informative_subset,total_oth_informative_subset,discordant_reads_informative_subset, \
+            discordantRead_ratio_informative_subset,discordant_reads_informative_subset_noHet,total_reads_informative_subset_noHet,discordantRead_ratio_informative_subset_noHet = self.read_concordance_calc(expected_vars,cell_vars_informative_subset)
 
-                for site in DonorDiscordant_Sites_that_are_atributed_to_other_donor:
-                    total_reads_for_site,discordant_reads_for_site,concordant_for_site,\
-                        _,_,_= self.read_extraction([site],expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor)
-                    # if discordant_reads_for_site>0:
-                    #     print('here')
-                    if concordant_for_site==0:
-                        pass
-                    try:
-                        total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site)
-                    except:
-                        try:
-                            total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[]
-                            total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site)
-                        
-                        except:
-                            total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh]={}
-                            total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[]
-                            total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site)
+        if mode == 'no_het':
+            total_reads = total_reads_noHet
+            discordant_reads = discordant_reads_noHet
+            total_reads_informative = total_reads_informative_noHet
+            discordant_reads_informative = discordant_reads_informative_noHet
+            total_reads_uninformative =total_reads_uninformative_noHet
+            discordant_reads_uninformative = discordant_reads_uninformative_noHet
+            total_reads_informative_subset = total_reads_informative_subset_noHet
+            discordant_reads_informative_subset = discordant_reads_informative_subset_noHet
+            total_reads_for_concordant_sites = total_reads_for_concordant_sites_noHet
+            discordant_reads_for_concordant_sites = discordant_reads_for_concordant_sites_noHet
+            total_reads_for_discconcordant_sites = total_reads_for_discconcordant_sites_noHet
+            discordant_reads_for_discconcordant_sites = discordant_reads_for_discconcordant_sites_noHet
+        
+        return total_sites, \
+            self.informative_covered, \
+                self.uninformative_covered, \
+                    total_reads, \
+                        discordant_reads, \
+                            total_reads_informative, \
+                                discordant_reads_informative, \
+                                    total_reads_uninformative, \
+                                        discordant_reads_uninformative, \
+                                            total_reads_informative_subset, \
+                                                    discordant_reads_informative_subset, \
+                                                        total_reads_for_concordant_sites, \
+                                                            discordant_reads_for_concordant_sites, \
+                                                                total_reads_for_discconcordant_sites, \
+                                                                    discordant_reads_for_discconcordant_sites
+                                                                    
                                                     
-                # to get the total reads that can be atributed to the other donor i have to check if site is already covered in the total_discordant_sites_that_are_concordant_with_other_donors_in_pool.
-                # the ones that havent, i have to add the reads up for them.
-                informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(Informative__DonorDiscordant_Sites_that_are_atributed_to_other_donor))
-                
-                total_cordant_sites_that_are_concordant_with_other_donors_in_pool = total_cordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(DonorCordant_Sites_that_are_atributed_to_other_donor))
+    
+    def get_discordance(self,expected_vars2,cell_vars2):
+        Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo']))
+        Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo'])  
+        disc = pd.DataFrame(Discordant_sites,columns=['combo_x'])
+        df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos')
+        disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x')
+        disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y']
+        disc_sites = ';'.join(disc2['expected_retrieved'])
+        return Concordant_Sites,Discordant_sites,disc_sites
+    
+    
+    def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars,donor_cohort=False):
+        # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations.
+        # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline.
+        # Author: M.Ozols
+        
+        cell_vars_norm = self.norm_genotypes(cell_vars)
+
+        if len(cell_vars_norm) > 0:
+            
+            if mode == 'no_het':
+                hets = ['0/1', '1/0']
+                expected_vars_norm = expected_vars_norm[~expected_vars_norm['vars'].isin(hets)]
                 
+            Total_Overlapping_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids']))
+            expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)]
+            cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)]
+            cell_vars2 = cell_vars2.drop_duplicates(subset=['ids'])
+            expected_vars2 = expected_vars2.drop_duplicates(subset=['ids'])
+            if len(cell_vars2)!=len(expected_vars2):
+                print('failed to select comon variants')
+                exit(1)
+            
+            # Find exact discordant sites
+            Concordant_Sites, Discordant_sites, _ = self.get_discordance(expected_vars2, cell_vars2)
+            #find truly discordant sites
+            true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, discordant_vars, concordant_vars, disc_sites_string = self.get_strict_discordance(expected_vars2, cell_vars2)
+            #find discordant reads
+            total_sites, \
+                informative_sites, \
+                    uninformative_sites, \
+                        total_reads, \
+                            discordant_reads, \
+                                total_reads_informative, \
+                                    discordant_reads_informative, \
+                                        total_reads_uninformative, \
+                                            discordant_reads_uninformative, \
+                                                    total_reads_informative_subset, \
+                                                        discordant_reads_informative_subset, \
+                                                            total_reads_for_concordant_sites, \
+                                                                discordant_reads_for_concordant_sites, \
+                                                                    total_reads_for_discconcordant_sites, \
+                                                                        discordant_reads_for_discconcordant_sites   = self.read_condordance(expected_vars2, cell_vars2, discordant_vars, concordant_vars)
+                                                                        
+            discordant_read_fraction_in_concordant_sites = f"{discordant_reads_for_concordant_sites}/{total_reads_for_concordant_sites}"
+            discordant_read_fraction_in_discordant_sites = f"{discordant_reads_for_discconcordant_sites}/{total_reads_for_discconcordant_sites}"
+            discordant_reads_uninformative_fraction = f"{discordant_reads_uninformative}/{total_reads_uninformative}"
+            discordant_reads_informative_fraction = f"{discordant_reads_informative}/{total_reads_informative}"
+            
+            # sanity checks
+            if total_reads != total_reads_for_concordant_sites+total_reads_for_discconcordant_sites:
+                print("Error: total reads dont add up ")
+                exit(1)
+            if discordant_reads != discordant_reads_for_concordant_sites+discordant_reads_for_discconcordant_sites:
+                print("Error: discordant reads dont add up ")
+                exit(1)
+            
 
-                common_vars = list(set(discordant_vars) & set(donor_vars))
-                common_var_count = str(len(common_vars))
-                donor_cohort_common = donor + ":" + donor_cohort + ":" + common_var_count
-                discordant_vars_in_pool.append(donor_cohort_common)
-                
-                # Here we want to calculate the number of discordant sites in other donors and see if in terms of concordance the same donor is picked as per GT assignment.
-                # We do this to investigate the potential of a cell coming from this other donor.
-            else:
-                _='check'
-                try:
-                    percent_relaxed_concordant = Nr_Relaxed_concordant/(Nr_Relaxed_concordant+true_discordant_count)*100
-                except:
-                    percent_relaxed_concordant = 0
-                # g2 = total_concordant_sites_otherDonor/total_sites_otherDonor*100
-                if not (percent_relaxed_concordant == concordant_percent_in_other_donor):
-                    _='something not right'
-                    break
+        else:
+            Total_Overlapping_sites = set()
+            Concordant_Sites = set()
+            Discordant_sites = set()
+            disc_sites = ''
+            true_discordant_count = 0
+            relaxed_concordant_count = 0
+            total_sites = 0
+            relaxed_concordant_informative_count = 0
+            relaxed_concordant_uninformative_count = 0
+            true_discordant_informative_count = 0
+            true_discordant_uninformative_count = 0
+            informative_sites = 0
+            disc_sites_string = ''
+            discordant_reads = 0
+            uninformative_sites = 0
+            total_reads = 0
+            total_reads_informative =0
+            total_reads_uninformative=0
+            discordant_reads=0
+            discordant_reads_informative=0
+            discordant_reads_uninformative=0
+            discordant_vars=0
+            concordant_vars=0
+            discordant_read_fraction_in_concordant_sites=0
+            discordant_read_fraction_in_discordant_sites=0
+            discordant_reads_uninformative_fraction=0
+            discordant_reads_informative_fraction=0
+            discordant_reads_informative_subset=0
+
+        return Concordant_Sites, \
+            Discordant_sites, \
+            Total_Overlapping_sites, \
+            disc_sites_string, \
+            cell_vars_norm, \
+            true_discordant_count, \
+            relaxed_concordant_count, \
+            relaxed_concordant_informative_count, \
+            relaxed_concordant_uninformative_count, \
+            true_discordant_informative_count, \
+            true_discordant_uninformative_count, \
+            total_sites, \
+            informative_sites, \
+            uninformative_sites, \
+            total_reads, \
+            total_reads_informative, \
+            total_reads_uninformative, \
+            discordant_reads, \
+            discordant_reads_informative, \
+            discordant_reads_uninformative,  \
+            discordant_vars, \
+            concordant_vars, \
+            discordant_read_fraction_in_concordant_sites, \
+            discordant_read_fraction_in_discordant_sites, \
+            discordant_reads_uninformative_fraction, \
+            discordant_reads_informative_fraction, \
+            discordant_reads_informative_subset
             
-            donor_table_of_concordances.append({'donor':donor, 'cell':cell1, 'donor_cohort':donor_cohort, \
-                                                'gt matched donor':donor == donor_gt_match, \
-                                                'DonorCordant_Sites_that_are_atributed_to_other_donor':len(DonorCordant_Sites_that_are_atributed_to_other_donor), \
-                                                'DonorCordant_Sites_that_are_atributed_to_other_donor/total':f"{len(DonorCordant_Sites_that_are_atributed_to_other_donor)}/{len(concordant_vars)}", \
-                                                'DonorDiscordant_Sites_that_are_atributed_to_other_donor':len(DonorDiscordant_Sites_that_are_atributed_to_other_donor), \
-                                                'DonorDiscordant_Sites_that_are_atributed_to_other_donor/total':f"{len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)}/{len(discordant_vars)}", \
-                                                'concordant_percent_in_other_donor':concordant_percent_in_other_donor, \
-                                                'discordant_percent_in_other_donor':discordant_percent_in_other_donor, \
-                                                'discordant_reads_otherDonor':discordant_reads_otherDonor, \
-                                                'discordant_sites_otherDonor':len(discordant_vars_otherDonor), \
-                                                'concordant_sites_otherDonor':len(concordant_vars_otherDonor), \
-                                                'total_sites_otherDonor':total_sites_otherDonor, \
-                                                'discordant_reads_otherDonor':discordant_reads_otherDonor, \
-                                                'total_reads_otherDonor':total_reads_otherDonor, \
-                                                # 'discordant_read_fraction_in_concordant_sites_otherDonor':discordant_read_fraction_in_concordant_sites_otherDonor, \
-                                                # 'discordant_read_fraction_in_discordant_sites_otherDonor':discordant_read_fraction_in_discordant_sites_otherDonor, \
-                                                'concordant_reads_For_discordant_sites_that_are_Concordant_with_other_donor':concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor
-                                                })
-        
-        
-        #here now we want to see overall how many reads potentially come from different cohorts.
-        # cohort_specific_site_quant_string=""
-        # cohort_specific_read_quant_string=""
+    
+
+    def set_results(self,to_set,id):
+        # Recod to disk to save the loading mmeory time.
+        hash = random.getrandbits(128)
+        with open(f'tmp_{id}_{hash}.pkl', 'wb') as f:
+            pickle.dump(to_set, f)
+        self.record_dict[id]=f'tmp_{id}_{hash}.pkl'
+        return 
+    
+    # def append_results_cell_concordances(self,result):
+    def append_results_cell_concordances(self,result,cell_concordance_table,other_donor_concordances,other_donor_concordance_table):
+        other_donor_concordance_table = other_donor_concordance_table + other_donor_concordances
+        # other_donor_concordance_table = {**other_donor_concordance_table, **other_donor_concordances}
+        count=result['count']
+        try:
+            percent_concordant = result['Nr_Concordant']/(result['Nr_Discordant']+result['Nr_Concordant'])*100
+        except:
+            percent_concordant = 0
         
-        Whithin_Cohort__total_number_of_potential_contaminent_reads=0
-        Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool=0
-        Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool=0
         try:
-            Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool = len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys())
-            # for k1 in total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys():
-            #     Whithin_Cohort__total_number_of_potential_contaminent_reads+= max(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'][k1])
+            percent_discordant = result['Nr_Discordant']/(result['Nr_Discordant']+result['Nr_Concordant'])*100
         except:
-            _='Doesnt Exist'
+            percent_discordant = 0
+
+        try:
+            percent_relaxed_concordant = result['Nr_Relaxed_concordant']/(result['Nr_Relaxed_concordant']+result['true_discordant_count'])*100
+        except:
+            percent_relaxed_concordant = 0
         
-        Out_of_Cohort__total_number_of_potential_contaminent_reads=0
         try:
-            Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool = len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys())
-            # for k1 in total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys():
-            #     Out_of_Cohort__total_number_of_potential_contaminent_reads+= max(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'][k1])
+            percent_strict_discordant = result['true_discordant_count']/(result['Nr_Relaxed_concordant']+result['true_discordant_count'])*100
         except:
-            _='Doesnt Exist'
-    
-    
-    
-    
+            percent_strict_discordant = 0
+
+        try:
+            read_discordance = result['discordant_reads']/result['total_sites']
+        except:
+            read_discordance = 0
+
+        cohort = 'UNKNOWN'
+        donor_split = result['donor_gt_match'].split("_")
+        if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]):
+            cohort = 'UKB'
+        elif (len(donor_split) == 3) and (len(donor_split[0]) == 14):
+            cohort = 'ELGH'
+
+        same_as_asigned_donor = result['donor_gt_match'] in result['Donor_With_Highest_Concordance']
+        if not same_as_asigned_donor:
+            same_as_asigned_donor = result['donor_gt_match'] in result['Donor_With_Lowest_DisConcordance']
+            
+        cell_concordance_table[f"{result['cell1']} --- {result['donor_gt_match']}"] = {'GT 1':result['cell1'],
+                                                                'GT 2':result['donor_gt_match'],
+                                                                'cohort': cohort,
+                                                                
+                                                                # 'Nr_Concordant':result['Nr_Concordant'],
+                                                                # 'Nr_Discordant':result['Nr_Discordant'],
+                                                                'Nr_Relaxed_concordant':result['Nr_Relaxed_concordant'],
+                                                                'Nr_strict_discordant':result['true_discordant_count'],
+                                                                # 'Percent Concordant':percent_concordant,
+                                                                # 'Percent Discordant':percent_discordant,
+                                                                'Percent_relaxed_concordant': percent_relaxed_concordant,
+                                                                'Percent_strict_discordant': percent_strict_discordant,
+                                                                'Nr_concordant_informative': len(result['relaxed_concordant_informative_count']),
+                                                                'Nr_concordant_uninformative': len(result['relaxed_concordant_uninformative_count']),
+                                                                'Nr_discordant_informative': len(result['true_discordant_informative_count']),
+                                                                'Nr_discordant_uninformative': len(result['true_discordant_uninformative_count']),
+                                                                'NrTotal_Overlapping_sites_between_two_genotypes':result['Nr_Total_Overlapping_sites'],
+                                                                'Nr_donor_distinct_sites_within_pool_individuals':result['Nr_donor_distinct_sites'],
+                                                                'Number_of_sites_that_are_donor_concordant_and_exclusive':result['Number_of_sites_that_are_donor_concordant_and_exclusive'],
+                                                                'Total_sites': result['total_sites'],
+                                                                'Total_informative_sites': result['informative_sites'],
+                                                                'Total_uninformative_sites': result['uninformative_sites'],
+                                                                'Total_reads': result['total_reads'],
+                                                                'Total_reads_informative': result['total_reads_informative'],
+                                                                'Total_reads_uninformative': result['total_reads_uninformative'],
+                                                                'Discordant_reads': result['discordant_reads'],
+                                                                'Discordant_reads_informtive': result['discordant_reads_informative'],
+                                                                'Discordant_reads_uninformtive': result['discordant_reads_uninformative'],
+                                                                'discordant_reads_informative_subset':result['discordant_reads_informative_subset'],
+                                                                'Discordant_reads_by_n_sites': read_discordance,
+                                                                'Discordant_sites_in_pool': len(result['Discordant_sites_in_pool']),
+                                                                'Lowest_Disconcordance_value_in_all_donors':result['Lowest_Disconcordance_value_in_all_donors'],
+                                                                'Donor_With_Lowest_DisConcordance':result['Donor_With_Lowest_DisConcordance'],
+                                                                'Concordant_Site_Identities':result['Concordant_Site_Identities'],
+                                                                'Donor_With_Highest_Concordance':result['Donor_With_Highest_Concordance'],
+                                                                'Highest_Concordance_value_in_all_donors':result['Highest_Concordance_value_in_all_donors'],
+                                                                'same_as_asigned_donor':same_as_asigned_donor,
+                                                                'Total_sites_other_donor (if same_as_asigned_donor=False)':result['Total_sites_other_donor'],
+                                                                'Total_reads_other_donor (if same_as_asigned_donor=False)':result['Total_reads_other_donor'],
+                                                                'total_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['total_discordant_sites_that_are_concordant_with_other_donors_in_pool'],
+                                                                'discordant_read_fraction_in_concordant_site':result['discordant_read_fraction_in_concordant_sites'], 
+                                                                'discordant_read_fraction_in_discordant_sites':result['discordant_read_fraction_in_discordant_sites'], 
+                                                                'Whithin_Cohort__total_number_of_potential_contaminent_reads':result['Whithin_Cohort__total_number_of_potential_contaminent_reads'],
+                                                                'Out_of_Cohort__total_number_of_potential_contaminent_reads':result['Out_of_Cohort__total_number_of_potential_contaminent_reads'], 
+                                                                'NrDonors_contributing_to_out_of_cohort':result['NrDonors_contributing_to_out_of_cohort'], 
+                                                                'NrDonors_contributing_to_Whithin_Cohort':result['NrDonors_contributing_to_Whithin_Cohort'],
+                                                                
+                                                                'Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool'],
+                                                                'Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool'],
+                                                                'shared_sites_between_Out_of_Cohort_and_Within_Cohort':result['shared_sites_between_Out_of_Cohort_and_Within_Cohort'],
+                                                                'Within_Cohort__unique_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool'],
+                                                                'Out_of_Cohort__unique_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool'],
+                                                                'Out_of_Cohort__unique_sites_total_reads':result['Out_of_Cohort__unique_sites_total_reads'],
+                                                                'Total__discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Total__discordant_sites_that_are_concordant_with_other_donors_in_pool']
+                                                                } 
+        
+        return [cell_concordance_table,other_donor_concordance_table]
     
     
+    def combine_written_lists(self,exclusive_donor_variants,record_dict):#this is for VCF loader class
+        to_export = exclusive_donor_variants
+        for val1 in record_dict.values():
+            # here remove the int files.
+            print(f"merging temp file: {val1}")
+            with open(val1, 'rb') as f:
+                loaded_dict = pickle.load(f)
+                self.other_donor_comp = self.other_donor_comp+ loaded_dict
+            os.remove(val1)
+        return self.other_donor_comp    
     
+    def combine_written_files(self,exclusive_donor_variants,record_dict):#this is for VCF loader class
+        to_export = exclusive_donor_variants
+        for val1 in record_dict.values():
+            # here remove the int files.
+            print(f"merging temp file: {val1}")
+            with open(val1, 'rb') as f:
+                loaded_dict = pickle.load(f)
+                for k1 in loaded_dict.keys():
+                    try:
+                        to_export[k1]=to_export[k1].union(loaded_dict[k1])
+                    except:
+                        to_export[k1]=set()
+                        to_export[k1]=to_export[k1].union(loaded_dict[k1])
+            os.remove(val1)
+        return to_export
+   
+    def set_results(self,to_set,id):
+        # Recod to disk to save the loading mmeory time.
+        hash = random.getrandbits(128)
+        with open(f'tmp_{id}_{hash}.pkl', 'wb') as f:
+            pickle.dump(to_set, f)
+        self.record_dict[id]=f'tmp_{id}_{hash}.pkl' 
     
-        # No het total calcs
+
+    def combine_concordances(self,result,other_donor_concordance,donor_gt_match,analyse_donor):
+        pd.DataFrame(other_donor_concordance).sort_values(by=['cell']).to_csv(f'{donor_gt_match}-{analyse_donor}--each_cells_comparison_with_other_donor.tsv',sep='\t',index=False)
+        self.cell_concordance_table = {**self.cell_concordance_table, **result}
+
+    def combine_dict(self,cell_concordance_table,result): 
+        cell_concordance_table = {**cell_concordance_table, **result}    
+        return cell_concordance_table  
+
+    def conc_table(self):
+        donor_assignments_table=self.donor_assignments_table
+        cell_assignments_table=self.cell_assignments_table
+        exclusive_don_variants=self.exclusive_don_variants
+        exclusive_cell_variants= self.exclusive_cell_variants
+        donor_list = exclusive_don_variants.keys()
+        # cpus = 1
         
-        # Since we can not tell just from the cellsnp file if the discordant read at concordant site is actually concordant with another donor in a pool
-        # we assume the worse case scenario and add idscordant reads at concordant sites to the total.
-        # Number of reads on the sites that are concordant, but has discordant reads with another donor
-        Whithin_Cohort__total_number_of_potential_contaminent_reads=0
-        try:
-            concordant_sites_atributed_to_other_donor = list(set(sites_contributing_to_Whithin_Cohort))
-            if not total_sites >= len(concordant_sites_atributed_to_other_donor):
-                raise Exception("Sorry, the number of sites atributed can not be larger than total sites")
-            
-            _,_,_,\
-                _,Whithin_Cohort__total_number_of_potential_contaminent_reads, \
-                    _= self.read_extraction(concordant_sites_atributed_to_other_donor,expected_vars_norm,cell_vars_norm)
-                    
-            if not discordant_reads >= len(Whithin_Cohort__total_number_of_potential_contaminent_reads):
-                raise Exception("Total discordant reads can not be less than within cohort discordant reads.")            
-            
-            
-        except:
-            _='Doesnt Exist'
+        count = 0
+        
+        
+        #create a list of variants that are on each donor genotype file
+        vars_per_donor_gt = {}
+        for don_id in donor_list:
+            donor_gt_vars = list(exclusive_don_variants[don_id])
+            donor_gt_vars = pd.DataFrame(donor_gt_vars)
+            donor_gt_vars = self.norm_genotypes(donor_gt_vars)
+            donor_gt_vars = donor_gt_vars[donor_gt_vars['vars'] != '0/0']
+            donor_gt_varids = list(donor_gt_vars['ids'])
+            vars_per_donor_gt[don_id] = donor_gt_varids
+        
+        #work out what cohort each donor belongs to
+        donor_cohorts = {}
+        for don_id in donor_list:
+            cohort = 'UNKNOWN'
+            donor_split = don_id.split("_")
+            if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]):
+                cohort = 'UKB'
+            elif (len(donor_split) == 3) and (len(donor_split[0]) == 14):
+                cohort = 'ELGH'
+            donor_cohorts[don_id] = cohort
 
+        all_donor_data={}
+        # here we calvculate all the expected donor datasets
+        for row1 in exclusive_don_variants.keys():
+            # donor_in_question = row1['donor_query']
+            donor_gt_match = row1
+            expected_vars_of_other_donor = self.exclusive_don_variants[donor_gt_match]
+            expected_vars_norm_of_other_donor = self.norm_genotypes(expected_vars_of_other_donor)
+            all_donor_data[donor_gt_match]=expected_vars_norm_of_other_donor
 
-        Out_of_Cohort__total_number_of_potential_contaminent_reads=0
-        try:
-            concordant_sites_atributed_to_other_donor =  list(set(sites_contributing_to_out_of_cohort))
-            if not total_sites >= len(concordant_sites_atributed_to_other_donor):
-                raise Exception("Sorry, the number of sites atributed can not be larger than total sites")
-            
-            _,_,_,_,Out_of_Cohort__total_number_of_potential_contaminent_reads, _= self.read_extraction(concordant_sites_atributed_to_other_donor,expected_vars_norm,cell_vars_norm)
-            
-            if not discordant_reads >= len(Out_of_Cohort__total_number_of_potential_contaminent_reads):
-                raise Exception("Total discordant reads can not be less than out of cohort discordant reads.")                       
+        results = []
+        for i,row1 in donor_assignments_table.iterrows():
+            donor_in_question = row1['donor_query']
+            donor_gt_match = row1['donor_gt']
+            print(donor_gt_match)
+            # if i>4:
+            #     continue
+            if (donor_gt_match=='NONE'):
+                continue
+            try:
+                donor_gt_match_cohort = donor_cohorts[donor_gt_match]
+            except:
+                continue
+            Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell']))
+            expected_vars = exclusive_don_variants[donor_gt_match]
+            expected_vars_norm = self.norm_genotypes(expected_vars)
+            try:
+                # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor.
+                dds = self.donor_distinct_sites[donor_gt_match]
+            except:
+                continue
+            # if cpus==1:
+            result,other_donor_concordance,donor_gt_match,donor_id = Donor(donor_in_question,self.informative_sites,self.uninformative_sites,self.donor_assignments_table,self.cell_assignments_table,self.exclusive_cell_variants,self.donor_distinct_sites,cpus).analyse_donor(Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,all_donor_data,expected_vars_norm,donor_assignments_table,exclusive_cell_variants)
+            self.combine_concordances(result,other_donor_concordance,donor_gt_match,donor_id)
+
+
+
+        return self.cell_concordance_table
 
-        except:
-            _='Doesnt Exist'
-                
     
+
+    def read_extraction(self,DonorDiscordant_Sites_that_are_atributed_to_other_donor,expected_vars_norm,cell_vars_norm):
+        # we need this function wrapper to calculate the concordant, discordant read 
+        # counts for each of the discordant sites that are concordant with another donor.
         
-        try:
-            Out_of_Cohort__sites = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys())
-        except:
-            Out_of_Cohort__sites = set()
-            
-        try:
-            Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) - set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) 
-            shared_sites_between_out_of_cohort_and_within_cohort = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()).intersection(set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) )
-        except:
-            Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set()
-            shared_sites_between_out_of_cohort_and_within_cohort = set()
-            
-        try:
-            Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) - set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) 
-        except:
-            Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys())
+        Total_Overlapping_sites = set(DonorDiscordant_Sites_that_are_atributed_to_other_donor)
+        expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)]
+        cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)]   
+        
+        cell_vars2 = cell_vars2.drop_duplicates(subset=['ids'])
+        expected_vars2 = expected_vars2.drop_duplicates(subset=['ids'])
+        cell_vars2['DP'] = cell_vars2[0].str.split("_").str[5].astype(int)
+        cell_vars2['AD'] = cell_vars2[0].str.split("_").str[6].astype(int)
+        cell_vars2['OTH'] = cell_vars2[0].str.split("_").str[7].astype(int)
+        total_reads,_,_,discordant_reads, \
+            discordantRead_ratio,discordant_reads_noHet,total_reads_noHet,discordantRead_ratio_noHet= self.read_concordance_calc(expected_vars2,cell_vars2)
+        concordant_reads = total_reads - discordant_reads
+        concordant_reads_noHet = total_reads_noHet-discordant_reads_noHet
+        return total_reads,discordant_reads,concordant_reads,\
+            total_reads_noHet,discordant_reads_noHet,concordant_reads_noHet
+    
 
-        Out_of_Cohort__unique_sites_total_reads,_,_,_,_,_ = self.read_extraction(Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool,expected_vars_norm,cell_vars_norm)
-            
-        try:
-            Whithin_Cohort__sites = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys())
-        except:
-            Whithin_Cohort__sites = set()  
-            
+class Donor(Concordances):
+    def __init__(self, donor_id,informative_sites,uninformative_sites,donor_assignments_table,cell_assignments_table,exclusive_cell_variants,donor_distinct_sites,pool):
+        self.donor_concordance_table = {}
+        self.other_donor_concordance = []
+        self.pool = pool
+        self.donor_id = donor_id
+        self.informative_sites = informative_sites
+        self.uninformative_sites = uninformative_sites
+        self.donor_assignments_table=donor_assignments_table
+        self.cell_assignments_table=cell_assignments_table
+        self.exclusive_don_variants=exclusive_don_variants 
+        self.exclusive_cell_variants=exclusive_cell_variants
+        self.donor_distinct_sites=donor_distinct_sites
 
-          
-        Total__discordant_sites_that_are_concordant_with_other_donors_in_pool = Whithin_Cohort__sites.union(Out_of_Cohort__sites)
-        concordant_vars_in_pool_str = (";").join(concordant_vars)
-        DF = pd.DataFrame(donor_table_of_concordances)
+    def combine_results(self,result):
+        donor_concordance_table=result[0]
+        other_donor_concordance_table=result[1]
+        # other_donor_concordance_table + other_donor_concordances
+        self.other_donor_concordance = self.other_donor_concordance + other_donor_concordance_table
+        self.donor_concordance_table = {**self.donor_concordance_table, **donor_concordance_table}
         
-        Donor_With_Lowest_DisConcordance = ';'.join(DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['donor'].values)
-        Lowest_Disconcordance_value_in_all_donors= DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['discordant_percent_in_other_donor'].values[0]
-        
-        Donor_With_Highest_Concordance = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['donor'].values)
-        Highest_Concordance_value_in_all_donors= DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['concordant_percent_in_other_donor'].values[0]
-        Total_sites_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_sites_otherDonor'].astype(str).values)
-        Total_reads_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_reads_otherDonor'].astype(str).values)
-                                                       
-        return [{
-            'cell1':cell1,
-            'donor_gt_match':donor_gt_match,
-            'Nr_Concordant':Nr_Concordant,
-            'Nr_Discordant':Nr_Discordant,
-            'Nr_Relaxed_concordant':Nr_Relaxed_concordant,
-            'true_discordant_count':true_discordant_count,
-            'relaxed_concordant_informative_count':relaxed_concordant_informative_count,
-            'relaxed_concordant_uninformative_count':relaxed_concordant_uninformative_count,
-            'true_discordant_informative_count':true_discordant_informative_count,
-            'true_discordant_uninformative_count':true_discordant_uninformative_count,
-            'Nr_Total_Overlapping_sites':Nr_Total_Overlapping_sites,
-            'Number_of_sites_that_are_donor_concordant_and_exclusive':Number_of_sites_that_are_donor_concordant_and_exclusive,
-            'Nr_donor_distinct_sites':Nr_donor_distinct_sites,
-            'count':count,
-            'discordant_sites':discordant_sites,
-            'total_sites':total_sites,
-            'informative_sites':informative_sites,
-            'uninformative_sites':uninformative_sites,
-            'total_reads_informative':total_reads_informative,
-            'total_reads_uninformative':total_reads_uninformative,
-            'discordant_reads_informative':discordant_reads_informative,
-            'discordant_reads_uninformative':discordant_reads_uninformative,
-            'Discordant_sites_in_pool': discordant_vars,
-            'Lowest_Disconcordance_value_in_all_donors':Lowest_Disconcordance_value_in_all_donors,
-            'Donor_With_Lowest_DisConcordance':Donor_With_Lowest_DisConcordance,
-            'Concordant_Site_Identities':concordant_vars_in_pool_str,
-            'Donor_With_Highest_Concordance':Donor_With_Highest_Concordance,
-            'Highest_Concordance_value_in_all_donors':Highest_Concordance_value_in_all_donors,
-            'Total_sites_other_donor':Total_sites_other_donor,
-            'Total_reads_other_donor':Total_reads_other_donor,
-            'total_discordant_sites_that_are_concordant_with_other_donors_in_pool':f"{len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool)}/{len(discordant_vars)}",
-            'informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool':f"{len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool)}/{len(true_discordant_informative_count)}",
-            'total_reads':total_reads,
-            'discordant_reads':discordant_reads,
-            'discordant_reads_informative_subset':discordant_reads_informative_subset, \
-            'discordant_read_fraction_in_concordant_sites':discordant_read_fraction_in_concordant_sites, \
-            'discordant_read_fraction_in_discordant_sites':discordant_read_fraction_in_discordant_sites, \
-            'Whithin_Cohort__total_number_of_potential_contaminent_reads':Whithin_Cohort__total_number_of_potential_contaminent_reads, \
-            'Out_of_Cohort__total_number_of_potential_contaminent_reads':Out_of_Cohort__total_number_of_potential_contaminent_reads, \
-            'NrDonors_contributing_to_out_of_cohort':len(set(donors_contributing_to_out_of_cohort)), \
-            'NrDonors_contributing_to_Whithin_Cohort':len(set(donors_contributing_to_Whithin_Cohort)), \
-                
-            'Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool, \
-            'Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool, \
-            'shared_sites_between_Out_of_Cohort_and_Within_Cohort':len(shared_sites_between_out_of_cohort_and_within_cohort), \
-            'Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool), \
-            'Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool), \
-            'Out_of_Cohort__unique_sites_total_reads':Out_of_Cohort__unique_sites_total_reads, \
-            'Total__discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Total__discordant_sites_that_are_concordant_with_other_donors_in_pool)
-        }, donor_table_of_concordances]
+    def analyse_cells(self,chunk_df,exclusive_cell_variants,expected_vars_norm,donor_gt_match,vars_per_donor_gt,donor_cohorts,all_donor_data,donor_assignments_table,donor_gt_match_cohort):
+        donor_concordance_table = {}
+        other_donor_concordance_table = []
+        count=0
+        for cell1 in chunk_df:
+            count+=1
+            cell_vars = exclusive_cell_variants[cell1]
+            # try:
+            result1, other_donor_concordances = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data,donor_assignments_table)
+            donor_concordance_table,other_donor_concordance_table = self.append_results_cell_concordances(result1,donor_concordance_table,other_donor_concordances,other_donor_concordance_table)
+            # except:
+            #     continue
+        return [donor_concordance_table, other_donor_concordance_table]
     
+    def analyse_donor(self,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,all_donor_data,expected_vars_norm,donor_assignments_table,exclusive_cell_variants):
+        n=200
+        print(f'-- Using {cpus} cpus ---')
+        pool = mp.Pool(cpus)
+        # Cells_to_keep_pre = Cells_to_keep_pre[:13]
+        list_df = [Cells_to_keep_pre[i:i+n] for i in range(0,len(Cells_to_keep_pre),n)]
+        # results = []
+        for chunk_df in list_df:
+            pool.apply_async(self.analyse_cells,args =([chunk_df,exclusive_cell_variants,expected_vars_norm,donor_gt_match,vars_per_donor_gt,donor_cohorts,all_donor_data,donor_assignments_table,donor_gt_match_cohort]),callback=self.combine_results)
+            # results.append(result)
+        # while True:
+        #     time.sleep(1)
+        #     # catch exception if results are not ready yet
+        #     try:
+        #         ready = [result.ready() for result in results]
+        #         successful = [result.successful() for result in results]
+        #     except Exception:
+        #         continue
+        #     # exit loop if all tasks returned success
+        #     if all(successful):
+        #         break
+        #     # raise exception reporting exceptions received from workers
+        #     if all(ready) and not all(successful):
+        #         raise Exception(f'Workers raised following exceptions {[result._value for result in results if not result.successful()]}')
+        # self.pool.join()
+            # try:
+            #     [result.wait() for result in results]        
+            # except:
+            #     print('done')             
+            # r1,r2 = self.analyse_cells(chunk_df,exclusive_cell_variants,expected_vars_norm,donor_gt_match,vars_per_donor_gt,donor_cohorts,all_donor_data,donor_assignments_table,donor_gt_match_cohort)
+            # self.combine_results(r1,r2)
+        print('Done') 
+        pool.close()
+        pool.join() 
+        return self.donor_concordance_table,self.other_donor_concordance,donor_gt_match,self.donor_id 
     
 class VCF_Loader:
     
diff --git a/bin/gather_minimal_dataset.py b/bin/gather_minimal_dataset.py
index 3a6cd6b..4349485 100755
--- a/bin/gather_minimal_dataset.py
+++ b/bin/gather_minimal_dataset.py
@@ -413,6 +413,9 @@ def gather_donor(donor_id, ad, ad_lane_raw, azimuth_annot, qc_obs, columns_outpu
     ad.obs = ad.obs.loc[:,~ad.obs.columns.duplicated()]
     if write_h5:
         path1=os.path.join(outdir, oufnam + '.h5ad')
+        ad.obs['qc.filter.pass.AZ:L0'] = ad.obs['qc.filter.pass.AZ:L0'].astype('bool')
+        ad.obs['cell_passes_hard_filters'] = ad.obs['cell_passes_hard_filters'].astype('bool')
+        ad.obs['qc.filter.pass'] = ad.obs['qc.filter.pass'].astype('bool')
         ad.write(path1,compression='gzip')
 
     return {
diff --git a/conf/base.conf b/conf/base.conf
index 249dc9f..98bb86d 100755
--- a/conf/base.conf
+++ b/conf/base.conf
@@ -9,7 +9,7 @@
 */
 
 params{
-    input = 'cellbender' 
+    input = 'cellbender' //# cellbender|cellranger
     rsync_to_web_file = "${launchDir}/yascp/bin/rsync_to_web.sh"
     profile = 'normal_run'
     citeseq = false
@@ -59,11 +59,12 @@ params{
     cluster_validate_resolution_keras = true
     input_tables_column_delimiter = '\t'
     outdir= "${launchDir}/results"
+    tracedir                   = "${params.outdir}/pipeline_info"
     do_deconvolution = true
     split_bam = false
     run_multiplet = true
     utilise_gpu = true
-    split_ad_per_bach = false
+    split_ad_per_bach = true
     cellbender_resolution_to_use='0pt1'
     reference_assembly_fasta_dir = "https://yascp.cog.sanger.ac.uk/public/10x_reference_assembly"
     webtransfer = false
@@ -239,246 +240,31 @@ process {
     withLabel:process_medium_memory {
         memory = { 30.GB * task.attempt }
     }
-
-    withName: SCRUBLET{
-        maxRetries    = 3
-        errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' }
-    }
-
-
-    withName: DOUBLET_DECON{
-        maxRetries    = 3
-        errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' }
-        memory = { 100.GB * task.attempt }
-    }
-    
-    withName: AZIMUTH{
-        maxRetries    = 3
-        errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' }
-    }
-    withName: CELLTYPIST{
-        maxRetries    = 3
-        errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' }
-    }
-    withName: cellbender__remove_background{
-        maxRetries    = 2
-        //# errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' }
-    }
-
-    withLabel:error_ignore {
-        errorStrategy = 'ignore'
-    }
-    withLabel:error_retry {
-        errorStrategy = 'retry'
-        maxRetries    = 2
-    }
-
-    withName:cluster_validate_resolution_keras{
-        maxForks=4
+    withName: ASSESS_CALL_RATE{
         maxRetries    = 3
-        memory = { 60.GB * task.attempt }
-        time   = { 12.h   * task.attempt }
+        memory = { 10.GB * task.attempt }
         errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' }
     }
 
-    withName: CELLTYPIST{
-        maxForks=7
-    }
-
-    withName: CELLTYPE_FILE_MERGE{
-        memory = { 60.GB * task.attempt }
-    }
-
-    withName: NORMALISE_AND_PCA{
-        maxForks=7
-        errorStrategy = 'retry'
-        memory = { 50.GB * task.attempt}
-        maxRetries    = 8
-        cpus   = 4
-    }
-    
-    withName: LISI{
-        maxForks=7
-        memory =300.GB
-    }
-
-    withName: RESOLVE_POOL_VCFS{
-        cpus =  1
-        memory = { 1.GB * task.attempt }
-        time   = {  12.h   * task.attempt }
-        maxRetries    = 3
-    }
-
-    withName: SPLIT_BATCH_H5AD{
-        cpus =  2
-        memory = { 25.GB * task.attempt * 0.5}
-        time   = {  12.h   * task.attempt }
-        maxRetries    = 3
-    }
-
-    withName: SUBSET_GENOTYPE2{
-        cpus =  2
-        memory = { 1.GB * task.attempt }
-        time   = {  12.h   * task.attempt }
-        maxRetries    = 3
-    }
-
-    withName: JOIN_STUDIES_MERGE{
-        cpus =  1
-        memory = { 20.GB * task.attempt }
-        time   = {  12.h   * task.attempt }
-        maxRetries    = 3
-    }    
-    
-    withName: FREEBAYES{
-        cpus =  1
-        time   = {  12.h   * task.attempt }
-        maxRetries    = 2
-    }
-
-    withName: VIREO_ADD_SAMPLE_PREFIX{
-        cpus =  1
-        memory = { 2.GB * task.attempt }
-        time   = {  12.h   * task.attempt }
-        maxRetries    = 3
-    }
-
-
-    withName: REPLACE_GT_DONOR_ID2{
-        cpus =  1
-        memory = { 1.GB * task.attempt }
-        time   = 12.h
-        maxRetries    = 3
-    }
-
-    withName: JOIN_CHROMOSOMES{
-        cpus =  1
-        memory = { 2.GB * task.attempt }
-        time   = 12.h
-        maxRetries    = 3
-    }
-
-    withName: serialize_known_markers{
-        cpus =  1
-        memory = { 1.GB * task.attempt }
-        time   = {  12.h   * task.attempt }
-        maxRetries    = 3
-    }
-
-    withName: OUTLIER_FILTER{
-        errorStrategy = 'retry'
-        memory = { 50.GB * task.attempt}
-        maxRetries    = 8
-    }
-
-   
-
-    withName: cluster{
-        cpus   = {  3     * task.attempt }
-    }
-
-    withName: LISI{
-        maxForks=7
-        errorStrategy = 'retry'
-        maxRetries    = 8
-        memory = { 200.GB * task.attempt}
-    }
-
-    withName: VIREO_GT_FIX_HEADER{
-        errorStrategy = 'retry'
-        maxRetries    = 4
-        cpus = 1
-        memory = { 1.GB * task.attempt }
-    }
-
-
-    withName: JOIN_CHROMOSOMES{
-        errorStrategy = 'retry'
-        maxRetries    = 4
-    }
-
-    withName: cluster{
-        errorStrategy = 'retry'
-        maxRetries    = 4
-    }
-
-    withName: SPLIT_BAM_BY_CELL_BARCODES {
-      cpus = 1
-      memory = { 8.GB * task.attempt}
-      time = 4.h
-    }
-
-    withName: CONCORDANCE_CALCLULATIONS{
-        cpus = 5
-        time   = {  12.h   * task.attempt }
-        memory = { 70.GB * task.attempt }
-    }
-
-    withName: OTHER_DONOR_CONCORDANCE_CALCLULATIONS{
-        cpus = 3
-        time   = {  6.h   * task.attempt }
-        memory = { 20.GB * task.attempt }
-    }
-
-
-    withName: CELLSNP{
-        memory = { 5.GB * task.attempt }
-    }
-
-    withName: DYNAMIC_DONOR_EXCLUSIVE_SNP_SELECTION{
-        cpus = 5
-        time   = { 12.h   * task.attempt }
-        memory = { 20.GB * task.attempt }
-    }
-
-    withName: prep_collectmetadata{
-        memory = { 150.MB * task.attempt }
-    }
-
-    withName: VIREO{
-        //# maxForks=7
-        cpus   = 5
-        time   = { 12.h   * task.attempt }
-        memory = { 70.GB * task.attempt }
-    }
-    withName: DSB_INTEGRATE{
-        memory = { 200.GB * task.attempt }
-        cpus   = {  4     * task.attempt }
-        maxRetries    = 3
-    }
-
-    withName: MULTIMODAL_INTEGRATION{
-        memory = { 200.GB * task.attempt }
-        cpus   = {  4     * task.attempt }
-        maxRetries    = 3
-    }
-
-    withName: umap_gather{
-        memory = { 100.GB * task.attempt }
-        errorStrategy = 'retry'
-        maxRetries    = 3
-    }
-
-    withName: DOUBLET_FINDER{
-        memory = { 100.GB * task.attempt }
-    }
-
-
-
-    withName: GT_MATCH_POOL_AGAINST_PANEL{
-        time   = { 24.h   * task.attempt }
-    }
-    
-    withName: plot_predicted_sex{
-        memory = { 50.GB * task.attempt }
-        maxRetries = 5
-        cpus = 2
-
-    }
-
-
+}
 
 
+def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss')
+timeline {
+    enabled = true
+    file    = "${params.tracedir}/execution_timeline_${trace_timestamp}.html"
+}
+report {
+    enabled = true
+    file    = "${params.tracedir}/execution_report_${trace_timestamp}.html"
+}
+trace {
+    enabled = true
+    file    = "${params.tracedir}/execution_trace_${trace_timestamp}.txt"
+}
+dag {
+    enabled = true
+    file    = "${params.tracedir}/pipeline_dag_${trace_timestamp}.svg"
 }
 
 singularity {
diff --git a/conf/cellbender.conf b/conf/cellbender.conf
index 636dd16..a8449ec 100755
--- a/conf/cellbender.conf
+++ b/conf/cellbender.conf
@@ -59,8 +59,6 @@ params {
                 value{
                     expected_nemptydroplets_umi_cutoff = 0
                     method_estimate_ncells = 'dropletutils::barcoderanks::inflection'
-                    //method_estimate_ncells = 'cellrangerv2::expected' //this method feeds in the cellranger estimate ncells
-                    //method_estimate_ncells = 'expected'
                     lower_bound_umis_estimate_ncells = 1000
                     method_estimate_nemptydroplets = 'dropletutils::barcoderanks::inflection,dropletutils::barcoderanks::knee,0.33'
                     lower_bound_umis_estimate_nemptydroplets = 10
diff --git a/conf/modules.conf b/conf/modules.conf
index 1885d64..0df8fa4 100755
--- a/conf/modules.conf
+++ b/conf/modules.conf
@@ -52,4 +52,250 @@ params {
         }
 
     }
+
+    withName: SCRUBLET{
+        maxRetries    = 3
+        errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' }
+    }
+
+
+    withName: DOUBLET_DECON{
+        maxRetries    = 3
+        errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' }
+        memory = { 100.GB * task.attempt }
+    }
+    
+    withName: AZIMUTH{
+        maxRetries    = 3
+        errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' }
+    }
+    withName: CELLTYPIST{
+        maxRetries    = 3
+        errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' }
+    }
+    withName: cellbender__remove_background{
+        maxRetries    = 2
+        //# errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' }
+    }
+
+    withLabel:error_ignore {
+        errorStrategy = 'ignore'
+    }
+    withLabel:error_retry {
+        errorStrategy = 'retry'
+        maxRetries    = 2
+    }
+
+    withName:cluster_validate_resolution_keras{
+        maxForks=4
+        maxRetries    = 3
+        memory = { 60.GB * task.attempt }
+        time   = { 12.h   * task.attempt }
+        errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' }
+    }
+
+    withName: CELLTYPIST{
+        maxForks=7
+    }
+
+    withName: ASSESS_CALL_RATE{
+        maxRetries    = 3
+        memory = { 10.GB * task.attempt }
+        errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' }
+    }
+
+
+    withName: CELLTYPE_FILE_MERGE{
+        memory = { 60.GB * task.attempt }
+    }
+
+    withName: NORMALISE_AND_PCA{
+        maxForks=7
+        errorStrategy = 'retry'
+        memory = { 50.GB * task.attempt}
+        maxRetries    = 8
+        cpus   = 4
+    }
+    
+    withName: LISI{
+        maxForks=7
+        memory =300.GB
+    }
+
+    withName: RESOLVE_POOL_VCFS{
+        cpus =  1
+        memory = { 1.GB * task.attempt }
+        time   = {  12.h   * task.attempt }
+        maxRetries    = 3
+    }
+
+    withName: SPLIT_BATCH_H5AD{
+        cpus =  2
+        memory = { 25.GB * task.attempt * 0.5}
+        time   = {  12.h   * task.attempt }
+        maxRetries    = 3
+    }
+
+    withName: SUBSET_GENOTYPE2{
+        cpus =  2
+        memory = { 1.GB * task.attempt }
+        time   = {  12.h   * task.attempt }
+        maxRetries    = 3
+    }
+
+    withName: JOIN_STUDIES_MERGE{
+        cpus =  1
+        memory = { 20.GB * task.attempt }
+        time   = {  12.h   * task.attempt }
+        maxRetries    = 3
+    }    
+    
+    withName: FREEBAYES{
+        cpus =  1
+        time   = {  12.h   * task.attempt }
+        maxRetries    = 2
+    }
+
+    withName: VIREO_ADD_SAMPLE_PREFIX{
+        cpus =  1
+        memory = { 2.GB * task.attempt }
+        time   = {  12.h   * task.attempt }
+        maxRetries    = 3
+    }
+
+
+    withName: REPLACE_GT_DONOR_ID2{
+        cpus =  1
+        memory = { 1.GB * task.attempt }
+        time   = 12.h
+        maxRetries    = 3
+    }
+
+    withName: JOIN_CHROMOSOMES{
+        cpus =  1
+        memory = { 2.GB * task.attempt }
+        time   = 12.h
+        maxRetries    = 3
+    }
+
+    withName: serialize_known_markers{
+        cpus =  1
+        memory = { 1.GB * task.attempt }
+        time   = {  12.h   * task.attempt }
+        maxRetries    = 3
+    }
+
+    withName: OUTLIER_FILTER{
+        errorStrategy = 'retry'
+        memory = { 50.GB * task.attempt}
+        maxRetries    = 8
+    }
+
+   
+
+    withName: cluster{
+        cpus   = {  3     * task.attempt }
+    }
+
+    withName: LISI{
+        maxForks=7
+        errorStrategy = 'retry'
+        maxRetries    = 8
+        memory = { 200.GB * task.attempt}
+    }
+
+    withName: VIREO_GT_FIX_HEADER{
+        errorStrategy = 'retry'
+        maxRetries    = 4
+        cpus = 1
+        memory = { 1.GB * task.attempt }
+    }
+
+
+    withName: JOIN_CHROMOSOMES{
+        errorStrategy = 'retry'
+        maxRetries    = 4
+    }
+
+    withName: cluster{
+        errorStrategy = 'retry'
+        maxRetries    = 4
+    }
+
+    withName: SPLIT_BAM_BY_CELL_BARCODES {
+      cpus = 1
+      memory = { 8.GB * task.attempt}
+      time = 4.h
+    }
+
+    withName: CONCORDANCE_CALCLULATIONS{
+        cpus = 20
+        time   = {  24.h   * task.attempt }
+        memory = { 100.GB * task.attempt }
+    }
+
+    withName: OTHER_DONOR_CONCORDANCE_CALCLULATIONS{
+        cpus = 3
+        time   = {  6.h   * task.attempt }
+        memory = { 20.GB * task.attempt }
+    }
+
+
+    withName: CELLSNP{
+        memory = { 5.GB * task.attempt }
+    }
+
+    withName: DYNAMIC_DONOR_EXCLUSIVE_SNP_SELECTION{
+        cpus = 5
+        time   = { 12.h   * task.attempt }
+        memory = { 20.GB * task.attempt }
+    }
+
+    withName: prep_collectmetadata{
+        memory = { 150.MB * task.attempt }
+    }
+
+    withName: VIREO{
+        //# maxForks=7
+        cpus   = 5
+        time   = { 12.h   * task.attempt }
+        memory = { 70.GB * task.attempt }
+    }
+    withName: DSB_INTEGRATE{
+        memory = { 200.GB * task.attempt }
+        cpus   = {  4     * task.attempt }
+        maxRetries    = 3
+    }
+
+    withName: MULTIMODAL_INTEGRATION{
+        memory = { 200.GB * task.attempt }
+        cpus   = {  4     * task.attempt }
+        maxRetries    = 3
+    }
+
+    withName: umap_gather{
+        memory = { 100.GB * task.attempt }
+        errorStrategy = 'retry'
+        maxRetries    = 3
+    }
+
+    withName: DOUBLET_FINDER{
+        memory = { 100.GB * task.attempt }
+    }
+
+
+
+    withName: GT_MATCH_POOL_AGAINST_PANEL{
+        time   = { 24.h   * task.attempt }
+    }
+    
+    withName: plot_predicted_sex{
+        memory = { 50.GB * task.attempt }
+        maxRetries = 5
+        cpus = 2
+
+    }
+
+
+
 }
diff --git a/conf/test.conf b/conf/test.conf
index 6b947ed..185ca42 100755
--- a/conf/test.conf
+++ b/conf/test.conf
@@ -23,7 +23,10 @@ params {
         posterior_assignment = false //if this is set to true, we will perform the genotype donor matching after the deconvolution is performed.
         subset_genotypes = false
         tsv_donor_panel_vcfs = "https://yascp.cog.sanger.ac.uk/public/test_datasets/full_test_dataset/vcf_inputs_v2.tsv" //this is a panel of vcf files that we want to compar the genotypes with
+        ZSCORE_THRESH = 8 //# Minimum z0 threshold required to call the gt assignment confident. 
+        ZSCORE_DIST_THRESH = 8 //# Minimum bifference between z0 and z1 to call the assignment confident,
     }
+
     hard_filters_file = "${projectDir}/sample_input/sample_qc.yml" //this file defilnes what hard filters we want to use to flag/drop the cells
 
     //# For the training purposes we reduce the cellbender epochs and learning rate as this step takes a long time to compute.
diff --git a/conf/test_full.conf b/conf/test_full.conf
index 6b947ed..9b6150a 100755
--- a/conf/test_full.conf
+++ b/conf/test_full.conf
@@ -23,6 +23,9 @@ params {
         posterior_assignment = false //if this is set to true, we will perform the genotype donor matching after the deconvolution is performed.
         subset_genotypes = false
         tsv_donor_panel_vcfs = "https://yascp.cog.sanger.ac.uk/public/test_datasets/full_test_dataset/vcf_inputs_v2.tsv" //this is a panel of vcf files that we want to compar the genotypes with
+        ZSCORE_THRESH = 8 //# Minimum z0 threshold required to call the gt assignment confident. 
+        ZSCORE_DIST_THRESH = 8 //# Minimum bifference between z0 and z1 to call the assignment confident,
+    
     }
     hard_filters_file = "${projectDir}/sample_input/sample_qc.yml" //this file defilnes what hard filters we want to use to flag/drop the cells
 
diff --git a/main.nf b/main.nf
index 93fc1f0..3cecc35 100755
--- a/main.nf
+++ b/main.nf
@@ -38,7 +38,7 @@ include {collect_file} from "$projectDir/modules/nf-core/modules/collect_file/ma
 include { CELLSNP;capture_cellsnp_files } from "$projectDir/modules/nf-core/modules/cellsnp/main"
 
 
-
+// Channel.of(params.outdir).mkdirs()
 
 ////// WORKFLOW: Run main nf-core/yascp analysis pipeline
 // This is the default entry point, we have others to update ceirtain parts of the results. 
diff --git a/modules/nf-core/modules/citeseq/main.nf b/modules/nf-core/modules/citeseq/main.nf
index 70edf6e..272029a 100755
--- a/modules/nf-core/modules/citeseq/main.nf
+++ b/modules/nf-core/modules/citeseq/main.nf
@@ -26,7 +26,7 @@ process SPLIT_CITESEQ_GEX {
         tuple val(sample_name), path("${sample_name}__Gene_Expression"), emit:gex_data
         tuple val(sample_name), path("antibody-${sample_name}.h5ad"), emit: ab_data2 optional true
         tuple val(sample_name), path("Gene_Expression-${sample_name}.h5ad"), emit: gex_h5ad optional true
-        tuple val(sample_name), path("${sample_name}__*"), emit: ab_data
+        tuple val(sample_name), path("${sample_name}__*"), emit: ab_data optional true
         tuple val(sample_name), path("${sample_name}__Gene_Expression/barcodes.tsv.gz"), path("${sample_name}__Gene_Expression/features.tsv.gz"), path("${sample_name}__Gene_Expression/matrix.mtx.gz"), emit: channel__file_paths_10x
  
     script:
@@ -166,6 +166,8 @@ process VDJ_INTEGRATION{
     
     output:
         path("*all_samples_integrated.vdj.RDS"), emit: all_data_integrated_vdj_rds
+        path("*all_samples_integrated.BCR.RDS"), emit: all_data_integrated_BCR_rds
+        path("*all_samples_integrated.TCR.RDS"), emit: all_data_integrated_TCR_rds
 
     input:
         path(all_cellranger_samples)
diff --git a/modules/nf-core/modules/doubletdecon/main.nf b/modules/nf-core/modules/doubletdecon/main.nf
index 0e9ef94..b50bc01 100644
--- a/modules/nf-core/modules/doubletdecon/main.nf
+++ b/modules/nf-core/modules/doubletdecon/main.nf
@@ -1,4 +1,4 @@
-process DOUBLET_DECON {
+process DOUBLET_DECON{
 
     tag "${experiment_id}"
     label 'process_medium'
diff --git a/modules/nf-core/modules/prepere_yascp_inputs/main.nf b/modules/nf-core/modules/prepere_yascp_inputs/main.nf
index a83c72c..ebabfe1 100644
--- a/modules/nf-core/modules/prepere_yascp_inputs/main.nf
+++ b/modules/nf-core/modules/prepere_yascp_inputs/main.nf
@@ -16,6 +16,7 @@ process YASCP_INPUTS {
     input:
         path(input_file)
 
+
     // the output for this is a correct format input files as per cb 6.1
     output:
         path("input_file_corectly_formatted.tsv"), emit: input_file_corectly_formatted
diff --git a/nextflow.config b/nextflow.config
index 572aa8b..2378616 100755
--- a/nextflow.config
+++ b/nextflow.config
@@ -25,8 +25,8 @@ params {
     max_multiqc_email_size     = '25.MB'
 
     // Boilerplate options
-    outdir    = output_dir                 = "${launchDir}/results"
-    tracedir                   = "${params.outdir}/pipeline_info"
+    outdir                     = "${launchDir}/results"
+    
     publish_dir_mode           = 'copy'
     email                      = null
     email_on_fail              = null
@@ -38,7 +38,8 @@ params {
     schema_ignore_params       = 'genomes,modules'
     enable_conda               = false
     singularity_pull_docker_container = false
-
+    tracedir                   = "${params.outdir}/pipeline_info"
+    
     // Config options
     custom_config_version      = 'master'
     custom_config_base         = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}"
@@ -78,13 +79,6 @@ includeConfig 'conf/qc.conf'
 includeConfig 'conf/modules.conf'
 
 
-// Load igenomes.config if required
-if (!params.igenomes_ignore) {
-    includeConfig 'conf/igenomes.conf'
-} else {
-    params.genomes = [:]
-}
-
 profiles {
     debug { process.beforeScript = 'echo $HOSTNAME' }
     conda {
diff --git a/subworkflows/celltype.nf b/subworkflows/celltype.nf
index 7966b7a..4e611ad 100755
--- a/subworkflows/celltype.nf
+++ b/subworkflows/celltype.nf
@@ -70,9 +70,9 @@ workflow celltype{
         }else{
             sc_out = Channel.of()
         }        
-        all_extra_fields = all_extra_fields.mix(sc_out)
+        all_extra_fields2 = all_extra_fields.mix(sc_out)
         
-        CELLTYPE_FILE_MERGE(az_out,ct_out,all_extra_fields,SPLIT_BATCH_H5AD.out.keras_outfile.collect())       
+        CELLTYPE_FILE_MERGE(az_out,ct_out,all_extra_fields2,SPLIT_BATCH_H5AD.out.keras_outfile.collect())       
         file__anndata_merged2=CELLTYPE_FILE_MERGE.out.file__anndata_merged2
         file__anndata_merged2.view()
 
diff --git a/subworkflows/local/retrieve_recourses.nf b/subworkflows/local/retrieve_recourses.nf
index 6a48228..505b1c3 100755
--- a/subworkflows/local/retrieve_recourses.nf
+++ b/subworkflows/local/retrieve_recourses.nf
@@ -64,4 +64,22 @@ process RETRIEVE_RECOURSES_TEST_DATASET{
         sed -i 's#/path/to/replace/pointing/to/downloaded/files#${params.outdir}/recourses/full_test_dataset#' input_test_data_file.tsv
         touch Done > Done.tmp
     """    
+}
+
+
+process STAGE_FILE{
+  label 'process_tiny'
+    // In nf there is a function collectFile - however if you provide a symlinked file directory tusing nf function will overwrite the source instead of replacing the file
+    // This snipped is a replication of the function but as a nf module and hence the problem is avoided.
+  // publishDir "${params.outdir}/recourses",  mode: "${params.copy_mode}", overwrite: true    
+  input:
+    path(file)
+  output:
+    path(file)
+
+  script:
+
+    """
+    echo 'staged'
+    """
 }
\ No newline at end of file
diff --git a/subworkflows/main_deconvolution.nf b/subworkflows/main_deconvolution.nf
index 08fd451..25828bf 100755
--- a/subworkflows/main_deconvolution.nf
+++ b/subworkflows/main_deconvolution.nf
@@ -17,7 +17,7 @@ include { match_genotypes } from './match_genotypes'
 include {ENHANCE_STATS_GT_MATCH } from "$projectDir/modules/nf-core/modules/genotypes/main"
 include {SUBSET_WORKF} from "$projectDir/modules/nf-core/modules/subset_genotype/main"
 include {REPLACE_GT_DONOR_ID2; REPLACE_GT_DONOR_ID2 as REPLACE_GT_DONOR_ID_SUBS } from "$projectDir/modules/nf-core/modules/genotypes/main"
-include { RETRIEVE_RECOURSES } from "$projectDir/subworkflows/local/retrieve_recourses"
+include { RETRIEVE_RECOURSES; STAGE_FILE } from "$projectDir/subworkflows/local/retrieve_recourses"
 include {GT_MATCH_POOL_IBD } from "$projectDir/modules/nf-core/modules/genotypes/main"
 include { MATCH_GT_VIREO } from "$projectDir/modules/nf-core/modules/genotypes/main"
 
@@ -52,7 +52,7 @@ workflow  main_deconvolution {
         }else{
             genome = "${params.reference_assembly_fasta_dir}"
         }
-
+        vcf_candidate_snps = STAGE_FILE(params.cellsnp.vcf_candidate_snps)
         // genome.subscribe { println "genome: $it" }
 
         if (params.genotype_input.run_with_genotype_input) {
@@ -73,7 +73,7 @@ workflow  main_deconvolution {
             // This will subsequently result in a joint vcf file per cohort per donors listed for each of the pools that can be used in VIREO and/or GT matching algorythm.
             SUBSET_WORKF(ch_ref_vcf,donors_in_pools,'AllExpectedGT',genome)
             merged_expected_genotypes = SUBSET_WORKF.out.merged_expected_genotypes
-            merged_expected_genotypes2 = merged_expected_genotypes.combine(Channel.fromPath(params.cellsnp.vcf_candidate_snps))
+            merged_expected_genotypes2 = merged_expected_genotypes.combine(vcf_candidate_snps)
             // merged_expected_genotypes2.subscribe { println "merged_expected_genotypes2: $it" }
             GT_MATCH_POOL_IBD(SUBSET_WORKF.out.samplename_subsetvcf_ibd,'Withing_expected','Expected')
 
@@ -105,10 +105,10 @@ workflow  main_deconvolution {
             if (params.provide_within_pool_donor_specific_sites_for_pilup){
                 cellsnp_with_npooled2 = cellsnp_with_npooled.combine(cellsnp_panels, by: 0)
             }else{
-                cellsnp_with_npooled2 = cellsnp_with_npooled.combine(Channel.fromPath(params.cellsnp.vcf_candidate_snps))
+                cellsnp_with_npooled2 = cellsnp_with_npooled.combine(vcf_candidate_snps)
             }
         }else{
-            cellsnp_with_npooled2 = cellsnp_with_npooled.combine(Channel.fromPath(params.cellsnp.vcf_candidate_snps))
+            cellsnp_with_npooled2 = cellsnp_with_npooled.combine(vcf_candidate_snps)
         }
 
         log.info('Capturing some of the existing CELLSNP files')
diff --git a/subworkflows/prepare_inputs.nf b/subworkflows/prepare_inputs.nf
index 69c438c..be61c90 100755
--- a/subworkflows/prepare_inputs.nf
+++ b/subworkflows/prepare_inputs.nf
@@ -15,6 +15,7 @@ workflow prepare_inputs {
 
 		YASCP_INPUTS(channel_input_data_table)
 		channel_input_data_table = YASCP_INPUTS.out.input_file_corectly_formatted
+        
 
         channel_input_data_table
             .splitCsv(header: true, sep: params.input_tables_column_delimiter)
diff --git a/workflows/yascp.nf b/workflows/yascp.nf
index 85492f5..22774af 100755
--- a/workflows/yascp.nf
+++ b/workflows/yascp.nf
@@ -57,7 +57,7 @@ workflow YASCP {
         bam_split_channel = Channel.of()
         out_ch = params.outdir
             ? Channel.fromPath(params.outdir, checkIfExists:true)
-            : Channel.fromPath("${launchDir}/${params.outdir}")
+            : Channel.from("${launchDir}/${params.outdir}")
                 
         // out_ch.map{row->"${row[0]}/possorted_genome_bam.bam" }
         prepare_inputs(input_channel)