From 29688df3536c10aadd5761ec48224caaba667076 Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Tue, 23 Apr 2024 13:48:03 +0100 Subject: [PATCH 1/8] pushing current changes --- assets/deploy_scripts/sanger_module_files/1.6 | 2 +- conf/qc.conf | 2 +- modules/nf-core/modules/cellsnp/main.nf | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/assets/deploy_scripts/sanger_module_files/1.6 b/assets/deploy_scripts/sanger_module_files/1.6 index 8bf0d00..d73f9ac 100644 --- a/assets/deploy_scripts/sanger_module_files/1.6 +++ b/assets/deploy_scripts/sanger_module_files/1.6 @@ -33,7 +33,7 @@ module load ISG/singularity/3.11.4 module load HGI/common/nextflow/22.04.4 module load HGI/common/baton module load ISG/experimental/irods/4.2.7 - +module load HGI/softpack/users/bh18/bcftools/1.0 prepend-path PATH "/software/hgi/envs/conda/team151/mo11/mo11/bin" prepend-path PATH "/software/hgi/containers/yascp/modules/full_yascp" prepend-path PATH "/software/hgi/pipelines/yascp_versions/yascp_v1.6/bin" diff --git a/conf/qc.conf b/conf/qc.conf index 05e1d4a..ae026d4 100755 --- a/conf/qc.conf +++ b/conf/qc.conf @@ -62,7 +62,7 @@ params{ expected_multiplet_rate = 0.1 //# used for scrublet and doubletFinder. doubletDetection{ - run_process = true + run_process = false } doubletDecon{ run_process = true diff --git a/modules/nf-core/modules/cellsnp/main.nf b/modules/nf-core/modules/cellsnp/main.nf index 27d8cf2..3c0abaa 100755 --- a/modules/nf-core/modules/cellsnp/main.nf +++ b/modules/nf-core/modules/cellsnp/main.nf @@ -41,7 +41,7 @@ process DYNAMIC_DONOR_EXCLUSIVE_SNP_SELECTION{ script: if (add_dynamic_sites_or_not_to_panel){ cmd2 = "cat cellsnp_variants.tsv >> cellsnp_panel_${samplename}.vcf" - }{ + }else{ cmd2 = '' } cmd1="ln -s ${vcf_file} dynamic_snps.vcf.gz" From 47a6876b4e8566660c9c4f5369ea9eb3396cd1e6 Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Thu, 25 Apr 2024 10:32:50 +0100 Subject: [PATCH 2/8] merge --- README.md | 3 +- .../module_exacutables/help.info | 1 + bin/0026-filter_outlier_cells.py | 3 +- bin/DoubletDecon.R | 58 +++++++++++++------ bin/azimuth.R | 18 +++--- conf/base.conf | 7 ++- conf/qc.conf | 3 +- modules/nf-core/modules/azimuth/main.nf | 8 +-- subworkflows/celltype.nf | 4 +- 9 files changed, 68 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index d42296c..4583e3f 100755 --- a/README.md +++ b/README.md @@ -4,8 +4,7 @@ ## Introduction -**nf-core/yascp** is a bioinformatics best-practice analysis pipeline for deconvolution, qc, clustering, integration of a single cell datasets. -This is a large scale single-cell pipeline developed initially for processing Cardinal project samples, however it is applicable to any other scRNA analysis. The pipeline has been inspired by [deconvolution](https://github.com/wtsi-hgi/nf_scrna_deconvolution.git ), [cellbender](https://github.com/wtsi-hgi/nf_cellbender ) and [qc](https://github.com/wtsi-hgi/nf_qc_cluster/tree/main ) pipelines initially developed in Anderson lab. +**nf-core/yascp** is a bioinformatics best-practice analysis pipeline tailored for deconvolution, quality control, clustering, and integration of single-cell datasets. Developed under the leadership of N.Soranzo and Human Genetics Informatics (HGI), this large-scale single-cell pipeline was originally crafted for the Cardinal project (profiling UKBB and ELGH participants) but is versatile enough for broader scRNA analysis applications. 
The foundational ideas were inspired by earlier pipelines from Anderson lab but has been expanded, specifically those for [deconvolution](https://github.com/wtsi-hgi/nf_scrna_deconvolution.git), [cellbender](https://github.com/wtsi-hgi/nf_cellbender), and [quality control and clustering](https://github.com/wtsi-hgi/nf_qc_cluster/tree/main). This ensures a robust integration of proven methodologies tailored to meet the demands of expansive single-cell data analysis. Input requires a tsv seperated file [(please read detailed documentation here)](https://github.com/wtsi-hgi/yascp/tree/yascp_docs) with paths and if running in an genotype additional input is required to be provided in an input.nf file pointing to the vcf location. This pipeline is designed to be used any large scale single cell experiments. diff --git a/assets/deploy_scripts/module_exacutables/help.info b/assets/deploy_scripts/module_exacutables/help.info index 3180a5c..6829668 100644 --- a/assets/deploy_scripts/module_exacutables/help.info +++ b/assets/deploy_scripts/module_exacutables/help.info @@ -3,6 +3,7 @@ ################### ${bold}YASCP (Yet Another Single Cell Pieline)${normal}[https://github.com/wtsi-hgi/yascp] Nextflow pipeline that QCs the scRNA Cellranger data by removing ambient RNA, deconvoluting donors, assigning celltypes, analysing concordances vs expected genotypes +Please refear to the usage and outputs documentation here: https://hgi-projects.pages.internal.sanger.ac.uk/documentation/docs/tutorials/yascp_docs/#getting-started or here: https://github.com/wtsi-hgi/yascp/tree/78df9d94388045bc386ea32c2127abeb154a36c6 ################### Yascp module has been set to run in multiple modes: diff --git a/bin/0026-filter_outlier_cells.py b/bin/0026-filter_outlier_cells.py index 19a4d3b..1b623b6 100755 --- a/bin/0026-filter_outlier_cells.py +++ b/bin/0026-filter_outlier_cells.py @@ -348,7 +348,8 @@ def main(): # gamma=0.1 ) elif method == 'MAD': - clf = onesidemad() + mad_thresh = [-5, -5, 5] + clf = onesidemad(mad_thresh) else: raise ValueError('ERROR: invalid method.') diff --git a/bin/DoubletDecon.R b/bin/DoubletDecon.R index 1e49897..238b72b 100755 --- a/bin/DoubletDecon.R +++ b/bin/DoubletDecon.R @@ -645,25 +645,49 @@ Main_Doublet_Decon<-function(rawDataFile, groupsFile, filename, location, fullDa } ## Run Doublet Decon ## -results <- Main_Doublet_Decon(rawDataFile = processed$newExpressionFile, - groupsFile = processed$newGroupsFile, - filename = "DoubletDecon_results", - location = paste0(args$out, "/"), - fullDataFile = NULL, - removeCC = args$removeCC, - species = args$species, - rhop = args$rhop, - write = TRUE, - PMF = args$pmf, - useFull = FALSE, - heatmap = args$heatmap, - centroids=args$centroids, - num_doubs=args$num_doubs, - only50=args$only50, - min_uniq=args$min_uniq, - nCores = args$nCores) +tryCatch({ + + results <- Main_Doublet_Decon(rawDataFile = processed$newExpressionFile, + groupsFile = processed$newGroupsFile, + filename = "DoubletDecon_results", + location = paste0(args$out, "/"), + fullDataFile = NULL, + removeCC = args$removeCC, + species = args$species, + rhop = args$rhop, + write = TRUE, + PMF = args$pmf, + useFull = FALSE, + heatmap = args$heatmap, + centroids=args$centroids, + num_doubs=args$num_doubs, + only50=args$only50, + min_uniq=args$min_uniq, + nCores = args$nCores) + +}, error = function(e) { + + results <- Main_Doublet_Decon(rawDataFile = processed$newExpressionFile, + groupsFile = processed$newGroupsFile, + filename = "DoubletDecon_results", + location = paste0(args$out, 
"/"), + fullDataFile = NULL, + removeCC = args$removeCC, + species = args$species, + rhop = 0.64, + write = TRUE, + PMF = args$pmf, + useFull = FALSE, + heatmap = args$heatmap, + centroids=args$centroids, + num_doubs=args$num_doubs, + only50=args$only50, + min_uniq=args$min_uniq, + nCores = args$nCores) +} +) doublets <- read.table(paste0(args$out, "/Final_doublets_groups_DoubletDecon_results.txt")) diff --git a/bin/azimuth.R b/bin/azimuth.R index 5e7933f..87e2d55 100755 --- a/bin/azimuth.R +++ b/bin/azimuth.R @@ -35,6 +35,7 @@ if (REFERENCE_DIR=='PBMC'){ } levels = args[3] +prefix = args[4] # levels = "celltype.l2,celltype.l1,celltype.l3" levels = unlist(x = strsplit(x = levels, split = ',', fixed = TRUE)) # reference files are expected in the following directory @@ -597,6 +598,7 @@ anchors <- FindTransferAnchors( for (celltype_level in levels) { id <- celltype_level + # id <- paste0(id,"_",prefix) print(id) refdata <- lapply(X = celltype_level, function(x) { reference$map[[x, drop = TRUE]] @@ -668,7 +670,7 @@ for (celltype_level in levels) { predicted.id.score <- paste0(predicted.id, ".score") # write a table of cell-type assignments, prediction and mapping scores: - fnam.table <- paste0(gsub(".", "_", predicted.id, fixed = TRUE),".tsv") + fnam.table <- paste0(prefix,"_",gsub(".", "_", predicted.id, fixed = TRUE),".tsv") data <- FetchData(object = query, vars = c(predicted.id, predicted.id.score, paste0("mapping.score.", id)), slot = "data") write.table(data, fnam.table, quote = FALSE, sep="\t") #gzip(fnam.table, overwrite = TRUE) @@ -695,7 +697,7 @@ for (celltype_level in levels) { p <- ggplot(tdf, aes(x=cell_type, y=count, fill = threshold)) p <- p + geom_col(position = "dodge") p <- p + theme(axis.text.x = element_text(angle = 90)) - ggsave(paste0(id,".ncells_by_type_barplot.pdf")) + ggsave(paste0(prefix,"_",id,".ncells_by_type_barplot.pdf")) # DimPlot of the reference #ref.plt <- DimPlot(object = reference$plot, reduction = "refUMAP", group.by = id, label = TRUE) + NoLegend() @@ -704,23 +706,23 @@ for (celltype_level in levels) { # DimPlot of the query, colored by predicted cell type DimPlot(object = query, reduction = "proj.umap", group.by = predicted.id, label = TRUE) + NoLegend() - ggsave(paste0(id,".query_umap.pdf")) + ggsave(paste0(prefix,"_",id,".query_umap.pdf")) # Plot the score for the predicted cell type of the query FeaturePlot(object = query, features = paste0(predicted.id, ".score"), reduction = "proj.umap") - ggsave(paste0(id,".prediction_score_umap.pdf")) + ggsave(paste0(prefix,"_",id,".prediction_score_umap.pdf")) VlnPlot(object = query, features = paste0(predicted.id, ".score"), group.by = predicted.id) + NoLegend() - ggsave(paste0(id,".prediction_score_vln.pdf")) + ggsave(paste0(prefix,"_",id,".prediction_score_vln.pdf")) # Plot the mapping score FeaturePlot(object = query, features = paste0("mapping.score.", id), reduction = "proj.umap") - ggsave(paste0(id,".mapping_score_umap.pdf")) + ggsave(paste0(prefix,"_",id,".mapping_score_umap.pdf")) VlnPlot(object = query, features = paste0("mapping.score.", id), group.by = predicted.id) + NoLegend() - ggsave(paste0(id,".mapping_score_vln.pdf")) + ggsave(paste0(prefix,"_",id,".mapping_score_vln.pdf")) } # save mapped data set #save(query, file = "azimuth.bin") - saveRDS(query, file = "azimuth.rds") + # saveRDS(query, file = paste0(prefix,"azimuth.rds")) # load("azimuth.bin" ) \ No newline at end of file diff --git a/conf/base.conf b/conf/base.conf index 6df1d62..249dc9f 100755 --- a/conf/base.conf +++ b/conf/base.conf @@ 
-249,6 +249,7 @@ process { withName: DOUBLET_DECON{ maxRetries = 3 errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } + memory = { 100.GB * task.attempt } } withName: AZIMUTH{ @@ -410,7 +411,7 @@ process { withName: CONCORDANCE_CALCLULATIONS{ cpus = 5 time = { 12.h * task.attempt } - memory = { 50.GB * task.attempt } + memory = { 70.GB * task.attempt } } withName: OTHER_DONOR_CONCORDANCE_CALCLULATIONS{ @@ -459,9 +460,11 @@ process { } withName: DOUBLET_FINDER{ - memory = { 70.GB * task.attempt } + memory = { 100.GB * task.attempt } } + + withName: GT_MATCH_POOL_AGAINST_PANEL{ time = { 24.h * task.attempt } } diff --git a/conf/qc.conf b/conf/qc.conf index 2a7873f..1e72efb 100755 --- a/conf/qc.conf +++ b/conf/qc.conf @@ -178,7 +178,8 @@ params{ each estimator. Only valid if method == IsolationForest.""" run_process = true method = 'IsolationForest' //# Available methods: ocalOutlierFactor, IsolationForest, EllipticEnvelope, OneClassSVM, onesidemad - metadata_columns = 'pct_counts_gene_group__mito_transcript,log1p_total_counts,log1p_n_genes_by_counts' + metadata_columns = 'log1p_total_counts,log1p_n_genes_by_counts,pct_counts_gene_group__mito_transcript' + mad_tresholds = '-5,-5,5' //# These one side MAD filters will be used if selected. outliers_fraction = 0.0 max_samples = 0.1 } diff --git a/modules/nf-core/modules/azimuth/main.nf b/modules/nf-core/modules/azimuth/main.nf index fd7237f..e50bcf9 100755 --- a/modules/nf-core/modules/azimuth/main.nf +++ b/modules/nf-core/modules/azimuth/main.nf @@ -22,13 +22,13 @@ process AZIMUTH{ input: val outdir_prev - path file_h5ad_batch + tuple val(samplename),path(file_h5ad_batch) each refset // path(mapping_file) output: // path(celltype_table, emit:predicted_celltypes) - tuple(val(outfil_prfx), val(refset.refset), path("predicted_*.tsv"),emit:celltype_tables_all) - path("predicted_*.tsv"), emit:predicted_celltype_labels + tuple(val(outfil_prfx), val(refset.refset), path("*predicted_*.tsv"),emit:celltype_tables_all) + path("*predicted_*.tsv"), emit:predicted_celltype_labels path "*ncells_by_type_barplot.pdf" path "*query_umap.pdf" path "*prediction_score_umap.pdf" @@ -46,7 +46,7 @@ process AZIMUTH{ //outfil_prfx = "${file_h5ad_batch}".minus(".h5ad") """ - azimuth.R ./${file_h5ad_batch} ${refset.refset} ${refset.annotation_labels} + azimuth.R ./${file_h5ad_batch} ${refset.refset} ${refset.annotation_labels} ${samplename} """ } diff --git a/subworkflows/celltype.nf b/subworkflows/celltype.nf index 5710f5f..7966b7a 100755 --- a/subworkflows/celltype.nf +++ b/subworkflows/celltype.nf @@ -42,7 +42,7 @@ workflow celltype{ // AZIMUTH if (params.celltype_assignment.run_azimuth){ - AZIMUTH(params.outdir,ch_batch_files,Channel.fromList( params.azimuth.celltype_refsets)) + AZIMUTH(params.outdir,az_ch_experiment_filth5,Channel.fromList( params.azimuth.celltype_refsets)) az_out = AZIMUTH.out.predicted_celltype_labels.collect() // REMAP_AZIMUTH(AZIMUTH.out.celltype_tables_all,params.mapping_file) // az_out = REMAP_AZIMUTH.out.predicted_celltype_labels.collect() @@ -72,7 +72,7 @@ workflow celltype{ } all_extra_fields = all_extra_fields.mix(sc_out) - CELLTYPE_FILE_MERGE(az_out,ct_out,all_extra_fields,SPLIT_BATCH_H5AD.out.keras_outfile) + CELLTYPE_FILE_MERGE(az_out,ct_out,all_extra_fields,SPLIT_BATCH_H5AD.out.keras_outfile.collect()) file__anndata_merged2=CELLTYPE_FILE_MERGE.out.file__anndata_merged2 file__anndata_merged2.view() From 70c53406cae2032908699ceb6b36f283a7efce02 Mon Sep 17 00:00:00 2001 From: Matiss Date: Thu, 25 Apr 2024 10:46:34 +0100 Subject: 
[PATCH 3/8] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4583e3f..40a68f3 100755 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ ## Introduction -**nf-core/yascp** is a bioinformatics best-practice analysis pipeline tailored for deconvolution, quality control, clustering, and integration of single-cell datasets. Developed under the leadership of N.Soranzo and Human Genetics Informatics (HGI), this large-scale single-cell pipeline was originally crafted for the Cardinal project (profiling UKBB and ELGH participants) but is versatile enough for broader scRNA analysis applications. The foundational ideas were inspired by earlier pipelines from Anderson lab but has been expanded, specifically those for [deconvolution](https://github.com/wtsi-hgi/nf_scrna_deconvolution.git), [cellbender](https://github.com/wtsi-hgi/nf_cellbender), and [quality control and clustering](https://github.com/wtsi-hgi/nf_qc_cluster/tree/main). This ensures a robust integration of proven methodologies tailored to meet the demands of expansive single-cell data analysis. +**nf-core/yascp** is a bioinformatics best-practice analysis pipeline tailored for deconvolution, quality control, clustering, and integration of single-cell datasets. Developed under the leadership of N.Soranzo and Human Genetics Informatics (HGI), this large-scale single-cell pipeline was originally crafted for the Cardinal project (profiling UKBB and ELGH participants) but is versatile enough for broader scRNA analysis applications and has had contributions from many other teams across Sanger. The foundational ideas were inspired by earlier pipelines from Anderson lab that have been significantly expanded and improved - specifically those for [deconvolution](https://github.com/wtsi-hgi/nf_scrna_deconvolution.git), [cellbender](https://github.com/wtsi-hgi/nf_cellbender), and [quality control and clustering](https://github.com/wtsi-hgi/nf_qc_cluster/tree/main). This ensures a robust integration of proven methodologies tailored to meet the demands of expansive single-cell data analysis. Input requires a tsv seperated file [(please read detailed documentation here)](https://github.com/wtsi-hgi/yascp/tree/yascp_docs) with paths and if running in an genotype additional input is required to be provided in an input.nf file pointing to the vcf location. This pipeline is designed to be used any large scale single cell experiments. @@ -71,7 +71,7 @@ Easyest to do is using a conda enviroment. ## Credits -Yascp was originally written by Matiss Ozols as part of the Cardinal project but is applicable to many other projects with contributions from Leland Taylor, Guillaume Noell, Hannes Ponstingl, Vivek Iyer, Henry Taylor, Tobi Alegbe, Monika Krzak, Alessandro Raveane, Carl Anderson, Anna Lorenc, Stephen Watt, Nicole Soranzo. +Yascp was originally written by Matiss Ozols as part of the Cardinal project but is applicable to many other projects with contributions from Leland Taylor, Guillaume Noell, Hannes Ponstingl, Vivek Iyer, Henry Taylor, Tobi Alegbe, Monika Krzak, Alessandro Raveane, Carl Anderson, Anna Lorenc, Haerin Jang, Niek de Klein, Stephen Watt, Nicole Soranzo. The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. 
The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! From 509a86d17c49992855ba8c1a6c12ea9a60028ca1 Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Tue, 30 Apr 2024 16:24:13 +0100 Subject: [PATCH 4/8] test data tested and works --- .../deploy_scripts/module_exacutables/yascp | 1 + bin/4.add_vdj.R | 13 +- bin/concordance_calculations.py | 2019 +++++++++-------- bin/gather_minimal_dataset.py | 3 + conf/base.conf | 258 +-- conf/cellbender.conf | 2 - conf/modules.conf | 246 ++ conf/test.conf | 3 + conf/test_full.conf | 3 + main.nf | 2 +- modules/nf-core/modules/citeseq/main.nf | 4 +- modules/nf-core/modules/doubletdecon/main.nf | 2 +- .../modules/prepere_yascp_inputs/main.nf | 1 + nextflow.config | 14 +- subworkflows/celltype.nf | 4 +- subworkflows/local/retrieve_recourses.nf | 18 + subworkflows/main_deconvolution.nf | 10 +- subworkflows/prepare_inputs.nf | 1 + workflows/yascp.nf | 2 +- 19 files changed, 1348 insertions(+), 1258 deletions(-) diff --git a/assets/deploy_scripts/module_exacutables/yascp b/assets/deploy_scripts/module_exacutables/yascp index 174b8ac..e555c03 100755 --- a/assets/deploy_scripts/module_exacutables/yascp +++ b/assets/deploy_scripts/module_exacutables/yascp @@ -5,6 +5,7 @@ bold=$(tput bold) normal=$(tput sgr0) + if [[ "$QUEUE" == '' ]]; then export QUEUE='long' diff --git a/bin/4.add_vdj.R b/bin/4.add_vdj.R index 4a0ed4d..0af69d2 100755 --- a/bin/4.add_vdj.R +++ b/bin/4.add_vdj.R @@ -34,8 +34,7 @@ data_dir <- './' args = commandArgs(trailingOnly=TRUE) wnn_integrated_file = args[1] -n2 = strsplit(as.character(wnn_integrated_file), split="__all_samples")[[1]][1] -outname = paste0(n2,'__all_samples_integrated.vdj.RDS') + myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral"))) @@ -155,8 +154,14 @@ vdj_dir <- paste0('./vdj/') dir.create(vdj_dir,showWarnings = F) names(immdata_TCR$data) <- tcr_names names(immdata_BCR$data) <- bcr_names -saveRDS(immdata_BCR, file=paste0(vdj_dir,'/slemap_BCR.rds')) -saveRDS(immdata_TCR, paste0(vdj_dir,'/slemap_TCR.rds')) + +n2 = strsplit(as.character(wnn_integrated_file), split="__all_samples")[[1]][1] +outname = paste0(n2,'__all_samples_integrated.vdj.RDS') +outname_BCR = paste0(n2,'__all_samples_integrated.BCR.RDS') +outname_TCR = paste0(n2,'__all_samples_integrated.TCR.RDS') + +saveRDS(immdata_BCR, file=outname_BCR) +saveRDS(immdata_TCR, file=outname_TCR) saveRDS(all_samples, file=outname) diff --git a/bin/concordance_calculations.py b/bin/concordance_calculations.py index 0aac3a4..5b61c83 100755 --- a/bin/concordance_calculations.py +++ b/bin/concordance_calculations.py @@ -35,8 +35,11 @@ mode = 'no_het' #We can not determine whether hets are concordant or discordant in terms of reads. 
-class Concordances: + + +class Concordances: + def reset(self): self.cell_concordance_table ={} @@ -55,1069 +58,1095 @@ def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_ self.uninformative_sites = uninformative_sites self.record_dict={} - def norm_genotypes(self,expected_vars): - expected_vars = pd.DataFrame(expected_vars) - if len(expected_vars) > 0: - split_str=expected_vars[0].str.split("_") - expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] - expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] - expected_vars['vars'] = split_str.str[4] - expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) - expected_vars = expected_vars[expected_vars['vars']!='./.'] - expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' - expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] - return expected_vars - - - - def get_strict_discordance(self, expected_vars, cell_vars): - ''' - take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant - sites and relaxed concordant sites and list of discordant sites1 - 1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype - 2) if you have a 0/0 you can not get a 1/1 or 0/1 - 3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 1/1 - So - each obversed cellsnp allele must be in the array SNP gtype - ''' - snp_gtypes = expected_vars[0] - cellsnp_gtypes = cell_vars[0] - true_discordant = 0 - relaxed_concordant = 0 - relaxed_concordant_informative = 0 - relaxed_concordant_informative_ids = [] - relaxed_concordant_uninformative_ids = [] - true_discordant_uninformative_ids =[] - true_discordant_informative_ids=[] - relaxed_concordant_uninformative = 0 - true_discordant_informative = 0 - true_discordant_uninformative = 0 - discordant_vars = [] - concordant_vars = [] - subset_informative_concordant = 0 - subset_informative_discordant = 0 - - #print(self.uninformative_sites) - #print(self.informative_sites) + def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match, donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data,donor_assignments_table): - #create sets of the ids (chrom, pos, ref, alt) in each set of genotypes. Filter to the ids present in both - #then filter to informative and uninformative. 
If uninformative >0 then create a subset of informative - # with the same number of vars (at random) - split_snp_gts=snp_gtypes.str.split("_") - snp_gtypes_ids = set(split_snp_gts.str[0]+'_'+split_snp_gts.str[1]+'_'+split_snp_gts.str[2]+'_'+split_snp_gts.str[3]) + Concordant_Sites, \ + Discordant_sites, \ + Total_Overlapping_sites, \ + discordant_sites, \ + cell_vars_norm, \ + true_discordant_count, \ + relaxed_concordant_count, \ + relaxed_concordant_informative_count, \ + relaxed_concordant_uninformative_count, \ + true_discordant_informative_count, \ + true_discordant_uninformative_count, \ + total_sites, \ + informative_sites, \ + uninformative_sites, \ + total_reads, \ + total_reads_informative, \ + total_reads_uninformative, \ + discordant_reads, \ + discordant_reads_informative, \ + discordant_reads_uninformative, \ + discordant_vars, \ + concordant_vars, \ + discordant_read_fraction_in_concordant_sites, \ + discordant_read_fraction_in_discordant_sites, \ + discordant_reads_uninformative_fraction, \ + discordant_reads_informative_fraction, discordant_reads_informative_subset = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) + + total_concordant_sites = len(Concordant_Sites) + relaxed_concordant_count + dds = self.donor_distinct_sites[donor_gt_match] + Nr_donor_distinct_sites = len(dds) + Nr_Concordant = len(Concordant_Sites) + Nr_Relaxed_concordant = relaxed_concordant_count + Nr_Discordant = len(Discordant_sites) + Nr_Total_Overlapping_sites = len(Total_Overlapping_sites) + Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Discordant_sites))) + Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) + #Quantify donor variation in other donors + discordant_vars_in_pool = [] + donor_table_of_concordances = [] + total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown = {} + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort']={} + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort']={} + + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown_totalReads = {} + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown_totalReads['Whithin_Cohort']={} + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown_totalReads['Out_of_Cohort']={} + + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown = {} + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort']={} + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort']={} + + + informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() + total_cordant_sites_that_are_concordant_with_other_donors_in_pool = set() + donor_gt_match_cohort = donor_cohorts[donor_gt_match] + donors_contributing_to_out_of_cohort= [] + donors_contributing_to_Whithin_Cohort=[] + sites_contributing_to_out_of_cohort= [] + sites_contributing_to_Whithin_Cohort=[] + for donor in set(donor_assignments_table['donor_gt']): + try: + expected_vars_norm_of_other_donor = all_donor_data[donor] + except: + continue + try: + donor_cohort = donor_cohorts[donor] + donor_vars = vars_per_donor_gt[donor] + except: + continue + + Concordant_Sites_otherDonor, \ + Discordant_sites_otherDonor, 
\ + Total_Overlapping_sites_otherDonor, \ + discordant_sites_otherDonor, \ + cell_vars_norm_otherDonor, \ + true_discordant_count_otherDonor, \ + relaxed_concordant_count_otherDonor, \ + relaxed_concordant_informative_count_otherDonor, \ + relaxed_concordant_uninformative_count_otherDonor, \ + true_discordant_informative_count_otherDonor, \ + true_discordant_uninformative_count_otherDonor, \ + total_sites_otherDonor, \ + informative_sites_otherDonor, \ + uninformative_sites_otherDonor, \ + total_reads_otherDonor, \ + total_reads_informative_otherDonor, \ + total_reads_uninformative_otherDonor, \ + discordant_reads_otherDonor, \ + discordant_reads_informative_otherDonor, \ + discordant_reads_uninformative_otherDonor, \ + discordant_vars_otherDonor, \ + concordant_vars_otherDonor, \ + discordant_read_fraction_in_concordant_sites_otherDonor, \ + discordant_read_fraction_in_discordant_sites_otherDonor, \ + discordant_reads_uninformative_fraction_otherDonor, \ + discordant_reads_informative_fraction_otherDonor, discordant_reads_informative_subset_otherDonor = self.retrieve_concordant_discordant_sites(expected_vars_norm_of_other_donor,cell_vars,donor_cohort=donor_cohort) + + if total_sites_otherDonor==0: + total_sites_otherDonor=0.000000000001 + total_concordant_sites_otherDonor = relaxed_concordant_count_otherDonor + concordant_percent_in_other_donor= total_concordant_sites_otherDonor/total_sites_otherDonor*100 + discordant_percent_in_other_donor= true_discordant_count_otherDonor/total_sites_otherDonor*100 + DonorDiscordant_Sites_that_are_atributed_to_other_donor = set(discordant_vars).intersection(set(concordant_vars_otherDonor)) + TotalSites_that_are_atributed_to_other_donor = set(discordant_vars).union(set(concordant_vars)).intersection(set(concordant_vars_otherDonor)) - split_cellsnp_gts=cellsnp_gtypes.str.split("_") - cellsnp_gtypes_ids = set(split_cellsnp_gts.str[0]+'_'+split_cellsnp_gts.str[1]+'_'+split_cellsnp_gts.str[2]+'_'+split_cellsnp_gts.str[3]) + Informative__DonorDiscordant_Sites_that_are_atributed_to_other_donor = set(true_discordant_informative_count).intersection(set(relaxed_concordant_informative_count_otherDonor)) + DonorCordant_Sites_that_are_atributed_to_other_donor = set(concordant_vars).intersection(set(concordant_vars_otherDonor)) + + total_reads_for_discordant_sites_that_are_concordant_with_other_donor,discordant_reads_for_discordant_sites_that_are_concordant_with_other_donor,concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor,\ + _,_,_= self.read_extraction(DonorDiscordant_Sites_that_are_atributed_to_other_donor,expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) - shared_gts = snp_gtypes_ids.intersection(cellsnp_gtypes_ids) + if not donor == donor_gt_match: + # We want to kow how many of these discordant site + if 'U937' in donor: + continue + if 'THP1' in donor: + continue + + if donor_gt_match_cohort == donor_cohort: + coh = 'Whithin_Cohort' + sites_contributing_to_Whithin_Cohort.extend(list(TotalSites_that_are_atributed_to_other_donor)) + if len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)>0: + donors_contributing_to_Whithin_Cohort.append(donor) + + else: + coh = 'Out_of_Cohort' + sites_contributing_to_out_of_cohort.extend(list(TotalSites_that_are_atributed_to_other_donor)) + if len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)>0: + donors_contributing_to_out_of_cohort.append(donor) + + + total_discordant_sites_that_are_concordant_with_other_donors_in_pool = 
total_discordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(DonorDiscordant_Sites_that_are_atributed_to_other_donor)) + # now we addit for a cohort since the biggest issue comes from cohort cross-contamination + # for each of these sites now we calculate the number of reads that it accounts: + # tree level set: cohort: site: counts + for site in DonorDiscordant_Sites_that_are_atributed_to_other_donor: + total_reads_for_site,discordant_reads_for_site,concordant_for_site,\ + total_reads_noHet,discordant_reads_noHet,concordant_reads_noHet= self.read_extraction([site],expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) + if concordant_for_site==0: + pass + try: + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_reads_noHet) - shared_informative = shared_gts.intersection(self.informative_sites) - shared_uninformative = shared_gts.intersection(self.uninformative_sites) - # print("shared informative " + str(len(shared_informative))) - # print("shared uninformative " + str(len(shared_uninformative))) + except: + try: + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_reads_noHet) + + except: + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh]={} + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_reads_noHet) - #store the numbers of informative and uninformative sites shared between cellSNP and gt data as these - #are the sites used for concordance - self.informative_covered = len(shared_informative) - self.uninformative_covered = len(shared_uninformative) + for site in DonorDiscordant_Sites_that_are_atributed_to_other_donor: + total_reads_for_site,discordant_reads_for_site,concordant_for_site,\ + _,_,_= self.read_extraction([site],expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) + # if discordant_reads_for_site>0: + # print('here') + if concordant_for_site==0: + pass + try: + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site) + except: + try: + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site) + + except: + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh]={} + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site) + + # to get the total reads that can be atributed to the other donor i have to check if site is already covered in the total_discordant_sites_that_are_concordant_with_other_donors_in_pool. + # the ones that havent, i have to add the reads up for them. 
+ informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(Informative__DonorDiscordant_Sites_that_are_atributed_to_other_donor)) + + total_cordant_sites_that_are_concordant_with_other_donors_in_pool = total_cordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(DonorCordant_Sites_that_are_atributed_to_other_donor)) + - if len(shared_uninformative) > 0: - #print(len(shared_uninformative)) - # print(len(shared_informative)) - if len(shared_uninformative) <= len(shared_informative): - informative_subset = set(random.sample(shared_informative, len(shared_uninformative))) + common_vars = list(set(discordant_vars) & set(donor_vars)) + common_var_count = str(len(common_vars)) + donor_cohort_common = donor + ":" + donor_cohort + ":" + common_var_count + discordant_vars_in_pool.append(donor_cohort_common) + + # Here we want to calculate the number of discordant sites in other donors and see if in terms of concordance the same donor is picked as per GT assignment. + # We do this to investigate the potential of a cell coming from this other donor. else: - informative_subset = set()#if there are more shared uninformative than shared informative we will not subset - # print(informative_subset) - # exit(0) - else: - informative_subset = set() - - # print(informative_subset) - self.informative_subset = informative_subset - - snp_gtypes_set = set(snp_gtypes) - snp_gtypes_set = sorted(snp_gtypes_set) - - cellsnp_gtypes_set = set(cellsnp_gtypes) - cellsnp_gtypes_set = sorted(cellsnp_gtypes_set) - - #for i in range(0, len(snp_gtypes)): - # cellsnp_gtypes_str = pd.DataFrame(cellsnp_gtypes_set,columns=['cellsnp_gtypes']) - # cellsnp_gtypes_str['snp_gtypes'] = pd.DataFrame(snp_gtypes_set,columns=['snp_gtypes'])['snp_gtypes'] - for i in range(0, len(snp_gtypes_set)): - discordant = False - # snp_data = snp_gtypes[i].split('_') - # cellsnp_data = cellsnp_gtypes[i].split('_') - snp_data = snp_gtypes_set[i].split('_') - cellsnp_data = cellsnp_gtypes_set[i].split('_') - - snp_alleles_set = set([snp_data[4][0], snp_data[4][2]]) - cellsnp_alleles_set = set([cellsnp_data[4][0], cellsnp_data[4][2]]) - - snp_var = ('_').join(snp_data[0:4]) - cellsnp_var = ('_').join(cellsnp_data[0:4]) - - if not cellsnp_var == snp_var: - print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i]) - exit(1) - else: - for allele in cellsnp_alleles_set: - if not allele in snp_alleles_set:#if a cellSNP allele is found that is not in the array data this is discordant - discordant = True + _='check' + try: + percent_relaxed_concordant = Nr_Relaxed_concordant/(Nr_Relaxed_concordant+true_discordant_count)*100 + except: + percent_relaxed_concordant = 0 + # g2 = total_concordant_sites_otherDonor/total_sites_otherDonor*100 + if not (percent_relaxed_concordant == concordant_percent_in_other_donor): + _='something not right' + break - if discordant == True: - true_discordant+=1 - discordant_vars.append(cellsnp_var) - if snp_var in self.uninformative_sites: - true_discordant_uninformative+=1 - true_discordant_uninformative_ids.append(snp_var) - elif snp_var in self.informative_sites: - true_discordant_informative+=1 - true_discordant_informative_ids.append(snp_var) - else: - relaxed_concordant+=1 - concordant_vars.append(cellsnp_var) - if snp_var in self.uninformative_sites: - relaxed_concordant_uninformative+=1 - relaxed_concordant_uninformative_ids.append(snp_var) - elif snp_var in 
self.informative_sites: - relaxed_concordant_informative+=1 - relaxed_concordant_informative_ids.append(snp_var) - if len(shared_uninformative) > 0: - if snp_var in informative_subset: - if discordant == True: - subset_informative_discordant+=1 - else: - subset_informative_concordant+=1 - # true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count - cell_vars2 = cell_vars.set_index('ids') - disc = pd.DataFrame(set(cell_vars2.loc[discordant_vars]['combo']),columns=['combo_x']) - df_cd = pd.merge(cell_vars, expected_vars, how='inner', on = 'pos') - disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') - disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] - disc_sites_string = ';'.join(disc2['expected_retrieved']) - return true_discordant, relaxed_concordant, relaxed_concordant_informative_ids, relaxed_concordant_uninformative_ids, true_discordant_informative_ids, true_discordant_uninformative_ids, discordant_vars, concordant_vars, disc_sites_string + donor_table_of_concordances.append({'donor':donor, 'cell':cell1, 'donor_cohort':donor_cohort, \ + 'gt matched donor':donor == donor_gt_match, \ + 'DonorCordant_Sites_that_are_atributed_to_other_donor':len(DonorCordant_Sites_that_are_atributed_to_other_donor), \ + 'DonorCordant_Sites_that_are_atributed_to_other_donor/total':f"{len(DonorCordant_Sites_that_are_atributed_to_other_donor)}/{len(concordant_vars)}", \ + 'DonorDiscordant_Sites_that_are_atributed_to_other_donor':len(DonorDiscordant_Sites_that_are_atributed_to_other_donor), \ + 'DonorDiscordant_Sites_that_are_atributed_to_other_donor/total':f"{len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)}/{len(discordant_vars)}", \ + 'concordant_percent_in_other_donor':concordant_percent_in_other_donor, \ + 'discordant_percent_in_other_donor':discordant_percent_in_other_donor, \ + 'discordant_reads_otherDonor':discordant_reads_otherDonor, \ + 'discordant_sites_otherDonor':len(discordant_vars_otherDonor), \ + 'concordant_sites_otherDonor':len(concordant_vars_otherDonor), \ + 'total_sites_otherDonor':total_sites_otherDonor, \ + 'discordant_reads_otherDonor':discordant_reads_otherDonor, \ + 'total_reads_otherDonor':total_reads_otherDonor, \ + # 'discordant_read_fraction_in_concordant_sites_otherDonor':discordant_read_fraction_in_concordant_sites_otherDonor, \ + # 'discordant_read_fraction_in_discordant_sites_otherDonor':discordant_read_fraction_in_discordant_sites_otherDonor, \ + 'concordant_reads_For_discordant_sites_that_are_Concordant_with_other_donor':concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor + }) + + + #here now we want to see overall how many reads potentially come from different cohorts. 
+ # cohort_specific_site_quant_string="" + # cohort_specific_read_quant_string="" + + Whithin_Cohort__total_number_of_potential_contaminent_reads=0 + Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool=0 + Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool=0 + try: + Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool = len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) + # for k1 in total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys(): + # Whithin_Cohort__total_number_of_potential_contaminent_reads+= max(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'][k1]) + except: + _='Doesnt Exist' + + Out_of_Cohort__total_number_of_potential_contaminent_reads=0 + try: + Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool = len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) + # for k1 in total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys(): + # Out_of_Cohort__total_number_of_potential_contaminent_reads+= max(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'][k1]) + except: + _='Doesnt Exist' - def read_concordance_calc(self,expected_vars,cell_vars): - - # This is a wrapper to add up the discordant reads in the cellsnp file. + + + + + + + + # No het total calcs + + # Since we can not tell just from the cellsnp file if the discordant read at concordant site is actually concordant with another donor in a pool + # we assume the worse case scenario and add idscordant reads at concordant sites to the total. 
+ # Number of reads on the sites that are concordant, but has discordant reads with another donor + Whithin_Cohort__total_number_of_potential_contaminent_reads=0 + try: + concordant_sites_atributed_to_other_donor = list(set(sites_contributing_to_Whithin_Cohort)) + if not total_sites >= len(concordant_sites_atributed_to_other_donor): + raise Exception("Sorry, the number of sites atributed can not be larger than total sites") - # expected genotype 0/0 - expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] - hom_ref_sites = set(expected_hom_ref['ids']) - cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)] - ad_hom_ref = cell_vars2['AD'].sum() - oth_hom_ref = cell_vars2['OTH'].sum() - discordant_hom_ref = ad_hom_ref + oth_hom_ref + _,_,_,\ + _,Whithin_Cohort__total_number_of_potential_contaminent_reads, \ + _= self.read_extraction(concordant_sites_atributed_to_other_donor,expected_vars_norm,cell_vars_norm) - # expected genotype 0/1 or 1/0 - hets = ['0/1', '1/0'] - expected_het = expected_vars[expected_vars['vars'].isin(hets)] - het_sites = set(expected_het['ids']) - cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] - discordant_het = cell_vars3['OTH'].sum() + if not discordant_reads >= len(Whithin_Cohort__total_number_of_potential_contaminent_reads): + raise Exception("Total discordant reads can not be less than within cohort discordant reads.") - # expected genotype 1/1 - expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] - hom_alt_sites = set(expected_hom_alt['ids']) - cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] - # DP + OTH - AD - ad_hom_alt = cell_vars4['AD'].sum() - dp_hom_alt = cell_vars4['DP'].sum() - oth_hom_alt = cell_vars4['OTH'].sum() - discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt - # Total analysis - discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt - total_dp = cell_vars['DP'].sum() - total_oth = cell_vars['OTH'].sum() - total_reads = total_dp + total_oth - discordantRead_ratio = discordant_reads/total_reads + except: + _='Doesnt Exist' + + + Out_of_Cohort__total_number_of_potential_contaminent_reads=0 + try: + concordant_sites_atributed_to_other_donor = list(set(sites_contributing_to_out_of_cohort)) + if not total_sites >= len(concordant_sites_atributed_to_other_donor): + raise Exception("Sorry, the number of sites atributed can not be larger than total sites") - # No het analysis - they are not discordant or concordant - we cannot make any call at those sites - discordant_reads_noHet = discordant_hom_ref + discordant_hom_alt - total_reads_noHet = total_reads - discordant_het - discordantRead_ratio_noHet = discordant_reads_noHet/total_reads_noHet + _,_,_,_,Out_of_Cohort__total_number_of_potential_contaminent_reads, _= self.read_extraction(concordant_sites_atributed_to_other_donor,expected_vars_norm,cell_vars_norm) - return total_reads,total_dp,total_oth,discordant_reads,discordantRead_ratio,discordant_reads_noHet,total_reads_noHet,discordantRead_ratio_noHet - - def read_condordance(self, expected_vars, cell_vars,discordant_vars, concordant_vars): - ''' - get read level concordance using DP, AD and OTH format fields - ##FORMAT= - ##FORMAT= - ##FORMAT= - ''' - if not len(expected_vars) == len(cell_vars): - print("length mismatch between expected vars and cell vars") - exit(1) + if not discordant_reads >= len(Out_of_Cohort__total_number_of_potential_contaminent_reads): + raise Exception("Total discordant reads can not be less than out of cohort discordant reads.") - total_sites = 
len(expected_vars) - #add cols for DP, AD< OTH - cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) - cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) - cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) + except: + _='Doesnt Exist' + + + try: + Out_of_Cohort__sites = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) + except: + Out_of_Cohort__sites = set() + + try: + Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) - set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) + shared_sites_between_out_of_cohort_and_within_cohort = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()).intersection(set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) ) + except: + Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() + shared_sites_between_out_of_cohort_and_within_cohort = set() + + try: + Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) - set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) + except: + Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) + Out_of_Cohort__unique_sites_total_reads,_,_,_,_,_ = self.read_extraction(Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool,expected_vars_norm,cell_vars_norm) + + try: + Whithin_Cohort__sites = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) + except: + Whithin_Cohort__sites = set() + + + + Total__discordant_sites_that_are_concordant_with_other_donors_in_pool = Whithin_Cohort__sites.union(Out_of_Cohort__sites) + concordant_vars_in_pool_str = (";").join(concordant_vars) + DF = pd.DataFrame(donor_table_of_concordances) - # Total - total_reads,total_dp,total_oth,discordant_reads, \ - discordantRead_ratio,discordant_reads_noHet,total_reads_noHet,discordantRead_ratio_noHet = self.read_concordance_calc(expected_vars,cell_vars) - - # uninformative - cell_vars_uninformative = cell_vars[cell_vars['ids'].isin(self.uninformative_sites)] - total_reads_uninformative,total_dp_uninformative,total_oth_uninformative,discordant_reads_uninformative, \ - discordantRead_ratio_uninformative,discordant_reads_uninformative_noHet,total_reads_uninformative_noHet,discordantRead_ratio_uninformative_noHet= self.read_concordance_calc(expected_vars,cell_vars_uninformative) - - # informative - cell_vars_informative = cell_vars[cell_vars['ids'].isin(self.informative_sites)] - total_reads_informative,total_dp_informative,total_oth_informative,discordant_reads_informative, \ - discordantRead_ratio_informative,discordant_reads_informative_noHet,total_reads_informative_noHet,discordantRead_ratio_informative_noHet = self.read_concordance_calc(expected_vars,cell_vars_informative) - - # Split into concordant and 
discordant sites - # concordant - concordant_sites = cell_vars[cell_vars['ids'].isin(set(concordant_vars))] - total_reads_for_concordant_sites,total_dp_for_concordant_sites,total_oth_for_concordant_sites,discordant_reads_for_concordant_sites, \ - discordantRead_ratio_for_concordant_sites,discordant_reads_for_concordant_sites_noHet,total_reads_for_concordant_sites_noHet,discordantRead_ratio_for_concordant_sites_noHet = self.read_concordance_calc(expected_vars,concordant_sites) + Donor_With_Lowest_DisConcordance = ';'.join(DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['donor'].values) + Lowest_Disconcordance_value_in_all_donors= DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['discordant_percent_in_other_donor'].values[0] - # discordant - discordant_sites = cell_vars[cell_vars['ids'].isin(set(discordant_vars))] - total_reads_for_discconcordant_sites,total_dp_for_discconcordant_sites,total_oth_for_discconcordant_sites,discordant_reads_for_discconcordant_sites, \ - discordantRead_ratio_for_discconcordant_sites,discordant_reads_for_discconcordant_sites_noHet,total_reads_for_discconcordant_sites_noHet,discordantRead_ratio_for_discconcordant_sites_noHet = self.read_concordance_calc(expected_vars,discordant_sites) - - # Subset analysis - cell_vars_informative_subset = cell_vars[cell_vars['ids'].isin(self.informative_subset)] - total_reads_informative_subset,total_dp_informative_subset,total_oth_informative_subset,discordant_reads_informative_subset, \ - discordantRead_ratio_informative_subset,discordant_reads_informative_subset_noHet,total_reads_informative_subset_noHet,discordantRead_ratio_informative_subset_noHet = self.read_concordance_calc(expected_vars,cell_vars_informative_subset) - - if mode == 'no_het': - total_reads = total_reads_noHet - discordant_reads = discordant_reads_noHet - total_reads_informative = total_reads_informative_noHet - discordant_reads_informative = discordant_reads_informative_noHet - total_reads_uninformative =total_reads_uninformative_noHet - discordant_reads_uninformative = discordant_reads_uninformative_noHet - total_reads_informative_subset = total_reads_informative_subset_noHet - discordant_reads_informative_subset = discordant_reads_informative_subset_noHet - total_reads_for_concordant_sites = total_reads_for_concordant_sites_noHet - discordant_reads_for_concordant_sites = discordant_reads_for_concordant_sites_noHet - total_reads_for_discconcordant_sites = total_reads_for_discconcordant_sites_noHet - discordant_reads_for_discconcordant_sites = discordant_reads_for_discconcordant_sites_noHet - - return total_sites, \ - self.informative_covered, \ - self.uninformative_covered, \ - total_reads, \ - discordant_reads, \ - total_reads_informative, \ - discordant_reads_informative, \ - total_reads_uninformative, \ - discordant_reads_uninformative, \ - total_reads_informative_subset, \ - discordant_reads_informative_subset, \ - total_reads_for_concordant_sites, \ - discordant_reads_for_concordant_sites, \ - total_reads_for_discconcordant_sites, \ - discordant_reads_for_discconcordant_sites - - - - def get_discordance(self,expected_vars2,cell_vars2): - Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) - Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) - disc = pd.DataFrame(Discordant_sites,columns=['combo_x']) - df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') - disc2= pd.merge(disc, df_cd, how='inner', on 
= 'combo_x') - disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] - disc_sites = ';'.join(disc2['expected_retrieved']) - return Concordant_Sites,Discordant_sites,disc_sites - - - def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars,donor_cohort=False): - # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. - # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. - # Author: M.Ozols - - cell_vars_norm = self.norm_genotypes(cell_vars) - - if len(cell_vars_norm) > 0: - - if mode == 'no_het': - hets = ['0/1', '1/0'] - expected_vars_norm = expected_vars_norm[~expected_vars_norm['vars'].isin(hets)] + Donor_With_Highest_Concordance = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['donor'].values) + Highest_Concordance_value_in_all_donors= DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['concordant_percent_in_other_donor'].values[0] + Total_sites_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_sites_otherDonor'].astype(str).values) + Total_reads_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_reads_otherDonor'].astype(str).values) + + return [{ + 'cell1':cell1, + 'donor_gt_match':donor_gt_match, + 'Nr_Concordant':Nr_Concordant, + 'Nr_Discordant':Nr_Discordant, + 'Nr_Relaxed_concordant':Nr_Relaxed_concordant, + 'true_discordant_count':true_discordant_count, + 'relaxed_concordant_informative_count':relaxed_concordant_informative_count, + 'relaxed_concordant_uninformative_count':relaxed_concordant_uninformative_count, + 'true_discordant_informative_count':true_discordant_informative_count, + 'true_discordant_uninformative_count':true_discordant_uninformative_count, + 'Nr_Total_Overlapping_sites':Nr_Total_Overlapping_sites, + 'Number_of_sites_that_are_donor_concordant_and_exclusive':Number_of_sites_that_are_donor_concordant_and_exclusive, + 'Nr_donor_distinct_sites':Nr_donor_distinct_sites, + 'count':count, + 'discordant_sites':discordant_sites, + 'total_sites':total_sites, + 'informative_sites':informative_sites, + 'uninformative_sites':uninformative_sites, + 'total_reads_informative':total_reads_informative, + 'total_reads_uninformative':total_reads_uninformative, + 'discordant_reads_informative':discordant_reads_informative, + 'discordant_reads_uninformative':discordant_reads_uninformative, + 'Discordant_sites_in_pool': discordant_vars, + 'Lowest_Disconcordance_value_in_all_donors':Lowest_Disconcordance_value_in_all_donors, + 'Donor_With_Lowest_DisConcordance':Donor_With_Lowest_DisConcordance, + 'Concordant_Site_Identities':concordant_vars_in_pool_str, + 'Donor_With_Highest_Concordance':Donor_With_Highest_Concordance, + 'Highest_Concordance_value_in_all_donors':Highest_Concordance_value_in_all_donors, + 'Total_sites_other_donor':Total_sites_other_donor, + 'Total_reads_other_donor':Total_reads_other_donor, + 'total_discordant_sites_that_are_concordant_with_other_donors_in_pool':f"{len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool)}/{len(discordant_vars)}", + 
'informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool':f"{len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool)}/{len(true_discordant_informative_count)}", + 'total_reads':total_reads, + 'discordant_reads':discordant_reads, + 'discordant_reads_informative_subset':discordant_reads_informative_subset, \ + 'discordant_read_fraction_in_concordant_sites':discordant_read_fraction_in_concordant_sites, \ + 'discordant_read_fraction_in_discordant_sites':discordant_read_fraction_in_discordant_sites, \ + 'Whithin_Cohort__total_number_of_potential_contaminent_reads':Whithin_Cohort__total_number_of_potential_contaminent_reads, \ + 'Out_of_Cohort__total_number_of_potential_contaminent_reads':Out_of_Cohort__total_number_of_potential_contaminent_reads, \ + 'NrDonors_contributing_to_out_of_cohort':len(set(donors_contributing_to_out_of_cohort)), \ + 'NrDonors_contributing_to_Whithin_Cohort':len(set(donors_contributing_to_Whithin_Cohort)), \ - Total_Overlapping_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) - expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] - cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] - cell_vars2 = cell_vars2.drop_duplicates(subset=['ids']) - expected_vars2 = expected_vars2.drop_duplicates(subset=['ids']) - if len(cell_vars2)!=len(expected_vars2): - print('failed to select comon variants') - exit(1) - - # Find exact discordant sites - Concordant_Sites, Discordant_sites, _ = self.get_discordance(expected_vars2, cell_vars2) - #find truly discordant sites - true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, discordant_vars, concordant_vars, disc_sites_string = self.get_strict_discordance(expected_vars2, cell_vars2) - #find discordant reads - total_sites, \ - informative_sites, \ - uninformative_sites, \ - total_reads, \ - discordant_reads, \ - total_reads_informative, \ - discordant_reads_informative, \ - total_reads_uninformative, \ - discordant_reads_uninformative, \ - total_reads_informative_subset, \ - discordant_reads_informative_subset, \ - total_reads_for_concordant_sites, \ - discordant_reads_for_concordant_sites, \ - total_reads_for_discconcordant_sites, \ - discordant_reads_for_discconcordant_sites = self.read_condordance(expected_vars2, cell_vars2, discordant_vars, concordant_vars) - - discordant_read_fraction_in_concordant_sites = f"{discordant_reads_for_concordant_sites}/{total_reads_for_concordant_sites}" - discordant_read_fraction_in_discordant_sites = f"{discordant_reads_for_discconcordant_sites}/{total_reads_for_discconcordant_sites}" - discordant_reads_uninformative_fraction = f"{discordant_reads_uninformative}/{total_reads_uninformative}" - discordant_reads_informative_fraction = f"{discordant_reads_informative}/{total_reads_informative}" - - # sanity checks - if total_reads != total_reads_for_concordant_sites+total_reads_for_discconcordant_sites: - print("Error: total reads dont add up ") - exit(1) - if discordant_reads != discordant_reads_for_concordant_sites+discordant_reads_for_discconcordant_sites: - print("Error: discordant reads dont add up ") - exit(1) - - - else: - Total_Overlapping_sites = set() - Concordant_Sites = set() - Discordant_sites = set() - disc_sites = '' - true_discordant_count = 0 - relaxed_concordant_count = 0 - total_sites = 0 - 
relaxed_concordant_informative_count = 0 - relaxed_concordant_uninformative_count = 0 - true_discordant_informative_count = 0 - true_discordant_uninformative_count = 0 - informative_sites = 0 - disc_sites_string = '' - discordant_reads = 0 - uninformative_sites = 0 - total_reads = 0 - total_reads_informative =0 - total_reads_uninformative=0 - discordant_reads=0 - discordant_reads_informative=0 - discordant_reads_uninformative=0 - discordant_vars=0 - concordant_vars=0 - discordant_read_fraction_in_concordant_sites=0 - discordant_read_fraction_in_discordant_sites=0 - discordant_reads_uninformative_fraction=0 - discordant_reads_informative_fraction=0 - discordant_reads_informative_subset=0 - - return Concordant_Sites, \ - Discordant_sites, \ - Total_Overlapping_sites, \ - disc_sites_string, \ - cell_vars_norm, \ - true_discordant_count, \ - relaxed_concordant_count, \ - relaxed_concordant_informative_count, \ - relaxed_concordant_uninformative_count, \ - true_discordant_informative_count, \ - true_discordant_uninformative_count, \ - total_sites, \ - informative_sites, \ - uninformative_sites, \ - total_reads, \ - total_reads_informative, \ - total_reads_uninformative, \ - discordant_reads, \ - discordant_reads_informative, \ - discordant_reads_uninformative, \ - discordant_vars, \ - concordant_vars, \ - discordant_read_fraction_in_concordant_sites, \ - discordant_read_fraction_in_discordant_sites, \ - discordant_reads_uninformative_fraction, \ - discordant_reads_informative_fraction, \ - discordant_reads_informative_subset - + 'Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool, \ + 'Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool, \ + 'shared_sites_between_Out_of_Cohort_and_Within_Cohort':len(shared_sites_between_out_of_cohort_and_within_cohort), \ + 'Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool), \ + 'Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool), \ + 'Out_of_Cohort__unique_sites_total_reads':Out_of_Cohort__unique_sites_total_reads, \ + 'Total__discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Total__discordant_sites_that_are_concordant_with_other_donors_in_pool) + }, donor_table_of_concordances] - - def set_results(self,to_set,id): - # Recod to disk to save the loading mmeory time. 
- hash = random.getrandbits(128) - with open(f'tmp_{id}_{hash}.pkl', 'wb') as f: - pickle.dump(to_set, f) - self.record_dict[id]=f'tmp_{id}_{hash}.pkl' - return - # def append_results_cell_concordances(self,result): - def append_results_cell_concordances(self,result,cell_concordance_table,other_donor_concordances,other_donor_concordance_table): - other_donor_concordance_table = other_donor_concordance_table + other_donor_concordances - count=result['count'] - try: - percent_concordant = result['Nr_Concordant']/(result['Nr_Discordant']+result['Nr_Concordant'])*100 - except: - percent_concordant = 0 - - try: - percent_discordant = result['Nr_Discordant']/(result['Nr_Discordant']+result['Nr_Concordant'])*100 - except: - percent_discordant = 0 - try: - percent_relaxed_concordant = result['Nr_Relaxed_concordant']/(result['Nr_Relaxed_concordant']+result['true_discordant_count'])*100 - except: - percent_relaxed_concordant = 0 - - try: - percent_strict_discordant = result['true_discordant_count']/(result['Nr_Relaxed_concordant']+result['true_discordant_count'])*100 - except: - percent_strict_discordant = 0 + def norm_genotypes(self,expected_vars): + expected_vars = pd.DataFrame(expected_vars) + if len(expected_vars) > 0: + split_str=expected_vars[0].str.split("_") + expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] + expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] + expected_vars['vars'] = split_str.str[4] + expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) + expected_vars = expected_vars[expected_vars['vars']!='./.'] + expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' + expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] + return expected_vars - try: - read_discordance = result['discordant_reads']/result['total_sites'] - except: - read_discordance = 0 - cohort = 'UNKNOWN' - donor_split = result['donor_gt_match'].split("_") - if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): - cohort = 'UKB' - elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): - cohort = 'ELGH' - same_as_asigned_donor = result['donor_gt_match'] in result['Donor_With_Highest_Concordance'] - if not same_as_asigned_donor: - same_as_asigned_donor = result['donor_gt_match'] in result['Donor_With_Lowest_DisConcordance'] - - cell_concordance_table[f"{result['cell1']} --- {result['donor_gt_match']}"] = {'GT 1':result['cell1'], - 'GT 2':result['donor_gt_match'], - 'cohort': cohort, - - # 'Nr_Concordant':result['Nr_Concordant'], - # 'Nr_Discordant':result['Nr_Discordant'], - 'Nr_Relaxed_concordant':result['Nr_Relaxed_concordant'], - 'Nr_strict_discordant':result['true_discordant_count'], - # 'Percent Concordant':percent_concordant, - # 'Percent Discordant':percent_discordant, - 'Percent_relaxed_concordant': percent_relaxed_concordant, - 'Percent_strict_discordant': percent_strict_discordant, - 'Nr_concordant_informative': len(result['relaxed_concordant_informative_count']), - 'Nr_concordant_uninformative': len(result['relaxed_concordant_uninformative_count']), - 'Nr_discordant_informative': len(result['true_discordant_informative_count']), - 'Nr_discordant_uninformative': len(result['true_discordant_uninformative_count']), - 'NrTotal_Overlapping_sites_between_two_genotypes':result['Nr_Total_Overlapping_sites'], - 'Nr_donor_distinct_sites_within_pool_individuals':result['Nr_donor_distinct_sites'], - 
'Number_of_sites_that_are_donor_concordant_and_exclusive':result['Number_of_sites_that_are_donor_concordant_and_exclusive'], - 'Total_sites': result['total_sites'], - 'Total_informative_sites': result['informative_sites'], - 'Total_uninformative_sites': result['uninformative_sites'], - 'Total_reads': result['total_reads'], - 'Total_reads_informative': result['total_reads_informative'], - 'Total_reads_uninformative': result['total_reads_uninformative'], - 'Discordant_reads': result['discordant_reads'], - 'Discordant_reads_informtive': result['discordant_reads_informative'], - 'Discordant_reads_uninformtive': result['discordant_reads_uninformative'], - 'discordant_reads_informative_subset':result['discordant_reads_informative_subset'], - 'Discordant_reads_by_n_sites': read_discordance, - 'Discordant_sites_in_pool': len(result['Discordant_sites_in_pool']), - 'Lowest_Disconcordance_value_in_all_donors':result['Lowest_Disconcordance_value_in_all_donors'], - 'Donor_With_Lowest_DisConcordance':result['Donor_With_Lowest_DisConcordance'], - 'Concordant_Site_Identities':result['Concordant_Site_Identities'], - 'Donor_With_Highest_Concordance':result['Donor_With_Highest_Concordance'], - 'Highest_Concordance_value_in_all_donors':result['Highest_Concordance_value_in_all_donors'], - 'same_as_asigned_donor':same_as_asigned_donor, - 'Total_sites_other_donor (if same_as_asigned_donor=False)':result['Total_sites_other_donor'], - 'Total_reads_other_donor (if same_as_asigned_donor=False)':result['Total_reads_other_donor'], - 'total_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['total_discordant_sites_that_are_concordant_with_other_donors_in_pool'], - 'discordant_read_fraction_in_concordant_site':result['discordant_read_fraction_in_concordant_sites'], - 'discordant_read_fraction_in_discordant_sites':result['discordant_read_fraction_in_discordant_sites'], - 'Whithin_Cohort__total_number_of_potential_contaminent_reads':result['Whithin_Cohort__total_number_of_potential_contaminent_reads'], - 'Out_of_Cohort__total_number_of_potential_contaminent_reads':result['Out_of_Cohort__total_number_of_potential_contaminent_reads'], - 'NrDonors_contributing_to_out_of_cohort':result['NrDonors_contributing_to_out_of_cohort'], - 'NrDonors_contributing_to_Whithin_Cohort':result['NrDonors_contributing_to_Whithin_Cohort'], - - 'Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool'], - 'Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool'], - 'shared_sites_between_Out_of_Cohort_and_Within_Cohort':result['shared_sites_between_Out_of_Cohort_and_Within_Cohort'], - 'Within_Cohort__unique_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool'], - 'Out_of_Cohort__unique_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool'], - 'Out_of_Cohort__unique_sites_total_reads':result['Out_of_Cohort__unique_sites_total_reads'], - 'Total__discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Total__discordant_sites_that_are_concordant_with_other_donors_in_pool'] - } - - return [cell_concordance_table,other_donor_concordance_table] - - # def 
combine_written_files(self):#this one is for concordance class - # to_export = self.cell_concordance_table - # for val1 in self.record_dict.values(): - # # here remove the int files. - # print(f"merging temp file: {val1}") - # with open(val1, 'rb') as f: - # loaded_dict = pickle.load(f) - # for k1 in loaded_dict.keys(): - # to_export[k1]=loaded_dict[k1] - # os.remove(val1) - # return to_export - - - def combine_written_lists(self,exclusive_donor_variants,record_dict):#this is for VCF loader class - to_export = exclusive_donor_variants - for val1 in record_dict.values(): - # here remove the int files. - print(f"merging temp file: {val1}") - with open(val1, 'rb') as f: - loaded_dict = pickle.load(f) - self.other_donor_comp = self.other_donor_comp+ loaded_dict - os.remove(val1) - return self.other_donor_comp - - def combine_written_files(self,exclusive_donor_variants,record_dict):#this is for VCF loader class - to_export = exclusive_donor_variants - for val1 in record_dict.values(): - # here remove the int files. - print(f"merging temp file: {val1}") - with open(val1, 'rb') as f: - loaded_dict = pickle.load(f) - for k1 in loaded_dict.keys(): - try: - to_export[k1]=to_export[k1].union(loaded_dict[k1]) - except: - to_export[k1]=set() - to_export[k1]=to_export[k1].union(loaded_dict[k1]) - os.remove(val1) - return to_export - - def set_results(self,to_set,id): - # Recod to disk to save the loading mmeory time. - hash = random.getrandbits(128) - with open(f'tmp_{id}_{hash}.pkl', 'wb') as f: - pickle.dump(to_set, f) - self.record_dict[id]=f'tmp_{id}_{hash}.pkl' - - def analyse_donor(self,analyse_donor,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm,donor_assignments_table): - donor_concordance_table = {} - other_donor_concordance_table = [] - for cell1 in Cells_to_keep_pre: - count+=1 + def get_strict_discordance(self, expected_vars, cell_vars): + ''' + take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant + sites and relaxed concordant sites and list of discordant sites1 + 1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype + 2) if you have a 0/0 you can not get a 1/1 or 0/1 + 3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 
1/1 + So - each obversed cellsnp allele must be in the array SNP gtype + ''' + snp_gtypes = expected_vars[0] + cellsnp_gtypes = cell_vars[0] + true_discordant = 0 + relaxed_concordant = 0 + relaxed_concordant_informative = 0 + relaxed_concordant_informative_ids = [] + relaxed_concordant_uninformative_ids = [] + true_discordant_uninformative_ids =[] + true_discordant_informative_ids=[] + relaxed_concordant_uninformative = 0 + true_discordant_informative = 0 + true_discordant_uninformative = 0 + discordant_vars = [] + concordant_vars = [] + subset_informative_concordant = 0 + subset_informative_discordant = 0 - cell_vars = exclusive_cell_variants[cell1] - try: - result1, other_donor_concordances = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data,donor_assignments_table) - cell_concordance_table,other_donor_concordance_table = self.append_results_cell_concordances(result1,donor_concordance_table,other_donor_concordances,other_donor_concordance_table) - except: - continue - # if count>800: - # break - # here we should write these independently to the files - if (count % 50 == 0): - self.set_results(other_donor_concordance_table,f"{count}--{donor_gt_match}") - other_donor_concordance_table = [] - + #print(self.uninformative_sites) + #print(self.informative_sites) - self.set_results(other_donor_concordance_table,f"{count}--{donor_gt_match}") - output2 = self.combine_written_lists(self.other_donor_comp,self.record_dict) - pd.DataFrame(output2).sort_values(by=['cell']).to_csv(f'{donor_gt_match}-{analyse_donor}--each_cells_comparison_with_other_donor.tsv',sep='\t',index=False) - # del output2 - self.record_dict={} - return donor_concordance_table + #create sets of the ids (chrom, pos, ref, alt) in each set of genotypes. Filter to the ids present in both + #then filter to informative and uninformative. 
If uninformative >0 then create a subset of informative + # with the same number of vars (at random) + split_snp_gts=snp_gtypes.str.split("_") + snp_gtypes_ids = set(split_snp_gts.str[0]+'_'+split_snp_gts.str[1]+'_'+split_snp_gts.str[2]+'_'+split_snp_gts.str[3]) - def combine_concordances(self,result): - - self.cell_concordance_table = {**self.cell_concordance_table, **result} - + split_cellsnp_gts=cellsnp_gtypes.str.split("_") + cellsnp_gtypes_ids = set(split_cellsnp_gts.str[0]+'_'+split_cellsnp_gts.str[1]+'_'+split_cellsnp_gts.str[2]+'_'+split_cellsnp_gts.str[3]) - def conc_table(self): - donor_assignments_table=self.donor_assignments_table - cell_assignments_table=self.cell_assignments_table - exclusive_don_variants=self.exclusive_don_variants - exclusive_cell_variants= self.exclusive_cell_variants - donor_list = exclusive_don_variants.keys() - # cpus = 1 - pool = mp.Pool(cpus) - count = 0 - - - #create a list of variants that are on each donor genotype file - vars_per_donor_gt = {} - for don_id in donor_list: - donor_gt_vars = list(exclusive_don_variants[don_id]) - donor_gt_vars = pd.DataFrame(donor_gt_vars) - donor_gt_vars = self.norm_genotypes(donor_gt_vars) - donor_gt_vars = donor_gt_vars[donor_gt_vars['vars'] != '0/0'] - donor_gt_varids = list(donor_gt_vars['ids']) - vars_per_donor_gt[don_id] = donor_gt_varids - - #work out what cohort each donor belongs to - donor_cohorts = {} - for don_id in donor_list: - cohort = 'UNKNOWN' - donor_split = don_id.split("_") - if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): - cohort = 'UKB' - elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): - cohort = 'ELGH' - donor_cohorts[don_id] = cohort + shared_gts = snp_gtypes_ids.intersection(cellsnp_gtypes_ids) - all_donor_data={} - # here we calvculate all the expected donor datasets - for row1 in exclusive_don_variants.keys(): - # donor_in_question = row1['donor_query'] - donor_gt_match = row1 - expected_vars_of_other_donor = self.exclusive_don_variants[donor_gt_match] - expected_vars_norm_of_other_donor = self.norm_genotypes(expected_vars_of_other_donor) - all_donor_data[donor_gt_match]=expected_vars_norm_of_other_donor + shared_informative = shared_gts.intersection(self.informative_sites) + shared_uninformative = shared_gts.intersection(self.uninformative_sites) + # print("shared informative " + str(len(shared_informative))) + # print("shared uninformative " + str(len(shared_uninformative))) - results = [] - for i,row1 in donor_assignments_table.iterrows(): - donor_in_question = row1['donor_query'] - donor_gt_match = row1['donor_gt'] - print(donor_gt_match) - # if i>4: - # continue - if (donor_gt_match=='NONE'): - continue - try: - donor_gt_match_cohort = donor_cohorts[donor_gt_match] - except: - continue - Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell'])) - expected_vars = exclusive_don_variants[donor_gt_match] - expected_vars_norm = self.norm_genotypes(expected_vars) - try: - # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor. 
- dds = self.donor_distinct_sites[donor_gt_match] - except: - continue - if cpus==1: - result = self.analyse_donor(donor_in_question,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm,donor_assignments_table) - self.combine_concordances(result) + #store the numbers of informative and uninformative sites shared between cellSNP and gt data as these + #are the sites used for concordance + self.informative_covered = len(shared_informative) + self.uninformative_covered = len(shared_uninformative) + + if len(shared_uninformative) > 0: + #print(len(shared_uninformative)) + # print(len(shared_informative)) + if len(shared_uninformative) <= len(shared_informative): + informative_subset = set(random.sample(shared_informative, len(shared_uninformative))) else: + informative_subset = set()#if there are more shared uninformative than shared informative we will not subset + # print(informative_subset) + # exit(0) + else: + informative_subset = set() - result = pool.apply_async(self.analyse_donor, args=([donor_in_question,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm,donor_assignments_table]),callback=self.combine_concordances) - results.append(result) - try: - [result.wait() for result in results] - except: - print('done') - print('Done') - pool.close() - pool.join() - - - - # output = self.combine_written_files(self.cell_concordance_table,self.record_dict) - - return self.cell_concordance_table + # print(informative_subset) + self.informative_subset = informative_subset - + snp_gtypes_set = set(snp_gtypes) + snp_gtypes_set = sorted(snp_gtypes_set) - def read_extraction(self,DonorDiscordant_Sites_that_are_atributed_to_other_donor,expected_vars_norm,cell_vars_norm): - # we need this function wrapper to calculate the concordant, discordant read - # counts for each of the discordant sites that are concordant with another donor. 
- - Total_Overlapping_sites = set(DonorDiscordant_Sites_that_are_atributed_to_other_donor) - expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] - cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] - - cell_vars2 = cell_vars2.drop_duplicates(subset=['ids']) - expected_vars2 = expected_vars2.drop_duplicates(subset=['ids']) - cell_vars2['DP'] = cell_vars2[0].str.split("_").str[5].astype(int) - cell_vars2['AD'] = cell_vars2[0].str.split("_").str[6].astype(int) - cell_vars2['OTH'] = cell_vars2[0].str.split("_").str[7].astype(int) - total_reads,_,_,discordant_reads, \ - discordantRead_ratio,discordant_reads_noHet,total_reads_noHet,discordantRead_ratio_noHet= self.read_concordance_calc(expected_vars2,cell_vars2) - concordant_reads = total_reads - discordant_reads - concordant_reads_noHet = total_reads_noHet-discordant_reads_noHet - return total_reads,discordant_reads,concordant_reads,\ - total_reads_noHet,discordant_reads_noHet,concordant_reads_noHet - - def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match, donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data,donor_assignments_table): + cellsnp_gtypes_set = set(cellsnp_gtypes) + cellsnp_gtypes_set = sorted(cellsnp_gtypes_set) - Concordant_Sites, \ - Discordant_sites, \ - Total_Overlapping_sites, \ - discordant_sites, \ - cell_vars_norm, \ - true_discordant_count, \ - relaxed_concordant_count, \ - relaxed_concordant_informative_count, \ - relaxed_concordant_uninformative_count, \ - true_discordant_informative_count, \ - true_discordant_uninformative_count, \ - total_sites, \ - informative_sites, \ - uninformative_sites, \ - total_reads, \ - total_reads_informative, \ - total_reads_uninformative, \ - discordant_reads, \ - discordant_reads_informative, \ - discordant_reads_uninformative, \ - discordant_vars, \ - concordant_vars, \ - discordant_read_fraction_in_concordant_sites, \ - discordant_read_fraction_in_discordant_sites, \ - discordant_reads_uninformative_fraction, \ - discordant_reads_informative_fraction, discordant_reads_informative_subset = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) + #for i in range(0, len(snp_gtypes)): + # cellsnp_gtypes_str = pd.DataFrame(cellsnp_gtypes_set,columns=['cellsnp_gtypes']) + # cellsnp_gtypes_str['snp_gtypes'] = pd.DataFrame(snp_gtypes_set,columns=['snp_gtypes'])['snp_gtypes'] + for i in range(0, len(snp_gtypes_set)): + discordant = False + # snp_data = snp_gtypes[i].split('_') + # cellsnp_data = cellsnp_gtypes[i].split('_') + snp_data = snp_gtypes_set[i].split('_') + cellsnp_data = cellsnp_gtypes_set[i].split('_') + + snp_alleles_set = set([snp_data[4][0], snp_data[4][2]]) + cellsnp_alleles_set = set([cellsnp_data[4][0], cellsnp_data[4][2]]) + + snp_var = ('_').join(snp_data[0:4]) + cellsnp_var = ('_').join(cellsnp_data[0:4]) + + if not cellsnp_var == snp_var: + print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i]) + exit(1) + else: + for allele in cellsnp_alleles_set: + if not allele in snp_alleles_set:#if a cellSNP allele is found that is not in the array data this is discordant + discordant = True + + if discordant == True: + true_discordant+=1 + discordant_vars.append(cellsnp_var) + if snp_var in self.uninformative_sites: + true_discordant_uninformative+=1 + true_discordant_uninformative_ids.append(snp_var) + elif snp_var in self.informative_sites: + true_discordant_informative+=1 + 
true_discordant_informative_ids.append(snp_var) + else: + relaxed_concordant+=1 + concordant_vars.append(cellsnp_var) + if snp_var in self.uninformative_sites: + relaxed_concordant_uninformative+=1 + relaxed_concordant_uninformative_ids.append(snp_var) + elif snp_var in self.informative_sites: + relaxed_concordant_informative+=1 + relaxed_concordant_informative_ids.append(snp_var) + if len(shared_uninformative) > 0: + if snp_var in informative_subset: + if discordant == True: + subset_informative_discordant+=1 + else: + subset_informative_concordant+=1 + # true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count + cell_vars2 = cell_vars.set_index('ids') + disc = pd.DataFrame(set(cell_vars2.loc[discordant_vars]['combo']),columns=['combo_x']) + df_cd = pd.merge(cell_vars, expected_vars, how='inner', on = 'pos') + disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') + disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] + disc_sites_string = ';'.join(disc2['expected_retrieved']) + return true_discordant, relaxed_concordant, relaxed_concordant_informative_ids, relaxed_concordant_uninformative_ids, true_discordant_informative_ids, true_discordant_uninformative_ids, discordant_vars, concordant_vars, disc_sites_string + + def read_concordance_calc(self,expected_vars,cell_vars): + + # This is a wrapper to add up the discordant reads in the cellsnp file. + + # expected genotype 0/0 + expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] + hom_ref_sites = set(expected_hom_ref['ids']) + cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)] + ad_hom_ref = cell_vars2['AD'].sum() + oth_hom_ref = cell_vars2['OTH'].sum() + discordant_hom_ref = ad_hom_ref + oth_hom_ref + + # expected genotype 0/1 or 1/0 + hets = ['0/1', '1/0'] + expected_het = expected_vars[expected_vars['vars'].isin(hets)] + het_sites = set(expected_het['ids']) + cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] + discordant_het = cell_vars3['OTH'].sum() + + # expected genotype 1/1 + expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] + hom_alt_sites = set(expected_hom_alt['ids']) + cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] + # DP + OTH - AD + ad_hom_alt = cell_vars4['AD'].sum() + dp_hom_alt = cell_vars4['DP'].sum() + oth_hom_alt = cell_vars4['OTH'].sum() + discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt - total_concordant_sites = len(Concordant_Sites) + relaxed_concordant_count - dds = self.donor_distinct_sites[donor_gt_match] - Nr_donor_distinct_sites = len(dds) - Nr_Concordant = len(Concordant_Sites) - Nr_Relaxed_concordant = relaxed_concordant_count - Nr_Discordant = len(Discordant_sites) - Nr_Total_Overlapping_sites = len(Total_Overlapping_sites) - Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Discordant_sites))) - Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) - #Quantify donor variation in other donors - discordant_vars_in_pool = [] - donor_table_of_concordances = [] - total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() - total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown = {} - 
total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort']={} - total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort']={} - - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown_totalReads = {} - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown_totalReads['Whithin_Cohort']={} - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown_totalReads['Out_of_Cohort']={} - - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown = {} - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort']={} - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort']={} - - - informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() - total_cordant_sites_that_are_concordant_with_other_donors_in_pool = set() - donor_gt_match_cohort = donor_cohorts[donor_gt_match] - donors_contributing_to_out_of_cohort= [] - donors_contributing_to_Whithin_Cohort=[] - sites_contributing_to_out_of_cohort= [] - sites_contributing_to_Whithin_Cohort=[] - for donor in set(donor_assignments_table['donor_gt']): - try: - expected_vars_norm_of_other_donor = all_donor_data[donor] - except: - continue - try: - donor_cohort = donor_cohorts[donor] - donor_vars = vars_per_donor_gt[donor] - except: - continue + # Total analysis + discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt + total_dp = cell_vars['DP'].sum() + total_oth = cell_vars['OTH'].sum() + total_reads = total_dp + total_oth + discordantRead_ratio = discordant_reads/total_reads - Concordant_Sites_otherDonor, \ - Discordant_sites_otherDonor, \ - Total_Overlapping_sites_otherDonor, \ - discordant_sites_otherDonor, \ - cell_vars_norm_otherDonor, \ - true_discordant_count_otherDonor, \ - relaxed_concordant_count_otherDonor, \ - relaxed_concordant_informative_count_otherDonor, \ - relaxed_concordant_uninformative_count_otherDonor, \ - true_discordant_informative_count_otherDonor, \ - true_discordant_uninformative_count_otherDonor, \ - total_sites_otherDonor, \ - informative_sites_otherDonor, \ - uninformative_sites_otherDonor, \ - total_reads_otherDonor, \ - total_reads_informative_otherDonor, \ - total_reads_uninformative_otherDonor, \ - discordant_reads_otherDonor, \ - discordant_reads_informative_otherDonor, \ - discordant_reads_uninformative_otherDonor, \ - discordant_vars_otherDonor, \ - concordant_vars_otherDonor, \ - discordant_read_fraction_in_concordant_sites_otherDonor, \ - discordant_read_fraction_in_discordant_sites_otherDonor, \ - discordant_reads_uninformative_fraction_otherDonor, \ - discordant_reads_informative_fraction_otherDonor, discordant_reads_informative_subset_otherDonor = self.retrieve_concordant_discordant_sites(expected_vars_norm_of_other_donor,cell_vars,donor_cohort=donor_cohort) + # No het analysis - they are not discordant or concordant - we cannot make any call at those sites + discordant_reads_noHet = discordant_hom_ref + discordant_hom_alt + total_reads_noHet = total_reads - discordant_het + discordantRead_ratio_noHet = discordant_reads_noHet/total_reads_noHet - if total_sites_otherDonor==0: - total_sites_otherDonor=0.000000000001 - total_concordant_sites_otherDonor = relaxed_concordant_count_otherDonor - concordant_percent_in_other_donor= total_concordant_sites_otherDonor/total_sites_otherDonor*100 - discordant_percent_in_other_donor= 
true_discordant_count_otherDonor/total_sites_otherDonor*100 - DonorDiscordant_Sites_that_are_atributed_to_other_donor = set(discordant_vars).intersection(set(concordant_vars_otherDonor)) - TotalSites_that_are_atributed_to_other_donor = set(discordant_vars).union(set(concordant_vars)).intersection(set(concordant_vars_otherDonor)) + return total_reads,total_dp,total_oth,discordant_reads,discordantRead_ratio,discordant_reads_noHet,total_reads_noHet,discordantRead_ratio_noHet - Informative__DonorDiscordant_Sites_that_are_atributed_to_other_donor = set(true_discordant_informative_count).intersection(set(relaxed_concordant_informative_count_otherDonor)) - DonorCordant_Sites_that_are_atributed_to_other_donor = set(concordant_vars).intersection(set(concordant_vars_otherDonor)) - - total_reads_for_discordant_sites_that_are_concordant_with_other_donor,discordant_reads_for_discordant_sites_that_are_concordant_with_other_donor,concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor,\ - _,_,_= self.read_extraction(DonorDiscordant_Sites_that_are_atributed_to_other_donor,expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) + def read_condordance(self, expected_vars, cell_vars,discordant_vars, concordant_vars): + ''' + get read level concordance using DP, AD and OTH format fields + ##FORMAT= + ##FORMAT= + ##FORMAT= + ''' + if not len(expected_vars) == len(cell_vars): + print("length mismatch between expected vars and cell vars") + exit(1) - if not donor == donor_gt_match: - # We want to kow how many of these discordant site - if 'U937' in donor: - continue - if 'THP1' in donor: - continue - - if donor_gt_match_cohort == donor_cohort: - coh = 'Whithin_Cohort' - sites_contributing_to_Whithin_Cohort.extend(list(TotalSites_that_are_atributed_to_other_donor)) - if len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)>0: - donors_contributing_to_Whithin_Cohort.append(donor) - - else: - coh = 'Out_of_Cohort' - sites_contributing_to_out_of_cohort.extend(list(TotalSites_that_are_atributed_to_other_donor)) - if len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)>0: - donors_contributing_to_out_of_cohort.append(donor) - - - total_discordant_sites_that_are_concordant_with_other_donors_in_pool = total_discordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(DonorDiscordant_Sites_that_are_atributed_to_other_donor)) - # now we addit for a cohort since the biggest issue comes from cohort cross-contamination - # for each of these sites now we calculate the number of reads that it accounts: - # tree level set: cohort: site: counts - for site in DonorDiscordant_Sites_that_are_atributed_to_other_donor: - total_reads_for_site,discordant_reads_for_site,concordant_for_site,\ - total_reads_noHet,discordant_reads_noHet,concordant_reads_noHet= self.read_extraction([site],expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) - if concordant_for_site==0: - pass - try: - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_reads_noHet) + total_sites = len(expected_vars) + #add cols for DP, AD< OTH + cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) + cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) + cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) + - except: - try: - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] - 
total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_reads_noHet) - - except: - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh]={} - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_reads_noHet) + + # Total + total_reads,total_dp,total_oth,discordant_reads, \ + discordantRead_ratio,discordant_reads_noHet,total_reads_noHet,discordantRead_ratio_noHet = self.read_concordance_calc(expected_vars,cell_vars) + + # uninformative + cell_vars_uninformative = cell_vars[cell_vars['ids'].isin(self.uninformative_sites)] + total_reads_uninformative,total_dp_uninformative,total_oth_uninformative,discordant_reads_uninformative, \ + discordantRead_ratio_uninformative,discordant_reads_uninformative_noHet,total_reads_uninformative_noHet,discordantRead_ratio_uninformative_noHet= self.read_concordance_calc(expected_vars,cell_vars_uninformative) + + # informative + cell_vars_informative = cell_vars[cell_vars['ids'].isin(self.informative_sites)] + total_reads_informative,total_dp_informative,total_oth_informative,discordant_reads_informative, \ + discordantRead_ratio_informative,discordant_reads_informative_noHet,total_reads_informative_noHet,discordantRead_ratio_informative_noHet = self.read_concordance_calc(expected_vars,cell_vars_informative) + + # Split into concordant and discordant sites + # concordant + concordant_sites = cell_vars[cell_vars['ids'].isin(set(concordant_vars))] + total_reads_for_concordant_sites,total_dp_for_concordant_sites,total_oth_for_concordant_sites,discordant_reads_for_concordant_sites, \ + discordantRead_ratio_for_concordant_sites,discordant_reads_for_concordant_sites_noHet,total_reads_for_concordant_sites_noHet,discordantRead_ratio_for_concordant_sites_noHet = self.read_concordance_calc(expected_vars,concordant_sites) + + # discordant + discordant_sites = cell_vars[cell_vars['ids'].isin(set(discordant_vars))] + total_reads_for_discconcordant_sites,total_dp_for_discconcordant_sites,total_oth_for_discconcordant_sites,discordant_reads_for_discconcordant_sites, \ + discordantRead_ratio_for_discconcordant_sites,discordant_reads_for_discconcordant_sites_noHet,total_reads_for_discconcordant_sites_noHet,discordantRead_ratio_for_discconcordant_sites_noHet = self.read_concordance_calc(expected_vars,discordant_sites) + + # Subset analysis + cell_vars_informative_subset = cell_vars[cell_vars['ids'].isin(self.informative_subset)] + total_reads_informative_subset,total_dp_informative_subset,total_oth_informative_subset,discordant_reads_informative_subset, \ + discordantRead_ratio_informative_subset,discordant_reads_informative_subset_noHet,total_reads_informative_subset_noHet,discordantRead_ratio_informative_subset_noHet = self.read_concordance_calc(expected_vars,cell_vars_informative_subset) - for site in DonorDiscordant_Sites_that_are_atributed_to_other_donor: - total_reads_for_site,discordant_reads_for_site,concordant_for_site,\ - _,_,_= self.read_extraction([site],expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) - # if discordant_reads_for_site>0: - # print('here') - if concordant_for_site==0: - pass - try: - total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site) - except: - try: - total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] - 
total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site) - - except: - total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh]={} - total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] - total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site) + if mode == 'no_het': + total_reads = total_reads_noHet + discordant_reads = discordant_reads_noHet + total_reads_informative = total_reads_informative_noHet + discordant_reads_informative = discordant_reads_informative_noHet + total_reads_uninformative =total_reads_uninformative_noHet + discordant_reads_uninformative = discordant_reads_uninformative_noHet + total_reads_informative_subset = total_reads_informative_subset_noHet + discordant_reads_informative_subset = discordant_reads_informative_subset_noHet + total_reads_for_concordant_sites = total_reads_for_concordant_sites_noHet + discordant_reads_for_concordant_sites = discordant_reads_for_concordant_sites_noHet + total_reads_for_discconcordant_sites = total_reads_for_discconcordant_sites_noHet + discordant_reads_for_discconcordant_sites = discordant_reads_for_discconcordant_sites_noHet + + return total_sites, \ + self.informative_covered, \ + self.uninformative_covered, \ + total_reads, \ + discordant_reads, \ + total_reads_informative, \ + discordant_reads_informative, \ + total_reads_uninformative, \ + discordant_reads_uninformative, \ + total_reads_informative_subset, \ + discordant_reads_informative_subset, \ + total_reads_for_concordant_sites, \ + discordant_reads_for_concordant_sites, \ + total_reads_for_discconcordant_sites, \ + discordant_reads_for_discconcordant_sites + - # to get the total reads that can be atributed to the other donor i have to check if site is already covered in the total_discordant_sites_that_are_concordant_with_other_donors_in_pool. - # the ones that havent, i have to add the reads up for them. - informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(Informative__DonorDiscordant_Sites_that_are_atributed_to_other_donor)) - - total_cordant_sites_that_are_concordant_with_other_donors_in_pool = total_cordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(DonorCordant_Sites_that_are_atributed_to_other_donor)) + + def get_discordance(self,expected_vars2,cell_vars2): + Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) + Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) + disc = pd.DataFrame(Discordant_sites,columns=['combo_x']) + df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') + disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') + disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] + disc_sites = ';'.join(disc2['expected_retrieved']) + return Concordant_Sites,Discordant_sites,disc_sites + + + def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars,donor_cohort=False): + # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. 
+ # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. + # Author: M.Ozols + + cell_vars_norm = self.norm_genotypes(cell_vars) + + if len(cell_vars_norm) > 0: + + if mode == 'no_het': + hets = ['0/1', '1/0'] + expected_vars_norm = expected_vars_norm[~expected_vars_norm['vars'].isin(hets)] + Total_Overlapping_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) + expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2 = cell_vars2.drop_duplicates(subset=['ids']) + expected_vars2 = expected_vars2.drop_duplicates(subset=['ids']) + if len(cell_vars2)!=len(expected_vars2): + print('failed to select comon variants') + exit(1) + + # Find exact discordant sites + Concordant_Sites, Discordant_sites, _ = self.get_discordance(expected_vars2, cell_vars2) + #find truly discordant sites + true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, discordant_vars, concordant_vars, disc_sites_string = self.get_strict_discordance(expected_vars2, cell_vars2) + #find discordant reads + total_sites, \ + informative_sites, \ + uninformative_sites, \ + total_reads, \ + discordant_reads, \ + total_reads_informative, \ + discordant_reads_informative, \ + total_reads_uninformative, \ + discordant_reads_uninformative, \ + total_reads_informative_subset, \ + discordant_reads_informative_subset, \ + total_reads_for_concordant_sites, \ + discordant_reads_for_concordant_sites, \ + total_reads_for_discconcordant_sites, \ + discordant_reads_for_discconcordant_sites = self.read_condordance(expected_vars2, cell_vars2, discordant_vars, concordant_vars) + + discordant_read_fraction_in_concordant_sites = f"{discordant_reads_for_concordant_sites}/{total_reads_for_concordant_sites}" + discordant_read_fraction_in_discordant_sites = f"{discordant_reads_for_discconcordant_sites}/{total_reads_for_discconcordant_sites}" + discordant_reads_uninformative_fraction = f"{discordant_reads_uninformative}/{total_reads_uninformative}" + discordant_reads_informative_fraction = f"{discordant_reads_informative}/{total_reads_informative}" + + # sanity checks + if total_reads != total_reads_for_concordant_sites+total_reads_for_discconcordant_sites: + print("Error: total reads dont add up ") + exit(1) + if discordant_reads != discordant_reads_for_concordant_sites+discordant_reads_for_discconcordant_sites: + print("Error: discordant reads dont add up ") + exit(1) + - common_vars = list(set(discordant_vars) & set(donor_vars)) - common_var_count = str(len(common_vars)) - donor_cohort_common = donor + ":" + donor_cohort + ":" + common_var_count - discordant_vars_in_pool.append(donor_cohort_common) - - # Here we want to calculate the number of discordant sites in other donors and see if in terms of concordance the same donor is picked as per GT assignment. - # We do this to investigate the potential of a cell coming from this other donor. 
- else: - _='check' - try: - percent_relaxed_concordant = Nr_Relaxed_concordant/(Nr_Relaxed_concordant+true_discordant_count)*100 - except: - percent_relaxed_concordant = 0 - # g2 = total_concordant_sites_otherDonor/total_sites_otherDonor*100 - if not (percent_relaxed_concordant == concordant_percent_in_other_donor): - _='something not right' - break + else: + Total_Overlapping_sites = set() + Concordant_Sites = set() + Discordant_sites = set() + disc_sites = '' + true_discordant_count = 0 + relaxed_concordant_count = 0 + total_sites = 0 + relaxed_concordant_informative_count = 0 + relaxed_concordant_uninformative_count = 0 + true_discordant_informative_count = 0 + true_discordant_uninformative_count = 0 + informative_sites = 0 + disc_sites_string = '' + discordant_reads = 0 + uninformative_sites = 0 + total_reads = 0 + total_reads_informative =0 + total_reads_uninformative=0 + discordant_reads=0 + discordant_reads_informative=0 + discordant_reads_uninformative=0 + discordant_vars=0 + concordant_vars=0 + discordant_read_fraction_in_concordant_sites=0 + discordant_read_fraction_in_discordant_sites=0 + discordant_reads_uninformative_fraction=0 + discordant_reads_informative_fraction=0 + discordant_reads_informative_subset=0 + + return Concordant_Sites, \ + Discordant_sites, \ + Total_Overlapping_sites, \ + disc_sites_string, \ + cell_vars_norm, \ + true_discordant_count, \ + relaxed_concordant_count, \ + relaxed_concordant_informative_count, \ + relaxed_concordant_uninformative_count, \ + true_discordant_informative_count, \ + true_discordant_uninformative_count, \ + total_sites, \ + informative_sites, \ + uninformative_sites, \ + total_reads, \ + total_reads_informative, \ + total_reads_uninformative, \ + discordant_reads, \ + discordant_reads_informative, \ + discordant_reads_uninformative, \ + discordant_vars, \ + concordant_vars, \ + discordant_read_fraction_in_concordant_sites, \ + discordant_read_fraction_in_discordant_sites, \ + discordant_reads_uninformative_fraction, \ + discordant_reads_informative_fraction, \ + discordant_reads_informative_subset - donor_table_of_concordances.append({'donor':donor, 'cell':cell1, 'donor_cohort':donor_cohort, \ - 'gt matched donor':donor == donor_gt_match, \ - 'DonorCordant_Sites_that_are_atributed_to_other_donor':len(DonorCordant_Sites_that_are_atributed_to_other_donor), \ - 'DonorCordant_Sites_that_are_atributed_to_other_donor/total':f"{len(DonorCordant_Sites_that_are_atributed_to_other_donor)}/{len(concordant_vars)}", \ - 'DonorDiscordant_Sites_that_are_atributed_to_other_donor':len(DonorDiscordant_Sites_that_are_atributed_to_other_donor), \ - 'DonorDiscordant_Sites_that_are_atributed_to_other_donor/total':f"{len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)}/{len(discordant_vars)}", \ - 'concordant_percent_in_other_donor':concordant_percent_in_other_donor, \ - 'discordant_percent_in_other_donor':discordant_percent_in_other_donor, \ - 'discordant_reads_otherDonor':discordant_reads_otherDonor, \ - 'discordant_sites_otherDonor':len(discordant_vars_otherDonor), \ - 'concordant_sites_otherDonor':len(concordant_vars_otherDonor), \ - 'total_sites_otherDonor':total_sites_otherDonor, \ - 'discordant_reads_otherDonor':discordant_reads_otherDonor, \ - 'total_reads_otherDonor':total_reads_otherDonor, \ - # 'discordant_read_fraction_in_concordant_sites_otherDonor':discordant_read_fraction_in_concordant_sites_otherDonor, \ - # 'discordant_read_fraction_in_discordant_sites_otherDonor':discordant_read_fraction_in_discordant_sites_otherDonor, \ - 
'concordant_reads_For_discordant_sites_that_are_Concordant_with_other_donor':concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor - }) - - - #here now we want to see overall how many reads potentially come from different cohorts. - # cohort_specific_site_quant_string="" - # cohort_specific_read_quant_string="" + + + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. + hash = random.getrandbits(128) + with open(f'tmp_{id}_{hash}.pkl', 'wb') as f: + pickle.dump(to_set, f) + self.record_dict[id]=f'tmp_{id}_{hash}.pkl' + return + + # def append_results_cell_concordances(self,result): + def append_results_cell_concordances(self,result,cell_concordance_table,other_donor_concordances,other_donor_concordance_table): + other_donor_concordance_table = other_donor_concordance_table + other_donor_concordances + # other_donor_concordance_table = {**other_donor_concordance_table, **other_donor_concordances} + count=result['count'] + try: + percent_concordant = result['Nr_Concordant']/(result['Nr_Discordant']+result['Nr_Concordant'])*100 + except: + percent_concordant = 0 - Whithin_Cohort__total_number_of_potential_contaminent_reads=0 - Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool=0 - Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool=0 try: - Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool = len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) - # for k1 in total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys(): - # Whithin_Cohort__total_number_of_potential_contaminent_reads+= max(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'][k1]) + percent_discordant = result['Nr_Discordant']/(result['Nr_Discordant']+result['Nr_Concordant'])*100 except: - _='Doesnt Exist' + percent_discordant = 0 + + try: + percent_relaxed_concordant = result['Nr_Relaxed_concordant']/(result['Nr_Relaxed_concordant']+result['true_discordant_count'])*100 + except: + percent_relaxed_concordant = 0 - Out_of_Cohort__total_number_of_potential_contaminent_reads=0 try: - Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool = len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) - # for k1 in total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys(): - # Out_of_Cohort__total_number_of_potential_contaminent_reads+= max(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'][k1]) + percent_strict_discordant = result['true_discordant_count']/(result['Nr_Relaxed_concordant']+result['true_discordant_count'])*100 except: - _='Doesnt Exist' - - - - + percent_strict_discordant = 0 + + try: + read_discordance = result['discordant_reads']/result['total_sites'] + except: + read_discordance = 0 + + cohort = 'UNKNOWN' + donor_split = result['donor_gt_match'].split("_") + if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): + cohort = 'UKB' + elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): + cohort = 'ELGH' + + same_as_asigned_donor = result['donor_gt_match'] in result['Donor_With_Highest_Concordance'] + if not same_as_asigned_donor: + same_as_asigned_donor = result['donor_gt_match'] in result['Donor_With_Lowest_DisConcordance'] + 
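For reference, the cohort label assigned just above is inferred purely from the shape of the donor identifier, exactly as written in the added lines of this hunk. A minimal standalone sketch of that heuristic follows; the example donor IDs are hypothetical and only illustrate the two string shapes the rule keys on.

def infer_cohort(donor_id: str) -> str:
    # Two identical underscore-separated tokens -> UKB;
    # three tokens whose first token is 14 characters long -> ELGH;
    # anything else is left as UNKNOWN, mirroring the logic above.
    parts = donor_id.split("_")
    if len(parts) == 2 and parts[0] == parts[1]:
        return 'UKB'
    if len(parts) == 3 and len(parts[0]) == 14:
        return 'ELGH'
    return 'UNKNOWN'

# hypothetical identifiers, for illustration only:
assert infer_cohort("S2032_S2032") == 'UKB'
assert infer_cohort("ABCDEFGHIJKLMN_x_y") == 'ELGH'   # first token is 14 characters
assert infer_cohort("donor1") == 'UNKNOWN'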
+ cell_concordance_table[f"{result['cell1']} --- {result['donor_gt_match']}"] = {'GT 1':result['cell1'], + 'GT 2':result['donor_gt_match'], + 'cohort': cohort, + + # 'Nr_Concordant':result['Nr_Concordant'], + # 'Nr_Discordant':result['Nr_Discordant'], + 'Nr_Relaxed_concordant':result['Nr_Relaxed_concordant'], + 'Nr_strict_discordant':result['true_discordant_count'], + # 'Percent Concordant':percent_concordant, + # 'Percent Discordant':percent_discordant, + 'Percent_relaxed_concordant': percent_relaxed_concordant, + 'Percent_strict_discordant': percent_strict_discordant, + 'Nr_concordant_informative': len(result['relaxed_concordant_informative_count']), + 'Nr_concordant_uninformative': len(result['relaxed_concordant_uninformative_count']), + 'Nr_discordant_informative': len(result['true_discordant_informative_count']), + 'Nr_discordant_uninformative': len(result['true_discordant_uninformative_count']), + 'NrTotal_Overlapping_sites_between_two_genotypes':result['Nr_Total_Overlapping_sites'], + 'Nr_donor_distinct_sites_within_pool_individuals':result['Nr_donor_distinct_sites'], + 'Number_of_sites_that_are_donor_concordant_and_exclusive':result['Number_of_sites_that_are_donor_concordant_and_exclusive'], + 'Total_sites': result['total_sites'], + 'Total_informative_sites': result['informative_sites'], + 'Total_uninformative_sites': result['uninformative_sites'], + 'Total_reads': result['total_reads'], + 'Total_reads_informative': result['total_reads_informative'], + 'Total_reads_uninformative': result['total_reads_uninformative'], + 'Discordant_reads': result['discordant_reads'], + 'Discordant_reads_informtive': result['discordant_reads_informative'], + 'Discordant_reads_uninformtive': result['discordant_reads_uninformative'], + 'discordant_reads_informative_subset':result['discordant_reads_informative_subset'], + 'Discordant_reads_by_n_sites': read_discordance, + 'Discordant_sites_in_pool': len(result['Discordant_sites_in_pool']), + 'Lowest_Disconcordance_value_in_all_donors':result['Lowest_Disconcordance_value_in_all_donors'], + 'Donor_With_Lowest_DisConcordance':result['Donor_With_Lowest_DisConcordance'], + 'Concordant_Site_Identities':result['Concordant_Site_Identities'], + 'Donor_With_Highest_Concordance':result['Donor_With_Highest_Concordance'], + 'Highest_Concordance_value_in_all_donors':result['Highest_Concordance_value_in_all_donors'], + 'same_as_asigned_donor':same_as_asigned_donor, + 'Total_sites_other_donor (if same_as_asigned_donor=False)':result['Total_sites_other_donor'], + 'Total_reads_other_donor (if same_as_asigned_donor=False)':result['Total_reads_other_donor'], + 'total_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['total_discordant_sites_that_are_concordant_with_other_donors_in_pool'], + 'discordant_read_fraction_in_concordant_site':result['discordant_read_fraction_in_concordant_sites'], + 'discordant_read_fraction_in_discordant_sites':result['discordant_read_fraction_in_discordant_sites'], + 'Whithin_Cohort__total_number_of_potential_contaminent_reads':result['Whithin_Cohort__total_number_of_potential_contaminent_reads'], + 'Out_of_Cohort__total_number_of_potential_contaminent_reads':result['Out_of_Cohort__total_number_of_potential_contaminent_reads'], + 'NrDonors_contributing_to_out_of_cohort':result['NrDonors_contributing_to_out_of_cohort'], + 'NrDonors_contributing_to_Whithin_Cohort':result['NrDonors_contributing_to_Whithin_Cohort'], + + 
'Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool'], + 'Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool'], + 'shared_sites_between_Out_of_Cohort_and_Within_Cohort':result['shared_sites_between_Out_of_Cohort_and_Within_Cohort'], + 'Within_Cohort__unique_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool'], + 'Out_of_Cohort__unique_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool'], + 'Out_of_Cohort__unique_sites_total_reads':result['Out_of_Cohort__unique_sites_total_reads'], + 'Total__discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Total__discordant_sites_that_are_concordant_with_other_donors_in_pool'] + } + + return [cell_concordance_table,other_donor_concordance_table] + def combine_written_lists(self,exclusive_donor_variants,record_dict):#this is for VCF loader class + to_export = exclusive_donor_variants + for val1 in record_dict.values(): + # here remove the int files. + print(f"merging temp file: {val1}") + with open(val1, 'rb') as f: + loaded_dict = pickle.load(f) + self.other_donor_comp = self.other_donor_comp+ loaded_dict + os.remove(val1) + return self.other_donor_comp + def combine_written_files(self,exclusive_donor_variants,record_dict):#this is for VCF loader class + to_export = exclusive_donor_variants + for val1 in record_dict.values(): + # here remove the int files. + print(f"merging temp file: {val1}") + with open(val1, 'rb') as f: + loaded_dict = pickle.load(f) + for k1 in loaded_dict.keys(): + try: + to_export[k1]=to_export[k1].union(loaded_dict[k1]) + except: + to_export[k1]=set() + to_export[k1]=to_export[k1].union(loaded_dict[k1]) + os.remove(val1) + return to_export + + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. + hash = random.getrandbits(128) + with open(f'tmp_{id}_{hash}.pkl', 'wb') as f: + pickle.dump(to_set, f) + self.record_dict[id]=f'tmp_{id}_{hash}.pkl' - # No het total calcs + + def combine_concordances(self,result,other_donor_concordance,donor_gt_match,analyse_donor): + pd.DataFrame(other_donor_concordance).sort_values(by=['cell']).to_csv(f'{donor_gt_match}-{analyse_donor}--each_cells_comparison_with_other_donor.tsv',sep='\t',index=False) + self.cell_concordance_table = {**self.cell_concordance_table, **result} + + def combine_dict(self,cell_concordance_table,result): + cell_concordance_table = {**cell_concordance_table, **result} + return cell_concordance_table + + def conc_table(self): + donor_assignments_table=self.donor_assignments_table + cell_assignments_table=self.cell_assignments_table + exclusive_don_variants=self.exclusive_don_variants + exclusive_cell_variants= self.exclusive_cell_variants + donor_list = exclusive_don_variants.keys() + # cpus = 1 - # Since we can not tell just from the cellsnp file if the discordant read at concordant site is actually concordant with another donor in a pool - # we assume the worse case scenario and add idscordant reads at concordant sites to the total. 
- # Number of reads on the sites that are concordant, but has discordant reads with another donor - Whithin_Cohort__total_number_of_potential_contaminent_reads=0 - try: - concordant_sites_atributed_to_other_donor = list(set(sites_contributing_to_Whithin_Cohort)) - if not total_sites >= len(concordant_sites_atributed_to_other_donor): - raise Exception("Sorry, the number of sites atributed can not be larger than total sites") - - _,_,_,\ - _,Whithin_Cohort__total_number_of_potential_contaminent_reads, \ - _= self.read_extraction(concordant_sites_atributed_to_other_donor,expected_vars_norm,cell_vars_norm) - - if not discordant_reads >= len(Whithin_Cohort__total_number_of_potential_contaminent_reads): - raise Exception("Total discordant reads can not be less than within cohort discordant reads.") - - - except: - _='Doesnt Exist' + count = 0 + + + #create a list of variants that are on each donor genotype file + vars_per_donor_gt = {} + for don_id in donor_list: + donor_gt_vars = list(exclusive_don_variants[don_id]) + donor_gt_vars = pd.DataFrame(donor_gt_vars) + donor_gt_vars = self.norm_genotypes(donor_gt_vars) + donor_gt_vars = donor_gt_vars[donor_gt_vars['vars'] != '0/0'] + donor_gt_varids = list(donor_gt_vars['ids']) + vars_per_donor_gt[don_id] = donor_gt_varids + + #work out what cohort each donor belongs to + donor_cohorts = {} + for don_id in donor_list: + cohort = 'UNKNOWN' + donor_split = don_id.split("_") + if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): + cohort = 'UKB' + elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): + cohort = 'ELGH' + donor_cohorts[don_id] = cohort + all_donor_data={} + # here we calvculate all the expected donor datasets + for row1 in exclusive_don_variants.keys(): + # donor_in_question = row1['donor_query'] + donor_gt_match = row1 + expected_vars_of_other_donor = self.exclusive_don_variants[donor_gt_match] + expected_vars_norm_of_other_donor = self.norm_genotypes(expected_vars_of_other_donor) + all_donor_data[donor_gt_match]=expected_vars_norm_of_other_donor - Out_of_Cohort__total_number_of_potential_contaminent_reads=0 - try: - concordant_sites_atributed_to_other_donor = list(set(sites_contributing_to_out_of_cohort)) - if not total_sites >= len(concordant_sites_atributed_to_other_donor): - raise Exception("Sorry, the number of sites atributed can not be larger than total sites") - - _,_,_,_,Out_of_Cohort__total_number_of_potential_contaminent_reads, _= self.read_extraction(concordant_sites_atributed_to_other_donor,expected_vars_norm,cell_vars_norm) - - if not discordant_reads >= len(Out_of_Cohort__total_number_of_potential_contaminent_reads): - raise Exception("Total discordant reads can not be less than out of cohort discordant reads.") + results = [] + for i,row1 in donor_assignments_table.iterrows(): + donor_in_question = row1['donor_query'] + donor_gt_match = row1['donor_gt'] + print(donor_gt_match) + # if i>4: + # continue + if (donor_gt_match=='NONE'): + continue + try: + donor_gt_match_cohort = donor_cohorts[donor_gt_match] + except: + continue + Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell'])) + expected_vars = exclusive_don_variants[donor_gt_match] + expected_vars_norm = self.norm_genotypes(expected_vars) + try: + # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor. 
+ dds = self.donor_distinct_sites[donor_gt_match] + except: + continue + # if cpus==1: + result,other_donor_concordance,donor_gt_match,donor_id = Donor(donor_in_question,self.informative_sites,self.uninformative_sites,self.donor_assignments_table,self.cell_assignments_table,self.exclusive_cell_variants,self.donor_distinct_sites,cpus).analyse_donor(Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,all_donor_data,expected_vars_norm,donor_assignments_table,exclusive_cell_variants) + self.combine_concordances(result,other_donor_concordance,donor_gt_match,donor_id) + + + + return self.cell_concordance_table - except: - _='Doesnt Exist' - + + def read_extraction(self,DonorDiscordant_Sites_that_are_atributed_to_other_donor,expected_vars_norm,cell_vars_norm): + # we need this function wrapper to calculate the concordant, discordant read + # counts for each of the discordant sites that are concordant with another donor. - try: - Out_of_Cohort__sites = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) - except: - Out_of_Cohort__sites = set() - - try: - Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) - set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) - shared_sites_between_out_of_cohort_and_within_cohort = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()).intersection(set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) ) - except: - Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() - shared_sites_between_out_of_cohort_and_within_cohort = set() - - try: - Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) - set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) - except: - Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) + Total_Overlapping_sites = set(DonorDiscordant_Sites_that_are_atributed_to_other_donor) + expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] + + cell_vars2 = cell_vars2.drop_duplicates(subset=['ids']) + expected_vars2 = expected_vars2.drop_duplicates(subset=['ids']) + cell_vars2['DP'] = cell_vars2[0].str.split("_").str[5].astype(int) + cell_vars2['AD'] = cell_vars2[0].str.split("_").str[6].astype(int) + cell_vars2['OTH'] = cell_vars2[0].str.split("_").str[7].astype(int) + total_reads,_,_,discordant_reads, \ + discordantRead_ratio,discordant_reads_noHet,total_reads_noHet,discordantRead_ratio_noHet= self.read_concordance_calc(expected_vars2,cell_vars2) + concordant_reads = total_reads - discordant_reads + concordant_reads_noHet = total_reads_noHet-discordant_reads_noHet + return total_reads,discordant_reads,concordant_reads,\ + total_reads_noHet,discordant_reads_noHet,concordant_reads_noHet + - 
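For clarity, the read_extraction helper added above boils down to: subset the expected-genotype and cell-level tables to the shared site ids, keep one record per site, parse the depth (DP), alternate-allele (AD) and other-allele (OTH) counts from fields 5-7 of the underscore-delimited cellSNP record, and report concordant reads as total reads minus discordant reads. The following is a minimal standalone sketch of that bookkeeping; the field positions mirror the patch, but the simplified per-site discordance rule is an illustrative assumption, since the real per-read comparison against the donor's expected genotype is done by read_concordance_calc.

    import pandas as pd

    def parse_read_counts(cell_vars: pd.DataFrame) -> pd.DataFrame:
        # column 0 holds the underscore-delimited cellSNP record; fields 5-7 are DP, AD, OTH
        parts = cell_vars[0].str.split("_")
        out = cell_vars.copy()
        out["DP"] = parts.str[5].astype(int)
        out["AD"] = parts.str[6].astype(int)
        out["OTH"] = parts.str[7].astype(int)
        return out

    def tally_reads(site_ids, cell_vars_norm, discordant_site_ids):
        # keep one record per shared site, as read_extraction does with drop_duplicates
        cell = cell_vars_norm[cell_vars_norm["ids"].isin(set(site_ids))].drop_duplicates(subset=["ids"])
        cell = parse_read_counts(cell)
        total_reads = int(cell["DP"].sum())
        # toy rule for illustration: every read at a site already flagged discordant counts as discordant
        discordant_reads = int(cell.loc[cell["ids"].isin(set(discordant_site_ids)), "DP"].sum())
        return total_reads, discordant_reads, total_reads - discordant_reads
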
Out_of_Cohort__unique_sites_total_reads,_,_,_,_,_ = self.read_extraction(Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool,expected_vars_norm,cell_vars_norm) - - try: - Whithin_Cohort__sites = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) - except: - Whithin_Cohort__sites = set() - +class Donor(Concordances): + def __init__(self, donor_id,informative_sites,uninformative_sites,donor_assignments_table,cell_assignments_table,exclusive_cell_variants,donor_distinct_sites,pool): + self.donor_concordance_table = {} + self.other_donor_concordance = [] + self.pool = pool + self.donor_id = donor_id + self.informative_sites = informative_sites + self.uninformative_sites = uninformative_sites + self.donor_assignments_table=donor_assignments_table + self.cell_assignments_table=cell_assignments_table + self.exclusive_don_variants=exclusive_don_variants + self.exclusive_cell_variants=exclusive_cell_variants + self.donor_distinct_sites=donor_distinct_sites - - Total__discordant_sites_that_are_concordant_with_other_donors_in_pool = Whithin_Cohort__sites.union(Out_of_Cohort__sites) - concordant_vars_in_pool_str = (";").join(concordant_vars) - DF = pd.DataFrame(donor_table_of_concordances) + def combine_results(self,result): + donor_concordance_table=result[0] + other_donor_concordance_table=result[1] + # other_donor_concordance_table + other_donor_concordances + self.other_donor_concordance = self.other_donor_concordance + other_donor_concordance_table + self.donor_concordance_table = {**self.donor_concordance_table, **donor_concordance_table} - Donor_With_Lowest_DisConcordance = ';'.join(DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['donor'].values) - Lowest_Disconcordance_value_in_all_donors= DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['discordant_percent_in_other_donor'].values[0] - - Donor_With_Highest_Concordance = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['donor'].values) - Highest_Concordance_value_in_all_donors= DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['concordant_percent_in_other_donor'].values[0] - Total_sites_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_sites_otherDonor'].astype(str).values) - Total_reads_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_reads_otherDonor'].astype(str).values) - - return [{ - 'cell1':cell1, - 'donor_gt_match':donor_gt_match, - 'Nr_Concordant':Nr_Concordant, - 'Nr_Discordant':Nr_Discordant, - 'Nr_Relaxed_concordant':Nr_Relaxed_concordant, - 'true_discordant_count':true_discordant_count, - 'relaxed_concordant_informative_count':relaxed_concordant_informative_count, - 'relaxed_concordant_uninformative_count':relaxed_concordant_uninformative_count, - 'true_discordant_informative_count':true_discordant_informative_count, - 'true_discordant_uninformative_count':true_discordant_uninformative_count, - 'Nr_Total_Overlapping_sites':Nr_Total_Overlapping_sites, - 'Number_of_sites_that_are_donor_concordant_and_exclusive':Number_of_sites_that_are_donor_concordant_and_exclusive, - 'Nr_donor_distinct_sites':Nr_donor_distinct_sites, - 'count':count, - 'discordant_sites':discordant_sites, - 'total_sites':total_sites, - 
'informative_sites':informative_sites, - 'uninformative_sites':uninformative_sites, - 'total_reads_informative':total_reads_informative, - 'total_reads_uninformative':total_reads_uninformative, - 'discordant_reads_informative':discordant_reads_informative, - 'discordant_reads_uninformative':discordant_reads_uninformative, - 'Discordant_sites_in_pool': discordant_vars, - 'Lowest_Disconcordance_value_in_all_donors':Lowest_Disconcordance_value_in_all_donors, - 'Donor_With_Lowest_DisConcordance':Donor_With_Lowest_DisConcordance, - 'Concordant_Site_Identities':concordant_vars_in_pool_str, - 'Donor_With_Highest_Concordance':Donor_With_Highest_Concordance, - 'Highest_Concordance_value_in_all_donors':Highest_Concordance_value_in_all_donors, - 'Total_sites_other_donor':Total_sites_other_donor, - 'Total_reads_other_donor':Total_reads_other_donor, - 'total_discordant_sites_that_are_concordant_with_other_donors_in_pool':f"{len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool)}/{len(discordant_vars)}", - 'informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool':f"{len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool)}/{len(true_discordant_informative_count)}", - 'total_reads':total_reads, - 'discordant_reads':discordant_reads, - 'discordant_reads_informative_subset':discordant_reads_informative_subset, \ - 'discordant_read_fraction_in_concordant_sites':discordant_read_fraction_in_concordant_sites, \ - 'discordant_read_fraction_in_discordant_sites':discordant_read_fraction_in_discordant_sites, \ - 'Whithin_Cohort__total_number_of_potential_contaminent_reads':Whithin_Cohort__total_number_of_potential_contaminent_reads, \ - 'Out_of_Cohort__total_number_of_potential_contaminent_reads':Out_of_Cohort__total_number_of_potential_contaminent_reads, \ - 'NrDonors_contributing_to_out_of_cohort':len(set(donors_contributing_to_out_of_cohort)), \ - 'NrDonors_contributing_to_Whithin_Cohort':len(set(donors_contributing_to_Whithin_Cohort)), \ - - 'Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool, \ - 'Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool, \ - 'shared_sites_between_Out_of_Cohort_and_Within_Cohort':len(shared_sites_between_out_of_cohort_and_within_cohort), \ - 'Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool), \ - 'Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool), \ - 'Out_of_Cohort__unique_sites_total_reads':Out_of_Cohort__unique_sites_total_reads, \ - 'Total__discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Total__discordant_sites_that_are_concordant_with_other_donors_in_pool) - }, donor_table_of_concordances] + def analyse_cells(self,chunk_df,exclusive_cell_variants,expected_vars_norm,donor_gt_match,vars_per_donor_gt,donor_cohorts,all_donor_data,donor_assignments_table,donor_gt_match_cohort): + donor_concordance_table = {} + other_donor_concordance_table = [] + count=0 + for cell1 in chunk_df: + count+=1 + cell_vars = exclusive_cell_variants[cell1] + # try: + result1, other_donor_concordances = 
self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data,donor_assignments_table) + donor_concordance_table,other_donor_concordance_table = self.append_results_cell_concordances(result1,donor_concordance_table,other_donor_concordances,other_donor_concordance_table) + # except: + # continue + return [donor_concordance_table, other_donor_concordance_table] + def analyse_donor(self,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,all_donor_data,expected_vars_norm,donor_assignments_table,exclusive_cell_variants): + n=200 + print(f'-- Using {cpus} cpus ---') + pool = mp.Pool(cpus) + # Cells_to_keep_pre = Cells_to_keep_pre[:13] + list_df = [Cells_to_keep_pre[i:i+n] for i in range(0,len(Cells_to_keep_pre),n)] + # results = [] + for chunk_df in list_df: + pool.apply_async(self.analyse_cells,args =([chunk_df,exclusive_cell_variants,expected_vars_norm,donor_gt_match,vars_per_donor_gt,donor_cohorts,all_donor_data,donor_assignments_table,donor_gt_match_cohort]),callback=self.combine_results) + # results.append(result) + # while True: + # time.sleep(1) + # # catch exception if results are not ready yet + # try: + # ready = [result.ready() for result in results] + # successful = [result.successful() for result in results] + # except Exception: + # continue + # # exit loop if all tasks returned success + # if all(successful): + # break + # # raise exception reporting exceptions received from workers + # if all(ready) and not all(successful): + # raise Exception(f'Workers raised following exceptions {[result._value for result in results if not result.successful()]}') + # self.pool.join() + # try: + # [result.wait() for result in results] + # except: + # print('done') + # r1,r2 = self.analyse_cells(chunk_df,exclusive_cell_variants,expected_vars_norm,donor_gt_match,vars_per_donor_gt,donor_cohorts,all_donor_data,donor_assignments_table,donor_gt_match_cohort) + # self.combine_results(r1,r2) + print('Done') + pool.close() + pool.join() + return self.donor_concordance_table,self.other_donor_concordance,donor_gt_match,self.donor_id class VCF_Loader: diff --git a/bin/gather_minimal_dataset.py b/bin/gather_minimal_dataset.py index 3a6cd6b..4349485 100755 --- a/bin/gather_minimal_dataset.py +++ b/bin/gather_minimal_dataset.py @@ -413,6 +413,9 @@ def gather_donor(donor_id, ad, ad_lane_raw, azimuth_annot, qc_obs, columns_outpu ad.obs = ad.obs.loc[:,~ad.obs.columns.duplicated()] if write_h5: path1=os.path.join(outdir, oufnam + '.h5ad') + ad.obs['qc.filter.pass.AZ:L0'] = ad.obs['qc.filter.pass.AZ:L0'].astype('bool') + ad.obs['cell_passes_hard_filters'] = ad.obs['cell_passes_hard_filters'].astype('bool') + ad.obs['qc.filter.pass'] = ad.obs['qc.filter.pass'].astype('bool') ad.write(path1,compression='gzip') return { diff --git a/conf/base.conf b/conf/base.conf index 249dc9f..98bb86d 100755 --- a/conf/base.conf +++ b/conf/base.conf @@ -9,7 +9,7 @@ */ params{ - input = 'cellbender' + input = 'cellbender' //# cellbender|cellranger rsync_to_web_file = "${launchDir}/yascp/bin/rsync_to_web.sh" profile = 'normal_run' citeseq = false @@ -59,11 +59,12 @@ params{ cluster_validate_resolution_keras = true input_tables_column_delimiter = '\t' outdir= "${launchDir}/results" + tracedir = "${params.outdir}/pipeline_info" do_deconvolution = true split_bam = false run_multiplet = true utilise_gpu = true - split_ad_per_bach = false + split_ad_per_bach = true cellbender_resolution_to_use='0pt1' 
reference_assembly_fasta_dir = "https://yascp.cog.sanger.ac.uk/public/10x_reference_assembly" webtransfer = false @@ -239,246 +240,31 @@ process { withLabel:process_medium_memory { memory = { 30.GB * task.attempt } } - - withName: SCRUBLET{ - maxRetries = 3 - errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } - } - - - withName: DOUBLET_DECON{ - maxRetries = 3 - errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } - memory = { 100.GB * task.attempt } - } - - withName: AZIMUTH{ - maxRetries = 3 - errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } - } - withName: CELLTYPIST{ - maxRetries = 3 - errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } - } - withName: cellbender__remove_background{ - maxRetries = 2 - //# errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } - } - - withLabel:error_ignore { - errorStrategy = 'ignore' - } - withLabel:error_retry { - errorStrategy = 'retry' - maxRetries = 2 - } - - withName:cluster_validate_resolution_keras{ - maxForks=4 + withName: ASSESS_CALL_RATE{ maxRetries = 3 - memory = { 60.GB * task.attempt } - time = { 12.h * task.attempt } + memory = { 10.GB * task.attempt } errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } } - withName: CELLTYPIST{ - maxForks=7 - } - - withName: CELLTYPE_FILE_MERGE{ - memory = { 60.GB * task.attempt } - } - - withName: NORMALISE_AND_PCA{ - maxForks=7 - errorStrategy = 'retry' - memory = { 50.GB * task.attempt} - maxRetries = 8 - cpus = 4 - } - - withName: LISI{ - maxForks=7 - memory =300.GB - } - - withName: RESOLVE_POOL_VCFS{ - cpus = 1 - memory = { 1.GB * task.attempt } - time = { 12.h * task.attempt } - maxRetries = 3 - } - - withName: SPLIT_BATCH_H5AD{ - cpus = 2 - memory = { 25.GB * task.attempt * 0.5} - time = { 12.h * task.attempt } - maxRetries = 3 - } - - withName: SUBSET_GENOTYPE2{ - cpus = 2 - memory = { 1.GB * task.attempt } - time = { 12.h * task.attempt } - maxRetries = 3 - } - - withName: JOIN_STUDIES_MERGE{ - cpus = 1 - memory = { 20.GB * task.attempt } - time = { 12.h * task.attempt } - maxRetries = 3 - } - - withName: FREEBAYES{ - cpus = 1 - time = { 12.h * task.attempt } - maxRetries = 2 - } - - withName: VIREO_ADD_SAMPLE_PREFIX{ - cpus = 1 - memory = { 2.GB * task.attempt } - time = { 12.h * task.attempt } - maxRetries = 3 - } - - - withName: REPLACE_GT_DONOR_ID2{ - cpus = 1 - memory = { 1.GB * task.attempt } - time = 12.h - maxRetries = 3 - } - - withName: JOIN_CHROMOSOMES{ - cpus = 1 - memory = { 2.GB * task.attempt } - time = 12.h - maxRetries = 3 - } - - withName: serialize_known_markers{ - cpus = 1 - memory = { 1.GB * task.attempt } - time = { 12.h * task.attempt } - maxRetries = 3 - } - - withName: OUTLIER_FILTER{ - errorStrategy = 'retry' - memory = { 50.GB * task.attempt} - maxRetries = 8 - } - - - - withName: cluster{ - cpus = { 3 * task.attempt } - } - - withName: LISI{ - maxForks=7 - errorStrategy = 'retry' - maxRetries = 8 - memory = { 200.GB * task.attempt} - } - - withName: VIREO_GT_FIX_HEADER{ - errorStrategy = 'retry' - maxRetries = 4 - cpus = 1 - memory = { 1.GB * task.attempt } - } - - - withName: JOIN_CHROMOSOMES{ - errorStrategy = 'retry' - maxRetries = 4 - } - - withName: cluster{ - errorStrategy = 'retry' - maxRetries = 4 - } - - withName: SPLIT_BAM_BY_CELL_BARCODES { - cpus = 1 - memory = { 8.GB * task.attempt} - time = 4.h - } - - withName: CONCORDANCE_CALCLULATIONS{ - cpus = 5 - time = { 12.h * task.attempt } - memory = { 70.GB * task.attempt } - } - - withName: OTHER_DONOR_CONCORDANCE_CALCLULATIONS{ - cpus = 3 - time = { 6.h * task.attempt } - 
memory = { 20.GB * task.attempt } - } - - - withName: CELLSNP{ - memory = { 5.GB * task.attempt } - } - - withName: DYNAMIC_DONOR_EXCLUSIVE_SNP_SELECTION{ - cpus = 5 - time = { 12.h * task.attempt } - memory = { 20.GB * task.attempt } - } - - withName: prep_collectmetadata{ - memory = { 150.MB * task.attempt } - } - - withName: VIREO{ - //# maxForks=7 - cpus = 5 - time = { 12.h * task.attempt } - memory = { 70.GB * task.attempt } - } - withName: DSB_INTEGRATE{ - memory = { 200.GB * task.attempt } - cpus = { 4 * task.attempt } - maxRetries = 3 - } - - withName: MULTIMODAL_INTEGRATION{ - memory = { 200.GB * task.attempt } - cpus = { 4 * task.attempt } - maxRetries = 3 - } - - withName: umap_gather{ - memory = { 100.GB * task.attempt } - errorStrategy = 'retry' - maxRetries = 3 - } - - withName: DOUBLET_FINDER{ - memory = { 100.GB * task.attempt } - } - - - - withName: GT_MATCH_POOL_AGAINST_PANEL{ - time = { 24.h * task.attempt } - } - - withName: plot_predicted_sex{ - memory = { 50.GB * task.attempt } - maxRetries = 5 - cpus = 2 - - } - - +} +def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') +timeline { + enabled = true + file = "${params.tracedir}/execution_timeline_${trace_timestamp}.html" +} +report { + enabled = true + file = "${params.tracedir}/execution_report_${trace_timestamp}.html" +} +trace { + enabled = true + file = "${params.tracedir}/execution_trace_${trace_timestamp}.txt" +} +dag { + enabled = true + file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.svg" } singularity { diff --git a/conf/cellbender.conf b/conf/cellbender.conf index 636dd16..a8449ec 100755 --- a/conf/cellbender.conf +++ b/conf/cellbender.conf @@ -59,8 +59,6 @@ params { value{ expected_nemptydroplets_umi_cutoff = 0 method_estimate_ncells = 'dropletutils::barcoderanks::inflection' - //method_estimate_ncells = 'cellrangerv2::expected' //this method feeds in the cellranger estimate ncells - //method_estimate_ncells = 'expected' lower_bound_umis_estimate_ncells = 1000 method_estimate_nemptydroplets = 'dropletutils::barcoderanks::inflection,dropletutils::barcoderanks::knee,0.33' lower_bound_umis_estimate_nemptydroplets = 10 diff --git a/conf/modules.conf b/conf/modules.conf index 1885d64..0df8fa4 100755 --- a/conf/modules.conf +++ b/conf/modules.conf @@ -52,4 +52,250 @@ params { } } + + withName: SCRUBLET{ + maxRetries = 3 + errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } + } + + + withName: DOUBLET_DECON{ + maxRetries = 3 + errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } + memory = { 100.GB * task.attempt } + } + + withName: AZIMUTH{ + maxRetries = 3 + errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } + } + withName: CELLTYPIST{ + maxRetries = 3 + errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } + } + withName: cellbender__remove_background{ + maxRetries = 2 + //# errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } + } + + withLabel:error_ignore { + errorStrategy = 'ignore' + } + withLabel:error_retry { + errorStrategy = 'retry' + maxRetries = 2 + } + + withName:cluster_validate_resolution_keras{ + maxForks=4 + maxRetries = 3 + memory = { 60.GB * task.attempt } + time = { 12.h * task.attempt } + errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } + } + + withName: CELLTYPIST{ + maxForks=7 + } + + withName: ASSESS_CALL_RATE{ + maxRetries = 3 + memory = { 10.GB * task.attempt } + errorStrategy = { task.attempt > 2 ? 
'ignore' : 'retry' } + } + + + withName: CELLTYPE_FILE_MERGE{ + memory = { 60.GB * task.attempt } + } + + withName: NORMALISE_AND_PCA{ + maxForks=7 + errorStrategy = 'retry' + memory = { 50.GB * task.attempt} + maxRetries = 8 + cpus = 4 + } + + withName: LISI{ + maxForks=7 + memory =300.GB + } + + withName: RESOLVE_POOL_VCFS{ + cpus = 1 + memory = { 1.GB * task.attempt } + time = { 12.h * task.attempt } + maxRetries = 3 + } + + withName: SPLIT_BATCH_H5AD{ + cpus = 2 + memory = { 25.GB * task.attempt * 0.5} + time = { 12.h * task.attempt } + maxRetries = 3 + } + + withName: SUBSET_GENOTYPE2{ + cpus = 2 + memory = { 1.GB * task.attempt } + time = { 12.h * task.attempt } + maxRetries = 3 + } + + withName: JOIN_STUDIES_MERGE{ + cpus = 1 + memory = { 20.GB * task.attempt } + time = { 12.h * task.attempt } + maxRetries = 3 + } + + withName: FREEBAYES{ + cpus = 1 + time = { 12.h * task.attempt } + maxRetries = 2 + } + + withName: VIREO_ADD_SAMPLE_PREFIX{ + cpus = 1 + memory = { 2.GB * task.attempt } + time = { 12.h * task.attempt } + maxRetries = 3 + } + + + withName: REPLACE_GT_DONOR_ID2{ + cpus = 1 + memory = { 1.GB * task.attempt } + time = 12.h + maxRetries = 3 + } + + withName: JOIN_CHROMOSOMES{ + cpus = 1 + memory = { 2.GB * task.attempt } + time = 12.h + maxRetries = 3 + } + + withName: serialize_known_markers{ + cpus = 1 + memory = { 1.GB * task.attempt } + time = { 12.h * task.attempt } + maxRetries = 3 + } + + withName: OUTLIER_FILTER{ + errorStrategy = 'retry' + memory = { 50.GB * task.attempt} + maxRetries = 8 + } + + + + withName: cluster{ + cpus = { 3 * task.attempt } + } + + withName: LISI{ + maxForks=7 + errorStrategy = 'retry' + maxRetries = 8 + memory = { 200.GB * task.attempt} + } + + withName: VIREO_GT_FIX_HEADER{ + errorStrategy = 'retry' + maxRetries = 4 + cpus = 1 + memory = { 1.GB * task.attempt } + } + + + withName: JOIN_CHROMOSOMES{ + errorStrategy = 'retry' + maxRetries = 4 + } + + withName: cluster{ + errorStrategy = 'retry' + maxRetries = 4 + } + + withName: SPLIT_BAM_BY_CELL_BARCODES { + cpus = 1 + memory = { 8.GB * task.attempt} + time = 4.h + } + + withName: CONCORDANCE_CALCLULATIONS{ + cpus = 20 + time = { 24.h * task.attempt } + memory = { 100.GB * task.attempt } + } + + withName: OTHER_DONOR_CONCORDANCE_CALCLULATIONS{ + cpus = 3 + time = { 6.h * task.attempt } + memory = { 20.GB * task.attempt } + } + + + withName: CELLSNP{ + memory = { 5.GB * task.attempt } + } + + withName: DYNAMIC_DONOR_EXCLUSIVE_SNP_SELECTION{ + cpus = 5 + time = { 12.h * task.attempt } + memory = { 20.GB * task.attempt } + } + + withName: prep_collectmetadata{ + memory = { 150.MB * task.attempt } + } + + withName: VIREO{ + //# maxForks=7 + cpus = 5 + time = { 12.h * task.attempt } + memory = { 70.GB * task.attempt } + } + withName: DSB_INTEGRATE{ + memory = { 200.GB * task.attempt } + cpus = { 4 * task.attempt } + maxRetries = 3 + } + + withName: MULTIMODAL_INTEGRATION{ + memory = { 200.GB * task.attempt } + cpus = { 4 * task.attempt } + maxRetries = 3 + } + + withName: umap_gather{ + memory = { 100.GB * task.attempt } + errorStrategy = 'retry' + maxRetries = 3 + } + + withName: DOUBLET_FINDER{ + memory = { 100.GB * task.attempt } + } + + + + withName: GT_MATCH_POOL_AGAINST_PANEL{ + time = { 24.h * task.attempt } + } + + withName: plot_predicted_sex{ + memory = { 50.GB * task.attempt } + maxRetries = 5 + cpus = 2 + + } + + + } diff --git a/conf/test.conf b/conf/test.conf index 6b947ed..185ca42 100755 --- a/conf/test.conf +++ b/conf/test.conf @@ -23,7 +23,10 @@ params { posterior_assignment 
= false //if this is set to true, we will perform the genotype donor matching after the deconvolution is performed. subset_genotypes = false tsv_donor_panel_vcfs = "https://yascp.cog.sanger.ac.uk/public/test_datasets/full_test_dataset/vcf_inputs_v2.tsv" //this is a panel of vcf files that we want to compar the genotypes with + ZSCORE_THRESH = 8 //# Minimum z0 threshold required to call the gt assignment confident. + ZSCORE_DIST_THRESH = 8 //# Minimum bifference between z0 and z1 to call the assignment confident, } + hard_filters_file = "${projectDir}/sample_input/sample_qc.yml" //this file defilnes what hard filters we want to use to flag/drop the cells //# For the training purposes we reduce the cellbender epochs and learning rate as this step takes a long time to compute. diff --git a/conf/test_full.conf b/conf/test_full.conf index 6b947ed..9b6150a 100755 --- a/conf/test_full.conf +++ b/conf/test_full.conf @@ -23,6 +23,9 @@ params { posterior_assignment = false //if this is set to true, we will perform the genotype donor matching after the deconvolution is performed. subset_genotypes = false tsv_donor_panel_vcfs = "https://yascp.cog.sanger.ac.uk/public/test_datasets/full_test_dataset/vcf_inputs_v2.tsv" //this is a panel of vcf files that we want to compar the genotypes with + ZSCORE_THRESH = 8 //# Minimum z0 threshold required to call the gt assignment confident. + ZSCORE_DIST_THRESH = 8 //# Minimum bifference between z0 and z1 to call the assignment confident, + } hard_filters_file = "${projectDir}/sample_input/sample_qc.yml" //this file defilnes what hard filters we want to use to flag/drop the cells diff --git a/main.nf b/main.nf index 93fc1f0..3cecc35 100755 --- a/main.nf +++ b/main.nf @@ -38,7 +38,7 @@ include {collect_file} from "$projectDir/modules/nf-core/modules/collect_file/ma include { CELLSNP;capture_cellsnp_files } from "$projectDir/modules/nf-core/modules/cellsnp/main" - +// Channel.of(params.outdir).mkdirs() ////// WORKFLOW: Run main nf-core/yascp analysis pipeline // This is the default entry point, we have others to update ceirtain parts of the results. 
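The ZSCORE_THRESH and ZSCORE_DIST_THRESH parameters introduced in the test profiles above gate how confidently a genotype (GT) assignment is called, where z0 and z1 are presumably the z-scores of the best and second-best matching donors: an assignment is treated as confident only when z0 clears ZSCORE_THRESH and the margin z0 - z1 clears ZSCORE_DIST_THRESH. The sketch below shows how such a rule is typically applied; the function name and score-table layout are illustrative assumptions, not the pipeline's actual GT-matching code.

    import numpy as np

    def is_confident_assignment(scores_by_donor, zscore_thresh=8.0, zscore_dist_thresh=8.0):
        """Illustrative only: decide whether the top GT match is confident.

        scores_by_donor: dict mapping candidate donor id -> raw match score;
        at least two donors with non-identical scores are assumed.
        """
        donors = list(scores_by_donor)
        raw = np.array([scores_by_donor[d] for d in donors], dtype=float)
        z = (raw - raw.mean()) / raw.std()        # z-score across all candidate donors
        order = np.argsort(z)[::-1]
        z0, z1 = z[order[0]], z[order[1]]         # best and runner-up
        confident = (z0 >= zscore_thresh) and ((z0 - z1) >= zscore_dist_thresh)
        return donors[order[0]], confident

With both thresholds set to 8 in these test profiles, only a match that stands far apart from every other candidate donor would be flagged as confident under a rule like this.
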
diff --git a/modules/nf-core/modules/citeseq/main.nf b/modules/nf-core/modules/citeseq/main.nf index 70edf6e..272029a 100755 --- a/modules/nf-core/modules/citeseq/main.nf +++ b/modules/nf-core/modules/citeseq/main.nf @@ -26,7 +26,7 @@ process SPLIT_CITESEQ_GEX { tuple val(sample_name), path("${sample_name}__Gene_Expression"), emit:gex_data tuple val(sample_name), path("antibody-${sample_name}.h5ad"), emit: ab_data2 optional true tuple val(sample_name), path("Gene_Expression-${sample_name}.h5ad"), emit: gex_h5ad optional true - tuple val(sample_name), path("${sample_name}__*"), emit: ab_data + tuple val(sample_name), path("${sample_name}__*"), emit: ab_data optional true tuple val(sample_name), path("${sample_name}__Gene_Expression/barcodes.tsv.gz"), path("${sample_name}__Gene_Expression/features.tsv.gz"), path("${sample_name}__Gene_Expression/matrix.mtx.gz"), emit: channel__file_paths_10x script: @@ -166,6 +166,8 @@ process VDJ_INTEGRATION{ output: path("*all_samples_integrated.vdj.RDS"), emit: all_data_integrated_vdj_rds + path("*all_samples_integrated.BCR.RDS"), emit: all_data_integrated_BCR_rds + path("*all_samples_integrated.TCR.RDS"), emit: all_data_integrated_TCR_rds input: path(all_cellranger_samples) diff --git a/modules/nf-core/modules/doubletdecon/main.nf b/modules/nf-core/modules/doubletdecon/main.nf index 0e9ef94..b50bc01 100644 --- a/modules/nf-core/modules/doubletdecon/main.nf +++ b/modules/nf-core/modules/doubletdecon/main.nf @@ -1,4 +1,4 @@ -process DOUBLET_DECON { +process DOUBLET_DECON{ tag "${experiment_id}" label 'process_medium' diff --git a/modules/nf-core/modules/prepere_yascp_inputs/main.nf b/modules/nf-core/modules/prepere_yascp_inputs/main.nf index a83c72c..ebabfe1 100644 --- a/modules/nf-core/modules/prepere_yascp_inputs/main.nf +++ b/modules/nf-core/modules/prepere_yascp_inputs/main.nf @@ -16,6 +16,7 @@ process YASCP_INPUTS { input: path(input_file) + // the output for this is a correct format input files as per cb 6.1 output: path("input_file_corectly_formatted.tsv"), emit: input_file_corectly_formatted diff --git a/nextflow.config b/nextflow.config index 572aa8b..2378616 100755 --- a/nextflow.config +++ b/nextflow.config @@ -25,8 +25,8 @@ params { max_multiqc_email_size = '25.MB' // Boilerplate options - outdir = output_dir = "${launchDir}/results" - tracedir = "${params.outdir}/pipeline_info" + outdir = "${launchDir}/results" + publish_dir_mode = 'copy' email = null email_on_fail = null @@ -38,7 +38,8 @@ params { schema_ignore_params = 'genomes,modules' enable_conda = false singularity_pull_docker_container = false - + tracedir = "${params.outdir}/pipeline_info" + // Config options custom_config_version = 'master' custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" @@ -78,13 +79,6 @@ includeConfig 'conf/qc.conf' includeConfig 'conf/modules.conf' -// Load igenomes.config if required -if (!params.igenomes_ignore) { - includeConfig 'conf/igenomes.conf' -} else { - params.genomes = [:] -} - profiles { debug { process.beforeScript = 'echo $HOSTNAME' } conda { diff --git a/subworkflows/celltype.nf b/subworkflows/celltype.nf index 7966b7a..4e611ad 100755 --- a/subworkflows/celltype.nf +++ b/subworkflows/celltype.nf @@ -70,9 +70,9 @@ workflow celltype{ }else{ sc_out = Channel.of() } - all_extra_fields = all_extra_fields.mix(sc_out) + all_extra_fields2 = all_extra_fields.mix(sc_out) - CELLTYPE_FILE_MERGE(az_out,ct_out,all_extra_fields,SPLIT_BATCH_H5AD.out.keras_outfile.collect()) + 
CELLTYPE_FILE_MERGE(az_out,ct_out,all_extra_fields2,SPLIT_BATCH_H5AD.out.keras_outfile.collect()) file__anndata_merged2=CELLTYPE_FILE_MERGE.out.file__anndata_merged2 file__anndata_merged2.view() diff --git a/subworkflows/local/retrieve_recourses.nf b/subworkflows/local/retrieve_recourses.nf index 6a48228..505b1c3 100755 --- a/subworkflows/local/retrieve_recourses.nf +++ b/subworkflows/local/retrieve_recourses.nf @@ -64,4 +64,22 @@ process RETRIEVE_RECOURSES_TEST_DATASET{ sed -i 's#/path/to/replace/pointing/to/downloaded/files#${params.outdir}/recourses/full_test_dataset#' input_test_data_file.tsv touch Done > Done.tmp """ +} + + +process STAGE_FILE{ + label 'process_tiny' + // In nf there is a function collectFile - however if you provide a symlinked file directory tusing nf function will overwrite the source instead of replacing the file + // This snipped is a replication of the function but as a nf module and hence the problem is avoided. + // publishDir "${params.outdir}/recourses", mode: "${params.copy_mode}", overwrite: true + input: + path(file) + output: + path(file) + + script: + + """ + echo 'staged' + """ } \ No newline at end of file diff --git a/subworkflows/main_deconvolution.nf b/subworkflows/main_deconvolution.nf index 08fd451..25828bf 100755 --- a/subworkflows/main_deconvolution.nf +++ b/subworkflows/main_deconvolution.nf @@ -17,7 +17,7 @@ include { match_genotypes } from './match_genotypes' include {ENHANCE_STATS_GT_MATCH } from "$projectDir/modules/nf-core/modules/genotypes/main" include {SUBSET_WORKF} from "$projectDir/modules/nf-core/modules/subset_genotype/main" include {REPLACE_GT_DONOR_ID2; REPLACE_GT_DONOR_ID2 as REPLACE_GT_DONOR_ID_SUBS } from "$projectDir/modules/nf-core/modules/genotypes/main" -include { RETRIEVE_RECOURSES } from "$projectDir/subworkflows/local/retrieve_recourses" +include { RETRIEVE_RECOURSES; STAGE_FILE } from "$projectDir/subworkflows/local/retrieve_recourses" include {GT_MATCH_POOL_IBD } from "$projectDir/modules/nf-core/modules/genotypes/main" include { MATCH_GT_VIREO } from "$projectDir/modules/nf-core/modules/genotypes/main" @@ -52,7 +52,7 @@ workflow main_deconvolution { }else{ genome = "${params.reference_assembly_fasta_dir}" } - + vcf_candidate_snps = STAGE_FILE(params.cellsnp.vcf_candidate_snps) // genome.subscribe { println "genome: $it" } if (params.genotype_input.run_with_genotype_input) { @@ -73,7 +73,7 @@ workflow main_deconvolution { // This will subsequently result in a joint vcf file per cohort per donors listed for each of the pools that can be used in VIREO and/or GT matching algorythm. 
SUBSET_WORKF(ch_ref_vcf,donors_in_pools,'AllExpectedGT',genome) merged_expected_genotypes = SUBSET_WORKF.out.merged_expected_genotypes - merged_expected_genotypes2 = merged_expected_genotypes.combine(Channel.fromPath(params.cellsnp.vcf_candidate_snps)) + merged_expected_genotypes2 = merged_expected_genotypes.combine(vcf_candidate_snps) // merged_expected_genotypes2.subscribe { println "merged_expected_genotypes2: $it" } GT_MATCH_POOL_IBD(SUBSET_WORKF.out.samplename_subsetvcf_ibd,'Withing_expected','Expected') @@ -105,10 +105,10 @@ workflow main_deconvolution { if (params.provide_within_pool_donor_specific_sites_for_pilup){ cellsnp_with_npooled2 = cellsnp_with_npooled.combine(cellsnp_panels, by: 0) }else{ - cellsnp_with_npooled2 = cellsnp_with_npooled.combine(Channel.fromPath(params.cellsnp.vcf_candidate_snps)) + cellsnp_with_npooled2 = cellsnp_with_npooled.combine(vcf_candidate_snps) } }else{ - cellsnp_with_npooled2 = cellsnp_with_npooled.combine(Channel.fromPath(params.cellsnp.vcf_candidate_snps)) + cellsnp_with_npooled2 = cellsnp_with_npooled.combine(vcf_candidate_snps) } log.info('Capturing some of the existing CELLSNP files') diff --git a/subworkflows/prepare_inputs.nf b/subworkflows/prepare_inputs.nf index 69c438c..be61c90 100755 --- a/subworkflows/prepare_inputs.nf +++ b/subworkflows/prepare_inputs.nf @@ -15,6 +15,7 @@ workflow prepare_inputs { YASCP_INPUTS(channel_input_data_table) channel_input_data_table = YASCP_INPUTS.out.input_file_corectly_formatted + channel_input_data_table .splitCsv(header: true, sep: params.input_tables_column_delimiter) diff --git a/workflows/yascp.nf b/workflows/yascp.nf index 85492f5..22774af 100755 --- a/workflows/yascp.nf +++ b/workflows/yascp.nf @@ -57,7 +57,7 @@ workflow YASCP { bam_split_channel = Channel.of() out_ch = params.outdir ? 
Channel.fromPath(params.outdir, checkIfExists:true) - : Channel.fromPath("${launchDir}/${params.outdir}") + : Channel.from("${launchDir}/${params.outdir}") // out_ch.map{row->"${row[0]}/possorted_genome_bam.bam" } prepare_inputs(input_channel) From f38ed2215cbe4bbefa67ff04c1ce0d7e8fd5528a Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Fri, 3 May 2024 11:07:15 +0100 Subject: [PATCH 5/8] some latest changes --- bin/DoubletDecon.R | 57 +++++++++++++-------- bin/gather_minimal_dataset.py | 95 +++++++---------------------------- conf/modules.conf | 12 +++-- 3 files changed, 63 insertions(+), 101 deletions(-) diff --git a/bin/DoubletDecon.R b/bin/DoubletDecon.R index 238b72b..8a6d3ae 100755 --- a/bin/DoubletDecon.R +++ b/bin/DoubletDecon.R @@ -647,7 +647,7 @@ Main_Doublet_Decon<-function(rawDataFile, groupsFile, filename, location, fullDa ## Run Doublet Decon ## tryCatch({ - + print(paste0('trying rhop: ',args$rhop)) results <- Main_Doublet_Decon(rawDataFile = processed$newExpressionFile, groupsFile = processed$newGroupsFile, filename = "DoubletDecon_results", @@ -667,25 +667,42 @@ tryCatch({ nCores = args$nCores) }, error = function(e) { - - results <- Main_Doublet_Decon(rawDataFile = processed$newExpressionFile, - groupsFile = processed$newGroupsFile, - filename = "DoubletDecon_results", - location = paste0(args$out, "/"), - fullDataFile = NULL, - removeCC = args$removeCC, - species = args$species, - rhop = 0.64, - write = TRUE, - PMF = args$pmf, - useFull = FALSE, - heatmap = args$heatmap, - centroids=args$centroids, - num_doubs=args$num_doubs, - only50=args$only50, - min_uniq=args$min_uniq, - nCores = args$nCores) - + tryCatch({ + print(paste0('trying rhop: ',0.64)) + results <- Main_Doublet_Decon(rawDataFile = processed$newExpressionFile, + groupsFile = processed$newGroupsFile, + filename = "DoubletDecon_results", + location = paste0(args$out, "/"), + fullDataFile = NULL, + removeCC = args$removeCC, + species = args$species, + rhop = 0.64, + write = TRUE, + PMF = args$pmf, + useFull = FALSE, + heatmap = args$heatmap, + centroids=args$centroids, + num_doubs=args$num_doubs, + only50=args$only50, + min_uniq=args$min_uniq, + nCores = args$nCores) + }, error = function(e) { + print(paste0('trying rhop: ',0.5)) + results <- Main_Doublet_Decon(rawDataFile = processed$newExpressionFile, + groupsFile = processed$newGroupsFile, + filename = "DoubletDecon_results", + location = paste0('.', "/"), + fullDataFile = NULL, + removeCC = FALSE, + species = args$species, + rhop = 0.5, + write = TRUE, + PMF = args$pmf, + useFull = FALSE, + heatmap = args$heatmap, + nCores = 3) + } + ) } ) diff --git a/bin/gather_minimal_dataset.py b/bin/gather_minimal_dataset.py index 4349485..c54ee90 100755 --- a/bin/gather_minimal_dataset.py +++ b/bin/gather_minimal_dataset.py @@ -45,8 +45,8 @@ SCRUBLET_ASSIGNMENTS_FNSUFFIX = 'scrublet.tsv' COLUMNS_AZIMUTH = { - 'Azimuth:predicted.celltype.l2': 'azimuth.celltyp.l2', - 'Azimuth:predicted.celltype.l2.score': 'azimuth.pred.score.l2', + 'Azimuth:Azimuth:predicted.celltype.l2': 'azimuth.celltyp.l2', + 'Azimuth:Azimuth:predicted.celltype.l2.score': 'azimuth.pred.score.l2', 'Azimuth:mapping.score.celltype.l2': 'azimuth.map.score', } @@ -61,14 +61,14 @@ 'cell_passes_qc:score':'qc.filter.pass:score', 'cell_passes_qc-per:all_together::exclude':'qc.filter.pass.spikein_exclude', 'cell_passes_qc-per:all_together::exclude:score':'qc.filter.pass.spikein_exclude:score', - 'cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2':'qc.filter.pass.AZ:L0', - 
'cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2:score':'qc.filter.pass.AZ:L0:score', + 'cell_passes_qc-per:Azimuth:L0_Azimuth:predicted.celltype.l2':'qc.filter.pass.AZ:L0', + 'cell_passes_qc-per:Azimuth:L0_Azimuth:predicted.celltype.l2:score':'qc.filter.pass.AZ:L0:score', 'total_counts': 'qc.umi.count.total', 'total_counts_gene_group__mito_transcript': 'qc.umi.count.mt', 'pct_counts_gene_group__mito_transcript': 'qc.umi.perc.mt', 'n_genes_by_counts': 'qc.genes.detected.count', - 'Azimuth:L0_predicted.celltype.l2':'azimuth.celltyp.l0', - 'Azimuth:L1_predicted.celltype.l2':'azimuth.celltyp.l1', + 'Azimuth:L0_Azimuth:predicted.celltype.l2':'azimuth.celltyp.l0', + 'Azimuth:L1_Azimuth:predicted.celltype.l2':'azimuth.celltyp.l1', 'total_counts_gene_group__mito_transcript':'total_counts_gene_group__mito_transcript', 'pct_counts_gene_group__mito_transcript':'pct_counts_gene_group__mito_transcript', 'total_counts_gene_group__mito_protein':'total_counts_gene_group__mito_protein', @@ -224,49 +224,6 @@ def anndata_from_h5( return adata -def gather_azimuth_annotation(expid, datadir_azimuth, index_label = None): - # e.g. A4C06803ACD34DFB-adata_franke_Pilot_3_lane_3_predicted_celltype.tsv.gz - filpath = None - expid2=expid - fnsfx = '_{}{}'.format(expid, AZIMUTH_ASSIGNMENTS_FNSUFFIX) - for fn in os.listdir(datadir_azimuth): - if fn.endswith(fnsfx): - filpath = os.path.join(datadir_azimuth, fn) - break - if not filpath: - expid ='full' - fnsfx = '_{}{}'.format(expid, AZIMUTH_ASSIGNMENTS_FNSUFFIX) - for fn in os.listdir(datadir_azimuth): - if fn.endswith(fnsfx): - if fn.startswith('remapped'): - filpath = os.path.join(datadir_azimuth, fn) - break - if not filpath: - expid ='full' - fnsfx = '_{}{}'.format(expid, AZIMUTH_ASSIGNMENTS_FNSUFFIX) - for fn in os.listdir(datadir_azimuth): - if fn.endswith(fnsfx): - print(fn) - if not fn.startswith('remapped'): - filpath = os.path.join(datadir_azimuth, fn) - break - - if not filpath: - sys.exit("ERROR: could not find filename suffix '{}' in direcotry {}\n" - .format(fnsfx, datadir_azimuth)) - azt = pandas.read_table(filpath,index_col=0) - if (expid=='full'): - expid=expid2 - azt = azt[azt.index.str.contains(expid2)] - # filter to only the experiment id inputs. 
- df = get_df_from_mangled_index(azt, expid) - azt.insert(0, "barcode", df["barcode"]) - azt.insert(1, "donor", df["donor"]) - azt.insert(2, "experiment_id", expid) - if index_label is not None and index_label == "barcode": - azt.insert(0, "mangled_cell_id", df.index) - azt = azt.set_index("barcode", drop = True) - return azt def load_scrublet_assignments(expid, datadir_scrublet): filpath = None @@ -544,22 +501,12 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane except: metrics = pd.read_csv(f'/lustre/scratch123/hgi/projects/ukbb_scrna/pipelines/Pilot_UKB/qc/{args.experiment_name}/results_rsync2/results/handover/Summary_plots/{args.experiment_name}/Fetch Pipeline/CSV/Submission_Data_Pilot_UKB.file_metadata.tsv',sep='\t') metrics = metrics[metrics['Sample_id']==expid] + ############# - #Azimuth cell-type assignments + #Cell-type assignments ############# - datadir_azimuth = f'{args.results_dir}/celltype/azimuth' - for datadir_azimuth in glob.glob(f'{args.results_dir}/celltype/azimuth/*'): - if os.path.isdir(datadir_azimuth): - try: - azt = gather_azimuth_annotation( - expid, datadir_azimuth=datadir_azimuth, - index_label = 'barcode') - columns_output = {**columns_output, **COLUMNS_AZIMUTH} - except: - azt = None - else: - azt = None + azt = pd.read_csv(f'{args.results_dir}/celltype/All_Celltype_Assignments.csv',sep='\t',index_col=0) ########################## # Scrublet ######################### @@ -647,7 +594,7 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane chromium_channel = 'Run_ID not vailable' - Azimuth_Cell_Assignments_data = azt + def isNaN(num): return num!= num @@ -719,12 +666,7 @@ def isNaN(num): Cells_before_QC_filters=len(all_QC_lane.obs['cell_passes_qc']) Cells_passing_QC=len(all_QC_lane.obs[all_QC_lane.obs['cell_passes_qc']]) Cells_failing_QC=len(all_QC_lane.obs[all_QC_lane.obs['cell_passes_qc']==False]) - try: - Azimuth_Cell_Assignments_data=Azimuth_Cell_Assignments_data.set_index('mangled_cell_id') - all_QC_lane.obs['predicted celltype']=Azimuth_Cell_Assignments_data['predicted.celltype.l2'] - - except: - print('skipped az') + UMIS_mapped_to_mitochondrial_genes = sum(all_QC_lane.obs['total_counts_gene_group__mito_transcript']) UMIS_mapped_to_ribo_genes = sum(all_QC_lane.obs['total_counts_gene_group__ribo_protein']) UMIS_mapped_to_ribo_rna = sum(all_QC_lane.obs['total_counts_gene_group__ribo_rna']) @@ -819,8 +761,9 @@ def isNaN(num): data_donor_for_stats['cells failing QC'].append(Donor_cells_fails_qc) data_donor_for_stats['cells passing QC'].append(Donor_cells_passes_qc) - Donor_cell_assignments = Azimuth_Cell_Assignments_data.loc[Mengled_barcodes_donor] #for this have to figure out when the cell type is unasigned. - Cell_types_detected = len(set(Donor_cell_assignments['predicted.celltype.l2'])) + Donor_cell_assignments = azt.loc[Mengled_barcodes_donor] #for this have to figure out when the cell type is unasigned. 
+ # Donor_cell_assignments = Donor_cell_assignments.rename(COLUMNS_AZIMUTH,axis=1) + Cell_types_detected = len(set(Donor_cell_assignments['Azimuth:predicted.celltype.l2'])) Donor_UMIS_mapped_to_mitochondrial_genes = sum(Donor_qc_files.obs['total_counts_gene_group__mito_transcript']) Donor_UMIS_mapped_to_ribo_genes = sum(Donor_qc_files.obs['total_counts_gene_group__ribo_protein']) Donor_UMIS_mapped_to_ribo_rna = sum(Donor_qc_files.obs['total_counts_gene_group__ribo_rna']) @@ -833,8 +776,8 @@ def isNaN(num): Median_UMIs_per_cell= statistics.median(pd.DataFrame(Donor_qc_files.X.sum(axis=1))[0]) Cell_numbers = '' - for type1 in set(Donor_cell_assignments['predicted.celltype.l2']): - nr_cells_of_this_type = len(Donor_cell_assignments[Donor_cell_assignments['predicted.celltype.l2']==type1]) + for type1 in set(Donor_cell_assignments['Azimuth:predicted.celltype.l2']): + nr_cells_of_this_type = len(Donor_cell_assignments[Donor_cell_assignments['Azimuth:predicted.celltype.l2']==type1]) Cell_numbers+=f"{type1}:{nr_cells_of_this_type} ; " @@ -984,7 +927,7 @@ def isNaN(num): 'Tranche Pass/Fail':Tranche_Pass_Fail, 'Tranche Failure Reason':Tranche_Failure_Reason } - return fctr, data_tranche, data_donor,azt + return fctr, data_tranche, data_donor def set_argument_parser(): @@ -1064,7 +1007,7 @@ def set_argument_parser(): adqc.obs['tranche.id']=args.experiment_name try: - adqc.obs['cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2:score'] = adqc.obs['cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2:score'].astype(float,errors='ignore') + adqc.obs['cell_passes_qc-per:Azimuth:L0_Azimuth:predicted.celltype.l2:score'] = adqc.obs['cell_passes_qc-per:Azimuth:L0_Azimuth:predicted.celltype.l2:score'].astype(float,errors='ignore') except: _='no values associated' try: @@ -1082,7 +1025,7 @@ def set_argument_parser(): continue #Here no cells has passed the qc thresholds. nf, data_tranche, data_donor, azt = gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = oufh, lane_id=count,Resolution=Resolution) # add the stuff to the adata. - azt=azt.set_index('mangled_cell_id') + All_probs_and_celltypes=pd.concat([All_probs_and_celltypes,azt]) data_tranche_all.append(data_tranche) data_donor_all= data_donor_all+data_donor diff --git a/conf/modules.conf b/conf/modules.conf index 0df8fa4..bcb6f3a 100755 --- a/conf/modules.conf +++ b/conf/modules.conf @@ -53,6 +53,12 @@ params { } + + +} + + +process { withName: SCRUBLET{ maxRetries = 3 errorStrategy = { task.attempt > 2 ? 
'ignore' : 'retry' } @@ -283,8 +289,6 @@ params { memory = { 100.GB * task.attempt } } - - withName: GT_MATCH_POOL_AGAINST_PANEL{ time = { 24.h * task.attempt } } @@ -296,6 +300,4 @@ params { } - - -} +} \ No newline at end of file From cf87ee478e6acbfa79ec59ed514821f61b0f3e82 Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Fri, 10 May 2024 14:43:49 +0100 Subject: [PATCH 6/8] v1.6 nearly finished, needs some more trsting --- bin/0026-filter_outlier_cells.py | 414 ++++++++++-------- bin/gather_minimal_dataset.py | 15 +- bin/remap_azimuth_l2.py | 2 +- bin/scanpy_merge_from_h5ad.py | 311 +------------ bin/scanpy_split_h5ad.py | 48 +- conf/base.conf | 1 + conf/modules.conf | 8 +- conf/qc.conf | 4 +- modules/nf-core/modules/azimuth/main.nf | 17 +- modules/nf-core/modules/citeseq/main.nf | 15 +- .../nf-core/modules/outlier_filter/main.nf | 5 +- modules/nf-core/modules/vireo/main.nf | 2 +- subworkflows/celltype.nf | 25 +- subworkflows/main_deconvolution.nf | 8 +- subworkflows/qc.nf | 2 +- workflows/yascp.nf | 3 - 16 files changed, 319 insertions(+), 561 deletions(-) diff --git a/bin/0026-filter_outlier_cells.py b/bin/0026-filter_outlier_cells.py index 1b623b6..14b3623 100755 --- a/bin/0026-filter_outlier_cells.py +++ b/bin/0026-filter_outlier_cells.py @@ -12,6 +12,7 @@ from distutils.version import LooseVersion import numpy as np import scanpy as sc +import re import pandas as pd from sklearn.svm import OneClassSVM from sklearn.covariance import EllipticEnvelope @@ -58,6 +59,18 @@ def predict(self, X): return (np.sign(self.thresh) * (X - self.median - self.thresh*self.mad) > 0).any(axis=1) + def decision_function(self,cols): + tresholds_set = (self.median + self.thresh*self.mad).ravel() + thresh = self.thresh.ravel() + decission_string = "" + for i,tr1 in enumerate(thresh): + if tr1<0: + trs=f"<{tresholds_set[i]}" + else: + trs=f">{tresholds_set[i]}" + decission_string = f"{decission_string}{tr1}({cols[i]}{trs})" + return decission_string + def fit_predict(self, X): self.fit(X) return self.predict(X) @@ -78,11 +91,19 @@ def perform_adaptiveQC_Filtering(clf,adata,method,metadata_columns): adata.obs[metadata_columns].values ) == 1 predicted_scores = clf.decision_function(adata.obs[metadata_columns].values) + elif method == 'MAD': + metadata_columns2 = metadata_columns.copy() + metadata_columns2.pop() + f = clf.fit_predict( + adata.obs[metadata_columns2].values + ) == 1 + predicted_scores = clf.decision_function(metadata_columns2) + else: f = clf.fit_predict( adata.obs[metadata_columns].values ) == 1 - predicted_scores = clf.decision_function(adata.obs[metadata_columns].values) + predicted_scores = clf.decision_function(adata.obs[metadata_columns].values) return predicted_scores,f @@ -201,6 +222,16 @@ def main(): help='Columns to use for outliers.\ (default: %(default)s)' ) + + parser.add_argument( + '--MAD_thresholds', + action='store', + dest='MAD_thresholds', + type=str, + default='-5,-5,5', + help='Columns to use for outliers.\ + (default: %(default)s)' + ) parser.add_argument( '--cell_qc_column', @@ -285,7 +316,7 @@ def main(): # Here we add an adaptive QC per Column # Drop out previous QCed cells - cell_qc_column = options.cell_qc_column + # if cell_qc_column in adata.obs.columns: # n_cells_original = adata.shape[0] # adata = adata[adata.obs[cell_qc_column], :] @@ -313,194 +344,217 @@ def main(): # ] method = options.method - if method == 'LocalOutlierFactor': - # fit the model for outlier detection (default) - clf = LocalOutlierFactor( - #n_neighbors=100, - # contamination=outliers_fraction 
- ) - elif method == 'IsolationForest': - max_samples = options.max_samples - # if max_samples == 0.0: - # if n_cells < 1000: - # max_samples = 250 - # else: - # max_samples = 0.1 - print("Using max_samples of:\t{}".format(max_samples)) - clf = IsolationForest( - #n_estimators=500, - max_samples=max_samples, - warm_start=False, - contamination=outliers_fraction, - random_state=0, - bootstrap=True - ) - elif method == 'EllipticEnvelope': - if outliers_fraction == 'auto': - outliers_fraction = 0.1 - clf = EllipticEnvelope( - contamination=outliers_fraction - ) - elif method == 'OneClassSVM': - clf = OneClassSVM( - # nu=n_outliers, - # kernel="rbf", - # gamma=0.1 - ) - elif method == 'MAD': - mad_thresh = [-5, -5, 5] - clf = onesidemad(mad_thresh) - else: - raise ValueError('ERROR: invalid method.') - - metadata_columns_original = metadata_columns.copy() - # We perform the adaptive qc either for all data together or per user defined column of unique values. + + + methods = re.split(';|,',options.method) outlier_filtering_strategys = options.outlier_filtering_strategys # outlier_filtering_strategys = 'Azimuth:L0_predicted.celltype.l2;all_together;all_together::exclude' - outlier_filtering_strategys = outlier_filtering_strategys.split(';') - matching = [s for s in outlier_filtering_strategys if "::" in s] - for m1 in matching: - # duplicate the strategy if we have flagged it as an extra step by adding ::no_exclude flag - m2 = m1.split('::')[0] - if m1 =='all_together::exclude': - adata.obs[m1] = 'all_cells' - else: - adata.obs[m1] = m2 - - # We load the GT match file and determine the celline match that needs to be subjected to adaptive qc independently - if(options.patterns_exclude): - ######## The folowing bit of code takes the GT match outputs and utilises this to run adaptive qc on cellines independently - only if the celline is expected. 
- if(options.gt_match_file!='fake_file.fq'): - GT_match_file = pd.read_csv(options.gt_match_file,sep='\t') - GT_patterns = options.patterns_exclude.split(';') - for outlier_filtering_strategy in outlier_filtering_strategys: - strategy = outlier_filtering_strategy - if strategy!='all_together': - for pattern in GT_patterns: - Matches = GT_match_file[GT_match_file['donor_gt'].str.contains(pattern)] - if len(Matches)>0: - for i,match in Matches.iterrows(): - donor=match['donor_query'] - pool=match['pool'] - expected = match['Match Expected'] - if expected: - filter_query = f"convoluted_samplename=='{pool}' and Donor=='{donor}'" - idx1 = adata.obs.query(filter_query).index - try: - adata.obs[strategy] = adata.obs[strategy].cat.add_categories(pattern) - except: - _='category exists already' - adata.obs.loc[idx1,strategy]=pattern - ######## - # all_index = pd.DataFrame(adata.obs.index,columns=['col']) - # all_together = all_indexes.str[0]+'-'+all_indexes.str[1]+'-'+all_indexes.str[2] + outlier_filtering_strategys = re.split(';|,',outlier_filtering_strategys) + metadata_columns_original = metadata_columns.copy() - for outlier_filtering_strategy in outlier_filtering_strategys: - metadata_columns = metadata_columns_original.copy() - if (outlier_filtering_strategy == 'all_together'): + for method in methods: + print(f'-------Running {method} outlier filtering strategy-------') + nmethod = method + + if method == 'LocalOutlierFactor': + cell_qc_column = nmethod+':'+options.cell_qc_column + # fit the model for outlier detection (default) + clf = LocalOutlierFactor( + #n_neighbors=100, + # contamination=outliers_fraction + ) + elif method == 'IsolationForest': + max_samples = options.max_samples + # if max_samples == 0.0: + # if n_cells < 1000: + # max_samples = 250 + # else: + # max_samples = 0.1 cell_qc_column = options.cell_qc_column - cell_qc_column_score = f"{cell_qc_column}:score" - adata.obs[cell_qc_column] = True - adata.obs[cell_qc_column_score] = None - metadata_columns.append(cell_qc_column) - prediction_score, fail_pass = perform_adaptiveQC_Filtering(clf,adata,method,metadata_columns) - adata.obs[cell_qc_column] = fail_pass - adata.obs[cell_qc_column_score] = prediction_score + print("Using max_samples of:\t{}".format(max_samples)) + clf = IsolationForest( + #n_estimators=500, + max_samples=max_samples, + warm_start=False, + contamination=outliers_fraction, + random_state=0, + bootstrap=True + ) + elif method == 'EllipticEnvelope': + cell_qc_column = nmethod+':'+options.cell_qc_column + if outliers_fraction == 'auto': + outliers_fraction = 0.1 + clf = EllipticEnvelope( + contamination=outliers_fraction + ) + elif method == 'OneClassSVM': + cell_qc_column = nmethod+':'+options.cell_qc_column + clf = OneClassSVM( + # nu=n_outliers, + # kernel="rbf", + # gamma=0.1 + ) + elif method == 'MAD': + mad_thresh = np.array([ int(x) for x in re.split(';|,',options.MAD_thresholds)]) + nmethod = method+options.MAD_thresholds + cell_qc_column = nmethod+':'+options.cell_qc_column + clf = onesidemad(mad_thresh) else: - cell_qc_column = f'{cell_qc_column}-per:{outlier_filtering_strategy}' - cell_qc_column_score = f"{cell_qc_column}:score" - metadata_columns.append(cell_qc_column) - adata.obs[cell_qc_column] = True - adata.obs[cell_qc_column_score] = None - try: - os.mkdir(f'per_celltype_outliers__{outlier_filtering_strategy}') - except: - _='exists already' - try: - outlier_strategy_cols = set(adata.obs[outlier_filtering_strategy]) - except: - print('user provided col doesnt exist') - continue - for 
subset_id_for_ad_qc in outlier_strategy_cols: - subset_ad = adata[adata.obs[outlier_filtering_strategy]==subset_id_for_ad_qc] - print(f'filtering:{subset_id_for_ad_qc} {outlier_filtering_strategy}') - if(len(subset_ad)>100): - # We only perform adaptive qc when there is at least 100 cells, otherwise we assume that all pass - prediction_score, fail_pass = perform_adaptiveQC_Filtering(clf,subset_ad,method,metadata_columns) - adata.obs.loc[subset_ad.obs.index,cell_qc_column]=fail_pass - subset_ad.obs.loc[subset_ad.obs.index,cell_qc_column]=fail_pass - - adata.obs.loc[subset_ad.obs.index,cell_qc_column_score]=prediction_score - subset_ad.obs.loc[subset_ad.obs.index,cell_qc_column_score]=prediction_score - + raise ValueError('ERROR: invalid method.') + print(cell_qc_column) + metadata_columns=metadata_columns_original.copy() + metadata_columns.append(cell_qc_column) + # We perform the adaptive qc either for all data together or per user defined column of unique values. + + matching = [s for s in outlier_filtering_strategys if "::" in s] + for m1 in matching: + # duplicate the strategy if we have flagged it as an extra step by adding ::no_exclude flag + m2 = m1.split('::')[0] + if m1 =='all_together::exclude': + adata.obs[m1] = 'all_cells' + else: + adata.obs[m1] = m2 + + # We load the GT match file and determine the celline match that needs to be subjected to adaptive qc independently + if(options.patterns_exclude): + ######## The folowing bit of code takes the GT match outputs and utilises this to run adaptive qc on cellines independently - only if the celline is expected. + if(options.gt_match_file!='fake_file.fq'): + GT_match_file = pd.read_csv(options.gt_match_file,sep='\t') + GT_patterns = options.patterns_exclude.split(';') + for outlier_filtering_strategy in outlier_filtering_strategys: + strategy = outlier_filtering_strategy + if strategy!='all_together': + for pattern in GT_patterns: + Matches = GT_match_file[GT_match_file['donor_gt'].str.contains(pattern)] + if len(Matches)>0: + for i,match in Matches.iterrows(): + donor=match['donor_query'] + pool=match['pool'] + expected = match['Match Expected'] + if expected: + filter_query = f"convoluted_samplename=='{pool}' and Donor=='{donor}'" + idx1 = adata.obs.query(filter_query).index + try: + adata.obs[strategy] = adata.obs[strategy].cat.add_categories(pattern) + except: + _='category exists already' + adata.obs.loc[idx1,strategy]=pattern + ######## + # all_index = pd.DataFrame(adata.obs.index,columns=['col']) + # all_together = all_indexes.str[0]+'-'+all_indexes.str[1]+'-'+all_indexes.str[2] + + for outlier_filtering_strategy in outlier_filtering_strategys: + metadata_columns = metadata_columns_original.copy() + + if (outlier_filtering_strategy == 'all_together'): + if method == 'IsolationForest': + cell_qc_column = options.cell_qc_column else: - print(f'For a category {subset_id_for_ad_qc} we have only {len(subset_ad)} cells and as its not sufficient ammount to estimate distributions we assuma all pass QC') - # subset_ad.uns['cell_outlier_estimator'] = method - of = f'per_celltype_outliers__{outlier_filtering_strategy}/{subset_id_for_ad_qc}---{options.of}' - generate_plots(subset_ad,cell_qc_column,metadata_columns,metadata_columns_original,of) - del subset_ad - adata.uns['cell_outlier_estimator'] = method - adata.obs[['cell_id', cell_qc_column]].to_csv( - f'{options.of}-outliers_filtered__{cell_qc_column}.tsv.gz', - sep='\t', - compression=compression_opts, - index=False, - header=True - ) - - # Calculate cell_filtered_per_experiment - 
filter_columns = [ - 'experiment_id', - 'filter_type', - 'n_cells_left_in_adata' - ] - if options.cell_filtered_per_experiment == 'None': - df_cell_filt_per_exp = adata.obs['experiment_id'].value_counts() - df_cell_filt_per_exp = df_cell_filt_per_exp.rename_axis( + cell_qc_column = f'{cell_qc_column}-per:{outlier_filtering_strategy}' + cell_qc_column_score = f"{cell_qc_column}:score" + adata.obs[cell_qc_column] = True + adata.obs[cell_qc_column_score] = None + + metadata_columns.append(cell_qc_column) + + prediction_score, fail_pass = perform_adaptiveQC_Filtering(clf,adata,method,metadata_columns) + adata.obs[cell_qc_column] = fail_pass + adata.obs[cell_qc_column_score] = prediction_score + else: + cell_qc_column = f'{cell_qc_column}-per:{outlier_filtering_strategy}' + cell_qc_column_score = f"{cell_qc_column}:score" + + adata.obs[cell_qc_column] = True + adata.obs[cell_qc_column_score] = '' + try: + os.mkdir(f'per_celltype_outliers__{outlier_filtering_strategy}') + except: + _='exists already' + try: + outlier_strategy_cols = set(adata.obs[outlier_filtering_strategy]) + except: + print(f'{outlier_filtering_strategy} - user provided col doesnt exist') + continue + for subset_id_for_ad_qc in outlier_strategy_cols: + subset_ad = adata[adata.obs[outlier_filtering_strategy]==subset_id_for_ad_qc] + print(f'filtering:{subset_id_for_ad_qc} {outlier_filtering_strategy}') + if(len(subset_ad)>100): + # We only perform adaptive qc when there is at least 100 cells, otherwise we assume that all pass + prediction_score, fail_pass = perform_adaptiveQC_Filtering(clf,subset_ad,method,metadata_columns) + adata.obs.loc[subset_ad.obs.index,cell_qc_column]=fail_pass + subset_ad.obs.loc[subset_ad.obs.index,cell_qc_column]=fail_pass + + adata.obs.loc[subset_ad.obs.index,cell_qc_column_score]=prediction_score + subset_ad.obs.loc[subset_ad.obs.index,cell_qc_column_score]=prediction_score + + else: + print(f'For a category {subset_id_for_ad_qc} we have only {len(subset_ad)} cells and as its not sufficient ammount to estimate distributions we assuma all pass QC') + # subset_ad.uns['cell_outlier_estimator'] = method + of = f'per_celltype_outliers__{outlier_filtering_strategy}/{subset_id_for_ad_qc}---{options.of}' + generate_plots(subset_ad,cell_qc_column,metadata_columns,metadata_columns_original,of) + del subset_ad + adata.uns['cell_outlier_estimator'] = method + adata.obs[['cell_id', cell_qc_column]].to_csv( + f'{options.of}-outliers_filtered__{cell_qc_column}.tsv.gz', + sep='\t', + compression=compression_opts, + index=False, + header=True + ) + + # Calculate cell_filtered_per_experiment + filter_columns = [ + 'experiment_id', + 'filter_type', + 'n_cells_left_in_adata' + ] + if options.cell_filtered_per_experiment == 'None': + df_cell_filt_per_exp = adata.obs['experiment_id'].value_counts() + df_cell_filt_per_exp = df_cell_filt_per_exp.rename_axis( + 'experiment_id' + ).reset_index(name='n_cells_left_in_adata') + df_cell_filt_per_exp['filter_type'] = 'before_filters' + df_cell_filt_per_exp = df_cell_filt_per_exp[filter_columns] + else: + # Load the samples filtered per experiment file: + df_cell_filt_per_exp = pd.read_csv( + options.cell_filtered_per_experiment, + sep="\t" + ) + filt = df_cell_filt_per_exp['filter_type'] != 'after_filters' + df_cell_filt_per_exp = df_cell_filt_per_exp.loc[filt, :] + + # Now calculate the n cells left after all filters + adata_after_filters = adata.obs.loc[adata.obs[cell_qc_column], :] + df_cells_filtered = adata_after_filters['experiment_id'].value_counts() + df_cells_filtered = 
df_cells_filtered.rename_axis( 'experiment_id' ).reset_index(name='n_cells_left_in_adata') - df_cell_filt_per_exp['filter_type'] = 'before_filters' - df_cell_filt_per_exp = df_cell_filt_per_exp[filter_columns] - else: - # Load the samples filtered per experiment file: - df_cell_filt_per_exp = pd.read_csv( - options.cell_filtered_per_experiment, - sep="\t" + df_cells_filtered['filter_type'] = '{} {} outlier_{}'.format( + 'filter__all_samples', + 'after_outlier_filter', + method + ) + df_cells_filtered = df_cells_filtered[filter_columns] + df_cell_filt_per_exp = df_cell_filt_per_exp.append( + df_cells_filtered, + ignore_index=True + ) + df_cells_filtered['filter_type'] = 'after_filters' + df_cell_filt_per_exp = df_cell_filt_per_exp.append( + df_cells_filtered, + ignore_index=True + ) + # Save the final dataframe + df_cell_filt_per_exp.to_csv( + f'{options.of}-cell_filtered_per_experiment__{cell_qc_column}.tsv.gz', + sep='\t', + compression=compression_opts, + index=False, + header=True ) - filt = df_cell_filt_per_exp['filter_type'] != 'after_filters' - df_cell_filt_per_exp = df_cell_filt_per_exp.loc[filt, :] - # Now calculate the n cells left after all filters - adata_after_filters = adata.obs.loc[adata.obs[cell_qc_column], :] - df_cells_filtered = adata_after_filters['experiment_id'].value_counts() - df_cells_filtered = df_cells_filtered.rename_axis( - 'experiment_id' - ).reset_index(name='n_cells_left_in_adata') - df_cells_filtered['filter_type'] = '{} {} outlier_{}'.format( - 'filter__all_samples', - 'after_outlier_filter', - method - ) - df_cells_filtered = df_cells_filtered[filter_columns] - df_cell_filt_per_exp = df_cell_filt_per_exp.append( - df_cells_filtered, - ignore_index=True - ) - df_cells_filtered['filter_type'] = 'after_filters' - df_cell_filt_per_exp = df_cell_filt_per_exp.append( - df_cells_filtered, - ignore_index=True - ) - # Save the final dataframe - df_cell_filt_per_exp.to_csv( - f'{options.of}-cell_filtered_per_experiment__{cell_qc_column}.tsv.gz', - sep='\t', - compression=compression_opts, - index=False, - header=True - ) - - generate_plots(adata,cell_qc_column,metadata_columns,metadata_columns_original,options.of) + generate_plots(adata,cell_qc_column,metadata_columns,metadata_columns_original,options.of) adata.write( diff --git a/bin/gather_minimal_dataset.py b/bin/gather_minimal_dataset.py index c54ee90..a921145 100755 --- a/bin/gather_minimal_dataset.py +++ b/bin/gather_minimal_dataset.py @@ -45,8 +45,8 @@ SCRUBLET_ASSIGNMENTS_FNSUFFIX = 'scrublet.tsv' COLUMNS_AZIMUTH = { - 'Azimuth:Azimuth:predicted.celltype.l2': 'azimuth.celltyp.l2', - 'Azimuth:Azimuth:predicted.celltype.l2.score': 'azimuth.pred.score.l2', + 'Azimuth:predicted.celltype.l2': 'azimuth.celltyp.l2', + 'Azimuth:predicted.celltype.l2.score': 'azimuth.pred.score.l2', 'Azimuth:mapping.score.celltype.l2': 'azimuth.map.score', } @@ -327,7 +327,7 @@ def gather_donor(donor_id, ad, ad_lane_raw, azimuth_annot, qc_obs, columns_outpu ad.raw = ad_lane_raw[ad.obs.index, :] if donor_id != "unassigned" and donor_id != "doublet": # add annotation from QC - df = pandas.concat([ad.obs, azimuth_annot.loc[azimuth_annot.donor == donor_id]], axis = 1, join = 'outer') + df = pandas.concat([ad.obs, azimuth_annot.loc[azimuth_annot.Donor == donor_id]], axis = 1, join = 'outer') df =df.loc[:,~df.columns.duplicated()] df = df[['experiment_id'] + list(COLUMNS_DECONV.keys()) + list(COLUMNS_AZIMUTH.keys())] try: @@ -370,7 +370,10 @@ def gather_donor(donor_id, ad, ad_lane_raw, azimuth_annot, qc_obs, columns_outpu ad.obs = 
ad.obs.loc[:,~ad.obs.columns.duplicated()] if write_h5: path1=os.path.join(outdir, oufnam + '.h5ad') - ad.obs['qc.filter.pass.AZ:L0'] = ad.obs['qc.filter.pass.AZ:L0'].astype('bool') + try: + ad.obs['qc.filter.pass.AZ:L0'] = ad.obs['qc.filter.pass.AZ:L0'].astype('bool') + except: + pass ad.obs['cell_passes_hard_filters'] = ad.obs['cell_passes_hard_filters'].astype('bool') ad.obs['qc.filter.pass'] = ad.obs['qc.filter.pass'].astype('bool') ad.write(path1,compression='gzip') @@ -1023,10 +1026,10 @@ def set_argument_parser(): ad = adqc[s] if ad.n_obs == 0: continue #Here no cells has passed the qc thresholds. - nf, data_tranche, data_donor, azt = gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = oufh, lane_id=count,Resolution=Resolution) + nf, data_tranche, data_donor = gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = oufh, lane_id=count,Resolution=Resolution) # add the stuff to the adata. - All_probs_and_celltypes=pd.concat([All_probs_and_celltypes,azt]) + # All_probs_and_celltypes=pd.concat([All_probs_and_celltypes,azt]) data_tranche_all.append(data_tranche) data_donor_all= data_donor_all+data_donor count += 1 diff --git a/bin/remap_azimuth_l2.py b/bin/remap_azimuth_l2.py index 4b00a79..22261b9 100755 --- a/bin/remap_azimuth_l2.py +++ b/bin/remap_azimuth_l2.py @@ -53,7 +53,7 @@ D1 =All_Data D1['idx1']=D1.index -D1 = All_Data.set_index('predicted.celltype.l2') +D1 = All_Data.set_index('Azimuth:predicted.celltype.l2') for col in Mappings.columns: D1[f'{col}_predicted.celltype.l2']='' D1[f'{col}_predicted.celltype.l2']=Mappings[col] diff --git a/bin/scanpy_merge_from_h5ad.py b/bin/scanpy_merge_from_h5ad.py index 8a42900..150a439 100755 --- a/bin/scanpy_merge_from_h5ad.py +++ b/bin/scanpy_merge_from_h5ad.py @@ -16,6 +16,9 @@ import pandas as pd import scanpy as sc + + + # Set seed for reproducibility seed_value = 0 y_n_print=False @@ -479,7 +482,6 @@ def scanpy_merge( metadata, metadata_key, output_file, - params_dict=dict(), cellmetadata_filepaths=None, anndata_compression_opts=4, extra_metadata=None @@ -505,14 +507,7 @@ def scanpy_merge( output_file : string output_file """ - # Get compression opts for pandas - compression_opts = 'gzip' - if LooseVersion(pd.__version__) > '1.0.0': - compression_opts = dict( - method='gzip', - compresslevel=anndata_compression_opts - ) - + # check the h5ad_data # check for required columns h5ad_data_required_cols = set(['experiment_id', 'data_path_h5ad_format']) @@ -551,38 +546,11 @@ def scanpy_merge( ) ) - # Init default values for params_dict - params_filters_check = [ - 'cell_filters', - 'downsample_cells_fraction', - 'downsample_cells_n', - 'downsample_feature_counts' - ] - for i in params_filters_check: - if i not in params_dict: - if i != 'cell_filters': - params_dict[i] = {'value': ''} - # Check for validity of filters specified in params_dict - param_filters_check = [ - 'downsample_cells_fraction', - 'downsample_cells_n' - ] - if all(params_dict[k]['value'] != '' for k in param_filters_check): - raise Exception( - 'Error check the params. Both {} and {} are set.'.format( - 'downsample_cells_fraction', - 'downsample_cells_n' - ) - ) # Init a dictionary to record the original number of cells per sample and # the number of cells after each filter. n_cells_dict = {} - # If true, then filtered cells are dropped prior to merging the data. - # This will save disk space. 
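The guarded cast above stops the per-donor export from failing when the optional Azimuth-level QC column is absent. A small sketch of the same defensive pattern, with example column names and values:

import pandas as pd

obs = pd.DataFrame({
    'qc.filter.pass': [True, False, True],
    'cell_passes_hard_filters': [1, 0, 1],
})

# Cast each expected flag column to bool; a column that a particular run did not
# produce (here 'qc.filter.pass.AZ:L0') is skipped instead of raising.
for col in ['qc.filter.pass.AZ:L0', 'cell_passes_hard_filters', 'qc.filter.pass']:
    if col in obs.columns:
        obs[col] = obs[col].astype('bool')
print(obs.dtypes)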
- drop_filtered_cells = False - # Iterate over samples and load data adatasets = [] adatasets__experiment_ids = [] @@ -598,6 +566,7 @@ def scanpy_merge( # ivar_names='gene_ids', # make_unique=False ) + adata_orig_cols = list(adata.obs.columns) adata_orig_cols.append("donor") adata = check_adata(adata, row['experiment_id']) @@ -676,204 +645,6 @@ def scanpy_merge( for col in cellmetadata.columns: adata.obs[col] = cellmetadata.loc[adata.obs.index, col] - # Calculate basic qc metrics for this sample. - # NOTE: n_genes_by_counts == number of genes with > 0 counts - # adata.obs['n_genes'] = (adata.X > 0).sum(axis = 1) is same as - # adata.obs['n_genes_by_counts'] - vars_prior_metrics = adata.var_keys() - sc.pp.calculate_qc_metrics( - adata, - qc_vars=[ - 'gene_group__mito_transcript', - 'gene_group__mito_protein', - 'gene_group__ribo_protein', - 'gene_group__ribo_rna' - ], - inplace=True - ) - - # # Apply cell filter. - # # adata = adata[selected_cells, :] - # # Apply gene filter - # # adata = adata[:, selected_genes] - - # # Apply cell QC filters. - # adata.obs['cell_passes_qc'] = True - # filters_all_samples = [] - # filters_experiment = [] - # if 'cell_filters' not in params_dict: - # warnings.warn('Found no cell_filters in params_dict.') - # else: - # if 'all_samples' in params_dict['cell_filters'].keys(): - # # NOTE: we want this to throw an error if value is not there. - # filters_all_samples = params_dict['cell_filters'][ - # 'all_samples' - # ]['value'] - # if row['experiment_id'] in params_dict['cell_filters'].keys(): - # filters_experiment = params_dict['cell_filters'][ - # row['experiment_id'] - # ]['value'] - - # # First record the total number of cells that pass each filter - # # independently i.e., not depenedent on any other filter. - # if len(filters_all_samples) > 0: - # for filter_query in filters_all_samples: - # if filter_query != '': - # n_cells_dict[row['experiment_id']][ - # 'filter__all_samples {}'.format(filter_query) - # ] = adata.n_obs - adata.obs.query(filter_query).shape[0] - # if len(filters_experiment) > 0: - # for filter_query in filters_experiment: - # # NOTE: could add if test here to grab filter_query == - # # file_cellids_filter or file_cellids_keep - # if filter_query != '': - # n_cells_dict[row['experiment_id']][ - # 'filter__sample_specific {}'.format( - # filter_query - # ) - # ] = adata.n_obs - adata.obs.query(filter_query).shape[0] - - # # Now apply the filters - first apply the filters for all samples. - # n_cells_start = adata.n_obs - # filter_i = 0 - # if len(filters_all_samples) > 0: - # # Run each filter iteratively. - # for filter_query in filters_all_samples: - # if filter_query != '': - # # Drop the cells that are flagged in this query - # cells_to_remove = adata.obs.query(filter_query).index - # adata.obs.loc[cells_to_remove, 'cell_passes_qc'] = False - # # adata = adata[ - # # np.invert(adata.obs.index.isin(cells_to_remove)), - # # : - # # ] - # if y_n_print: - # print('[{}] {} "{}": {} dropped {} remain'.format( - # 'all sample cell QC applied', - # row['experiment_id'], - # filter_query, - # len(cells_to_remove), - # adata.obs['cell_passes_qc'].sum() - # )) - # n_cells_dict[row['experiment_id']][ - # 'filter__all_samples after_filter_{} {}'.format( - # filter_i, - # filter_query - # ) - # ] = adata.obs['cell_passes_qc'].sum() - # filter_i += 1 - - # # Now apply per sample filters. - # if len(filters_experiment) > 0: - # # Run each filter iteratively. 
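The commented-out block above flagged failing cells with pandas-style query strings rather than dropping them. A self-contained sketch of that flagging pattern; the filter expressions and values are examples only:

import anndata
import numpy as np
import pandas as pd

adata = anndata.AnnData(
    X=np.random.default_rng(0).poisson(1.0, size=(5, 3)).astype(float),
    obs=pd.DataFrame({
        'pct_counts_gene_group__mito_transcript': [1.0, 2.0, 35.0, 4.0, 80.0],
        'n_genes_by_counts': [900, 1200, 150, 2000, 100],
    }),
)

adata.obs['cell_passes_qc'] = True
# Each query describes cells that should FAIL; failing cells are flagged, not removed.
for filter_query in ['pct_counts_gene_group__mito_transcript > 20',
                     'n_genes_by_counts < 200']:
    cells_to_remove = adata.obs.query(filter_query).index
    adata.obs.loc[cells_to_remove, 'cell_passes_qc'] = False

print(int(adata.obs['cell_passes_qc'].sum()), 'cells still pass QC')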
- # for filter_query in filters_experiment: - # if filter_query != '': - # cells_to_remove = adata.obs.query(filter_query).index - # adata.obs.loc[cells_to_remove, 'cell_passes_qc'] = False - # # adata = adata[ - # # np.invert(adata.obs.index.isin(cells_to_remove)), - # # : - # # ] - # if y_n_print: - # print('[{}] {} "{}": {} dropped {} remain'.format( - # 'sample specific cell QC applied', - # row['experiment_id'], - # filter_query, - # len(cells_to_remove), - # adata.obs['cell_passes_qc'].sum() - # )) - # n_cells_dict[row['experiment_id']][ - # 'filter__sample_specific after_filter_{} {}'.format( - # filter_i, - # filter_query - # ) - # ] = adata.obs['cell_passes_qc'].sum() - # filter_i += 1 - - # # Write the number of cells filtered to standard out. - # if y_n_print: - # print('[{}] after all cell QC: {} dropped {} remain'.format( - # row['experiment_id'], - # n_cells_start - adata.obs['cell_passes_qc'].sum(), - # adata.obs['cell_passes_qc'].sum() - # )) - - # # Apply cell downsampling if needed. - # if params_dict['downsample_cells_fraction']['value'] != '': - # n_cells_start = adata.n_obs - # sc.pp.subsample( - # adata, - # fraction=float( - # params_dict['downsample_cells_fraction']['value'] - # ), - # copy=False, - # random_state=0 - # ) - # n_cells_dict[ - # row['experiment_id'] - # ]['downsample_cells_fraction'] = adata.n_obs - # if y_n_print: - # print('[{}] cell downsample applied: {} dropped {} remain'.format( - # row['experiment_id'], - # n_cells_start - adata.n_obs, - # adata.n_obs - # )) - # elif params_dict['downsample_cells_n']['value'] != '': - # n_cells_start = adata.n_obs - # sc.pp.subsample( - # adata, - # n_obs=int(params_dict['downsample_cells_n']['value']), - # copy=False, - # random_state=0 - # ) - # n_cells_dict[ - # row['experiment_id'] - # ]['downsample_cells_n'] = adata.n_obs - # if y_n_print: - # print('[{}] cell downsample applied: {} dropped {} remain'.format( - # row['experiment_id'], - # n_cells_start - adata.n_obs, - # adata.n_obs - # )) - # # Apply count downsampling if needed. - # if params_dict['downsample_feature_counts']['value'] != '': - # fraction = params_dict['downsample_feature_counts']['value'] - # target_counts_per_cell = adata.obs['total_counts'].apply( - # lambda x: int(x * fraction) - # ).values - # sc.pp.downsample_counts( - # adata, - # counts_per_cell=target_counts_per_cell, - # random_state=0 - # ) - - # # Print the number of cells and genes for this sample. - # n_cells_dict[row['experiment_id']]['after_filters'] = adata.obs[ - # 'cell_passes_qc' - # ].sum() - - # if y_n_print: - # print('[{}] {} obs (cells), {} var (genes)'.format( - # row['experiment_id'], - # adata.obs['cell_passes_qc'].sum(), - # adata.n_vars - # )) - - # # Comment code below to keep the vars (gene) output from - # # calculate_qc_metrics *per sample*. If we do this, then in - # # adata_merged.var, we will have duplicated # measures according to - # # each sample (e.g., n_cells_by_counts-0, # n_cells_by_counts-1, - # # n_cells_by_counts-3). - # # - # # Code below removes such output. - # adata.var = adata.var[vars_prior_metrics] - - # # Only keep cells that pass QC - # if drop_filtered_cells: - # adata = adata[adata.obs['cell_passes_qc'], :] - # del adata.obs['cell_passes_qc'] - - # If we still have cells after filters, add to our list of data. if adata.n_obs > 0: adatasets.append(adata) adatasets__experiment_ids.append(row['experiment_id']) @@ -882,12 +653,7 @@ def scanpy_merge( raise Exception( 'Error invalid h5ad_data file. Missing coluns.' 
) - # OPTIONAL: one could convert the experiment_ids to hashes - # adatasets__experiment_ids_hash = [ - # hashlib.sha1(i.encode('utf-8')) for i in adatasets__experiment_ids - # ] - # Merge all of the data together. adata_merged = adatasets[0].concatenate( *adatasets[1:], batch_categories=adatasets__experiment_ids @@ -901,7 +667,7 @@ def scanpy_merge( adata_merged.obs['experiment_id'] = adata_merged.obs['experiment_id'].str.split('__donor').str[0] # Re-calculate basic qc metrics of var (genes) for the whole dataset. # NOTE: we are only changing adata.var - obs_prior = adata_merged.obs.copy() + # obs_prior = adata_merged.obs.copy() sc.pp.calculate_qc_metrics( adata_merged, qc_vars=[ @@ -912,53 +678,14 @@ def scanpy_merge( ], inplace=True ) - adata_merged.obs = obs_prior - - # Possible additional basic filtering on the full dataset. - # sc.pp.filter_cells(adata, min_genes=200) - # sc.pp.filter_genes(adata, min_cells=1) - # if y_n_print: - # print('[adata_merged] {} obs, {} vars'.format( - # adata_merged.n_obs, - # adata_merged.n_vars - # )) - - # # Merge info on cell filters - # n_cells_df = pd.DataFrame(n_cells_dict) - # n_cells_df = n_cells_df.transpose() - # n_cells_df['experiment_id'] = n_cells_df.index - # n_cells_df = n_cells_df.melt( - # id_vars=['experiment_id'], - # var_name='filter_type', - # value_name='n_cells_left_in_adata' - # ) - # # Drop rows with no value for n_cells_left_in_adata. This will happen for - # # per sample filters. - # n_cells_df = n_cells_df.dropna(subset=['n_cells_left_in_adata']) - # adata_merged.uns['cell_filtered_per_experiment'] = n_cells_df - # adata_merged.uns['cell_filtered_per_experiment_dict'] = n_cells_dict - - # Save the adata matrix - # output_file = output_dir + "/adata" + # adata_merged.obs = obs_prior + adata_merged.obs['log10_ngenes_by_count'] = np.log10(adata_merged.obs['n_genes_by_counts']) / np.log10(adata_merged.obs['total_counts']) adata_merged.write( '{}.h5ad'.format(output_file), compression='gzip', compression_opts=anndata_compression_opts ) - # adata_merged.write_csvs(output_file) - # adata_merged.write_loom(output_file+".loom") - # n_cells_df.to_csv( - # '{}-cell_filtered_per_experiment.tsv.gz'.format(output_file), - # sep='\t', - # index=False, - # quoting=csv.QUOTE_NONNUMERIC, - # # index_label='cell_barcode', - # na_rep='', - # compression=compression_opts - # ) - - return(output_file) def main(): @@ -1085,23 +812,6 @@ def main(): # Scanpy settings sc.settings.figdir = os.getcwd() # figure output directory to match base. sc.settings.n_jobs = options.ncpu # number CPUs - # sc.settings.max_memory = 500 # in Gb - # sc.set_figure_params(dpi_save = 300) - - # NOTE: - # - Could change yaml params file to include a list per sample and the - # filters one wants to use for that sample - # - Could allow the input to be either h5ad dir or AnnData/loom object. - # Use AnnData/loom object would be useful if we add doublet scores - # or other scores prior to filtering. - - # Read in the parameters for downsampling and cell filters. - if options.pyml == '': - params_dict = {} - else: - with open(options.pyml, 'r') as f: - params_dict = yaml.safe_load(f) - params_dict = params_dict['sample_qc'] # Load a file of the samples to analyse h5ad_data = pd.read_csv(options.txd, sep='\t') @@ -1151,17 +861,16 @@ def main(): ) # Run the merge function. 
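The new log10_ngenes_by_count column added above is just the ratio of the log-scale gene and UMI counts; a tiny worked example:

import numpy as np
import pandas as pd

obs = pd.DataFrame({'n_genes_by_counts': [1500, 300], 'total_counts': [8000, 450]})
# Ratio of log10(genes detected) to log10(total UMIs), as stored in adata_merged.obs above;
# values nearer 1 mean many genes detected relative to library size.
obs['log10_ngenes_by_count'] = np.log10(obs['n_genes_by_counts']) / np.log10(obs['total_counts'])
print(obs)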
- out_file = scanpy_merge( + scanpy_merge( h5ad_data, metadata, metadata_key=options.mk, output_file=options.of, - params_dict=params_dict, cellmetadata_filepaths=cellmetadata_filepaths, anndata_compression_opts=options.anndata_compression_opts, extra_metadata= extra_metadata ) - print(out_file) + if __name__ == '__main__': diff --git a/bin/scanpy_split_h5ad.py b/bin/scanpy_split_h5ad.py index 207d7c1..71faf88 100755 --- a/bin/scanpy_split_h5ad.py +++ b/bin/scanpy_split_h5ad.py @@ -37,32 +37,32 @@ def write_h5_out_for_ct(ad,oufn_list_AZ,oufnam,oufn_list,samples,samples_AZ,bl,c # disable adb_AZ.obs = pandas.DataFrame(adb_AZ.obs.index, index = adb_AZ.obs.index, columns = ["cell_barcode"]) - adb.layers['counts'] = adb.X.copy() + # adb.layers['counts'] = adb.X.copy() - # Total-count normalize (library-size correct) the data matrix X to - # counts per million, so that counts become comparable among cells. - scanpy.pp.normalize_total( - adb, - target_sum=1e4, - exclude_highly_expressed=False, - key_added='normalization_factor', # add to adata.obs - inplace=True - ) - # Logarithmize the data: X = log(X + 1) where log = natural logorithm. - # Numpy has a nice function to undo this np.expm1(adata.X). - scanpy.pp.log1p(adb) - # Delete automatically added uns - UPDATE: bad idea to delete as this slot - # is used in _highly_variable_genes_single_batch. - # del adata.uns['log1p'] - # Add record of this operation. - # adata.layers['log1p_cpm'] = adata.X.copy() - # adata.uns['log1p_cpm'] = {'transformation': 'ln(CPM+1)'} - adb.layers['log1p_cp10k'] = adb.X.copy() - adb.uns['log1p_cp10k'] = {'transformation': 'ln(CP10k+1)'} + # # Total-count normalize (library-size correct) the data matrix X to + # # counts per million, so that counts become comparable among cells. + # scanpy.pp.normalize_total( + # adb, + # target_sum=1e4, + # exclude_highly_expressed=False, + # key_added='normalization_factor', # add to adata.obs + # inplace=True + # ) + # # Logarithmize the data: X = log(X + 1) where log = natural logorithm. + # # Numpy has a nice function to undo this np.expm1(adata.X). + # scanpy.pp.log1p(adb) + # # Delete automatically added uns - UPDATE: bad idea to delete as this slot + # # is used in _highly_variable_genes_single_batch. + # # del adata.uns['log1p'] + # # Add record of this operation. + # # adata.layers['log1p_cpm'] = adata.X.copy() + # # adata.uns['log1p_cpm'] = {'transformation': 'ln(CPM+1)'} + # adb.layers['log1p_cp10k'] = adb.X.copy() + # adb.uns['log1p_cp10k'] = {'transformation': 'ln(CP10k+1)'} - # Reset X to counts - adb.X = adb.layers['counts'].copy() - del adb.layers["counts"] #Since counts are set as an X we dont need it as part of the andata - this saves space. + # # Reset X to counts + # adb.X = adb.layers['counts'].copy() + # del adb.layers["counts"] #Since counts are set as an X we dont need it as part of the andata - this saves space. try: vdf = adb_AZ.var[["feature_types", "genome"]] diff --git a/conf/base.conf b/conf/base.conf index 98bb86d..78b2b3f 100755 --- a/conf/base.conf +++ b/conf/base.conf @@ -18,6 +18,7 @@ params{ //# estimate_and_provide_informative_snps_for_deconvolution=false perform_concordance_calculations = false filter_outliers = true + remap_celltypes = false //# These are default parameters that can be overwriten to run in a different mode. //# Here we have listed the default parameters when running without any extrainput. 
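For reference, the normalisation block disabled in scanpy_split_h5ad.py above follows the usual scanpy pattern of stashing raw counts in a layer, writing a ln(CP10k+1) layer, and restoring counts to X. A condensed sketch of that pattern on synthetic data:

import anndata
import numpy as np
import scanpy as sc

adata = anndata.AnnData(np.random.default_rng(0).poisson(1.0, size=(50, 20)).astype('float32'))
adata.layers['counts'] = adata.X.copy()        # keep raw counts to one side
sc.pp.normalize_total(adata, target_sum=1e4)   # library-size correct to counts per 10k
sc.pp.log1p(adata)                             # X becomes ln(CP10k + 1)
adata.layers['log1p_cp10k'] = adata.X.copy()
adata.uns['log1p_cp10k'] = {'transformation': 'ln(CP10k+1)'}
adata.X = adata.layers['counts'].copy()        # reset X to raw counts for downstream tools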
    tmpdir = "${launchDir}/work"
diff --git a/conf/modules.conf b/conf/modules.conf
index bcb6f3a..6350b62 100755
--- a/conf/modules.conf
+++ b/conf/modules.conf
@@ -68,7 +68,7 @@ process {
     withName: DOUBLET_DECON{
         maxRetries = 3
         errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' }
-        memory = { 100.GB * task.attempt }
+        memory = { 150.GB * task.attempt }
     }
 
     withName: AZIMUTH{
@@ -235,9 +235,9 @@ process {
     }
 
     withName: CONCORDANCE_CALCLULATIONS{
-        cpus = 20
+        cpus = 10
         time   = { 24.h   * task.attempt }
-        memory = { 100.GB * task.attempt }
+        memory = { 80.GB * task.attempt }
     }
 
     withName: OTHER_DONOR_CONCORDANCE_CALCLULATIONS{
@@ -280,7 +280,7 @@ process {
     }
 
     withName: umap_gather{
-        memory = { 100.GB * task.attempt }
+        memory = { 200.GB * task.attempt }
         errorStrategy = 'retry'
         maxRetries = 3
     }
diff --git a/conf/qc.conf b/conf/qc.conf
index 1e72efb..e31c42c 100755
--- a/conf/qc.conf
+++ b/conf/qc.conf
@@ -179,13 +179,13 @@ params{
             run_process = true
             method = 'IsolationForest' //# Available methods: ocalOutlierFactor, IsolationForest, EllipticEnvelope, OneClassSVM, onesidemad
             metadata_columns = 'log1p_total_counts,log1p_n_genes_by_counts,pct_counts_gene_group__mito_transcript'
-            mad_tresholds = '-5,-5,5' //# These one side MAD filters will be used if selected.
+            mad_tresholds = '-5,-5,5' //# These one-sided MAD thresholds are used when the MAD method is selected, one per metadata column; a [-] prefix filters the left tail of the distribution, a [+] prefix the right tail.
             outliers_fraction = 0.0
             max_samples = 0.1
         }
     }
-    outlier_filtering_strategy = 'Azimuth:L0_predicted.celltype.l2;all_together' //the outliers may be filterered per strategy type
+    outlier_filtering_strategy = 'Azimuth:L0_predicted.celltype.l2,all_together' //the outliers may be filtered per strategy type
    gt_match_based_adaptive_qc_exclusion_pattern = '' // #We run the adaptive QC on these patterns independently regardless on assigned celltype.
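Assuming the onesidemad estimator applies a per-column, one-sided MAD cut (its implementation is not shown in this patch), a sketch of how thresholds such as '-5,-5,5' translate into outlier flags:

import numpy as np

def one_sided_mad_flags(X, thresholds):
    # Sketch only: a negative threshold flags cells in the left tail of a metric,
    # a positive threshold flags cells in the right tail, using median + t * MAD.
    X = np.asarray(X, dtype=float)
    med = np.median(X, axis=0)
    mad = np.median(np.abs(X - med), axis=0)
    fails = np.zeros(X.shape, dtype=bool)
    for j, t in enumerate(thresholds):
        cutoff = med[j] + t * mad[j]
        fails[:, j] = X[:, j] < cutoff if t < 0 else X[:, j] > cutoff
    return fails.any(axis=1)   # True where a cell fails at least one metric

# '-5,-5,5' as in mad_tresholds: left tail for log counts/genes, right tail for % mito.
rng = np.random.default_rng(0)
flags = one_sided_mad_flags(rng.normal(size=(500, 3)), [-5, -5, 5])
print(int(flags.sum()), 'cells flagged')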
downsample_cells_fraction{ diff --git a/modules/nf-core/modules/azimuth/main.nf b/modules/nf-core/modules/azimuth/main.nf index e50bcf9..4edf5b0 100755 --- a/modules/nf-core/modules/azimuth/main.nf +++ b/modules/nf-core/modules/azimuth/main.nf @@ -63,28 +63,21 @@ process REMAP_AZIMUTH{ container "wtsihgi/nf_scrna_qc_azimuth:d54db9b" } - publishDir path: "${params.outdir}/celltype/azimuth/", + publishDir path: "${params.outdir}/celltype/", mode: "${params.copy_mode}", overwrite: "true" stageInMode 'copy' input: - tuple val(outfil_prfx),val(name) path(azimuth_file) + path(file) path(mapping_file) - when: - name=='PBMC' + output: - path('azimuth/*', emit:predicted_celltype_labels) + path("remapped__${file}", emit:predicted_celltype_labels) script: - celltype_table = "remapped__${azimuth_file}" """ - remap_azimuth_l2.py -of remapped__predicted_celltype_l2.tsv -m ${mapping_file} -az predicted_celltype_l2.tsv - mkdir azimuth - - mv predicted_celltype_l1.tsv azimuth/${outfil_prfx}__predicted_celltype_l1.tsv - mv predicted_celltype_l3.tsv azimuth/${outfil_prfx}__predicted_celltype_l3.tsv - mv remapped__predicted_celltype_l2.tsv azimuth/${outfil_prfx}__predicted_celltype_l2.tsv + remap_azimuth_l2.py -of remapped__${file} -m ${mapping_file} -az ${file} """ } diff --git a/modules/nf-core/modules/citeseq/main.nf b/modules/nf-core/modules/citeseq/main.nf index 272029a..60e1680 100755 --- a/modules/nf-core/modules/citeseq/main.nf +++ b/modules/nf-core/modules/citeseq/main.nf @@ -7,16 +7,9 @@ process SPLIT_CITESEQ_GEX { container "wtsihgi/nf_scrna_qc:6bb6af5" } - publishDir path: "${params.outdir}/citeseq/${mode}/${sample_name}", - // saveAs: {filename -> - // if (filename.contains("antibody-")) { - // filename.replaceAll("antibody-", "${mode}_antibody-") - // }else { - // null - // } - // }, - mode: "${params.copy_mode}", - overwrite: "true" + publishDir path: "${params.outdir}/data_modalities_split/${mode}/${sample_name}", + mode: "${params.copy_mode}", + overwrite: "true" input: tuple val(sample_name),path(cellranger_raw) @@ -32,7 +25,7 @@ process SPLIT_CITESEQ_GEX { script: """ - + strip_citeseq.py --raw_data ${cellranger_raw} -o ${sample_name} """ } diff --git a/modules/nf-core/modules/outlier_filter/main.nf b/modules/nf-core/modules/outlier_filter/main.nf index 5dcf36c..913e79b 100755 --- a/modules/nf-core/modules/outlier_filter/main.nf +++ b/modules/nf-core/modules/outlier_filter/main.nf @@ -56,7 +56,7 @@ process OUTLIER_FILTER { script: if(gt_match_based_adaptive_qc_exclusion_pattern!=''){ - filter_strategy_exclusion = "--patterns_exclude '${gt_match_based_adaptive_qc_exclusion_pattern}' --gt_match_file ${gt_outlier_input}" + filter_strategy_exclusion = "--patterns_exclude='${gt_match_based_adaptive_qc_exclusion_pattern}' --gt_match_file ${gt_outlier_input}" }else{ filter_strategy_exclusion = "" } @@ -77,7 +77,8 @@ process OUTLIER_FILTER { --max_samples ${max_samples} \ --output_file ${outfile} \ --anndata_compression_opts ${anndata_compression_opts} \ - --filter_strategy '${outlier_filtering_strategy}' \ + --filter_strategy='${outlier_filtering_strategy}' \ + --MAD_thresholds='${params.sample_qc.cell_filters.filter_outliers.mad_tresholds}' \ ${filter_strategy_exclusion} mkdir plots diff --git a/modules/nf-core/modules/vireo/main.nf b/modules/nf-core/modules/vireo/main.nf index 5a2f3e5..107db60 100755 --- a/modules/nf-core/modules/vireo/main.nf +++ b/modules/nf-core/modules/vireo/main.nf @@ -213,7 +213,7 @@ process VIREO { output: path("vireo_${samplename}"), emit: output_dir - // tuple 
val(samplename), path("vireo_${samplename}"), emit: output_dir + tuple val(samplename), path("vireo_${samplename}"), emit: output_dir_subsampling tuple val(samplename), path("vireo_${samplename}/donor_ids.tsv"), emit: sample_donor_ids tuple val(samplename), path("vireo_${samplename}/GT_donors.vireo.vcf.gz"), path(vcf_file),path(donor_gt_csi), emit: sample_donor_vcf tuple val(samplename), path("vireo_${samplename}/GT_donors.vireo.vcf.gz"), emit: infered_vcf diff --git a/subworkflows/celltype.nf b/subworkflows/celltype.nf index 4e611ad..4f775c0 100755 --- a/subworkflows/celltype.nf +++ b/subworkflows/celltype.nf @@ -6,6 +6,7 @@ include {KERAS_CELLTYPE} from "$projectDir/modules/nf-core/modules/keras_celltyp include {CELLTYPE_FILE_MERGE} from "$projectDir/modules/nf-core/modules/cell_type_assignment/functions" include {SCPRED} from "$projectDir/modules/nf-core/modules/scpred/main" include { DSB } from '../modules/nf-core/modules/citeseq/main' + workflow celltype{ take: @@ -24,18 +25,19 @@ workflow celltype{ SPLIT_BATCH_H5AD.out.sample_file .splitCsv(header: true, sep: "\t", by: 1) .map{row -> tuple(row.experiment_id, file(row.h5ad_filepath))}.set{ch_experiment_filth5} + SPLIT_BATCH_H5AD.out.az_sample_file .splitCsv(header: true, sep: "\t", by: 1) .map{row -> tuple(row.experiment_id, file(row.h5ad_filepath))}.set{az_ch_experiment_filth5} SPLIT_BATCH_H5AD.out.files_anndata_batch.flatMap().set{ch_batch_files} - + SPLIT_BATCH_H5AD.out.keras_outfile.collect().set{keras_files} // Keras celltype assignemt if (params.celltype_assignment.run_keras){ KERAS_CELLTYPE(ch_experiment_filth5,params.celltype_prediction.keras.keras_model,params.celltype_prediction.keras.keras_weights_df) - all_extra_fields = KERAS_CELLTYPE.out.predicted_celltype_labels.collect() - all_extra_fields = all_extra_fields.ifEmpty(Channel.from("$projectDir/assets/fake_file.fq")) + all_extra_fields3 = KERAS_CELLTYPE.out.predicted_celltype_labels.collect() + all_extra_fields = all_extra_fields3.ifEmpty(Channel.from("$projectDir/assets/fake_file.fq")) }else{ all_extra_fields = Channel.from("$projectDir/assets/fake_file.fq") } @@ -56,8 +58,8 @@ workflow celltype{ Channel.fromList(params.celltypist.models) .set{ch_celltypist_models} CELLTYPIST(az_ch_experiment_filth5.combine(ch_celltypist_models)) - ct_out = CELLTYPIST.out.predicted_labels.collect() - ct_out = ct_out.ifEmpty(Channel.from("$projectDir/assets/fake_file2.fq")) + ct_out2 = CELLTYPIST.out.predicted_labels.collect() + ct_out = ct_out2.ifEmpty(Channel.from("$projectDir/assets/fake_file2.fq")) }else{ ct_out = Channel.from("$projectDir/assets/fake_file2.fq") } @@ -65,16 +67,21 @@ workflow celltype{ // // SCPRED if (params.celltype_assignment.run_scpred){ SCPRED(params.outdir,ch_batch_files) - sc_out = SCPRED.out.predicted_celltype_labels.collect() - sc_out = sc_out.ifEmpty(Channel.of()) + sc_out2 = SCPRED.out.predicted_celltype_labels.collect() + sc_out = sc_out2.ifEmpty(Channel.of()) }else{ sc_out = Channel.of() } all_extra_fields2 = all_extra_fields.mix(sc_out) - CELLTYPE_FILE_MERGE(az_out,ct_out,all_extra_fields2,SPLIT_BATCH_H5AD.out.keras_outfile.collect()) + CELLTYPE_FILE_MERGE(az_out,ct_out,all_extra_fields2,keras_files) + + if (params.remap_celltypes){ + REMAP_AZIMUTH(CELLTYPE_FILE_MERGE.out.celltype_assignments,params.mapping_file) + file__anndata_merged2 = REMAP_AZIMUTH.out.predicted_celltype_labels.collect() + } + file__anndata_merged2=CELLTYPE_FILE_MERGE.out.file__anndata_merged2 - file__anndata_merged2.view() emit: file__anndata_merged2 diff --git 
a/subworkflows/main_deconvolution.nf b/subworkflows/main_deconvolution.nf index 25828bf..5d0dabf 100755 --- a/subworkflows/main_deconvolution.nf +++ b/subworkflows/main_deconvolution.nf @@ -320,9 +320,9 @@ workflow main_deconvolution { if (params.genotype_input.run_with_genotype_input) { if (params.do_vireo_subsampling){ VIREO_SUBSAMPLING(vireo_extra_repeats) - VIREO_SUBSAMPLING.out.output_dir.concat(VIREO.out.output_dir).set{tuple_1} - // tuple_1.groupTuple(by:0).set{vspp0} - // VIREO_SUBSAMPLING_PROCESSING(vspp0) + VIREO_SUBSAMPLING.out.output_dir.concat(VIREO.out.output_dir_subsampling).set{tuple_1} + tuple_1.groupTuple(by:0).set{vspp0} + VIREO_SUBSAMPLING_PROCESSING(vspp0) // VIREO_SUBSAMPLING.out.all_required_data.set{replacement_input_sub} // // replacement_input_sub.combine(vireo_with_gt).set{vir_repl_input} // // REPLACE_GT_DONOR_ID_SUBS(vir_repl_input) @@ -331,7 +331,7 @@ workflow main_deconvolution { // // .combine(ch_ref_vcf).set { gt_math_pool_against_panel_input_subs } // MATCH_GT_VIREO(gt_math_pool_against_panel_input_subs) - // subsampling_donor_swap = VIREO_SUBSAMPLING_PROCESSING.out.subsampling_donor_swap + subsampling_donor_swap = VIREO_SUBSAMPLING_PROCESSING.out.subsampling_donor_swap }else{ subsampling_donor_swap = Channel.from("$projectDir/assets/fake_file.fq") } diff --git a/subworkflows/qc.nf b/subworkflows/qc.nf index b4110ee..f849bdb 100755 --- a/subworkflows/qc.nf +++ b/subworkflows/qc.nf @@ -58,7 +58,7 @@ workflow qc { params.sample_qc.outlier_filtering_strategy ) file__anndata_merged = OUTLIER_FILTER.out.anndata - file__cells_filtered = OUTLIER_FILTER.out.cells_filtered + // file__cells_filtered = OUTLIER_FILTER.out.cells_filtered } diff --git a/workflows/yascp.nf b/workflows/yascp.nf index 22774af..07511e0 100755 --- a/workflows/yascp.nf +++ b/workflows/yascp.nf @@ -94,8 +94,6 @@ workflow YASCP { channel__file_paths_10x=SPLIT_CITESEQ_GEX_FILTERED.out.channel__file_paths_10x channel__file_paths_10x_single=SPLIT_CITESEQ_GEX_FILTERED.out.gex_data ch_experiment_filth5 = SPLIT_CITESEQ_GEX.out.gex_data - }else{ - ab_data = Channel.of() } // Either run ambient RNA removal with cellbender or use cellranger filtered reads (cellbender|cellranger) @@ -191,7 +189,6 @@ workflow YASCP { ? Channel.fromPath("${params.outdir}/deconvolution/vireo/*/vireo_*", checkIfExists:true, type: 'dir') : Channel.fromPath("${launchDir}/${params.outdir}/deconvolution/vireo/*/vireo_*", type: 'dir') - GENOTYPE_MATCHER(vireo_paths.collect()) matched_donors = GENOTYPE_MATCHER.out.matched_donors matched_donors.subscribe { println "matched_donors: $it" } From 9a45feac6527e042ae0771fc49355bcabd8f0f00 Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Sat, 11 May 2024 08:07:40 +0100 Subject: [PATCH 7/8] fixed the index error --- bin/generate_combined_celltype_anotation_file.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/bin/generate_combined_celltype_anotation_file.py b/bin/generate_combined_celltype_anotation_file.py index af48e41..b7ab69d 100755 --- a/bin/generate_combined_celltype_anotation_file.py +++ b/bin/generate_combined_celltype_anotation_file.py @@ -127,13 +127,13 @@ def main(): adata1 = scanpy.read_h5ad(ad1) if adata1.n_obs > 0: adatasets.append(adata1) - ad = adatasets[0].concatenate(*adatasets[1:]) - if(len(adatasets)>1): - # in this case the concentration adds a -1 -2 -3 to index that has to be removed. 
- all_index = pd.DataFrame(ad.obs.index,columns=['col']) - all_indexes = all_index['col'].str.split('-') - all_together = all_indexes.str[0]+'-'+all_indexes.str[1]+'-'+all_indexes.str[2] - ad.obs.set_index(all_together, inplace=True) + ad = adatasets[0].concatenate(*adatasets[1:],index_unique=None) + # if(len(adatasets)>1): + # # in this case the concentration adds a -1 -2 -3 to index that has to be removed. + # all_index = pd.DataFrame(ad.obs.index,columns=['col']) + # all_indexes = all_index['col'].str.split('-') + # all_together = all_indexes.str[0]+'-'+all_indexes.str[1]+'-'+all_indexes.str[2] + # ad.obs.set_index(all_together, inplace=True) # ad2 = adatasets2[0].concatenate(*adatasets2[1:]) # ad = scanpy.read(adata) From ac9e8c6be3e868a1ece0787fb87aa2cef2e39508 Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Fri, 24 May 2024 11:28:35 +0100 Subject: [PATCH 8/8] ensuring all in place --- bin/0035-scanpy_normalize_pca.py | 2 +- bin/2.integrate.R | 5 ++ bin/add_adt.R | 17 ++++--- bin/gather_minimal_dataset.py | 86 +++++++++++--------------------- bin/transfer_data.py | 30 ++++++++++- 5 files changed, 73 insertions(+), 67 deletions(-) diff --git a/bin/0035-scanpy_normalize_pca.py b/bin/0035-scanpy_normalize_pca.py index db1b551..6306556 100755 --- a/bin/0035-scanpy_normalize_pca.py +++ b/bin/0035-scanpy_normalize_pca.py @@ -543,7 +543,7 @@ def scanpy_normalize_and_pca( copy=False ) - + print('---- Writting files ----') # Keep a record of the different gene scores if score_genes_df is not None: adata.uns['df_score_genes'] = score_genes_df_updated diff --git a/bin/2.integrate.R b/bin/2.integrate.R index a9f3082..f072cdd 100755 --- a/bin/2.integrate.R +++ b/bin/2.integrate.R @@ -169,6 +169,11 @@ for(f in cite_files){ # Add the vireo donor assignment to the seurat object this_donor_cells <- donor_cells[donor_cells$sample==sample_id,] + + if (dim(this_donor_cells)[1]==0){ + # Here we skip non deconvoluted samples + next + } sobj@meta.data$donor.vireo <- this_donor_cells[match(paste0(rownames(sobj@meta.data),'_',sample_name), paste0(this_donor_cells$cell, '_',this_donor_cells$sample)),]$matched.donor diff --git a/bin/add_adt.R b/bin/add_adt.R index 199bd2a..6f3903f 100755 --- a/bin/add_adt.R +++ b/bin/add_adt.R @@ -24,7 +24,7 @@ if (future::supportsMulticore()) { } else { future::plan(future::multisession) } -# args=vector(mode='list', length=6); args[[1]]='cellranger700_multi_abb2ba0911f1f4bde982a4c38d9fd6ed'; args[[2]]='raw_feature_bc_matrix'; args[[3]]='filtered_feature_bc_matrix'; args[[4]]='cellranger700_multi_abb2ba0911f1f4bde982a4c38d9fd6ed___sample_QCd_adata.h5ad' +# args=vector(mode='list', length=6); args[[1]]='LDP58'; args[[2]]='raw_feature_bc_matrix'; args[[3]]='filtered_feature_bc_matrix'; args[[4]]='LDP58___sample_QCd_adata.h5ad' ##### args = commandArgs(trailingOnly=TRUE) @@ -47,13 +47,13 @@ outdir <- getwd() # filtered_feature_file = cellranger_filepath = args[2] # # filtered_cellranger = '/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/jaguar_yascp/nieks_pipeline/fetch/results_old/cellranger_data/cellranger700_multi_bc45a1c2fe2a3fbbcde46cf984cf42e2/per_sample_outs/cellranger700_multi_bc45a1c2fe2a3fbbcde46cf984cf42e2/count/sample_filtered_feature_bc_matrix.h5' -# sample_name <- 'cellranger700_multi_74b30caec2cf83c0048bc87946b301e8' -# cellranger_rawfile_path <- 'raw_feature_bc_matrix' -# filtered_cellranger = 'filtered_feature_bc_matrix' -# file_with_qc_applied = 'cellranger700_multi_74b30caec2cf83c0048bc87946b301e8___sample_QCd_adata.h5ad' +sample_name <- 'LDP58' 
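The index fix above (PATCH 7/8) switches generate_combined_celltype_anotation_file.py to concatenate(..., index_unique=None) so barcodes keep their original names instead of gaining per-batch suffixes, making the manual index surgery now commented out unnecessary. A small sketch of the difference:

import anndata
import numpy as np
import pandas as pd

a = anndata.AnnData(np.ones((2, 2)), obs=pd.DataFrame(index=['AAACGG-1', 'TTTGCC-1']))
b = anndata.AnnData(np.ones((2, 2)), obs=pd.DataFrame(index=['CCCATT-1', 'GGGTAA-1']))

# Default concatenate appends a batch suffix ('AAACGG-1-0', ...); index_unique=None
# keeps the original barcodes, matching the assumption the fix above relies on
# (barcodes are already unique across the concatenated objects).
merged = a.concatenate(b, index_unique=None)
print(merged.obs_names.tolist())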
+cellranger_rawfile_path <- 'raw_feature_bc_matrix' +filtered_cellranger = 'filtered_feature_bc_matrix' +file_with_qc_applied = 'LDP58___sample_QCd_adata.h5ad' sample_name <- args[1] cellranger_rawfile_path <- args[2] -filtered_cellranger = args[3] +filtered_cellranger = args[3][1] file_with_qc_applied = args[4] # cellranger_rawfile_path = '/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/jaguar_yascp/nieks_pipeline/fetch/results_old/cellranger_data/cellranger700_multi_bc45a1c2fe2a3fbbcde46cf984cf42e2/multi/count/raw_feature_bc_matrix.h5' @@ -151,7 +151,9 @@ Convert( # qced_cells <- readRDS(scpred_file_with_qc) qced_cells <- LoadH5Seurat(paste('tmp',"h5seurat",sep='.'),assays = "RNA") qced_barcodes <- gsub('_.*','',colnames(qced_cells)) - qced_barcodes <- gsub('-cellranger.*','',colnames(qced_cells)) + qced_barcodes <- gsub('-1.*','-1',(qced_barcodes)) + qced_barcodes <- gsub('-2.*','-2',(qced_barcodes)) + qced_barcodes <- gsub('-cellranger.*','',(qced_barcodes)) qced_cells = RenameCells(qced_cells, new.names = qced_barcodes) genes <- read.table(paste0(cellranger_rawfile_path,"/features.tsv.gz"), sep = "\t", col.names = c("ENSG_ID","Gene_ID", "FeatureType")) @@ -183,6 +185,7 @@ Convert( # }else if(basename(f)=='sample_feature_bc_matrix.h5' || basename(f) == 'sample_filtered_feature_bc_matrix.h5'){ # with cellranger multi the raw file is 3 directories higher than filtered file, then in multi/count rna = raw <- Read10X(cellranger_rawfile_path) + print(paste0(' RAW data loaded ')) # }else{ # stop(paste('basename(f) should be sample_feature_bc_matrix.h5 or filtered_feature_bc_matrix.h5, but was:',basename(f))) # } diff --git a/bin/gather_minimal_dataset.py b/bin/gather_minimal_dataset.py index a921145..7c6620a 100755 --- a/bin/gather_minimal_dataset.py +++ b/bin/gather_minimal_dataset.py @@ -61,8 +61,8 @@ 'cell_passes_qc:score':'qc.filter.pass:score', 'cell_passes_qc-per:all_together::exclude':'qc.filter.pass.spikein_exclude', 'cell_passes_qc-per:all_together::exclude:score':'qc.filter.pass.spikein_exclude:score', - 'cell_passes_qc-per:Azimuth:L0_Azimuth:predicted.celltype.l2':'qc.filter.pass.AZ:L0', - 'cell_passes_qc-per:Azimuth:L0_Azimuth:predicted.celltype.l2:score':'qc.filter.pass.AZ:L0:score', + 'cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2':'qc.filter.pass.AZ:L0', + 'cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2:score':'qc.filter.pass.AZ:L0:score', 'total_counts': 'qc.umi.count.total', 'total_counts_gene_group__mito_transcript': 'qc.umi.count.mt', 'pct_counts_gene_group__mito_transcript': 'qc.umi.perc.mt', @@ -399,8 +399,7 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane #Cellranger datasets ###################### columns_output = {**COLUMNS_DATASET, **COLUMNS_DECONV, **COLUMNS_QC} - #Unfiltered - compression_opts = 'gzip' + #Reading unfiltered raw cellranger datasets try: adata_cellranger_raw = scanpy.read_10x_mtx(f"{df_raw.loc[expid, 'data_path_10x_format']}/raw_feature_bc_matrix") try: @@ -417,6 +416,7 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane except: print('File already linked') + # Reading filtered cellranger files try: adata_cellranger_filtered = scanpy.read_10x_mtx(f"{df_raw.loc[expid, 'data_path_10x_format']}/filtered_feature_bc_matrix") try: @@ -441,7 +441,6 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane df_total_counts['barcodes'] = df_total_counts.index df_total_counts_cellranger_raw = df_total_counts 
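The barcode clean-up added to bin/add_adt.R above normalises lane suffixes so ADT barcodes can be matched back to the QCd GEX barcodes; an illustrative Python equivalent of that gsub() chain:

import re

def clean_barcode(bc):
    # Trim anything after a '-1' / '-2' lane suffix and drop trailing
    # '-cellranger...' tags, mirroring the gsub() calls in add_adt.R.
    bc = re.sub(r'-1.*', '-1', bc)
    bc = re.sub(r'-2.*', '-2', bc)
    bc = re.sub(r'-cellranger.*', '', bc)
    return bc

print(clean_barcode('ACGTACGTACGT-1-cellranger700_multi_abc123'))  # -> ACGTACGTACGT-1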
df_total_counts_cellranger_raw['dataset']='Cellranger Raw' - wd = os.getcwd() if df_cellbender is not None and (len(df_cellbender)!=0): df_cellbender = df_cellbender.reset_index() @@ -449,16 +448,11 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane df_cellbender = df_cellbender.set_index('experiment_id') f=df_cellbender.loc[expid, 'data_path_10x_format'] if (type(f) == str): - print('yes') f=[f] for id in f: print(id) try: cell_bender_path = id - # try: - # cell_bender_path = f'{wd}/'+'/'.join(cell_bender_path.split('/')[-6:]) - # except: - # cell_bender_path = f"{args.results_dir}/{df_cellbender.loc[expid, 'data_path_10x_format']}" cellbender_h5 = f"{cell_bender_path}/../cellbender_FPR_{Resolution}_filtered.h5" ad_lane_filtered = scanpy.read_10x_mtx(cell_bender_path) print('loaded') @@ -471,30 +465,19 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane try: path2 = os.path.realpath(cellbender_h5) os.symlink(path2, f"./{outdir}/Cellbender_filtered_{Resolution}__{expid}.h5") - # Here link also mtx files except: print('File already linked') - dfcb = fetch_cellbender_annotation(cell_bender_path, expid,Resolution) columns_output = {**columns_output, **COLUMNS_CELLBENDER} else: ad_lane_filtered = scanpy.read_10x_mtx(f"{df_raw.loc[expid, 'data_path_10x_format']}/filtered_feature_bc_matrix") df_cellbender=None cell_bender_path=None - + # Removing Zero count cells from the matices zero_count_cells = ad_lane_filtered.obs_names[np.where(ad_lane_filtered.X.sum(axis=1) == 0)[0]] ad_lane_filtered = ad_lane_filtered[ad_lane_filtered.obs_names.difference(zero_count_cells, sort=False)] - - zero_count_cells = adata_cellranger_filtered.obs_names[np.where(adata_cellranger_filtered.X.sum(axis=1) == 0)[0]] adata_cellranger_filtered = adata_cellranger_filtered[adata_cellranger_filtered.obs_names.difference(zero_count_cells, sort=False)] - # adata_cellranger_filtered=ad_lane_filtered - # scanpy.pp.calculate_qc_metrics(adata_cellranger_filtered, inplace=True) - # df_total_counts = pd.DataFrame(data= adata_cellranger_filtered.obs.sort_values(by=['total_counts'], ascending=False).total_counts) - # df_total_counts['barcodes'] = df_total_counts.index - # df_total_counts['barcode_row_number'] = df_total_counts.reset_index().index + 1 - # df_total_counts_cellranger_filtered = df_total_counts - # df_total_counts_cellranger_filtered['dataset'] = 'Cellranger Filtered' ############# #Cellranger Metrics Datasheet @@ -513,6 +496,15 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane ########################## # Scrublet ######################### + doublet_data = glob.glob(f'{args.results_dir}/doublets/*.tsv') + doublet_data_combined = pd.DataFrame() + for f1 in doublet_data: + print(f1) + pool_name = f1.split('__')[0].split('/')[-1] + d2 = pd.read_csv(f1,sep='\t') + d2['Exp']=pool_name + doublet_data_combined = pd.concat([doublet_data_combined,d2]) + datadir_scrublet=glob.glob(f'{args.results_dir}/*/multiplet.method=scrublet')[0] if os.path.isdir(datadir_scrublet): # Scrublet loading QC @@ -532,15 +524,9 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane ############################################################ # Loading deconvoluted data including unassigned and doublets ########################################### - print(expid) obsqc,all_QC_lane = fetch_qc_obs_from_anndata(adqc, expid, cell_bender_path = cell_bender_path,Resolution=Resolution) - - # try: - # 
datadir_deconv=f'{args.results_dir}/deconvolution/split_donor_h5ad' - # donor_table = os.path.join(datadir_deconv, expid, "{}.donors.h5ad.tsv".format(expid)) - # df_donors = pandas.read_table(donor_table, header=None, names=("experiment_id", "donor_id", "file_path_h5ad")) - # except: + donor_tables=pd.DataFrame([]) for d1 in set(obsqc['donor']): donor_table={} @@ -596,11 +582,6 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane except: chromium_channel = 'Run_ID not vailable' - - - - def isNaN(num): - return num!= num Donors = list(df_donors.donor_id) try: @@ -634,7 +615,7 @@ def isNaN(num): Mean_reads_per_cell = int(metrics['Mean reads per cell'].values[0].replace(',','')) except: Mean_reads_per_cell= None - # adata_cellranger_raw.X + f = pd.DataFrame(adata_cellranger_filtered.X.sum(axis=1)) Median_UMI_Counts_per_cellranger= statistics.median(f[f>0][0]) Mean_Reads = statistics.mean(f[f>0][0]) @@ -658,7 +639,6 @@ def isNaN(num): Number_of_cells = len(set(df1.index)) Total_UMIs_before_10x_filter = np.sum(ad_lane_raw.X) #this may be after the normalisation - # ad_lane_filtered = Total_UMIs_after_cellbender_filter = np.sum(ad_lane_filtered.X) #This is more 27840 Total_UMIs_after_cellbender = sum(all_QC_lane.obs['total_counts']) #This is less 22817 @@ -694,9 +674,6 @@ def isNaN(num): all_probs = pd.DataFrame() Tranche_Pass_Fail='PASS' - Tranche_Failure_Reason='' - - # print("** Fraction_Reads_in_Cells : "+Fraction_Reads_in_Cells.strip('%')) Tranche_Failure_Reason =' ' try: if (float(Fraction_Reads_in_Cells.strip('%'))<=70): @@ -713,7 +690,6 @@ def isNaN(num): for i in df_donors.index: print(i) - # feeds in the individual assignments here. Donor_Stats=[] row = df_donors.loc[i] print(row) @@ -727,14 +703,12 @@ def isNaN(num): path1 = re.sub('.*/results/', 'results/', path1) Deconvoluted_Donor_Data = anndata.read_h5ad(path1) Donor_barcodes = Deconvoluted_Donor_Data.obs.index - # print(path1) + ################# #Deconvolution data ################# - - - # issue with the all_QC_lane is that they are filtered and the unassigned cells are removed - ve can merge them back together and + # Issue with the all_QC_lane is that they are filtered and the unassigned cells are removed - we can merge them back together and if (row["donor_id"] == 'unassigned'): Unassigned_donor = len(Deconvoluted_Donor_Data.obs) elif (row["donor_id"] == 'doublet'): @@ -765,7 +739,6 @@ def isNaN(num): data_donor_for_stats['cells passing QC'].append(Donor_cells_passes_qc) Donor_cell_assignments = azt.loc[Mengled_barcodes_donor] #for this have to figure out when the cell type is unasigned. 
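The tranche-level check above parses cellranger's percentage string and fails the pool when 70% or fewer reads fall in cells. A sketch of that check as a function, with an illustrative value and failure message:

def tranche_flag(fraction_reads_in_cells, threshold=70.0):
    # 'Fraction Reads in Cells' arrives as a percentage string from the metrics sheet;
    # at or below the threshold the tranche is flagged as FAIL.
    status, reason = 'PASS', ' '
    try:
        if float(fraction_reads_in_cells.strip('%')) <= threshold:
            status, reason = 'FAIL', 'Fraction Reads in Cells <= 70%; '
    except (AttributeError, ValueError):
        pass  # metric missing or malformed: keep PASS, as the pipeline's try/except does
    return status, reason

print(tranche_flag('63.2%'))   # illustrative value -> ('FAIL', ...)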
- # Donor_cell_assignments = Donor_cell_assignments.rename(COLUMNS_AZIMUTH,axis=1) Cell_types_detected = len(set(Donor_cell_assignments['Azimuth:predicted.celltype.l2'])) Donor_UMIS_mapped_to_mitochondrial_genes = sum(Donor_qc_files.obs['total_counts_gene_group__mito_transcript']) Donor_UMIS_mapped_to_ribo_genes = sum(Donor_qc_files.obs['total_counts_gene_group__ribo_protein']) @@ -855,7 +828,6 @@ def isNaN(num): data_donor.append(Donor_Stats) fctr += 1 - # print('done') all_probs = all_probs[~all_probs.index.duplicated(keep='first')] azt['prob_doublet']=all_probs['prob_doublet'] Donor_df = pd.DataFrame(data_donor) @@ -973,15 +945,15 @@ def set_argument_parser(): write_h5=False else: write_h5=True - # write_h5=False + oufh = open(os.path.join(args.outdir, "files.tsv"), 'w') oufh.write("experiment_id\tdonor_id\tfilename_h5ad\tfilename_annotation_tsv\n") df_raw = pandas.read_table(args.input_table, index_col = 'experiment_id') if (args.cellbender)=='cellranger': - # here we do not use cellbender and go with default cellranger + # Here we do not use cellbender and go with default cellranger df_cellbender = None else: - # here we have run the cellbender as par of pipeline. + # Here we have run the cellbender as par of pipeline. # cellbender/*/cellbender-epochs_*/cellbender-FPR_0pt01-filtered_10x_mtx file_path = glob.glob(f'{args.results_dir}/nf-preprocessing/cellbender/*/cellbender-epochs_*/*{args.resolution}*10x_mtx*') file_path2 = glob.glob(f'{args.results_dir}/nf-preprocessing/cellbender/*/*{args.resolution}*10x_mtx*') @@ -989,7 +961,10 @@ def set_argument_parser(): df_cellbender = pd.DataFrame(joined_file_paths,columns=['data_path_10x_format']) df_cellbender['experiment_id']=df_cellbender['data_path_10x_format'].str.split('/').str[-3] df_cellbender= df_cellbender.set_index('experiment_id') + Resolution = args.resolution + + # Load the final QCd dataset try: adqc = anndata.read_h5ad(f'{args.results_dir}/merged_h5ad/outlier_filtered_adata.h5ad') except: @@ -998,30 +973,27 @@ def set_argument_parser(): except: d2 = glob.glob(f'{args.results_dir}/*/*/adatanormalized.h5ad')[0] adqc = anndata.read_h5ad(d2) - adqc.obs['experiment_id'] = adqc.obs['experiment_id'].str.split("__").str[0] + # adqc.obs['experiment_id'] = adqc.obs['experiment_id'].str.split("__").str[0] fctr = 0 data_tranche_all=[] data_donor_all=[] count = 1 All_probs_and_celltypes = pd.DataFrame() - - Sample_metadata = pd.DataFrame() - adqc.obs['tranche.id']=args.experiment_name + # SETTING TRANCHE NAME try: - adqc.obs['cell_passes_qc-per:Azimuth:L0_Azimuth:predicted.celltype.l2:score'] = adqc.obs['cell_passes_qc-per:Azimuth:L0_Azimuth:predicted.celltype.l2:score'].astype(float,errors='ignore') + adqc.obs['cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2'] = adqc.obs['cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2:score'].astype(float,errors='ignore') except: _='no values associated' try: adqc.obs['cell_passes_qc-per:all_together::exclude:score']= adqc.obs['cell_passes_qc-per:all_together::exclude:score'].astype(float,errors='ignore') except: _='no values associated' - + + # Now we calculate all the statistics for each of the pools. 
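Loading of the final QCd AnnData above tries a preferred path and then two glob fallbacks; a compact sketch of that lookup order as a function, with paths mirroring the ones used above:

import glob
import anndata

def load_final_qc_adata(results_dir):
    # Prefer the outlier-filtered merged object and fall back to per-step
    # 'adatanormalized.h5ad' intermediates if it is missing.
    candidates = [f'{results_dir}/merged_h5ad/outlier_filtered_adata.h5ad']
    candidates += glob.glob(f'{results_dir}/*/adatanormalized.h5ad')
    candidates += glob.glob(f'{results_dir}/*/*/adatanormalized.h5ad')
    for path in candidates:
        try:
            return anndata.read_h5ad(path)
        except (FileNotFoundError, OSError):
            continue
    raise FileNotFoundError(f'no QCd AnnData found under {results_dir}')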
for expid in df_raw.index: print(expid) - # try: - # expid ='OTARscRNA12924807' s = adqc.obs['convoluted_samplename'] == expid ad = adqc[s] if ad.n_obs == 0: diff --git a/bin/transfer_data.py b/bin/transfer_data.py index 155eb09..06e93d2 100755 --- a/bin/transfer_data.py +++ b/bin/transfer_data.py @@ -167,6 +167,26 @@ def main_data_colection(pipeline='',name='',directory='',input_table=None,cb_res for folder in Folders: copyfile(f'{folder1}/{folder}/Vireo_plots.pdf', f'{name_dir}/Deconvolution/Vireo_plots_{folder}.pdf') + + + folder1 = f'{directory}/doublets' + if os.path.isdir(folder1): + print('prepearing Doublet folder') + try: + os.mkdir(f'{name_dir}/Doublets___301') + except: + print('dire exists') + files = glob.glob(f'{folder1}/*.tsv') + + for file1 in files: + print(file1) + try: + copy(file1, f'{name_dir}/Doublets___301') + except: + print('picked up directory') + continue + + folder1 = f'{directory}/celltype/celltypist' if os.path.isdir(folder1): print('prepearing celltype folder') @@ -205,6 +225,8 @@ def main_data_colection(pipeline='',name='',directory='',input_table=None,cb_res # print('dire exists') # copyfile(fil1, f'{name_dir}/QC metrics/plot_ecdf-x_log10.var=total_counts.color=experiment_id-adata.png') files = glob.glob(f'{folder1}/*[!.gz]') + files2 = glob.glob(f'{folder1}/*/*[!.gz]') + files = files + files2 for file1 in files: print(file1) try: @@ -212,8 +234,12 @@ def main_data_colection(pipeline='',name='',directory='',input_table=None,cb_res except: print('picked up directory') continue - - + try: + copy(f'{directory}/celltype/All_Celltype_Assignments.csv', f'{name_dir}/Cell-type assignment/All_Celltype_Assignments.csv') + except: + print('doesnt exist') + + folder1 = f'{directory}/plots/per_celltype_outliers' if os.path.isdir(folder1): print('yes')
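The new doublet-collection step in transfer_data.py above copies per-pool doublet TSVs into the release folder; a small sketch of that step as a function, with directory names following the layout used above:

import glob
import os
from shutil import copy

def collect_doublet_tables(results_dir, release_dir):
    # Copy each per-pool doublet TSV into the release folder, creating it if
    # needed and skipping anything that is not a regular file.
    outdir = os.path.join(release_dir, 'Doublets___301')
    os.makedirs(outdir, exist_ok=True)
    for path in glob.glob(os.path.join(results_dir, 'doublets', '*.tsv')):
        if os.path.isfile(path):
            copy(path, outdir)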