From 29688df3536c10aadd5761ec48224caaba667076 Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Tue, 23 Apr 2024 13:48:03 +0100 Subject: [PATCH 1/8] pushing current changes --- assets/deploy_scripts/sanger_module_files/1.6 | 2 +- conf/qc.conf | 2 +- modules/nf-core/modules/cellsnp/main.nf | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/assets/deploy_scripts/sanger_module_files/1.6 b/assets/deploy_scripts/sanger_module_files/1.6 index 8bf0d00..d73f9ac 100644 --- a/assets/deploy_scripts/sanger_module_files/1.6 +++ b/assets/deploy_scripts/sanger_module_files/1.6 @@ -33,7 +33,7 @@ module load ISG/singularity/3.11.4 module load HGI/common/nextflow/22.04.4 module load HGI/common/baton module load ISG/experimental/irods/4.2.7 - +module load HGI/softpack/users/bh18/bcftools/1.0 prepend-path PATH "/software/hgi/envs/conda/team151/mo11/mo11/bin" prepend-path PATH "/software/hgi/containers/yascp/modules/full_yascp" prepend-path PATH "/software/hgi/pipelines/yascp_versions/yascp_v1.6/bin" diff --git a/conf/qc.conf b/conf/qc.conf index 05e1d4a..ae026d4 100755 --- a/conf/qc.conf +++ b/conf/qc.conf @@ -62,7 +62,7 @@ params{ expected_multiplet_rate = 0.1 //# used for scrublet and doubletFinder. doubletDetection{ - run_process = true + run_process = false } doubletDecon{ run_process = true diff --git a/modules/nf-core/modules/cellsnp/main.nf b/modules/nf-core/modules/cellsnp/main.nf index 27d8cf2..3c0abaa 100755 --- a/modules/nf-core/modules/cellsnp/main.nf +++ b/modules/nf-core/modules/cellsnp/main.nf @@ -41,7 +41,7 @@ process DYNAMIC_DONOR_EXCLUSIVE_SNP_SELECTION{ script: if (add_dynamic_sites_or_not_to_panel){ cmd2 = "cat cellsnp_variants.tsv >> cellsnp_panel_${samplename}.vcf" - }{ + }else{ cmd2 = '' } cmd1="ln -s ${vcf_file} dynamic_snps.vcf.gz" From 47a6876b4e8566660c9c4f5369ea9eb3396cd1e6 Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Thu, 25 Apr 2024 10:32:50 +0100 Subject: [PATCH 2/8] merge --- README.md | 3 +- .../module_exacutables/help.info | 1 + bin/0026-filter_outlier_cells.py | 3 +- bin/DoubletDecon.R | 58 +++++++++++++------ bin/azimuth.R | 18 +++--- conf/base.conf | 7 ++- conf/qc.conf | 3 +- modules/nf-core/modules/azimuth/main.nf | 8 +-- subworkflows/celltype.nf | 4 +- 9 files changed, 68 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index d42296c..4583e3f 100755 --- a/README.md +++ b/README.md @@ -4,8 +4,7 @@ ## Introduction -**nf-core/yascp** is a bioinformatics best-practice analysis pipeline for deconvolution, qc, clustering, integration of a single cell datasets. -This is a large scale single-cell pipeline developed initially for processing Cardinal project samples, however it is applicable to any other scRNA analysis. The pipeline has been inspired by [deconvolution](https://github.com/wtsi-hgi/nf_scrna_deconvolution.git ), [cellbender](https://github.com/wtsi-hgi/nf_cellbender ) and [qc](https://github.com/wtsi-hgi/nf_qc_cluster/tree/main ) pipelines initially developed in Anderson lab. +**nf-core/yascp** is a bioinformatics best-practice analysis pipeline tailored for deconvolution, quality control, clustering, and integration of single-cell datasets. Developed under the leadership of N.Soranzo and Human Genetics Informatics (HGI), this large-scale single-cell pipeline was originally crafted for the Cardinal project (profiling UKBB and ELGH participants) but is versatile enough for broader scRNA analysis applications. 
The foundational ideas were inspired by earlier pipelines from Anderson lab but has been expanded, specifically those for [deconvolution](https://github.com/wtsi-hgi/nf_scrna_deconvolution.git), [cellbender](https://github.com/wtsi-hgi/nf_cellbender), and [quality control and clustering](https://github.com/wtsi-hgi/nf_qc_cluster/tree/main). This ensures a robust integration of proven methodologies tailored to meet the demands of expansive single-cell data analysis. Input requires a tsv seperated file [(please read detailed documentation here)](https://github.com/wtsi-hgi/yascp/tree/yascp_docs) with paths and if running in an genotype additional input is required to be provided in an input.nf file pointing to the vcf location. This pipeline is designed to be used any large scale single cell experiments. diff --git a/assets/deploy_scripts/module_exacutables/help.info b/assets/deploy_scripts/module_exacutables/help.info index 3180a5c..6829668 100644 --- a/assets/deploy_scripts/module_exacutables/help.info +++ b/assets/deploy_scripts/module_exacutables/help.info @@ -3,6 +3,7 @@ ################### ${bold}YASCP (Yet Another Single Cell Pieline)${normal}[https://github.com/wtsi-hgi/yascp] Nextflow pipeline that QCs the scRNA Cellranger data by removing ambient RNA, deconvoluting donors, assigning celltypes, analysing concordances vs expected genotypes +Please refear to the usage and outputs documentation here: https://hgi-projects.pages.internal.sanger.ac.uk/documentation/docs/tutorials/yascp_docs/#getting-started or here: https://github.com/wtsi-hgi/yascp/tree/78df9d94388045bc386ea32c2127abeb154a36c6 ################### Yascp module has been set to run in multiple modes: diff --git a/bin/0026-filter_outlier_cells.py b/bin/0026-filter_outlier_cells.py index 19a4d3b..1b623b6 100755 --- a/bin/0026-filter_outlier_cells.py +++ b/bin/0026-filter_outlier_cells.py @@ -348,7 +348,8 @@ def main(): # gamma=0.1 ) elif method == 'MAD': - clf = onesidemad() + mad_thresh = [-5, -5, 5] + clf = onesidemad(mad_thresh) else: raise ValueError('ERROR: invalid method.') diff --git a/bin/DoubletDecon.R b/bin/DoubletDecon.R index 1e49897..238b72b 100755 --- a/bin/DoubletDecon.R +++ b/bin/DoubletDecon.R @@ -645,25 +645,49 @@ Main_Doublet_Decon<-function(rawDataFile, groupsFile, filename, location, fullDa } ## Run Doublet Decon ## -results <- Main_Doublet_Decon(rawDataFile = processed$newExpressionFile, - groupsFile = processed$newGroupsFile, - filename = "DoubletDecon_results", - location = paste0(args$out, "/"), - fullDataFile = NULL, - removeCC = args$removeCC, - species = args$species, - rhop = args$rhop, - write = TRUE, - PMF = args$pmf, - useFull = FALSE, - heatmap = args$heatmap, - centroids=args$centroids, - num_doubs=args$num_doubs, - only50=args$only50, - min_uniq=args$min_uniq, - nCores = args$nCores) +tryCatch({ + + results <- Main_Doublet_Decon(rawDataFile = processed$newExpressionFile, + groupsFile = processed$newGroupsFile, + filename = "DoubletDecon_results", + location = paste0(args$out, "/"), + fullDataFile = NULL, + removeCC = args$removeCC, + species = args$species, + rhop = args$rhop, + write = TRUE, + PMF = args$pmf, + useFull = FALSE, + heatmap = args$heatmap, + centroids=args$centroids, + num_doubs=args$num_doubs, + only50=args$only50, + min_uniq=args$min_uniq, + nCores = args$nCores) + +}, error = function(e) { + + results <- Main_Doublet_Decon(rawDataFile = processed$newExpressionFile, + groupsFile = processed$newGroupsFile, + filename = "DoubletDecon_results", + location = paste0(args$out, 
"/"), + fullDataFile = NULL, + removeCC = args$removeCC, + species = args$species, + rhop = 0.64, + write = TRUE, + PMF = args$pmf, + useFull = FALSE, + heatmap = args$heatmap, + centroids=args$centroids, + num_doubs=args$num_doubs, + only50=args$only50, + min_uniq=args$min_uniq, + nCores = args$nCores) +} +) doublets <- read.table(paste0(args$out, "/Final_doublets_groups_DoubletDecon_results.txt")) diff --git a/bin/azimuth.R b/bin/azimuth.R index 5e7933f..87e2d55 100755 --- a/bin/azimuth.R +++ b/bin/azimuth.R @@ -35,6 +35,7 @@ if (REFERENCE_DIR=='PBMC'){ } levels = args[3] +prefix = args[4] # levels = "celltype.l2,celltype.l1,celltype.l3" levels = unlist(x = strsplit(x = levels, split = ',', fixed = TRUE)) # reference files are expected in the following directory @@ -597,6 +598,7 @@ anchors <- FindTransferAnchors( for (celltype_level in levels) { id <- celltype_level + # id <- paste0(id,"_",prefix) print(id) refdata <- lapply(X = celltype_level, function(x) { reference$map[[x, drop = TRUE]] @@ -668,7 +670,7 @@ for (celltype_level in levels) { predicted.id.score <- paste0(predicted.id, ".score") # write a table of cell-type assignments, prediction and mapping scores: - fnam.table <- paste0(gsub(".", "_", predicted.id, fixed = TRUE),".tsv") + fnam.table <- paste0(prefix,"_",gsub(".", "_", predicted.id, fixed = TRUE),".tsv") data <- FetchData(object = query, vars = c(predicted.id, predicted.id.score, paste0("mapping.score.", id)), slot = "data") write.table(data, fnam.table, quote = FALSE, sep="\t") #gzip(fnam.table, overwrite = TRUE) @@ -695,7 +697,7 @@ for (celltype_level in levels) { p <- ggplot(tdf, aes(x=cell_type, y=count, fill = threshold)) p <- p + geom_col(position = "dodge") p <- p + theme(axis.text.x = element_text(angle = 90)) - ggsave(paste0(id,".ncells_by_type_barplot.pdf")) + ggsave(paste0(prefix,"_",id,".ncells_by_type_barplot.pdf")) # DimPlot of the reference #ref.plt <- DimPlot(object = reference$plot, reduction = "refUMAP", group.by = id, label = TRUE) + NoLegend() @@ -704,23 +706,23 @@ for (celltype_level in levels) { # DimPlot of the query, colored by predicted cell type DimPlot(object = query, reduction = "proj.umap", group.by = predicted.id, label = TRUE) + NoLegend() - ggsave(paste0(id,".query_umap.pdf")) + ggsave(paste0(prefix,"_",id,".query_umap.pdf")) # Plot the score for the predicted cell type of the query FeaturePlot(object = query, features = paste0(predicted.id, ".score"), reduction = "proj.umap") - ggsave(paste0(id,".prediction_score_umap.pdf")) + ggsave(paste0(prefix,"_",id,".prediction_score_umap.pdf")) VlnPlot(object = query, features = paste0(predicted.id, ".score"), group.by = predicted.id) + NoLegend() - ggsave(paste0(id,".prediction_score_vln.pdf")) + ggsave(paste0(prefix,"_",id,".prediction_score_vln.pdf")) # Plot the mapping score FeaturePlot(object = query, features = paste0("mapping.score.", id), reduction = "proj.umap") - ggsave(paste0(id,".mapping_score_umap.pdf")) + ggsave(paste0(prefix,"_",id,".mapping_score_umap.pdf")) VlnPlot(object = query, features = paste0("mapping.score.", id), group.by = predicted.id) + NoLegend() - ggsave(paste0(id,".mapping_score_vln.pdf")) + ggsave(paste0(prefix,"_",id,".mapping_score_vln.pdf")) } # save mapped data set #save(query, file = "azimuth.bin") - saveRDS(query, file = "azimuth.rds") + # saveRDS(query, file = paste0(prefix,"azimuth.rds")) # load("azimuth.bin" ) \ No newline at end of file diff --git a/conf/base.conf b/conf/base.conf index 6df1d62..249dc9f 100755 --- a/conf/base.conf +++ b/conf/base.conf @@ 
-249,6 +249,7 @@ process { withName: DOUBLET_DECON{ maxRetries = 3 errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } + memory = { 100.GB * task.attempt } } withName: AZIMUTH{ @@ -410,7 +411,7 @@ process { withName: CONCORDANCE_CALCLULATIONS{ cpus = 5 time = { 12.h * task.attempt } - memory = { 50.GB * task.attempt } + memory = { 70.GB * task.attempt } } withName: OTHER_DONOR_CONCORDANCE_CALCLULATIONS{ @@ -459,9 +460,11 @@ process { } withName: DOUBLET_FINDER{ - memory = { 70.GB * task.attempt } + memory = { 100.GB * task.attempt } } + + withName: GT_MATCH_POOL_AGAINST_PANEL{ time = { 24.h * task.attempt } } diff --git a/conf/qc.conf b/conf/qc.conf index 2a7873f..1e72efb 100755 --- a/conf/qc.conf +++ b/conf/qc.conf @@ -178,7 +178,8 @@ params{ each estimator. Only valid if method == IsolationForest.""" run_process = true method = 'IsolationForest' //# Available methods: ocalOutlierFactor, IsolationForest, EllipticEnvelope, OneClassSVM, onesidemad - metadata_columns = 'pct_counts_gene_group__mito_transcript,log1p_total_counts,log1p_n_genes_by_counts' + metadata_columns = 'log1p_total_counts,log1p_n_genes_by_counts,pct_counts_gene_group__mito_transcript' + mad_tresholds = '-5,-5,5' //# These one side MAD filters will be used if selected. outliers_fraction = 0.0 max_samples = 0.1 } diff --git a/modules/nf-core/modules/azimuth/main.nf b/modules/nf-core/modules/azimuth/main.nf index fd7237f..e50bcf9 100755 --- a/modules/nf-core/modules/azimuth/main.nf +++ b/modules/nf-core/modules/azimuth/main.nf @@ -22,13 +22,13 @@ process AZIMUTH{ input: val outdir_prev - path file_h5ad_batch + tuple val(samplename),path(file_h5ad_batch) each refset // path(mapping_file) output: // path(celltype_table, emit:predicted_celltypes) - tuple(val(outfil_prfx), val(refset.refset), path("predicted_*.tsv"),emit:celltype_tables_all) - path("predicted_*.tsv"), emit:predicted_celltype_labels + tuple(val(outfil_prfx), val(refset.refset), path("*predicted_*.tsv"),emit:celltype_tables_all) + path("*predicted_*.tsv"), emit:predicted_celltype_labels path "*ncells_by_type_barplot.pdf" path "*query_umap.pdf" path "*prediction_score_umap.pdf" @@ -46,7 +46,7 @@ process AZIMUTH{ //outfil_prfx = "${file_h5ad_batch}".minus(".h5ad") """ - azimuth.R ./${file_h5ad_batch} ${refset.refset} ${refset.annotation_labels} + azimuth.R ./${file_h5ad_batch} ${refset.refset} ${refset.annotation_labels} ${samplename} """ } diff --git a/subworkflows/celltype.nf b/subworkflows/celltype.nf index 5710f5f..7966b7a 100755 --- a/subworkflows/celltype.nf +++ b/subworkflows/celltype.nf @@ -42,7 +42,7 @@ workflow celltype{ // AZIMUTH if (params.celltype_assignment.run_azimuth){ - AZIMUTH(params.outdir,ch_batch_files,Channel.fromList( params.azimuth.celltype_refsets)) + AZIMUTH(params.outdir,az_ch_experiment_filth5,Channel.fromList( params.azimuth.celltype_refsets)) az_out = AZIMUTH.out.predicted_celltype_labels.collect() // REMAP_AZIMUTH(AZIMUTH.out.celltype_tables_all,params.mapping_file) // az_out = REMAP_AZIMUTH.out.predicted_celltype_labels.collect() @@ -72,7 +72,7 @@ workflow celltype{ } all_extra_fields = all_extra_fields.mix(sc_out) - CELLTYPE_FILE_MERGE(az_out,ct_out,all_extra_fields,SPLIT_BATCH_H5AD.out.keras_outfile) + CELLTYPE_FILE_MERGE(az_out,ct_out,all_extra_fields,SPLIT_BATCH_H5AD.out.keras_outfile.collect()) file__anndata_merged2=CELLTYPE_FILE_MERGE.out.file__anndata_merged2 file__anndata_merged2.view() From 70c53406cae2032908699ceb6b36f283a7efce02 Mon Sep 17 00:00:00 2001 From: Matiss Date: Thu, 25 Apr 2024 10:46:34 +0100 Subject: 
[PATCH 3/8] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4583e3f..40a68f3 100755 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ ## Introduction -**nf-core/yascp** is a bioinformatics best-practice analysis pipeline tailored for deconvolution, quality control, clustering, and integration of single-cell datasets. Developed under the leadership of N.Soranzo and Human Genetics Informatics (HGI), this large-scale single-cell pipeline was originally crafted for the Cardinal project (profiling UKBB and ELGH participants) but is versatile enough for broader scRNA analysis applications. The foundational ideas were inspired by earlier pipelines from Anderson lab but has been expanded, specifically those for [deconvolution](https://github.com/wtsi-hgi/nf_scrna_deconvolution.git), [cellbender](https://github.com/wtsi-hgi/nf_cellbender), and [quality control and clustering](https://github.com/wtsi-hgi/nf_qc_cluster/tree/main). This ensures a robust integration of proven methodologies tailored to meet the demands of expansive single-cell data analysis. +**nf-core/yascp** is a bioinformatics best-practice analysis pipeline tailored for deconvolution, quality control, clustering, and integration of single-cell datasets. Developed under the leadership of N.Soranzo and Human Genetics Informatics (HGI), this large-scale single-cell pipeline was originally crafted for the Cardinal project (profiling UKBB and ELGH participants) but is versatile enough for broader scRNA analysis applications and has had contributions from many other teams across Sanger. The foundational ideas were inspired by earlier pipelines from Anderson lab that have been significantly expanded and improved - specifically those for [deconvolution](https://github.com/wtsi-hgi/nf_scrna_deconvolution.git), [cellbender](https://github.com/wtsi-hgi/nf_cellbender), and [quality control and clustering](https://github.com/wtsi-hgi/nf_qc_cluster/tree/main). This ensures a robust integration of proven methodologies tailored to meet the demands of expansive single-cell data analysis. Input requires a tsv seperated file [(please read detailed documentation here)](https://github.com/wtsi-hgi/yascp/tree/yascp_docs) with paths and if running in an genotype additional input is required to be provided in an input.nf file pointing to the vcf location. This pipeline is designed to be used any large scale single cell experiments. @@ -71,7 +71,7 @@ Easyest to do is using a conda enviroment. ## Credits -Yascp was originally written by Matiss Ozols as part of the Cardinal project but is applicable to many other projects with contributions from Leland Taylor, Guillaume Noell, Hannes Ponstingl, Vivek Iyer, Henry Taylor, Tobi Alegbe, Monika Krzak, Alessandro Raveane, Carl Anderson, Anna Lorenc, Stephen Watt, Nicole Soranzo. +Yascp was originally written by Matiss Ozols as part of the Cardinal project but is applicable to many other projects with contributions from Leland Taylor, Guillaume Noell, Hannes Ponstingl, Vivek Iyer, Henry Taylor, Tobi Alegbe, Monika Krzak, Alessandro Raveane, Carl Anderson, Anna Lorenc, Haerin Jang, Niek de Klein, Stephen Watt, Nicole Soranzo. The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. 
The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! From 509a86d17c49992855ba8c1a6c12ea9a60028ca1 Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Tue, 30 Apr 2024 16:24:13 +0100 Subject: [PATCH 4/8] test data tested and works --- .../deploy_scripts/module_exacutables/yascp | 1 + bin/4.add_vdj.R | 13 +- bin/concordance_calculations.py | 2019 +++++++++-------- bin/gather_minimal_dataset.py | 3 + conf/base.conf | 258 +-- conf/cellbender.conf | 2 - conf/modules.conf | 246 ++ conf/test.conf | 3 + conf/test_full.conf | 3 + main.nf | 2 +- modules/nf-core/modules/citeseq/main.nf | 4 +- modules/nf-core/modules/doubletdecon/main.nf | 2 +- .../modules/prepere_yascp_inputs/main.nf | 1 + nextflow.config | 14 +- subworkflows/celltype.nf | 4 +- subworkflows/local/retrieve_recourses.nf | 18 + subworkflows/main_deconvolution.nf | 10 +- subworkflows/prepare_inputs.nf | 1 + workflows/yascp.nf | 2 +- 19 files changed, 1348 insertions(+), 1258 deletions(-) diff --git a/assets/deploy_scripts/module_exacutables/yascp b/assets/deploy_scripts/module_exacutables/yascp index 174b8ac..e555c03 100755 --- a/assets/deploy_scripts/module_exacutables/yascp +++ b/assets/deploy_scripts/module_exacutables/yascp @@ -5,6 +5,7 @@ bold=$(tput bold) normal=$(tput sgr0) + if [[ "$QUEUE" == '' ]]; then export QUEUE='long' diff --git a/bin/4.add_vdj.R b/bin/4.add_vdj.R index 4a0ed4d..0af69d2 100755 --- a/bin/4.add_vdj.R +++ b/bin/4.add_vdj.R @@ -34,8 +34,7 @@ data_dir <- './' args = commandArgs(trailingOnly=TRUE) wnn_integrated_file = args[1] -n2 = strsplit(as.character(wnn_integrated_file), split="__all_samples")[[1]][1] -outname = paste0(n2,'__all_samples_integrated.vdj.RDS') + myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral"))) @@ -155,8 +154,14 @@ vdj_dir <- paste0('./vdj/') dir.create(vdj_dir,showWarnings = F) names(immdata_TCR$data) <- tcr_names names(immdata_BCR$data) <- bcr_names -saveRDS(immdata_BCR, file=paste0(vdj_dir,'/slemap_BCR.rds')) -saveRDS(immdata_TCR, paste0(vdj_dir,'/slemap_TCR.rds')) + +n2 = strsplit(as.character(wnn_integrated_file), split="__all_samples")[[1]][1] +outname = paste0(n2,'__all_samples_integrated.vdj.RDS') +outname_BCR = paste0(n2,'__all_samples_integrated.BCR.RDS') +outname_TCR = paste0(n2,'__all_samples_integrated.TCR.RDS') + +saveRDS(immdata_BCR, file=outname_BCR) +saveRDS(immdata_TCR, file=outname_TCR) saveRDS(all_samples, file=outname) diff --git a/bin/concordance_calculations.py b/bin/concordance_calculations.py index 0aac3a4..5b61c83 100755 --- a/bin/concordance_calculations.py +++ b/bin/concordance_calculations.py @@ -35,8 +35,11 @@ mode = 'no_het' #We can not determine whether hets are concordant or discordant in terms of reads. 
-class Concordances: + + +class Concordances: + def reset(self): self.cell_concordance_table ={} @@ -55,1069 +58,1095 @@ def __init__(self, donor_assignments_table,cell_assignments_table,exclusive_don_ self.uninformative_sites = uninformative_sites self.record_dict={} - def norm_genotypes(self,expected_vars): - expected_vars = pd.DataFrame(expected_vars) - if len(expected_vars) > 0: - split_str=expected_vars[0].str.split("_") - expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] - expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] - expected_vars['vars'] = split_str.str[4] - expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) - expected_vars = expected_vars[expected_vars['vars']!='./.'] - expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' - expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] - return expected_vars - - - - def get_strict_discordance(self, expected_vars, cell_vars): - ''' - take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant - sites and relaxed concordant sites and list of discordant sites1 - 1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype - 2) if you have a 0/0 you can not get a 1/1 or 0/1 - 3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 1/1 - So - each obversed cellsnp allele must be in the array SNP gtype - ''' - snp_gtypes = expected_vars[0] - cellsnp_gtypes = cell_vars[0] - true_discordant = 0 - relaxed_concordant = 0 - relaxed_concordant_informative = 0 - relaxed_concordant_informative_ids = [] - relaxed_concordant_uninformative_ids = [] - true_discordant_uninformative_ids =[] - true_discordant_informative_ids=[] - relaxed_concordant_uninformative = 0 - true_discordant_informative = 0 - true_discordant_uninformative = 0 - discordant_vars = [] - concordant_vars = [] - subset_informative_concordant = 0 - subset_informative_discordant = 0 - - #print(self.uninformative_sites) - #print(self.informative_sites) + def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match, donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data,donor_assignments_table): - #create sets of the ids (chrom, pos, ref, alt) in each set of genotypes. Filter to the ids present in both - #then filter to informative and uninformative. 
If uninformative >0 then create a subset of informative - # with the same number of vars (at random) - split_snp_gts=snp_gtypes.str.split("_") - snp_gtypes_ids = set(split_snp_gts.str[0]+'_'+split_snp_gts.str[1]+'_'+split_snp_gts.str[2]+'_'+split_snp_gts.str[3]) + Concordant_Sites, \ + Discordant_sites, \ + Total_Overlapping_sites, \ + discordant_sites, \ + cell_vars_norm, \ + true_discordant_count, \ + relaxed_concordant_count, \ + relaxed_concordant_informative_count, \ + relaxed_concordant_uninformative_count, \ + true_discordant_informative_count, \ + true_discordant_uninformative_count, \ + total_sites, \ + informative_sites, \ + uninformative_sites, \ + total_reads, \ + total_reads_informative, \ + total_reads_uninformative, \ + discordant_reads, \ + discordant_reads_informative, \ + discordant_reads_uninformative, \ + discordant_vars, \ + concordant_vars, \ + discordant_read_fraction_in_concordant_sites, \ + discordant_read_fraction_in_discordant_sites, \ + discordant_reads_uninformative_fraction, \ + discordant_reads_informative_fraction, discordant_reads_informative_subset = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) + + total_concordant_sites = len(Concordant_Sites) + relaxed_concordant_count + dds = self.donor_distinct_sites[donor_gt_match] + Nr_donor_distinct_sites = len(dds) + Nr_Concordant = len(Concordant_Sites) + Nr_Relaxed_concordant = relaxed_concordant_count + Nr_Discordant = len(Discordant_sites) + Nr_Total_Overlapping_sites = len(Total_Overlapping_sites) + Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Discordant_sites))) + Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) + #Quantify donor variation in other donors + discordant_vars_in_pool = [] + donor_table_of_concordances = [] + total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown = {} + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort']={} + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort']={} + + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown_totalReads = {} + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown_totalReads['Whithin_Cohort']={} + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown_totalReads['Out_of_Cohort']={} + + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown = {} + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort']={} + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort']={} + + + informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() + total_cordant_sites_that_are_concordant_with_other_donors_in_pool = set() + donor_gt_match_cohort = donor_cohorts[donor_gt_match] + donors_contributing_to_out_of_cohort= [] + donors_contributing_to_Whithin_Cohort=[] + sites_contributing_to_out_of_cohort= [] + sites_contributing_to_Whithin_Cohort=[] + for donor in set(donor_assignments_table['donor_gt']): + try: + expected_vars_norm_of_other_donor = all_donor_data[donor] + except: + continue + try: + donor_cohort = donor_cohorts[donor] + donor_vars = vars_per_donor_gt[donor] + except: + continue + + Concordant_Sites_otherDonor, \ + Discordant_sites_otherDonor, 
\ + Total_Overlapping_sites_otherDonor, \ + discordant_sites_otherDonor, \ + cell_vars_norm_otherDonor, \ + true_discordant_count_otherDonor, \ + relaxed_concordant_count_otherDonor, \ + relaxed_concordant_informative_count_otherDonor, \ + relaxed_concordant_uninformative_count_otherDonor, \ + true_discordant_informative_count_otherDonor, \ + true_discordant_uninformative_count_otherDonor, \ + total_sites_otherDonor, \ + informative_sites_otherDonor, \ + uninformative_sites_otherDonor, \ + total_reads_otherDonor, \ + total_reads_informative_otherDonor, \ + total_reads_uninformative_otherDonor, \ + discordant_reads_otherDonor, \ + discordant_reads_informative_otherDonor, \ + discordant_reads_uninformative_otherDonor, \ + discordant_vars_otherDonor, \ + concordant_vars_otherDonor, \ + discordant_read_fraction_in_concordant_sites_otherDonor, \ + discordant_read_fraction_in_discordant_sites_otherDonor, \ + discordant_reads_uninformative_fraction_otherDonor, \ + discordant_reads_informative_fraction_otherDonor, discordant_reads_informative_subset_otherDonor = self.retrieve_concordant_discordant_sites(expected_vars_norm_of_other_donor,cell_vars,donor_cohort=donor_cohort) + + if total_sites_otherDonor==0: + total_sites_otherDonor=0.000000000001 + total_concordant_sites_otherDonor = relaxed_concordant_count_otherDonor + concordant_percent_in_other_donor= total_concordant_sites_otherDonor/total_sites_otherDonor*100 + discordant_percent_in_other_donor= true_discordant_count_otherDonor/total_sites_otherDonor*100 + DonorDiscordant_Sites_that_are_atributed_to_other_donor = set(discordant_vars).intersection(set(concordant_vars_otherDonor)) + TotalSites_that_are_atributed_to_other_donor = set(discordant_vars).union(set(concordant_vars)).intersection(set(concordant_vars_otherDonor)) - split_cellsnp_gts=cellsnp_gtypes.str.split("_") - cellsnp_gtypes_ids = set(split_cellsnp_gts.str[0]+'_'+split_cellsnp_gts.str[1]+'_'+split_cellsnp_gts.str[2]+'_'+split_cellsnp_gts.str[3]) + Informative__DonorDiscordant_Sites_that_are_atributed_to_other_donor = set(true_discordant_informative_count).intersection(set(relaxed_concordant_informative_count_otherDonor)) + DonorCordant_Sites_that_are_atributed_to_other_donor = set(concordant_vars).intersection(set(concordant_vars_otherDonor)) + + total_reads_for_discordant_sites_that_are_concordant_with_other_donor,discordant_reads_for_discordant_sites_that_are_concordant_with_other_donor,concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor,\ + _,_,_= self.read_extraction(DonorDiscordant_Sites_that_are_atributed_to_other_donor,expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) - shared_gts = snp_gtypes_ids.intersection(cellsnp_gtypes_ids) + if not donor == donor_gt_match: + # We want to kow how many of these discordant site + if 'U937' in donor: + continue + if 'THP1' in donor: + continue + + if donor_gt_match_cohort == donor_cohort: + coh = 'Whithin_Cohort' + sites_contributing_to_Whithin_Cohort.extend(list(TotalSites_that_are_atributed_to_other_donor)) + if len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)>0: + donors_contributing_to_Whithin_Cohort.append(donor) + + else: + coh = 'Out_of_Cohort' + sites_contributing_to_out_of_cohort.extend(list(TotalSites_that_are_atributed_to_other_donor)) + if len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)>0: + donors_contributing_to_out_of_cohort.append(donor) + + + total_discordant_sites_that_are_concordant_with_other_donors_in_pool = 
total_discordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(DonorDiscordant_Sites_that_are_atributed_to_other_donor)) + # now we addit for a cohort since the biggest issue comes from cohort cross-contamination + # for each of these sites now we calculate the number of reads that it accounts: + # tree level set: cohort: site: counts + for site in DonorDiscordant_Sites_that_are_atributed_to_other_donor: + total_reads_for_site,discordant_reads_for_site,concordant_for_site,\ + total_reads_noHet,discordant_reads_noHet,concordant_reads_noHet= self.read_extraction([site],expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) + if concordant_for_site==0: + pass + try: + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_reads_noHet) - shared_informative = shared_gts.intersection(self.informative_sites) - shared_uninformative = shared_gts.intersection(self.uninformative_sites) - # print("shared informative " + str(len(shared_informative))) - # print("shared uninformative " + str(len(shared_uninformative))) + except: + try: + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_reads_noHet) + + except: + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh]={} + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] + total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_reads_noHet) - #store the numbers of informative and uninformative sites shared between cellSNP and gt data as these - #are the sites used for concordance - self.informative_covered = len(shared_informative) - self.uninformative_covered = len(shared_uninformative) + for site in DonorDiscordant_Sites_that_are_atributed_to_other_donor: + total_reads_for_site,discordant_reads_for_site,concordant_for_site,\ + _,_,_= self.read_extraction([site],expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) + # if discordant_reads_for_site>0: + # print('here') + if concordant_for_site==0: + pass + try: + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site) + except: + try: + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site) + + except: + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh]={} + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] + total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site) + + # to get the total reads that can be atributed to the other donor i have to check if site is already covered in the total_discordant_sites_that_are_concordant_with_other_donors_in_pool. + # the ones that havent, i have to add the reads up for them. 
+ informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(Informative__DonorDiscordant_Sites_that_are_atributed_to_other_donor)) + + total_cordant_sites_that_are_concordant_with_other_donors_in_pool = total_cordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(DonorCordant_Sites_that_are_atributed_to_other_donor)) + - if len(shared_uninformative) > 0: - #print(len(shared_uninformative)) - # print(len(shared_informative)) - if len(shared_uninformative) <= len(shared_informative): - informative_subset = set(random.sample(shared_informative, len(shared_uninformative))) + common_vars = list(set(discordant_vars) & set(donor_vars)) + common_var_count = str(len(common_vars)) + donor_cohort_common = donor + ":" + donor_cohort + ":" + common_var_count + discordant_vars_in_pool.append(donor_cohort_common) + + # Here we want to calculate the number of discordant sites in other donors and see if in terms of concordance the same donor is picked as per GT assignment. + # We do this to investigate the potential of a cell coming from this other donor. else: - informative_subset = set()#if there are more shared uninformative than shared informative we will not subset - # print(informative_subset) - # exit(0) - else: - informative_subset = set() - - # print(informative_subset) - self.informative_subset = informative_subset - - snp_gtypes_set = set(snp_gtypes) - snp_gtypes_set = sorted(snp_gtypes_set) - - cellsnp_gtypes_set = set(cellsnp_gtypes) - cellsnp_gtypes_set = sorted(cellsnp_gtypes_set) - - #for i in range(0, len(snp_gtypes)): - # cellsnp_gtypes_str = pd.DataFrame(cellsnp_gtypes_set,columns=['cellsnp_gtypes']) - # cellsnp_gtypes_str['snp_gtypes'] = pd.DataFrame(snp_gtypes_set,columns=['snp_gtypes'])['snp_gtypes'] - for i in range(0, len(snp_gtypes_set)): - discordant = False - # snp_data = snp_gtypes[i].split('_') - # cellsnp_data = cellsnp_gtypes[i].split('_') - snp_data = snp_gtypes_set[i].split('_') - cellsnp_data = cellsnp_gtypes_set[i].split('_') - - snp_alleles_set = set([snp_data[4][0], snp_data[4][2]]) - cellsnp_alleles_set = set([cellsnp_data[4][0], cellsnp_data[4][2]]) - - snp_var = ('_').join(snp_data[0:4]) - cellsnp_var = ('_').join(cellsnp_data[0:4]) - - if not cellsnp_var == snp_var: - print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i]) - exit(1) - else: - for allele in cellsnp_alleles_set: - if not allele in snp_alleles_set:#if a cellSNP allele is found that is not in the array data this is discordant - discordant = True + _='check' + try: + percent_relaxed_concordant = Nr_Relaxed_concordant/(Nr_Relaxed_concordant+true_discordant_count)*100 + except: + percent_relaxed_concordant = 0 + # g2 = total_concordant_sites_otherDonor/total_sites_otherDonor*100 + if not (percent_relaxed_concordant == concordant_percent_in_other_donor): + _='something not right' + break - if discordant == True: - true_discordant+=1 - discordant_vars.append(cellsnp_var) - if snp_var in self.uninformative_sites: - true_discordant_uninformative+=1 - true_discordant_uninformative_ids.append(snp_var) - elif snp_var in self.informative_sites: - true_discordant_informative+=1 - true_discordant_informative_ids.append(snp_var) - else: - relaxed_concordant+=1 - concordant_vars.append(cellsnp_var) - if snp_var in self.uninformative_sites: - relaxed_concordant_uninformative+=1 - relaxed_concordant_uninformative_ids.append(snp_var) - elif snp_var in 
self.informative_sites: - relaxed_concordant_informative+=1 - relaxed_concordant_informative_ids.append(snp_var) - if len(shared_uninformative) > 0: - if snp_var in informative_subset: - if discordant == True: - subset_informative_discordant+=1 - else: - subset_informative_concordant+=1 - # true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count - cell_vars2 = cell_vars.set_index('ids') - disc = pd.DataFrame(set(cell_vars2.loc[discordant_vars]['combo']),columns=['combo_x']) - df_cd = pd.merge(cell_vars, expected_vars, how='inner', on = 'pos') - disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') - disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] - disc_sites_string = ';'.join(disc2['expected_retrieved']) - return true_discordant, relaxed_concordant, relaxed_concordant_informative_ids, relaxed_concordant_uninformative_ids, true_discordant_informative_ids, true_discordant_uninformative_ids, discordant_vars, concordant_vars, disc_sites_string + donor_table_of_concordances.append({'donor':donor, 'cell':cell1, 'donor_cohort':donor_cohort, \ + 'gt matched donor':donor == donor_gt_match, \ + 'DonorCordant_Sites_that_are_atributed_to_other_donor':len(DonorCordant_Sites_that_are_atributed_to_other_donor), \ + 'DonorCordant_Sites_that_are_atributed_to_other_donor/total':f"{len(DonorCordant_Sites_that_are_atributed_to_other_donor)}/{len(concordant_vars)}", \ + 'DonorDiscordant_Sites_that_are_atributed_to_other_donor':len(DonorDiscordant_Sites_that_are_atributed_to_other_donor), \ + 'DonorDiscordant_Sites_that_are_atributed_to_other_donor/total':f"{len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)}/{len(discordant_vars)}", \ + 'concordant_percent_in_other_donor':concordant_percent_in_other_donor, \ + 'discordant_percent_in_other_donor':discordant_percent_in_other_donor, \ + 'discordant_reads_otherDonor':discordant_reads_otherDonor, \ + 'discordant_sites_otherDonor':len(discordant_vars_otherDonor), \ + 'concordant_sites_otherDonor':len(concordant_vars_otherDonor), \ + 'total_sites_otherDonor':total_sites_otherDonor, \ + 'discordant_reads_otherDonor':discordant_reads_otherDonor, \ + 'total_reads_otherDonor':total_reads_otherDonor, \ + # 'discordant_read_fraction_in_concordant_sites_otherDonor':discordant_read_fraction_in_concordant_sites_otherDonor, \ + # 'discordant_read_fraction_in_discordant_sites_otherDonor':discordant_read_fraction_in_discordant_sites_otherDonor, \ + 'concordant_reads_For_discordant_sites_that_are_Concordant_with_other_donor':concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor + }) + + + #here now we want to see overall how many reads potentially come from different cohorts. 
+ # cohort_specific_site_quant_string="" + # cohort_specific_read_quant_string="" + + Whithin_Cohort__total_number_of_potential_contaminent_reads=0 + Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool=0 + Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool=0 + try: + Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool = len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) + # for k1 in total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys(): + # Whithin_Cohort__total_number_of_potential_contaminent_reads+= max(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'][k1]) + except: + _='Doesnt Exist' + + Out_of_Cohort__total_number_of_potential_contaminent_reads=0 + try: + Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool = len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) + # for k1 in total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys(): + # Out_of_Cohort__total_number_of_potential_contaminent_reads+= max(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'][k1]) + except: + _='Doesnt Exist' - def read_concordance_calc(self,expected_vars,cell_vars): - - # This is a wrapper to add up the discordant reads in the cellsnp file. + + + + + + + + # No het total calcs + + # Since we can not tell just from the cellsnp file if the discordant read at concordant site is actually concordant with another donor in a pool + # we assume the worse case scenario and add idscordant reads at concordant sites to the total. 
+ # Number of reads on the sites that are concordant, but has discordant reads with another donor + Whithin_Cohort__total_number_of_potential_contaminent_reads=0 + try: + concordant_sites_atributed_to_other_donor = list(set(sites_contributing_to_Whithin_Cohort)) + if not total_sites >= len(concordant_sites_atributed_to_other_donor): + raise Exception("Sorry, the number of sites atributed can not be larger than total sites") - # expected genotype 0/0 - expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] - hom_ref_sites = set(expected_hom_ref['ids']) - cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)] - ad_hom_ref = cell_vars2['AD'].sum() - oth_hom_ref = cell_vars2['OTH'].sum() - discordant_hom_ref = ad_hom_ref + oth_hom_ref + _,_,_,\ + _,Whithin_Cohort__total_number_of_potential_contaminent_reads, \ + _= self.read_extraction(concordant_sites_atributed_to_other_donor,expected_vars_norm,cell_vars_norm) - # expected genotype 0/1 or 1/0 - hets = ['0/1', '1/0'] - expected_het = expected_vars[expected_vars['vars'].isin(hets)] - het_sites = set(expected_het['ids']) - cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] - discordant_het = cell_vars3['OTH'].sum() + if not discordant_reads >= len(Whithin_Cohort__total_number_of_potential_contaminent_reads): + raise Exception("Total discordant reads can not be less than within cohort discordant reads.") - # expected genotype 1/1 - expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] - hom_alt_sites = set(expected_hom_alt['ids']) - cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] - # DP + OTH - AD - ad_hom_alt = cell_vars4['AD'].sum() - dp_hom_alt = cell_vars4['DP'].sum() - oth_hom_alt = cell_vars4['OTH'].sum() - discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt - # Total analysis - discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt - total_dp = cell_vars['DP'].sum() - total_oth = cell_vars['OTH'].sum() - total_reads = total_dp + total_oth - discordantRead_ratio = discordant_reads/total_reads + except: + _='Doesnt Exist' + + + Out_of_Cohort__total_number_of_potential_contaminent_reads=0 + try: + concordant_sites_atributed_to_other_donor = list(set(sites_contributing_to_out_of_cohort)) + if not total_sites >= len(concordant_sites_atributed_to_other_donor): + raise Exception("Sorry, the number of sites atributed can not be larger than total sites") - # No het analysis - they are not discordant or concordant - we cannot make any call at those sites - discordant_reads_noHet = discordant_hom_ref + discordant_hom_alt - total_reads_noHet = total_reads - discordant_het - discordantRead_ratio_noHet = discordant_reads_noHet/total_reads_noHet + _,_,_,_,Out_of_Cohort__total_number_of_potential_contaminent_reads, _= self.read_extraction(concordant_sites_atributed_to_other_donor,expected_vars_norm,cell_vars_norm) - return total_reads,total_dp,total_oth,discordant_reads,discordantRead_ratio,discordant_reads_noHet,total_reads_noHet,discordantRead_ratio_noHet - - def read_condordance(self, expected_vars, cell_vars,discordant_vars, concordant_vars): - ''' - get read level concordance using DP, AD and OTH format fields - ##FORMAT= - ##FORMAT= - ##FORMAT= - ''' - if not len(expected_vars) == len(cell_vars): - print("length mismatch between expected vars and cell vars") - exit(1) + if not discordant_reads >= len(Out_of_Cohort__total_number_of_potential_contaminent_reads): + raise Exception("Total discordant reads can not be less than out of cohort discordant reads.") - total_sites = 
len(expected_vars) - #add cols for DP, AD< OTH - cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) - cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) - cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) + except: + _='Doesnt Exist' + + + try: + Out_of_Cohort__sites = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) + except: + Out_of_Cohort__sites = set() + + try: + Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) - set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) + shared_sites_between_out_of_cohort_and_within_cohort = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()).intersection(set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) ) + except: + Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() + shared_sites_between_out_of_cohort_and_within_cohort = set() + + try: + Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) - set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) + except: + Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) + Out_of_Cohort__unique_sites_total_reads,_,_,_,_,_ = self.read_extraction(Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool,expected_vars_norm,cell_vars_norm) + + try: + Whithin_Cohort__sites = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) + except: + Whithin_Cohort__sites = set() + + + + Total__discordant_sites_that_are_concordant_with_other_donors_in_pool = Whithin_Cohort__sites.union(Out_of_Cohort__sites) + concordant_vars_in_pool_str = (";").join(concordant_vars) + DF = pd.DataFrame(donor_table_of_concordances) - # Total - total_reads,total_dp,total_oth,discordant_reads, \ - discordantRead_ratio,discordant_reads_noHet,total_reads_noHet,discordantRead_ratio_noHet = self.read_concordance_calc(expected_vars,cell_vars) - - # uninformative - cell_vars_uninformative = cell_vars[cell_vars['ids'].isin(self.uninformative_sites)] - total_reads_uninformative,total_dp_uninformative,total_oth_uninformative,discordant_reads_uninformative, \ - discordantRead_ratio_uninformative,discordant_reads_uninformative_noHet,total_reads_uninformative_noHet,discordantRead_ratio_uninformative_noHet= self.read_concordance_calc(expected_vars,cell_vars_uninformative) - - # informative - cell_vars_informative = cell_vars[cell_vars['ids'].isin(self.informative_sites)] - total_reads_informative,total_dp_informative,total_oth_informative,discordant_reads_informative, \ - discordantRead_ratio_informative,discordant_reads_informative_noHet,total_reads_informative_noHet,discordantRead_ratio_informative_noHet = self.read_concordance_calc(expected_vars,cell_vars_informative) - - # Split into concordant and 
discordant sites - # concordant - concordant_sites = cell_vars[cell_vars['ids'].isin(set(concordant_vars))] - total_reads_for_concordant_sites,total_dp_for_concordant_sites,total_oth_for_concordant_sites,discordant_reads_for_concordant_sites, \ - discordantRead_ratio_for_concordant_sites,discordant_reads_for_concordant_sites_noHet,total_reads_for_concordant_sites_noHet,discordantRead_ratio_for_concordant_sites_noHet = self.read_concordance_calc(expected_vars,concordant_sites) + Donor_With_Lowest_DisConcordance = ';'.join(DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['donor'].values) + Lowest_Disconcordance_value_in_all_donors= DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['discordant_percent_in_other_donor'].values[0] - # discordant - discordant_sites = cell_vars[cell_vars['ids'].isin(set(discordant_vars))] - total_reads_for_discconcordant_sites,total_dp_for_discconcordant_sites,total_oth_for_discconcordant_sites,discordant_reads_for_discconcordant_sites, \ - discordantRead_ratio_for_discconcordant_sites,discordant_reads_for_discconcordant_sites_noHet,total_reads_for_discconcordant_sites_noHet,discordantRead_ratio_for_discconcordant_sites_noHet = self.read_concordance_calc(expected_vars,discordant_sites) - - # Subset analysis - cell_vars_informative_subset = cell_vars[cell_vars['ids'].isin(self.informative_subset)] - total_reads_informative_subset,total_dp_informative_subset,total_oth_informative_subset,discordant_reads_informative_subset, \ - discordantRead_ratio_informative_subset,discordant_reads_informative_subset_noHet,total_reads_informative_subset_noHet,discordantRead_ratio_informative_subset_noHet = self.read_concordance_calc(expected_vars,cell_vars_informative_subset) - - if mode == 'no_het': - total_reads = total_reads_noHet - discordant_reads = discordant_reads_noHet - total_reads_informative = total_reads_informative_noHet - discordant_reads_informative = discordant_reads_informative_noHet - total_reads_uninformative =total_reads_uninformative_noHet - discordant_reads_uninformative = discordant_reads_uninformative_noHet - total_reads_informative_subset = total_reads_informative_subset_noHet - discordant_reads_informative_subset = discordant_reads_informative_subset_noHet - total_reads_for_concordant_sites = total_reads_for_concordant_sites_noHet - discordant_reads_for_concordant_sites = discordant_reads_for_concordant_sites_noHet - total_reads_for_discconcordant_sites = total_reads_for_discconcordant_sites_noHet - discordant_reads_for_discconcordant_sites = discordant_reads_for_discconcordant_sites_noHet - - return total_sites, \ - self.informative_covered, \ - self.uninformative_covered, \ - total_reads, \ - discordant_reads, \ - total_reads_informative, \ - discordant_reads_informative, \ - total_reads_uninformative, \ - discordant_reads_uninformative, \ - total_reads_informative_subset, \ - discordant_reads_informative_subset, \ - total_reads_for_concordant_sites, \ - discordant_reads_for_concordant_sites, \ - total_reads_for_discconcordant_sites, \ - discordant_reads_for_discconcordant_sites - - - - def get_discordance(self,expected_vars2,cell_vars2): - Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) - Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) - disc = pd.DataFrame(Discordant_sites,columns=['combo_x']) - df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') - disc2= pd.merge(disc, df_cd, how='inner', on 
= 'combo_x') - disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] - disc_sites = ';'.join(disc2['expected_retrieved']) - return Concordant_Sites,Discordant_sites,disc_sites - - - def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars,donor_cohort=False): - # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. - # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. - # Author: M.Ozols - - cell_vars_norm = self.norm_genotypes(cell_vars) - - if len(cell_vars_norm) > 0: - - if mode == 'no_het': - hets = ['0/1', '1/0'] - expected_vars_norm = expected_vars_norm[~expected_vars_norm['vars'].isin(hets)] + Donor_With_Highest_Concordance = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['donor'].values) + Highest_Concordance_value_in_all_donors= DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['concordant_percent_in_other_donor'].values[0] + Total_sites_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_sites_otherDonor'].astype(str).values) + Total_reads_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_reads_otherDonor'].astype(str).values) + + return [{ + 'cell1':cell1, + 'donor_gt_match':donor_gt_match, + 'Nr_Concordant':Nr_Concordant, + 'Nr_Discordant':Nr_Discordant, + 'Nr_Relaxed_concordant':Nr_Relaxed_concordant, + 'true_discordant_count':true_discordant_count, + 'relaxed_concordant_informative_count':relaxed_concordant_informative_count, + 'relaxed_concordant_uninformative_count':relaxed_concordant_uninformative_count, + 'true_discordant_informative_count':true_discordant_informative_count, + 'true_discordant_uninformative_count':true_discordant_uninformative_count, + 'Nr_Total_Overlapping_sites':Nr_Total_Overlapping_sites, + 'Number_of_sites_that_are_donor_concordant_and_exclusive':Number_of_sites_that_are_donor_concordant_and_exclusive, + 'Nr_donor_distinct_sites':Nr_donor_distinct_sites, + 'count':count, + 'discordant_sites':discordant_sites, + 'total_sites':total_sites, + 'informative_sites':informative_sites, + 'uninformative_sites':uninformative_sites, + 'total_reads_informative':total_reads_informative, + 'total_reads_uninformative':total_reads_uninformative, + 'discordant_reads_informative':discordant_reads_informative, + 'discordant_reads_uninformative':discordant_reads_uninformative, + 'Discordant_sites_in_pool': discordant_vars, + 'Lowest_Disconcordance_value_in_all_donors':Lowest_Disconcordance_value_in_all_donors, + 'Donor_With_Lowest_DisConcordance':Donor_With_Lowest_DisConcordance, + 'Concordant_Site_Identities':concordant_vars_in_pool_str, + 'Donor_With_Highest_Concordance':Donor_With_Highest_Concordance, + 'Highest_Concordance_value_in_all_donors':Highest_Concordance_value_in_all_donors, + 'Total_sites_other_donor':Total_sites_other_donor, + 'Total_reads_other_donor':Total_reads_other_donor, + 'total_discordant_sites_that_are_concordant_with_other_donors_in_pool':f"{len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool)}/{len(discordant_vars)}", + 
'informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool':f"{len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool)}/{len(true_discordant_informative_count)}", + 'total_reads':total_reads, + 'discordant_reads':discordant_reads, + 'discordant_reads_informative_subset':discordant_reads_informative_subset, \ + 'discordant_read_fraction_in_concordant_sites':discordant_read_fraction_in_concordant_sites, \ + 'discordant_read_fraction_in_discordant_sites':discordant_read_fraction_in_discordant_sites, \ + 'Whithin_Cohort__total_number_of_potential_contaminent_reads':Whithin_Cohort__total_number_of_potential_contaminent_reads, \ + 'Out_of_Cohort__total_number_of_potential_contaminent_reads':Out_of_Cohort__total_number_of_potential_contaminent_reads, \ + 'NrDonors_contributing_to_out_of_cohort':len(set(donors_contributing_to_out_of_cohort)), \ + 'NrDonors_contributing_to_Whithin_Cohort':len(set(donors_contributing_to_Whithin_Cohort)), \ - Total_Overlapping_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) - expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] - cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] - cell_vars2 = cell_vars2.drop_duplicates(subset=['ids']) - expected_vars2 = expected_vars2.drop_duplicates(subset=['ids']) - if len(cell_vars2)!=len(expected_vars2): - print('failed to select comon variants') - exit(1) - - # Find exact discordant sites - Concordant_Sites, Discordant_sites, _ = self.get_discordance(expected_vars2, cell_vars2) - #find truly discordant sites - true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, discordant_vars, concordant_vars, disc_sites_string = self.get_strict_discordance(expected_vars2, cell_vars2) - #find discordant reads - total_sites, \ - informative_sites, \ - uninformative_sites, \ - total_reads, \ - discordant_reads, \ - total_reads_informative, \ - discordant_reads_informative, \ - total_reads_uninformative, \ - discordant_reads_uninformative, \ - total_reads_informative_subset, \ - discordant_reads_informative_subset, \ - total_reads_for_concordant_sites, \ - discordant_reads_for_concordant_sites, \ - total_reads_for_discconcordant_sites, \ - discordant_reads_for_discconcordant_sites = self.read_condordance(expected_vars2, cell_vars2, discordant_vars, concordant_vars) - - discordant_read_fraction_in_concordant_sites = f"{discordant_reads_for_concordant_sites}/{total_reads_for_concordant_sites}" - discordant_read_fraction_in_discordant_sites = f"{discordant_reads_for_discconcordant_sites}/{total_reads_for_discconcordant_sites}" - discordant_reads_uninformative_fraction = f"{discordant_reads_uninformative}/{total_reads_uninformative}" - discordant_reads_informative_fraction = f"{discordant_reads_informative}/{total_reads_informative}" - - # sanity checks - if total_reads != total_reads_for_concordant_sites+total_reads_for_discconcordant_sites: - print("Error: total reads dont add up ") - exit(1) - if discordant_reads != discordant_reads_for_concordant_sites+discordant_reads_for_discconcordant_sites: - print("Error: discordant reads dont add up ") - exit(1) - - - else: - Total_Overlapping_sites = set() - Concordant_Sites = set() - Discordant_sites = set() - disc_sites = '' - true_discordant_count = 0 - relaxed_concordant_count = 0 - total_sites = 0 - 
relaxed_concordant_informative_count = 0 - relaxed_concordant_uninformative_count = 0 - true_discordant_informative_count = 0 - true_discordant_uninformative_count = 0 - informative_sites = 0 - disc_sites_string = '' - discordant_reads = 0 - uninformative_sites = 0 - total_reads = 0 - total_reads_informative =0 - total_reads_uninformative=0 - discordant_reads=0 - discordant_reads_informative=0 - discordant_reads_uninformative=0 - discordant_vars=0 - concordant_vars=0 - discordant_read_fraction_in_concordant_sites=0 - discordant_read_fraction_in_discordant_sites=0 - discordant_reads_uninformative_fraction=0 - discordant_reads_informative_fraction=0 - discordant_reads_informative_subset=0 - - return Concordant_Sites, \ - Discordant_sites, \ - Total_Overlapping_sites, \ - disc_sites_string, \ - cell_vars_norm, \ - true_discordant_count, \ - relaxed_concordant_count, \ - relaxed_concordant_informative_count, \ - relaxed_concordant_uninformative_count, \ - true_discordant_informative_count, \ - true_discordant_uninformative_count, \ - total_sites, \ - informative_sites, \ - uninformative_sites, \ - total_reads, \ - total_reads_informative, \ - total_reads_uninformative, \ - discordant_reads, \ - discordant_reads_informative, \ - discordant_reads_uninformative, \ - discordant_vars, \ - concordant_vars, \ - discordant_read_fraction_in_concordant_sites, \ - discordant_read_fraction_in_discordant_sites, \ - discordant_reads_uninformative_fraction, \ - discordant_reads_informative_fraction, \ - discordant_reads_informative_subset - + 'Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool, \ + 'Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool, \ + 'shared_sites_between_Out_of_Cohort_and_Within_Cohort':len(shared_sites_between_out_of_cohort_and_within_cohort), \ + 'Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool), \ + 'Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool), \ + 'Out_of_Cohort__unique_sites_total_reads':Out_of_Cohort__unique_sites_total_reads, \ + 'Total__discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Total__discordant_sites_that_are_concordant_with_other_donors_in_pool) + }, donor_table_of_concordances] - - def set_results(self,to_set,id): - # Recod to disk to save the loading mmeory time. 
- hash = random.getrandbits(128) - with open(f'tmp_{id}_{hash}.pkl', 'wb') as f: - pickle.dump(to_set, f) - self.record_dict[id]=f'tmp_{id}_{hash}.pkl' - return - # def append_results_cell_concordances(self,result): - def append_results_cell_concordances(self,result,cell_concordance_table,other_donor_concordances,other_donor_concordance_table): - other_donor_concordance_table = other_donor_concordance_table + other_donor_concordances - count=result['count'] - try: - percent_concordant = result['Nr_Concordant']/(result['Nr_Discordant']+result['Nr_Concordant'])*100 - except: - percent_concordant = 0 - - try: - percent_discordant = result['Nr_Discordant']/(result['Nr_Discordant']+result['Nr_Concordant'])*100 - except: - percent_discordant = 0 - try: - percent_relaxed_concordant = result['Nr_Relaxed_concordant']/(result['Nr_Relaxed_concordant']+result['true_discordant_count'])*100 - except: - percent_relaxed_concordant = 0 - - try: - percent_strict_discordant = result['true_discordant_count']/(result['Nr_Relaxed_concordant']+result['true_discordant_count'])*100 - except: - percent_strict_discordant = 0 + def norm_genotypes(self,expected_vars): + expected_vars = pd.DataFrame(expected_vars) + if len(expected_vars) > 0: + split_str=expected_vars[0].str.split("_") + expected_vars['ids'] = split_str.str[0]+'_'+split_str.str[1]+'_'+split_str.str[2]+'_'+split_str.str[3] + expected_vars['pos'] = split_str.str[0]+'_'+split_str.str[1] + expected_vars['vars'] = split_str.str[4] + expected_vars['vars'] = expected_vars['vars'].str.replace('|','/',regex=False) + expected_vars = expected_vars[expected_vars['vars']!='./.'] + expected_vars.loc[expected_vars['vars']=='0/1','vars']='1/0' + expected_vars['combo']= expected_vars['ids']+'_'+expected_vars['vars'] + return expected_vars - try: - read_discordance = result['discordant_reads']/result['total_sites'] - except: - read_discordance = 0 - cohort = 'UNKNOWN' - donor_split = result['donor_gt_match'].split("_") - if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): - cohort = 'UKB' - elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): - cohort = 'ELGH' - same_as_asigned_donor = result['donor_gt_match'] in result['Donor_With_Highest_Concordance'] - if not same_as_asigned_donor: - same_as_asigned_donor = result['donor_gt_match'] in result['Donor_With_Lowest_DisConcordance'] - - cell_concordance_table[f"{result['cell1']} --- {result['donor_gt_match']}"] = {'GT 1':result['cell1'], - 'GT 2':result['donor_gt_match'], - 'cohort': cohort, - - # 'Nr_Concordant':result['Nr_Concordant'], - # 'Nr_Discordant':result['Nr_Discordant'], - 'Nr_Relaxed_concordant':result['Nr_Relaxed_concordant'], - 'Nr_strict_discordant':result['true_discordant_count'], - # 'Percent Concordant':percent_concordant, - # 'Percent Discordant':percent_discordant, - 'Percent_relaxed_concordant': percent_relaxed_concordant, - 'Percent_strict_discordant': percent_strict_discordant, - 'Nr_concordant_informative': len(result['relaxed_concordant_informative_count']), - 'Nr_concordant_uninformative': len(result['relaxed_concordant_uninformative_count']), - 'Nr_discordant_informative': len(result['true_discordant_informative_count']), - 'Nr_discordant_uninformative': len(result['true_discordant_uninformative_count']), - 'NrTotal_Overlapping_sites_between_two_genotypes':result['Nr_Total_Overlapping_sites'], - 'Nr_donor_distinct_sites_within_pool_individuals':result['Nr_donor_distinct_sites'], - 
'Number_of_sites_that_are_donor_concordant_and_exclusive':result['Number_of_sites_that_are_donor_concordant_and_exclusive'], - 'Total_sites': result['total_sites'], - 'Total_informative_sites': result['informative_sites'], - 'Total_uninformative_sites': result['uninformative_sites'], - 'Total_reads': result['total_reads'], - 'Total_reads_informative': result['total_reads_informative'], - 'Total_reads_uninformative': result['total_reads_uninformative'], - 'Discordant_reads': result['discordant_reads'], - 'Discordant_reads_informtive': result['discordant_reads_informative'], - 'Discordant_reads_uninformtive': result['discordant_reads_uninformative'], - 'discordant_reads_informative_subset':result['discordant_reads_informative_subset'], - 'Discordant_reads_by_n_sites': read_discordance, - 'Discordant_sites_in_pool': len(result['Discordant_sites_in_pool']), - 'Lowest_Disconcordance_value_in_all_donors':result['Lowest_Disconcordance_value_in_all_donors'], - 'Donor_With_Lowest_DisConcordance':result['Donor_With_Lowest_DisConcordance'], - 'Concordant_Site_Identities':result['Concordant_Site_Identities'], - 'Donor_With_Highest_Concordance':result['Donor_With_Highest_Concordance'], - 'Highest_Concordance_value_in_all_donors':result['Highest_Concordance_value_in_all_donors'], - 'same_as_asigned_donor':same_as_asigned_donor, - 'Total_sites_other_donor (if same_as_asigned_donor=False)':result['Total_sites_other_donor'], - 'Total_reads_other_donor (if same_as_asigned_donor=False)':result['Total_reads_other_donor'], - 'total_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['total_discordant_sites_that_are_concordant_with_other_donors_in_pool'], - 'discordant_read_fraction_in_concordant_site':result['discordant_read_fraction_in_concordant_sites'], - 'discordant_read_fraction_in_discordant_sites':result['discordant_read_fraction_in_discordant_sites'], - 'Whithin_Cohort__total_number_of_potential_contaminent_reads':result['Whithin_Cohort__total_number_of_potential_contaminent_reads'], - 'Out_of_Cohort__total_number_of_potential_contaminent_reads':result['Out_of_Cohort__total_number_of_potential_contaminent_reads'], - 'NrDonors_contributing_to_out_of_cohort':result['NrDonors_contributing_to_out_of_cohort'], - 'NrDonors_contributing_to_Whithin_Cohort':result['NrDonors_contributing_to_Whithin_Cohort'], - - 'Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool'], - 'Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool'], - 'shared_sites_between_Out_of_Cohort_and_Within_Cohort':result['shared_sites_between_Out_of_Cohort_and_Within_Cohort'], - 'Within_Cohort__unique_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool'], - 'Out_of_Cohort__unique_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool'], - 'Out_of_Cohort__unique_sites_total_reads':result['Out_of_Cohort__unique_sites_total_reads'], - 'Total__discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Total__discordant_sites_that_are_concordant_with_other_donors_in_pool'] - } - - return [cell_concordance_table,other_donor_concordance_table] - - # def 
combine_written_files(self):#this one is for concordance class - # to_export = self.cell_concordance_table - # for val1 in self.record_dict.values(): - # # here remove the int files. - # print(f"merging temp file: {val1}") - # with open(val1, 'rb') as f: - # loaded_dict = pickle.load(f) - # for k1 in loaded_dict.keys(): - # to_export[k1]=loaded_dict[k1] - # os.remove(val1) - # return to_export - - - def combine_written_lists(self,exclusive_donor_variants,record_dict):#this is for VCF loader class - to_export = exclusive_donor_variants - for val1 in record_dict.values(): - # here remove the int files. - print(f"merging temp file: {val1}") - with open(val1, 'rb') as f: - loaded_dict = pickle.load(f) - self.other_donor_comp = self.other_donor_comp+ loaded_dict - os.remove(val1) - return self.other_donor_comp - - def combine_written_files(self,exclusive_donor_variants,record_dict):#this is for VCF loader class - to_export = exclusive_donor_variants - for val1 in record_dict.values(): - # here remove the int files. - print(f"merging temp file: {val1}") - with open(val1, 'rb') as f: - loaded_dict = pickle.load(f) - for k1 in loaded_dict.keys(): - try: - to_export[k1]=to_export[k1].union(loaded_dict[k1]) - except: - to_export[k1]=set() - to_export[k1]=to_export[k1].union(loaded_dict[k1]) - os.remove(val1) - return to_export - - def set_results(self,to_set,id): - # Recod to disk to save the loading mmeory time. - hash = random.getrandbits(128) - with open(f'tmp_{id}_{hash}.pkl', 'wb') as f: - pickle.dump(to_set, f) - self.record_dict[id]=f'tmp_{id}_{hash}.pkl' - - def analyse_donor(self,analyse_donor,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm,donor_assignments_table): - donor_concordance_table = {} - other_donor_concordance_table = [] - for cell1 in Cells_to_keep_pre: - count+=1 + def get_strict_discordance(self, expected_vars, cell_vars): + ''' + take a list of SNP array genotypes and a list of cellSNP genotypes, return counts of truly discordant + sites and relaxed concordant sites and list of discordant sites1 + 1) If you have 1/1 on SNP array you can not get a 0/1 or 0/0 genotype + 2) if you have a 0/0 you can not get a 1/1 or 0/1 + 3) if you genotype is 0/1 you can get all copies: 0/0 . 0/1. 
1/1 + So - each obversed cellsnp allele must be in the array SNP gtype + ''' + snp_gtypes = expected_vars[0] + cellsnp_gtypes = cell_vars[0] + true_discordant = 0 + relaxed_concordant = 0 + relaxed_concordant_informative = 0 + relaxed_concordant_informative_ids = [] + relaxed_concordant_uninformative_ids = [] + true_discordant_uninformative_ids =[] + true_discordant_informative_ids=[] + relaxed_concordant_uninformative = 0 + true_discordant_informative = 0 + true_discordant_uninformative = 0 + discordant_vars = [] + concordant_vars = [] + subset_informative_concordant = 0 + subset_informative_discordant = 0 - cell_vars = exclusive_cell_variants[cell1] - try: - result1, other_donor_concordances = self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data,donor_assignments_table) - cell_concordance_table,other_donor_concordance_table = self.append_results_cell_concordances(result1,donor_concordance_table,other_donor_concordances,other_donor_concordance_table) - except: - continue - # if count>800: - # break - # here we should write these independently to the files - if (count % 50 == 0): - self.set_results(other_donor_concordance_table,f"{count}--{donor_gt_match}") - other_donor_concordance_table = [] - + #print(self.uninformative_sites) + #print(self.informative_sites) - self.set_results(other_donor_concordance_table,f"{count}--{donor_gt_match}") - output2 = self.combine_written_lists(self.other_donor_comp,self.record_dict) - pd.DataFrame(output2).sort_values(by=['cell']).to_csv(f'{donor_gt_match}-{analyse_donor}--each_cells_comparison_with_other_donor.tsv',sep='\t',index=False) - # del output2 - self.record_dict={} - return donor_concordance_table + #create sets of the ids (chrom, pos, ref, alt) in each set of genotypes. Filter to the ids present in both + #then filter to informative and uninformative. 
If uninformative >0 then create a subset of informative + # with the same number of vars (at random) + split_snp_gts=snp_gtypes.str.split("_") + snp_gtypes_ids = set(split_snp_gts.str[0]+'_'+split_snp_gts.str[1]+'_'+split_snp_gts.str[2]+'_'+split_snp_gts.str[3]) - def combine_concordances(self,result): - - self.cell_concordance_table = {**self.cell_concordance_table, **result} - + split_cellsnp_gts=cellsnp_gtypes.str.split("_") + cellsnp_gtypes_ids = set(split_cellsnp_gts.str[0]+'_'+split_cellsnp_gts.str[1]+'_'+split_cellsnp_gts.str[2]+'_'+split_cellsnp_gts.str[3]) - def conc_table(self): - donor_assignments_table=self.donor_assignments_table - cell_assignments_table=self.cell_assignments_table - exclusive_don_variants=self.exclusive_don_variants - exclusive_cell_variants= self.exclusive_cell_variants - donor_list = exclusive_don_variants.keys() - # cpus = 1 - pool = mp.Pool(cpus) - count = 0 - - - #create a list of variants that are on each donor genotype file - vars_per_donor_gt = {} - for don_id in donor_list: - donor_gt_vars = list(exclusive_don_variants[don_id]) - donor_gt_vars = pd.DataFrame(donor_gt_vars) - donor_gt_vars = self.norm_genotypes(donor_gt_vars) - donor_gt_vars = donor_gt_vars[donor_gt_vars['vars'] != '0/0'] - donor_gt_varids = list(donor_gt_vars['ids']) - vars_per_donor_gt[don_id] = donor_gt_varids - - #work out what cohort each donor belongs to - donor_cohorts = {} - for don_id in donor_list: - cohort = 'UNKNOWN' - donor_split = don_id.split("_") - if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): - cohort = 'UKB' - elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): - cohort = 'ELGH' - donor_cohorts[don_id] = cohort + shared_gts = snp_gtypes_ids.intersection(cellsnp_gtypes_ids) - all_donor_data={} - # here we calvculate all the expected donor datasets - for row1 in exclusive_don_variants.keys(): - # donor_in_question = row1['donor_query'] - donor_gt_match = row1 - expected_vars_of_other_donor = self.exclusive_don_variants[donor_gt_match] - expected_vars_norm_of_other_donor = self.norm_genotypes(expected_vars_of_other_donor) - all_donor_data[donor_gt_match]=expected_vars_norm_of_other_donor + shared_informative = shared_gts.intersection(self.informative_sites) + shared_uninformative = shared_gts.intersection(self.uninformative_sites) + # print("shared informative " + str(len(shared_informative))) + # print("shared uninformative " + str(len(shared_uninformative))) - results = [] - for i,row1 in donor_assignments_table.iterrows(): - donor_in_question = row1['donor_query'] - donor_gt_match = row1['donor_gt'] - print(donor_gt_match) - # if i>4: - # continue - if (donor_gt_match=='NONE'): - continue - try: - donor_gt_match_cohort = donor_cohorts[donor_gt_match] - except: - continue - Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell'])) - expected_vars = exclusive_don_variants[donor_gt_match] - expected_vars_norm = self.norm_genotypes(expected_vars) - try: - # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor. 
- dds = self.donor_distinct_sites[donor_gt_match] - except: - continue - if cpus==1: - result = self.analyse_donor(donor_in_question,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm,donor_assignments_table) - self.combine_concordances(result) + #store the numbers of informative and uninformative sites shared between cellSNP and gt data as these + #are the sites used for concordance + self.informative_covered = len(shared_informative) + self.uninformative_covered = len(shared_uninformative) + + if len(shared_uninformative) > 0: + #print(len(shared_uninformative)) + # print(len(shared_informative)) + if len(shared_uninformative) <= len(shared_informative): + informative_subset = set(random.sample(shared_informative, len(shared_uninformative))) else: + informative_subset = set()#if there are more shared uninformative than shared informative we will not subset + # print(informative_subset) + # exit(0) + else: + informative_subset = set() - result = pool.apply_async(self.analyse_donor, args=([donor_in_question,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,count,all_donor_data,expected_vars_norm,donor_assignments_table]),callback=self.combine_concordances) - results.append(result) - try: - [result.wait() for result in results] - except: - print('done') - print('Done') - pool.close() - pool.join() - - - - # output = self.combine_written_files(self.cell_concordance_table,self.record_dict) - - return self.cell_concordance_table + # print(informative_subset) + self.informative_subset = informative_subset - + snp_gtypes_set = set(snp_gtypes) + snp_gtypes_set = sorted(snp_gtypes_set) - def read_extraction(self,DonorDiscordant_Sites_that_are_atributed_to_other_donor,expected_vars_norm,cell_vars_norm): - # we need this function wrapper to calculate the concordant, discordant read - # counts for each of the discordant sites that are concordant with another donor. 
- - Total_Overlapping_sites = set(DonorDiscordant_Sites_that_are_atributed_to_other_donor) - expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] - cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] - - cell_vars2 = cell_vars2.drop_duplicates(subset=['ids']) - expected_vars2 = expected_vars2.drop_duplicates(subset=['ids']) - cell_vars2['DP'] = cell_vars2[0].str.split("_").str[5].astype(int) - cell_vars2['AD'] = cell_vars2[0].str.split("_").str[6].astype(int) - cell_vars2['OTH'] = cell_vars2[0].str.split("_").str[7].astype(int) - total_reads,_,_,discordant_reads, \ - discordantRead_ratio,discordant_reads_noHet,total_reads_noHet,discordantRead_ratio_noHet= self.read_concordance_calc(expected_vars2,cell_vars2) - concordant_reads = total_reads - discordant_reads - concordant_reads_noHet = total_reads_noHet-discordant_reads_noHet - return total_reads,discordant_reads,concordant_reads,\ - total_reads_noHet,discordant_reads_noHet,concordant_reads_noHet - - def concordance_table_production(self,expected_vars_norm,cell_vars,cell1,donor_gt_match, donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data,donor_assignments_table): + cellsnp_gtypes_set = set(cellsnp_gtypes) + cellsnp_gtypes_set = sorted(cellsnp_gtypes_set) - Concordant_Sites, \ - Discordant_sites, \ - Total_Overlapping_sites, \ - discordant_sites, \ - cell_vars_norm, \ - true_discordant_count, \ - relaxed_concordant_count, \ - relaxed_concordant_informative_count, \ - relaxed_concordant_uninformative_count, \ - true_discordant_informative_count, \ - true_discordant_uninformative_count, \ - total_sites, \ - informative_sites, \ - uninformative_sites, \ - total_reads, \ - total_reads_informative, \ - total_reads_uninformative, \ - discordant_reads, \ - discordant_reads_informative, \ - discordant_reads_uninformative, \ - discordant_vars, \ - concordant_vars, \ - discordant_read_fraction_in_concordant_sites, \ - discordant_read_fraction_in_discordant_sites, \ - discordant_reads_uninformative_fraction, \ - discordant_reads_informative_fraction, discordant_reads_informative_subset = self.retrieve_concordant_discordant_sites(expected_vars_norm,cell_vars) + #for i in range(0, len(snp_gtypes)): + # cellsnp_gtypes_str = pd.DataFrame(cellsnp_gtypes_set,columns=['cellsnp_gtypes']) + # cellsnp_gtypes_str['snp_gtypes'] = pd.DataFrame(snp_gtypes_set,columns=['snp_gtypes'])['snp_gtypes'] + for i in range(0, len(snp_gtypes_set)): + discordant = False + # snp_data = snp_gtypes[i].split('_') + # cellsnp_data = cellsnp_gtypes[i].split('_') + snp_data = snp_gtypes_set[i].split('_') + cellsnp_data = cellsnp_gtypes_set[i].split('_') + + snp_alleles_set = set([snp_data[4][0], snp_data[4][2]]) + cellsnp_alleles_set = set([cellsnp_data[4][0], cellsnp_data[4][2]]) + + snp_var = ('_').join(snp_data[0:4]) + cellsnp_var = ('_').join(cellsnp_data[0:4]) + + if not cellsnp_var == snp_var: + print("Error with strict discordance calculations: " + snp_gtypes[i] + " " + cellsnp_gtypes[i]) + exit(1) + else: + for allele in cellsnp_alleles_set: + if not allele in snp_alleles_set:#if a cellSNP allele is found that is not in the array data this is discordant + discordant = True + + if discordant == True: + true_discordant+=1 + discordant_vars.append(cellsnp_var) + if snp_var in self.uninformative_sites: + true_discordant_uninformative+=1 + true_discordant_uninformative_ids.append(snp_var) + elif snp_var in self.informative_sites: + true_discordant_informative+=1 + 
true_discordant_informative_ids.append(snp_var) + else: + relaxed_concordant+=1 + concordant_vars.append(cellsnp_var) + if snp_var in self.uninformative_sites: + relaxed_concordant_uninformative+=1 + relaxed_concordant_uninformative_ids.append(snp_var) + elif snp_var in self.informative_sites: + relaxed_concordant_informative+=1 + relaxed_concordant_informative_ids.append(snp_var) + if len(shared_uninformative) > 0: + if snp_var in informative_subset: + if discordant == True: + subset_informative_discordant+=1 + else: + subset_informative_concordant+=1 + # true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, subset_informative_sites_concordant_count, subset_informative_sites_discordant_count + cell_vars2 = cell_vars.set_index('ids') + disc = pd.DataFrame(set(cell_vars2.loc[discordant_vars]['combo']),columns=['combo_x']) + df_cd = pd.merge(cell_vars, expected_vars, how='inner', on = 'pos') + disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') + disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] + disc_sites_string = ';'.join(disc2['expected_retrieved']) + return true_discordant, relaxed_concordant, relaxed_concordant_informative_ids, relaxed_concordant_uninformative_ids, true_discordant_informative_ids, true_discordant_uninformative_ids, discordant_vars, concordant_vars, disc_sites_string + + def read_concordance_calc(self,expected_vars,cell_vars): + + # This is a wrapper to add up the discordant reads in the cellsnp file. + + # expected genotype 0/0 + expected_hom_ref = expected_vars[expected_vars['vars'] == '0/0'] + hom_ref_sites = set(expected_hom_ref['ids']) + cell_vars2 = cell_vars[cell_vars['ids'].isin(hom_ref_sites)] + ad_hom_ref = cell_vars2['AD'].sum() + oth_hom_ref = cell_vars2['OTH'].sum() + discordant_hom_ref = ad_hom_ref + oth_hom_ref + + # expected genotype 0/1 or 1/0 + hets = ['0/1', '1/0'] + expected_het = expected_vars[expected_vars['vars'].isin(hets)] + het_sites = set(expected_het['ids']) + cell_vars3 = cell_vars[cell_vars['ids'].isin(het_sites)] + discordant_het = cell_vars3['OTH'].sum() + + # expected genotype 1/1 + expected_hom_alt = expected_vars[expected_vars['vars'] == '1/1'] + hom_alt_sites = set(expected_hom_alt['ids']) + cell_vars4 = cell_vars[cell_vars['ids'].isin(hom_alt_sites)] + # DP + OTH - AD + ad_hom_alt = cell_vars4['AD'].sum() + dp_hom_alt = cell_vars4['DP'].sum() + oth_hom_alt = cell_vars4['OTH'].sum() + discordant_hom_alt = (dp_hom_alt + oth_hom_alt) - ad_hom_alt - total_concordant_sites = len(Concordant_Sites) + relaxed_concordant_count - dds = self.donor_distinct_sites[donor_gt_match] - Nr_donor_distinct_sites = len(dds) - Nr_Concordant = len(Concordant_Sites) - Nr_Relaxed_concordant = relaxed_concordant_count - Nr_Discordant = len(Discordant_sites) - Nr_Total_Overlapping_sites = len(Total_Overlapping_sites) - Number_of_sites_that_are_donor_concordant_and_exclusive = len(set(dds).intersection(set(Discordant_sites))) - Number_of_sites_in_cellsnp_but_not_in_reference = set(cell_vars_norm['pos'])-set(expected_vars_norm['pos']) - #Quantify donor variation in other donors - discordant_vars_in_pool = [] - donor_table_of_concordances = [] - total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() - total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown = {} - 
total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort']={} - total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort']={} - - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown_totalReads = {} - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown_totalReads['Whithin_Cohort']={} - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown_totalReads['Out_of_Cohort']={} - - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown = {} - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort']={} - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort']={} - - - informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() - total_cordant_sites_that_are_concordant_with_other_donors_in_pool = set() - donor_gt_match_cohort = donor_cohorts[donor_gt_match] - donors_contributing_to_out_of_cohort= [] - donors_contributing_to_Whithin_Cohort=[] - sites_contributing_to_out_of_cohort= [] - sites_contributing_to_Whithin_Cohort=[] - for donor in set(donor_assignments_table['donor_gt']): - try: - expected_vars_norm_of_other_donor = all_donor_data[donor] - except: - continue - try: - donor_cohort = donor_cohorts[donor] - donor_vars = vars_per_donor_gt[donor] - except: - continue + # Total analysis + discordant_reads = discordant_hom_ref + discordant_het + discordant_hom_alt + total_dp = cell_vars['DP'].sum() + total_oth = cell_vars['OTH'].sum() + total_reads = total_dp + total_oth + discordantRead_ratio = discordant_reads/total_reads - Concordant_Sites_otherDonor, \ - Discordant_sites_otherDonor, \ - Total_Overlapping_sites_otherDonor, \ - discordant_sites_otherDonor, \ - cell_vars_norm_otherDonor, \ - true_discordant_count_otherDonor, \ - relaxed_concordant_count_otherDonor, \ - relaxed_concordant_informative_count_otherDonor, \ - relaxed_concordant_uninformative_count_otherDonor, \ - true_discordant_informative_count_otherDonor, \ - true_discordant_uninformative_count_otherDonor, \ - total_sites_otherDonor, \ - informative_sites_otherDonor, \ - uninformative_sites_otherDonor, \ - total_reads_otherDonor, \ - total_reads_informative_otherDonor, \ - total_reads_uninformative_otherDonor, \ - discordant_reads_otherDonor, \ - discordant_reads_informative_otherDonor, \ - discordant_reads_uninformative_otherDonor, \ - discordant_vars_otherDonor, \ - concordant_vars_otherDonor, \ - discordant_read_fraction_in_concordant_sites_otherDonor, \ - discordant_read_fraction_in_discordant_sites_otherDonor, \ - discordant_reads_uninformative_fraction_otherDonor, \ - discordant_reads_informative_fraction_otherDonor, discordant_reads_informative_subset_otherDonor = self.retrieve_concordant_discordant_sites(expected_vars_norm_of_other_donor,cell_vars,donor_cohort=donor_cohort) + # No het analysis - they are not discordant or concordant - we cannot make any call at those sites + discordant_reads_noHet = discordant_hom_ref + discordant_hom_alt + total_reads_noHet = total_reads - discordant_het + discordantRead_ratio_noHet = discordant_reads_noHet/total_reads_noHet - if total_sites_otherDonor==0: - total_sites_otherDonor=0.000000000001 - total_concordant_sites_otherDonor = relaxed_concordant_count_otherDonor - concordant_percent_in_other_donor= total_concordant_sites_otherDonor/total_sites_otherDonor*100 - discordant_percent_in_other_donor= 
true_discordant_count_otherDonor/total_sites_otherDonor*100 - DonorDiscordant_Sites_that_are_atributed_to_other_donor = set(discordant_vars).intersection(set(concordant_vars_otherDonor)) - TotalSites_that_are_atributed_to_other_donor = set(discordant_vars).union(set(concordant_vars)).intersection(set(concordant_vars_otherDonor)) + return total_reads,total_dp,total_oth,discordant_reads,discordantRead_ratio,discordant_reads_noHet,total_reads_noHet,discordantRead_ratio_noHet - Informative__DonorDiscordant_Sites_that_are_atributed_to_other_donor = set(true_discordant_informative_count).intersection(set(relaxed_concordant_informative_count_otherDonor)) - DonorCordant_Sites_that_are_atributed_to_other_donor = set(concordant_vars).intersection(set(concordant_vars_otherDonor)) - - total_reads_for_discordant_sites_that_are_concordant_with_other_donor,discordant_reads_for_discordant_sites_that_are_concordant_with_other_donor,concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor,\ - _,_,_= self.read_extraction(DonorDiscordant_Sites_that_are_atributed_to_other_donor,expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) + def read_condordance(self, expected_vars, cell_vars,discordant_vars, concordant_vars): + ''' + get read level concordance using DP, AD and OTH format fields + ##FORMAT= + ##FORMAT= + ##FORMAT= + ''' + if not len(expected_vars) == len(cell_vars): + print("length mismatch between expected vars and cell vars") + exit(1) - if not donor == donor_gt_match: - # We want to kow how many of these discordant site - if 'U937' in donor: - continue - if 'THP1' in donor: - continue - - if donor_gt_match_cohort == donor_cohort: - coh = 'Whithin_Cohort' - sites_contributing_to_Whithin_Cohort.extend(list(TotalSites_that_are_atributed_to_other_donor)) - if len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)>0: - donors_contributing_to_Whithin_Cohort.append(donor) - - else: - coh = 'Out_of_Cohort' - sites_contributing_to_out_of_cohort.extend(list(TotalSites_that_are_atributed_to_other_donor)) - if len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)>0: - donors_contributing_to_out_of_cohort.append(donor) - - - total_discordant_sites_that_are_concordant_with_other_donors_in_pool = total_discordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(DonorDiscordant_Sites_that_are_atributed_to_other_donor)) - # now we addit for a cohort since the biggest issue comes from cohort cross-contamination - # for each of these sites now we calculate the number of reads that it accounts: - # tree level set: cohort: site: counts - for site in DonorDiscordant_Sites_that_are_atributed_to_other_donor: - total_reads_for_site,discordant_reads_for_site,concordant_for_site,\ - total_reads_noHet,discordant_reads_noHet,concordant_reads_noHet= self.read_extraction([site],expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) - if concordant_for_site==0: - pass - try: - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_reads_noHet) + total_sites = len(expected_vars) + #add cols for DP, AD< OTH + cell_vars['DP'] = cell_vars[0].str.split("_").str[5].astype(int) + cell_vars['AD'] = cell_vars[0].str.split("_").str[6].astype(int) + cell_vars['OTH'] = cell_vars[0].str.split("_").str[7].astype(int) + - except: - try: - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] - 
total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_reads_noHet) - - except: - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh]={} - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] - total_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_reads_noHet) + + # Total + total_reads,total_dp,total_oth,discordant_reads, \ + discordantRead_ratio,discordant_reads_noHet,total_reads_noHet,discordantRead_ratio_noHet = self.read_concordance_calc(expected_vars,cell_vars) + + # uninformative + cell_vars_uninformative = cell_vars[cell_vars['ids'].isin(self.uninformative_sites)] + total_reads_uninformative,total_dp_uninformative,total_oth_uninformative,discordant_reads_uninformative, \ + discordantRead_ratio_uninformative,discordant_reads_uninformative_noHet,total_reads_uninformative_noHet,discordantRead_ratio_uninformative_noHet= self.read_concordance_calc(expected_vars,cell_vars_uninformative) + + # informative + cell_vars_informative = cell_vars[cell_vars['ids'].isin(self.informative_sites)] + total_reads_informative,total_dp_informative,total_oth_informative,discordant_reads_informative, \ + discordantRead_ratio_informative,discordant_reads_informative_noHet,total_reads_informative_noHet,discordantRead_ratio_informative_noHet = self.read_concordance_calc(expected_vars,cell_vars_informative) + + # Split into concordant and discordant sites + # concordant + concordant_sites = cell_vars[cell_vars['ids'].isin(set(concordant_vars))] + total_reads_for_concordant_sites,total_dp_for_concordant_sites,total_oth_for_concordant_sites,discordant_reads_for_concordant_sites, \ + discordantRead_ratio_for_concordant_sites,discordant_reads_for_concordant_sites_noHet,total_reads_for_concordant_sites_noHet,discordantRead_ratio_for_concordant_sites_noHet = self.read_concordance_calc(expected_vars,concordant_sites) + + # discordant + discordant_sites = cell_vars[cell_vars['ids'].isin(set(discordant_vars))] + total_reads_for_discconcordant_sites,total_dp_for_discconcordant_sites,total_oth_for_discconcordant_sites,discordant_reads_for_discconcordant_sites, \ + discordantRead_ratio_for_discconcordant_sites,discordant_reads_for_discconcordant_sites_noHet,total_reads_for_discconcordant_sites_noHet,discordantRead_ratio_for_discconcordant_sites_noHet = self.read_concordance_calc(expected_vars,discordant_sites) + + # Subset analysis + cell_vars_informative_subset = cell_vars[cell_vars['ids'].isin(self.informative_subset)] + total_reads_informative_subset,total_dp_informative_subset,total_oth_informative_subset,discordant_reads_informative_subset, \ + discordantRead_ratio_informative_subset,discordant_reads_informative_subset_noHet,total_reads_informative_subset_noHet,discordantRead_ratio_informative_subset_noHet = self.read_concordance_calc(expected_vars,cell_vars_informative_subset) - for site in DonorDiscordant_Sites_that_are_atributed_to_other_donor: - total_reads_for_site,discordant_reads_for_site,concordant_for_site,\ - _,_,_= self.read_extraction([site],expected_vars_norm_of_other_donor,cell_vars_norm_otherDonor) - # if discordant_reads_for_site>0: - # print('here') - if concordant_for_site==0: - pass - try: - total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site) - except: - try: - total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] - 
total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site) - - except: - total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh]={} - total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site]=[] - total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown[coh][site].append(concordant_for_site) + if mode == 'no_het': + total_reads = total_reads_noHet + discordant_reads = discordant_reads_noHet + total_reads_informative = total_reads_informative_noHet + discordant_reads_informative = discordant_reads_informative_noHet + total_reads_uninformative =total_reads_uninformative_noHet + discordant_reads_uninformative = discordant_reads_uninformative_noHet + total_reads_informative_subset = total_reads_informative_subset_noHet + discordant_reads_informative_subset = discordant_reads_informative_subset_noHet + total_reads_for_concordant_sites = total_reads_for_concordant_sites_noHet + discordant_reads_for_concordant_sites = discordant_reads_for_concordant_sites_noHet + total_reads_for_discconcordant_sites = total_reads_for_discconcordant_sites_noHet + discordant_reads_for_discconcordant_sites = discordant_reads_for_discconcordant_sites_noHet + + return total_sites, \ + self.informative_covered, \ + self.uninformative_covered, \ + total_reads, \ + discordant_reads, \ + total_reads_informative, \ + discordant_reads_informative, \ + total_reads_uninformative, \ + discordant_reads_uninformative, \ + total_reads_informative_subset, \ + discordant_reads_informative_subset, \ + total_reads_for_concordant_sites, \ + discordant_reads_for_concordant_sites, \ + total_reads_for_discconcordant_sites, \ + discordant_reads_for_discconcordant_sites + - # to get the total reads that can be atributed to the other donor i have to check if site is already covered in the total_discordant_sites_that_are_concordant_with_other_donors_in_pool. - # the ones that havent, i have to add the reads up for them. - informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool = informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(Informative__DonorDiscordant_Sites_that_are_atributed_to_other_donor)) - - total_cordant_sites_that_are_concordant_with_other_donors_in_pool = total_cordant_sites_that_are_concordant_with_other_donors_in_pool.union(set(DonorCordant_Sites_that_are_atributed_to_other_donor)) + + def get_discordance(self,expected_vars2,cell_vars2): + Concordant_Sites = set(cell_vars2['combo']).intersection(set(expected_vars2['combo'])) + Discordant_sites = set(cell_vars2['combo'])-set(expected_vars2['combo']) + disc = pd.DataFrame(Discordant_sites,columns=['combo_x']) + df_cd = pd.merge(cell_vars2, expected_vars2, how='inner', on = 'pos') + disc2= pd.merge(disc, df_cd, how='inner', on = 'combo_x') + disc2['expected_retrieved'] = disc2['0_x']+'::'+disc2['0_y'] + disc_sites = ';'.join(disc2['expected_retrieved']) + return Concordant_Sites,Discordant_sites,disc_sites + + + def retrieve_concordant_discordant_sites(self,expected_vars_norm,cell_vars,donor_cohort=False): + # This function has been inspired by Hails Concordance implementations, however hail has a pitfall that it performs a lot of other stuff under hood and requires intermediate sorting operations. 
+ # Since the single cell calculations requires concordance calculations per cell this becomes very computationally heavy on Hail, hence we have implemented concordance calculations here as part of the pipeline. + # Author: M.Ozols + + cell_vars_norm = self.norm_genotypes(cell_vars) + + if len(cell_vars_norm) > 0: + + if mode == 'no_het': + hets = ['0/1', '1/0'] + expected_vars_norm = expected_vars_norm[~expected_vars_norm['vars'].isin(hets)] + Total_Overlapping_sites = set(expected_vars_norm['ids']).intersection(set(cell_vars_norm['ids'])) + expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2 = cell_vars2.drop_duplicates(subset=['ids']) + expected_vars2 = expected_vars2.drop_duplicates(subset=['ids']) + if len(cell_vars2)!=len(expected_vars2): + print('failed to select comon variants') + exit(1) + + # Find exact discordant sites + Concordant_Sites, Discordant_sites, _ = self.get_discordance(expected_vars2, cell_vars2) + #find truly discordant sites + true_discordant_count, relaxed_concordant_count, relaxed_concordant_informative_count, relaxed_concordant_uninformative_count, true_discordant_informative_count, true_discordant_uninformative_count, discordant_vars, concordant_vars, disc_sites_string = self.get_strict_discordance(expected_vars2, cell_vars2) + #find discordant reads + total_sites, \ + informative_sites, \ + uninformative_sites, \ + total_reads, \ + discordant_reads, \ + total_reads_informative, \ + discordant_reads_informative, \ + total_reads_uninformative, \ + discordant_reads_uninformative, \ + total_reads_informative_subset, \ + discordant_reads_informative_subset, \ + total_reads_for_concordant_sites, \ + discordant_reads_for_concordant_sites, \ + total_reads_for_discconcordant_sites, \ + discordant_reads_for_discconcordant_sites = self.read_condordance(expected_vars2, cell_vars2, discordant_vars, concordant_vars) + + discordant_read_fraction_in_concordant_sites = f"{discordant_reads_for_concordant_sites}/{total_reads_for_concordant_sites}" + discordant_read_fraction_in_discordant_sites = f"{discordant_reads_for_discconcordant_sites}/{total_reads_for_discconcordant_sites}" + discordant_reads_uninformative_fraction = f"{discordant_reads_uninformative}/{total_reads_uninformative}" + discordant_reads_informative_fraction = f"{discordant_reads_informative}/{total_reads_informative}" + + # sanity checks + if total_reads != total_reads_for_concordant_sites+total_reads_for_discconcordant_sites: + print("Error: total reads dont add up ") + exit(1) + if discordant_reads != discordant_reads_for_concordant_sites+discordant_reads_for_discconcordant_sites: + print("Error: discordant reads dont add up ") + exit(1) + - common_vars = list(set(discordant_vars) & set(donor_vars)) - common_var_count = str(len(common_vars)) - donor_cohort_common = donor + ":" + donor_cohort + ":" + common_var_count - discordant_vars_in_pool.append(donor_cohort_common) - - # Here we want to calculate the number of discordant sites in other donors and see if in terms of concordance the same donor is picked as per GT assignment. - # We do this to investigate the potential of a cell coming from this other donor. 
- else: - _='check' - try: - percent_relaxed_concordant = Nr_Relaxed_concordant/(Nr_Relaxed_concordant+true_discordant_count)*100 - except: - percent_relaxed_concordant = 0 - # g2 = total_concordant_sites_otherDonor/total_sites_otherDonor*100 - if not (percent_relaxed_concordant == concordant_percent_in_other_donor): - _='something not right' - break + else: + Total_Overlapping_sites = set() + Concordant_Sites = set() + Discordant_sites = set() + disc_sites = '' + true_discordant_count = 0 + relaxed_concordant_count = 0 + total_sites = 0 + relaxed_concordant_informative_count = 0 + relaxed_concordant_uninformative_count = 0 + true_discordant_informative_count = 0 + true_discordant_uninformative_count = 0 + informative_sites = 0 + disc_sites_string = '' + discordant_reads = 0 + uninformative_sites = 0 + total_reads = 0 + total_reads_informative =0 + total_reads_uninformative=0 + discordant_reads=0 + discordant_reads_informative=0 + discordant_reads_uninformative=0 + discordant_vars=0 + concordant_vars=0 + discordant_read_fraction_in_concordant_sites=0 + discordant_read_fraction_in_discordant_sites=0 + discordant_reads_uninformative_fraction=0 + discordant_reads_informative_fraction=0 + discordant_reads_informative_subset=0 + + return Concordant_Sites, \ + Discordant_sites, \ + Total_Overlapping_sites, \ + disc_sites_string, \ + cell_vars_norm, \ + true_discordant_count, \ + relaxed_concordant_count, \ + relaxed_concordant_informative_count, \ + relaxed_concordant_uninformative_count, \ + true_discordant_informative_count, \ + true_discordant_uninformative_count, \ + total_sites, \ + informative_sites, \ + uninformative_sites, \ + total_reads, \ + total_reads_informative, \ + total_reads_uninformative, \ + discordant_reads, \ + discordant_reads_informative, \ + discordant_reads_uninformative, \ + discordant_vars, \ + concordant_vars, \ + discordant_read_fraction_in_concordant_sites, \ + discordant_read_fraction_in_discordant_sites, \ + discordant_reads_uninformative_fraction, \ + discordant_reads_informative_fraction, \ + discordant_reads_informative_subset - donor_table_of_concordances.append({'donor':donor, 'cell':cell1, 'donor_cohort':donor_cohort, \ - 'gt matched donor':donor == donor_gt_match, \ - 'DonorCordant_Sites_that_are_atributed_to_other_donor':len(DonorCordant_Sites_that_are_atributed_to_other_donor), \ - 'DonorCordant_Sites_that_are_atributed_to_other_donor/total':f"{len(DonorCordant_Sites_that_are_atributed_to_other_donor)}/{len(concordant_vars)}", \ - 'DonorDiscordant_Sites_that_are_atributed_to_other_donor':len(DonorDiscordant_Sites_that_are_atributed_to_other_donor), \ - 'DonorDiscordant_Sites_that_are_atributed_to_other_donor/total':f"{len(DonorDiscordant_Sites_that_are_atributed_to_other_donor)}/{len(discordant_vars)}", \ - 'concordant_percent_in_other_donor':concordant_percent_in_other_donor, \ - 'discordant_percent_in_other_donor':discordant_percent_in_other_donor, \ - 'discordant_reads_otherDonor':discordant_reads_otherDonor, \ - 'discordant_sites_otherDonor':len(discordant_vars_otherDonor), \ - 'concordant_sites_otherDonor':len(concordant_vars_otherDonor), \ - 'total_sites_otherDonor':total_sites_otherDonor, \ - 'discordant_reads_otherDonor':discordant_reads_otherDonor, \ - 'total_reads_otherDonor':total_reads_otherDonor, \ - # 'discordant_read_fraction_in_concordant_sites_otherDonor':discordant_read_fraction_in_concordant_sites_otherDonor, \ - # 'discordant_read_fraction_in_discordant_sites_otherDonor':discordant_read_fraction_in_discordant_sites_otherDonor, \ - 
'concordant_reads_For_discordant_sites_that_are_Concordant_with_other_donor':concordant_reads_for_discordant_sites_that_are_concordant_with_other_donor - }) - - - #here now we want to see overall how many reads potentially come from different cohorts. - # cohort_specific_site_quant_string="" - # cohort_specific_read_quant_string="" + + + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. + hash = random.getrandbits(128) + with open(f'tmp_{id}_{hash}.pkl', 'wb') as f: + pickle.dump(to_set, f) + self.record_dict[id]=f'tmp_{id}_{hash}.pkl' + return + + # def append_results_cell_concordances(self,result): + def append_results_cell_concordances(self,result,cell_concordance_table,other_donor_concordances,other_donor_concordance_table): + other_donor_concordance_table = other_donor_concordance_table + other_donor_concordances + # other_donor_concordance_table = {**other_donor_concordance_table, **other_donor_concordances} + count=result['count'] + try: + percent_concordant = result['Nr_Concordant']/(result['Nr_Discordant']+result['Nr_Concordant'])*100 + except: + percent_concordant = 0 - Whithin_Cohort__total_number_of_potential_contaminent_reads=0 - Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool=0 - Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool=0 try: - Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool = len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) - # for k1 in total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys(): - # Whithin_Cohort__total_number_of_potential_contaminent_reads+= max(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'][k1]) + percent_discordant = result['Nr_Discordant']/(result['Nr_Discordant']+result['Nr_Concordant'])*100 except: - _='Doesnt Exist' + percent_discordant = 0 + + try: + percent_relaxed_concordant = result['Nr_Relaxed_concordant']/(result['Nr_Relaxed_concordant']+result['true_discordant_count'])*100 + except: + percent_relaxed_concordant = 0 - Out_of_Cohort__total_number_of_potential_contaminent_reads=0 try: - Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool = len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) - # for k1 in total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys(): - # Out_of_Cohort__total_number_of_potential_contaminent_reads+= max(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'][k1]) + percent_strict_discordant = result['true_discordant_count']/(result['Nr_Relaxed_concordant']+result['true_discordant_count'])*100 except: - _='Doesnt Exist' - - - - + percent_strict_discordant = 0 + + try: + read_discordance = result['discordant_reads']/result['total_sites'] + except: + read_discordance = 0 + + cohort = 'UNKNOWN' + donor_split = result['donor_gt_match'].split("_") + if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): + cohort = 'UKB' + elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): + cohort = 'ELGH' + + same_as_asigned_donor = result['donor_gt_match'] in result['Donor_With_Highest_Concordance'] + if not same_as_asigned_donor: + same_as_asigned_donor = result['donor_gt_match'] in result['Donor_With_Lowest_DisConcordance'] + 
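For reference, the cohort label assigned just above is inferred purely from the shape of the donor identifier, exactly as written in the added lines of this hunk. A minimal standalone sketch of that heuristic follows; the example donor IDs are hypothetical and only illustrate the two string shapes the rule keys on.

def infer_cohort(donor_id: str) -> str:
    # Two identical underscore-separated tokens -> UKB;
    # three tokens whose first token is 14 characters long -> ELGH;
    # anything else is left as UNKNOWN, mirroring the logic above.
    parts = donor_id.split("_")
    if len(parts) == 2 and parts[0] == parts[1]:
        return 'UKB'
    if len(parts) == 3 and len(parts[0]) == 14:
        return 'ELGH'
    return 'UNKNOWN'

# hypothetical identifiers, for illustration only:
assert infer_cohort("S2032_S2032") == 'UKB'
assert infer_cohort("ABCDEFGHIJKLMN_x_y") == 'ELGH'   # first token is 14 characters
assert infer_cohort("donor1") == 'UNKNOWN'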
+ cell_concordance_table[f"{result['cell1']} --- {result['donor_gt_match']}"] = {'GT 1':result['cell1'], + 'GT 2':result['donor_gt_match'], + 'cohort': cohort, + + # 'Nr_Concordant':result['Nr_Concordant'], + # 'Nr_Discordant':result['Nr_Discordant'], + 'Nr_Relaxed_concordant':result['Nr_Relaxed_concordant'], + 'Nr_strict_discordant':result['true_discordant_count'], + # 'Percent Concordant':percent_concordant, + # 'Percent Discordant':percent_discordant, + 'Percent_relaxed_concordant': percent_relaxed_concordant, + 'Percent_strict_discordant': percent_strict_discordant, + 'Nr_concordant_informative': len(result['relaxed_concordant_informative_count']), + 'Nr_concordant_uninformative': len(result['relaxed_concordant_uninformative_count']), + 'Nr_discordant_informative': len(result['true_discordant_informative_count']), + 'Nr_discordant_uninformative': len(result['true_discordant_uninformative_count']), + 'NrTotal_Overlapping_sites_between_two_genotypes':result['Nr_Total_Overlapping_sites'], + 'Nr_donor_distinct_sites_within_pool_individuals':result['Nr_donor_distinct_sites'], + 'Number_of_sites_that_are_donor_concordant_and_exclusive':result['Number_of_sites_that_are_donor_concordant_and_exclusive'], + 'Total_sites': result['total_sites'], + 'Total_informative_sites': result['informative_sites'], + 'Total_uninformative_sites': result['uninformative_sites'], + 'Total_reads': result['total_reads'], + 'Total_reads_informative': result['total_reads_informative'], + 'Total_reads_uninformative': result['total_reads_uninformative'], + 'Discordant_reads': result['discordant_reads'], + 'Discordant_reads_informtive': result['discordant_reads_informative'], + 'Discordant_reads_uninformtive': result['discordant_reads_uninformative'], + 'discordant_reads_informative_subset':result['discordant_reads_informative_subset'], + 'Discordant_reads_by_n_sites': read_discordance, + 'Discordant_sites_in_pool': len(result['Discordant_sites_in_pool']), + 'Lowest_Disconcordance_value_in_all_donors':result['Lowest_Disconcordance_value_in_all_donors'], + 'Donor_With_Lowest_DisConcordance':result['Donor_With_Lowest_DisConcordance'], + 'Concordant_Site_Identities':result['Concordant_Site_Identities'], + 'Donor_With_Highest_Concordance':result['Donor_With_Highest_Concordance'], + 'Highest_Concordance_value_in_all_donors':result['Highest_Concordance_value_in_all_donors'], + 'same_as_asigned_donor':same_as_asigned_donor, + 'Total_sites_other_donor (if same_as_asigned_donor=False)':result['Total_sites_other_donor'], + 'Total_reads_other_donor (if same_as_asigned_donor=False)':result['Total_reads_other_donor'], + 'total_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['total_discordant_sites_that_are_concordant_with_other_donors_in_pool'], + 'discordant_read_fraction_in_concordant_site':result['discordant_read_fraction_in_concordant_sites'], + 'discordant_read_fraction_in_discordant_sites':result['discordant_read_fraction_in_discordant_sites'], + 'Whithin_Cohort__total_number_of_potential_contaminent_reads':result['Whithin_Cohort__total_number_of_potential_contaminent_reads'], + 'Out_of_Cohort__total_number_of_potential_contaminent_reads':result['Out_of_Cohort__total_number_of_potential_contaminent_reads'], + 'NrDonors_contributing_to_out_of_cohort':result['NrDonors_contributing_to_out_of_cohort'], + 'NrDonors_contributing_to_Whithin_Cohort':result['NrDonors_contributing_to_Whithin_Cohort'], + + 
'Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool'], + 'Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool'], + 'shared_sites_between_Out_of_Cohort_and_Within_Cohort':result['shared_sites_between_Out_of_Cohort_and_Within_Cohort'], + 'Within_Cohort__unique_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool'], + 'Out_of_Cohort__unique_discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool'], + 'Out_of_Cohort__unique_sites_total_reads':result['Out_of_Cohort__unique_sites_total_reads'], + 'Total__discordant_sites_that_are_concordant_with_other_donors_in_pool':result['Total__discordant_sites_that_are_concordant_with_other_donors_in_pool'] + } + + return [cell_concordance_table,other_donor_concordance_table] + def combine_written_lists(self,exclusive_donor_variants,record_dict):#this is for VCF loader class + to_export = exclusive_donor_variants + for val1 in record_dict.values(): + # here remove the int files. + print(f"merging temp file: {val1}") + with open(val1, 'rb') as f: + loaded_dict = pickle.load(f) + self.other_donor_comp = self.other_donor_comp+ loaded_dict + os.remove(val1) + return self.other_donor_comp + def combine_written_files(self,exclusive_donor_variants,record_dict):#this is for VCF loader class + to_export = exclusive_donor_variants + for val1 in record_dict.values(): + # here remove the int files. + print(f"merging temp file: {val1}") + with open(val1, 'rb') as f: + loaded_dict = pickle.load(f) + for k1 in loaded_dict.keys(): + try: + to_export[k1]=to_export[k1].union(loaded_dict[k1]) + except: + to_export[k1]=set() + to_export[k1]=to_export[k1].union(loaded_dict[k1]) + os.remove(val1) + return to_export + + def set_results(self,to_set,id): + # Recod to disk to save the loading mmeory time. + hash = random.getrandbits(128) + with open(f'tmp_{id}_{hash}.pkl', 'wb') as f: + pickle.dump(to_set, f) + self.record_dict[id]=f'tmp_{id}_{hash}.pkl' - # No het total calcs + + def combine_concordances(self,result,other_donor_concordance,donor_gt_match,analyse_donor): + pd.DataFrame(other_donor_concordance).sort_values(by=['cell']).to_csv(f'{donor_gt_match}-{analyse_donor}--each_cells_comparison_with_other_donor.tsv',sep='\t',index=False) + self.cell_concordance_table = {**self.cell_concordance_table, **result} + + def combine_dict(self,cell_concordance_table,result): + cell_concordance_table = {**cell_concordance_table, **result} + return cell_concordance_table + + def conc_table(self): + donor_assignments_table=self.donor_assignments_table + cell_assignments_table=self.cell_assignments_table + exclusive_don_variants=self.exclusive_don_variants + exclusive_cell_variants= self.exclusive_cell_variants + donor_list = exclusive_don_variants.keys() + # cpus = 1 - # Since we can not tell just from the cellsnp file if the discordant read at concordant site is actually concordant with another donor in a pool - # we assume the worse case scenario and add idscordant reads at concordant sites to the total. 
- # Number of reads on the sites that are concordant, but has discordant reads with another donor - Whithin_Cohort__total_number_of_potential_contaminent_reads=0 - try: - concordant_sites_atributed_to_other_donor = list(set(sites_contributing_to_Whithin_Cohort)) - if not total_sites >= len(concordant_sites_atributed_to_other_donor): - raise Exception("Sorry, the number of sites atributed can not be larger than total sites") - - _,_,_,\ - _,Whithin_Cohort__total_number_of_potential_contaminent_reads, \ - _= self.read_extraction(concordant_sites_atributed_to_other_donor,expected_vars_norm,cell_vars_norm) - - if not discordant_reads >= len(Whithin_Cohort__total_number_of_potential_contaminent_reads): - raise Exception("Total discordant reads can not be less than within cohort discordant reads.") - - - except: - _='Doesnt Exist' + count = 0 + + + #create a list of variants that are on each donor genotype file + vars_per_donor_gt = {} + for don_id in donor_list: + donor_gt_vars = list(exclusive_don_variants[don_id]) + donor_gt_vars = pd.DataFrame(donor_gt_vars) + donor_gt_vars = self.norm_genotypes(donor_gt_vars) + donor_gt_vars = donor_gt_vars[donor_gt_vars['vars'] != '0/0'] + donor_gt_varids = list(donor_gt_vars['ids']) + vars_per_donor_gt[don_id] = donor_gt_varids + + #work out what cohort each donor belongs to + donor_cohorts = {} + for don_id in donor_list: + cohort = 'UNKNOWN' + donor_split = don_id.split("_") + if (len(donor_split) == 2) and (donor_split[0] == donor_split[1]): + cohort = 'UKB' + elif (len(donor_split) == 3) and (len(donor_split[0]) == 14): + cohort = 'ELGH' + donor_cohorts[don_id] = cohort + all_donor_data={} + # here we calvculate all the expected donor datasets + for row1 in exclusive_don_variants.keys(): + # donor_in_question = row1['donor_query'] + donor_gt_match = row1 + expected_vars_of_other_donor = self.exclusive_don_variants[donor_gt_match] + expected_vars_norm_of_other_donor = self.norm_genotypes(expected_vars_of_other_donor) + all_donor_data[donor_gt_match]=expected_vars_norm_of_other_donor - Out_of_Cohort__total_number_of_potential_contaminent_reads=0 - try: - concordant_sites_atributed_to_other_donor = list(set(sites_contributing_to_out_of_cohort)) - if not total_sites >= len(concordant_sites_atributed_to_other_donor): - raise Exception("Sorry, the number of sites atributed can not be larger than total sites") - - _,_,_,_,Out_of_Cohort__total_number_of_potential_contaminent_reads, _= self.read_extraction(concordant_sites_atributed_to_other_donor,expected_vars_norm,cell_vars_norm) - - if not discordant_reads >= len(Out_of_Cohort__total_number_of_potential_contaminent_reads): - raise Exception("Total discordant reads can not be less than out of cohort discordant reads.") + results = [] + for i,row1 in donor_assignments_table.iterrows(): + donor_in_question = row1['donor_query'] + donor_gt_match = row1['donor_gt'] + print(donor_gt_match) + # if i>4: + # continue + if (donor_gt_match=='NONE'): + continue + try: + donor_gt_match_cohort = donor_cohorts[donor_gt_match] + except: + continue + Cells_to_keep_pre = list(set(cell_assignments_table.loc[cell_assignments_table['donor_id']==donor_in_question,'cell'])) + expected_vars = exclusive_don_variants[donor_gt_match] + expected_vars_norm = self.norm_genotypes(expected_vars) + try: + # Now we subset this down to each of the uniqie variants per donor and check which of the concordant sites are exclusive to donor. 
+ dds = self.donor_distinct_sites[donor_gt_match] + except: + continue + # if cpus==1: + result,other_donor_concordance,donor_gt_match,donor_id = Donor(donor_in_question,self.informative_sites,self.uninformative_sites,self.donor_assignments_table,self.cell_assignments_table,self.exclusive_cell_variants,self.donor_distinct_sites,cpus).analyse_donor(Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,all_donor_data,expected_vars_norm,donor_assignments_table,exclusive_cell_variants) + self.combine_concordances(result,other_donor_concordance,donor_gt_match,donor_id) + + + + return self.cell_concordance_table - except: - _='Doesnt Exist' - + + def read_extraction(self,DonorDiscordant_Sites_that_are_atributed_to_other_donor,expected_vars_norm,cell_vars_norm): + # we need this function wrapper to calculate the concordant, discordant read + # counts for each of the discordant sites that are concordant with another donor. - try: - Out_of_Cohort__sites = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) - except: - Out_of_Cohort__sites = set() - - try: - Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) - set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) - shared_sites_between_out_of_cohort_and_within_cohort = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()).intersection(set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) ) - except: - Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set() - shared_sites_between_out_of_cohort_and_within_cohort = set() - - try: - Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) - set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Out_of_Cohort'].keys()) - except: - Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) + Total_Overlapping_sites = set(DonorDiscordant_Sites_that_are_atributed_to_other_donor) + expected_vars2 = expected_vars_norm[expected_vars_norm['ids'].isin(Total_Overlapping_sites)] + cell_vars2 = cell_vars_norm[cell_vars_norm['ids'].isin(Total_Overlapping_sites)] + + cell_vars2 = cell_vars2.drop_duplicates(subset=['ids']) + expected_vars2 = expected_vars2.drop_duplicates(subset=['ids']) + cell_vars2['DP'] = cell_vars2[0].str.split("_").str[5].astype(int) + cell_vars2['AD'] = cell_vars2[0].str.split("_").str[6].astype(int) + cell_vars2['OTH'] = cell_vars2[0].str.split("_").str[7].astype(int) + total_reads,_,_,discordant_reads, \ + discordantRead_ratio,discordant_reads_noHet,total_reads_noHet,discordantRead_ratio_noHet= self.read_concordance_calc(expected_vars2,cell_vars2) + concordant_reads = total_reads - discordant_reads + concordant_reads_noHet = total_reads_noHet-discordant_reads_noHet + return total_reads,discordant_reads,concordant_reads,\ + total_reads_noHet,discordant_reads_noHet,concordant_reads_noHet + - 
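For clarity, the read_extraction helper added above boils down to: subset the expected-genotype and cell-level tables to the shared site ids, keep one record per site, parse the depth (DP), alternate-allele (AD) and other-allele (OTH) counts from fields 5-7 of the underscore-delimited cellSNP record, and report concordant reads as total reads minus discordant reads. The following is a minimal standalone sketch of that bookkeeping; the field positions mirror the patch, but the simplified per-site discordance rule is an illustrative assumption, since the real per-read comparison against the donor's expected genotype is done by read_concordance_calc.

    import pandas as pd

    def parse_read_counts(cell_vars: pd.DataFrame) -> pd.DataFrame:
        # column 0 holds the underscore-delimited cellSNP record; fields 5-7 are DP, AD, OTH
        parts = cell_vars[0].str.split("_")
        out = cell_vars.copy()
        out["DP"] = parts.str[5].astype(int)
        out["AD"] = parts.str[6].astype(int)
        out["OTH"] = parts.str[7].astype(int)
        return out

    def tally_reads(site_ids, cell_vars_norm, discordant_site_ids):
        # keep one record per shared site, as read_extraction does with drop_duplicates
        cell = cell_vars_norm[cell_vars_norm["ids"].isin(set(site_ids))].drop_duplicates(subset=["ids"])
        cell = parse_read_counts(cell)
        total_reads = int(cell["DP"].sum())
        # toy rule for illustration: every read at a site already flagged discordant counts as discordant
        discordant_reads = int(cell.loc[cell["ids"].isin(set(discordant_site_ids)), "DP"].sum())
        return total_reads, discordant_reads, total_reads - discordant_reads
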
Out_of_Cohort__unique_sites_total_reads,_,_,_,_,_ = self.read_extraction(Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool,expected_vars_norm,cell_vars_norm) - - try: - Whithin_Cohort__sites = set(total_discordant_sites_that_are_concordant_with_other_donors_in_pool__cohortBreakdown['Whithin_Cohort'].keys()) - except: - Whithin_Cohort__sites = set() - +class Donor(Concordances): + def __init__(self, donor_id,informative_sites,uninformative_sites,donor_assignments_table,cell_assignments_table,exclusive_cell_variants,donor_distinct_sites,pool): + self.donor_concordance_table = {} + self.other_donor_concordance = [] + self.pool = pool + self.donor_id = donor_id + self.informative_sites = informative_sites + self.uninformative_sites = uninformative_sites + self.donor_assignments_table=donor_assignments_table + self.cell_assignments_table=cell_assignments_table + self.exclusive_don_variants=exclusive_don_variants + self.exclusive_cell_variants=exclusive_cell_variants + self.donor_distinct_sites=donor_distinct_sites - - Total__discordant_sites_that_are_concordant_with_other_donors_in_pool = Whithin_Cohort__sites.union(Out_of_Cohort__sites) - concordant_vars_in_pool_str = (";").join(concordant_vars) - DF = pd.DataFrame(donor_table_of_concordances) + def combine_results(self,result): + donor_concordance_table=result[0] + other_donor_concordance_table=result[1] + # other_donor_concordance_table + other_donor_concordances + self.other_donor_concordance = self.other_donor_concordance + other_donor_concordance_table + self.donor_concordance_table = {**self.donor_concordance_table, **donor_concordance_table} - Donor_With_Lowest_DisConcordance = ';'.join(DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['donor'].values) - Lowest_Disconcordance_value_in_all_donors= DF[DF['discordant_percent_in_other_donor']==min(DF['discordant_percent_in_other_donor'])]['discordant_percent_in_other_donor'].values[0] - - Donor_With_Highest_Concordance = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['donor'].values) - Highest_Concordance_value_in_all_donors= DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['concordant_percent_in_other_donor'].values[0] - Total_sites_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_sites_otherDonor'].astype(str).values) - Total_reads_other_donor = ';'.join(DF[DF['concordant_percent_in_other_donor']==max(DF['concordant_percent_in_other_donor'])]['total_reads_otherDonor'].astype(str).values) - - return [{ - 'cell1':cell1, - 'donor_gt_match':donor_gt_match, - 'Nr_Concordant':Nr_Concordant, - 'Nr_Discordant':Nr_Discordant, - 'Nr_Relaxed_concordant':Nr_Relaxed_concordant, - 'true_discordant_count':true_discordant_count, - 'relaxed_concordant_informative_count':relaxed_concordant_informative_count, - 'relaxed_concordant_uninformative_count':relaxed_concordant_uninformative_count, - 'true_discordant_informative_count':true_discordant_informative_count, - 'true_discordant_uninformative_count':true_discordant_uninformative_count, - 'Nr_Total_Overlapping_sites':Nr_Total_Overlapping_sites, - 'Number_of_sites_that_are_donor_concordant_and_exclusive':Number_of_sites_that_are_donor_concordant_and_exclusive, - 'Nr_donor_distinct_sites':Nr_donor_distinct_sites, - 'count':count, - 'discordant_sites':discordant_sites, - 'total_sites':total_sites, - 
'informative_sites':informative_sites, - 'uninformative_sites':uninformative_sites, - 'total_reads_informative':total_reads_informative, - 'total_reads_uninformative':total_reads_uninformative, - 'discordant_reads_informative':discordant_reads_informative, - 'discordant_reads_uninformative':discordant_reads_uninformative, - 'Discordant_sites_in_pool': discordant_vars, - 'Lowest_Disconcordance_value_in_all_donors':Lowest_Disconcordance_value_in_all_donors, - 'Donor_With_Lowest_DisConcordance':Donor_With_Lowest_DisConcordance, - 'Concordant_Site_Identities':concordant_vars_in_pool_str, - 'Donor_With_Highest_Concordance':Donor_With_Highest_Concordance, - 'Highest_Concordance_value_in_all_donors':Highest_Concordance_value_in_all_donors, - 'Total_sites_other_donor':Total_sites_other_donor, - 'Total_reads_other_donor':Total_reads_other_donor, - 'total_discordant_sites_that_are_concordant_with_other_donors_in_pool':f"{len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool)}/{len(discordant_vars)}", - 'informative__total_discordant_sites_that_are_concordant_with_other_donors_in_pool':f"{len(total_discordant_sites_that_are_concordant_with_other_donors_in_pool)}/{len(true_discordant_informative_count)}", - 'total_reads':total_reads, - 'discordant_reads':discordant_reads, - 'discordant_reads_informative_subset':discordant_reads_informative_subset, \ - 'discordant_read_fraction_in_concordant_sites':discordant_read_fraction_in_concordant_sites, \ - 'discordant_read_fraction_in_discordant_sites':discordant_read_fraction_in_discordant_sites, \ - 'Whithin_Cohort__total_number_of_potential_contaminent_reads':Whithin_Cohort__total_number_of_potential_contaminent_reads, \ - 'Out_of_Cohort__total_number_of_potential_contaminent_reads':Out_of_Cohort__total_number_of_potential_contaminent_reads, \ - 'NrDonors_contributing_to_out_of_cohort':len(set(donors_contributing_to_out_of_cohort)), \ - 'NrDonors_contributing_to_Whithin_Cohort':len(set(donors_contributing_to_Whithin_Cohort)), \ - - 'Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':Out_of_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool, \ - 'Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool':Whithin_Cohort__discordant_sites_that_are_concordant_with_other_donors_in_pool, \ - 'shared_sites_between_Out_of_Cohort_and_Within_Cohort':len(shared_sites_between_out_of_cohort_and_within_cohort), \ - 'Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Within_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool), \ - 'Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Out_of_Cohort__unique_sites_discordant_sites_that_are_concordant_with_other_donors_in_pool), \ - 'Out_of_Cohort__unique_sites_total_reads':Out_of_Cohort__unique_sites_total_reads, \ - 'Total__discordant_sites_that_are_concordant_with_other_donors_in_pool':len(Total__discordant_sites_that_are_concordant_with_other_donors_in_pool) - }, donor_table_of_concordances] + def analyse_cells(self,chunk_df,exclusive_cell_variants,expected_vars_norm,donor_gt_match,vars_per_donor_gt,donor_cohorts,all_donor_data,donor_assignments_table,donor_gt_match_cohort): + donor_concordance_table = {} + other_donor_concordance_table = [] + count=0 + for cell1 in chunk_df: + count+=1 + cell_vars = exclusive_cell_variants[cell1] + # try: + result1, other_donor_concordances = 
self.concordance_table_production(expected_vars_norm,cell_vars,cell1,donor_gt_match,donor_gt_match_cohort, vars_per_donor_gt, donor_cohorts, count,all_donor_data,donor_assignments_table) + donor_concordance_table,other_donor_concordance_table = self.append_results_cell_concordances(result1,donor_concordance_table,other_donor_concordances,other_donor_concordance_table) + # except: + # continue + return [donor_concordance_table, other_donor_concordance_table] + def analyse_donor(self,Cells_to_keep_pre,donor_gt_match,donor_gt_match_cohort,vars_per_donor_gt,donor_cohorts,all_donor_data,expected_vars_norm,donor_assignments_table,exclusive_cell_variants): + n=200 + print(f'-- Using {cpus} cpus ---') + pool = mp.Pool(cpus) + # Cells_to_keep_pre = Cells_to_keep_pre[:13] + list_df = [Cells_to_keep_pre[i:i+n] for i in range(0,len(Cells_to_keep_pre),n)] + # results = [] + for chunk_df in list_df: + pool.apply_async(self.analyse_cells,args =([chunk_df,exclusive_cell_variants,expected_vars_norm,donor_gt_match,vars_per_donor_gt,donor_cohorts,all_donor_data,donor_assignments_table,donor_gt_match_cohort]),callback=self.combine_results) + # results.append(result) + # while True: + # time.sleep(1) + # # catch exception if results are not ready yet + # try: + # ready = [result.ready() for result in results] + # successful = [result.successful() for result in results] + # except Exception: + # continue + # # exit loop if all tasks returned success + # if all(successful): + # break + # # raise exception reporting exceptions received from workers + # if all(ready) and not all(successful): + # raise Exception(f'Workers raised following exceptions {[result._value for result in results if not result.successful()]}') + # self.pool.join() + # try: + # [result.wait() for result in results] + # except: + # print('done') + # r1,r2 = self.analyse_cells(chunk_df,exclusive_cell_variants,expected_vars_norm,donor_gt_match,vars_per_donor_gt,donor_cohorts,all_donor_data,donor_assignments_table,donor_gt_match_cohort) + # self.combine_results(r1,r2) + print('Done') + pool.close() + pool.join() + return self.donor_concordance_table,self.other_donor_concordance,donor_gt_match,self.donor_id class VCF_Loader: diff --git a/bin/gather_minimal_dataset.py b/bin/gather_minimal_dataset.py index 3a6cd6b..4349485 100755 --- a/bin/gather_minimal_dataset.py +++ b/bin/gather_minimal_dataset.py @@ -413,6 +413,9 @@ def gather_donor(donor_id, ad, ad_lane_raw, azimuth_annot, qc_obs, columns_outpu ad.obs = ad.obs.loc[:,~ad.obs.columns.duplicated()] if write_h5: path1=os.path.join(outdir, oufnam + '.h5ad') + ad.obs['qc.filter.pass.AZ:L0'] = ad.obs['qc.filter.pass.AZ:L0'].astype('bool') + ad.obs['cell_passes_hard_filters'] = ad.obs['cell_passes_hard_filters'].astype('bool') + ad.obs['qc.filter.pass'] = ad.obs['qc.filter.pass'].astype('bool') ad.write(path1,compression='gzip') return { diff --git a/conf/base.conf b/conf/base.conf index 249dc9f..98bb86d 100755 --- a/conf/base.conf +++ b/conf/base.conf @@ -9,7 +9,7 @@ */ params{ - input = 'cellbender' + input = 'cellbender' //# cellbender|cellranger rsync_to_web_file = "${launchDir}/yascp/bin/rsync_to_web.sh" profile = 'normal_run' citeseq = false @@ -59,11 +59,12 @@ params{ cluster_validate_resolution_keras = true input_tables_column_delimiter = '\t' outdir= "${launchDir}/results" + tracedir = "${params.outdir}/pipeline_info" do_deconvolution = true split_bam = false run_multiplet = true utilise_gpu = true - split_ad_per_bach = false + split_ad_per_bach = true cellbender_resolution_to_use='0pt1' 
reference_assembly_fasta_dir = "https://yascp.cog.sanger.ac.uk/public/10x_reference_assembly" webtransfer = false @@ -239,246 +240,31 @@ process { withLabel:process_medium_memory { memory = { 30.GB * task.attempt } } - - withName: SCRUBLET{ - maxRetries = 3 - errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } - } - - - withName: DOUBLET_DECON{ - maxRetries = 3 - errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } - memory = { 100.GB * task.attempt } - } - - withName: AZIMUTH{ - maxRetries = 3 - errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } - } - withName: CELLTYPIST{ - maxRetries = 3 - errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } - } - withName: cellbender__remove_background{ - maxRetries = 2 - //# errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } - } - - withLabel:error_ignore { - errorStrategy = 'ignore' - } - withLabel:error_retry { - errorStrategy = 'retry' - maxRetries = 2 - } - - withName:cluster_validate_resolution_keras{ - maxForks=4 + withName: ASSESS_CALL_RATE{ maxRetries = 3 - memory = { 60.GB * task.attempt } - time = { 12.h * task.attempt } + memory = { 10.GB * task.attempt } errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } } - withName: CELLTYPIST{ - maxForks=7 - } - - withName: CELLTYPE_FILE_MERGE{ - memory = { 60.GB * task.attempt } - } - - withName: NORMALISE_AND_PCA{ - maxForks=7 - errorStrategy = 'retry' - memory = { 50.GB * task.attempt} - maxRetries = 8 - cpus = 4 - } - - withName: LISI{ - maxForks=7 - memory =300.GB - } - - withName: RESOLVE_POOL_VCFS{ - cpus = 1 - memory = { 1.GB * task.attempt } - time = { 12.h * task.attempt } - maxRetries = 3 - } - - withName: SPLIT_BATCH_H5AD{ - cpus = 2 - memory = { 25.GB * task.attempt * 0.5} - time = { 12.h * task.attempt } - maxRetries = 3 - } - - withName: SUBSET_GENOTYPE2{ - cpus = 2 - memory = { 1.GB * task.attempt } - time = { 12.h * task.attempt } - maxRetries = 3 - } - - withName: JOIN_STUDIES_MERGE{ - cpus = 1 - memory = { 20.GB * task.attempt } - time = { 12.h * task.attempt } - maxRetries = 3 - } - - withName: FREEBAYES{ - cpus = 1 - time = { 12.h * task.attempt } - maxRetries = 2 - } - - withName: VIREO_ADD_SAMPLE_PREFIX{ - cpus = 1 - memory = { 2.GB * task.attempt } - time = { 12.h * task.attempt } - maxRetries = 3 - } - - - withName: REPLACE_GT_DONOR_ID2{ - cpus = 1 - memory = { 1.GB * task.attempt } - time = 12.h - maxRetries = 3 - } - - withName: JOIN_CHROMOSOMES{ - cpus = 1 - memory = { 2.GB * task.attempt } - time = 12.h - maxRetries = 3 - } - - withName: serialize_known_markers{ - cpus = 1 - memory = { 1.GB * task.attempt } - time = { 12.h * task.attempt } - maxRetries = 3 - } - - withName: OUTLIER_FILTER{ - errorStrategy = 'retry' - memory = { 50.GB * task.attempt} - maxRetries = 8 - } - - - - withName: cluster{ - cpus = { 3 * task.attempt } - } - - withName: LISI{ - maxForks=7 - errorStrategy = 'retry' - maxRetries = 8 - memory = { 200.GB * task.attempt} - } - - withName: VIREO_GT_FIX_HEADER{ - errorStrategy = 'retry' - maxRetries = 4 - cpus = 1 - memory = { 1.GB * task.attempt } - } - - - withName: JOIN_CHROMOSOMES{ - errorStrategy = 'retry' - maxRetries = 4 - } - - withName: cluster{ - errorStrategy = 'retry' - maxRetries = 4 - } - - withName: SPLIT_BAM_BY_CELL_BARCODES { - cpus = 1 - memory = { 8.GB * task.attempt} - time = 4.h - } - - withName: CONCORDANCE_CALCLULATIONS{ - cpus = 5 - time = { 12.h * task.attempt } - memory = { 70.GB * task.attempt } - } - - withName: OTHER_DONOR_CONCORDANCE_CALCLULATIONS{ - cpus = 3 - time = { 6.h * task.attempt } - 
memory = { 20.GB * task.attempt } - } - - - withName: CELLSNP{ - memory = { 5.GB * task.attempt } - } - - withName: DYNAMIC_DONOR_EXCLUSIVE_SNP_SELECTION{ - cpus = 5 - time = { 12.h * task.attempt } - memory = { 20.GB * task.attempt } - } - - withName: prep_collectmetadata{ - memory = { 150.MB * task.attempt } - } - - withName: VIREO{ - //# maxForks=7 - cpus = 5 - time = { 12.h * task.attempt } - memory = { 70.GB * task.attempt } - } - withName: DSB_INTEGRATE{ - memory = { 200.GB * task.attempt } - cpus = { 4 * task.attempt } - maxRetries = 3 - } - - withName: MULTIMODAL_INTEGRATION{ - memory = { 200.GB * task.attempt } - cpus = { 4 * task.attempt } - maxRetries = 3 - } - - withName: umap_gather{ - memory = { 100.GB * task.attempt } - errorStrategy = 'retry' - maxRetries = 3 - } - - withName: DOUBLET_FINDER{ - memory = { 100.GB * task.attempt } - } - - - - withName: GT_MATCH_POOL_AGAINST_PANEL{ - time = { 24.h * task.attempt } - } - - withName: plot_predicted_sex{ - memory = { 50.GB * task.attempt } - maxRetries = 5 - cpus = 2 - - } - - +} +def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') +timeline { + enabled = true + file = "${params.tracedir}/execution_timeline_${trace_timestamp}.html" +} +report { + enabled = true + file = "${params.tracedir}/execution_report_${trace_timestamp}.html" +} +trace { + enabled = true + file = "${params.tracedir}/execution_trace_${trace_timestamp}.txt" +} +dag { + enabled = true + file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.svg" } singularity { diff --git a/conf/cellbender.conf b/conf/cellbender.conf index 636dd16..a8449ec 100755 --- a/conf/cellbender.conf +++ b/conf/cellbender.conf @@ -59,8 +59,6 @@ params { value{ expected_nemptydroplets_umi_cutoff = 0 method_estimate_ncells = 'dropletutils::barcoderanks::inflection' - //method_estimate_ncells = 'cellrangerv2::expected' //this method feeds in the cellranger estimate ncells - //method_estimate_ncells = 'expected' lower_bound_umis_estimate_ncells = 1000 method_estimate_nemptydroplets = 'dropletutils::barcoderanks::inflection,dropletutils::barcoderanks::knee,0.33' lower_bound_umis_estimate_nemptydroplets = 10 diff --git a/conf/modules.conf b/conf/modules.conf index 1885d64..0df8fa4 100755 --- a/conf/modules.conf +++ b/conf/modules.conf @@ -52,4 +52,250 @@ params { } } + + withName: SCRUBLET{ + maxRetries = 3 + errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } + } + + + withName: DOUBLET_DECON{ + maxRetries = 3 + errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } + memory = { 100.GB * task.attempt } + } + + withName: AZIMUTH{ + maxRetries = 3 + errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } + } + withName: CELLTYPIST{ + maxRetries = 3 + errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } + } + withName: cellbender__remove_background{ + maxRetries = 2 + //# errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } + } + + withLabel:error_ignore { + errorStrategy = 'ignore' + } + withLabel:error_retry { + errorStrategy = 'retry' + maxRetries = 2 + } + + withName:cluster_validate_resolution_keras{ + maxForks=4 + maxRetries = 3 + memory = { 60.GB * task.attempt } + time = { 12.h * task.attempt } + errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' } + } + + withName: CELLTYPIST{ + maxForks=7 + } + + withName: ASSESS_CALL_RATE{ + maxRetries = 3 + memory = { 10.GB * task.attempt } + errorStrategy = { task.attempt > 2 ? 
'ignore' : 'retry' } + } + + + withName: CELLTYPE_FILE_MERGE{ + memory = { 60.GB * task.attempt } + } + + withName: NORMALISE_AND_PCA{ + maxForks=7 + errorStrategy = 'retry' + memory = { 50.GB * task.attempt} + maxRetries = 8 + cpus = 4 + } + + withName: LISI{ + maxForks=7 + memory =300.GB + } + + withName: RESOLVE_POOL_VCFS{ + cpus = 1 + memory = { 1.GB * task.attempt } + time = { 12.h * task.attempt } + maxRetries = 3 + } + + withName: SPLIT_BATCH_H5AD{ + cpus = 2 + memory = { 25.GB * task.attempt * 0.5} + time = { 12.h * task.attempt } + maxRetries = 3 + } + + withName: SUBSET_GENOTYPE2{ + cpus = 2 + memory = { 1.GB * task.attempt } + time = { 12.h * task.attempt } + maxRetries = 3 + } + + withName: JOIN_STUDIES_MERGE{ + cpus = 1 + memory = { 20.GB * task.attempt } + time = { 12.h * task.attempt } + maxRetries = 3 + } + + withName: FREEBAYES{ + cpus = 1 + time = { 12.h * task.attempt } + maxRetries = 2 + } + + withName: VIREO_ADD_SAMPLE_PREFIX{ + cpus = 1 + memory = { 2.GB * task.attempt } + time = { 12.h * task.attempt } + maxRetries = 3 + } + + + withName: REPLACE_GT_DONOR_ID2{ + cpus = 1 + memory = { 1.GB * task.attempt } + time = 12.h + maxRetries = 3 + } + + withName: JOIN_CHROMOSOMES{ + cpus = 1 + memory = { 2.GB * task.attempt } + time = 12.h + maxRetries = 3 + } + + withName: serialize_known_markers{ + cpus = 1 + memory = { 1.GB * task.attempt } + time = { 12.h * task.attempt } + maxRetries = 3 + } + + withName: OUTLIER_FILTER{ + errorStrategy = 'retry' + memory = { 50.GB * task.attempt} + maxRetries = 8 + } + + + + withName: cluster{ + cpus = { 3 * task.attempt } + } + + withName: LISI{ + maxForks=7 + errorStrategy = 'retry' + maxRetries = 8 + memory = { 200.GB * task.attempt} + } + + withName: VIREO_GT_FIX_HEADER{ + errorStrategy = 'retry' + maxRetries = 4 + cpus = 1 + memory = { 1.GB * task.attempt } + } + + + withName: JOIN_CHROMOSOMES{ + errorStrategy = 'retry' + maxRetries = 4 + } + + withName: cluster{ + errorStrategy = 'retry' + maxRetries = 4 + } + + withName: SPLIT_BAM_BY_CELL_BARCODES { + cpus = 1 + memory = { 8.GB * task.attempt} + time = 4.h + } + + withName: CONCORDANCE_CALCLULATIONS{ + cpus = 20 + time = { 24.h * task.attempt } + memory = { 100.GB * task.attempt } + } + + withName: OTHER_DONOR_CONCORDANCE_CALCLULATIONS{ + cpus = 3 + time = { 6.h * task.attempt } + memory = { 20.GB * task.attempt } + } + + + withName: CELLSNP{ + memory = { 5.GB * task.attempt } + } + + withName: DYNAMIC_DONOR_EXCLUSIVE_SNP_SELECTION{ + cpus = 5 + time = { 12.h * task.attempt } + memory = { 20.GB * task.attempt } + } + + withName: prep_collectmetadata{ + memory = { 150.MB * task.attempt } + } + + withName: VIREO{ + //# maxForks=7 + cpus = 5 + time = { 12.h * task.attempt } + memory = { 70.GB * task.attempt } + } + withName: DSB_INTEGRATE{ + memory = { 200.GB * task.attempt } + cpus = { 4 * task.attempt } + maxRetries = 3 + } + + withName: MULTIMODAL_INTEGRATION{ + memory = { 200.GB * task.attempt } + cpus = { 4 * task.attempt } + maxRetries = 3 + } + + withName: umap_gather{ + memory = { 100.GB * task.attempt } + errorStrategy = 'retry' + maxRetries = 3 + } + + withName: DOUBLET_FINDER{ + memory = { 100.GB * task.attempt } + } + + + + withName: GT_MATCH_POOL_AGAINST_PANEL{ + time = { 24.h * task.attempt } + } + + withName: plot_predicted_sex{ + memory = { 50.GB * task.attempt } + maxRetries = 5 + cpus = 2 + + } + + + } diff --git a/conf/test.conf b/conf/test.conf index 6b947ed..185ca42 100755 --- a/conf/test.conf +++ b/conf/test.conf @@ -23,7 +23,10 @@ params { posterior_assignment 
= false //if this is set to true, we will perform the genotype donor matching after the deconvolution is performed. subset_genotypes = false tsv_donor_panel_vcfs = "https://yascp.cog.sanger.ac.uk/public/test_datasets/full_test_dataset/vcf_inputs_v2.tsv" //this is a panel of vcf files that we want to compar the genotypes with + ZSCORE_THRESH = 8 //# Minimum z0 threshold required to call the gt assignment confident. + ZSCORE_DIST_THRESH = 8 //# Minimum bifference between z0 and z1 to call the assignment confident, } + hard_filters_file = "${projectDir}/sample_input/sample_qc.yml" //this file defilnes what hard filters we want to use to flag/drop the cells //# For the training purposes we reduce the cellbender epochs and learning rate as this step takes a long time to compute. diff --git a/conf/test_full.conf b/conf/test_full.conf index 6b947ed..9b6150a 100755 --- a/conf/test_full.conf +++ b/conf/test_full.conf @@ -23,6 +23,9 @@ params { posterior_assignment = false //if this is set to true, we will perform the genotype donor matching after the deconvolution is performed. subset_genotypes = false tsv_donor_panel_vcfs = "https://yascp.cog.sanger.ac.uk/public/test_datasets/full_test_dataset/vcf_inputs_v2.tsv" //this is a panel of vcf files that we want to compar the genotypes with + ZSCORE_THRESH = 8 //# Minimum z0 threshold required to call the gt assignment confident. + ZSCORE_DIST_THRESH = 8 //# Minimum bifference between z0 and z1 to call the assignment confident, + } hard_filters_file = "${projectDir}/sample_input/sample_qc.yml" //this file defilnes what hard filters we want to use to flag/drop the cells diff --git a/main.nf b/main.nf index 93fc1f0..3cecc35 100755 --- a/main.nf +++ b/main.nf @@ -38,7 +38,7 @@ include {collect_file} from "$projectDir/modules/nf-core/modules/collect_file/ma include { CELLSNP;capture_cellsnp_files } from "$projectDir/modules/nf-core/modules/cellsnp/main" - +// Channel.of(params.outdir).mkdirs() ////// WORKFLOW: Run main nf-core/yascp analysis pipeline // This is the default entry point, we have others to update ceirtain parts of the results. 
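The ZSCORE_THRESH and ZSCORE_DIST_THRESH parameters introduced in the test profiles above gate how confidently a genotype (GT) assignment is called, where z0 and z1 are presumably the z-scores of the best and second-best matching donors: an assignment is treated as confident only when z0 clears ZSCORE_THRESH and the margin z0 - z1 clears ZSCORE_DIST_THRESH. The sketch below shows how such a rule is typically applied; the function name and score-table layout are illustrative assumptions, not the pipeline's actual GT-matching code.

    import numpy as np

    def is_confident_assignment(scores_by_donor, zscore_thresh=8.0, zscore_dist_thresh=8.0):
        """Illustrative only: decide whether the top GT match is confident.

        scores_by_donor: dict mapping candidate donor id -> raw match score;
        at least two donors with non-identical scores are assumed.
        """
        donors = list(scores_by_donor)
        raw = np.array([scores_by_donor[d] for d in donors], dtype=float)
        z = (raw - raw.mean()) / raw.std()        # z-score across all candidate donors
        order = np.argsort(z)[::-1]
        z0, z1 = z[order[0]], z[order[1]]         # best and runner-up
        confident = (z0 >= zscore_thresh) and ((z0 - z1) >= zscore_dist_thresh)
        return donors[order[0]], confident

With both thresholds set to 8 in these test profiles, only a match that stands far apart from every other candidate donor would be flagged as confident under a rule like this.
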
diff --git a/modules/nf-core/modules/citeseq/main.nf b/modules/nf-core/modules/citeseq/main.nf index 70edf6e..272029a 100755 --- a/modules/nf-core/modules/citeseq/main.nf +++ b/modules/nf-core/modules/citeseq/main.nf @@ -26,7 +26,7 @@ process SPLIT_CITESEQ_GEX { tuple val(sample_name), path("${sample_name}__Gene_Expression"), emit:gex_data tuple val(sample_name), path("antibody-${sample_name}.h5ad"), emit: ab_data2 optional true tuple val(sample_name), path("Gene_Expression-${sample_name}.h5ad"), emit: gex_h5ad optional true - tuple val(sample_name), path("${sample_name}__*"), emit: ab_data + tuple val(sample_name), path("${sample_name}__*"), emit: ab_data optional true tuple val(sample_name), path("${sample_name}__Gene_Expression/barcodes.tsv.gz"), path("${sample_name}__Gene_Expression/features.tsv.gz"), path("${sample_name}__Gene_Expression/matrix.mtx.gz"), emit: channel__file_paths_10x script: @@ -166,6 +166,8 @@ process VDJ_INTEGRATION{ output: path("*all_samples_integrated.vdj.RDS"), emit: all_data_integrated_vdj_rds + path("*all_samples_integrated.BCR.RDS"), emit: all_data_integrated_BCR_rds + path("*all_samples_integrated.TCR.RDS"), emit: all_data_integrated_TCR_rds input: path(all_cellranger_samples) diff --git a/modules/nf-core/modules/doubletdecon/main.nf b/modules/nf-core/modules/doubletdecon/main.nf index 0e9ef94..b50bc01 100644 --- a/modules/nf-core/modules/doubletdecon/main.nf +++ b/modules/nf-core/modules/doubletdecon/main.nf @@ -1,4 +1,4 @@ -process DOUBLET_DECON { +process DOUBLET_DECON{ tag "${experiment_id}" label 'process_medium' diff --git a/modules/nf-core/modules/prepere_yascp_inputs/main.nf b/modules/nf-core/modules/prepere_yascp_inputs/main.nf index a83c72c..ebabfe1 100644 --- a/modules/nf-core/modules/prepere_yascp_inputs/main.nf +++ b/modules/nf-core/modules/prepere_yascp_inputs/main.nf @@ -16,6 +16,7 @@ process YASCP_INPUTS { input: path(input_file) + // the output for this is a correct format input files as per cb 6.1 output: path("input_file_corectly_formatted.tsv"), emit: input_file_corectly_formatted diff --git a/nextflow.config b/nextflow.config index 572aa8b..2378616 100755 --- a/nextflow.config +++ b/nextflow.config @@ -25,8 +25,8 @@ params { max_multiqc_email_size = '25.MB' // Boilerplate options - outdir = output_dir = "${launchDir}/results" - tracedir = "${params.outdir}/pipeline_info" + outdir = "${launchDir}/results" + publish_dir_mode = 'copy' email = null email_on_fail = null @@ -38,7 +38,8 @@ params { schema_ignore_params = 'genomes,modules' enable_conda = false singularity_pull_docker_container = false - + tracedir = "${params.outdir}/pipeline_info" + // Config options custom_config_version = 'master' custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" @@ -78,13 +79,6 @@ includeConfig 'conf/qc.conf' includeConfig 'conf/modules.conf' -// Load igenomes.config if required -if (!params.igenomes_ignore) { - includeConfig 'conf/igenomes.conf' -} else { - params.genomes = [:] -} - profiles { debug { process.beforeScript = 'echo $HOSTNAME' } conda { diff --git a/subworkflows/celltype.nf b/subworkflows/celltype.nf index 7966b7a..4e611ad 100755 --- a/subworkflows/celltype.nf +++ b/subworkflows/celltype.nf @@ -70,9 +70,9 @@ workflow celltype{ }else{ sc_out = Channel.of() } - all_extra_fields = all_extra_fields.mix(sc_out) + all_extra_fields2 = all_extra_fields.mix(sc_out) - CELLTYPE_FILE_MERGE(az_out,ct_out,all_extra_fields,SPLIT_BATCH_H5AD.out.keras_outfile.collect()) + 
CELLTYPE_FILE_MERGE(az_out,ct_out,all_extra_fields2,SPLIT_BATCH_H5AD.out.keras_outfile.collect()) file__anndata_merged2=CELLTYPE_FILE_MERGE.out.file__anndata_merged2 file__anndata_merged2.view() diff --git a/subworkflows/local/retrieve_recourses.nf b/subworkflows/local/retrieve_recourses.nf index 6a48228..505b1c3 100755 --- a/subworkflows/local/retrieve_recourses.nf +++ b/subworkflows/local/retrieve_recourses.nf @@ -64,4 +64,22 @@ process RETRIEVE_RECOURSES_TEST_DATASET{ sed -i 's#/path/to/replace/pointing/to/downloaded/files#${params.outdir}/recourses/full_test_dataset#' input_test_data_file.tsv touch Done > Done.tmp """ +} + + +process STAGE_FILE{ + label 'process_tiny' + // In nf there is a function collectFile - however if you provide a symlinked file directory tusing nf function will overwrite the source instead of replacing the file + // This snipped is a replication of the function but as a nf module and hence the problem is avoided. + // publishDir "${params.outdir}/recourses", mode: "${params.copy_mode}", overwrite: true + input: + path(file) + output: + path(file) + + script: + + """ + echo 'staged' + """ } \ No newline at end of file diff --git a/subworkflows/main_deconvolution.nf b/subworkflows/main_deconvolution.nf index 08fd451..25828bf 100755 --- a/subworkflows/main_deconvolution.nf +++ b/subworkflows/main_deconvolution.nf @@ -17,7 +17,7 @@ include { match_genotypes } from './match_genotypes' include {ENHANCE_STATS_GT_MATCH } from "$projectDir/modules/nf-core/modules/genotypes/main" include {SUBSET_WORKF} from "$projectDir/modules/nf-core/modules/subset_genotype/main" include {REPLACE_GT_DONOR_ID2; REPLACE_GT_DONOR_ID2 as REPLACE_GT_DONOR_ID_SUBS } from "$projectDir/modules/nf-core/modules/genotypes/main" -include { RETRIEVE_RECOURSES } from "$projectDir/subworkflows/local/retrieve_recourses" +include { RETRIEVE_RECOURSES; STAGE_FILE } from "$projectDir/subworkflows/local/retrieve_recourses" include {GT_MATCH_POOL_IBD } from "$projectDir/modules/nf-core/modules/genotypes/main" include { MATCH_GT_VIREO } from "$projectDir/modules/nf-core/modules/genotypes/main" @@ -52,7 +52,7 @@ workflow main_deconvolution { }else{ genome = "${params.reference_assembly_fasta_dir}" } - + vcf_candidate_snps = STAGE_FILE(params.cellsnp.vcf_candidate_snps) // genome.subscribe { println "genome: $it" } if (params.genotype_input.run_with_genotype_input) { @@ -73,7 +73,7 @@ workflow main_deconvolution { // This will subsequently result in a joint vcf file per cohort per donors listed for each of the pools that can be used in VIREO and/or GT matching algorythm. 
SUBSET_WORKF(ch_ref_vcf,donors_in_pools,'AllExpectedGT',genome) merged_expected_genotypes = SUBSET_WORKF.out.merged_expected_genotypes - merged_expected_genotypes2 = merged_expected_genotypes.combine(Channel.fromPath(params.cellsnp.vcf_candidate_snps)) + merged_expected_genotypes2 = merged_expected_genotypes.combine(vcf_candidate_snps) // merged_expected_genotypes2.subscribe { println "merged_expected_genotypes2: $it" } GT_MATCH_POOL_IBD(SUBSET_WORKF.out.samplename_subsetvcf_ibd,'Withing_expected','Expected') @@ -105,10 +105,10 @@ workflow main_deconvolution { if (params.provide_within_pool_donor_specific_sites_for_pilup){ cellsnp_with_npooled2 = cellsnp_with_npooled.combine(cellsnp_panels, by: 0) }else{ - cellsnp_with_npooled2 = cellsnp_with_npooled.combine(Channel.fromPath(params.cellsnp.vcf_candidate_snps)) + cellsnp_with_npooled2 = cellsnp_with_npooled.combine(vcf_candidate_snps) } }else{ - cellsnp_with_npooled2 = cellsnp_with_npooled.combine(Channel.fromPath(params.cellsnp.vcf_candidate_snps)) + cellsnp_with_npooled2 = cellsnp_with_npooled.combine(vcf_candidate_snps) } log.info('Capturing some of the existing CELLSNP files') diff --git a/subworkflows/prepare_inputs.nf b/subworkflows/prepare_inputs.nf index 69c438c..be61c90 100755 --- a/subworkflows/prepare_inputs.nf +++ b/subworkflows/prepare_inputs.nf @@ -15,6 +15,7 @@ workflow prepare_inputs { YASCP_INPUTS(channel_input_data_table) channel_input_data_table = YASCP_INPUTS.out.input_file_corectly_formatted + channel_input_data_table .splitCsv(header: true, sep: params.input_tables_column_delimiter) diff --git a/workflows/yascp.nf b/workflows/yascp.nf index 85492f5..22774af 100755 --- a/workflows/yascp.nf +++ b/workflows/yascp.nf @@ -57,7 +57,7 @@ workflow YASCP { bam_split_channel = Channel.of() out_ch = params.outdir ? 
Channel.fromPath(params.outdir, checkIfExists:true) - : Channel.fromPath("${launchDir}/${params.outdir}") + : Channel.from("${launchDir}/${params.outdir}") // out_ch.map{row->"${row[0]}/possorted_genome_bam.bam" } prepare_inputs(input_channel) From f38ed2215cbe4bbefa67ff04c1ce0d7e8fd5528a Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Fri, 3 May 2024 11:07:15 +0100 Subject: [PATCH 5/8] some latest changes --- bin/DoubletDecon.R | 57 +++++++++++++-------- bin/gather_minimal_dataset.py | 95 +++++++---------------------------- conf/modules.conf | 12 +++-- 3 files changed, 63 insertions(+), 101 deletions(-) diff --git a/bin/DoubletDecon.R b/bin/DoubletDecon.R index 238b72b..8a6d3ae 100755 --- a/bin/DoubletDecon.R +++ b/bin/DoubletDecon.R @@ -647,7 +647,7 @@ Main_Doublet_Decon<-function(rawDataFile, groupsFile, filename, location, fullDa ## Run Doublet Decon ## tryCatch({ - + print(paste0('trying rhop: ',args$rhop)) results <- Main_Doublet_Decon(rawDataFile = processed$newExpressionFile, groupsFile = processed$newGroupsFile, filename = "DoubletDecon_results", @@ -667,25 +667,42 @@ tryCatch({ nCores = args$nCores) }, error = function(e) { - - results <- Main_Doublet_Decon(rawDataFile = processed$newExpressionFile, - groupsFile = processed$newGroupsFile, - filename = "DoubletDecon_results", - location = paste0(args$out, "/"), - fullDataFile = NULL, - removeCC = args$removeCC, - species = args$species, - rhop = 0.64, - write = TRUE, - PMF = args$pmf, - useFull = FALSE, - heatmap = args$heatmap, - centroids=args$centroids, - num_doubs=args$num_doubs, - only50=args$only50, - min_uniq=args$min_uniq, - nCores = args$nCores) - + tryCatch({ + print(paste0('trying rhop: ',0.64)) + results <- Main_Doublet_Decon(rawDataFile = processed$newExpressionFile, + groupsFile = processed$newGroupsFile, + filename = "DoubletDecon_results", + location = paste0(args$out, "/"), + fullDataFile = NULL, + removeCC = args$removeCC, + species = args$species, + rhop = 0.64, + write = TRUE, + PMF = args$pmf, + useFull = FALSE, + heatmap = args$heatmap, + centroids=args$centroids, + num_doubs=args$num_doubs, + only50=args$only50, + min_uniq=args$min_uniq, + nCores = args$nCores) + }, error = function(e) { + print(paste0('trying rhop: ',0.5)) + results <- Main_Doublet_Decon(rawDataFile = processed$newExpressionFile, + groupsFile = processed$newGroupsFile, + filename = "DoubletDecon_results", + location = paste0('.', "/"), + fullDataFile = NULL, + removeCC = FALSE, + species = args$species, + rhop = 0.5, + write = TRUE, + PMF = args$pmf, + useFull = FALSE, + heatmap = args$heatmap, + nCores = 3) + } + ) } ) diff --git a/bin/gather_minimal_dataset.py b/bin/gather_minimal_dataset.py index 4349485..c54ee90 100755 --- a/bin/gather_minimal_dataset.py +++ b/bin/gather_minimal_dataset.py @@ -45,8 +45,8 @@ SCRUBLET_ASSIGNMENTS_FNSUFFIX = 'scrublet.tsv' COLUMNS_AZIMUTH = { - 'Azimuth:predicted.celltype.l2': 'azimuth.celltyp.l2', - 'Azimuth:predicted.celltype.l2.score': 'azimuth.pred.score.l2', + 'Azimuth:Azimuth:predicted.celltype.l2': 'azimuth.celltyp.l2', + 'Azimuth:Azimuth:predicted.celltype.l2.score': 'azimuth.pred.score.l2', 'Azimuth:mapping.score.celltype.l2': 'azimuth.map.score', } @@ -61,14 +61,14 @@ 'cell_passes_qc:score':'qc.filter.pass:score', 'cell_passes_qc-per:all_together::exclude':'qc.filter.pass.spikein_exclude', 'cell_passes_qc-per:all_together::exclude:score':'qc.filter.pass.spikein_exclude:score', - 'cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2':'qc.filter.pass.AZ:L0', - 
'cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2:score':'qc.filter.pass.AZ:L0:score', + 'cell_passes_qc-per:Azimuth:L0_Azimuth:predicted.celltype.l2':'qc.filter.pass.AZ:L0', + 'cell_passes_qc-per:Azimuth:L0_Azimuth:predicted.celltype.l2:score':'qc.filter.pass.AZ:L0:score', 'total_counts': 'qc.umi.count.total', 'total_counts_gene_group__mito_transcript': 'qc.umi.count.mt', 'pct_counts_gene_group__mito_transcript': 'qc.umi.perc.mt', 'n_genes_by_counts': 'qc.genes.detected.count', - 'Azimuth:L0_predicted.celltype.l2':'azimuth.celltyp.l0', - 'Azimuth:L1_predicted.celltype.l2':'azimuth.celltyp.l1', + 'Azimuth:L0_Azimuth:predicted.celltype.l2':'azimuth.celltyp.l0', + 'Azimuth:L1_Azimuth:predicted.celltype.l2':'azimuth.celltyp.l1', 'total_counts_gene_group__mito_transcript':'total_counts_gene_group__mito_transcript', 'pct_counts_gene_group__mito_transcript':'pct_counts_gene_group__mito_transcript', 'total_counts_gene_group__mito_protein':'total_counts_gene_group__mito_protein', @@ -224,49 +224,6 @@ def anndata_from_h5( return adata -def gather_azimuth_annotation(expid, datadir_azimuth, index_label = None): - # e.g. A4C06803ACD34DFB-adata_franke_Pilot_3_lane_3_predicted_celltype.tsv.gz - filpath = None - expid2=expid - fnsfx = '_{}{}'.format(expid, AZIMUTH_ASSIGNMENTS_FNSUFFIX) - for fn in os.listdir(datadir_azimuth): - if fn.endswith(fnsfx): - filpath = os.path.join(datadir_azimuth, fn) - break - if not filpath: - expid ='full' - fnsfx = '_{}{}'.format(expid, AZIMUTH_ASSIGNMENTS_FNSUFFIX) - for fn in os.listdir(datadir_azimuth): - if fn.endswith(fnsfx): - if fn.startswith('remapped'): - filpath = os.path.join(datadir_azimuth, fn) - break - if not filpath: - expid ='full' - fnsfx = '_{}{}'.format(expid, AZIMUTH_ASSIGNMENTS_FNSUFFIX) - for fn in os.listdir(datadir_azimuth): - if fn.endswith(fnsfx): - print(fn) - if not fn.startswith('remapped'): - filpath = os.path.join(datadir_azimuth, fn) - break - - if not filpath: - sys.exit("ERROR: could not find filename suffix '{}' in direcotry {}\n" - .format(fnsfx, datadir_azimuth)) - azt = pandas.read_table(filpath,index_col=0) - if (expid=='full'): - expid=expid2 - azt = azt[azt.index.str.contains(expid2)] - # filter to only the experiment id inputs. 
- df = get_df_from_mangled_index(azt, expid) - azt.insert(0, "barcode", df["barcode"]) - azt.insert(1, "donor", df["donor"]) - azt.insert(2, "experiment_id", expid) - if index_label is not None and index_label == "barcode": - azt.insert(0, "mangled_cell_id", df.index) - azt = azt.set_index("barcode", drop = True) - return azt def load_scrublet_assignments(expid, datadir_scrublet): filpath = None @@ -544,22 +501,12 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane except: metrics = pd.read_csv(f'/lustre/scratch123/hgi/projects/ukbb_scrna/pipelines/Pilot_UKB/qc/{args.experiment_name}/results_rsync2/results/handover/Summary_plots/{args.experiment_name}/Fetch Pipeline/CSV/Submission_Data_Pilot_UKB.file_metadata.tsv',sep='\t') metrics = metrics[metrics['Sample_id']==expid] + ############# - #Azimuth cell-type assignments + #Cell-type assignments ############# - datadir_azimuth = f'{args.results_dir}/celltype/azimuth' - for datadir_azimuth in glob.glob(f'{args.results_dir}/celltype/azimuth/*'): - if os.path.isdir(datadir_azimuth): - try: - azt = gather_azimuth_annotation( - expid, datadir_azimuth=datadir_azimuth, - index_label = 'barcode') - columns_output = {**columns_output, **COLUMNS_AZIMUTH} - except: - azt = None - else: - azt = None + azt = pd.read_csv(f'{args.results_dir}/celltype/All_Celltype_Assignments.csv',sep='\t',index_col=0) ########################## # Scrublet ######################### @@ -647,7 +594,7 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane chromium_channel = 'Run_ID not vailable' - Azimuth_Cell_Assignments_data = azt + def isNaN(num): return num!= num @@ -719,12 +666,7 @@ def isNaN(num): Cells_before_QC_filters=len(all_QC_lane.obs['cell_passes_qc']) Cells_passing_QC=len(all_QC_lane.obs[all_QC_lane.obs['cell_passes_qc']]) Cells_failing_QC=len(all_QC_lane.obs[all_QC_lane.obs['cell_passes_qc']==False]) - try: - Azimuth_Cell_Assignments_data=Azimuth_Cell_Assignments_data.set_index('mangled_cell_id') - all_QC_lane.obs['predicted celltype']=Azimuth_Cell_Assignments_data['predicted.celltype.l2'] - - except: - print('skipped az') + UMIS_mapped_to_mitochondrial_genes = sum(all_QC_lane.obs['total_counts_gene_group__mito_transcript']) UMIS_mapped_to_ribo_genes = sum(all_QC_lane.obs['total_counts_gene_group__ribo_protein']) UMIS_mapped_to_ribo_rna = sum(all_QC_lane.obs['total_counts_gene_group__ribo_rna']) @@ -819,8 +761,9 @@ def isNaN(num): data_donor_for_stats['cells failing QC'].append(Donor_cells_fails_qc) data_donor_for_stats['cells passing QC'].append(Donor_cells_passes_qc) - Donor_cell_assignments = Azimuth_Cell_Assignments_data.loc[Mengled_barcodes_donor] #for this have to figure out when the cell type is unasigned. - Cell_types_detected = len(set(Donor_cell_assignments['predicted.celltype.l2'])) + Donor_cell_assignments = azt.loc[Mengled_barcodes_donor] #for this have to figure out when the cell type is unasigned. 
+ # Donor_cell_assignments = Donor_cell_assignments.rename(COLUMNS_AZIMUTH,axis=1) + Cell_types_detected = len(set(Donor_cell_assignments['Azimuth:predicted.celltype.l2'])) Donor_UMIS_mapped_to_mitochondrial_genes = sum(Donor_qc_files.obs['total_counts_gene_group__mito_transcript']) Donor_UMIS_mapped_to_ribo_genes = sum(Donor_qc_files.obs['total_counts_gene_group__ribo_protein']) Donor_UMIS_mapped_to_ribo_rna = sum(Donor_qc_files.obs['total_counts_gene_group__ribo_rna']) @@ -833,8 +776,8 @@ def isNaN(num): Median_UMIs_per_cell= statistics.median(pd.DataFrame(Donor_qc_files.X.sum(axis=1))[0]) Cell_numbers = '' - for type1 in set(Donor_cell_assignments['predicted.celltype.l2']): - nr_cells_of_this_type = len(Donor_cell_assignments[Donor_cell_assignments['predicted.celltype.l2']==type1]) + for type1 in set(Donor_cell_assignments['Azimuth:predicted.celltype.l2']): + nr_cells_of_this_type = len(Donor_cell_assignments[Donor_cell_assignments['Azimuth:predicted.celltype.l2']==type1]) Cell_numbers+=f"{type1}:{nr_cells_of_this_type} ; " @@ -984,7 +927,7 @@ def isNaN(num): 'Tranche Pass/Fail':Tranche_Pass_Fail, 'Tranche Failure Reason':Tranche_Failure_Reason } - return fctr, data_tranche, data_donor,azt + return fctr, data_tranche, data_donor def set_argument_parser(): @@ -1064,7 +1007,7 @@ def set_argument_parser(): adqc.obs['tranche.id']=args.experiment_name try: - adqc.obs['cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2:score'] = adqc.obs['cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2:score'].astype(float,errors='ignore') + adqc.obs['cell_passes_qc-per:Azimuth:L0_Azimuth:predicted.celltype.l2:score'] = adqc.obs['cell_passes_qc-per:Azimuth:L0_Azimuth:predicted.celltype.l2:score'].astype(float,errors='ignore') except: _='no values associated' try: @@ -1082,7 +1025,7 @@ def set_argument_parser(): continue #Here no cells has passed the qc thresholds. nf, data_tranche, data_donor, azt = gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = oufh, lane_id=count,Resolution=Resolution) # add the stuff to the adata. - azt=azt.set_index('mangled_cell_id') + All_probs_and_celltypes=pd.concat([All_probs_and_celltypes,azt]) data_tranche_all.append(data_tranche) data_donor_all= data_donor_all+data_donor diff --git a/conf/modules.conf b/conf/modules.conf index 0df8fa4..bcb6f3a 100755 --- a/conf/modules.conf +++ b/conf/modules.conf @@ -53,6 +53,12 @@ params { } + + +} + + +process { withName: SCRUBLET{ maxRetries = 3 errorStrategy = { task.attempt > 2 ? 
'ignore' : 'retry' } @@ -283,8 +289,6 @@ params { memory = { 100.GB * task.attempt } } - - withName: GT_MATCH_POOL_AGAINST_PANEL{ time = { 24.h * task.attempt } } @@ -296,6 +300,4 @@ params { } - - -} +} \ No newline at end of file From cf87ee478e6acbfa79ec59ed514821f61b0f3e82 Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Fri, 10 May 2024 14:43:49 +0100 Subject: [PATCH 6/8] v1.6 nearly finished, needs some more trsting --- bin/0026-filter_outlier_cells.py | 414 ++++++++++-------- bin/gather_minimal_dataset.py | 15 +- bin/remap_azimuth_l2.py | 2 +- bin/scanpy_merge_from_h5ad.py | 311 +------------ bin/scanpy_split_h5ad.py | 48 +- conf/base.conf | 1 + conf/modules.conf | 8 +- conf/qc.conf | 4 +- modules/nf-core/modules/azimuth/main.nf | 17 +- modules/nf-core/modules/citeseq/main.nf | 15 +- .../nf-core/modules/outlier_filter/main.nf | 5 +- modules/nf-core/modules/vireo/main.nf | 2 +- subworkflows/celltype.nf | 25 +- subworkflows/main_deconvolution.nf | 8 +- subworkflows/qc.nf | 2 +- workflows/yascp.nf | 3 - 16 files changed, 319 insertions(+), 561 deletions(-) diff --git a/bin/0026-filter_outlier_cells.py b/bin/0026-filter_outlier_cells.py index 1b623b6..14b3623 100755 --- a/bin/0026-filter_outlier_cells.py +++ b/bin/0026-filter_outlier_cells.py @@ -12,6 +12,7 @@ from distutils.version import LooseVersion import numpy as np import scanpy as sc +import re import pandas as pd from sklearn.svm import OneClassSVM from sklearn.covariance import EllipticEnvelope @@ -58,6 +59,18 @@ def predict(self, X): return (np.sign(self.thresh) * (X - self.median - self.thresh*self.mad) > 0).any(axis=1) + def decision_function(self,cols): + tresholds_set = (self.median + self.thresh*self.mad).ravel() + thresh = self.thresh.ravel() + decission_string = "" + for i,tr1 in enumerate(thresh): + if tr1<0: + trs=f"<{tresholds_set[i]}" + else: + trs=f">{tresholds_set[i]}" + decission_string = f"{decission_string}{tr1}({cols[i]}{trs})" + return decission_string + def fit_predict(self, X): self.fit(X) return self.predict(X) @@ -78,11 +91,19 @@ def perform_adaptiveQC_Filtering(clf,adata,method,metadata_columns): adata.obs[metadata_columns].values ) == 1 predicted_scores = clf.decision_function(adata.obs[metadata_columns].values) + elif method == 'MAD': + metadata_columns2 = metadata_columns.copy() + metadata_columns2.pop() + f = clf.fit_predict( + adata.obs[metadata_columns2].values + ) == 1 + predicted_scores = clf.decision_function(metadata_columns2) + else: f = clf.fit_predict( adata.obs[metadata_columns].values ) == 1 - predicted_scores = clf.decision_function(adata.obs[metadata_columns].values) + predicted_scores = clf.decision_function(adata.obs[metadata_columns].values) return predicted_scores,f @@ -201,6 +222,16 @@ def main(): help='Columns to use for outliers.\ (default: %(default)s)' ) + + parser.add_argument( + '--MAD_thresholds', + action='store', + dest='MAD_thresholds', + type=str, + default='-5,-5,5', + help='Columns to use for outliers.\ + (default: %(default)s)' + ) parser.add_argument( '--cell_qc_column', @@ -285,7 +316,7 @@ def main(): # Here we add an adaptive QC per Column # Drop out previous QCed cells - cell_qc_column = options.cell_qc_column + # if cell_qc_column in adata.obs.columns: # n_cells_original = adata.shape[0] # adata = adata[adata.obs[cell_qc_column], :] @@ -313,194 +344,217 @@ def main(): # ] method = options.method - if method == 'LocalOutlierFactor': - # fit the model for outlier detection (default) - clf = LocalOutlierFactor( - #n_neighbors=100, - # contamination=outliers_fraction 
- ) - elif method == 'IsolationForest': - max_samples = options.max_samples - # if max_samples == 0.0: - # if n_cells < 1000: - # max_samples = 250 - # else: - # max_samples = 0.1 - print("Using max_samples of:\t{}".format(max_samples)) - clf = IsolationForest( - #n_estimators=500, - max_samples=max_samples, - warm_start=False, - contamination=outliers_fraction, - random_state=0, - bootstrap=True - ) - elif method == 'EllipticEnvelope': - if outliers_fraction == 'auto': - outliers_fraction = 0.1 - clf = EllipticEnvelope( - contamination=outliers_fraction - ) - elif method == 'OneClassSVM': - clf = OneClassSVM( - # nu=n_outliers, - # kernel="rbf", - # gamma=0.1 - ) - elif method == 'MAD': - mad_thresh = [-5, -5, 5] - clf = onesidemad(mad_thresh) - else: - raise ValueError('ERROR: invalid method.') - - metadata_columns_original = metadata_columns.copy() - # We perform the adaptive qc either for all data together or per user defined column of unique values. + + + methods = re.split(';|,',options.method) outlier_filtering_strategys = options.outlier_filtering_strategys # outlier_filtering_strategys = 'Azimuth:L0_predicted.celltype.l2;all_together;all_together::exclude' - outlier_filtering_strategys = outlier_filtering_strategys.split(';') - matching = [s for s in outlier_filtering_strategys if "::" in s] - for m1 in matching: - # duplicate the strategy if we have flagged it as an extra step by adding ::no_exclude flag - m2 = m1.split('::')[0] - if m1 =='all_together::exclude': - adata.obs[m1] = 'all_cells' - else: - adata.obs[m1] = m2 - - # We load the GT match file and determine the celline match that needs to be subjected to adaptive qc independently - if(options.patterns_exclude): - ######## The folowing bit of code takes the GT match outputs and utilises this to run adaptive qc on cellines independently - only if the celline is expected. 
- if(options.gt_match_file!='fake_file.fq'): - GT_match_file = pd.read_csv(options.gt_match_file,sep='\t') - GT_patterns = options.patterns_exclude.split(';') - for outlier_filtering_strategy in outlier_filtering_strategys: - strategy = outlier_filtering_strategy - if strategy!='all_together': - for pattern in GT_patterns: - Matches = GT_match_file[GT_match_file['donor_gt'].str.contains(pattern)] - if len(Matches)>0: - for i,match in Matches.iterrows(): - donor=match['donor_query'] - pool=match['pool'] - expected = match['Match Expected'] - if expected: - filter_query = f"convoluted_samplename=='{pool}' and Donor=='{donor}'" - idx1 = adata.obs.query(filter_query).index - try: - adata.obs[strategy] = adata.obs[strategy].cat.add_categories(pattern) - except: - _='category exists already' - adata.obs.loc[idx1,strategy]=pattern - ######## - # all_index = pd.DataFrame(adata.obs.index,columns=['col']) - # all_together = all_indexes.str[0]+'-'+all_indexes.str[1]+'-'+all_indexes.str[2] + outlier_filtering_strategys = re.split(';|,',outlier_filtering_strategys) + metadata_columns_original = metadata_columns.copy() - for outlier_filtering_strategy in outlier_filtering_strategys: - metadata_columns = metadata_columns_original.copy() - if (outlier_filtering_strategy == 'all_together'): + for method in methods: + print(f'-------Running {method} outlier filtering strategy-------') + nmethod = method + + if method == 'LocalOutlierFactor': + cell_qc_column = nmethod+':'+options.cell_qc_column + # fit the model for outlier detection (default) + clf = LocalOutlierFactor( + #n_neighbors=100, + # contamination=outliers_fraction + ) + elif method == 'IsolationForest': + max_samples = options.max_samples + # if max_samples == 0.0: + # if n_cells < 1000: + # max_samples = 250 + # else: + # max_samples = 0.1 cell_qc_column = options.cell_qc_column - cell_qc_column_score = f"{cell_qc_column}:score" - adata.obs[cell_qc_column] = True - adata.obs[cell_qc_column_score] = None - metadata_columns.append(cell_qc_column) - prediction_score, fail_pass = perform_adaptiveQC_Filtering(clf,adata,method,metadata_columns) - adata.obs[cell_qc_column] = fail_pass - adata.obs[cell_qc_column_score] = prediction_score + print("Using max_samples of:\t{}".format(max_samples)) + clf = IsolationForest( + #n_estimators=500, + max_samples=max_samples, + warm_start=False, + contamination=outliers_fraction, + random_state=0, + bootstrap=True + ) + elif method == 'EllipticEnvelope': + cell_qc_column = nmethod+':'+options.cell_qc_column + if outliers_fraction == 'auto': + outliers_fraction = 0.1 + clf = EllipticEnvelope( + contamination=outliers_fraction + ) + elif method == 'OneClassSVM': + cell_qc_column = nmethod+':'+options.cell_qc_column + clf = OneClassSVM( + # nu=n_outliers, + # kernel="rbf", + # gamma=0.1 + ) + elif method == 'MAD': + mad_thresh = np.array([ int(x) for x in re.split(';|,',options.MAD_thresholds)]) + nmethod = method+options.MAD_thresholds + cell_qc_column = nmethod+':'+options.cell_qc_column + clf = onesidemad(mad_thresh) else: - cell_qc_column = f'{cell_qc_column}-per:{outlier_filtering_strategy}' - cell_qc_column_score = f"{cell_qc_column}:score" - metadata_columns.append(cell_qc_column) - adata.obs[cell_qc_column] = True - adata.obs[cell_qc_column_score] = None - try: - os.mkdir(f'per_celltype_outliers__{outlier_filtering_strategy}') - except: - _='exists already' - try: - outlier_strategy_cols = set(adata.obs[outlier_filtering_strategy]) - except: - print('user provided col doesnt exist') - continue - for 
subset_id_for_ad_qc in outlier_strategy_cols: - subset_ad = adata[adata.obs[outlier_filtering_strategy]==subset_id_for_ad_qc] - print(f'filtering:{subset_id_for_ad_qc} {outlier_filtering_strategy}') - if(len(subset_ad)>100): - # We only perform adaptive qc when there is at least 100 cells, otherwise we assume that all pass - prediction_score, fail_pass = perform_adaptiveQC_Filtering(clf,subset_ad,method,metadata_columns) - adata.obs.loc[subset_ad.obs.index,cell_qc_column]=fail_pass - subset_ad.obs.loc[subset_ad.obs.index,cell_qc_column]=fail_pass - - adata.obs.loc[subset_ad.obs.index,cell_qc_column_score]=prediction_score - subset_ad.obs.loc[subset_ad.obs.index,cell_qc_column_score]=prediction_score - + raise ValueError('ERROR: invalid method.') + print(cell_qc_column) + metadata_columns=metadata_columns_original.copy() + metadata_columns.append(cell_qc_column) + # We perform the adaptive qc either for all data together or per user defined column of unique values. + + matching = [s for s in outlier_filtering_strategys if "::" in s] + for m1 in matching: + # duplicate the strategy if we have flagged it as an extra step by adding ::no_exclude flag + m2 = m1.split('::')[0] + if m1 =='all_together::exclude': + adata.obs[m1] = 'all_cells' + else: + adata.obs[m1] = m2 + + # We load the GT match file and determine the celline match that needs to be subjected to adaptive qc independently + if(options.patterns_exclude): + ######## The folowing bit of code takes the GT match outputs and utilises this to run adaptive qc on cellines independently - only if the celline is expected. + if(options.gt_match_file!='fake_file.fq'): + GT_match_file = pd.read_csv(options.gt_match_file,sep='\t') + GT_patterns = options.patterns_exclude.split(';') + for outlier_filtering_strategy in outlier_filtering_strategys: + strategy = outlier_filtering_strategy + if strategy!='all_together': + for pattern in GT_patterns: + Matches = GT_match_file[GT_match_file['donor_gt'].str.contains(pattern)] + if len(Matches)>0: + for i,match in Matches.iterrows(): + donor=match['donor_query'] + pool=match['pool'] + expected = match['Match Expected'] + if expected: + filter_query = f"convoluted_samplename=='{pool}' and Donor=='{donor}'" + idx1 = adata.obs.query(filter_query).index + try: + adata.obs[strategy] = adata.obs[strategy].cat.add_categories(pattern) + except: + _='category exists already' + adata.obs.loc[idx1,strategy]=pattern + ######## + # all_index = pd.DataFrame(adata.obs.index,columns=['col']) + # all_together = all_indexes.str[0]+'-'+all_indexes.str[1]+'-'+all_indexes.str[2] + + for outlier_filtering_strategy in outlier_filtering_strategys: + metadata_columns = metadata_columns_original.copy() + + if (outlier_filtering_strategy == 'all_together'): + if method == 'IsolationForest': + cell_qc_column = options.cell_qc_column else: - print(f'For a category {subset_id_for_ad_qc} we have only {len(subset_ad)} cells and as its not sufficient ammount to estimate distributions we assuma all pass QC') - # subset_ad.uns['cell_outlier_estimator'] = method - of = f'per_celltype_outliers__{outlier_filtering_strategy}/{subset_id_for_ad_qc}---{options.of}' - generate_plots(subset_ad,cell_qc_column,metadata_columns,metadata_columns_original,of) - del subset_ad - adata.uns['cell_outlier_estimator'] = method - adata.obs[['cell_id', cell_qc_column]].to_csv( - f'{options.of}-outliers_filtered__{cell_qc_column}.tsv.gz', - sep='\t', - compression=compression_opts, - index=False, - header=True - ) - - # Calculate cell_filtered_per_experiment - 
filter_columns = [ - 'experiment_id', - 'filter_type', - 'n_cells_left_in_adata' - ] - if options.cell_filtered_per_experiment == 'None': - df_cell_filt_per_exp = adata.obs['experiment_id'].value_counts() - df_cell_filt_per_exp = df_cell_filt_per_exp.rename_axis( + cell_qc_column = f'{cell_qc_column}-per:{outlier_filtering_strategy}' + cell_qc_column_score = f"{cell_qc_column}:score" + adata.obs[cell_qc_column] = True + adata.obs[cell_qc_column_score] = None + + metadata_columns.append(cell_qc_column) + + prediction_score, fail_pass = perform_adaptiveQC_Filtering(clf,adata,method,metadata_columns) + adata.obs[cell_qc_column] = fail_pass + adata.obs[cell_qc_column_score] = prediction_score + else: + cell_qc_column = f'{cell_qc_column}-per:{outlier_filtering_strategy}' + cell_qc_column_score = f"{cell_qc_column}:score" + + adata.obs[cell_qc_column] = True + adata.obs[cell_qc_column_score] = '' + try: + os.mkdir(f'per_celltype_outliers__{outlier_filtering_strategy}') + except: + _='exists already' + try: + outlier_strategy_cols = set(adata.obs[outlier_filtering_strategy]) + except: + print(f'{outlier_filtering_strategy} - user provided col doesnt exist') + continue + for subset_id_for_ad_qc in outlier_strategy_cols: + subset_ad = adata[adata.obs[outlier_filtering_strategy]==subset_id_for_ad_qc] + print(f'filtering:{subset_id_for_ad_qc} {outlier_filtering_strategy}') + if(len(subset_ad)>100): + # We only perform adaptive qc when there is at least 100 cells, otherwise we assume that all pass + prediction_score, fail_pass = perform_adaptiveQC_Filtering(clf,subset_ad,method,metadata_columns) + adata.obs.loc[subset_ad.obs.index,cell_qc_column]=fail_pass + subset_ad.obs.loc[subset_ad.obs.index,cell_qc_column]=fail_pass + + adata.obs.loc[subset_ad.obs.index,cell_qc_column_score]=prediction_score + subset_ad.obs.loc[subset_ad.obs.index,cell_qc_column_score]=prediction_score + + else: + print(f'For a category {subset_id_for_ad_qc} we have only {len(subset_ad)} cells and as its not sufficient ammount to estimate distributions we assuma all pass QC') + # subset_ad.uns['cell_outlier_estimator'] = method + of = f'per_celltype_outliers__{outlier_filtering_strategy}/{subset_id_for_ad_qc}---{options.of}' + generate_plots(subset_ad,cell_qc_column,metadata_columns,metadata_columns_original,of) + del subset_ad + adata.uns['cell_outlier_estimator'] = method + adata.obs[['cell_id', cell_qc_column]].to_csv( + f'{options.of}-outliers_filtered__{cell_qc_column}.tsv.gz', + sep='\t', + compression=compression_opts, + index=False, + header=True + ) + + # Calculate cell_filtered_per_experiment + filter_columns = [ + 'experiment_id', + 'filter_type', + 'n_cells_left_in_adata' + ] + if options.cell_filtered_per_experiment == 'None': + df_cell_filt_per_exp = adata.obs['experiment_id'].value_counts() + df_cell_filt_per_exp = df_cell_filt_per_exp.rename_axis( + 'experiment_id' + ).reset_index(name='n_cells_left_in_adata') + df_cell_filt_per_exp['filter_type'] = 'before_filters' + df_cell_filt_per_exp = df_cell_filt_per_exp[filter_columns] + else: + # Load the samples filtered per experiment file: + df_cell_filt_per_exp = pd.read_csv( + options.cell_filtered_per_experiment, + sep="\t" + ) + filt = df_cell_filt_per_exp['filter_type'] != 'after_filters' + df_cell_filt_per_exp = df_cell_filt_per_exp.loc[filt, :] + + # Now calculate the n cells left after all filters + adata_after_filters = adata.obs.loc[adata.obs[cell_qc_column], :] + df_cells_filtered = adata_after_filters['experiment_id'].value_counts() + df_cells_filtered = 
df_cells_filtered.rename_axis( 'experiment_id' ).reset_index(name='n_cells_left_in_adata') - df_cell_filt_per_exp['filter_type'] = 'before_filters' - df_cell_filt_per_exp = df_cell_filt_per_exp[filter_columns] - else: - # Load the samples filtered per experiment file: - df_cell_filt_per_exp = pd.read_csv( - options.cell_filtered_per_experiment, - sep="\t" + df_cells_filtered['filter_type'] = '{} {} outlier_{}'.format( + 'filter__all_samples', + 'after_outlier_filter', + method + ) + df_cells_filtered = df_cells_filtered[filter_columns] + df_cell_filt_per_exp = df_cell_filt_per_exp.append( + df_cells_filtered, + ignore_index=True + ) + df_cells_filtered['filter_type'] = 'after_filters' + df_cell_filt_per_exp = df_cell_filt_per_exp.append( + df_cells_filtered, + ignore_index=True + ) + # Save the final dataframe + df_cell_filt_per_exp.to_csv( + f'{options.of}-cell_filtered_per_experiment__{cell_qc_column}.tsv.gz', + sep='\t', + compression=compression_opts, + index=False, + header=True ) - filt = df_cell_filt_per_exp['filter_type'] != 'after_filters' - df_cell_filt_per_exp = df_cell_filt_per_exp.loc[filt, :] - # Now calculate the n cells left after all filters - adata_after_filters = adata.obs.loc[adata.obs[cell_qc_column], :] - df_cells_filtered = adata_after_filters['experiment_id'].value_counts() - df_cells_filtered = df_cells_filtered.rename_axis( - 'experiment_id' - ).reset_index(name='n_cells_left_in_adata') - df_cells_filtered['filter_type'] = '{} {} outlier_{}'.format( - 'filter__all_samples', - 'after_outlier_filter', - method - ) - df_cells_filtered = df_cells_filtered[filter_columns] - df_cell_filt_per_exp = df_cell_filt_per_exp.append( - df_cells_filtered, - ignore_index=True - ) - df_cells_filtered['filter_type'] = 'after_filters' - df_cell_filt_per_exp = df_cell_filt_per_exp.append( - df_cells_filtered, - ignore_index=True - ) - # Save the final dataframe - df_cell_filt_per_exp.to_csv( - f'{options.of}-cell_filtered_per_experiment__{cell_qc_column}.tsv.gz', - sep='\t', - compression=compression_opts, - index=False, - header=True - ) - - generate_plots(adata,cell_qc_column,metadata_columns,metadata_columns_original,options.of) + generate_plots(adata,cell_qc_column,metadata_columns,metadata_columns_original,options.of) adata.write( diff --git a/bin/gather_minimal_dataset.py b/bin/gather_minimal_dataset.py index c54ee90..a921145 100755 --- a/bin/gather_minimal_dataset.py +++ b/bin/gather_minimal_dataset.py @@ -45,8 +45,8 @@ SCRUBLET_ASSIGNMENTS_FNSUFFIX = 'scrublet.tsv' COLUMNS_AZIMUTH = { - 'Azimuth:Azimuth:predicted.celltype.l2': 'azimuth.celltyp.l2', - 'Azimuth:Azimuth:predicted.celltype.l2.score': 'azimuth.pred.score.l2', + 'Azimuth:predicted.celltype.l2': 'azimuth.celltyp.l2', + 'Azimuth:predicted.celltype.l2.score': 'azimuth.pred.score.l2', 'Azimuth:mapping.score.celltype.l2': 'azimuth.map.score', } @@ -327,7 +327,7 @@ def gather_donor(donor_id, ad, ad_lane_raw, azimuth_annot, qc_obs, columns_outpu ad.raw = ad_lane_raw[ad.obs.index, :] if donor_id != "unassigned" and donor_id != "doublet": # add annotation from QC - df = pandas.concat([ad.obs, azimuth_annot.loc[azimuth_annot.donor == donor_id]], axis = 1, join = 'outer') + df = pandas.concat([ad.obs, azimuth_annot.loc[azimuth_annot.Donor == donor_id]], axis = 1, join = 'outer') df =df.loc[:,~df.columns.duplicated()] df = df[['experiment_id'] + list(COLUMNS_DECONV.keys()) + list(COLUMNS_AZIMUTH.keys())] try: @@ -370,7 +370,10 @@ def gather_donor(donor_id, ad, ad_lane_raw, azimuth_annot, qc_obs, columns_outpu ad.obs = 
ad.obs.loc[:,~ad.obs.columns.duplicated()] if write_h5: path1=os.path.join(outdir, oufnam + '.h5ad') - ad.obs['qc.filter.pass.AZ:L0'] = ad.obs['qc.filter.pass.AZ:L0'].astype('bool') + try: + ad.obs['qc.filter.pass.AZ:L0'] = ad.obs['qc.filter.pass.AZ:L0'].astype('bool') + except: + pass ad.obs['cell_passes_hard_filters'] = ad.obs['cell_passes_hard_filters'].astype('bool') ad.obs['qc.filter.pass'] = ad.obs['qc.filter.pass'].astype('bool') ad.write(path1,compression='gzip') @@ -1023,10 +1026,10 @@ def set_argument_parser(): ad = adqc[s] if ad.n_obs == 0: continue #Here no cells has passed the qc thresholds. - nf, data_tranche, data_donor, azt = gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = oufh, lane_id=count,Resolution=Resolution) + nf, data_tranche, data_donor = gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = oufh, lane_id=count,Resolution=Resolution) # add the stuff to the adata. - All_probs_and_celltypes=pd.concat([All_probs_and_celltypes,azt]) + # All_probs_and_celltypes=pd.concat([All_probs_and_celltypes,azt]) data_tranche_all.append(data_tranche) data_donor_all= data_donor_all+data_donor count += 1 diff --git a/bin/remap_azimuth_l2.py b/bin/remap_azimuth_l2.py index 4b00a79..22261b9 100755 --- a/bin/remap_azimuth_l2.py +++ b/bin/remap_azimuth_l2.py @@ -53,7 +53,7 @@ D1 =All_Data D1['idx1']=D1.index -D1 = All_Data.set_index('predicted.celltype.l2') +D1 = All_Data.set_index('Azimuth:predicted.celltype.l2') for col in Mappings.columns: D1[f'{col}_predicted.celltype.l2']='' D1[f'{col}_predicted.celltype.l2']=Mappings[col] diff --git a/bin/scanpy_merge_from_h5ad.py b/bin/scanpy_merge_from_h5ad.py index 8a42900..150a439 100755 --- a/bin/scanpy_merge_from_h5ad.py +++ b/bin/scanpy_merge_from_h5ad.py @@ -16,6 +16,9 @@ import pandas as pd import scanpy as sc + + + # Set seed for reproducibility seed_value = 0 y_n_print=False @@ -479,7 +482,6 @@ def scanpy_merge( metadata, metadata_key, output_file, - params_dict=dict(), cellmetadata_filepaths=None, anndata_compression_opts=4, extra_metadata=None @@ -505,14 +507,7 @@ def scanpy_merge( output_file : string output_file """ - # Get compression opts for pandas - compression_opts = 'gzip' - if LooseVersion(pd.__version__) > '1.0.0': - compression_opts = dict( - method='gzip', - compresslevel=anndata_compression_opts - ) - + # check the h5ad_data # check for required columns h5ad_data_required_cols = set(['experiment_id', 'data_path_h5ad_format']) @@ -551,38 +546,11 @@ def scanpy_merge( ) ) - # Init default values for params_dict - params_filters_check = [ - 'cell_filters', - 'downsample_cells_fraction', - 'downsample_cells_n', - 'downsample_feature_counts' - ] - for i in params_filters_check: - if i not in params_dict: - if i != 'cell_filters': - params_dict[i] = {'value': ''} - # Check for validity of filters specified in params_dict - param_filters_check = [ - 'downsample_cells_fraction', - 'downsample_cells_n' - ] - if all(params_dict[k]['value'] != '' for k in param_filters_check): - raise Exception( - 'Error check the params. Both {} and {} are set.'.format( - 'downsample_cells_fraction', - 'downsample_cells_n' - ) - ) # Init a dictionary to record the original number of cells per sample and # the number of cells after each filter. n_cells_dict = {} - # If true, then filtered cells are dropped prior to merging the data. - # This will save disk space. 
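The guarded cast above stops the per-donor export from failing when the optional Azimuth-level QC column is absent. A small sketch of the same defensive pattern, with example column names and values:

import pandas as pd

obs = pd.DataFrame({
    'qc.filter.pass': [True, False, True],
    'cell_passes_hard_filters': [1, 0, 1],
})

# Cast each expected flag column to bool; a column that a particular run did not
# produce (here 'qc.filter.pass.AZ:L0') is skipped instead of raising.
for col in ['qc.filter.pass.AZ:L0', 'cell_passes_hard_filters', 'qc.filter.pass']:
    if col in obs.columns:
        obs[col] = obs[col].astype('bool')
print(obs.dtypes)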
- drop_filtered_cells = False - # Iterate over samples and load data adatasets = [] adatasets__experiment_ids = [] @@ -598,6 +566,7 @@ def scanpy_merge( # ivar_names='gene_ids', # make_unique=False ) + adata_orig_cols = list(adata.obs.columns) adata_orig_cols.append("donor") adata = check_adata(adata, row['experiment_id']) @@ -676,204 +645,6 @@ def scanpy_merge( for col in cellmetadata.columns: adata.obs[col] = cellmetadata.loc[adata.obs.index, col] - # Calculate basic qc metrics for this sample. - # NOTE: n_genes_by_counts == number of genes with > 0 counts - # adata.obs['n_genes'] = (adata.X > 0).sum(axis = 1) is same as - # adata.obs['n_genes_by_counts'] - vars_prior_metrics = adata.var_keys() - sc.pp.calculate_qc_metrics( - adata, - qc_vars=[ - 'gene_group__mito_transcript', - 'gene_group__mito_protein', - 'gene_group__ribo_protein', - 'gene_group__ribo_rna' - ], - inplace=True - ) - - # # Apply cell filter. - # # adata = adata[selected_cells, :] - # # Apply gene filter - # # adata = adata[:, selected_genes] - - # # Apply cell QC filters. - # adata.obs['cell_passes_qc'] = True - # filters_all_samples = [] - # filters_experiment = [] - # if 'cell_filters' not in params_dict: - # warnings.warn('Found no cell_filters in params_dict.') - # else: - # if 'all_samples' in params_dict['cell_filters'].keys(): - # # NOTE: we want this to throw an error if value is not there. - # filters_all_samples = params_dict['cell_filters'][ - # 'all_samples' - # ]['value'] - # if row['experiment_id'] in params_dict['cell_filters'].keys(): - # filters_experiment = params_dict['cell_filters'][ - # row['experiment_id'] - # ]['value'] - - # # First record the total number of cells that pass each filter - # # independently i.e., not depenedent on any other filter. - # if len(filters_all_samples) > 0: - # for filter_query in filters_all_samples: - # if filter_query != '': - # n_cells_dict[row['experiment_id']][ - # 'filter__all_samples {}'.format(filter_query) - # ] = adata.n_obs - adata.obs.query(filter_query).shape[0] - # if len(filters_experiment) > 0: - # for filter_query in filters_experiment: - # # NOTE: could add if test here to grab filter_query == - # # file_cellids_filter or file_cellids_keep - # if filter_query != '': - # n_cells_dict[row['experiment_id']][ - # 'filter__sample_specific {}'.format( - # filter_query - # ) - # ] = adata.n_obs - adata.obs.query(filter_query).shape[0] - - # # Now apply the filters - first apply the filters for all samples. - # n_cells_start = adata.n_obs - # filter_i = 0 - # if len(filters_all_samples) > 0: - # # Run each filter iteratively. - # for filter_query in filters_all_samples: - # if filter_query != '': - # # Drop the cells that are flagged in this query - # cells_to_remove = adata.obs.query(filter_query).index - # adata.obs.loc[cells_to_remove, 'cell_passes_qc'] = False - # # adata = adata[ - # # np.invert(adata.obs.index.isin(cells_to_remove)), - # # : - # # ] - # if y_n_print: - # print('[{}] {} "{}": {} dropped {} remain'.format( - # 'all sample cell QC applied', - # row['experiment_id'], - # filter_query, - # len(cells_to_remove), - # adata.obs['cell_passes_qc'].sum() - # )) - # n_cells_dict[row['experiment_id']][ - # 'filter__all_samples after_filter_{} {}'.format( - # filter_i, - # filter_query - # ) - # ] = adata.obs['cell_passes_qc'].sum() - # filter_i += 1 - - # # Now apply per sample filters. - # if len(filters_experiment) > 0: - # # Run each filter iteratively. 
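The commented-out block above flagged failing cells with pandas-style query strings rather than dropping them. A self-contained sketch of that flagging pattern; the filter expressions and values are examples only:

import anndata
import numpy as np
import pandas as pd

adata = anndata.AnnData(
    X=np.random.default_rng(0).poisson(1.0, size=(5, 3)).astype(float),
    obs=pd.DataFrame({
        'pct_counts_gene_group__mito_transcript': [1.0, 2.0, 35.0, 4.0, 80.0],
        'n_genes_by_counts': [900, 1200, 150, 2000, 100],
    }),
)

adata.obs['cell_passes_qc'] = True
# Each query describes cells that should FAIL; failing cells are flagged, not removed.
for filter_query in ['pct_counts_gene_group__mito_transcript > 20',
                     'n_genes_by_counts < 200']:
    cells_to_remove = adata.obs.query(filter_query).index
    adata.obs.loc[cells_to_remove, 'cell_passes_qc'] = False

print(int(adata.obs['cell_passes_qc'].sum()), 'cells still pass QC')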
- # for filter_query in filters_experiment: - # if filter_query != '': - # cells_to_remove = adata.obs.query(filter_query).index - # adata.obs.loc[cells_to_remove, 'cell_passes_qc'] = False - # # adata = adata[ - # # np.invert(adata.obs.index.isin(cells_to_remove)), - # # : - # # ] - # if y_n_print: - # print('[{}] {} "{}": {} dropped {} remain'.format( - # 'sample specific cell QC applied', - # row['experiment_id'], - # filter_query, - # len(cells_to_remove), - # adata.obs['cell_passes_qc'].sum() - # )) - # n_cells_dict[row['experiment_id']][ - # 'filter__sample_specific after_filter_{} {}'.format( - # filter_i, - # filter_query - # ) - # ] = adata.obs['cell_passes_qc'].sum() - # filter_i += 1 - - # # Write the number of cells filtered to standard out. - # if y_n_print: - # print('[{}] after all cell QC: {} dropped {} remain'.format( - # row['experiment_id'], - # n_cells_start - adata.obs['cell_passes_qc'].sum(), - # adata.obs['cell_passes_qc'].sum() - # )) - - # # Apply cell downsampling if needed. - # if params_dict['downsample_cells_fraction']['value'] != '': - # n_cells_start = adata.n_obs - # sc.pp.subsample( - # adata, - # fraction=float( - # params_dict['downsample_cells_fraction']['value'] - # ), - # copy=False, - # random_state=0 - # ) - # n_cells_dict[ - # row['experiment_id'] - # ]['downsample_cells_fraction'] = adata.n_obs - # if y_n_print: - # print('[{}] cell downsample applied: {} dropped {} remain'.format( - # row['experiment_id'], - # n_cells_start - adata.n_obs, - # adata.n_obs - # )) - # elif params_dict['downsample_cells_n']['value'] != '': - # n_cells_start = adata.n_obs - # sc.pp.subsample( - # adata, - # n_obs=int(params_dict['downsample_cells_n']['value']), - # copy=False, - # random_state=0 - # ) - # n_cells_dict[ - # row['experiment_id'] - # ]['downsample_cells_n'] = adata.n_obs - # if y_n_print: - # print('[{}] cell downsample applied: {} dropped {} remain'.format( - # row['experiment_id'], - # n_cells_start - adata.n_obs, - # adata.n_obs - # )) - # # Apply count downsampling if needed. - # if params_dict['downsample_feature_counts']['value'] != '': - # fraction = params_dict['downsample_feature_counts']['value'] - # target_counts_per_cell = adata.obs['total_counts'].apply( - # lambda x: int(x * fraction) - # ).values - # sc.pp.downsample_counts( - # adata, - # counts_per_cell=target_counts_per_cell, - # random_state=0 - # ) - - # # Print the number of cells and genes for this sample. - # n_cells_dict[row['experiment_id']]['after_filters'] = adata.obs[ - # 'cell_passes_qc' - # ].sum() - - # if y_n_print: - # print('[{}] {} obs (cells), {} var (genes)'.format( - # row['experiment_id'], - # adata.obs['cell_passes_qc'].sum(), - # adata.n_vars - # )) - - # # Comment code below to keep the vars (gene) output from - # # calculate_qc_metrics *per sample*. If we do this, then in - # # adata_merged.var, we will have duplicated # measures according to - # # each sample (e.g., n_cells_by_counts-0, # n_cells_by_counts-1, - # # n_cells_by_counts-3). - # # - # # Code below removes such output. - # adata.var = adata.var[vars_prior_metrics] - - # # Only keep cells that pass QC - # if drop_filtered_cells: - # adata = adata[adata.obs['cell_passes_qc'], :] - # del adata.obs['cell_passes_qc'] - - # If we still have cells after filters, add to our list of data. if adata.n_obs > 0: adatasets.append(adata) adatasets__experiment_ids.append(row['experiment_id']) @@ -882,12 +653,7 @@ def scanpy_merge( raise Exception( 'Error invalid h5ad_data file. Missing coluns.' 
) - # OPTIONAL: one could convert the experiment_ids to hashes - # adatasets__experiment_ids_hash = [ - # hashlib.sha1(i.encode('utf-8')) for i in adatasets__experiment_ids - # ] - # Merge all of the data together. adata_merged = adatasets[0].concatenate( *adatasets[1:], batch_categories=adatasets__experiment_ids @@ -901,7 +667,7 @@ def scanpy_merge( adata_merged.obs['experiment_id'] = adata_merged.obs['experiment_id'].str.split('__donor').str[0] # Re-calculate basic qc metrics of var (genes) for the whole dataset. # NOTE: we are only changing adata.var - obs_prior = adata_merged.obs.copy() + # obs_prior = adata_merged.obs.copy() sc.pp.calculate_qc_metrics( adata_merged, qc_vars=[ @@ -912,53 +678,14 @@ def scanpy_merge( ], inplace=True ) - adata_merged.obs = obs_prior - - # Possible additional basic filtering on the full dataset. - # sc.pp.filter_cells(adata, min_genes=200) - # sc.pp.filter_genes(adata, min_cells=1) - # if y_n_print: - # print('[adata_merged] {} obs, {} vars'.format( - # adata_merged.n_obs, - # adata_merged.n_vars - # )) - - # # Merge info on cell filters - # n_cells_df = pd.DataFrame(n_cells_dict) - # n_cells_df = n_cells_df.transpose() - # n_cells_df['experiment_id'] = n_cells_df.index - # n_cells_df = n_cells_df.melt( - # id_vars=['experiment_id'], - # var_name='filter_type', - # value_name='n_cells_left_in_adata' - # ) - # # Drop rows with no value for n_cells_left_in_adata. This will happen for - # # per sample filters. - # n_cells_df = n_cells_df.dropna(subset=['n_cells_left_in_adata']) - # adata_merged.uns['cell_filtered_per_experiment'] = n_cells_df - # adata_merged.uns['cell_filtered_per_experiment_dict'] = n_cells_dict - - # Save the adata matrix - # output_file = output_dir + "/adata" + # adata_merged.obs = obs_prior + adata_merged.obs['log10_ngenes_by_count'] = np.log10(adata_merged.obs['n_genes_by_counts']) / np.log10(adata_merged.obs['total_counts']) adata_merged.write( '{}.h5ad'.format(output_file), compression='gzip', compression_opts=anndata_compression_opts ) - # adata_merged.write_csvs(output_file) - # adata_merged.write_loom(output_file+".loom") - # n_cells_df.to_csv( - # '{}-cell_filtered_per_experiment.tsv.gz'.format(output_file), - # sep='\t', - # index=False, - # quoting=csv.QUOTE_NONNUMERIC, - # # index_label='cell_barcode', - # na_rep='', - # compression=compression_opts - # ) - - return(output_file) def main(): @@ -1085,23 +812,6 @@ def main(): # Scanpy settings sc.settings.figdir = os.getcwd() # figure output directory to match base. sc.settings.n_jobs = options.ncpu # number CPUs - # sc.settings.max_memory = 500 # in Gb - # sc.set_figure_params(dpi_save = 300) - - # NOTE: - # - Could change yaml params file to include a list per sample and the - # filters one wants to use for that sample - # - Could allow the input to be either h5ad dir or AnnData/loom object. - # Use AnnData/loom object would be useful if we add doublet scores - # or other scores prior to filtering. - - # Read in the parameters for downsampling and cell filters. - if options.pyml == '': - params_dict = {} - else: - with open(options.pyml, 'r') as f: - params_dict = yaml.safe_load(f) - params_dict = params_dict['sample_qc'] # Load a file of the samples to analyse h5ad_data = pd.read_csv(options.txd, sep='\t') @@ -1151,17 +861,16 @@ def main(): ) # Run the merge function. 
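The new log10_ngenes_by_count column added above is just the ratio of the log-scale gene and UMI counts; a tiny worked example:

import numpy as np
import pandas as pd

obs = pd.DataFrame({'n_genes_by_counts': [1500, 300], 'total_counts': [8000, 450]})
# Ratio of log10(genes detected) to log10(total UMIs), as stored in adata_merged.obs above;
# values nearer 1 mean many genes detected relative to library size.
obs['log10_ngenes_by_count'] = np.log10(obs['n_genes_by_counts']) / np.log10(obs['total_counts'])
print(obs)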
- out_file = scanpy_merge( + scanpy_merge( h5ad_data, metadata, metadata_key=options.mk, output_file=options.of, - params_dict=params_dict, cellmetadata_filepaths=cellmetadata_filepaths, anndata_compression_opts=options.anndata_compression_opts, extra_metadata= extra_metadata ) - print(out_file) + if __name__ == '__main__': diff --git a/bin/scanpy_split_h5ad.py b/bin/scanpy_split_h5ad.py index 207d7c1..71faf88 100755 --- a/bin/scanpy_split_h5ad.py +++ b/bin/scanpy_split_h5ad.py @@ -37,32 +37,32 @@ def write_h5_out_for_ct(ad,oufn_list_AZ,oufnam,oufn_list,samples,samples_AZ,bl,c # disable adb_AZ.obs = pandas.DataFrame(adb_AZ.obs.index, index = adb_AZ.obs.index, columns = ["cell_barcode"]) - adb.layers['counts'] = adb.X.copy() + # adb.layers['counts'] = adb.X.copy() - # Total-count normalize (library-size correct) the data matrix X to - # counts per million, so that counts become comparable among cells. - scanpy.pp.normalize_total( - adb, - target_sum=1e4, - exclude_highly_expressed=False, - key_added='normalization_factor', # add to adata.obs - inplace=True - ) - # Logarithmize the data: X = log(X + 1) where log = natural logorithm. - # Numpy has a nice function to undo this np.expm1(adata.X). - scanpy.pp.log1p(adb) - # Delete automatically added uns - UPDATE: bad idea to delete as this slot - # is used in _highly_variable_genes_single_batch. - # del adata.uns['log1p'] - # Add record of this operation. - # adata.layers['log1p_cpm'] = adata.X.copy() - # adata.uns['log1p_cpm'] = {'transformation': 'ln(CPM+1)'} - adb.layers['log1p_cp10k'] = adb.X.copy() - adb.uns['log1p_cp10k'] = {'transformation': 'ln(CP10k+1)'} + # # Total-count normalize (library-size correct) the data matrix X to + # # counts per million, so that counts become comparable among cells. + # scanpy.pp.normalize_total( + # adb, + # target_sum=1e4, + # exclude_highly_expressed=False, + # key_added='normalization_factor', # add to adata.obs + # inplace=True + # ) + # # Logarithmize the data: X = log(X + 1) where log = natural logorithm. + # # Numpy has a nice function to undo this np.expm1(adata.X). + # scanpy.pp.log1p(adb) + # # Delete automatically added uns - UPDATE: bad idea to delete as this slot + # # is used in _highly_variable_genes_single_batch. + # # del adata.uns['log1p'] + # # Add record of this operation. + # # adata.layers['log1p_cpm'] = adata.X.copy() + # # adata.uns['log1p_cpm'] = {'transformation': 'ln(CPM+1)'} + # adb.layers['log1p_cp10k'] = adb.X.copy() + # adb.uns['log1p_cp10k'] = {'transformation': 'ln(CP10k+1)'} - # Reset X to counts - adb.X = adb.layers['counts'].copy() - del adb.layers["counts"] #Since counts are set as an X we dont need it as part of the andata - this saves space. + # # Reset X to counts + # adb.X = adb.layers['counts'].copy() + # del adb.layers["counts"] #Since counts are set as an X we dont need it as part of the andata - this saves space. try: vdf = adb_AZ.var[["feature_types", "genome"]] diff --git a/conf/base.conf b/conf/base.conf index 98bb86d..78b2b3f 100755 --- a/conf/base.conf +++ b/conf/base.conf @@ -18,6 +18,7 @@ params{ //# estimate_and_provide_informative_snps_for_deconvolution=false perform_concordance_calculations = false filter_outliers = true + remap_celltypes = false //# These are default parameters that can be overwriten to run in a different mode. //# Here we have listed the default parameters when running without any extrainput. 
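For reference, the normalisation block disabled in scanpy_split_h5ad.py above follows the usual scanpy pattern of stashing raw counts in a layer, writing a ln(CP10k+1) layer, and restoring counts to X. A condensed sketch of that pattern on synthetic data:

import anndata
import numpy as np
import scanpy as sc

adata = anndata.AnnData(np.random.default_rng(0).poisson(1.0, size=(50, 20)).astype('float32'))
adata.layers['counts'] = adata.X.copy()        # keep raw counts to one side
sc.pp.normalize_total(adata, target_sum=1e4)   # library-size correct to counts per 10k
sc.pp.log1p(adata)                             # X becomes ln(CP10k + 1)
adata.layers['log1p_cp10k'] = adata.X.copy()
adata.uns['log1p_cp10k'] = {'transformation': 'ln(CP10k+1)'}
adata.X = adata.layers['counts'].copy()        # reset X to raw counts for downstream tools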
    tmpdir = "${launchDir}/work"
diff --git a/conf/modules.conf b/conf/modules.conf
index bcb6f3a..6350b62 100755
--- a/conf/modules.conf
+++ b/conf/modules.conf
@@ -68,7 +68,7 @@ process {
     withName: DOUBLET_DECON{
         maxRetries = 3
         errorStrategy = { task.attempt > 2 ? 'ignore' : 'retry' }
-        memory = { 100.GB * task.attempt }
+        memory = { 150.GB * task.attempt }
     }
 
     withName: AZIMUTH{
@@ -235,9 +235,9 @@ process {
     }
 
     withName: CONCORDANCE_CALCLULATIONS{
-        cpus = 20
+        cpus = 10
         time   = { 24.h   * task.attempt }
-        memory = { 100.GB * task.attempt }
+        memory = { 80.GB * task.attempt }
     }
 
     withName: OTHER_DONOR_CONCORDANCE_CALCLULATIONS{
@@ -280,7 +280,7 @@ process {
     }
 
     withName: umap_gather{
-        memory = { 100.GB * task.attempt }
+        memory = { 200.GB * task.attempt }
         errorStrategy = 'retry'
         maxRetries = 3
     }
diff --git a/conf/qc.conf b/conf/qc.conf
index 1e72efb..e31c42c 100755
--- a/conf/qc.conf
+++ b/conf/qc.conf
@@ -179,13 +179,13 @@ params{
             run_process = true
             method = 'IsolationForest' //# Available methods: ocalOutlierFactor, IsolationForest, EllipticEnvelope, OneClassSVM, onesidemad
             metadata_columns = 'log1p_total_counts,log1p_n_genes_by_counts,pct_counts_gene_group__mito_transcript'
-            mad_tresholds = '-5,-5,5' //# These one side MAD filters will be used if selected.
+            mad_tresholds = '-5,-5,5' //# These one-sided MAD thresholds are used when the MAD method is selected, one per metadata column; a [-] prefix filters the left tail of the distribution, a [+] prefix the right tail.
             outliers_fraction = 0.0
             max_samples = 0.1
         }
     }
-    outlier_filtering_strategy = 'Azimuth:L0_predicted.celltype.l2;all_together' //the outliers may be filterered per strategy type
+    outlier_filtering_strategy = 'Azimuth:L0_predicted.celltype.l2,all_together' //the outliers may be filtered per strategy type
    gt_match_based_adaptive_qc_exclusion_pattern = '' // #We run the adaptive QC on these patterns independently regardless on assigned celltype.
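Assuming the onesidemad estimator applies a per-column, one-sided MAD cut (its implementation is not shown in this patch), a sketch of how thresholds such as '-5,-5,5' translate into outlier flags:

import numpy as np

def one_sided_mad_flags(X, thresholds):
    # Sketch only: a negative threshold flags cells in the left tail of a metric,
    # a positive threshold flags cells in the right tail, using median + t * MAD.
    X = np.asarray(X, dtype=float)
    med = np.median(X, axis=0)
    mad = np.median(np.abs(X - med), axis=0)
    fails = np.zeros(X.shape, dtype=bool)
    for j, t in enumerate(thresholds):
        cutoff = med[j] + t * mad[j]
        fails[:, j] = X[:, j] < cutoff if t < 0 else X[:, j] > cutoff
    return fails.any(axis=1)   # True where a cell fails at least one metric

# '-5,-5,5' as in mad_tresholds: left tail for log counts/genes, right tail for % mito.
rng = np.random.default_rng(0)
flags = one_sided_mad_flags(rng.normal(size=(500, 3)), [-5, -5, 5])
print(int(flags.sum()), 'cells flagged')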
downsample_cells_fraction{ diff --git a/modules/nf-core/modules/azimuth/main.nf b/modules/nf-core/modules/azimuth/main.nf index e50bcf9..4edf5b0 100755 --- a/modules/nf-core/modules/azimuth/main.nf +++ b/modules/nf-core/modules/azimuth/main.nf @@ -63,28 +63,21 @@ process REMAP_AZIMUTH{ container "wtsihgi/nf_scrna_qc_azimuth:d54db9b" } - publishDir path: "${params.outdir}/celltype/azimuth/", + publishDir path: "${params.outdir}/celltype/", mode: "${params.copy_mode}", overwrite: "true" stageInMode 'copy' input: - tuple val(outfil_prfx),val(name) path(azimuth_file) + path(file) path(mapping_file) - when: - name=='PBMC' + output: - path('azimuth/*', emit:predicted_celltype_labels) + path("remapped__${file}", emit:predicted_celltype_labels) script: - celltype_table = "remapped__${azimuth_file}" """ - remap_azimuth_l2.py -of remapped__predicted_celltype_l2.tsv -m ${mapping_file} -az predicted_celltype_l2.tsv - mkdir azimuth - - mv predicted_celltype_l1.tsv azimuth/${outfil_prfx}__predicted_celltype_l1.tsv - mv predicted_celltype_l3.tsv azimuth/${outfil_prfx}__predicted_celltype_l3.tsv - mv remapped__predicted_celltype_l2.tsv azimuth/${outfil_prfx}__predicted_celltype_l2.tsv + remap_azimuth_l2.py -of remapped__${file} -m ${mapping_file} -az ${file} """ } diff --git a/modules/nf-core/modules/citeseq/main.nf b/modules/nf-core/modules/citeseq/main.nf index 272029a..60e1680 100755 --- a/modules/nf-core/modules/citeseq/main.nf +++ b/modules/nf-core/modules/citeseq/main.nf @@ -7,16 +7,9 @@ process SPLIT_CITESEQ_GEX { container "wtsihgi/nf_scrna_qc:6bb6af5" } - publishDir path: "${params.outdir}/citeseq/${mode}/${sample_name}", - // saveAs: {filename -> - // if (filename.contains("antibody-")) { - // filename.replaceAll("antibody-", "${mode}_antibody-") - // }else { - // null - // } - // }, - mode: "${params.copy_mode}", - overwrite: "true" + publishDir path: "${params.outdir}/data_modalities_split/${mode}/${sample_name}", + mode: "${params.copy_mode}", + overwrite: "true" input: tuple val(sample_name),path(cellranger_raw) @@ -32,7 +25,7 @@ process SPLIT_CITESEQ_GEX { script: """ - + strip_citeseq.py --raw_data ${cellranger_raw} -o ${sample_name} """ } diff --git a/modules/nf-core/modules/outlier_filter/main.nf b/modules/nf-core/modules/outlier_filter/main.nf index 5dcf36c..913e79b 100755 --- a/modules/nf-core/modules/outlier_filter/main.nf +++ b/modules/nf-core/modules/outlier_filter/main.nf @@ -56,7 +56,7 @@ process OUTLIER_FILTER { script: if(gt_match_based_adaptive_qc_exclusion_pattern!=''){ - filter_strategy_exclusion = "--patterns_exclude '${gt_match_based_adaptive_qc_exclusion_pattern}' --gt_match_file ${gt_outlier_input}" + filter_strategy_exclusion = "--patterns_exclude='${gt_match_based_adaptive_qc_exclusion_pattern}' --gt_match_file ${gt_outlier_input}" }else{ filter_strategy_exclusion = "" } @@ -77,7 +77,8 @@ process OUTLIER_FILTER { --max_samples ${max_samples} \ --output_file ${outfile} \ --anndata_compression_opts ${anndata_compression_opts} \ - --filter_strategy '${outlier_filtering_strategy}' \ + --filter_strategy='${outlier_filtering_strategy}' \ + --MAD_thresholds='${params.sample_qc.cell_filters.filter_outliers.mad_tresholds}' \ ${filter_strategy_exclusion} mkdir plots diff --git a/modules/nf-core/modules/vireo/main.nf b/modules/nf-core/modules/vireo/main.nf index 5a2f3e5..107db60 100755 --- a/modules/nf-core/modules/vireo/main.nf +++ b/modules/nf-core/modules/vireo/main.nf @@ -213,7 +213,7 @@ process VIREO { output: path("vireo_${samplename}"), emit: output_dir - // tuple 
val(samplename), path("vireo_${samplename}"), emit: output_dir + tuple val(samplename), path("vireo_${samplename}"), emit: output_dir_subsampling tuple val(samplename), path("vireo_${samplename}/donor_ids.tsv"), emit: sample_donor_ids tuple val(samplename), path("vireo_${samplename}/GT_donors.vireo.vcf.gz"), path(vcf_file),path(donor_gt_csi), emit: sample_donor_vcf tuple val(samplename), path("vireo_${samplename}/GT_donors.vireo.vcf.gz"), emit: infered_vcf diff --git a/subworkflows/celltype.nf b/subworkflows/celltype.nf index 4e611ad..4f775c0 100755 --- a/subworkflows/celltype.nf +++ b/subworkflows/celltype.nf @@ -6,6 +6,7 @@ include {KERAS_CELLTYPE} from "$projectDir/modules/nf-core/modules/keras_celltyp include {CELLTYPE_FILE_MERGE} from "$projectDir/modules/nf-core/modules/cell_type_assignment/functions" include {SCPRED} from "$projectDir/modules/nf-core/modules/scpred/main" include { DSB } from '../modules/nf-core/modules/citeseq/main' + workflow celltype{ take: @@ -24,18 +25,19 @@ workflow celltype{ SPLIT_BATCH_H5AD.out.sample_file .splitCsv(header: true, sep: "\t", by: 1) .map{row -> tuple(row.experiment_id, file(row.h5ad_filepath))}.set{ch_experiment_filth5} + SPLIT_BATCH_H5AD.out.az_sample_file .splitCsv(header: true, sep: "\t", by: 1) .map{row -> tuple(row.experiment_id, file(row.h5ad_filepath))}.set{az_ch_experiment_filth5} SPLIT_BATCH_H5AD.out.files_anndata_batch.flatMap().set{ch_batch_files} - + SPLIT_BATCH_H5AD.out.keras_outfile.collect().set{keras_files} // Keras celltype assignemt if (params.celltype_assignment.run_keras){ KERAS_CELLTYPE(ch_experiment_filth5,params.celltype_prediction.keras.keras_model,params.celltype_prediction.keras.keras_weights_df) - all_extra_fields = KERAS_CELLTYPE.out.predicted_celltype_labels.collect() - all_extra_fields = all_extra_fields.ifEmpty(Channel.from("$projectDir/assets/fake_file.fq")) + all_extra_fields3 = KERAS_CELLTYPE.out.predicted_celltype_labels.collect() + all_extra_fields = all_extra_fields3.ifEmpty(Channel.from("$projectDir/assets/fake_file.fq")) }else{ all_extra_fields = Channel.from("$projectDir/assets/fake_file.fq") } @@ -56,8 +58,8 @@ workflow celltype{ Channel.fromList(params.celltypist.models) .set{ch_celltypist_models} CELLTYPIST(az_ch_experiment_filth5.combine(ch_celltypist_models)) - ct_out = CELLTYPIST.out.predicted_labels.collect() - ct_out = ct_out.ifEmpty(Channel.from("$projectDir/assets/fake_file2.fq")) + ct_out2 = CELLTYPIST.out.predicted_labels.collect() + ct_out = ct_out2.ifEmpty(Channel.from("$projectDir/assets/fake_file2.fq")) }else{ ct_out = Channel.from("$projectDir/assets/fake_file2.fq") } @@ -65,16 +67,21 @@ workflow celltype{ // // SCPRED if (params.celltype_assignment.run_scpred){ SCPRED(params.outdir,ch_batch_files) - sc_out = SCPRED.out.predicted_celltype_labels.collect() - sc_out = sc_out.ifEmpty(Channel.of()) + sc_out2 = SCPRED.out.predicted_celltype_labels.collect() + sc_out = sc_out2.ifEmpty(Channel.of()) }else{ sc_out = Channel.of() } all_extra_fields2 = all_extra_fields.mix(sc_out) - CELLTYPE_FILE_MERGE(az_out,ct_out,all_extra_fields2,SPLIT_BATCH_H5AD.out.keras_outfile.collect()) + CELLTYPE_FILE_MERGE(az_out,ct_out,all_extra_fields2,keras_files) + + if (params.remap_celltypes){ + REMAP_AZIMUTH(CELLTYPE_FILE_MERGE.out.celltype_assignments,params.mapping_file) + file__anndata_merged2 = REMAP_AZIMUTH.out.predicted_celltype_labels.collect() + } + file__anndata_merged2=CELLTYPE_FILE_MERGE.out.file__anndata_merged2 - file__anndata_merged2.view() emit: file__anndata_merged2 diff --git 
a/subworkflows/main_deconvolution.nf b/subworkflows/main_deconvolution.nf index 25828bf..5d0dabf 100755 --- a/subworkflows/main_deconvolution.nf +++ b/subworkflows/main_deconvolution.nf @@ -320,9 +320,9 @@ workflow main_deconvolution { if (params.genotype_input.run_with_genotype_input) { if (params.do_vireo_subsampling){ VIREO_SUBSAMPLING(vireo_extra_repeats) - VIREO_SUBSAMPLING.out.output_dir.concat(VIREO.out.output_dir).set{tuple_1} - // tuple_1.groupTuple(by:0).set{vspp0} - // VIREO_SUBSAMPLING_PROCESSING(vspp0) + VIREO_SUBSAMPLING.out.output_dir.concat(VIREO.out.output_dir_subsampling).set{tuple_1} + tuple_1.groupTuple(by:0).set{vspp0} + VIREO_SUBSAMPLING_PROCESSING(vspp0) // VIREO_SUBSAMPLING.out.all_required_data.set{replacement_input_sub} // // replacement_input_sub.combine(vireo_with_gt).set{vir_repl_input} // // REPLACE_GT_DONOR_ID_SUBS(vir_repl_input) @@ -331,7 +331,7 @@ workflow main_deconvolution { // // .combine(ch_ref_vcf).set { gt_math_pool_against_panel_input_subs } // MATCH_GT_VIREO(gt_math_pool_against_panel_input_subs) - // subsampling_donor_swap = VIREO_SUBSAMPLING_PROCESSING.out.subsampling_donor_swap + subsampling_donor_swap = VIREO_SUBSAMPLING_PROCESSING.out.subsampling_donor_swap }else{ subsampling_donor_swap = Channel.from("$projectDir/assets/fake_file.fq") } diff --git a/subworkflows/qc.nf b/subworkflows/qc.nf index b4110ee..f849bdb 100755 --- a/subworkflows/qc.nf +++ b/subworkflows/qc.nf @@ -58,7 +58,7 @@ workflow qc { params.sample_qc.outlier_filtering_strategy ) file__anndata_merged = OUTLIER_FILTER.out.anndata - file__cells_filtered = OUTLIER_FILTER.out.cells_filtered + // file__cells_filtered = OUTLIER_FILTER.out.cells_filtered } diff --git a/workflows/yascp.nf b/workflows/yascp.nf index 22774af..07511e0 100755 --- a/workflows/yascp.nf +++ b/workflows/yascp.nf @@ -94,8 +94,6 @@ workflow YASCP { channel__file_paths_10x=SPLIT_CITESEQ_GEX_FILTERED.out.channel__file_paths_10x channel__file_paths_10x_single=SPLIT_CITESEQ_GEX_FILTERED.out.gex_data ch_experiment_filth5 = SPLIT_CITESEQ_GEX.out.gex_data - }else{ - ab_data = Channel.of() } // Either run ambient RNA removal with cellbender or use cellranger filtered reads (cellbender|cellranger) @@ -191,7 +189,6 @@ workflow YASCP { ? Channel.fromPath("${params.outdir}/deconvolution/vireo/*/vireo_*", checkIfExists:true, type: 'dir') : Channel.fromPath("${launchDir}/${params.outdir}/deconvolution/vireo/*/vireo_*", type: 'dir') - GENOTYPE_MATCHER(vireo_paths.collect()) matched_donors = GENOTYPE_MATCHER.out.matched_donors matched_donors.subscribe { println "matched_donors: $it" } From 9a45feac6527e042ae0771fc49355bcabd8f0f00 Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Sat, 11 May 2024 08:07:40 +0100 Subject: [PATCH 7/8] fixed the index error --- bin/generate_combined_celltype_anotation_file.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/bin/generate_combined_celltype_anotation_file.py b/bin/generate_combined_celltype_anotation_file.py index af48e41..b7ab69d 100755 --- a/bin/generate_combined_celltype_anotation_file.py +++ b/bin/generate_combined_celltype_anotation_file.py @@ -127,13 +127,13 @@ def main(): adata1 = scanpy.read_h5ad(ad1) if adata1.n_obs > 0: adatasets.append(adata1) - ad = adatasets[0].concatenate(*adatasets[1:]) - if(len(adatasets)>1): - # in this case the concentration adds a -1 -2 -3 to index that has to be removed. 
- all_index = pd.DataFrame(ad.obs.index,columns=['col']) - all_indexes = all_index['col'].str.split('-') - all_together = all_indexes.str[0]+'-'+all_indexes.str[1]+'-'+all_indexes.str[2] - ad.obs.set_index(all_together, inplace=True) + ad = adatasets[0].concatenate(*adatasets[1:],index_unique=None) + # if(len(adatasets)>1): + # # in this case the concentration adds a -1 -2 -3 to index that has to be removed. + # all_index = pd.DataFrame(ad.obs.index,columns=['col']) + # all_indexes = all_index['col'].str.split('-') + # all_together = all_indexes.str[0]+'-'+all_indexes.str[1]+'-'+all_indexes.str[2] + # ad.obs.set_index(all_together, inplace=True) # ad2 = adatasets2[0].concatenate(*adatasets2[1:]) # ad = scanpy.read(adata) From ac9e8c6be3e868a1ece0787fb87aa2cef2e39508 Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Fri, 24 May 2024 11:28:35 +0100 Subject: [PATCH 8/8] ensuring all in place --- bin/0035-scanpy_normalize_pca.py | 2 +- bin/2.integrate.R | 5 ++ bin/add_adt.R | 17 ++++--- bin/gather_minimal_dataset.py | 86 +++++++++++--------------------- bin/transfer_data.py | 30 ++++++++++- 5 files changed, 73 insertions(+), 67 deletions(-) diff --git a/bin/0035-scanpy_normalize_pca.py b/bin/0035-scanpy_normalize_pca.py index db1b551..6306556 100755 --- a/bin/0035-scanpy_normalize_pca.py +++ b/bin/0035-scanpy_normalize_pca.py @@ -543,7 +543,7 @@ def scanpy_normalize_and_pca( copy=False ) - + print('---- Writting files ----') # Keep a record of the different gene scores if score_genes_df is not None: adata.uns['df_score_genes'] = score_genes_df_updated diff --git a/bin/2.integrate.R b/bin/2.integrate.R index a9f3082..f072cdd 100755 --- a/bin/2.integrate.R +++ b/bin/2.integrate.R @@ -169,6 +169,11 @@ for(f in cite_files){ # Add the vireo donor assignment to the seurat object this_donor_cells <- donor_cells[donor_cells$sample==sample_id,] + + if (dim(this_donor_cells)[1]==0){ + # Here we skip non deconvoluted samples + next + } sobj@meta.data$donor.vireo <- this_donor_cells[match(paste0(rownames(sobj@meta.data),'_',sample_name), paste0(this_donor_cells$cell, '_',this_donor_cells$sample)),]$matched.donor diff --git a/bin/add_adt.R b/bin/add_adt.R index 199bd2a..6f3903f 100755 --- a/bin/add_adt.R +++ b/bin/add_adt.R @@ -24,7 +24,7 @@ if (future::supportsMulticore()) { } else { future::plan(future::multisession) } -# args=vector(mode='list', length=6); args[[1]]='cellranger700_multi_abb2ba0911f1f4bde982a4c38d9fd6ed'; args[[2]]='raw_feature_bc_matrix'; args[[3]]='filtered_feature_bc_matrix'; args[[4]]='cellranger700_multi_abb2ba0911f1f4bde982a4c38d9fd6ed___sample_QCd_adata.h5ad' +# args=vector(mode='list', length=6); args[[1]]='LDP58'; args[[2]]='raw_feature_bc_matrix'; args[[3]]='filtered_feature_bc_matrix'; args[[4]]='LDP58___sample_QCd_adata.h5ad' ##### args = commandArgs(trailingOnly=TRUE) @@ -47,13 +47,13 @@ outdir <- getwd() # filtered_feature_file = cellranger_filepath = args[2] # # filtered_cellranger = '/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/jaguar_yascp/nieks_pipeline/fetch/results_old/cellranger_data/cellranger700_multi_bc45a1c2fe2a3fbbcde46cf984cf42e2/per_sample_outs/cellranger700_multi_bc45a1c2fe2a3fbbcde46cf984cf42e2/count/sample_filtered_feature_bc_matrix.h5' -# sample_name <- 'cellranger700_multi_74b30caec2cf83c0048bc87946b301e8' -# cellranger_rawfile_path <- 'raw_feature_bc_matrix' -# filtered_cellranger = 'filtered_feature_bc_matrix' -# file_with_qc_applied = 'cellranger700_multi_74b30caec2cf83c0048bc87946b301e8___sample_QCd_adata.h5ad' +sample_name <- 'LDP58' 
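The index fix above (PATCH 7/8) switches generate_combined_celltype_anotation_file.py to concatenate(..., index_unique=None) so barcodes keep their original names instead of gaining per-batch suffixes, making the manual index surgery now commented out unnecessary. A small sketch of the difference:

import anndata
import numpy as np
import pandas as pd

a = anndata.AnnData(np.ones((2, 2)), obs=pd.DataFrame(index=['AAACGG-1', 'TTTGCC-1']))
b = anndata.AnnData(np.ones((2, 2)), obs=pd.DataFrame(index=['CCCATT-1', 'GGGTAA-1']))

# Default concatenate appends a batch suffix ('AAACGG-1-0', ...); index_unique=None
# keeps the original barcodes, matching the assumption the fix above relies on
# (barcodes are already unique across the concatenated objects).
merged = a.concatenate(b, index_unique=None)
print(merged.obs_names.tolist())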
+cellranger_rawfile_path <- 'raw_feature_bc_matrix' +filtered_cellranger = 'filtered_feature_bc_matrix' +file_with_qc_applied = 'LDP58___sample_QCd_adata.h5ad' sample_name <- args[1] cellranger_rawfile_path <- args[2] -filtered_cellranger = args[3] +filtered_cellranger = args[3][1] file_with_qc_applied = args[4] # cellranger_rawfile_path = '/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/jaguar_yascp/nieks_pipeline/fetch/results_old/cellranger_data/cellranger700_multi_bc45a1c2fe2a3fbbcde46cf984cf42e2/multi/count/raw_feature_bc_matrix.h5' @@ -151,7 +151,9 @@ Convert( # qced_cells <- readRDS(scpred_file_with_qc) qced_cells <- LoadH5Seurat(paste('tmp',"h5seurat",sep='.'),assays = "RNA") qced_barcodes <- gsub('_.*','',colnames(qced_cells)) - qced_barcodes <- gsub('-cellranger.*','',colnames(qced_cells)) + qced_barcodes <- gsub('-1.*','-1',(qced_barcodes)) + qced_barcodes <- gsub('-2.*','-2',(qced_barcodes)) + qced_barcodes <- gsub('-cellranger.*','',(qced_barcodes)) qced_cells = RenameCells(qced_cells, new.names = qced_barcodes) genes <- read.table(paste0(cellranger_rawfile_path,"/features.tsv.gz"), sep = "\t", col.names = c("ENSG_ID","Gene_ID", "FeatureType")) @@ -183,6 +185,7 @@ Convert( # }else if(basename(f)=='sample_feature_bc_matrix.h5' || basename(f) == 'sample_filtered_feature_bc_matrix.h5'){ # with cellranger multi the raw file is 3 directories higher than filtered file, then in multi/count rna = raw <- Read10X(cellranger_rawfile_path) + print(paste0(' RAW data loaded ')) # }else{ # stop(paste('basename(f) should be sample_feature_bc_matrix.h5 or filtered_feature_bc_matrix.h5, but was:',basename(f))) # } diff --git a/bin/gather_minimal_dataset.py b/bin/gather_minimal_dataset.py index a921145..7c6620a 100755 --- a/bin/gather_minimal_dataset.py +++ b/bin/gather_minimal_dataset.py @@ -61,8 +61,8 @@ 'cell_passes_qc:score':'qc.filter.pass:score', 'cell_passes_qc-per:all_together::exclude':'qc.filter.pass.spikein_exclude', 'cell_passes_qc-per:all_together::exclude:score':'qc.filter.pass.spikein_exclude:score', - 'cell_passes_qc-per:Azimuth:L0_Azimuth:predicted.celltype.l2':'qc.filter.pass.AZ:L0', - 'cell_passes_qc-per:Azimuth:L0_Azimuth:predicted.celltype.l2:score':'qc.filter.pass.AZ:L0:score', + 'cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2':'qc.filter.pass.AZ:L0', + 'cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2:score':'qc.filter.pass.AZ:L0:score', 'total_counts': 'qc.umi.count.total', 'total_counts_gene_group__mito_transcript': 'qc.umi.count.mt', 'pct_counts_gene_group__mito_transcript': 'qc.umi.perc.mt', @@ -399,8 +399,7 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane #Cellranger datasets ###################### columns_output = {**COLUMNS_DATASET, **COLUMNS_DECONV, **COLUMNS_QC} - #Unfiltered - compression_opts = 'gzip' + #Reading unfiltered raw cellranger datasets try: adata_cellranger_raw = scanpy.read_10x_mtx(f"{df_raw.loc[expid, 'data_path_10x_format']}/raw_feature_bc_matrix") try: @@ -417,6 +416,7 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane except: print('File already linked') + # Reading filtered cellranger files try: adata_cellranger_filtered = scanpy.read_10x_mtx(f"{df_raw.loc[expid, 'data_path_10x_format']}/filtered_feature_bc_matrix") try: @@ -441,7 +441,6 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane df_total_counts['barcodes'] = df_total_counts.index df_total_counts_cellranger_raw = df_total_counts 
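The barcode clean-up added to bin/add_adt.R above normalises lane suffixes so ADT barcodes can be matched back to the QCd GEX barcodes; an illustrative Python equivalent of that gsub() chain:

import re

def clean_barcode(bc):
    # Trim anything after a '-1' / '-2' lane suffix and drop trailing
    # '-cellranger...' tags, mirroring the gsub() calls in add_adt.R.
    bc = re.sub(r'-1.*', '-1', bc)
    bc = re.sub(r'-2.*', '-2', bc)
    bc = re.sub(r'-cellranger.*', '', bc)
    return bc

print(clean_barcode('ACGTACGTACGT-1-cellranger700_multi_abc123'))  # -> ACGTACGTACGT-1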
df_total_counts_cellranger_raw['dataset']='Cellranger Raw' - wd = os.getcwd() if df_cellbender is not None and (len(df_cellbender)!=0): df_cellbender = df_cellbender.reset_index() @@ -449,16 +448,11 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane df_cellbender = df_cellbender.set_index('experiment_id') f=df_cellbender.loc[expid, 'data_path_10x_format'] if (type(f) == str): - print('yes') f=[f] for id in f: print(id) try: cell_bender_path = id - # try: - # cell_bender_path = f'{wd}/'+'/'.join(cell_bender_path.split('/')[-6:]) - # except: - # cell_bender_path = f"{args.results_dir}/{df_cellbender.loc[expid, 'data_path_10x_format']}" cellbender_h5 = f"{cell_bender_path}/../cellbender_FPR_{Resolution}_filtered.h5" ad_lane_filtered = scanpy.read_10x_mtx(cell_bender_path) print('loaded') @@ -471,30 +465,19 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane try: path2 = os.path.realpath(cellbender_h5) os.symlink(path2, f"./{outdir}/Cellbender_filtered_{Resolution}__{expid}.h5") - # Here link also mtx files except: print('File already linked') - dfcb = fetch_cellbender_annotation(cell_bender_path, expid,Resolution) columns_output = {**columns_output, **COLUMNS_CELLBENDER} else: ad_lane_filtered = scanpy.read_10x_mtx(f"{df_raw.loc[expid, 'data_path_10x_format']}/filtered_feature_bc_matrix") df_cellbender=None cell_bender_path=None - + # Removing Zero count cells from the matices zero_count_cells = ad_lane_filtered.obs_names[np.where(ad_lane_filtered.X.sum(axis=1) == 0)[0]] ad_lane_filtered = ad_lane_filtered[ad_lane_filtered.obs_names.difference(zero_count_cells, sort=False)] - - zero_count_cells = adata_cellranger_filtered.obs_names[np.where(adata_cellranger_filtered.X.sum(axis=1) == 0)[0]] adata_cellranger_filtered = adata_cellranger_filtered[adata_cellranger_filtered.obs_names.difference(zero_count_cells, sort=False)] - # adata_cellranger_filtered=ad_lane_filtered - # scanpy.pp.calculate_qc_metrics(adata_cellranger_filtered, inplace=True) - # df_total_counts = pd.DataFrame(data= adata_cellranger_filtered.obs.sort_values(by=['total_counts'], ascending=False).total_counts) - # df_total_counts['barcodes'] = df_total_counts.index - # df_total_counts['barcode_row_number'] = df_total_counts.reset_index().index + 1 - # df_total_counts_cellranger_filtered = df_total_counts - # df_total_counts_cellranger_filtered['dataset'] = 'Cellranger Filtered' ############# #Cellranger Metrics Datasheet @@ -513,6 +496,15 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane ########################## # Scrublet ######################### + doublet_data = glob.glob(f'{args.results_dir}/doublets/*.tsv') + doublet_data_combined = pd.DataFrame() + for f1 in doublet_data: + print(f1) + pool_name = f1.split('__')[0].split('/')[-1] + d2 = pd.read_csv(f1,sep='\t') + d2['Exp']=pool_name + doublet_data_combined = pd.concat([doublet_data_combined,d2]) + datadir_scrublet=glob.glob(f'{args.results_dir}/*/multiplet.method=scrublet')[0] if os.path.isdir(datadir_scrublet): # Scrublet loading QC @@ -532,15 +524,9 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane ############################################################ # Loading deconvoluted data including unassigned and doublets ########################################### - print(expid) obsqc,all_QC_lane = fetch_qc_obs_from_anndata(adqc, expid, cell_bender_path = cell_bender_path,Resolution=Resolution) - - # try: - # 
datadir_deconv=f'{args.results_dir}/deconvolution/split_donor_h5ad' - # donor_table = os.path.join(datadir_deconv, expid, "{}.donors.h5ad.tsv".format(expid)) - # df_donors = pandas.read_table(donor_table, header=None, names=("experiment_id", "donor_id", "file_path_h5ad")) - # except: + donor_tables=pd.DataFrame([]) for d1 in set(obsqc['donor']): donor_table={} @@ -596,11 +582,6 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane except: chromium_channel = 'Run_ID not vailable' - - - - def isNaN(num): - return num!= num Donors = list(df_donors.donor_id) try: @@ -634,7 +615,7 @@ def isNaN(num): Mean_reads_per_cell = int(metrics['Mean reads per cell'].values[0].replace(',','')) except: Mean_reads_per_cell= None - # adata_cellranger_raw.X + f = pd.DataFrame(adata_cellranger_filtered.X.sum(axis=1)) Median_UMI_Counts_per_cellranger= statistics.median(f[f>0][0]) Mean_Reads = statistics.mean(f[f>0][0]) @@ -658,7 +639,6 @@ def isNaN(num): Number_of_cells = len(set(df1.index)) Total_UMIs_before_10x_filter = np.sum(ad_lane_raw.X) #this may be after the normalisation - # ad_lane_filtered = Total_UMIs_after_cellbender_filter = np.sum(ad_lane_filtered.X) #This is more 27840 Total_UMIs_after_cellbender = sum(all_QC_lane.obs['total_counts']) #This is less 22817 @@ -694,9 +674,6 @@ def isNaN(num): all_probs = pd.DataFrame() Tranche_Pass_Fail='PASS' - Tranche_Failure_Reason='' - - # print("** Fraction_Reads_in_Cells : "+Fraction_Reads_in_Cells.strip('%')) Tranche_Failure_Reason =' ' try: if (float(Fraction_Reads_in_Cells.strip('%'))<=70): @@ -713,7 +690,6 @@ def isNaN(num): for i in df_donors.index: print(i) - # feeds in the individual assignments here. Donor_Stats=[] row = df_donors.loc[i] print(row) @@ -727,14 +703,12 @@ def isNaN(num): path1 = re.sub('.*/results/', 'results/', path1) Deconvoluted_Donor_Data = anndata.read_h5ad(path1) Donor_barcodes = Deconvoluted_Donor_Data.obs.index - # print(path1) + ################# #Deconvolution data ################# - - - # issue with the all_QC_lane is that they are filtered and the unassigned cells are removed - ve can merge them back together and + # Issue with the all_QC_lane is that they are filtered and the unassigned cells are removed - we can merge them back together and if (row["donor_id"] == 'unassigned'): Unassigned_donor = len(Deconvoluted_Donor_Data.obs) elif (row["donor_id"] == 'doublet'): @@ -765,7 +739,6 @@ def isNaN(num): data_donor_for_stats['cells passing QC'].append(Donor_cells_passes_qc) Donor_cell_assignments = azt.loc[Mengled_barcodes_donor] #for this have to figure out when the cell type is unasigned. 
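The tranche-level check above parses cellranger's percentage string and fails the pool when 70% or fewer reads fall in cells. A sketch of that check as a function, with an illustrative value and failure message:

def tranche_flag(fraction_reads_in_cells, threshold=70.0):
    # 'Fraction Reads in Cells' arrives as a percentage string from the metrics sheet;
    # at or below the threshold the tranche is flagged as FAIL.
    status, reason = 'PASS', ' '
    try:
        if float(fraction_reads_in_cells.strip('%')) <= threshold:
            status, reason = 'FAIL', 'Fraction Reads in Cells <= 70%; '
    except (AttributeError, ValueError):
        pass  # metric missing or malformed: keep PASS, as the pipeline's try/except does
    return status, reason

print(tranche_flag('63.2%'))   # illustrative value -> ('FAIL', ...)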
- # Donor_cell_assignments = Donor_cell_assignments.rename(COLUMNS_AZIMUTH,axis=1) Cell_types_detected = len(set(Donor_cell_assignments['Azimuth:predicted.celltype.l2'])) Donor_UMIS_mapped_to_mitochondrial_genes = sum(Donor_qc_files.obs['total_counts_gene_group__mito_transcript']) Donor_UMIS_mapped_to_ribo_genes = sum(Donor_qc_files.obs['total_counts_gene_group__ribo_protein']) @@ -855,7 +828,6 @@ def isNaN(num): data_donor.append(Donor_Stats) fctr += 1 - # print('done') all_probs = all_probs[~all_probs.index.duplicated(keep='first')] azt['prob_doublet']=all_probs['prob_doublet'] Donor_df = pd.DataFrame(data_donor) @@ -973,15 +945,15 @@ def set_argument_parser(): write_h5=False else: write_h5=True - # write_h5=False + oufh = open(os.path.join(args.outdir, "files.tsv"), 'w') oufh.write("experiment_id\tdonor_id\tfilename_h5ad\tfilename_annotation_tsv\n") df_raw = pandas.read_table(args.input_table, index_col = 'experiment_id') if (args.cellbender)=='cellranger': - # here we do not use cellbender and go with default cellranger + # Here we do not use cellbender and go with default cellranger df_cellbender = None else: - # here we have run the cellbender as par of pipeline. + # Here we have run the cellbender as par of pipeline. # cellbender/*/cellbender-epochs_*/cellbender-FPR_0pt01-filtered_10x_mtx file_path = glob.glob(f'{args.results_dir}/nf-preprocessing/cellbender/*/cellbender-epochs_*/*{args.resolution}*10x_mtx*') file_path2 = glob.glob(f'{args.results_dir}/nf-preprocessing/cellbender/*/*{args.resolution}*10x_mtx*') @@ -989,7 +961,10 @@ def set_argument_parser(): df_cellbender = pd.DataFrame(joined_file_paths,columns=['data_path_10x_format']) df_cellbender['experiment_id']=df_cellbender['data_path_10x_format'].str.split('/').str[-3] df_cellbender= df_cellbender.set_index('experiment_id') + Resolution = args.resolution + + # Load the final QCd dataset try: adqc = anndata.read_h5ad(f'{args.results_dir}/merged_h5ad/outlier_filtered_adata.h5ad') except: @@ -998,30 +973,27 @@ def set_argument_parser(): except: d2 = glob.glob(f'{args.results_dir}/*/*/adatanormalized.h5ad')[0] adqc = anndata.read_h5ad(d2) - adqc.obs['experiment_id'] = adqc.obs['experiment_id'].str.split("__").str[0] + # adqc.obs['experiment_id'] = adqc.obs['experiment_id'].str.split("__").str[0] fctr = 0 data_tranche_all=[] data_donor_all=[] count = 1 All_probs_and_celltypes = pd.DataFrame() - - Sample_metadata = pd.DataFrame() - adqc.obs['tranche.id']=args.experiment_name + # SETTING TRANCHE NAME try: - adqc.obs['cell_passes_qc-per:Azimuth:L0_Azimuth:predicted.celltype.l2:score'] = adqc.obs['cell_passes_qc-per:Azimuth:L0_Azimuth:predicted.celltype.l2:score'].astype(float,errors='ignore') + adqc.obs['cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2'] = adqc.obs['cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2:score'].astype(float,errors='ignore') except: _='no values associated' try: adqc.obs['cell_passes_qc-per:all_together::exclude:score']= adqc.obs['cell_passes_qc-per:all_together::exclude:score'].astype(float,errors='ignore') except: _='no values associated' - + + # Now we calculate all the statistics for each of the pools. 
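Loading of the final QCd AnnData above tries a preferred path and then two glob fallbacks; a compact sketch of that lookup order as a function, with paths mirroring the ones used above:

import glob
import anndata

def load_final_qc_adata(results_dir):
    # Prefer the outlier-filtered merged object and fall back to per-step
    # 'adatanormalized.h5ad' intermediates if it is missing.
    candidates = [f'{results_dir}/merged_h5ad/outlier_filtered_adata.h5ad']
    candidates += glob.glob(f'{results_dir}/*/adatanormalized.h5ad')
    candidates += glob.glob(f'{results_dir}/*/*/adatanormalized.h5ad')
    for path in candidates:
        try:
            return anndata.read_h5ad(path)
        except (FileNotFoundError, OSError):
            continue
    raise FileNotFoundError(f'no QCd AnnData found under {results_dir}')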
for expid in df_raw.index: print(expid) - # try: - # expid ='OTARscRNA12924807' s = adqc.obs['convoluted_samplename'] == expid ad = adqc[s] if ad.n_obs == 0: diff --git a/bin/transfer_data.py b/bin/transfer_data.py index 155eb09..06e93d2 100755 --- a/bin/transfer_data.py +++ b/bin/transfer_data.py @@ -167,6 +167,26 @@ def main_data_colection(pipeline='',name='',directory='',input_table=None,cb_res for folder in Folders: copyfile(f'{folder1}/{folder}/Vireo_plots.pdf', f'{name_dir}/Deconvolution/Vireo_plots_{folder}.pdf') + + + folder1 = f'{directory}/doublets' + if os.path.isdir(folder1): + print('prepearing Doublet folder') + try: + os.mkdir(f'{name_dir}/Doublets___301') + except: + print('dire exists') + files = glob.glob(f'{folder1}/*.tsv') + + for file1 in files: + print(file1) + try: + copy(file1, f'{name_dir}/Doublets___301') + except: + print('picked up directory') + continue + + folder1 = f'{directory}/celltype/celltypist' if os.path.isdir(folder1): print('prepearing celltype folder') @@ -205,6 +225,8 @@ def main_data_colection(pipeline='',name='',directory='',input_table=None,cb_res # print('dire exists') # copyfile(fil1, f'{name_dir}/QC metrics/plot_ecdf-x_log10.var=total_counts.color=experiment_id-adata.png') files = glob.glob(f'{folder1}/*[!.gz]') + files2 = glob.glob(f'{folder1}/*/*[!.gz]') + files = files + files2 for file1 in files: print(file1) try: @@ -212,8 +234,12 @@ def main_data_colection(pipeline='',name='',directory='',input_table=None,cb_res except: print('picked up directory') continue - - + try: + copy(f'{directory}/celltype/All_Celltype_Assignments.csv', f'{name_dir}/Cell-type assignment/All_Celltype_Assignments.csv') + except: + print('doesnt exist') + + folder1 = f'{directory}/plots/per_celltype_outliers' if os.path.isdir(folder1): print('yes')
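The new doublet-collection step in transfer_data.py above copies per-pool doublet TSVs into the release folder; a small sketch of that step as a function, with directory names following the layout used above:

import glob
import os
from shutil import copy

def collect_doublet_tables(results_dir, release_dir):
    # Copy each per-pool doublet TSV into the release folder, creating it if
    # needed and skipping anything that is not a regular file.
    outdir = os.path.join(release_dir, 'Doublets___301')
    os.makedirs(outdir, exist_ok=True)
    for path in glob.glob(os.path.join(results_dir, 'doublets', '*.tsv')):
        if os.path.isfile(path):
            copy(path, outdir)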