dynamic SNP fix for / | genotypes
Matiss Ozols committed Aug 1, 2024
1 parent 4586eae commit 56eac6f
Showing 9 changed files with 493 additions and 1,604 deletions.
1,962 changes: 413 additions & 1,549 deletions bin/combine_concordance.py

Large diffs are not rendered by default.

7 changes: 6 additions & 1 deletion bin/concordance_calculations.py
@@ -975,7 +975,12 @@ def set_results(self,to_set,id):


def combine_concordances(self,result,other_donor_concordance,donor_gt_match,analyse_donor):
pd.DataFrame(other_donor_concordance).sort_values(by=['cell']).to_csv(f'{donor_gt_match}-{analyse_donor}--each_cells_comparison_with_other_donor.tsv',sep='\t',index=False)
try:
pd.DataFrame(other_donor_concordance).sort_values(by=['cell']).to_csv(f'{donor_gt_match}-{analyse_donor}--each_cells_comparison_with_other_donor.tsv',sep='\t',index=False)
except:
print('We do not have any cells to analyse for this donor')
pd.DataFrame(other_donor_concordance).to_csv(f'{donor_gt_match}-{analyse_donor}--each_cells_comparison_with_other_donor.tsv',sep='\t',index=False)

self.cell_concordance_table = {**self.cell_concordance_table, **result}

def combine_dict(self,cell_concordance_table,result):
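For context on the try/except added above: when a donor has no cells, the per-cell comparison table is empty and has no 'cell' column, so sorting it fails and the fallback writes the table unsorted. A minimal sketch of that failure mode (toy data and file name, not from the repository; the committed code uses a bare except, which also covers this case):

```python
import pandas as pd

other_donor_concordance = []  # hypothetical: donor with no cells to analyse
df = pd.DataFrame(other_donor_concordance)

try:
    # An empty frame has no 'cell' column, so sort_values raises KeyError
    df.sort_values(by=['cell']).to_csv('donor--each_cells_comparison.tsv', sep='\t', index=False)
except KeyError:
    print('We do not have any cells to analyse for this donor')
    df.to_csv('donor--each_cells_comparison.tsv', sep='\t', index=False)
```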
3 changes: 2 additions & 1 deletion bin/dynamic_donor_exclusive_snp_selection.py
@@ -339,7 +339,8 @@ def donor_exclusive_sites(exclusive_don_variants2):
subs['full'] = subs['full'].str.replace(".|.",';', regex=False).str.replace(";+",';')
subs['full'] = subs['full'].str.replace("./.",';', regex=False).str.replace(";+",';')
subs['full'] = subs['full'].str.replace(".",';', regex=False).str.replace(";+",';')

subs['full'] = subs['full'].str.replace("/",'|', regex=False)

# all informative indexes
# now we need to locate which variants actually has a change in the genotype.
all_informative_site_index = set()
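The new line above normalises unphased '/' genotype separators to the phased '|' form, so that for example '0/1' and '0|1' compare as the same call once the missing-call patterns ('.|.', './.', '.') have been collapsed to ';'. A small illustrative sketch with made-up genotype strings (regex=True is spelled out explicitly here for the ';+' collapses; the committed code relies on the pandas default):

```python
import pandas as pd

# Hypothetical genotype strings as they might appear in the 'full' column
subs = pd.DataFrame({'full': ['0/1;.|.;1|1', './.;0/0', '0|1;.']})

subs['full'] = subs['full'].str.replace('.|.', ';', regex=False).str.replace(';+', ';', regex=True)
subs['full'] = subs['full'].str.replace('./.', ';', regex=False).str.replace(';+', ';', regex=True)
subs['full'] = subs['full'].str.replace('.', ';', regex=False).str.replace(';+', ';', regex=True)
subs['full'] = subs['full'].str.replace('/', '|', regex=False)  # the separator fix added in this commit

print(subs['full'].tolist())
# ['0|1;1|1', ';0|0', '0|1;'] under these assumed inputs
```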
56 changes: 38 additions & 18 deletions bin/gather_minimal_dataset.py
@@ -98,11 +98,17 @@
'chromium_lane': 'chromium.lane',
'instrument':'instrument'
}

COLUMNS_SCRUBLET = {
'scrublet__multiplet_scores': 'scrublet.scores',
'scrublet__predicted_multiplet': 'scrublet.multiplet',
'scrublet__multiplet_zscores': 'scrublet.zscores'
'scrublet__multiplet_zscores': 'scrublet.zscores',
'scds_DropletType':'scds.multiplet','scds_score':'scds.score',
'scDblFinder_DropletType':'scDblFinder.multiplet','scDblFinder_Score':'scDblFinder.score',
'DoubletDecon_DropletType':'DoubletDecon.multiplet',
'DoubletFinder_DropletType':'DoubletFinder.multiplet','DoubletFinder_score':'DoubletFinder.score'
}

COLUMNS_OUTPUT = \
{**COLUMNS_DATASET, **COLUMNS_CELLBENDER, **COLUMNS_DECONV, **COLUMNS_QC, **COLUMNS_AZIMUTH}
COLUMNS_OUTPUT_WITH_SCRUBLET = \
@@ -354,7 +360,7 @@ def gather_donor(donor_id, ad, ad_lane_raw, azimuth_annot, qc_obs, columns_outpu
dt = pandas.concat([df,dfqc], axis = 1, join = 'inner')

colnams = list(columns_output.keys())
colnams_overlap = set(colnams).intersection(set(dt.columns))
colnams_overlap = sorted(set(colnams).intersection(set(dt.columns)))
ad.obs = dt[colnams_overlap].rename(columns = columns_output)
dt = pandas.concat([df, dfqc], axis = 1, join = 'outer')[colnams_overlap]
dt.rename(columns = columns_output, inplace = True)
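A brief note on the sorted() wrapper added above: Python set iteration order is not stable between runs, so sorting the intersection keeps the selected column order, and therefore the written output, reproducible. A toy sketch (column names are invented):

```python
colnams = ['experiment_id', 'cell_passes_qc', 'total_counts']  # hypothetical columns_output keys
dt_columns = ['total_counts', 'experiment_id', 'n_genes']      # hypothetical dt columns

colnams_overlap = sorted(set(colnams).intersection(set(dt_columns)))
print(colnams_overlap)  # ['experiment_id', 'total_counts'] -- same order every run
```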
@@ -377,9 +383,11 @@ def gather_donor(donor_id, ad, ad_lane_raw, azimuth_annot, qc_obs, columns_outpu

dt.index.name = 'barcode'
ad.obs.index.name = 'barcode'
dt.to_csv(os.path.join(outdir, oufnam + '.tsv'), sep = "\t", na_rep = "N/A")
sys.stderr.write("writing file {} ...\n".format(oufnam))
ad.obs = ad.obs.loc[:,~ad.obs.columns.duplicated()]
dt = dt.loc[:,~dt.columns.duplicated()].copy()
dt[set(dt.columns)].to_csv(os.path.join(outdir, oufnam + '.tsv'), sep = "\t", na_rep = "N/A")
sys.stderr.write("writing file {} ...\n".format(oufnam))

if write_h5:
path1=os.path.join(outdir, oufnam + '.h5ad')
try:
Expand Down Expand Up @@ -505,6 +513,13 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
#############

azt = pd.read_csv(f'{args.results_dir}/celltype/All_Celltype_Assignments.tsv',sep='\t',index_col=0)
azt_cols_to_add = azt.columns[azt.columns.str.contains('Azimuth')]
ct_cols_to_add = azt.columns[azt.columns.str.contains('Celltypist')]
for i3 in set(azt_cols_to_add) - set(columns_output.keys()):
columns_output = {**columns_output, **{i3:i3}}
for i3 in set(ct_cols_to_add) - set(columns_output.keys()):
columns_output = {**columns_output, **{i3:i3}}
# scpred_to_add = azt.columns[azt.columns.str.contains('Scpred')]
##########################
# Scrublet
#########################
@@ -516,21 +531,26 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
d2 = pd.read_csv(f1,sep='\t')
d2['Exp']=pool_name
doublet_data_combined = pd.concat([doublet_data_combined,d2])
doublet_data_combined = doublet_data_combined.drop_duplicates(subset='barcodes')
scb = doublet_data_combined.set_index('barcodes')
columns_output = {**columns_output, **COLUMNS_SCRUBLET}

datadir_scrublet=glob.glob(f'{args.results_dir}/*/multiplet.method=scrublet')[0]
if os.path.isdir(datadir_scrublet):
# Scrublet loading QC
try:
scb = load_scrublet_assignments(
expid,
datadir_scrublet=datadir_scrublet
)
columns_output = {**columns_output, **COLUMNS_SCRUBLET}
except:
print('Scrubblet was not performed for this pool - potential reason is that there are not enough cells for assignment')
scb = None
else:
scb = None
# doublet_data_combined.iloc[0]
# datadir_scrublet=glob.glob(f'{args.results_dir}/*/multiplet.method=scrublet')[0]
# if os.path.isdir(datadir_scrublet):
# # Scrublet loading QC
# try:
# scb = load_scrublet_assignments(
# expid,
# datadir_scrublet=datadir_scrublet
# )
# columns_output = {**columns_output, **COLUMNS_SCRUBLET}
# scb = pd.concat([scb,doublet_data_combined.loc[scb.index]],axis=1)
# except:
# print('Scrubblet was not performed for this pool - potential reason is that there are not enough cells for assignment')
# scb = None
# else:
# scb = None


############################################################
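One of the smaller changes in this file drops duplicated columns from both ad.obs and dt before the per-donor TSV is written. A minimal sketch of the ~columns.duplicated() idiom it relies on (toy frames, not pipeline data):

```python
import pandas as pd

# Concatenating QC and annotation tables that share a column name yields duplicate columns
dt = pd.concat(
    [pd.DataFrame({'total_counts': [100]}),
     pd.DataFrame({'total_counts': [100], 'n_genes': [42]})],
    axis=1,
)
print(list(dt.columns))                      # ['total_counts', 'total_counts', 'n_genes']

dt = dt.loc[:, ~dt.columns.duplicated()].copy()
print(list(dt.columns))                      # ['total_counts', 'n_genes'] -- first occurrence kept
```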
4 changes: 2 additions & 2 deletions bin/generate_combined_celltype_anotation_file.py
@@ -146,8 +146,8 @@ def main():

# ad2 = adatasets2[0].concatenate(*adatasets2[1:])
# ad = scanpy.read(adata)
ad.obs = ad.obs.merge(Data_All, left_index=True, right_index=True)

ad.obs = ad.obs.merge(Data_All, left_index=True, right_index=True, how='left')
# set(ad.obs.index)-set(Data_All.index)
donor_celltype_report={}
tranche_exp_report={}
for id1 in set(Data_All['Exp']):
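The how='left' added above keeps every cell in ad.obs even when it has no row in Data_All; with the previous default inner join, unannotated barcodes were silently dropped. A toy sketch of the difference (indexes and column names are invented):

```python
import pandas as pd

obs = pd.DataFrame({'n_genes': [500, 300]}, index=['AAAC-1', 'TTTG-1'])
Data_All = pd.DataFrame({'Azimuth_predicted_celltype': ['CD4 T']}, index=['AAAC-1'])

inner = obs.merge(Data_All, left_index=True, right_index=True)               # drops 'TTTG-1'
left = obs.merge(Data_All, left_index=True, right_index=True, how='left')    # keeps both; NaN annotation for 'TTTG-1'
print(len(inner), len(left))  # 1 2
```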
5 changes: 4 additions & 1 deletion conf/base.conf
@@ -69,6 +69,7 @@ params{
split_ad_per_bach = true
cellbender_resolution_to_use='0pt1'
reference_assembly_fasta_dir = " /nfs/srpipe_references/downloaded_from_10X/refdata-gex-GRCh38-2020-A/fasta/"
//# reference_assembly_fasta_dir = "https://yascp.cog.sanger.ac.uk/public/10x_reference_assembly"
webtransfer = false
project_name = 'Cardinal_pilots'
run_with_genotype_input=false
@@ -208,7 +209,7 @@ process {
cpus = { 1 * task.attempt }
}
withLabel:many_cores_small_mem {
cpus = { 20 * task.attempt }
cpus = { 10 * task.attempt }
memory = { 20.GB * task.attempt }
time = { 12.h * task.attempt }
}
@@ -219,6 +220,8 @@

withName: SUBSET_GENOTYPE2{
memory = { 200.MB * task.attempt}
cpus = { 1 * task.attempt }
time = { 1.h * task.attempt }
}

withLabel:process_high {
10 changes: 5 additions & 5 deletions conf/modules.conf
@@ -143,7 +143,7 @@ process {
}

withName: SUBSET_GENOTYPE2{
cpus = 2
cpus = 1
memory = { 1.GB * task.attempt }
time = { 12.h * task.attempt }
maxRetries = 3
@@ -241,9 +241,9 @@ process {
}

withName: CONCORDANCE_CALCLULATIONS{
cpus = { 30 * task.attempt }
cpus = { 10 * task.attempt }
time = { 24.h * task.attempt }
memory = { 80.GB * task.attempt }
memory = { 100.GB * task.attempt }
}

withName: OTHER_DONOR_CONCORDANCE_CALCLULATIONS{
@@ -258,8 +258,8 @@ process {
}

withName: DYNAMIC_DONOR_EXCLUSIVE_SNP_SELECTION{
cpus = 5
time = { 12.h * task.attempt }
cpus = 1
time = { 2.h * task.attempt }
memory = { 20.GB * task.attempt }
}

5 changes: 2 additions & 3 deletions modules/nf-core/modules/cellsnp/main.nf
@@ -29,9 +29,8 @@ process DYNAMIC_DONOR_EXCLUSIVE_SNP_SELECTION{
} else {
container "mercury/scrna_deconvolution:62bd56a"
}
publishDir path: "${params.outdir}/concordances/${samplename}",
mode: "${params.copy_mode}",
overwrite: "true"
publishDir "${params.outdir}/cellsnp/cellsnp_${samplename}", mode: "${params.copy_mode}", pattern: "cellsnp_${samplename}", overwrite: true

input:
val(add_dynamic_sites_or_not_to_panel)
tuple val(samplename), path(vcf_file),path(csi),path(cellsnp_primary_file)
45 changes: 21 additions & 24 deletions workflows/yascp.nf
@@ -47,33 +47,30 @@ workflow YASCP {
}

if (!params.input_data_table.contains('fake_file')){

// vcf_input.subscribe { println "vcf_input: $it" }
// ###################################
// ################################### Readme
// AMBIENT RNA REMOVAL USING CELLBENDER
// There are 2 modes of running YASCP pipeline:
// (option 1) users can run it from existing cellbender if the analysis has already been performed by providing a parth to existing cellbender files : note a specific folder structure is required
// (option 2) users can run it from cellranger - skipping the cellbender. params.input == 'cellranger'
// ###################################
// ###################################
prepare_inputs(input_channel)
channel__file_paths_10x=prepare_inputs.out.channel__file_paths_10x
channel__file_paths_10x_single=prepare_inputs.out.ch_experimentid_paths10x_filtered
input_channel = prepare_inputs.out.channel_input_data_table
if (params.reference_assembly_fasta_dir=='https://yascp.cog.sanger.ac.uk/public/10x_reference_assembly'){
prepare_inputs(input_channel)
channel__file_paths_10x=prepare_inputs.out.channel__file_paths_10x
channel__file_paths_10x_single=prepare_inputs.out.ch_experimentid_paths10x_filtered
input_channel = prepare_inputs.out.channel_input_data_table
if (params.reference_assembly_fasta_dir=='https://yascp.cog.sanger.ac.uk/public/10x_reference_assembly'){
RETRIEVE_RECOURSES()
genome = RETRIEVE_RECOURSES.out.reference_assembly
genome1 = RETRIEVE_RECOURSES.out.reference_assembly
}else{
genome = "${params.reference_assembly_fasta_dir}"
}

chanel_cr_outs = prepare_inputs.out.chanel_cr_outs
channel_dsb = prepare_inputs.out.channel_dsb
genome1 = "${params.reference_assembly_fasta_dir}"
}
genome = PREPROCESS_GENOME(genome1)

chanel_cr_outs = prepare_inputs.out.chanel_cr_outs
channel_dsb = prepare_inputs.out.channel_dsb
}
vireo_paths = Channel.from("$projectDir/assets/fake_file.fq")
matched_donors = Channel.from("$projectDir/assets/fake_file.fq")

vireo_paths = Channel.from("$projectDir/assets/fake_file.fq")
matched_donors = Channel.from("$projectDir/assets/fake_file.fq")

ch_poolid_csv_donor_assignments = Channel.empty()
bam_split_channel = Channel.of()
out_ch = params.outdir
? Channel.fromPath(params.outdir, checkIfExists:true)
: Channel.from("${launchDir}/${params.outdir}")

if(!params.just_reports){
// sometimes we just want to rerun report generation as a result of alterations, hence if we set params.just_reports =True pipeline will use the results directory and generate a new reports.

Expand Down
