Merge pull request #21 from Tobi1kenobi/main

Tweak to celltype annotation
wtsi-hgi · Jun 12, 2024 · 9997239 · 9997239
2 parents ac9e8c6 + 6fc3e91
commit 9997239
Show file tree

Hide file tree

Showing 6 changed files with 66 additions and 41 deletions.
diff --git a/bin/generate_combined_celltype_anotation_file.py b/bin/generate_combined_celltype_anotation_file.py
@@ -15,7 +15,7 @@
 def combine_reports(all_alternitive,mode):
     all_indexes_full=set({})
     for d1 in all_alternitive:
-        if d1=='fake_file.fq':
+        if d1 in ('fake_file.fq', 'fake_file1.fq', 'fake_file2.fq'):
             Dataset = pd.DataFrame()
         else:
             Dataset = pd.read_csv(d1,sep='\t',index_col=0)
@@ -26,7 +26,7 @@ def combine_reports(all_alternitive,mode):
         all_indexes_full = all_indexes_full.union(all_indexes)
     Data_All_alt=pd.DataFrame(index=list(set(all_indexes_full)))    
     for d1 in all_alternitive:
-        if d1=='fake_file.fq':
+        if d1 in ('fake_file.fq', 'fake_file1.fq', 'fake_file2.fq'):
             Dataset = pd.DataFrame()
         else:
             Dataset = pd.read_csv(d1,sep='\t',index_col=0)
@@ -94,31 +94,38 @@ def main():
 
     Data_All=pd.DataFrame()
 
-    azimuth_files = options.all_azimuth_files.split('::')
+    # Read the azimuth file paths (one path per line, no header row) from a list file using pandas
+    azimuth_df = pd.read_csv(options.all_azimuth_files, header=None, names=['file_path'])
+    azimuth_files = azimuth_df['file_path'].tolist()
     Data_All_Azimuth = combine_reports(azimuth_files,'Azimuth:')
 
-    celltypist_files = options.all_celltypist_files.split('::')
+    # Read the celltypist file paths (one path per line, no header row) from a list file using pandas
+    celltypist_df = pd.read_csv(options.all_celltypist_files, header=None, names=['file_path'])
+    celltypist_files = celltypist_df['file_path'].tolist()
+
     celltypist_files2 = pd.DataFrame(celltypist_files,columns=['col1'])
     celltypist_files3 =list(celltypist_files2[~celltypist_files2['col1'].str.contains('input')]['col1'])
     Data_All_celltypist = combine_reports(celltypist_files3,'Celltypist:')
 
     if (options.all_alternitive):
-        all_alternitive = options.all_alternitive.split('::')
+        all_alternitive_df = pd.read_csv(options.all_alternitive, header=None, names=['file_path'])
+        all_alternitive = all_alternitive_df['file_path'].tolist()
         Data_All_alt = combine_reports(all_alternitive,'')
     else:
         Data_All_alt=pd.DataFrame()
 
     Data_All = pd.concat([Data_All,Data_All_Azimuth,Data_All_celltypist,Data_All_alt],axis=1)
 
-    Donor_Exp = Data_All.index.str.split('-').str[-1]
+    Donor_Exp = Data_All.index.map(lambda x: '-'.join(x.split('-')[2:]))
     Donor = Donor_Exp.str.split('__').str[-1]
     Exp = Donor_Exp.str.split('__').str[0]
 
     Data_All['Donor'] =Donor
     Data_All['Exp'] =Exp
-    Data_All.to_csv('All_Celltype_Assignments.csv',sep='\t')
+    Data_All.to_csv('All_Celltype_Assignments.tsv',sep='\t')
 
-    adatas = options.andata.split('::')
+    adatas_df = pd.read_csv(options.andata, header=None, names=['file_path'])
+    adatas = adatas_df['file_path'].tolist()
     adatasets = []
     # adatasets2 = adatasets[:2]
     adatasets__experiment_ids = []
@@ -127,18 +134,19 @@ def main():
         adata1 = scanpy.read_h5ad(ad1)
         if adata1.n_obs > 0:
             adatasets.append(adata1)
-    ad = adatasets[0].concatenate(*adatasets[1:],index_unique=None)
-    # if(len(adatasets)>1):
-    #     # in this case the concentration adds a -1 -2 -3 to index that has to be removed.
-    #     all_index = pd.DataFrame(ad.obs.index,columns=['col'])
-    #     all_indexes = all_index['col'].str.split('-')
-    #     all_together = all_indexes.str[0]+'-'+all_indexes.str[1]+'-'+all_indexes.str[2]
-    #     ad.obs.set_index(all_together, inplace=True)
+
+    ad = adatasets[0].concatenate(*adatasets[1:])
+    if(len(adatasets)>1):
+        # in this case the concatenation appends a -1, -2, -3, ... suffix to the index, which has to be removed.
+        all_index = pd.DataFrame(ad.obs.index,columns=['col'])
+        all_indexes = all_index['col'].str.split('-')
+        all_together = all_indexes.apply(lambda x: '-'.join(x[:-1]))
+        ad.obs.set_index(all_together, inplace=True)
+
 
     # ad2 = adatasets2[0].concatenate(*adatasets2[1:])
     # ad = scanpy.read(adata)
-    for col in Data_All.columns:
-        ad.obs[col]=Data_All[col]
+    ad.obs = ad.obs.merge(Data_All, left_index=True, right_index=True)
 
     donor_celltype_report={}
     tranche_exp_report={}
@@ -155,13 +163,13 @@ def main():
                     # col='Celltypist:over_clustering'
                     # col='Azimuth:predicted.celltype.l2'
                     counts = Exp_Data[col].value_counts()
-                    counts.index = counts.index+' - '+col
+                    counts.index = counts.index.astype(str) + ' - ' + col
                     dict_tranche_cells.update(counts.to_dict())
 
                     # print(donor)
                     donor_data=Exp_Data[Exp_Data['Donor']==donor]
                     donor_counts = donor_data[col].value_counts()
-                    donor_counts.index = donor_counts.index+' - '+col
+                    donor_counts.index = donor_counts.index.astype(str) + ' - '+col
                     dict_donor_cells.update(donor_counts.to_dict())
                 # check all the available celltypes here
                 # and count the numbers

diff --git a/bin/run_celltypist.py b/bin/run_celltypist.py
@@ -61,10 +61,12 @@
 @click.option('-g','--input_h5_genome_version', default="GRCh38", show_default=True, type=str,
               help='True or False: whether to write donor level scanpy hdf5 objects to dir --output_dir')
 
-
+# Optional arguments:
+@click.option('-p','--sample_plot_probs', is_flag=True, default=False, type=bool,
+              help='True or False: whether or not to plot probabilities per cell types and per sample')
 
 def run_celltypist(samplename, filtered_matrix_h5, celltypist_model,
-                   output_dir, anndata_compression_level,input_h5_genome_version):
+                   output_dir, anndata_compression_level,input_h5_genome_version, sample_plot_probs):
     """process cellranger output filtered h5 so that it can be fed to Celltypist"""
     logging.info('running run_celltypist() function..')
 
@@ -185,10 +187,11 @@ def run_celltypist(samplename, filtered_matrix_h5, celltypist_model,
     predictions.to_plots(folder = output_dir, prefix = samplename + '_')
     ###predictions.to_plots(folder = os.getcwd())
     # Visualise the decision scores and probabilities of each cell type overlaid onto the UMAP as well.
-    folder_plot_probs = output_dir + '/plot_prob'
-    if not os.path.exists(folder_plot_probs):
-        os.makedirs(folder_plot_probs)
-    predictions.to_plots(folder = folder_plot_probs, prefix = samplename + '_prob_', plot_probability = True)
+    if sample_plot_probs:
+        folder_plot_probs = output_dir + '/plot_prob'
+        if not os.path.exists(folder_plot_probs):
+            os.makedirs(folder_plot_probs)
+        predictions.to_plots(folder = folder_plot_probs, prefix = samplename + '_prob_', plot_probability = True)
 
     # Get an `AnnData` with predicted labels embedded into the cell metadata columns.
     # logging.info("... running predictions.to_adata()")

diff --git a/modules/nf-core/modules/cell_type_assignment/functions.nf b/modules/nf-core/modules/cell_type_assignment/functions.nf
@@ -1,20 +1,19 @@
 process CELLTYPE_FILE_MERGE{
     tag "${samplename}"    
-    label 'process_medium'
+    label 'process_high'
     publishDir  path: "${params.outdir}/celltype/",
             saveAs: {filename -> filename},
             mode: "${params.copy_mode}",
             overwrite: "true"  
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
         container "https://yascp.cog.sanger.ac.uk/public/singularity_images/wtsihgi_nf_scrna_qc_6bb6af5-2021-12-23-3270149cf265.sif"
         // container "/lustre/scratch123/hgi/projects/ukbb_scrna/pipelines/singularity_images/nf_qc_cluster_2.4.img"
-
     } else {
         container "wtsihgi/nf_scrna_qc:6bb6af5"
     }
     output:
         path('adata.h5ad', emit:file__anndata_merged2)
-        path("All_Celltype_Assignments.csv",emit:celltype_assignments)
+        path("All_Celltype_Assignments.tsv",emit:celltype_assignments)
         path "tranche_celltype_report.tsv"
         path "donor_celltype_report.tsv"
 
@@ -24,18 +23,27 @@ process CELLTYPE_FILE_MERGE{
         path(all_other_paths)
         path(file__anndata_input)
     script:
-        all_azimuth_files = azimuth_files.join("::")
-        all_celltypist_files = celltypist_paths.join("::")
-        if ("${all_other_paths}"!='fake_file.fq'){
-            all_other_paths_comb = all_other_paths.join("::")
-            other_paths ="--all_other_paths ${all_other_paths_comb}"
-        }else{
+        def merged_files_outpath = "${params.outdir}/celltype/merged_files/"
+        file(merged_files_outpath).mkdirs()
+        def azimuth_files_path = "${merged_files_outpath}/azimuth_files.tsv"
+        def celltypist_files_path = "${merged_files_outpath}/celltypist_files.tsv"
+        def all_other_files_path = "${merged_files_outpath}/other_files.tsv"
+        def adatas_path = "${merged_files_outpath}/adatas.tsv"
+
+        new File(azimuth_files_path).text = azimuth_files.join("\n")
+        new File(celltypist_files_path).text = celltypist_paths.join("\n")
+
+        if ("${all_other_paths}" != 'fake_file.fq') {
+            new File(all_other_files_path).text = all_other_paths.join("\n")
+            other_paths = "--all_other_paths ${all_other_files_path}"
+        } else {
             other_paths = ""
         }
-
-        all_adatas = file__anndata_input.join("::")
+
+        new File(adatas_path).text = file__anndata_input.join("\n")
+
         """
-            generate_combined_celltype_anotation_file.py --all_azimuth_files ${all_azimuth_files} --all_celltypist_files ${all_celltypist_files} ${other_paths} --adata '${all_adatas}'
+        generate_combined_celltype_anotation_file.py --all_azimuth_files ${azimuth_files_path} --all_celltypist_files ${celltypist_files_path} ${other_paths} --adata '${adatas_path}'
         """
 
 }
diff --git a/modules/nf-core/modules/cell_type_assignment/main.nf b/modules/nf-core/modules/cell_type_assignment/main.nf
@@ -42,7 +42,7 @@ workflow CELL_TYPE_ASSIGNEMT{
             az_out = Channel.of()
         }
 
-        if (params.celltype_assignment.run_azimuth){
+        if (params.celltype_assignment.run_celltypist){
             Channel.fromList(params.celltypist.models)
                 .set{ch_celltypist_models}
             CELLTYPIST(az_ch_experiment_filth5.combine(ch_celltypist_models))

diff --git a/modules/nf-core/modules/celltypist/main.nf b/modules/nf-core/modules/celltypist/main.nf
@@ -24,7 +24,7 @@ process CELLTYPIST {
       tuple val(sample), path("outputs/*_probability_matrix.csv"), emit: sample_probability_matrix_csv
       tuple val(sample), path("outputs/*_decision_matrix.csv"), emit: sample_decision_matrix_csv
       tuple val(sample), path("outputs/*_*.pdf"), emit: sample_plots_pdf
-      tuple val(sample), path("outputs/plot_prob/*_*.pdf"), emit: sample_plots_prob_pdf
+      tuple val(sample), path("outputs/plot_prob/*_*.pdf"), emit: sample_plots_prob_pdf, optional: true
 
     script:
       model="${celltypist_model}".replaceAll(/^.*[\\/]/, "").replaceFirst(".pkl","")
@@ -37,13 +37,20 @@ process CELLTYPIST {
         filtered_matrix_h5_path = file("${filtered_matrix_h5}/../cellbender_FPR_0pt05_filtered.h5")
       }
 
+      if (params.celltypist.sample_plot_probs){
+        sample_plot_probs = "--sample_plot_probs"
+      }
+      else{
+        sample_plot_probs = ""
+      }
 
       """
 
         umask 2 # make files group_writable 
         mkdir -p outputs
         run_celltypist.py \\
           --samplename ${sample} \\
+          ${sample_plot_probs} \\
           --filtered_matrix_h5 ${filtered_matrix_h5} \\
           --celltypist_model ${celltypist_model}  \\
           --output_dir \$PWD/outputs  \\

diff --git a/modules/nf-core/modules/merge_samples/functions.nf b/modules/nf-core/modules/merge_samples/functions.nf
@@ -125,8 +125,7 @@ process merge_samples {
 
     tag "${samplename}"
 
-    label 'process_medium'
-    label 'process_high_memory'
+    label 'process_high'
     publishDir  path: "${outdir}/merged_h5ad",
                 saveAs: {filename -> filename.replaceAll("-", "pre_QC_")},
                 mode: "${params.copy_mode}",