Skip to content

Commit

Permalink
Merge pull request #21 from Tobi1kenobi/main
Browse files Browse the repository at this point in the history
Tweak to celltype annotation
  • Loading branch information
maxozo authored Jun 12, 2024
2 parents ac9e8c6 + 6fc3e91 commit 9997239
Show file tree
Hide file tree
Showing 6 changed files with 66 additions and 41 deletions.
46 changes: 27 additions & 19 deletions bin/generate_combined_celltype_anotation_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
def combine_reports(all_alternitive,mode):
all_indexes_full=set({})
for d1 in all_alternitive:
if d1=='fake_file.fq':
if d1 in ('fake_file.fq', 'fake_file1.fq', 'fake_file2.fq'):
Dataset = pd.DataFrame()
else:
Dataset = pd.read_csv(d1,sep='\t',index_col=0)
Expand All @@ -26,7 +26,7 @@ def combine_reports(all_alternitive,mode):
all_indexes_full = all_indexes_full.union(all_indexes)
Data_All_alt=pd.DataFrame(index=list(set(all_indexes_full)))
for d1 in all_alternitive:
if d1=='fake_file.fq':
if d1 in ('fake_file.fq', 'fake_file1.fq', 'fake_file2.fq'):
Dataset = pd.DataFrame()
else:
Dataset = pd.read_csv(d1,sep='\t',index_col=0)
Expand Down Expand Up @@ -94,31 +94,38 @@ def main():

Data_All=pd.DataFrame()

azimuth_files = options.all_azimuth_files.split('::')
# Read azimuth files from a TSV file using pandas
azimuth_df = pd.read_csv(options.all_azimuth_files, header=None, names=['file_path'])
azimuth_files = azimuth_df['file_path'].tolist()
Data_All_Azimuth = combine_reports(azimuth_files,'Azimuth:')

celltypist_files = options.all_celltypist_files.split('::')
# Read celltypist files from a TSV file using pandas
celltypist_df = pd.read_csv(options.all_celltypist_files, header=None, names=['file_path'])
celltypist_files = celltypist_df['file_path'].tolist()

celltypist_files2 = pd.DataFrame(celltypist_files,columns=['col1'])
celltypist_files3 =list(celltypist_files2[~celltypist_files2['col1'].str.contains('input')]['col1'])
Data_All_celltypist = combine_reports(celltypist_files3,'Celltypist:')

if (options.all_alternitive):
all_alternitive = options.all_alternitive.split('::')
all_alternitive_df = pd.read_csv(options.all_alternitive, header=None, names=['file_path'])
all_alternitive = all_alternitive_df['file_path'].tolist()
Data_All_alt = combine_reports(all_alternitive,'')
else:
Data_All_alt=pd.DataFrame()

Data_All = pd.concat([Data_All,Data_All_Azimuth,Data_All_celltypist,Data_All_alt],axis=1)

Donor_Exp = Data_All.index.str.split('-').str[-1]
Donor_Exp = Data_All.index.map(lambda x: '-'.join(x.split('-')[2:]))
Donor = Donor_Exp.str.split('__').str[-1]
Exp = Donor_Exp.str.split('__').str[0]

Data_All['Donor'] =Donor
Data_All['Exp'] =Exp
Data_All.to_csv('All_Celltype_Assignments.csv',sep='\t')
Data_All.to_csv('All_Celltype_Assignments.tsv',sep='\t')

adatas = options.andata.split('::')
adatas_df = pd.read_csv(options.andata, header=None, names=['file_path'])
adatas = adatas_df['file_path'].tolist()
adatasets = []
# adatasets2 = adatasets[:2]
adatasets__experiment_ids = []
Expand All @@ -127,18 +134,19 @@ def main():
adata1 = scanpy.read_h5ad(ad1)
if adata1.n_obs > 0:
adatasets.append(adata1)
ad = adatasets[0].concatenate(*adatasets[1:],index_unique=None)
# if(len(adatasets)>1):
# # in this case the concentration adds a -1 -2 -3 to index that has to be removed.
# all_index = pd.DataFrame(ad.obs.index,columns=['col'])
# all_indexes = all_index['col'].str.split('-')
# all_together = all_indexes.str[0]+'-'+all_indexes.str[1]+'-'+all_indexes.str[2]
# ad.obs.set_index(all_together, inplace=True)

ad = adatasets[0].concatenate(*adatasets[1:])
if(len(adatasets)>1):
# In this case the concatenation appends a -1, -2, -3, ... suffix to each index entry, which has to be removed.
all_index = pd.DataFrame(ad.obs.index,columns=['col'])
all_indexes = all_index['col'].str.split('-')
all_together = all_indexes.apply(lambda x: '-'.join(x[:-1]))
ad.obs.set_index(all_together, inplace=True)


# ad2 = adatasets2[0].concatenate(*adatasets2[1:])
# ad = scanpy.read(adata)
for col in Data_All.columns:
ad.obs[col]=Data_All[col]
ad.obs = ad.obs.merge(Data_All, left_index=True, right_index=True)

donor_celltype_report={}
tranche_exp_report={}
Expand All @@ -155,13 +163,13 @@ def main():
# col='Celltypist:over_clustering'
# col='Azimuth:predicted.celltype.l2'
counts = Exp_Data[col].value_counts()
counts.index = counts.index+' - '+col
counts.index = counts.index.astype(str) + ' - ' + col
dict_tranche_cells.update(counts.to_dict())

# print(donor)
donor_data=Exp_Data[Exp_Data['Donor']==donor]
donor_counts = donor_data[col].value_counts()
donor_counts.index = donor_counts.index+' - '+col
donor_counts.index = donor_counts.index.astype(str) + ' - '+col
dict_donor_cells.update(donor_counts.to_dict())
# check all the available celltypes here
# and count the numbers
Expand Down
15 changes: 9 additions & 6 deletions bin/run_celltypist.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,12 @@
@click.option('-g','--input_h5_genome_version', default="GRCh38", show_default=True, type=str,
help='True or False: whether to write donor level scanpy hdf5 objects to dir --output_dir')


# Optional arguments:
@click.option('-p','--sample_plot_probs', is_flag=True, default=False, type=bool,
help='True or False: whether or not to plot probabilities per cell types and per sample')

def run_celltypist(samplename, filtered_matrix_h5, celltypist_model,
output_dir, anndata_compression_level,input_h5_genome_version):
output_dir, anndata_compression_level,input_h5_genome_version, sample_plot_probs):
"""process cellranger output filtered h5 so that it can be fed to Celltypist"""
logging.info('running run_celltypist() function..')

Expand Down Expand Up @@ -185,10 +187,11 @@ def run_celltypist(samplename, filtered_matrix_h5, celltypist_model,
predictions.to_plots(folder = output_dir, prefix = samplename + '_')
###predictions.to_plots(folder = os.getcwd())
# Visualise the decision scores and probabilities of each cell type overlaid onto the UMAP as well.
folder_plot_probs = output_dir + '/plot_prob'
if not os.path.exists(folder_plot_probs):
os.makedirs(folder_plot_probs)
predictions.to_plots(folder = folder_plot_probs, prefix = samplename + '_prob_', plot_probability = True)
if sample_plot_probs:
folder_plot_probs = output_dir + '/plot_prob'
if not os.path.exists(folder_plot_probs):
os.makedirs(folder_plot_probs)
predictions.to_plots(folder = folder_plot_probs, prefix = samplename + '_prob_', plot_probability = True)

# Get an `AnnData` with predicted labels embedded into the cell metadata columns.
# logging.info("... running predictions.to_adata()")
Expand Down
32 changes: 20 additions & 12 deletions modules/nf-core/modules/cell_type_assignment/functions.nf
Original file line number Diff line number Diff line change
@@ -1,20 +1,19 @@
process CELLTYPE_FILE_MERGE{
tag "${samplename}"
label 'process_medium'
label 'process_high'
publishDir path: "${params.outdir}/celltype/",
saveAs: {filename -> filename},
mode: "${params.copy_mode}",
overwrite: "true"
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://yascp.cog.sanger.ac.uk/public/singularity_images/wtsihgi_nf_scrna_qc_6bb6af5-2021-12-23-3270149cf265.sif"
// container "/lustre/scratch123/hgi/projects/ukbb_scrna/pipelines/singularity_images/nf_qc_cluster_2.4.img"

} else {
container "wtsihgi/nf_scrna_qc:6bb6af5"
}
output:
path('adata.h5ad', emit:file__anndata_merged2)
path("All_Celltype_Assignments.csv",emit:celltype_assignments)
path("All_Celltype_Assignments.tsv",emit:celltype_assignments)
path "tranche_celltype_report.tsv"
path "donor_celltype_report.tsv"

Expand All @@ -24,18 +23,27 @@ process CELLTYPE_FILE_MERGE{
path(all_other_paths)
path(file__anndata_input)
script:
all_azimuth_files = azimuth_files.join("::")
all_celltypist_files = celltypist_paths.join("::")
if ("${all_other_paths}"!='fake_file.fq'){
all_other_paths_comb = all_other_paths.join("::")
other_paths ="--all_other_paths ${all_other_paths_comb}"
}else{
def merged_files_outpath = "${params.outdir}/celltype/merged_files/"
file(merged_files_outpath).mkdirs()
def azimuth_files_path = "${merged_files_outpath}/azimuth_files.tsv"
def celltypist_files_path = "${merged_files_outpath}/celltypist_files.tsv"
def all_other_files_path = "${merged_files_outpath}/other_files.tsv"
def adatas_path = "${merged_files_outpath}/adatas.tsv"

new File(azimuth_files_path).text = azimuth_files.join("\n")
new File(celltypist_files_path).text = celltypist_paths.join("\n")

if ("${all_other_paths}" != 'fake_file.fq') {
new File(all_other_files_path).text = all_other_paths.join("\n")
other_paths = "--all_other_paths ${all_other_files_path}"
} else {
other_paths = ""
}

all_adatas = file__anndata_input.join("::")

new File(adatas_path).text = file__anndata_input.join("\n")

"""
generate_combined_celltype_anotation_file.py --all_azimuth_files ${all_azimuth_files} --all_celltypist_files ${all_celltypist_files} ${other_paths} --adata '${all_adatas}'
generate_combined_celltype_anotation_file.py --all_azimuth_files ${azimuth_files_path} --all_celltypist_files ${celltypist_files_path} ${other_paths} --adata '${adatas_path}'
"""

}
2 changes: 1 addition & 1 deletion modules/nf-core/modules/cell_type_assignment/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ workflow CELL_TYPE_ASSIGNEMT{
az_out = Channel.of()
}

if (params.celltype_assignment.run_azimuth){
if (params.celltype_assignment.run_celltypist){
Channel.fromList(params.celltypist.models)
.set{ch_celltypist_models}
CELLTYPIST(az_ch_experiment_filth5.combine(ch_celltypist_models))
Expand Down
9 changes: 8 additions & 1 deletion modules/nf-core/modules/celltypist/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ process CELLTYPIST {
tuple val(sample), path("outputs/*_probability_matrix.csv"), emit: sample_probability_matrix_csv
tuple val(sample), path("outputs/*_decision_matrix.csv"), emit: sample_decision_matrix_csv
tuple val(sample), path("outputs/*_*.pdf"), emit: sample_plots_pdf
tuple val(sample), path("outputs/plot_prob/*_*.pdf"), emit: sample_plots_prob_pdf
tuple val(sample), path("outputs/plot_prob/*_*.pdf"), emit: sample_plots_prob_pdf, optional: true

script:
model="${celltypist_model}".replaceAll(/^.*[\\/]/, "").replaceFirst(".pkl","")
Expand All @@ -37,13 +37,20 @@ process CELLTYPIST {
filtered_matrix_h5_path = file("${filtered_matrix_h5}/../cellbender_FPR_0pt05_filtered.h5")
}

if (params.celltypist.sample_plot_probs){
sample_plot_probs = "--sample_plot_probs"
}
else{
sample_plot_probs = ""
}

"""
umask 2 # make files group_writable
mkdir -p outputs
run_celltypist.py \\
--samplename ${sample} \\
${sample_plot_probs} \\
--filtered_matrix_h5 ${filtered_matrix_h5} \\
--celltypist_model ${celltypist_model} \\
--output_dir \$PWD/outputs \\
Expand Down
3 changes: 1 addition & 2 deletions modules/nf-core/modules/merge_samples/functions.nf
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,7 @@ process merge_samples {

tag "${samplename}"

label 'process_medium'
label 'process_high_memory'
label 'process_high'
publishDir path: "${outdir}/merged_h5ad",
saveAs: {filename -> filename.replaceAll("-", "pre_QC_")},
mode: "${params.copy_mode}",
Expand Down

0 comments on commit 9997239

Please sign in to comment.