
Commit

ensuring all in place
Matiss Ozols committed May 24, 2024
1 parent a0b20fc commit e25e274
Showing 12 changed files with 171 additions and 69 deletions.
29 changes: 22 additions & 7 deletions bin/gather_minimal_dataset.py
@@ -46,8 +46,11 @@

COLUMNS_AZIMUTH = {
'Azimuth:predicted.celltype.l2': 'azimuth.celltyp.l2',
'Azimuth:predicted.celltype.l3': 'azimuth.celltyp.l3',
'Azimuth:predicted.celltype.l1': 'azimuth.celltyp.l1',
'Azimuth:predicted.celltype.l2.score': 'azimuth.pred.score.l2',
'Azimuth:mapping.score.celltype.l2': 'azimuth.map.score',
'Celltypist*':'Celltypist*',
}

COLUMNS_DECONV = {
@@ -66,9 +69,10 @@
'total_counts': 'qc.umi.count.total',
'total_counts_gene_group__mito_transcript': 'qc.umi.count.mt',
'pct_counts_gene_group__mito_transcript': 'qc.umi.perc.mt',
'pct_counts_in_top_500_genes':'pct_counts_in_top_500_genes',
'n_genes_by_counts': 'qc.genes.detected.count',
'Azimuth:L0_Azimuth:predicted.celltype.l2':'azimuth.celltyp.l0',
'Azimuth:L1_Azimuth:predicted.celltype.l2':'azimuth.celltyp.l1',
'Azimuth:L0_Azimuth:predicted.celltype.l2':'azimuth.celltyp.l0.mapped_fromL2',
'Azimuth:L1_Azimuth:predicted.celltype.l2':'azimuth.celltyp.l1.mapped_fromL2',
'total_counts_gene_group__mito_transcript':'total_counts_gene_group__mito_transcript',
'pct_counts_gene_group__mito_transcript':'pct_counts_gene_group__mito_transcript',
'total_counts_gene_group__mito_protein':'total_counts_gene_group__mito_protein',
@@ -91,7 +95,8 @@
'experiment_id': 'experiment.id',
'tranche.id':'tranche.id',
'chromium_run_id': 'chromium.run.id',
'chromium_lane': 'chromium.lane'
'chromium_lane': 'chromium.lane',
'instrument':'instrument'
}
COLUMNS_SCRUBLET = {
'scrublet__multiplet_scores': 'scrublet.scores',
@@ -229,8 +234,9 @@ def load_scrublet_assignments(expid, datadir_scrublet):
filpath = None
fnam = '{}{}'.format(expid, SCRUBLET_ASSIGNMENTS_FNSUFFIX)
fnam2 = '{}{}'.format(expid, SCRUBLET_ASSIGNMENTS_FNSUFFIX.replace('-',''))
fnam3 = '{}{}'.format(expid, 'scrublet.tsv.gz')
for fn in os.listdir(datadir_scrublet):
if fn == fnam or fn == fnam2:
if fn == fnam or fn == fnam2 or fn == fnam3:
filpath = os.path.join(datadir_scrublet, fn)
break

@@ -327,9 +333,15 @@ def gather_donor(donor_id, ad, ad_lane_raw, azimuth_annot, qc_obs, columns_outpu
ad.raw = ad_lane_raw[ad.obs.index, :]
if donor_id != "unassigned" and donor_id != "doublet":
# add annotation from QC
df = pandas.concat([ad.obs, azimuth_annot.loc[azimuth_annot.Donor == donor_id]], axis = 1, join = 'outer')
donor_azt = azimuth_annot.loc[azimuth_annot.Donor == donor_id]
donor_azt = donor_azt.loc[donor_azt.Exp == expid]
donor_azt['barcode'] = donor_azt.index.str.split('-').str[0]+'-'+donor_azt.index.str.split('-').str[1]
donor_azt = donor_azt.set_index('barcode')
df = pandas.concat([ad.obs, donor_azt], axis = 1, join = 'outer')
df =df.loc[:,~df.columns.duplicated()]
df = df[['experiment_id'] + list(COLUMNS_DECONV.keys()) + list(COLUMNS_AZIMUTH.keys())]
df = df[['experiment_id'] +
list(set(df.columns).intersection(set(COLUMNS_DECONV.keys()))) +
list(set(df.columns).intersection(set(COLUMNS_AZIMUTH.keys())))]
try:
df = get_lane_and_runid_from_experiment_id(df, insert_pos = 1)
except:
@@ -513,7 +525,10 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
##########################
# Scrublet
#########################
datadir_scrublet=glob.glob(f'{args.results_dir}/*/multiplet.method=scrublet')[0]
try:
datadir_scrublet=glob.glob(f'{args.results_dir}/*/multiplet.method=scrublet')[0]
except:
datadir_scrublet=glob.glob(f'{args.results_dir}/multiplet.method=scrublet')[0]
if os.path.isdir(datadir_scrublet):
# Scrublet loading QC
try:
2 changes: 1 addition & 1 deletion bin/run_celltypist.py
@@ -125,7 +125,7 @@ def run_celltypist(samplename, filtered_matrix_h5, celltypist_model,
# try the .raw.X attribute.
# If none of them fit into the desired data type or the expression matrix is not properly normalised, an error will be raised.
logging.info('... running sc.pp.normalize_total(adata, target_sum=1e4)')
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
logging.info('... running sc.pp.log1p(adata, copy = False)')
sc.pp.log1p(adata, copy = False)

17 changes: 15 additions & 2 deletions bin/scanpy_split_h5ad.py
@@ -32,7 +32,11 @@ def write_h5_out_for_ct(ad,oufn_list_AZ,oufnam,oufn_list,samples,samples_AZ,bl,c
# assumes that cells failing qc already were stripped out
# ad.obs = ad.obs[['convoluted_samplename', 'cell_passes_qc']]
# ad.var = ad.var[['feature_types', 'genome', 'gene_symbols']]
adb.obs['log10_ngenes_by_count'] = np.log10(adb.obs['n_genes_by_counts']) / np.log10(adb.obs['total_counts'])
try:
adb.obs['log10_ngenes_by_count'] = np.log10(adb.obs['n_genes_by_counts']) / np.log10(adb.obs['total_counts'])
except:
print('some cols dont exist')

adb_AZ = adb.copy()
# disable
adb_AZ.obs = pandas.DataFrame(adb_AZ.obs.index, index = adb_AZ.obs.index, columns = ["cell_barcode"])
@@ -70,7 +74,10 @@ def write_h5_out_for_ct(ad,oufn_list_AZ,oufnam,oufn_list,samples,samples_AZ,bl,c
adb_AZ.var['genome']='GRCh38'
vdf = adb_AZ.var[["feature_types", "genome"]]
vdf.insert(1,"gene_ids", vdf.index)
vdf.index = pandas.Index(adb_AZ.var['gene_symbols'].astype('str'))
try:
vdf.index = pandas.Index(adb_AZ.var['gene_symbols'].astype('str'))
except:
print('gene_symbols not available')
#ad.var = vdf.set_index("gene_symbols", drop = True, verify_integrity = False)
adb_AZ.var = vdf

@@ -103,6 +110,12 @@ def split_h5ad_by_batch(ad, oufnprfx, colnam_batch = 'batch', anndata_compressio
oufn_list_fnam = '{}_files.txt'.format(oufnprfx)
oufn_list = []
oufn_list_AZ = []
try:
print(set(ad.obs[colnam_batch]))
# here we do not have a batch id since the data is not deconvoluted yet
except:
ad.obs[colnam_batch] = oufnprfx

batch_labels = pandas.Categorical(ad.obs[colnam_batch].apply(lambda a: a.split('__')[0])) # <class 'pandas.core.series.Series'>
samples = {}
samples_AZ = {}
92 changes: 61 additions & 31 deletions bin/transfer_data.py
@@ -83,47 +83,77 @@ def main_data_colection(pipeline='',name='',directory='',input_table=None,cb_res
print('missing6')
# Fetch Gather
df_raw = pd.read_table(input_table, index_col = 'experiment_id')

# 'Note that the names for the future projects may be different - have to be handled on the Nextflow modules'
print('prepearing fetch folder')
try:
os.mkdir(f'{name_dir}/Fetch Pipeline')
except:
print('exists')
metadata_table = pd.DataFrame()
d1 = glob.glob(f"{results_dir}/*/web_summary.html")
d2 = glob.glob(f"{results_dir}/*/*/web_summary.html")
d3 = glob.glob(f"{results_dir}/*/*/*/web_summary.html")
d4 =glob.glob(f"{results_dir}/*/*/*/*/web_summary.html")
ts3 = [*d1, *d2,*d3,*d4]
ts3 = pd.DataFrame(ts3,columns=['col'])
for folder in df_raw.index:
dir3 = f"{df_raw.loc[folder, 'data_path_10x_format']}"
try:
copyfile(f'{dir3}/web_summary.html', f'{name_dir}/Fetch Pipeline/html_{folder}.html')
metadata = pd.read_csv(f'{dir3}/metrics_summary.csv',sep=',',index_col=False)
metadata['Sample_id']=folder

ts3 = []
c1 =0
for i,p2 in df_raw.iterrows():
# print(p1)
p1 = p2['data_path_10x_format']

d1 = glob.glob(f"{p1}/*/web_summary.html")
d2 = glob.glob(f"{p1}/*/*/web_summary.html")
d3 = glob.glob(f"{p1}/*/*/*/web_summary.html")
d4 =glob.glob(f"{p1}/*/*/*/*/web_summary.html")
ts3 = [ *d1, *d2,*d3,*d4]
for t1 in ts3:
copyfile(t1, f'{name_dir}/Fetch Pipeline/html_{i}.html')

d1 = glob.glob(f"{p1}/*/metrics_summary.csv")
d2 = glob.glob(f"{p1}/*/*/metrics_summary.csv")
d3 = glob.glob(f"{p1}/*/*/*/metrics_summary.csv")
d4 =glob.glob(f"{p1}/*/*/*/*/metrics_summary.csv")
ts3 = [ *d1, *d2,*d3,*d4]
for t1 in ts3:
metadata = pd.read_csv(t1,sep=',',index_col=False)
metadata['Sample_id']=i
metadata.set_index('Sample_id',drop=True,inplace=True)
metadata_table=pd.concat([metadata_table,metadata])
except:
try:
try:
copyfile(f'{results_dir}/handover/{name_dir}/Fetch Pipeline/html_{folder}.html', f'{name_dir}/Fetch Pipeline/html_{folder}.html')
except:
try:
copyfile(ts3[ts3['col'].str.contains(f'{folder}')]['col'].values[0], f'{name_dir}/Fetch Pipeline/html_{folder}.html')
except:
copyfile(f'/lustre/scratch123/hgi/projects/ukbb_scrna/pipelines/Pilot_UKB/qc/{name}/results_rsync2/results/handover/{name_dir}/Fetch Pipeline/html_{folder}.html', f'{name_dir}/Fetch Pipeline/html_{folder}.html')
except:
_='pass'
try:
metadata_table = pd.read_csv(f'{results_dir}/handover/{name_dir}/Fetch Pipeline/CSV/Submission_Data_Pilot_UKB.file_metadata.tsv',sep='\t')
metadata_table.set_index('Sample_id',drop=True,inplace=True)
if all(metadata_table.columns == metadata.columns):
metadata_table=pd.concat([metadata_table,metadata.T])
else:
metadata_table=pd.concat([metadata_table,metadata])
except:
try:
metadata_tablemetadata_table = pd.read_csv(f'{results_dir}/handover/{name_dir}/Fetch Pipeline/Submission_Data_Pilot_UKB.file_metadata.tsv',sep=',')
metadata_table.set_index('Sample_id',drop=True,inplace=True)
except:
metadata_table = pd.DataFrame()
metadata_table=pd.concat([metadata_table,metadata])
# ts3 = pd.DataFrame(ts3,columns=['col'])

# for

# for folder in df_raw.index:
# dir3 = f"{df_raw.loc[folder, 'data_path_10x_format']}"
# try:
# copyfile(f'{dir3}/web_summary.html', f'{name_dir}/Fetch Pipeline/html_{folder}.html')
# metadata = pd.read_csv(f'{dir3}/metrics_summary.csv',sep=',',index_col=False)
# metadata['Sample_id']=folder
# metadata.set_index('Sample_id',drop=True,inplace=True)
# metadata_table=pd.concat([metadata_table,metadata])
# except:
# try:
# try:
# copyfile(f'{results_dir}/handover/{name_dir}/Fetch Pipeline/html_{folder}.html', f'{name_dir}/Fetch Pipeline/html_{folder}.html')
# except:
# try:
# copyfile(ts3[ts3['col'].str.contains(f'{folder}')]['col'].values[0], f'{name_dir}/Fetch Pipeline/html_{folder}.html')
# except:
# copyfile(f'/lustre/scratch123/hgi/projects/ukbb_scrna/pipelines/Pilot_UKB/qc/{name}/results_rsync2/results/handover/{name_dir}/Fetch Pipeline/html_{folder}.html', f'{name_dir}/Fetch Pipeline/html_{folder}.html')
# except:
# _='pass'
# try:
# metadata_table = pd.read_csv(f'{results_dir}/handover/{name_dir}/Fetch Pipeline/CSV/Submission_Data_Pilot_UKB.file_metadata.tsv',sep='\t')
# metadata_table.set_index('Sample_id',drop=True,inplace=True)
# except:
# try:
# metadata_tablemetadata_table = pd.read_csv(f'{results_dir}/handover/{name_dir}/Fetch Pipeline/Submission_Data_Pilot_UKB.file_metadata.tsv',sep=',')
# metadata_table.set_index('Sample_id',drop=True,inplace=True)
# except:
# metadata_table = pd.DataFrame()
try:
os.mkdir(f'{name_dir}/Fetch Pipeline/CSV')
except:
2 changes: 1 addition & 1 deletion conf/modules.conf
@@ -235,7 +235,7 @@ process {
}

withName: CONCORDANCE_CALCLULATIONS{
cpus = 10
cpus = { 10 * task.attempt }
time = { 24.h * task.attempt }
memory = { 80.GB * task.attempt }
}
2 changes: 1 addition & 1 deletion modules/nf-core/modules/gather_data/main.nf
@@ -20,7 +20,7 @@ process GATHER_DATA{
output:
path("${subdir}", emit:outfiles_dataset)
path("${subdir}_summary", emit:outfiles_dataset2)

path("Donor_Quantification/*/*.tsv", emit: barcodes_files)
val(outdir, emit: outdir_dataset)

script:
2 changes: 1 addition & 1 deletion modules/nf-core/modules/split_batch_h5ad/main.nf
@@ -11,7 +11,7 @@ process SPLIT_BATCH_H5AD {
}

input:
path(file__anndata) // anndata h5ad file seurat_azimuth_pbmc_1.0
tuple val(name), path(file__anndata) // anndata h5ad file seurat_azimuth_pbmc_1.0
val(mode)

output:
modules/nf-core/modules/subset_bam_per_barcodes_and_variants/main.nf
@@ -23,4 +23,40 @@ process SUBSET_BAM_PER_BARCODES_AND_VARIANTS {
filter_bam_file_for_popscle_dsc_pileup.sh ${bam} ${barcodes} srt_${vcf} ${sample}_filtered.bam
"""

}


process SUBSET_BAM_PER_BARCODES{


tag "${samplename}"
label 'process_low'

if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "/lustre/scratch123/hgi/projects/ukbb_scrna/pipelines/Pilot_UKB/carls_data/analysis/bam_tool_processing_05_04_2024.sif"
} else {
container "mercury/bam_tool_processing:05_04_2024"
}

publishDir path: "${params.outdir}/handover/Donor_Quantification/${sample}",
mode: "${params.copy_mode}",
overwrite: "true"

input:

// CRD_CMB12979963, /lustre/scratch123/hgi/mdt2/teams/hgi/mo11/tmp_projects/cardinal/qc_F3/UKBB_ELGH_5th_July_2022/work/5c/dada93d9cbebef97d30e4014e131cf/input__CRD_CMB12979963/possorted_genome_bam.bam, /lustre/scratch123/hgi/mdt2/teams/hgi/mo11/tmp_projects/cardinal/qc_F3/UKBB_ELGH_5th_July_2022/work/5c/dada93d9cbebef97d30e4014e131cf/input__CRD_CMB12979963/possorted_genome_bam.bam.bai, /lustre/scratch123/hgi/mdt2/teams/hgi/mo11/tmp_projects/cardinal/qc_F3/UKBB_ELGH_5th_July_2022/results/nf-preprocessing/cellbender/CRD_CMB12979963/cellbender-epochs_250__learnrt_0pt000005__zdim_100__zlayer_500__lowcount_10/cellbender-FPR_0pt1-filtered_10x_mtx/barcodes.tsv.gz
tuple val(sample), val(sample_donor),path(barcodes), path(bam), path(bai), path(cellbender_barcodes)
path(genome)
output:
tuple val(sample),path("${sample_donor}.cram"), emit: cram_files
// tuple val(sample), val("${donor}") ,path("${sample}_filtered.bam"),path("${sample}_filtered.bam.csi"), path(barcodes), emit: freebayes_input

script:

"""
samtools view --threads ${task.cpus} --tag-file CB:${barcodes} \
--cram -T ${genome}/genome.fa \
-o ${sample_donor}.cram ${bam}
"""

}
3 changes: 2 additions & 1 deletion subworkflows/celltype.nf
@@ -16,7 +16,8 @@ workflow celltype{
main:

log.info '---Splitting the assignment for each batch---'
file__anndata_merged.map{val1 -> tuple('full_ct', val1)}.set{out1}
// file__anndata_merged.map{val1 -> tuple('full_ct', val1)}.set{out1}
file__anndata_merged.subscribe { println "file__anndata_merged: $it" }
SPLIT_BATCH_H5AD(file__anndata_merged,params.split_ad_per_bach)

// Here we may want to not split it and just pass in an entire h5ad file for annotations.
22 changes: 12 additions & 10 deletions subworkflows/data_handover.nf
@@ -2,6 +2,7 @@ include { GATHER_DATA; SPLIT_DATA_BY_STUDY} from "$projectDir/modules/nf-core/m
include { ENCRYPT_DIR; ENCRYPT_TARGET } from "$projectDir/modules/local/encrypt"
include { TRANSFER;SUMMARY_STATISTICS_PLOTS } from "$projectDir/modules/nf-core/modules/summary_statistics_plots/main"
include { split_bam_by_donor } from "$projectDir/modules/local/cellranger_bam_per_donor"
include { SUBSET_BAM_PER_BARCODES } from "$projectDir/modules/nf-core/modules/subset_bam_per_barcodes_and_variants/main"

workflow data_handover{
take:
@@ -10,23 +11,24 @@ workflow data_handover{
qc_input
ch_poolid_csv_donor_assignments
sample_possorted_bam_vireo_donor_ids

genome
main:
log.info 'running data handover'
// outdir = file(outdir)
// outdir = file("${launchDir}/${outdir}").ifEmpty(outdir)






GATHER_DATA(outdir,qc_input.collect(),input_channel)


if (params.split_bam){
split_bam_by_donor(sample_possorted_bam_vireo_donor_ids, params.reference_assembly_fasta_dir_bam_split)
ENCRYPT_TARGET(split_bam_by_donor.out.possorted_cram_files)
cram_encrypted_dirs = ENCRYPT_TARGET.out.encrypted_dir
// val(sample), path(barcodes), path(bam)
GATHER_DATA.out.barcodes_files.flatten().map{sample -> tuple("${sample}".replaceFirst(/.*\//,"").replaceFirst(/\..*/,""),"${sample}".replaceFirst(/.*\//,"").replaceFirst(/\.tsv.*/,""),sample)}.set{barcodes}
barcodes.combine(sample_possorted_bam_vireo_donor_ids, by: 0).set{full_split_chanel_input}
// full_split_chanel_input.subscribe { println "full_split_chanel_input: $it" }
// genome.subscribe { println "genome: $it" }
SUBSET_BAM_PER_BARCODES(full_split_chanel_input,genome)
// split_bam_by_donor(sample_possorted_bam_vireo_donor_ids, genome)
// ENCRYPT_TARGET(split_bam_by_donor.out.possorted_cram_files)
// cram_encrypted_dirs = ENCRYPT_TARGET.out.encrypted_dir
} else {
cram_encrypted_dirs = Channel.empty()
}
14 changes: 5 additions & 9 deletions subworkflows/main_deconvolution.nf
@@ -17,7 +17,7 @@ include { match_genotypes } from './match_genotypes'
include {ENHANCE_STATS_GT_MATCH } from "$projectDir/modules/nf-core/modules/genotypes/main"
include {SUBSET_WORKF} from "$projectDir/modules/nf-core/modules/subset_genotype/main"
include {REPLACE_GT_DONOR_ID2; REPLACE_GT_DONOR_ID2 as REPLACE_GT_DONOR_ID_SUBS } from "$projectDir/modules/nf-core/modules/genotypes/main"
include { RETRIEVE_RECOURSES; STAGE_FILE } from "$projectDir/subworkflows/local/retrieve_recourses"
include { STAGE_FILE } from "$projectDir/subworkflows/local/retrieve_recourses"
include {GT_MATCH_POOL_IBD } from "$projectDir/modules/nf-core/modules/genotypes/main"
include { MATCH_GT_VIREO } from "$projectDir/modules/nf-core/modules/genotypes/main"

@@ -42,16 +42,12 @@ workflow main_deconvolution {
ch_experiment_donorsvcf_donorslist
scrublet_paths
vcf_input
genome

main:
log.info "#### running DECONVOLUTION workflow #####"

if (params.reference_assembly_fasta_dir=='https://yascp.cog.sanger.ac.uk/public/10x_reference_assembly'){
RETRIEVE_RECOURSES()
genome = RETRIEVE_RECOURSES.out.reference_assembly
}else{
genome = "${params.reference_assembly_fasta_dir}"
}

vcf_candidate_snps = STAGE_FILE(params.cellsnp.vcf_candidate_snps)
// genome.subscribe { println "genome: $it" }

@@ -303,10 +299,10 @@ workflow main_deconvolution {
// .set { gt_math_pool_against_panel_input2 }
FREEBAYES.out.gt_pool.groupTuple(by:0).set{fbb1}

fbb1.subscribe { println "fbb1: $it" }
// fbb1.subscribe { println "fbb1: $it" }
MERGE_GENOTYPES_IN_ONE_VCF_IDX_PAN(fbb1,'freebayes')
MERGE_GENOTYPES_IN_ONE_VCF_IDX_PAN.out.gt_pool.groupTuple(by:0).set{fbb2}
fbb2.subscribe { println "fbb2: $it" }
// fbb2.subscribe { println "fbb2: $it" }
MERGE_GENOTYPES_IN_ONE_VCF_FREEBAYES(fbb2,'freebayes')
gt_math_pool_against_panel_input2 = MERGE_GENOTYPES_IN_ONE_VCF_FREEBAYES.out.gt_pool.combine(ch_ref_vcf)
// cell_assignments =
