dynamic SNP fix for / | genotypes
Matiss Ozols committed Aug 1, 2024
1 parent 4586eae commit 56eac6f
Showing 9 changed files with 493 additions and 1,604 deletions.
1,962 changes: 413 additions & 1,549 deletions bin/combine_concordance.py

Large diffs are not rendered by default.

7 changes: 6 additions & 1 deletion bin/concordance_calculations.py
@@ -975,7 +975,12 @@ def set_results(self,to_set,id):


def combine_concordances(self,result,other_donor_concordance,donor_gt_match,analyse_donor):
pd.DataFrame(other_donor_concordance).sort_values(by=['cell']).to_csv(f'{donor_gt_match}-{analyse_donor}--each_cells_comparison_with_other_donor.tsv',sep='\t',index=False)
try:
pd.DataFrame(other_donor_concordance).sort_values(by=['cell']).to_csv(f'{donor_gt_match}-{analyse_donor}--each_cells_comparison_with_other_donor.tsv',sep='\t',index=False)
except:
print('We do not have any cells to analyse for this donor')
pd.DataFrame(other_donor_concordance).to_csv(f'{donor_gt_match}-{analyse_donor}--each_cells_comparison_with_other_donor.tsv',sep='\t',index=False)

self.cell_concordance_table = {**self.cell_concordance_table, **result}

def combine_dict(self,cell_concordance_table,result):
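For context on the try/except added above: when a donor has no cells, the per-cell comparison table is empty and has no 'cell' column, so sorting it fails and the fallback writes the table unsorted. A minimal sketch of that failure mode (toy data and file name, not from the repository; the committed code uses a bare except, which also covers this case):

```python
import pandas as pd

other_donor_concordance = []  # hypothetical: donor with no cells to analyse
df = pd.DataFrame(other_donor_concordance)

try:
    # An empty frame has no 'cell' column, so sort_values raises KeyError
    df.sort_values(by=['cell']).to_csv('donor--each_cells_comparison.tsv', sep='\t', index=False)
except KeyError:
    print('We do not have any cells to analyse for this donor')
    df.to_csv('donor--each_cells_comparison.tsv', sep='\t', index=False)
```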
3 changes: 2 additions & 1 deletion bin/dynamic_donor_exclusive_snp_selection.py
@@ -339,7 +339,8 @@ def donor_exclusive_sites(exclusive_don_variants2):
subs['full'] = subs['full'].str.replace(".|.",';', regex=False).str.replace(";+",';')
subs['full'] = subs['full'].str.replace("./.",';', regex=False).str.replace(";+",';')
subs['full'] = subs['full'].str.replace(".",';', regex=False).str.replace(";+",';')

subs['full'] = subs['full'].str.replace("/",'|', regex=False)

# all informative indexes
# now we need to locate which variants actually has a change in the genotype.
all_informative_site_index = set()
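The new line above normalises unphased '/' genotype separators to the phased '|' form, so that for example '0/1' and '0|1' compare as the same call once the missing-call patterns ('.|.', './.', '.') have been collapsed to ';'. A small illustrative sketch with made-up genotype strings (regex=True is spelled out explicitly here for the ';+' collapses; the committed code relies on the pandas default):

```python
import pandas as pd

# Hypothetical genotype strings as they might appear in the 'full' column
subs = pd.DataFrame({'full': ['0/1;.|.;1|1', './.;0/0', '0|1;.']})

subs['full'] = subs['full'].str.replace('.|.', ';', regex=False).str.replace(';+', ';', regex=True)
subs['full'] = subs['full'].str.replace('./.', ';', regex=False).str.replace(';+', ';', regex=True)
subs['full'] = subs['full'].str.replace('.', ';', regex=False).str.replace(';+', ';', regex=True)
subs['full'] = subs['full'].str.replace('/', '|', regex=False)  # the separator fix added in this commit

print(subs['full'].tolist())
# ['0|1;1|1', ';0|0', '0|1;'] under these assumed inputs
```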
56 changes: 38 additions & 18 deletions bin/gather_minimal_dataset.py
@@ -98,11 +98,17 @@
'chromium_lane': 'chromium.lane',
'instrument':'instrument'
}

COLUMNS_SCRUBLET = {
'scrublet__multiplet_scores': 'scrublet.scores',
'scrublet__predicted_multiplet': 'scrublet.multiplet',
'scrublet__multiplet_zscores': 'scrublet.zscores'
'scrublet__multiplet_zscores': 'scrublet.zscores',
'scds_DropletType':'scds.multiplet','scds_score':'scds.score',
'scDblFinder_DropletType':'scDblFinder.multiplet','scDblFinder_Score':'scDblFinder.score',
'DoubletDecon_DropletType':'DoubletDecon.multiplet',
'DoubletFinder_DropletType':'DoubletFinder.multiplet','DoubletFinder_score':'DoubletFinder.score'
}

COLUMNS_OUTPUT = \
{**COLUMNS_DATASET, **COLUMNS_CELLBENDER, **COLUMNS_DECONV, **COLUMNS_QC, **COLUMNS_AZIMUTH}
COLUMNS_OUTPUT_WITH_SCRUBLET = \
@@ -354,7 +360,7 @@ def gather_donor(donor_id, ad, ad_lane_raw, azimuth_annot, qc_obs, columns_outpu
dt = pandas.concat([df,dfqc], axis = 1, join = 'inner')

colnams = list(columns_output.keys())
colnams_overlap = set(colnams).intersection(set(dt.columns))
colnams_overlap = sorted(set(colnams).intersection(set(dt.columns)))
ad.obs = dt[colnams_overlap].rename(columns = columns_output)
dt = pandas.concat([df, dfqc], axis = 1, join = 'outer')[colnams_overlap]
dt.rename(columns = columns_output, inplace = True)
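A brief note on the sorted() wrapper added above: Python set iteration order is not stable between runs, so sorting the intersection keeps the selected column order, and therefore the written output, reproducible. A toy sketch (column names are invented):

```python
colnams = ['experiment_id', 'cell_passes_qc', 'total_counts']  # hypothetical columns_output keys
dt_columns = ['total_counts', 'experiment_id', 'n_genes']      # hypothetical dt columns

colnams_overlap = sorted(set(colnams).intersection(set(dt_columns)))
print(colnams_overlap)  # ['experiment_id', 'total_counts'] -- same order every run
```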
@@ -377,9 +383,11 @@ def gather_donor(donor_id, ad, ad_lane_raw, azimuth_annot, qc_obs, columns_outpu

dt.index.name = 'barcode'
ad.obs.index.name = 'barcode'
dt.to_csv(os.path.join(outdir, oufnam + '.tsv'), sep = "\t", na_rep = "N/A")
sys.stderr.write("writing file {} ...\n".format(oufnam))
ad.obs = ad.obs.loc[:,~ad.obs.columns.duplicated()]
dt = dt.loc[:,~dt.columns.duplicated()].copy()
dt[set(dt.columns)].to_csv(os.path.join(outdir, oufnam + '.tsv'), sep = "\t", na_rep = "N/A")
sys.stderr.write("writing file {} ...\n".format(oufnam))

if write_h5:
path1=os.path.join(outdir, oufnam + '.h5ad')
try:
Expand Down Expand Up @@ -505,6 +513,13 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
#############

azt = pd.read_csv(f'{args.results_dir}/celltype/All_Celltype_Assignments.tsv',sep='\t',index_col=0)
azt_cols_to_add = azt.columns[azt.columns.str.contains('Azimuth')]
ct_cols_to_add = azt.columns[azt.columns.str.contains('Celltypist')]
for i3 in set(azt_cols_to_add) - set(columns_output.keys()):
columns_output = {**columns_output, **{i3:i3}}
for i3 in set(ct_cols_to_add) - set(columns_output.keys()):
columns_output = {**columns_output, **{i3:i3}}
# scpred_to_add = azt.columns[azt.columns.str.contains('Scpred')]
##########################
# Scrublet
#########################
@@ -516,21 +531,26 @@ def gather_pool(expid, args, df_raw, df_cellbender, adqc, oufh = sys.stdout,lane
d2 = pd.read_csv(f1,sep='\t')
d2['Exp']=pool_name
doublet_data_combined = pd.concat([doublet_data_combined,d2])
doublet_data_combined = doublet_data_combined.drop_duplicates(subset='barcodes')
scb = doublet_data_combined.set_index('barcodes')
columns_output = {**columns_output, **COLUMNS_SCRUBLET}

datadir_scrublet=glob.glob(f'{args.results_dir}/*/multiplet.method=scrublet')[0]
if os.path.isdir(datadir_scrublet):
# Scrublet loading QC
try:
scb = load_scrublet_assignments(
expid,
datadir_scrublet=datadir_scrublet
)
columns_output = {**columns_output, **COLUMNS_SCRUBLET}
except:
print('Scrubblet was not performed for this pool - potential reason is that there are not enough cells for assignment')
scb = None
else:
scb = None
# doublet_data_combined.iloc[0]
# datadir_scrublet=glob.glob(f'{args.results_dir}/*/multiplet.method=scrublet')[0]
# if os.path.isdir(datadir_scrublet):
# # Scrublet loading QC
# try:
# scb = load_scrublet_assignments(
# expid,
# datadir_scrublet=datadir_scrublet
# )
# columns_output = {**columns_output, **COLUMNS_SCRUBLET}
# scb = pd.concat([scb,doublet_data_combined.loc[scb.index]],axis=1)
# except:
# print('Scrubblet was not performed for this pool - potential reason is that there are not enough cells for assignment')
# scb = None
# else:
# scb = None


############################################################
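One of the smaller changes in this file drops duplicated columns from both ad.obs and dt before the per-donor TSV is written. A minimal sketch of the ~columns.duplicated() idiom it relies on (toy frames, not pipeline data):

```python
import pandas as pd

# Concatenating QC and annotation tables that share a column name yields duplicate columns
dt = pd.concat(
    [pd.DataFrame({'total_counts': [100]}),
     pd.DataFrame({'total_counts': [100], 'n_genes': [42]})],
    axis=1,
)
print(list(dt.columns))                      # ['total_counts', 'total_counts', 'n_genes']

dt = dt.loc[:, ~dt.columns.duplicated()].copy()
print(list(dt.columns))                      # ['total_counts', 'n_genes'] -- first occurrence kept
```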
4 changes: 2 additions & 2 deletions bin/generate_combined_celltype_anotation_file.py
@@ -146,8 +146,8 @@ def main():

# ad2 = adatasets2[0].concatenate(*adatasets2[1:])
# ad = scanpy.read(adata)
ad.obs = ad.obs.merge(Data_All, left_index=True, right_index=True)

ad.obs = ad.obs.merge(Data_All, left_index=True, right_index=True, how='left')
# set(ad.obs.index)-set(Data_All.index)
donor_celltype_report={}
tranche_exp_report={}
for id1 in set(Data_All['Exp']):
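The how='left' added above keeps every cell in ad.obs even when it has no row in Data_All; with the previous default inner join, unannotated barcodes were silently dropped. A toy sketch of the difference (indexes and column names are invented):

```python
import pandas as pd

obs = pd.DataFrame({'n_genes': [500, 300]}, index=['AAAC-1', 'TTTG-1'])
Data_All = pd.DataFrame({'Azimuth_predicted_celltype': ['CD4 T']}, index=['AAAC-1'])

inner = obs.merge(Data_All, left_index=True, right_index=True)               # drops 'TTTG-1'
left = obs.merge(Data_All, left_index=True, right_index=True, how='left')    # keeps both; NaN annotation for 'TTTG-1'
print(len(inner), len(left))  # 1 2
```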
5 changes: 4 additions & 1 deletion conf/base.conf
@@ -69,6 +69,7 @@ params{
split_ad_per_bach = true
cellbender_resolution_to_use='0pt1'
reference_assembly_fasta_dir = " /nfs/srpipe_references/downloaded_from_10X/refdata-gex-GRCh38-2020-A/fasta/"
//# reference_assembly_fasta_dir = "https://yascp.cog.sanger.ac.uk/public/10x_reference_assembly"
webtransfer = false
project_name = 'Cardinal_pilots'
run_with_genotype_input=false
@@ -208,7 +209,7 @@ process {
cpus = { 1 * task.attempt }
}
withLabel:many_cores_small_mem {
cpus = { 20 * task.attempt }
cpus = { 10 * task.attempt }
memory = { 20.GB * task.attempt }
time = { 12.h * task.attempt }
}
@@ -219,6 +220,8 @@

withName: SUBSET_GENOTYPE2{
memory = { 200.MB * task.attempt}
cpus = { 1 * task.attempt }
time = { 1.h * task.attempt }
}

withLabel:process_high {
10 changes: 5 additions & 5 deletions conf/modules.conf
@@ -143,7 +143,7 @@ process {
}

withName: SUBSET_GENOTYPE2{
cpus = 2
cpus = 1
memory = { 1.GB * task.attempt }
time = { 12.h * task.attempt }
maxRetries = 3
@@ -241,9 +241,9 @@ process {
}

withName: CONCORDANCE_CALCLULATIONS{
cpus = { 30 * task.attempt }
cpus = { 10 * task.attempt }
time = { 24.h * task.attempt }
memory = { 80.GB * task.attempt }
memory = { 100.GB * task.attempt }
}

withName: OTHER_DONOR_CONCORDANCE_CALCLULATIONS{
@@ -258,8 +258,8 @@ process {
}

withName: DYNAMIC_DONOR_EXCLUSIVE_SNP_SELECTION{
cpus = 5
time = { 12.h * task.attempt }
cpus = 1
time = { 2.h * task.attempt }
memory = { 20.GB * task.attempt }
}

5 changes: 2 additions & 3 deletions modules/nf-core/modules/cellsnp/main.nf
@@ -29,9 +29,8 @@ process DYNAMIC_DONOR_EXCLUSIVE_SNP_SELECTION{
} else {
container "mercury/scrna_deconvolution:62bd56a"
}
publishDir path: "${params.outdir}/concordances/${samplename}",
mode: "${params.copy_mode}",
overwrite: "true"
publishDir "${params.outdir}/cellsnp/cellsnp_${samplename}", mode: "${params.copy_mode}", pattern: "cellsnp_${samplename}", overwrite: true

input:
val(add_dynamic_sites_or_not_to_panel)
tuple val(samplename), path(vcf_file),path(csi),path(cellsnp_primary_file)
45 changes: 21 additions & 24 deletions workflows/yascp.nf
@@ -47,33 +47,30 @@ workflow YASCP {
}

if (!params.input_data_table.contains('fake_file')){

// vcf_input.subscribe { println "vcf_input: $it" }
// ###################################
// ################################### Readme
// AMBIENT RNA REMOVAL USING CELLBENDER
// There are 2 modes of running YASCP pipeline:
// (option 1) users can run it from existing cellbender if the analysis has already been performed by providing a parth to existing cellbender files : note a specific folder structure is required
// (option 2) users can run it from cellranger - skipping the cellbender. params.input == 'cellranger'
// ###################################
// ###################################
prepare_inputs(input_channel)
channel__file_paths_10x=prepare_inputs.out.channel__file_paths_10x
channel__file_paths_10x_single=prepare_inputs.out.ch_experimentid_paths10x_filtered
input_channel = prepare_inputs.out.channel_input_data_table
if (params.reference_assembly_fasta_dir=='https://yascp.cog.sanger.ac.uk/public/10x_reference_assembly'){
prepare_inputs(input_channel)
channel__file_paths_10x=prepare_inputs.out.channel__file_paths_10x
channel__file_paths_10x_single=prepare_inputs.out.ch_experimentid_paths10x_filtered
input_channel = prepare_inputs.out.channel_input_data_table
if (params.reference_assembly_fasta_dir=='https://yascp.cog.sanger.ac.uk/public/10x_reference_assembly'){
RETRIEVE_RECOURSES()
genome = RETRIEVE_RECOURSES.out.reference_assembly
genome1 = RETRIEVE_RECOURSES.out.reference_assembly
}else{
genome = "${params.reference_assembly_fasta_dir}"
}

chanel_cr_outs = prepare_inputs.out.chanel_cr_outs
channel_dsb = prepare_inputs.out.channel_dsb
genome1 = "${params.reference_assembly_fasta_dir}"
}
genome = PREPROCESS_GENOME(genome1)

chanel_cr_outs = prepare_inputs.out.chanel_cr_outs
channel_dsb = prepare_inputs.out.channel_dsb
}
vireo_paths = Channel.from("$projectDir/assets/fake_file.fq")
matched_donors = Channel.from("$projectDir/assets/fake_file.fq")

vireo_paths = Channel.from("$projectDir/assets/fake_file.fq")
matched_donors = Channel.from("$projectDir/assets/fake_file.fq")

ch_poolid_csv_donor_assignments = Channel.empty()
bam_split_channel = Channel.of()
out_ch = params.outdir
? Channel.fromPath(params.outdir, checkIfExists:true)
: Channel.from("${launchDir}/${params.outdir}")

if(!params.just_reports){
// sometimes we just want to rerun report generation as a result of alterations, hence if we set params.just_reports =True pipeline will use the results directory and generate a new reports.

Expand Down
