Skip to content

Commit

Permalink
minor python lib fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
sigven committed Dec 30, 2023
1 parent b9bfd1e commit 30df312
Show file tree
Hide file tree
Showing 10 changed files with 92 additions and 105 deletions.
4 changes: 2 additions & 2 deletions pcgr/cna.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,15 +142,15 @@ def annotate_cna_segments(output_fname: str,

## Mark copy number amplifications (threshold defined by user) in input
cna_query_segment_df['aberration_key'] = 'nan'
cna_query_segment_df['loss_cond'] = True
cna_query_segment_df['amp_cond'] = True
cna_query_segment_df.loc[cna_query_segment_df['n_major'] + cna_query_segment_df['n_minor'] < n_copy_amplifications,"amp_cond"] = False
cna_query_segment_df.loc[cna_query_segment_df['n_major'] + cna_query_segment_df['n_minor'] >= n_copy_amplifications,"amp_cond"] = True

cna_query_segment_df.loc[cna_query_segment_df.amp_cond, 'aberration_key'] = \
cna_query_segment_df.loc[cna_query_segment_df.amp_cond, 'entrezgene'].astype(str) + '_amplification'

## Mark homozygous deletions in input
cna_query_segment_df['amp_cond'] = True
cna_query_segment_df['loss_cond'] = True
cna_query_segment_df.loc[cna_query_segment_df['n_major'] + cna_query_segment_df['n_minor'] > 0,"loss_cond"] = False
cna_query_segment_df.loc[cna_query_segment_df['n_major'] + cna_query_segment_df['n_minor'] == 0,"loss_cond"] = True

Expand Down
5 changes: 2 additions & 3 deletions pcgr/maf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@
import gzip
import pandas as pd

from pcgr import utils
from pcgr.utils import getlogger, error_message, warn_message, check_file_exists
from pcgr.utils import check_file_exists, remove_file

def update_maf_allelic_support(maf_tmp_fname: str,
maf_fname: str,
Expand Down Expand Up @@ -90,7 +89,7 @@ def update_maf_allelic_support(maf_tmp_fname: str,
f.write(f'{header_line}\n')
f.close()
raw_maf_data.to_csv(maf_fname, sep="\t", index=False, mode='a')
utils.remove(maf_tmp_fname)
remove_file(maf_tmp_fname)



Expand Down
51 changes: 25 additions & 26 deletions pcgr/main.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,17 @@
#!/usr/bin/env python

from pcgr import pcgr_vars, arg_checker, config, utils, variant, cna, vep
from pcgr.utils import getlogger, check_subprocess
from pcgr.config import populate_config_data
from pcgr import pcgr_vars, arg_checker, utils, cna
from pcgr.utils import getlogger, check_subprocess, remove_file, random_id_generator
from pcgr.config import populate_config_data, create_config
from pcgr.maf import update_maf_allelic_support

from pcgr.vep import get_command
from pcgr.variant import clean_annotations, set_allelic_support, append_annotations, calculate_tmb

import re
import argparse
import pandas
import yaml
import os
import sys
import getpass
import platform
from glob import glob
from argparse import RawTextHelpFormatter

Expand Down Expand Up @@ -134,7 +132,7 @@ def cli():
arg_checker.check_args(arg_dict)

# create config options
conf_options = config.create_config(arg_dict, workflow = "PCGR")
conf_options = create_config(arg_dict, workflow = "PCGR")

# Verify existence of input files
pcgr_paths = arg_checker.verify_input_files(arg_dict)
Expand Down Expand Up @@ -199,13 +197,14 @@ def run_pcgr(pcgr_paths, conf_options):

check_subprocess(logger, f'mkdir -p {output_dir}', debug)

random_id = random_id_generator(15)
# Define temporary output file names
input_vcf_validated = f'{conf_options["sample_id"]}.pcgr_ready.vcf.gz'
input_vcf_validated_uncompr = f'{conf_options["sample_id"]}.pcgr_ready.vcf'
vep_vcf = f'{conf_options["sample_id"]}.vep.vcf'
vep_vcfanno_vcf = f'{conf_options["sample_id"]}.vep.vcfanno.vcf'
vep_vcfanno_summarised_vcf = f'{conf_options["sample_id"]}.vep.vcfanno.summarised.vcf'
vep_vcfanno_summarised_pass_vcf = f'{conf_options["sample_id"]}.vep.vcfanno.summarised.pass.vcf'
input_vcf_validated = f'{conf_options["sample_id"]}.{random_id}.pcgr_ready.vcf.gz'
input_vcf_validated_uncompr = f'{conf_options["sample_id"]}.{random_id}.pcgr_ready.vcf'
vep_vcf = f'{conf_options["sample_id"]}.{random_id}.vep.vcf'
vep_vcfanno_vcf = f'{conf_options["sample_id"]}.{random_id}.vep.vcfanno.vcf'
vep_vcfanno_summarised_vcf = f'{conf_options["sample_id"]}.{random_id}.vep.vcfanno.summarised.vcf'
vep_vcfanno_summarised_pass_vcf = f'{conf_options["sample_id"]}.{random_id}.vep.vcfanno.summarised.pass.vcf'
prefix = os.path.join(output_dir, f'{conf_options["sample_id"]}.pcgr_acmg.{conf_options["genome_assembly"]}')
output_vcf = f'{prefix}.vcf.gz'
output_pass_vcf = f'{prefix}.pass.vcf.gz'
Expand Down Expand Up @@ -304,10 +303,10 @@ def run_pcgr(pcgr_paths, conf_options):
outfile.write(yaml.dump(yaml_data))
outfile.close()

vep_command = vep.get_command(file_paths = pcgr_paths,
conf_options = yaml_data,
input_vcf = input_vcf_validated,
output_vcf = vep_vcf)
vep_command = get_command(file_paths = pcgr_paths,
conf_options = yaml_data,
input_vcf = input_vcf_validated,
output_vcf = vep_vcf)

# PCGR|VEP - run consequence annotation with Variant Effect Predictor
print('----')
Expand Down Expand Up @@ -358,7 +357,7 @@ def run_pcgr(pcgr_paths, conf_options):
)
check_subprocess(logger, vcf2maf_command, debug)
if not debug:
utils.remove(output_vcf2maf_log)
remove_file(output_vcf2maf_log)

## add information on allelic support in MAF file (n_depth, n_ref_count, n_alt_count, t_depth, t_ref_count, t_alt_count)
update_maf_allelic_support(
Expand Down Expand Up @@ -435,7 +434,7 @@ def run_pcgr(pcgr_paths, conf_options):
# do not delete if debugging
if not debug:
for fn in delete_files:
utils.remove(fn)
remove_file(fn)

logger.info('Finished pcgr-summarise main command')

Expand All @@ -447,13 +446,13 @@ def run_pcgr(pcgr_paths, conf_options):
## Append additional (space-containing) annotations not suitable for VCF INFO
logger.info("Appending ClinVar traits, official gene names, and protein domain annotations")
variant_set = \
variant.append_annotations(
append_annotations(
output_pass_vcf2tsv_gz, pcgr_db_dir = pcgr_paths["db_dir"], logger = logger)
variant_set = variant.set_allelic_support(variant_set, allelic_support_tags = yaml_data["conf"]['somatic_snv']['allelic_support'])
variant_set = variant.clean_annotations(variant_set, yaml_data, germline = False, logger = logger)
variant_set.to_csv(output_pass_tsv_gz, sep="\t", compression="gzip", index=False)
variant_set = set_allelic_support(variant_set, allelic_support_tags = yaml_data["conf"]['somatic_snv']['allelic_support'])
variant_set = clean_annotations(variant_set, yaml_data, germline = False, logger = logger)
variant_set.fillna('.').to_csv(output_pass_tsv_gz, sep="\t", compression="gzip", index=False)
if not debug:
utils.remove(output_pass_vcf2tsv_gz)
remove_file(output_pass_vcf2tsv_gz)

if yaml_data["conf"]['assay_properties']['type'] == 'WGS' or yaml_data["conf"]['assay_properties']['type'] == 'WES':
# check that output file exist
Expand Down Expand Up @@ -499,7 +498,7 @@ def run_pcgr(pcgr_paths, conf_options):

if yaml_data['conf']['somatic_snv']['tmb']['run'] == 1:
logger_tmb = getlogger('pcgr-calculate-tmb')
variant.calculate_tmb(
calculate_tmb(
variant_set = variant_set,
tumor_dp_min = int(yaml_data['conf']['somatic_snv']['tmb']['tmb_dp_min']),
tumor_af_min = float(yaml_data['conf']['somatic_snv']['tmb']['tmb_af_min']),
Expand Down
32 changes: 15 additions & 17 deletions pcgr/oncogenicity.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,11 +120,11 @@ def assign_oncogenicity_evidence(rec = None, tumortype = "Any"):
"LOSS_OF_FUNCTION",
"INTRON_POSITION",
"EXON_POSITION",
"gnomAD_EAS_AF",
"gnomAD_NFE_AF",
"gnomAD_AFR_AF",
"gnomAD_AMR_AF",
"gnomAD_SAS_AF",
"gnomADe_EAS_AF",
"gnomADe_NFE_AF",
"gnomADe_AFR_AF",
"gnomADe_AMR_AF",
"gnomADe_SAS_AF",
"DBNSFP_SIFT",
"DBNSFP_PROVEAN",
"DBNSFP_META_RNN",
Expand Down Expand Up @@ -152,7 +152,7 @@ def assign_oncogenicity_evidence(rec = None, tumortype = "Any"):
else:
if rec.INFO.get(col) == '':
variant_data[col] = True
else:
else:
variant_data[col] = rec.INFO.get(col)

for code in clingen_vicc_ev_codes:
Expand Down Expand Up @@ -246,27 +246,27 @@ def assign_oncogenicity_evidence(rec = None, tumortype = "Any"):
variant_data['CLINGEN_VICC_OP3'] = True


if "gnomAD_EAS_AF" in variant_data.keys() and \
"gnomAD_SAS_AF" in variant_data.keys() and \
"gnomAD_AMR_AF" in variant_data.keys() and \
"gnomAD_AFR_AF" in variant_data.keys() and \
"gnomAD_NFE_AF" in variant_data.keys():
if "gnomADe_EAS_AF" in variant_data.keys() and \
"gnomADe_SAS_AF" in variant_data.keys() and \
"gnomADe_AMR_AF" in variant_data.keys() and \
"gnomADe_AFR_AF" in variant_data.keys() and \
"gnomADe_NFE_AF" in variant_data.keys():

## check if variant has MAF > 0.01 (SBVS1) or > 0.05 in any of five major gnomAD subpopulations (exome set)
for pop in ['gnomAD_SAS_AF','gnomAD_EAS_AF','gnomAD_AMR_AF','gnomAD_AFR_AF','gnomAD_NFE_AF']:
for pop in ['gnomADe_SAS_AF','gnomADe_EAS_AF','gnomADe_AMR_AF','gnomADe_AFR_AF','gnomADe_NFE_AF']:
if not variant_data[pop] is None:
## MAF for this population >= 0.05
if float(variant_data[pop]) >= 0.05:
variant_data["CLINGEN_VICC_SBVS1"] = True
for pop in ['gnomAD_SAS_AF','gnomAD_EAS_AF','gnomAD_AMR_AF','gnomAD_AFR_AF','gnomAD_NFE_AF']:
for pop in ['gnomADe_SAS_AF','gnomADe_EAS_AF','gnomADe_AMR_AF','gnomADe_AFR_AF','gnomADe_NFE_AF']:
if not variant_data[pop] is None:
## MAF for this population >= 0.01 (< 0.05)
if float(variant_data[pop]) >= 0.01 and variant_data["CLINGEN_VICC_SBVS1"] is False:
variant_data["CLINGEN_VICC_SBS1"] = True

#missing_pop_freq = 0
approx_zero_pop_freq = 0
for pop in ['gnomAD_SAS_AF','gnomAD_EAS_AF','gnomAD_AMR_AF','gnomAD_AFR_AF','gnomAD_NFE_AF']:
for pop in ['gnomADe_SAS_AF','gnomADe_EAS_AF','gnomADe_AMR_AF','gnomADe_AFR_AF','gnomADe_NFE_AF']:
## no MAF recorded in gnomAD for this population
if variant_data[pop] is None:
approx_zero_pop_freq = approx_zero_pop_freq + 1
Expand All @@ -278,6 +278,7 @@ def assign_oncogenicity_evidence(rec = None, tumortype = "Any"):
## check if variant is missing or with MAF approximately zero in all five major gnomAD subpopulations (exome set)
if approx_zero_pop_freq == 5:
variant_data["CLINGEN_VICC_OP4"] = True


## check if variant is a loss-of-function variant (LOFTEE) in a tumor suppressor gene (Cancer Gene Census/CancerMine)
if "TSG" in variant_data.keys() and \
Expand Down Expand Up @@ -397,9 +398,6 @@ def assign_oncogenicity_evidence(rec = None, tumortype = "Any"):
likely_oncogenic_lower_limit = 5
likely_oncogenic_upper_limit = 9
oncogenic_lower_limit = 10

#if variant_data['SYMBOL'] == "PIK3CA":
# print(str(variant_data))

variant_data['ONCOGENICITY_SCORE'] = onc_score_benign + onc_score_pathogenic
if variant_data['ONCOGENICITY_SCORE'] <= likely_benign_upper_limit and \
Expand Down
2 changes: 1 addition & 1 deletion pcgr/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def get_cpsr_version():
return subprocess.check_output(v_cmd, shell=True).decode("utf-8")

# https://stackoverflow.com/a/10840586/2169986
def remove(filename):
def remove_file(filename):
try:
os.remove(filename)
except OSError as e:
Expand Down
Loading

0 comments on commit 30df312

Please sign in to comment.