Skip to content

Commit

Permalink
code clean-up, arg checks
Browse files Browse the repository at this point in the history
  • Loading branch information
sigven committed Apr 6, 2024
1 parent 69229a8 commit 52279c9
Show file tree
Hide file tree
Showing 24 changed files with 417 additions and 656 deletions.
448 changes: 176 additions & 272 deletions pcgr/arg_checker.py

Large diffs are not rendered by default.

26 changes: 13 additions & 13 deletions pcgr/cna.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
def annotate_cna_segments(output_fname: str,
output_dir: str,
cna_segment_file: str,
db_assembly_dir: str,
refdata_assembly_dir: str,
build: str,
sample_id: str,
n_copy_amplifications: int = 5,
Expand All @@ -30,7 +30,7 @@ def annotate_cna_segments(output_fname: str,
output_fname (str): File name of the annotated output file.
output_dir (str): Directory to save the annotated file.
cna_segment_file (str): Path to the user-provided copy number aberrations segment file.
db_assembly_dir (str): Path to the build-specific PCGR database directory.
refdata_assembly_dir (str): Path to the build-specific PCGR database directory.
build (str): Genome assembly build of input data.
sample_id( (str): Sample identifier
n_copy_amplifications (int, optional): Number of copies to consider as gains/amplifications. Defaults to 5.
Expand Down Expand Up @@ -86,7 +86,7 @@ def annotate_cna_segments(output_fname: str,

## Check that 'End' of segments do not exceed chromosome lengths
chromsizes_fname = \
os.path.join(db_assembly_dir, 'chromsize.' + build + '.tsv')
os.path.join(refdata_assembly_dir, 'chromsize.' + build + '.tsv')

check_file_exists(chromsizes_fname, logger)
chromsizes = pd.read_csv(chromsizes_fname, sep="\t", header=None, names=['Chromosome', 'ChromLength'])
Expand All @@ -110,14 +110,14 @@ def annotate_cna_segments(output_fname: str,
temp_files.append(cna_query_segment_bed.fn)

## annotate segments with cytobands
cna_query_segment_df = annotate_cytoband(cna_query_segment_bed, output_dir, db_assembly_dir, logger)
cna_query_segment_df = annotate_cytoband(cna_query_segment_bed, output_dir, refdata_assembly_dir, logger)

## annotate with protein-coding transcripts
cna_query_segment_bed = pybedtools.BedTool.from_dataframe(cna_query_segment_df)
temp_files.append(cna_query_segment_bed.fn)

cna_query_segment_df = annotate_transcripts(
cna_query_segment_bed, output_dir, db_assembly_dir, overlap_fraction=overlap_fraction, logger=logger)
cna_query_segment_bed, output_dir, refdata_assembly_dir, overlap_fraction=overlap_fraction, logger=logger)

cna_query_segment_df['segment_length_mb'] = \
((cna_query_segment_df['segment_end'] - cna_query_segment_df['segment_start']) / 1e6).astype(float).round(5)
Expand All @@ -128,8 +128,8 @@ def annotate_cna_segments(output_fname: str,
cna_actionable_dict = {}

for db in ['cgi','civic']:
variant_fname = os.path.join(db_assembly_dir, 'biomarker','tsv', f"{db}.variant.tsv.gz")
clinical_fname = os.path.join(db_assembly_dir, 'biomarker','tsv', f"{db}.clinical.tsv.gz")
variant_fname = os.path.join(refdata_assembly_dir, 'biomarker','tsv', f"{db}.variant.tsv.gz")
clinical_fname = os.path.join(refdata_assembly_dir, 'biomarker','tsv', f"{db}.clinical.tsv.gz")
logger.info(f"Loading copy-number biomarker evidence from {db} ..")
biomarkers[db] = load_biomarkers(
logger, variant_fname, clinical_fname, biomarker_vartype = 'CNA')
Expand Down Expand Up @@ -197,14 +197,14 @@ def annotate_cna_segments(output_fname: str,
return 0


def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, db_assembly_dir: str, logger: logging.Logger) -> pd.DataFrame:
def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, refdata_assembly_dir: str, logger: logging.Logger) -> pd.DataFrame:

pybedtools.set_tempdir(output_dir)
temp_files = []

# BED file with cytoband annotations
cytoband_bed_fname = \
os.path.join(db_assembly_dir, 'misc','bed','cytoband', 'cytoband.bed.gz')
os.path.join(refdata_assembly_dir, 'misc','bed','cytoband', 'cytoband.bed.gz')

cytoband_annotated_segments = pd.DataFrame()

Expand Down Expand Up @@ -272,7 +272,7 @@ def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, db_assembly_dir

def annotate_transcripts(cna_segments_bt: BedTool,
output_dir: str,
db_assembly_dir: str,
refdata_assembly_dir: str,
overlap_fraction: float,
logger: logging.Logger) -> pd.DataFrame:

Expand All @@ -282,7 +282,7 @@ def annotate_transcripts(cna_segments_bt: BedTool,
Parameters:
- cna_segments_bt: A BedTool object representing the CNA segments.
- output_dir: A string specifying the output directory (for temporary files generation).
- db_assembly_dir: A string specifying the directory of the build-specific PCGR data bundle.
- refdata_assembly_dir: A string specifying the directory of the build-specific PCGR data bundle.
- overlap_fraction: A float representing the fraction of overlap required for annotation.
Returns:
Expand All @@ -294,9 +294,9 @@ def annotate_transcripts(cna_segments_bt: BedTool,

# BED file with protein-coding transcripts
gene_transcript_bed_fname = \
os.path.join(db_assembly_dir, 'gene','bed','gene_transcript_xref', 'gene_transcript_xref_pc_nopad.bed.gz')
os.path.join(refdata_assembly_dir, 'gene','bed','gene_transcript_xref', 'gene_transcript_xref_pc_nopad.bed.gz')
gene_xref_tsv_fname = \
os.path.join(db_assembly_dir, "gene", "tsv", "gene_transcript_xref", "gene_transcript_xref.tsv.gz")
os.path.join(refdata_assembly_dir, "gene", "tsv", "gene_transcript_xref", "gene_transcript_xref.tsv.gz")

cna_segments_annotated = pd.DataFrame()

Expand Down
58 changes: 18 additions & 40 deletions pcgr/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ def create_config(arg_dict, workflow = "PCGR"):
'sample_id': arg_dict['sample_id'],
'genome_assembly': arg_dict['genome_assembly'],
'debug': arg_dict['debug'],
'pcgrr_conda': arg_dict['pcgrr_conda']
'pcgrr_conda': arg_dict['pcgrr_conda'],
'output_prefix': "None"
}

conf_options['vep'] = {
Expand Down Expand Up @@ -72,9 +73,9 @@ def create_config(arg_dict, workflow = "PCGR"):
if not arg_dict['tumor_ploidy'] is None:
conf_options['sample_properties']['tumor_ploidy'] = float(arg_dict['tumor_ploidy'])

conf_options['clinicaltrials'] = {
'run': int(arg_dict['include_trials'])
}
#conf_options['clinicaltrials'] = {
# 'run': int(arg_dict['include_trials'])
#}
conf_options['other']['vcf2maf'] = int(arg_dict['vcf2maf'])
conf_options['somatic_cna'] = {
'cna_overlap_pct': float(arg_dict['cna_overlap_pct']),
Expand Down Expand Up @@ -172,54 +173,30 @@ def create_config(arg_dict, workflow = "PCGR"):
return conf_options


def populate_config_data(conf_options: dict, db_dir: str, workflow = "PCGR", logger=None):
def populate_config_data(conf_options: dict, refdata_assembly_dir: str, workflow = "PCGR", logger=None):

conf_data = {}
conf_data['sample_id'] = conf_options['sample_id']
conf_data['output_dir'] = conf_options['output_dir']
conf_data['output_prefix'] = conf_options['output_prefix']
conf_data['workflow'] = workflow
conf_data['genome_assembly'] = conf_options['genome_assembly']
conf_data['software'] = {}
conf_data['software']['pcgr_version'] = pcgr_vars.PCGR_VERSION
conf_data['software']['cpsr_version'] = pcgr_vars.PCGR_VERSION
conf_data['molecular_data'] = conf_options['molecular_data']

## add paths to annotated files (VCF/TSV, CNA, TMB, EXPRESSION)
# conf_data['molecular_data'] = {}
# conf_data['molecular_data']['fname_mut_vcf'] = conf_options['annotated_vcf']
# conf_data['molecular_data']['fname_mut_tsv'] = conf_options['annotated_tsv']
# conf_data['molecular_data']['fname_cna_tsv'] = "None"
# conf_data['molecular_data']['fname_expression_tsv'] = "None"
# if workflow == "PCGR" and conf_options['annotated_cna'] != "None":
# conf_data['molecular_data']['fname_cna_tsv'] = conf_options['annotated_cna']
# del conf_options['annotated_cna']
# if workflow == "PCGR" and conf_options['annotated_exp'] != "None":
# conf_data['molecular_data']['fname_expression_tsv'] = conf_options['annotated_exp']
# del conf_options['annotated_exp']
# if workflow == "CPSR":
# del conf_data['molecular_data']['fname_cna_tsv']
# del conf_data['molecular_data']['fname_expression_tsv']

# conf_data['molecular_data']['fname_tmb'] = "None"
# if workflow == "PCGR" and conf_options['fname_tmb'] != "None":
# conf_data['molecular_data']['fname_tmb'] = conf_options['fname_tmb']
# del conf_options['fname_tmb']
# if workflow == "CPSR":
# del conf_data['molecular_data']['fname_tmb']

genome_assembly = conf_options['genome_assembly']
del conf_options['sample_id']
del conf_options['genome_assembly']
del conf_options['output_dir']
del conf_options['molecular_data']
#del conf_options['annotated_vcf']
#del conf_options['annotated_tsv']

conf_data['conf'] = conf_options

conf_data['reference_data'] = {}
conf_data['reference_data']['version'] = pcgr_vars.DB_VERSION
conf_data['reference_data']['path'] = \
os.path.join(db_dir, "data", genome_assembly)
refdata_assembly_dir

metadata_pd = pd.DataFrame()
conf_data['reference_data']['source_metadata'] = {}
Expand All @@ -228,23 +205,25 @@ def populate_config_data(conf_options: dict, db_dir: str, workflow = "PCGR", log
## add metadata information for each data source

cpsr_sources_regex = r'^(gepa|cpg_other|maxwell2016|acmg_sf|dbmts|woods_dnarepair|gerp|tcga_pancan_2018|gwas_catalog)'
pcgr_sources_regex = r'^(cytoband|mitelmandb|tcga|nci|intogen|opentargets|dgidb|pubchem|cosmic_mutsigs)'
pcgr_sources_regex = r'^(cytoband|mitelmandb|tcga|nci|intogen|opentargets|depmap|treehouse|dgidb|pubchem|cosmic_mutsigs)'
sources_skip_regex = r'^(illumina|foundation_one)'

for dtype in ['gene','phenotype','biomarker','drug','gwas','hotspot','other']:
metadata_fname = os.path.join(
db_dir, "data", conf_data['genome_assembly'],
refdata_assembly_dir,
".METADATA", "tsv", dtype + "_metadata.tsv")
if check_file_exists(metadata_fname, logger):
metadata_df = pd.read_csv(metadata_fname, sep="\t", na_values=".")
metadata_df["source_type"] = dtype
metadata_df['wflow'] = 'pcgr_cpsr'
metadata_df.loc[metadata_df['source_abbreviation'].str.match(cpsr_sources_regex), 'wflow'] = 'cpsr'
metadata_df.loc[metadata_df['source_abbreviation'].str.match(pcgr_sources_regex), 'wflow'] = 'pcgr'
metadata_df.loc[metadata_df['source_abbreviation'].str.match(sources_skip_regex), 'wflow'] = 'skip'
metadata_pd = metadata_pd._append(metadata_df, ignore_index=True)

conf_data['reference_data']['source_metadata'] = metadata_pd.to_dict(orient='records')

oncotree_fname = os.path.join(db_dir, "data", genome_assembly,
oncotree_fname = os.path.join(refdata_assembly_dir,
"phenotype","tsv","phenotype_onco.tsv.gz")

if check_file_exists(oncotree_fname, logger):
Expand Down Expand Up @@ -275,8 +254,7 @@ def populate_config_data(conf_options: dict, db_dir: str, workflow = "PCGR", log

conf_data['conf']['gene_panel']['panel_genes'] = set_virtual_target_genes(
conf_data['conf']['gene_panel']['panel_id'],
db_dir,
conf_data['genome_assembly'],
refdata_assembly_dir,
conf_data['conf']['gene_panel']['diagnostic_grade_only'],
conf_data['conf']['gene_panel']['custom_list_tsv'],
bool(conf_data['conf']['variant_classification']['secondary_findings']),
Expand All @@ -285,16 +263,16 @@ def populate_config_data(conf_options: dict, db_dir: str, workflow = "PCGR", log

return(conf_data)

def set_virtual_target_genes(panel_id: str, db_dir: str, genome_assembly: str, diagnostic_grade_only: bool,
def set_virtual_target_genes(panel_id: str, refdata_assembly_dir: str, diagnostic_grade_only: bool,
custom_list_tsv: str, secondary_findings: bool, logger=None):

all_panels_fname = os.path.join(
db_dir, "data", genome_assembly,
refdata_assembly_dir,
"gene","tsv","gene_virtual_panel",
"gene_virtual_panel.tsv.gz")

all_cpg_fname = os.path.join(
db_dir, "data", genome_assembly,
refdata_assembly_dir,
"gene","tsv","gene_cpg",
"gene_cpg.tsv.gz")

Expand Down
Loading

0 comments on commit 52279c9

Please sign in to comment.