From 52279c99384d17fb5cbcf90b2602aab0198aecc7 Mon Sep 17 00:00:00 2001 From: Sigve Nakken Date: Sat, 6 Apr 2024 11:10:19 +0200 Subject: [PATCH] code clean-up, arg checks --- pcgr/arg_checker.py | 448 +++++++----------- pcgr/cna.py | 26 +- pcgr/config.py | 58 +-- pcgr/cpsr.py | 78 ++- pcgr/expression.py | 55 ++- pcgr/main.py | 52 +- pcgr/variant.py | 46 +- pcgr/vcf.py | 20 +- pcgr/vep.py | 7 +- pcgrr/R/input_data.R | 21 + pcgrr/R/main.R | 28 +- pcgrr/R/reference_data.R | 3 +- pcgrr/R/report.R | 4 +- pcgrr/data/data_coltype_defs.rda | Bin 1912 -> 1922 bytes pcgrr/data/tcga_cohorts.rda | Bin 762 -> 769 bytes pcgrr/data/tsv_cols.rda | Bin 694 -> 687 bytes .../mutational_signature.qmd | 3 - .../pcgr_quarto_report/snv_indel_main.qmd | 7 +- .../pcgr_quarto_report/value_boxes.qmd | 52 -- .../value_boxes_tumor_only.qmd | 52 -- scripts/cpsr_validate_input.py | 42 +- scripts/pcgr_summarise.py | 44 +- scripts/pcgr_validate_input.py | 25 +- scripts/pcgrr.R | 2 +- 24 files changed, 417 insertions(+), 656 deletions(-) delete mode 100644 pcgrr/inst/templates/pcgr_quarto_report/value_boxes.qmd delete mode 100644 pcgrr/inst/templates/pcgr_quarto_report/value_boxes_tumor_only.qmd diff --git a/pcgr/arg_checker.py b/pcgr/arg_checker.py index cb2f8f70..ff87799f 100644 --- a/pcgr/arg_checker.py +++ b/pcgr/arg_checker.py @@ -5,32 +5,14 @@ import os -def check_args(arg_dict): +def verify_args(arg_dict): logger = getlogger("pcgr-validate-arguments-input-a") - # Check the existence of required arguments - if arg_dict['refdata_dir'] is None or not os.path.isdir(arg_dict['refdata_dir']): - err_msg = f"Required argument '--refdata_dir' does not exist ({arg_dict['refdata_dir']})." - error_message(err_msg, logger) - - if arg_dict['genome_assembly'] is None: - err_msg = f"Required argument '--genome_assembly' has no/undefined value ({arg_dict['genome_assembly']})." - error_message(err_msg, logger) - - if arg_dict['input_vcf'] is None: - err_msg = f"Required argument '--input_vcf' does not exist ({arg_dict['input_vcf']})." - error_message(err_msg, logger) - - if arg_dict['sample_id'] is None: - err_msg = f"Required argument '--sample_id' has no/undefined value ({arg_dict['sample_id']})." - error_message(err_msg, logger) - - if len(arg_dict['sample_id']) <= 2 or len(arg_dict['sample_id']) > 35: - err_msg = f"Sample name identifier ('--sample_id' = {arg_dict['sample_id']}) must be between 2 and 35 characters long" - error_message(err_msg, logger) + + # Check the existence of required arguments + verify_required_args(arg_dict, logger) # Optional arguments - if arg_dict['expression_sim'] == True and arg_dict['input_rna_exp'] is None: warn_msg = f"RNA expression similarity analysis can only be performed if --input_rna_exp is set (--input_rna_exp = {arg_dict['input_rna_exp']})." 
warn_message(warn_msg, logger) @@ -198,51 +180,25 @@ def check_args(arg_dict): err_msg = f"Minimum percent overlap between copy number segment and gene transcript ('--cna_overlap_pct' = {arg_dict['cna_overlap_pct']}) must be within (0, 100]" error_message(err_msg, logger) - # VEP options - if int(arg_dict['vep_n_forks']) <= int(pcgr_vars.VEP_MIN_FORKS) or int(arg_dict['vep_n_forks']) > int(pcgr_vars.VEP_MAX_FORKS): - err_msg = ( - f"Number of forks that VEP can use during annotation must be above {str(pcgr_vars.VEP_MIN_FORKS)} and not " - f"more than {str(pcgr_vars.VEP_MAX_FORKS)} (recommended is 4), current value is {arg_dict['vep_n_forks']}" - ) - error_message(err_msg, logger) + verify_vep_options(arg_dict, logger) - if int(arg_dict['vep_buffer_size']) < int(pcgr_vars.VEP_MIN_BUFFER_SIZE) or int(arg_dict['vep_buffer_size']) > int(pcgr_vars.VEP_MAX_BUFFER_SIZE): - err_msg = ( - f"Internal VEP buffer size, corresponding to the number of variants that are read in to memory simultaneously " - f"('--vep_buffer_size' = {arg_dict['vep_buffer_size']}), " - f"must be within [{str(pcgr_vars.VEP_MIN_BUFFER_SIZE)}, {str(pcgr_vars.VEP_MAX_BUFFER_SIZE)}]") - error_message(err_msg, logger) - - # Check that VEP pick criteria is formatted correctly - if not arg_dict['vep_pick_order'] is None: - values = str(arg_dict['vep_pick_order']).split(',') - permitted_sources = pcgr_vars.VEP_PICK_CRITERIA - num_permitted_sources = 0 - for v in values: - if v in permitted_sources: - num_permitted_sources += 1 - - if num_permitted_sources != len(pcgr_vars.VEP_PICK_CRITERIA): - err_msg = ( - f"Option '--vep_pick_order' = {str(arg_dict['vep_pick_order'])} is formatted incorrectly, should be " - f"a comma-separated string of the following values: '{' '.join(pcgr_vars.VEP_PICK_CRITERIA)}'" - ) - error_message(err_msg, logger) - - return + return(arg_dict) -def define_output_files(arg_dict): +def define_output_files(arg_dict, cpsr = False): """ Define output files """ - logger = getlogger("pcgr-output-files") + logger = getlogger("define-output-files") # create output folder (if not already exists) output_dir = utils.safe_makedir(os.path.abspath(arg_dict['output_dir'])) # Define general output prefix output_prefix = os.path.join( str(output_dir), f"{arg_dict['sample_id']}.pcgr.{arg_dict['genome_assembly']}") + if cpsr: + output_prefix = os.path.join( + str(output_dir), f"{arg_dict['sample_id']}.cpsr.{arg_dict['genome_assembly']}") # Append output files to arg_dict output_vcf = \ @@ -264,33 +220,35 @@ def define_output_files(arg_dict): err_msg = "Output files (e.g. " + str(output_vcf_tsv) + \ ") already exist - please specify different sample_id or add option --force_overwrite" error_message(err_msg, logger) - - # if annotated output cna segments exist and overwrite not set - if os.path.exists(output_cna) and arg_dict["force_overwrite"] is False: - err_msg = "Output files (e.g. " + str(output_cna) + \ - ") already exist - please specify different sample_id or add option --force_overwrite" - error_message(err_msg, logger) - - # if output annotated gene expression exist and overwrite not set - if os.path.exists(output_expression) and arg_dict["force_overwrite"] is False: - err_msg = "Output files (e.g. 
" + str(output_expression) + \ - ") already exist - please specify different sample_id or add option --force_overwrite" - error_message(err_msg, logger) + + if not cpsr: + # if annotated output cna segments exist and overwrite not set + if os.path.exists(output_cna) and arg_dict["force_overwrite"] is False: + err_msg = "Output files (e.g. " + str(output_cna) + \ + ") already exist - please specify different sample_id or add option --force_overwrite" + error_message(err_msg, logger) + + # if output annotated gene expression exist and overwrite not set + if os.path.exists(output_expression) and arg_dict["force_overwrite"] is False: + err_msg = "Output files (e.g. " + str(output_expression) + \ + ") already exist - please specify different sample_id or add option --force_overwrite" + error_message(err_msg, logger) output_data = {} output_data['prefix'] = output_prefix output_data['dir'] = output_dir output_data['vcf'] = output_vcf output_data['vcf2tsv'] = output_vcf_tsv - output_data['cna'] = output_cna - output_data['expression'] = output_expression + + if not cpsr: + output_data['cna'] = output_cna + output_data['expression'] = output_expression return output_data def verify_input_files(arg_dict): """ - 1. Checks existence of input files/dirs (arg_dict) - 2. Checks that the data bundle is of correct date + 1. Checks existence of all input files/dirs (arg_dict) """ logger = getlogger("pcgr-check-files") @@ -302,9 +260,6 @@ def verify_input_files(arg_dict): pon_vcf_dir = 'NA' db_dir = 'NA' base_dir = 'NA' - #output_vcf = 'NA' - #output_cna = 'NA' - #output_exp = 'NA' pon_vcf_basename = 'NA' input_vcf_basename = 'NA' input_cna_basename = 'NA' @@ -314,15 +269,7 @@ def verify_input_files(arg_dict): arg_dict['rna_fusion_tumor'] = None # create output folder (if not already exists) - output_dir_full = utils.safe_makedir(os.path.abspath(arg_dict['output_dir'])) - - # Append output files to arg_dict - #output_vcf = \ - # os.path.join(str(output_dir_full), f"{arg_dict['sample_id']}.pcgr.{arg_dict['genome_assembly']}.vcf.gz") - #output_cna = \ - # os.path.join(str(output_dir_full), f"{arg_dict['sample_id']}.pcgr.{arg_dict['genome_assembly']}.cna_segments.tsv.gz") - #output_exp = \ - # os.path.join(str(output_dir_full), f"{arg_dict['sample_id']}.pcgr.{arg_dict['genome_assembly']}.expression.tsv.gz") + #output_dir_full = utils.safe_makedir(os.path.abspath(arg_dict['output_dir'])) # check that either input vcf or cna segments exist if arg_dict['input_vcf'] is None and arg_dict['input_cna'] is None: @@ -367,11 +314,6 @@ def verify_input_files(arg_dict): input_vcf_basename = os.path.basename(str(arg_dict["input_vcf"])) input_vcf_dir = os.path.dirname(os.path.abspath(arg_dict["input_vcf"])) - # if output vcf exist and overwrite not set - #if os.path.exists(output_vcf) and arg_dict["force_overwrite"] is False: - # err_msg = f"Output files (e.g. {output_vcf}) already exist - please specify different sample_id or add option --force_overwrite" - # error_message(err_msg, logger) - # check if input cna segments exist if not arg_dict["input_cna"] is None: check_file_exists(os.path.abspath(arg_dict["input_cna"]), strict = True, logger = logger) @@ -384,25 +326,19 @@ def verify_input_files(arg_dict): input_cna_basename = os.path.basename(str(arg_dict["input_cna"])) input_cna_dir = os.path.dirname(os.path.abspath(arg_dict["input_cna"])) - # if output cna segments exist and overwrite not set - #if os.path.exists(output_cna) and arg_dict["force_overwrite"] is False: - # err_msg = "Output files (e.g. 
" + str(output_cna) + \ - # ") already exist - please specify different sample_id or add option --force_overwrite" - # error_message(err_msg, logger) - # check if input rna fusion variants exist - if not arg_dict["input_rna_fusion"] is None: - check_file_exists(os.path.abspath(arg_dict["input_rna_fusion"]), strict = True, logger = logger) + # if not arg_dict["input_rna_fusion"] is None: + # check_file_exists(os.path.abspath(arg_dict["input_rna_fusion"]), strict = True, logger = logger) - if not (os.path.abspath(arg_dict["input_rna_fusion"]).endswith(".tsv") or - os.path.abspath(arg_dict["input_rna_fusion"]).endswith(".txt")): - err_msg = "RNA fusion variants file (" + os.path.abspath( - arg_dict["input_rna_fusion"]) + ") does not have the correct file extension (.tsv or .txt)" - error_message(err_msg, logger) - input_rna_fusion_basename = os.path.basename( - str(arg_dict["input_rna_fusion"])) - input_rna_fusion_dir = os.path.dirname( - os.path.abspath(arg_dict["input_rna_fusion"])) + # if not (os.path.abspath(arg_dict["input_rna_fusion"]).endswith(".tsv") or + # os.path.abspath(arg_dict["input_rna_fusion"]).endswith(".txt")): + # err_msg = "RNA fusion variants file (" + os.path.abspath( + # arg_dict["input_rna_fusion"]) + ") does not have the correct file extension (.tsv or .txt)" + # error_message(err_msg, logger) + # input_rna_fusion_basename = os.path.basename( + # str(arg_dict["input_rna_fusion"])) + # input_rna_fusion_dir = os.path.dirname( + # os.path.abspath(arg_dict["input_rna_fusion"])) # check if input rna expression exist if not arg_dict["input_rna_exp"] is None: @@ -418,12 +354,6 @@ def verify_input_files(arg_dict): input_rna_expression_dir = os.path.dirname( os.path.abspath(arg_dict["input_rna_exp"])) - # if output annotated gene expression exist and overwrite not set - #if os.path.exists(output_exp) and arg_dict["force_overwrite"] is False: - # err_msg = "Output files (e.g. 
" + str(output_exp) + \ - # ") already exist - please specify different sample_id or add option --force_overwrite" - # error_message(err_msg, logger) - # check if input rna fusion variants exist if not arg_dict["cpsr_report"] is None: if not os.path.exists(os.path.abspath(arg_dict["cpsr_report"])): @@ -438,14 +368,84 @@ def verify_input_files(arg_dict): str(arg_dict["cpsr_report"])) input_cpsr_report_dir = os.path.dirname( os.path.abspath(arg_dict["cpsr_report"])) + + vep_dir = verify_vep_cache(arg_dict, logger) + refdata_assembly_dir = verify_refdata(arg_dict, logger, cpsr = True) + - # check the existence of base folder + input_data = { + "vcf_dir": input_vcf_dir, + "cna_dir": input_cna_dir, + "rna_fusion_dir": input_rna_fusion_dir, + "rna_expression_dir": input_rna_expression_dir, + "cpsr_report_dir": input_cpsr_report_dir, + "cpsr_report_basename": input_cpsr_report_basename, + "pon_vcf_dir": pon_vcf_dir, + "refdata_assembly_dir": refdata_assembly_dir, + "vep_dir": vep_dir, + "pon_vcf_basename": pon_vcf_basename, + "vcf_basename": input_vcf_basename, + "cna_basename": input_cna_basename, + "rna_fusion_basename": input_rna_fusion_basename, + "rna_expression_basename": input_rna_expression_basename + + } + + return input_data + + +def verify_refdata(arg_dict: dict, logger = None, cpsr = False): + + # check the existence of base refdata folder base_dir = os.path.abspath(arg_dict["refdata_dir"]) if not os.path.isdir(base_dir): err_msg = "Base directory (" + str(base_dir) + ") does not exist" error_message(err_msg, logger) - # check the existence of base folder + # check the existence of 'data' folder within the base folder + refdata_dir = os.path.join(os.path.abspath(arg_dict["refdata_dir"]), "data") + if not os.path.isdir(refdata_dir): + err_msg = "Data directory (" + str(refdata_dir) + ") does not exist" + error_message(err_msg, logger) + + # check the existence of specified assembly data folder within the base/data folder + refdata_assembly_dir = os.path.join(os.path.abspath( + arg_dict["refdata_dir"]), "data", arg_dict["genome_assembly"]) + if not os.path.isdir(refdata_assembly_dir): + err_msg = "Data directory for the specified genome assembly (" + str( + refdata_assembly_dir) + ") does not exist" + error_message(err_msg, logger) + + # check the existence of .PCGR_BUNDLE_VERSION (starting from 1.5.0) + rel_notes_file = os.path.join(refdata_assembly_dir, ".PCGR_BUNDLE_VERSION") + if not os.path.exists(rel_notes_file): + err_msg = "Could not find file ('" + str(rel_notes_file) + "') in the PCGR reference data directory - exiting." 
+ error_message(err_msg, logger) + + f_rel_notes = open(rel_notes_file,'r') + compliant_data_bundle = 0 + for line in f_rel_notes: + if pcgr_vars.DB_VERSION in line: + compliant_data_bundle = 1 + + if compliant_data_bundle == 0: + err_msg = ( + f'The PCGR data bundle is not compliant with the software version - please download the ' + f'latest software and data bundle (see https://github.com/sigven/cpsr for instructions)') + if cpsr is False: + err_msg = ( + f'The PCGR data bundle is outdated - please download the latest data bundle ', + f'(see https://sigven.github.io/pcgr for instructions)' + ) + error_message(err_msg,logger) + + f_rel_notes.close() + + return(refdata_assembly_dir) + +def verify_vep_cache(arg_dict: dict, logger = None): + + # check the existence of base VEP dir vep_dir = os.path.abspath(arg_dict["vep_dir"]) if not os.path.isdir(vep_dir): err_msg = "VEP directory ('" + str(vep_dir) + "') does not exist" @@ -456,99 +456,86 @@ def verify_input_files(arg_dict): err_msg = "VEP directory ('" + str(vep_human_dir) + "') does not exist" error_message(err_msg, logger) + ## check that the specified VEP version exists vep_human_version_dir = os.path.join( os.path.abspath(arg_dict["vep_dir"]), "homo_sapiens", f"{pcgr_vars.VEP_VERSION}_{pcgr_vars.VEP_ASSEMBLY[arg_dict['genome_assembly']]}") if not os.path.isdir(vep_human_version_dir): err_msg = "VEP directory ('" + str(vep_human_version_dir) + "') does not exist" error_message(err_msg, logger) + + return(vep_dir) - # check the existence of data folder within the base folder - db_dir = os.path.join(os.path.abspath(arg_dict["refdata_dir"]), "data") - if not os.path.isdir(db_dir): - err_msg = "Data directory (" + str(db_dir) + ") does not exist" - error_message(err_msg, logger) - - # check the existence of specified assembly data folder within the base folder - db_assembly_dir = os.path.join(os.path.abspath( - arg_dict["refdata_dir"]), "data", arg_dict["genome_assembly"]) - if not os.path.isdir(db_assembly_dir): - err_msg = "Data directory for the specified genome assembly (" + str( - db_assembly_dir) + ") does not exist" +def verify_vep_options(arg_dict: dict, logger = None): + + # VEP options + if int(arg_dict['vep_n_forks']) <= int(pcgr_vars.VEP_MIN_FORKS) or \ + int(arg_dict['vep_n_forks']) > int(pcgr_vars.VEP_MAX_FORKS): + err_msg = ( + f"Number of forks that VEP can use during annotation must be above {str(pcgr_vars.VEP_MIN_FORKS)} and not " + f"more than {str(pcgr_vars.VEP_MAX_FORKS)} (recommended is 4), current value is {arg_dict['vep_n_forks']}" + ) error_message(err_msg, logger) - # check the existence of .PCGR_BUNDLE_VERSION (starting from 1.5.0) - rel_notes_file = os.path.join(os.path.abspath( - arg_dict["refdata_dir"]), "data", arg_dict["genome_assembly"], ".PCGR_BUNDLE_VERSION") - if not os.path.exists(rel_notes_file): - err_msg = "The PCGR data bundle is outdated - please download the latest data bundle (see github.com/sigven/pcgr for instructions)" + if int(arg_dict['vep_buffer_size']) < int(pcgr_vars.VEP_MIN_BUFFER_SIZE) or \ + int(arg_dict['vep_buffer_size']) > int(pcgr_vars.VEP_MAX_BUFFER_SIZE): + err_msg = ( + f"Internal VEP buffer size, corresponding to the number of variants that are read in to memory simultaneously " + f"('--vep_buffer_size' = {arg_dict['vep_buffer_size']}), " + f"must be within [{str(pcgr_vars.VEP_MIN_BUFFER_SIZE)}, {str(pcgr_vars.VEP_MAX_BUFFER_SIZE)}]") error_message(err_msg, logger) - f_rel_not = open(rel_notes_file, "r") - compliant_data_bundle = 0 - for line in f_rel_not: - if 
pcgr_vars.DB_VERSION in line: - compliant_data_bundle = 1 + # Check that VEP pick criteria is formatted correctly + if not arg_dict['vep_pick_order'] is None: + values = str(arg_dict['vep_pick_order']).split(',') + permitted_sources = pcgr_vars.VEP_PICK_CRITERIA + num_permitted_sources = 0 + for v in values: + if v in permitted_sources: + num_permitted_sources += 1 - f_rel_not.close() + if num_permitted_sources != len(pcgr_vars.VEP_PICK_CRITERIA): + err_msg = ( + f"Option '--vep_pick_order' = {str(arg_dict['vep_pick_order'])} is formatted incorrectly, should be " + f"a comma-separated string of the following values: '{' '.join(pcgr_vars.VEP_PICK_CRITERIA)}'" + ) + error_message(err_msg, logger) + + +def verify_required_args(arg_dict: dict, logger = None): + + if arg_dict['refdata_dir'] is None or not os.path.isdir(arg_dict['refdata_dir']): + err_msg = f"Required argument '--refdata_dir' does not exist ({arg_dict['refdata_dir']})." + error_message(err_msg, logger) - if compliant_data_bundle == 0: - err_msg = "The PCGR data bundle is not compliant with the software version - please download the latest software and data bundle (see https://github.com/sigven/pcgr for instructions)" + if arg_dict['genome_assembly'] is None: + err_msg = f"Required argument '--genome_assembly' has no/undefined value ({arg_dict['genome_assembly']})." error_message(err_msg, logger) - input_data = { - "vcf_dir": input_vcf_dir, - "cna_dir": input_cna_dir, - "rna_fusion_dir": input_rna_fusion_dir, - "rna_expression_dir": input_rna_expression_dir, - "cpsr_report_dir": input_cpsr_report_dir, - "cpsr_report_basename": input_cpsr_report_basename, - "pon_vcf_dir": pon_vcf_dir, - "db_assembly_dir": db_assembly_dir, - "refdata_dir": base_dir, - "vep_dir": vep_dir, - #"output_dir": output_dir_full, - #"output_vcf": output_vcf, - #"output_cna": output_cna, - #"output_exp": output_exp, - "pon_vcf_basename": pon_vcf_basename, - "vcf_basename": input_vcf_basename, - "cna_basename": input_cna_basename, - "rna_fusion_basename": input_rna_fusion_basename, - "rna_expression_basename": input_rna_expression_basename - - } + if arg_dict['input_vcf'] is None or not os.path.exists(arg_dict['input_vcf']): + err_msg = f"Required argument '--input_vcf' does not exist ({arg_dict['input_vcf']})." + error_message(err_msg, logger) - return input_data + if arg_dict['sample_id'] is None: + err_msg = f"Required argument '--sample_id' has no/undefined value ({arg_dict['sample_id']})." + error_message(err_msg, logger) + if len(arg_dict['sample_id']) <= 2 or len(arg_dict['sample_id']) > 35: + err_msg = f"Sample name identifier ('--sample_id' = {arg_dict['sample_id']}) must be between 2 and 35 characters long" + error_message(err_msg, logger) + + return(0) + ##---- CPSR ----## -def check_args_cpsr(arg_dict): +def verify_args_cpsr(arg_dict): logger = getlogger('cpsr-validate-input-arguments-a') arg_dict['vep_regulatory'] = True ## Required arguments - ## Check that query VCF is set and exists - if arg_dict['input_vcf'] is None or not os.path.exists(arg_dict['input_vcf']): - err_msg = f"Required argument '--input_vcf' does not exist ({arg_dict['input_vcf']})." - error_message(err_msg,logger) - ## Check that PCGR directory (with data bundle) is provided and exists - if arg_dict['refdata_dir'] is None or not os.path.isdir(arg_dict['refdata_dir']): - err_msg = f"Required argument '--refdata_dir' does not exist ({arg_dict['refdata_dir']})." 
- error_message(err_msg,logger) - ## Check that genome assembly is set - if arg_dict['genome_assembly'] is None: - err_msg = f"Required argument '--genome_assembly' has no/undefined value ({arg_dict['genome_assembly']})." - error_message(err_msg,logger) - ## Check that sample identifier is set and is of appropriate length (minimum two characters) - if arg_dict['sample_id'] is None: - err_msg = f"Required argument '--sample_id' has no/undefined value ({arg_dict['sample_id']})." - error_message(err_msg,logger) - - if len(arg_dict['sample_id']) <= 2: - err_msg = f"Sample name identifier ('--sample_id') requires a name with more than two characters ({arg_dict['sample_id']})." - error_message(err_msg,logger) - + + verify_required_args(arg_dict, logger) + ### Optional arguments ## Provide virtual_panel_id or a custom list from panel 0 if arg_dict['virtual_panel_id'] == "-1" and not arg_dict['custom_list']: @@ -604,36 +591,8 @@ def check_args_cpsr(arg_dict): warn_message(warn_msg, logger) arg_dict['diagnostic_grade_only'] = False - ## VEP options - if arg_dict['vep_n_forks'] <= pcgr_vars.VEP_MIN_FORKS or arg_dict['vep_n_forks'] > pcgr_vars.VEP_MAX_FORKS: - err_msg = ( - f"Number of forks that VEP can use during annotation must be above 0 and not more " - f"than {pcgr_vars.VEP_MAX_FORKS} (recommended is 4), current value is {arg_dict['vep_n_forks']}") - error_message(err_msg,logger) - - if arg_dict['vep_buffer_size'] < pcgr_vars.VEP_MIN_BUFFER_SIZE or arg_dict['vep_buffer_size'] > pcgr_vars.VEP_MAX_BUFFER_SIZE: - err_msg = ( - f"Internal VEP buffer size, corresponding to the number of variants that are read in to memory simultaneously, " - f"must be above {pcgr_vars.VEP_MIN_BUFFER_SIZE} and not more than {pcgr_vars.VEP_MAX_BUFFER_SIZE}, " - f"current value is {arg_dict['vep_buffer_size']}" - ) - error_message(err_msg,logger) - - ## Check that VEP pick criteria is formatted correctly - if not arg_dict['vep_pick_order'] is None: - values = str(arg_dict['vep_pick_order']).split(',') - permitted_sources = pcgr_vars.VEP_PICK_CRITERIA - num_permitted_sources = 0 - for v in values: - if v in permitted_sources: - num_permitted_sources += 1 + verify_vep_options(arg_dict, logger) - if num_permitted_sources != len(pcgr_vars.VEP_PICK_CRITERIA): - err_msg = ( - f"Option '--vep_pick_order' = {str(arg_dict['vep_pick_order'])} is formatted incorrectly, should be " - f"a comma-separated string of the following values: '{' '.join(pcgr_vars.VEP_PICK_CRITERIA)}'" - ) - error_message(err_msg, logger) return(arg_dict) @@ -684,71 +643,15 @@ def verify_input_files_cpsr(arg_dict): err_msg = f"Output files (e.g. 
{output_vcf}) already exist - please specify different " + \ "sample_id or add option --force_overwrite" error_message(err_msg,logger) + - # check the existence of base folder - base_dir = os.path.abspath(arg_dict["refdata_dir"]) - if not os.path.isdir(base_dir): - err_msg = "Base directory (" + str(base_dir) + ") does not exist" - error_message(err_msg, logger) - - # check the existence of base folder - vep_dir = os.path.abspath(arg_dict["vep_dir"]) - if not os.path.isdir(vep_dir): - err_msg = "VEP directory ('" + str(vep_dir) + "') does not exist" - error_message(err_msg, logger) - - vep_human_dir = os.path.join(os.path.abspath(arg_dict["vep_dir"]), "homo_sapiens") - if not os.path.isdir(vep_human_dir): - err_msg = "VEP directory ('" + str(vep_human_dir) + "') does not exist" - error_message(err_msg, logger) - - vep_human_version_dir = os.path.join( - os.path.abspath(arg_dict["vep_dir"]), "homo_sapiens", - f"{pcgr_vars.VEP_VERSION}_{pcgr_vars.VEP_ASSEMBLY[arg_dict['genome_assembly']]}") - if not os.path.isdir(vep_human_version_dir): - err_msg = "VEP directory ('" + str(vep_human_version_dir) + "') does not exist" - error_message(err_msg, logger) - - # check the existence of data folder within the base folder - db_dir = os.path.join(os.path.abspath(arg_dict["refdata_dir"]), "data") - if not os.path.isdir(db_dir): - err_msg = "Data directory (" + str(db_dir) + ") does not exist" - error_message(err_msg, logger) - - # check the existence of specified assembly data folder within the base folder - db_assembly_dir = os.path.join(os.path.abspath( - arg_dict["refdata_dir"]), "data", arg_dict["genome_assembly"]) - if not os.path.isdir(db_assembly_dir): - err_msg = "Data directory for the specified genome assembly (" + str( - db_assembly_dir) + ") does not exist" - error_message(err_msg, logger) - - # check the existence of .PCGR_BUNDLE_VERSION (starting from 1.5.0) - rel_notes_file = os.path.join(os.path.abspath( - arg_dict["refdata_dir"]), "data", arg_dict["genome_assembly"], ".PCGR_BUNDLE_VERSION") - if not os.path.exists(rel_notes_file): - err_msg = "The PCGR data bundle is outdated - please download the latest data bundle (see github.com/sigven/pcgr for instructions)" - error_message(err_msg, logger) - - f_rel_not = open(rel_notes_file,'r') - compliant_data_bundle = 0 - for line in f_rel_not: - if pcgr_vars.DB_VERSION in line: - compliant_data_bundle = 1 - - f_rel_not.close() - - if compliant_data_bundle == 0: - err_msg = ( - f'The PCGR data bundle is not compliant with the software version - please download the ' - f'latest software and data bundle (see https://github.com/sigven/cpsr for instructions)') - error_message(err_msg,logger) + vep_dir = verify_vep_cache(arg_dict, logger) + refdata_assembly_dir = verify_refdata(arg_dict, logger, cpsr = True) cpsr_paths = { "input_vcf_dir": input_vcf_dir, "input_customlist_dir": input_customlist_dir, - "db_assembly_dir": db_assembly_dir, - "refdata_dir": base_dir, + "refdata_assembly_dir": refdata_assembly_dir, "vep_dir": vep_dir, "output_dir": output_dir_full, "output_vcf": output_vcf, @@ -757,3 +660,4 @@ def verify_input_files_cpsr(arg_dict): } return cpsr_paths + diff --git a/pcgr/cna.py b/pcgr/cna.py index d60f8bac..dcd4fd63 100644 --- a/pcgr/cna.py +++ b/pcgr/cna.py @@ -17,7 +17,7 @@ def annotate_cna_segments(output_fname: str, output_dir: str, cna_segment_file: str, - db_assembly_dir: str, + refdata_assembly_dir: str, build: str, sample_id: str, n_copy_amplifications: int = 5, @@ -30,7 +30,7 @@ def annotate_cna_segments(output_fname: str, 
output_fname (str): File name of the annotated output file. output_dir (str): Directory to save the annotated file. cna_segment_file (str): Path to the user-provided copy number aberrations segment file. - db_assembly_dir (str): Path to the build-specific PCGR database directory. + refdata_assembly_dir (str): Path to the build-specific PCGR database directory. build (str): Genome assembly build of input data. sample_id( (str): Sample identifier n_copy_amplifications (int, optional): Number of copies to consider as gains/amplifications. Defaults to 5. @@ -86,7 +86,7 @@ def annotate_cna_segments(output_fname: str, ## Check that 'End' of segments do not exceed chromosome lengths chromsizes_fname = \ - os.path.join(db_assembly_dir, 'chromsize.' + build + '.tsv') + os.path.join(refdata_assembly_dir, 'chromsize.' + build + '.tsv') check_file_exists(chromsizes_fname, logger) chromsizes = pd.read_csv(chromsizes_fname, sep="\t", header=None, names=['Chromosome', 'ChromLength']) @@ -110,14 +110,14 @@ def annotate_cna_segments(output_fname: str, temp_files.append(cna_query_segment_bed.fn) ## annotate segments with cytobands - cna_query_segment_df = annotate_cytoband(cna_query_segment_bed, output_dir, db_assembly_dir, logger) + cna_query_segment_df = annotate_cytoband(cna_query_segment_bed, output_dir, refdata_assembly_dir, logger) ## annotate with protein-coding transcripts cna_query_segment_bed = pybedtools.BedTool.from_dataframe(cna_query_segment_df) temp_files.append(cna_query_segment_bed.fn) cna_query_segment_df = annotate_transcripts( - cna_query_segment_bed, output_dir, db_assembly_dir, overlap_fraction=overlap_fraction, logger=logger) + cna_query_segment_bed, output_dir, refdata_assembly_dir, overlap_fraction=overlap_fraction, logger=logger) cna_query_segment_df['segment_length_mb'] = \ ((cna_query_segment_df['segment_end'] - cna_query_segment_df['segment_start']) / 1e6).astype(float).round(5) @@ -128,8 +128,8 @@ def annotate_cna_segments(output_fname: str, cna_actionable_dict = {} for db in ['cgi','civic']: - variant_fname = os.path.join(db_assembly_dir, 'biomarker','tsv', f"{db}.variant.tsv.gz") - clinical_fname = os.path.join(db_assembly_dir, 'biomarker','tsv', f"{db}.clinical.tsv.gz") + variant_fname = os.path.join(refdata_assembly_dir, 'biomarker','tsv', f"{db}.variant.tsv.gz") + clinical_fname = os.path.join(refdata_assembly_dir, 'biomarker','tsv', f"{db}.clinical.tsv.gz") logger.info(f"Loading copy-number biomarker evidence from {db} ..") biomarkers[db] = load_biomarkers( logger, variant_fname, clinical_fname, biomarker_vartype = 'CNA') @@ -197,14 +197,14 @@ def annotate_cna_segments(output_fname: str, return 0 -def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, db_assembly_dir: str, logger: logging.Logger) -> pd.DataFrame: +def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, refdata_assembly_dir: str, logger: logging.Logger) -> pd.DataFrame: pybedtools.set_tempdir(output_dir) temp_files = [] # BED file with cytoband annotations cytoband_bed_fname = \ - os.path.join(db_assembly_dir, 'misc','bed','cytoband', 'cytoband.bed.gz') + os.path.join(refdata_assembly_dir, 'misc','bed','cytoband', 'cytoband.bed.gz') cytoband_annotated_segments = pd.DataFrame() @@ -272,7 +272,7 @@ def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, db_assembly_dir def annotate_transcripts(cna_segments_bt: BedTool, output_dir: str, - db_assembly_dir: str, + refdata_assembly_dir: str, overlap_fraction: float, logger: logging.Logger) -> pd.DataFrame: @@ -282,7 +282,7 @@ def 
annotate_transcripts(cna_segments_bt: BedTool, Parameters: - cna_segments_bt: A BedTool object representing the CNA segments. - output_dir: A string specifying the output directory (for temporary files generation). - - db_assembly_dir: A string specifying the directory of the build-specific PCGR data bundle. + - refdata_assembly_dir: A string specifying the directory of the build-specific PCGR data bundle. - overlap_fraction: A float representing the fraction of overlap required for annotation. Returns: @@ -294,9 +294,9 @@ def annotate_transcripts(cna_segments_bt: BedTool, # BED file with protein-coding transcripts gene_transcript_bed_fname = \ - os.path.join(db_assembly_dir, 'gene','bed','gene_transcript_xref', 'gene_transcript_xref_pc_nopad.bed.gz') + os.path.join(refdata_assembly_dir, 'gene','bed','gene_transcript_xref', 'gene_transcript_xref_pc_nopad.bed.gz') gene_xref_tsv_fname = \ - os.path.join(db_assembly_dir, "gene", "tsv", "gene_transcript_xref", "gene_transcript_xref.tsv.gz") + os.path.join(refdata_assembly_dir, "gene", "tsv", "gene_transcript_xref", "gene_transcript_xref.tsv.gz") cna_segments_annotated = pd.DataFrame() diff --git a/pcgr/config.py b/pcgr/config.py index 7b5168f7..acefd5bd 100644 --- a/pcgr/config.py +++ b/pcgr/config.py @@ -17,7 +17,8 @@ def create_config(arg_dict, workflow = "PCGR"): 'sample_id': arg_dict['sample_id'], 'genome_assembly': arg_dict['genome_assembly'], 'debug': arg_dict['debug'], - 'pcgrr_conda': arg_dict['pcgrr_conda'] + 'pcgrr_conda': arg_dict['pcgrr_conda'], + 'output_prefix': "None" } conf_options['vep'] = { @@ -72,9 +73,9 @@ def create_config(arg_dict, workflow = "PCGR"): if not arg_dict['tumor_ploidy'] is None: conf_options['sample_properties']['tumor_ploidy'] = float(arg_dict['tumor_ploidy']) - conf_options['clinicaltrials'] = { - 'run': int(arg_dict['include_trials']) - } + #conf_options['clinicaltrials'] = { + # 'run': int(arg_dict['include_trials']) + #} conf_options['other']['vcf2maf'] = int(arg_dict['vcf2maf']) conf_options['somatic_cna'] = { 'cna_overlap_pct': float(arg_dict['cna_overlap_pct']), @@ -172,11 +173,12 @@ def create_config(arg_dict, workflow = "PCGR"): return conf_options -def populate_config_data(conf_options: dict, db_dir: str, workflow = "PCGR", logger=None): +def populate_config_data(conf_options: dict, refdata_assembly_dir: str, workflow = "PCGR", logger=None): conf_data = {} conf_data['sample_id'] = conf_options['sample_id'] conf_data['output_dir'] = conf_options['output_dir'] + conf_data['output_prefix'] = conf_options['output_prefix'] conf_data['workflow'] = workflow conf_data['genome_assembly'] = conf_options['genome_assembly'] conf_data['software'] = {} @@ -184,42 +186,17 @@ def populate_config_data(conf_options: dict, db_dir: str, workflow = "PCGR", log conf_data['software']['cpsr_version'] = pcgr_vars.PCGR_VERSION conf_data['molecular_data'] = conf_options['molecular_data'] - ## add paths to annotated files (VCF/TSV, CNA, TMB, EXPRESSION) - # conf_data['molecular_data'] = {} - # conf_data['molecular_data']['fname_mut_vcf'] = conf_options['annotated_vcf'] - # conf_data['molecular_data']['fname_mut_tsv'] = conf_options['annotated_tsv'] - # conf_data['molecular_data']['fname_cna_tsv'] = "None" - # conf_data['molecular_data']['fname_expression_tsv'] = "None" - # if workflow == "PCGR" and conf_options['annotated_cna'] != "None": - # conf_data['molecular_data']['fname_cna_tsv'] = conf_options['annotated_cna'] - # del conf_options['annotated_cna'] - # if workflow == "PCGR" and conf_options['annotated_exp'] != "None": 
- # conf_data['molecular_data']['fname_expression_tsv'] = conf_options['annotated_exp'] - # del conf_options['annotated_exp'] - # if workflow == "CPSR": - # del conf_data['molecular_data']['fname_cna_tsv'] - # del conf_data['molecular_data']['fname_expression_tsv'] - - # conf_data['molecular_data']['fname_tmb'] = "None" - # if workflow == "PCGR" and conf_options['fname_tmb'] != "None": - # conf_data['molecular_data']['fname_tmb'] = conf_options['fname_tmb'] - # del conf_options['fname_tmb'] - # if workflow == "CPSR": - # del conf_data['molecular_data']['fname_tmb'] - - genome_assembly = conf_options['genome_assembly'] del conf_options['sample_id'] del conf_options['genome_assembly'] del conf_options['output_dir'] del conf_options['molecular_data'] - #del conf_options['annotated_vcf'] - #del conf_options['annotated_tsv'] + conf_data['conf'] = conf_options conf_data['reference_data'] = {} conf_data['reference_data']['version'] = pcgr_vars.DB_VERSION conf_data['reference_data']['path'] = \ - os.path.join(db_dir, "data", genome_assembly) + refdata_assembly_dir metadata_pd = pd.DataFrame() conf_data['reference_data']['source_metadata'] = {} @@ -228,11 +205,12 @@ def populate_config_data(conf_options: dict, db_dir: str, workflow = "PCGR", log ## add metadata information for each data source cpsr_sources_regex = r'^(gepa|cpg_other|maxwell2016|acmg_sf|dbmts|woods_dnarepair|gerp|tcga_pancan_2018|gwas_catalog)' - pcgr_sources_regex = r'^(cytoband|mitelmandb|tcga|nci|intogen|opentargets|dgidb|pubchem|cosmic_mutsigs)' + pcgr_sources_regex = r'^(cytoband|mitelmandb|tcga|nci|intogen|opentargets|depmap|treehouse|dgidb|pubchem|cosmic_mutsigs)' + sources_skip_regex = r'^(illumina|foundation_one)' for dtype in ['gene','phenotype','biomarker','drug','gwas','hotspot','other']: metadata_fname = os.path.join( - db_dir, "data", conf_data['genome_assembly'], + refdata_assembly_dir, ".METADATA", "tsv", dtype + "_metadata.tsv") if check_file_exists(metadata_fname, logger): metadata_df = pd.read_csv(metadata_fname, sep="\t", na_values=".") @@ -240,11 +218,12 @@ def populate_config_data(conf_options: dict, db_dir: str, workflow = "PCGR", log metadata_df['wflow'] = 'pcgr_cpsr' metadata_df.loc[metadata_df['source_abbreviation'].str.match(cpsr_sources_regex), 'wflow'] = 'cpsr' metadata_df.loc[metadata_df['source_abbreviation'].str.match(pcgr_sources_regex), 'wflow'] = 'pcgr' + metadata_df.loc[metadata_df['source_abbreviation'].str.match(sources_skip_regex), 'wflow'] = 'skip' metadata_pd = metadata_pd._append(metadata_df, ignore_index=True) conf_data['reference_data']['source_metadata'] = metadata_pd.to_dict(orient='records') - oncotree_fname = os.path.join(db_dir, "data", genome_assembly, + oncotree_fname = os.path.join(refdata_assembly_dir, "phenotype","tsv","phenotype_onco.tsv.gz") if check_file_exists(oncotree_fname, logger): @@ -275,8 +254,7 @@ def populate_config_data(conf_options: dict, db_dir: str, workflow = "PCGR", log conf_data['conf']['gene_panel']['panel_genes'] = set_virtual_target_genes( conf_data['conf']['gene_panel']['panel_id'], - db_dir, - conf_data['genome_assembly'], + refdata_assembly_dir, conf_data['conf']['gene_panel']['diagnostic_grade_only'], conf_data['conf']['gene_panel']['custom_list_tsv'], bool(conf_data['conf']['variant_classification']['secondary_findings']), @@ -285,16 +263,16 @@ def populate_config_data(conf_options: dict, db_dir: str, workflow = "PCGR", log return(conf_data) -def set_virtual_target_genes(panel_id: str, db_dir: str, genome_assembly: str, diagnostic_grade_only: bool, +def 
set_virtual_target_genes(panel_id: str, refdata_assembly_dir: str, diagnostic_grade_only: bool, custom_list_tsv: str, secondary_findings: bool, logger=None): all_panels_fname = os.path.join( - db_dir, "data", genome_assembly, + refdata_assembly_dir, "gene","tsv","gene_virtual_panel", "gene_virtual_panel.tsv.gz") all_cpg_fname = os.path.join( - db_dir, "data", genome_assembly, + refdata_assembly_dir, "gene","tsv","gene_cpg", "gene_cpg.tsv.gz") diff --git a/pcgr/cpsr.py b/pcgr/cpsr.py index 4d1a08e4..8fe72dec 100755 --- a/pcgr/cpsr.py +++ b/pcgr/cpsr.py @@ -15,7 +15,7 @@ from glob import glob from argparse import RawTextHelpFormatter from pcgr import pcgr_vars, arg_checker, utils, config, variant -from pcgr.utils import check_subprocess, getlogger, error_message, warn_message, remove_file +from pcgr.utils import check_subprocess, getlogger, error_message, warn_message, remove_file, random_id_generator from pcgr.config import populate_config_data from pcgr.vep import get_vep_command @@ -80,81 +80,79 @@ def get_args(): def main(): arg_dict = get_args() # check parsed arguments - arg_checker.check_args_cpsr(arg_dict) + arg_dict_verified = arg_checker.verify_args_cpsr(arg_dict) # Verify existence of input files - cpsr_paths = arg_checker.verify_input_files_cpsr(arg_dict) + + input_data = arg_checker.verify_input_files_cpsr(arg_dict_verified) + output_data = arg_checker.define_output_files(arg_dict, cpsr = True) # create configuration/options conf_options = config.create_config(arg_dict, workflow = "CPSR") ## Run CPSR workflow - run_cpsr(conf_options, cpsr_paths) + run_cpsr(conf_options, input_data, output_data) -def run_cpsr(conf_options, cpsr_paths): +def run_cpsr(conf_options, input_data, output_data): """ Main function to run the CPSR workflow """ - prefix = os.path.join( - cpsr_paths['output_dir'], f'{conf_options["sample_id"]}.cpsr.{conf_options["genome_assembly"]}') - debug = conf_options['debug'] vep_skip_intergenic_set = 'ON' if conf_options['vep']['vep_no_intergenic'] == 1 else 'OFF' - output_vcf = 'None' - output_pass_vcf = 'None' - output_pass_tsv = 'None' + #output_vcf = 'None' + #output_pass_vcf = 'None' + #output_pass_tsv = 'None' uid = '' genome_assembly = str(conf_options['genome_assembly']) input_vcf = 'None' input_customlist = 'None' output_custom_bed = 'None' + output_dir = output_data['dir'] + output_prefix = output_data['prefix'] print('') logger = getlogger('cpsr-validate-input-arguments') logger.info("CPSR - STEP 0: Validate input data") - if cpsr_paths['input_vcf_basename'] != 'NA': - input_vcf = os.path.join(cpsr_paths['input_vcf_dir'], cpsr_paths['input_vcf_basename']) - if cpsr_paths['input_customlist_basename'] != 'NA' and cpsr_paths['input_customlist_dir'] != 'NA': + if input_data['input_vcf_basename'] != 'NA': + input_vcf = os.path.join(input_data['input_vcf_dir'], input_data['input_vcf_basename']) + if input_data['input_customlist_basename'] != 'NA' and input_data['input_customlist_dir'] != 'NA': input_customlist = os.path.join( - cpsr_paths['input_customlist_dir'], cpsr_paths['input_customlist_basename']) - output_custom_bed = f'{prefix}.custom_panel.bed' + input_data['input_customlist_dir'], input_data['input_customlist_basename']) + output_custom_bed = f'{output_prefix}.custom_panel.bed' if conf_options['gene_panel']['custom_list_name'] == "None": warn_msg = "No custom list name provided, use argument '--custom_list_name' to provide name for custom defined panel" warn_message(warn_msg, logger) if not input_vcf == 'None': - refdata_dir = cpsr_paths['refdata_dir'] 
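# --- Editor's note (illustrative sketch, not part of the patch) ---------------
# The cpsr.py hunks below move the CPSR intermediate files onto the shared
# output prefix plus a random suffix (via pcgr.utils.random_id_generator),
# mirroring what run_pcgr() already does. A minimal, self-contained sketch of
# that naming pattern; the stand-in random_id_generator() here is an assumption
# and may differ from the real helper in pcgr.utils.
import random
import string

def random_id_generator(size: int = 15) -> str:
    # Random alphanumeric suffix that keeps temporary files unique per run
    return ''.join(random.choices(string.ascii_lowercase + string.digits, k=size))

def cpsr_temp_files(output_prefix: str) -> dict:
    # Intermediate file names derived from <output_dir>/<sample_id>.cpsr.<assembly>
    rid = random_id_generator(15)
    return {
        'ready_vcf': f'{output_prefix}.{rid}.ready.vcf.gz',
        'vep_vcf': f'{output_prefix}.{rid}.vep.vcf',
        'vep_vcfanno_vcf': f'{output_prefix}.{rid}.vep.vcfanno.vcf',
    }

# Example: cpsr_temp_files('/work/out/SAMPLE01.cpsr.grch38')
# -----------------------------------------------------------------------------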
- db_assembly_dir = cpsr_paths['db_assembly_dir'] - output_dir = cpsr_paths['output_dir'] - check_subprocess(logger, f'mkdir -p {output_dir}', debug) ## Define input, output and temporary file names - input_vcf_validated = f'{conf_options["sample_id"]}.cpsr_ready_target.vcf.gz' - input_vcf_validated_uncompr = f'{conf_options["sample_id"]}.cpsr_ready_target.vcf' - vep_vcf = f'{conf_options["sample_id"]}.vep.vcf' - vep_vcfanno_vcf = f'{conf_options["sample_id"]}.vep.vcfanno.vcf' - vep_vcfanno_summarised_vcf = f'{conf_options["sample_id"]}.vep.vcfanno.summarised.vcf' - vep_vcfanno_summarised_pass_vcf = f'{conf_options["sample_id"]}.vep.vcfanno.summarised.pass.vcf' - output_vcf = f'{prefix}.vcf.gz' - output_pass_vcf = f'{prefix}.pass.vcf.gz' - output_pass_vcf2tsv = f'{prefix}.pass.vcf2tsv.tsv' + random_id = random_id_generator(15) + # Define temporary output files + input_vcf_validated = f'{output_prefix}.{random_id}.ready.vcf.gz' + input_vcf_validated_uncompr = f'{output_prefix}.{random_id}.ready.vcf' + vep_vcf = f'{output_prefix}.{random_id}.vep.vcf' + vep_vcfanno_vcf = f'{output_prefix}.{random_id}.vep.vcfanno.vcf' + vep_vcfanno_summarised_vcf = f'{output_prefix}.{random_id}.vep.vcfanno.summarised.vcf' + vep_vcfanno_summarised_pass_vcf = f'{output_prefix}.{random_id}.vep.vcfanno.summarised.pass.vcf' + output_vcf = f'{output_prefix}.vcf.gz' + output_pass_vcf = f'{output_prefix}.pass.vcf.gz' + output_pass_vcf2tsv = f'{output_prefix}.pass.vcf2tsv.tsv' output_pass_vcf2tsv_gz = f'{output_pass_vcf2tsv}.gz' - output_pass_tsv = f'{prefix}.pass.tsv' + output_pass_tsv = f'{output_prefix}.pass.tsv' output_pass_tsv_gz = f'{output_pass_tsv}.gz' - yaml_fname = f'{prefix}.conf.yaml' + yaml_fname = f'{output_prefix}.conf.yaml' ## CPSR|Validate input VCF - check formatting, non-overlap with CPSR INFO tags, and whether sample contains any variants in cancer predisposition loci vcf_validate_command = ( f'cpsr_validate_input.py ' - f'{refdata_dir} ' + f'{input_data["refdata_assembly_dir"]} ' f'{input_vcf} ' f'{input_vcf_validated_uncompr} ' f'{input_customlist} ' f'{output_custom_bed} ' f'{conf_options["other"]["retained_vcf_info_tags"]} ' - f'{conf_options["genome_assembly"]} ' f'{conf_options["sample_id"]} ' f'{conf_options["gene_panel"]["panel_id"]} ' f'{conf_options["gene_panel"]["diagnostic_grade_only"]} ' @@ -185,23 +183,22 @@ def run_cpsr(conf_options, cpsr_paths): logger.info(f"Genome assembly: {conf_options['genome_assembly']}") print('----') - #conf_options['annotated_tsv'] = output_pass_tsv_gz - #conf_options['annotated_vcf'] = output_vcf conf_options['molecular_data']['fname_mut_vcf'] = output_vcf conf_options['molecular_data']['fname_mut_tsv'] = output_pass_tsv_gz conf_options['output_dir'] = output_dir + conf_options['output_prefix'] = output_prefix conf_options['gene_panel']['custom_list_bed'] = "None" if not input_customlist == 'None': conf_options['gene_panel']['custom_list_bed'] = output_custom_bed ## Write YAML configuration file - settings, path to files, reference bundle etc - yaml_data = populate_config_data(conf_options, refdata_dir, workflow = "CPSR", logger = logger) + yaml_data = populate_config_data(conf_options, input_data["refdata_assembly_dir"], workflow = "CPSR", logger = logger) with open(yaml_fname, "w") as outfile: outfile.write(yaml.dump(yaml_data)) outfile.close() ## CPSR|VEP - run Variant Effect Predictor on query VCF with LoF and NearestExonJB plugins - vep_command = get_vep_command(file_paths = cpsr_paths, + vep_command = get_vep_command(file_paths = input_data, 
conf_options = yaml_data, input_vcf = input_vcf_validated, output_vcf = vep_vcf) @@ -236,8 +233,7 @@ def run_cpsr(conf_options, cpsr_paths): f'{"--debug" if debug else ""} ' f"--dbmts --gerp --tcga --gnomad_non_cancer --gene_transcript_xref " f"--gwas --rmsk {vep_vcf}.gz {vep_vcfanno_vcf} " - f"{db_assembly_dir}" - #f"{os.path.join(refdata_dir, 'data', str(yaml_data['genome_assembly']))}" + f"{input_data['refdata_assembly_dir']}" ) check_subprocess(logger, pcgr_vcfanno_command, debug) logger.info("Finished cpsr-vcfanno") @@ -250,7 +246,7 @@ def run_cpsr(conf_options, cpsr_paths): f'pcgr_summarise.py {vep_vcfanno_vcf}.gz {vep_vcfanno_summarised_vcf} 0 ' f'{yaml_data["conf"]["vep"]["vep_regulatory"]} 0 ' f'Any {yaml_data["conf"]["vep"]["vep_pick_order"]} ' - f'{cpsr_paths["db_assembly_dir"]} --compress_output_vcf ' + f'{input_data["refdata_assembly_dir"]} --compress_output_vcf ' f'--cpsr_yaml {yaml_fname} ' f'--cpsr {"--debug" if debug else ""}' ) @@ -283,7 +279,7 @@ def run_cpsr(conf_options, cpsr_paths): logger.info("Appending ClinVar traits, official gene names, and protein domain annotations") variant_set = \ variant.append_annotations( - output_pass_vcf2tsv_gz, db_assembly_dir = cpsr_paths["db_assembly_dir"], logger = logger) + output_pass_vcf2tsv_gz, refdata_assembly_dir = input_data["refdata_assembly_dir"], logger = logger) variant_set = variant.clean_annotations(variant_set, yaml_data, germline = True, logger = logger) ## If no genotypes are available, set conf['sample_properties']['genotypes_available'] = 1 diff --git a/pcgr/expression.py b/pcgr/expression.py index cce25e10..927ed964 100644 --- a/pcgr/expression.py +++ b/pcgr/expression.py @@ -12,14 +12,14 @@ def parse_expression(expression_fname_tsv: str, sample_id: str, - db_assembly_dir: str, + refdata_assembly_dir: str, logger: logging.Logger = None): """ Parse the expression data from the user-provided file and verify record identifiers with reference data. Aggregate transcript-level TPM values to generate gene-level TPM values. Args: expression_fname_tsv (str): Path to the user-provided gene expression file. - db_assembly_dir (str): Path to the build-specific PCGR database directory. + refdata_assembly_dir (str): Path to the build-specific PCGR reference data directory. logger (logging.Logger, optional): Logger. Defaults to None. 
""" ## Check that the expression file exists @@ -38,7 +38,7 @@ def parse_expression(expression_fname_tsv: str, ## gene symbols/aliases and identifiers (Ensembl) to Entrez gene identifers and ## biotype gene_index_fname_tsv = \ - os.path.join(db_assembly_dir, "gene", "tsv", "gene_transcript_xref", "gene_index.tsv.gz") + os.path.join(refdata_assembly_dir, "gene", "tsv", "gene_transcript_xref", "gene_index.tsv.gz") check_file_exists(gene_index_fname_tsv, logger) gene_index = pd.read_csv(gene_index_fname_tsv, sep = "\t", na_values = ".", low_memory = False) @@ -127,6 +127,12 @@ def parse_expression(expression_fname_tsv: str, ['ENSEMBL_GENE_ID','SYMBOL','ENTREZGENE','GENENAME','BIOTYPE']).agg({'TPM':'sum'}).reset_index() expression_map['gene'].columns = ['ENSEMBL_GENE_ID','SYMBOL','ENTREZGENE','GENENAME','BIOTYPE','TPM_GENE'] expression_map['gene'] = expression_map['gene'].drop_duplicates().sort_values(by='TPM_GENE', ascending=False) + + ## for transcripts with missing TPM's, use the minimum TPM value across all transcripts + expression_map['gene_min_tpm'] = transcript_expression_map.groupby( + ['ENSEMBL_GENE_ID','SYMBOL','ENTREZGENE','GENENAME','BIOTYPE']).agg({'TPM':'min'}).reset_index() + expression_map['gene_min_tpm'].columns = ['ENSEMBL_GENE_ID','SYMBOL','ENTREZGENE','GENENAME','BIOTYPE','TPM_MIN'] + expression_map['gene_min_tpm'] = expression_map['gene_min_tpm'].drop_duplicates().sort_values(by='TPM_MIN', ascending=False) return(expression_map) @@ -145,17 +151,22 @@ def integrate_variant_expression(variant_set: pd.DataFrame, """ if not expression_data is None: - if 'gene' in expression_data.keys(): - if not expression_data['gene'] is None: - logger.info("Integrating gene-level expression data from tumor into somatic variant set") - if expression_data['gene'].empty: - logger.warn('Expression file does not contain any gene-level expression data') - else: - if 'TPM_GENE' in variant_set.columns: - variant_set.drop('TPM_GENE', inplace=True, axis=1) - if {'TPM_GENE','ENSEMBL_GENE_ID'}.issubset(expression_data['gene'].columns) and \ - 'ENSEMBL_GENE_ID' in variant_set.columns: - variant_set = variant_set.merge(expression_data['gene'], on = 'ENSEMBL_GENE_ID', how = 'left') + for s in ['gene','gene_min_tpm']: + if s in expression_data.keys(): + if not expression_data[s] is None: + logger.info("Integrating gene-level expression data from tumor into somatic variant set") + if expression_data[s].empty: + logger.warn('Expression file does not contain any gene-level expression data') + else: + v = 'TPM_MIN' + if s == 'gene': + v = 'TPM_GENE' + if v in variant_set.columns: + variant_set.drop(v, inplace=True, axis=1) + if {v,'ENSEMBL_GENE_ID'}.issubset(expression_data[s].columns) and \ + 'ENSEMBL_GENE_ID' in variant_set.columns: + exp_gene = expression_data[s][[v,'ENSEMBL_GENE_ID']] + variant_set = variant_set.merge(exp_gene, on = 'ENSEMBL_GENE_ID', how = 'left') if 'transcript' in expression_data.keys() and 'transcript_identifier' in expression_data.keys(): if not expression_data['transcript'] is None: @@ -187,6 +198,7 @@ def integrate_variant_expression(variant_set: pd.DataFrame, if 'TPM' in variant_set.columns: variant_set.drop('TPM', inplace=True, axis=1) if {'TPM','VAR_ID','ENSEMBL_TRANSCRIPT_ID'}.issubset(var2exp.columns): + var2exp = var2exp[['TPM','VAR_ID','ENSEMBL_TRANSCRIPT_ID']] variant_set = variant_set.merge(var2exp, on = ['VAR_ID','ENSEMBL_TRANSCRIPT_ID'], how = 'left') else: variant_set['TPM'] = np.nan @@ -196,7 +208,7 @@ def integrate_variant_expression(variant_set: pd.DataFrame, def 
correlate_sample_expression(sample_expression_data: dict, yaml_data: dict, - db_assembly_dir: str, + refdata_assembly_dir: str, protein_coding_only: bool = True, logger: logging.Logger = None) -> pd.DataFrame: @@ -229,7 +241,7 @@ def correlate_sample_expression(sample_expression_data: dict, if 'tcga' in yaml_data['conf']['gene_expression']['similarity_db'].keys(): for cohort in yaml_data['conf']['gene_expression']['similarity_db']['tcga'].keys(): - exp_fname = os.path.join(db_assembly_dir, "expression", "tsv", + exp_fname = os.path.join(refdata_assembly_dir, "expression", "tsv", "tcga", "tcga_" + str(cohort).lower() + "_tpm.tsv.gz") if check_file_exists(exp_fname, strict = False, logger = logger): sample_comp_exp_mat = pd.read_csv( @@ -245,8 +257,8 @@ def correlate_sample_expression(sample_expression_data: dict, corr_mat = correlate_samples(sample_expression_data, sample_id, 'tcga-' + str(cohort).lower(), protein_coding_only, sample_comp_exp_mat, logger) ## Make sure sample IDs match patient barcodes (sample metadata is organized per patient) - corr_mat['EXT_SAMPLE_ID'] = corr_mat['EXT_SAMPLE_ID'].replace(regex=r'-[0-9][0-9][A-Z]$', value='') - metadata_fname = os.path.join(db_assembly_dir, "expression", "tsv", + #corr_mat['EXT_SAMPLE_ID'] = corr_mat['EXT_SAMPLE_ID'].replace(regex=r'-[0-9][0-9][A-Z]$', value='') + metadata_fname = os.path.join(refdata_assembly_dir, "expression", "tsv", 'tcga', "tcga_" + str(cohort).lower() + "_sample_metadata.tsv.gz") if check_file_exists(metadata_fname, strict = False, logger = logger): sample_metadata = pd.read_csv( @@ -266,7 +278,7 @@ def correlate_sample_expression(sample_expression_data: dict, if source == 'treehouse': source_verbose = 'samples from the Treehouse Childhood Cancer Initiative' if source in yaml_data['conf']['gene_expression']['similarity_db'].keys(): - exp_fname = os.path.join(db_assembly_dir, "expression", "tsv", + exp_fname = os.path.join(refdata_assembly_dir, "expression", "tsv", source, source + "_tpm.tsv.gz") if check_file_exists(exp_fname, strict = False, logger = logger): sample_comp_exp_mat = pd.read_csv( @@ -286,7 +298,7 @@ def correlate_sample_expression(sample_expression_data: dict, if 'CORRELATION' in sample_exp_similarity[source].columns: sample_exp_similarity[source] = \ sample_exp_similarity[source].sort_values(by=['CORRELATION'], ascending=False) - metadata_fname = os.path.join(db_assembly_dir, "expression", "tsv", + metadata_fname = os.path.join(refdata_assembly_dir, "expression", "tsv", source, source + "_sample_metadata.tsv.gz") if check_file_exists(metadata_fname, strict = False, logger = logger): sample_metadata = pd.read_csv( @@ -294,7 +306,8 @@ def correlate_sample_expression(sample_expression_data: dict, sample_exp_similarity[source] = sample_exp_similarity[source].merge( sample_metadata, on = ['EXT_SAMPLE_ID'], how = 'left') - + #else: + #print("BALLE") return(sample_exp_similarity) diff --git a/pcgr/main.py b/pcgr/main.py index 7ab94c7b..1414217a 100755 --- a/pcgr/main.py +++ b/pcgr/main.py @@ -42,7 +42,7 @@ def cli(): optional_allelic_support = parser.add_argument_group("Allelic support options") optional_other = parser.add_argument_group("Other options") - optional_rna.add_argument("--input_rna_fusion", dest = "input_rna_fusion", help = "File with RNA fusion transcripts detected in tumor (tab-separated values)") + #optional_rna.add_argument("--input_rna_fusion", dest = "input_rna_fusion", help = "File with RNA fusion transcripts detected in tumor (tab-separated values)") 
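# --- Editor's note (illustrative sketch, not part of the patch) ---------------
# The expression.py hunks above derive gene-level values from transcript-level
# TPMs: a per-gene sum (TPM_GENE) and, newly in this patch, a per-gene minimum
# (TPM_MIN in the 'gene_min_tpm' map). A minimal pandas sketch of that
# aggregation; column names follow the patch, the toy data is illustrative only.
import pandas as pd

transcripts = pd.DataFrame({
    'ENSEMBL_GENE_ID': ['ENSG01', 'ENSG01', 'ENSG02'],
    'SYMBOL': ['GENE_A', 'GENE_A', 'GENE_B'],
    'TPM': [5.0, 2.5, 10.0],
})

# Sum of transcript TPMs per gene (analogous to expression_map['gene'] / TPM_GENE)
gene_tpm = (transcripts
            .groupby(['ENSEMBL_GENE_ID', 'SYMBOL'])
            .agg({'TPM': 'sum'})
            .reset_index()
            .rename(columns={'TPM': 'TPM_GENE'}))

# Minimum transcript TPM per gene (analogous to the new 'gene_min_tpm' map / TPM_MIN)
gene_min_tpm = (transcripts
                .groupby(['ENSEMBL_GENE_ID', 'SYMBOL'])
                .agg({'TPM': 'min'})
                .reset_index()
                .rename(columns={'TPM': 'TPM_MIN'}))
# -----------------------------------------------------------------------------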
optional_rna.add_argument("--input_rna_expression", dest = "input_rna_exp", help = "File with bulk RNA expression counts (TPM) of transcripts in tumor (tab-separated values)") optional_rna.add_argument('--expression_sim', action='store_true', help="Compare expression profile of tumor sample to known expression profiles (default: %(default)s)") optional_rna.add_argument("--expression_sim_db", dest = "expression_sim_db", default="tcga,depmap,treehouse", help=f"Comma-separated string " + \ @@ -88,7 +88,7 @@ def cli(): optional_other.add_argument("--cpsr_report", dest="cpsr_report", help="CPSR report file (Gzipped JSON - file ending with 'cpsr..json.gz' - germline report of patient's blood/control sample") optional_other.add_argument("--vcf2maf", action="store_true", help="Generate a MAF file for input VCF using https://github.com/mskcc/vcf2maf (default: %(default)s)") optional_other.add_argument("--ignore_noncoding", action="store_true", help="Ignore non-coding (i.e. non protein-altering) variants in report, default: %(default)s") - optional_other.add_argument("--include_trials", action="store_true", help="(Beta) Include relevant ongoing or future clinical trials, focusing on studies with molecularly targeted interventions") + #optional_other.add_argument("--include_trials", action="store_true", help="(Beta) Include relevant ongoing or future clinical trials, focusing on studies with molecularly targeted interventions") optional_other.add_argument("--retained_info_tags", dest="retained_info_tags", default="None", help="Comma-separated string of VCF INFO tags from query VCF that should be kept in PCGR output TSV file") optional_other.add_argument("--force_overwrite", action="store_true", help="By default, the script will fail with an error if any output file already exists. 
You can force the overwrite of existing result files by using this flag, default: %(default)s") optional_other.add_argument("--version", action="version", version="%(prog)s " + str(pcgr_vars.PCGR_VERSION)) @@ -134,17 +134,17 @@ def cli(): args = parser.parse_args() arg_dict = vars(args) - # check parsed arguments - arg_checker.check_args(arg_dict) + # verify parsed arguments + arg_dict_verified = arg_checker.verify_args(arg_dict) # create config options - conf_options = create_config(arg_dict, workflow = "PCGR") + conf_options = create_config(arg_dict_verified, workflow = "PCGR") # Verify existence of input files, define and check existence of output files - input_data = arg_checker.verify_input_files(arg_dict) - output_data = arg_checker.define_output_files(arg_dict) + input_data = arg_checker.verify_input_files(arg_dict_verified) + output_data = arg_checker.define_output_files(arg_dict_verified) - # Run PCGR workflow (vep, vcfanno, summarise, vcf2tsvpy, html) + # Run PCGR workflow run_pcgr(input_data, output_data, conf_options) def run_pcgr(input_data, output_data,conf_options): @@ -155,7 +155,7 @@ def run_pcgr(input_data, output_data,conf_options): debug = conf_options['debug'] # set basic run commands - clinical_trials_set = 'ON' if conf_options['clinicaltrials']['run'] else 'OFF' + #clinical_trials_set = 'ON' if conf_options['clinicaltrials']['run'] else 'OFF' msi_prediction_set = 'ON' if conf_options['somatic_snv']['msi']['run'] else 'OFF' msig_estimation_set = 'ON' if conf_options['somatic_snv']['mutational_signatures']['run'] else 'OFF' tmb_estimation_set = 'ON' if conf_options['somatic_snv']['tmb']['run'] else 'OFF' @@ -201,11 +201,8 @@ def run_pcgr(input_data, output_data,conf_options): pon_vcf = os.path.join(input_data['pon_vcf_dir'], input_data['pon_vcf_basename']) if not input_vcf == 'None': - refdata_dir = input_data['refdata_dir'] output_dir = output_data['dir'] - output_prefix = output_data['prefix'] - vep_dir = input_data['vep_dir'] - + output_prefix = output_data['prefix'] check_subprocess(logger, f'mkdir -p {output_dir}', debug) random_id = random_id_generator(15) @@ -213,7 +210,7 @@ def run_pcgr(input_data, output_data,conf_options): input_vcf_validated = f'{output_prefix}.{random_id}.ready.vcf.gz' input_vcf_validated_uncompr = f'{output_prefix}.{random_id}.ready.vcf' vep_vcf = f'{output_prefix}.{random_id}.vep.vcf' - vep_vcfanno_vcf = f'{output_prefix}.{random_id}vep.vcfanno.vcf' + vep_vcfanno_vcf = f'{output_prefix}.{random_id}.vep.vcfanno.vcf' vep_vcfanno_summarised_vcf = f'{output_prefix}.{random_id}.vep.vcfanno.summarised.vcf' vep_vcfanno_summarised_pass_vcf = f'{output_prefix}.{random_id}.vep.vcfanno.summarised.pass.vcf' output_vcf = f'{output_prefix}.vcf.gz' @@ -236,7 +233,7 @@ def run_pcgr(input_data, output_data,conf_options): vcf_validate_command = ( f'pcgr_validate_input.py ' - f'{refdata_dir} ' + f'{input_data["refdata_assembly_dir"]} ' f'{input_vcf} ' f'{input_vcf_validated_uncompr} ' f'{input_cna} ' @@ -244,7 +241,6 @@ def run_pcgr(input_data, output_data,conf_options): f'{input_rna_expression} ' f'{pon_vcf} ' f'{conf_options["assay_properties"]["vcf_tumor_only"]} ' - f'{conf_options["genome_assembly"]} ' f'{conf_options["sample_id"]} ' f'{conf_options["other"]["retained_vcf_info_tags"]} ' f'{conf_options["somatic_snv"]["allelic_support"]["tumor_dp_tag"]} ' @@ -293,7 +289,7 @@ def run_pcgr(input_data, output_data,conf_options): logger.info(f'MSI classification: {msi_prediction_set}') logger.info(f'Mutational burden estimation: {tmb_estimation_set}') 
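# --- Editor's note (illustrative sketch, not part of the patch) ---------------
# In the run_pcgr()/run_cpsr() hunks below, both workflows now record the
# output prefix and the assembly-specific reference-data path in the run YAML
# before the report step. A reduced sketch of that pattern; this stand-in
# populate_config_data() is a simplification of pcgr.config.populate_config_data,
# which additionally attaches reference-data metadata and gene-panel settings.
import yaml

def populate_config_data(conf_options: dict, refdata_assembly_dir: str, workflow: str = "PCGR") -> dict:
    # Copy key run settings and point the config at the assembly-specific refdata dir
    return {
        'sample_id': conf_options['sample_id'],
        'workflow': workflow,
        'output_prefix': conf_options['output_prefix'],
        'reference_data': {'path': refdata_assembly_dir},
    }

conf_options = {'sample_id': 'SAMPLE01',
                'output_prefix': '/work/out/SAMPLE01.pcgr.grch38'}
yaml_data = populate_config_data(conf_options, '/refdata/data/grch38')
# The real workflow writes this to f"{output_prefix}.conf.yaml"; printed here for brevity
print(yaml.dump(yaml_data))
# -----------------------------------------------------------------------------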
logger.info(f'RNA expression similarity analysis: {rnaseq_sim_analysis_set}') - logger.info(f'Include molecularly targeted clinical trials (beta): {clinical_trials_set}') + #logger.info(f'Include molecularly targeted clinical trials (beta): {clinical_trials_set}') # PCGR|Generate YAML file - containing configuration options and paths to annotated molecular profile datasets # - VCF/TSV files (SNVs/InDels) @@ -305,6 +301,7 @@ def run_pcgr(input_data, output_data,conf_options): # update conf_options with paths to output files conf_options['output_dir'] = output_dir + conf_options['output_prefix'] = output_prefix conf_options['molecular_data']['fname_mut_vcf'] = output_vcf conf_options['molecular_data']['fname_mut_tsv'] = output_pass_tsv_gz if conf_options['somatic_snv']['tmb']['run'] == 1: @@ -318,7 +315,8 @@ def run_pcgr(input_data, output_data,conf_options): conf_options['molecular_data']['fname_expression_sim_' + source] = f'{output_prefix}.exp_sim_{source}.tsv.gz' # make YAML file - yaml_data = populate_config_data(conf_options, refdata_dir, workflow = "PCGR", logger = logger) + yaml_data = populate_config_data(conf_options, input_data["refdata_assembly_dir"], + workflow = "PCGR", logger = logger) genome_assembly = yaml_data['genome_assembly'] vep_command = get_vep_command(file_paths = input_data, @@ -392,7 +390,7 @@ def run_pcgr(input_data, output_data,conf_options): # PCGR|vcfanno - annotate VCF against a number of variant annotation tracks (BED/VCF) logger = getlogger("pcgr-vcfanno") pcgr_vcfanno_command = ( - f'pcgr_vcfanno.py {vep_vcf}.gz {vep_vcfanno_vcf} {input_data["db_assembly_dir"]} ' + f'pcgr_vcfanno.py {vep_vcf}.gz {vep_vcfanno_vcf} {input_data["refdata_assembly_dir"]} ' f'--num_processes {conf_options["other"]["vcfanno_n_proc"]} ' f'--dbnsfp --clinvar --rmsk --winmsk --simplerepeat ' f'--tcga --gene_transcript_xref --dbmts --gwas ' @@ -400,7 +398,7 @@ def run_pcgr(input_data, output_data,conf_options): ) vcfanno_db_src_msg1 = ( f"Annotation sources (vcfanno): {'Panel-of-Normals, ' if pon_vcf != 'None' else ''}ClinVar, dbNSFP, " - f"dbMTS, cancerhotspots.org, DoCM, TCGA, GWAS catalog" + f"dbMTS, TCGA, GWAS catalog" ) vcfanno_db_src_msg2 = \ f"Annotation sources (vcfanno): RepeatMasker, SimpleRepeats, WindowMaskerSDust, gnomAD non-cancer subset" @@ -420,7 +418,7 @@ def run_pcgr(input_data, output_data,conf_options): f'pcgr_summarise.py {vep_vcfanno_vcf}.gz {vep_vcfanno_summarised_vcf} {pon_annotation} ' f'{yaml_data["conf"]["vep"]["vep_regulatory"]} {oncogenicity_annotation} ' f'{yaml_data["conf"]["sample_properties"]["site2"]} {yaml_data["conf"]["vep"]["vep_pick_order"]} ' - f'{input_data["db_assembly_dir"]} --compress_output_vcf ' + f'{input_data["refdata_assembly_dir"]} --compress_output_vcf ' f'{"--debug" if debug else ""}' ) summarise_db_src_msg1 = \ @@ -466,7 +464,7 @@ def run_pcgr(input_data, output_data,conf_options): logger.info("Appending ClinVar traits, official gene names, and protein domain annotations") variant_set = \ append_annotations( - output_pass_vcf2tsv_gz, db_assembly_dir = input_data["db_assembly_dir"], logger = logger) + output_pass_vcf2tsv_gz, refdata_assembly_dir = input_data["refdata_assembly_dir"], logger = logger) variant_set = set_allelic_support(variant_set, allelic_support_tags = yaml_data["conf"]['somatic_snv']['allelic_support']) variant_set = clean_annotations(variant_set, yaml_data, germline = False, logger = logger) @@ -484,7 +482,7 @@ def run_pcgr(input_data, output_data,conf_options): ## Parse expression data, verify identifiers 
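        ## NOTE (assumption, inferred from the --input_rna_expression help text and the calls below):
        ## parse_expression() is expected to read the tab-separated file of per-transcript TPM counts
        ## for the tumor sample and to verify transcript identifiers against the gene/transcript
        ## cross-reference tables under refdata_assembly_dir before the downstream similarity analysis.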
expression_data = parse_expression( input_rna_expression, yaml_data["sample_id"], - input_data["db_assembly_dir"], logger = logger) + input_data["refdata_assembly_dir"], logger = logger) ## Write transcript-level expression data to TSV if 'transcript' in expression_data.keys(): if not expression_data['transcript'] is None: @@ -563,17 +561,17 @@ def run_pcgr(input_data, output_data,conf_options): if yaml_data['conf']['gene_expression']['similarity_analysis'] == 1 and expression_data is not None: logger = getlogger("pcgr-expression-similarity") logger.info('PCGR - STEP 4: Gene expression (RNA) similarity analysis') - + + ## Correlate sample expression with reference samples exp_similarity_results = correlate_sample_expression( expression_data, yaml_data, - input_data["db_assembly_dir"], + input_data["refdata_assembly_dir"], protein_coding_only = True, logger = logger) for source in exp_similarity_results.keys(): exp_similarity_results[source].fillna('.').to_csv( yaml_data['molecular_data']['fname_expression_sim_' + source], - #f'{output_prefix}.exp_sim_{source}.tsv.gz', sep = "\t", index = False) print('----') else: @@ -590,7 +588,7 @@ def run_pcgr(input_data, output_data,conf_options): cna_segment_file = input_cna, build = yaml_data['genome_assembly'], sample_id = yaml_data['sample_id'], - db_assembly_dir = input_data['db_assembly_dir'], + refdata_assembly_dir = input_data['refdata_assembly_dir'], n_copy_amplifications = yaml_data["conf"]['somatic_cna']['n_copy_gain'], overlap_fraction = 0.5, expression_data = expression_data, diff --git a/pcgr/variant.py b/pcgr/variant.py index bd480c2f..8595d080 100644 --- a/pcgr/variant.py +++ b/pcgr/variant.py @@ -10,49 +10,11 @@ warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning) -# def set_genotype(variant_set: pd.DataFrame, logger) -> pd.DataFrame: -# """ -# Set verbose genotype (homozygous, heterozygous) for each variant -# """ -# variant_set['GENOTYPE'] = '.' 
-# if {'GT'}.issubset(variant_set.columns): -# logger.info("Assignment of genotype (homozygous, heterozygous) for each variant based on 'GT' tag") -# heterozygous_states = [] -# ref_allele_index = 0 -# while ref_allele_index < 20: -# alt_allele_index = ref_allele_index + 1 -# while alt_allele_index <= 20: -# phased_gt_1 = str(ref_allele_index) + "|" + str(alt_allele_index) -# phased_gt_2 = str(alt_allele_index) + "|" + str(ref_allele_index) -# unphased_gt_1 = str(ref_allele_index) + "/" + str(alt_allele_index) -# unphased_gt_2 = str(alt_allele_index) + "/" + str(ref_allele_index) -# gts = [phased_gt_1, phased_gt_2, unphased_gt_1, unphased_gt_2] -# heterozygous_states.extend(gts) - -# alt_allele_index = alt_allele_index + 1 -# ref_allele_index = ref_allele_index + 1 - -# homozygous_states = [] -# hom_allele_index = 1 -# while hom_allele_index <= 20: -# phased_gt = str(hom_allele_index) + "|" + str(hom_allele_index) -# unphased_gt = str(hom_allele_index) + "/" + str(hom_allele_index) -# homozygous_states.extend([phased_gt, unphased_gt]) -# hom_allele_index = hom_allele_index + 1 - -# variant_set.loc[variant_set['GT'].isin(homozygous_states), "GENOTYPE"] = "homozygous" -# variant_set.loc[variant_set['GT'].isin(heterozygous_states),"GENOTYPE"] = "heterozygous" -# else: -# variant_set['GENOTYPE'] = "undefined" - -# variant_set = variant_set.astype({'GENOTYPE':'string'}) -# return(variant_set) - -def append_annotations(vcf2tsv_gz_fname: str, db_assembly_dir: str, logger): +def append_annotations(vcf2tsv_gz_fname: str, refdata_assembly_dir: str, logger): - clinvar_tsv_fname = os.path.join(db_assembly_dir, 'variant','tsv','clinvar', 'clinvar.tsv.gz') - protein_domain_tsv_fname = os.path.join(db_assembly_dir, 'misc','tsv','protein_domain', 'protein_domain.tsv.gz') - gene_xref_tsv_fname = os.path.join(db_assembly_dir, 'gene','tsv','gene_transcript_xref', 'gene_transcript_xref.tsv.gz') + clinvar_tsv_fname = os.path.join(refdata_assembly_dir, 'variant','tsv','clinvar', 'clinvar.tsv.gz') + protein_domain_tsv_fname = os.path.join(refdata_assembly_dir, 'misc','tsv','protein_domain', 'protein_domain.tsv.gz') + gene_xref_tsv_fname = os.path.join(refdata_assembly_dir, 'gene','tsv','gene_transcript_xref', 'gene_transcript_xref.tsv.gz') vcf2tsv_df = None clinvar_data_df = None diff --git a/pcgr/vcf.py b/pcgr/vcf.py index 80266699..c6b7f31b 100644 --- a/pcgr/vcf.py +++ b/pcgr/vcf.py @@ -45,15 +45,15 @@ def detect_reserved_info_tag(tag, tag_name, logger): def check_retained_vcf_info_tags(vcf: VCF, retained_info_tags: str, logger: logging.Logger) -> int: """ - Function that compares the INFO tags in the query VCF and retained INFO tags set by the user as retained for output - If any retained tag is not in query VCF, an error will be returned + Function that compares the INFO tags in the input VCF and retained INFO tags set by the user as retained for output + If any retained tag is not in input VCF, an error will be returned """ tags = str(retained_info_tags).split(',') info_elements_query_vcf = [] #vcf = VCF(input_vcf) - logger.info('Checking if existing INFO tags of query VCF file matches retained INFO tags set by the user') + logger.info('Checking if existing INFO tags of input VCF file matches retained INFO tags set by the user') ret = 1 for e in vcf.header_iter(): header_element = e.info() @@ -63,10 +63,10 @@ def check_retained_vcf_info_tags(vcf: VCF, retained_info_tags: str, logger: logg for t in tags: if not t in info_elements_query_vcf: - err_msg = "Retained INFO tag '" + str(t) + "' not found among INFO 
tags in query VCF - make sure retained VCF INFO tags are set correctly" + err_msg = "Retained INFO tag '" + str(t) + "' not found among INFO tags in input VCF - make sure retained VCF INFO tags are set correctly" error_message(err_msg, logger) else: - logger.info("Retained INFO tag '" + str(t) + "' detected among INFO tags in query VCF") + logger.info("Retained INFO tag '" + str(t) + "' detected among INFO tags in input VCF") return ret @@ -75,22 +75,22 @@ def check_retained_vcf_info_tags(vcf: VCF, retained_info_tags: str, logger: logg def check_existing_vcf_info_tags(vcf: VCF, populated_infotags: dict, logger) -> int: """ - Function that compares the INFO tags in the query VCF and the INFO tags generated by PCGR/CPSR + Function that compares the INFO tags in the input VCF and the INFO tags generated by PCGR/CPSR If any coinciding tags, an error will be returned """ - logger.info('Checking if existing INFO tags of query VCF file coincide with PCGR/CPSR INFO tags') + logger.info('Checking if existing INFO tags of input VCF file coincide with PCGR/CPSR INFO tags') ret = 1 for e in vcf.header_iter(): header_element = e.info() if 'ID' in header_element.keys() and 'HeaderType' in header_element.keys(): if header_element['HeaderType'] == 'INFO': if header_element['ID'] in populated_infotags.keys(): - err_msg = 'INFO tag ' + str(header_element['ID']) + ' in the query VCF coincides with a VCF ' + \ - 'annotation tag produced by PCGR/CPSR - please remove or rename this tag in your query VCF' + err_msg = 'INFO tag ' + str(header_element['ID']) + ' in the input VCF coincides with a VCF ' + \ + 'annotation tag produced by PCGR/CPSR - please remove or rename this tag in your input VCF' return error_message(err_msg, logger) - logger.info('No query VCF INFO tags coincide with populated INFO tags by PCGR/CPSR') + logger.info('No input VCF INFO tags coincide with populated INFO tags by PCGR/CPSR') return ret diff --git a/pcgr/vep.py b/pcgr/vep.py index 4d2c307f..a4dfd579 100644 --- a/pcgr/vep.py +++ b/pcgr/vep.py @@ -17,13 +17,16 @@ def get_vep_command(file_paths, conf_options, input_vcf, output_vcf, debug = Fal vep_dir = file_paths['vep_dir'] fasta_assembly = os.path.join( - file_paths['db_assembly_dir'], + file_paths['refdata_assembly_dir'], 'misc','fasta','assembly', f'Homo_sapiens.{pcgr_vars.VEP_ASSEMBLY[genome_assembly]}.dna.primary_assembly.fa.gz') ancestor_assembly = os.path.join( - file_paths['db_assembly_dir'], + file_paths['refdata_assembly_dir'], 'misc','fasta','ancestor', f'human_ancestor.fa.gz') + + utils.check_file_exists(fasta_assembly, logger = None) + utils.check_file_exists(ancestor_assembly, logger = None) plugins_in_use = "NearestExonJB, LoF" diff --git a/pcgrr/R/input_data.R b/pcgrr/R/input_data.R index ef68a77e..341cf764 100644 --- a/pcgrr/R/input_data.R +++ b/pcgrr/R/input_data.R @@ -791,3 +791,24 @@ load_dna_variants <- function( return(results) } + +load_expression_sim <- function(ref_data = NULL, + settings = NULL){ + + ## Load expression similarity results for input sample + ## against reference collections + expression_sim <- list() + + if(!is.null(settings$conf$gene_expression$similarity_db)){ + for(db in names(settings$conf$gene_expression$similarity_db)){ + fname_db <- paste0("fname_expression_sim_",db) + if(!is.null(settings$molecular_data[[fname_db]]) & + file.exists(settings$molecular_data[[fname_db]])){ + expression_sim[[db]] <- readr::read_tsv( + settings$molecular_data[[fname_db]], + show_col_types = F, na = ".") + } + } + } + return(expression_sim) +} diff --git 
a/pcgrr/R/main.R b/pcgrr/R/main.R index 211e69d6..420cf995 100644 --- a/pcgrr/R/main.R +++ b/pcgrr/R/main.R @@ -54,7 +54,7 @@ generate_report <- ## Retrieve relevant clinical trials for the tumor type in question - if (as.logical(settings$conf$clinicaltrials$run) == T) { + #if (as.logical(settings$conf$clinicaltrials$run) == T) { # pcg_report_trials <- # pcgrr::generate_report_data_trials( # ref_data = ref_data, @@ -63,7 +63,7 @@ generate_report <- # pcg_report <- # pcgrr::update_report(pcg_report, pcg_report_trials, # a_elem = "clinicaltrials") - } + #} if (NROW(callset_snv$variant) > 0) { @@ -514,9 +514,9 @@ write_report <- function(report, if (!is.null(report_strip$content$tmb)) { report_strip$content$tmb$tcga_tmb <- NULL } - if (!is.null(report_strip$content$clinicaltrials)) { - report_strip$content$clinicaltrials <- NULL - } + #if (!is.null(report_strip$content$clinicaltrials)) { + # report_strip$content$clinicaltrials <- NULL + #} if (!is.null(report_strip$content$msi)) { if (!is.null(report_strip$content$msi$prediction)) { report_strip$content$msi$prediction$tcga_dataset <- NULL @@ -775,17 +775,11 @@ write_report_quarto_html <- function(report = NULL){ settings <- report[['settings']] output_dir <- settings[['output_dir']] - sample_name <- settings[['sample_id']] - genome_assembly <- settings[['genome_assembly']] - - sample_fname_pattern <- - paste(sample_name, 'pcgr_acmg', genome_assembly, sep = ".") + output_prefix <- settings[['output_prefix']] output_format <- "html" fnames <- list() - fnames[["html"]] <- - file.path(output_dir, - paste0(sample_fname_pattern, ".html")) + fnames[["html"]] <- paste0(output_prefix, ".html") ## Path to PCGR reporting templates pcgr_rep_template_path <- @@ -836,7 +830,7 @@ write_report_quarto_html <- function(report = NULL){ replacement = rds_report_path) |> stringr::str_replace( pattern = "", - replacement = sample_name + replacement = settings[['sample_id']] ) |> stringr::str_replace( pattern = "", @@ -886,7 +880,11 @@ write_report_quarto_html <- function(report = NULL){ } -write_report_excel <- function(){ +#' Function that writes contents of PCGR report object to an Excel workbook +#' +#' @param report List object with all report data, settings etc.
+#' @export +write_report_excel <- function(report = NULL){ fname_xlsx <- "" diff --git a/pcgrr/R/reference_data.R b/pcgrr/R/reference_data.R index 78a653f5..0908f24a 100644 --- a/pcgrr/R/reference_data.R +++ b/pcgrr/R/reference_data.R @@ -654,7 +654,8 @@ load_reference_data <- function( "woods_dnarepair|gerp|tcga_pancan_2018|gwas_catalog)")) ~ "cpsr", stringr::str_detect( .data$source_abbreviation, - "^(cytoband|mitelmandb|tcga|nci|intogen|opentargets|dgidb|pubchem)$") ~ "pcgr", + paste0("^(cytoband|mitelmandb|tcga|nci|intogen|depmap|treehouse", + "|opentargets|dgidb|pubchem)$")) ~ "pcgr", TRUE ~ as.character("pcgr_cpsr") )) diff --git a/pcgrr/R/report.R b/pcgrr/R/report.R index ab5d9ad5..b88e3754 100644 --- a/pcgrr/R/report.R +++ b/pcgrr/R/report.R @@ -90,9 +90,9 @@ init_report <- function(yaml_fname = NULL, "rainfall", "kataegis", "expression", - "predisposition", + "predisposition")){ #"report_display_config", - "clinicaltrials")) { + #"clinicaltrials")) { report[["content"]][[a_elem]] <- list() report[["content"]][[a_elem]][["eval"]] <- FALSE diff --git a/pcgrr/data/data_coltype_defs.rda b/pcgrr/data/data_coltype_defs.rda index b25e5f8e310ecd4ca70581eea8271450c56e011c..ce1c2438db00683a4bc6093c9d904be79d6a55f4 100644 GIT binary patch literal 1922 zcmV-|2YvWLT4*^jL0KkKS+~*=k^mp=Z=nDG|GSU?KmY&mzwh6m2mk;9;0b?xK4EQC zIsgFBbN~YwSc6kaJwRv&lr#VU4F*Pl0005NFpMUch5-P`0fcFjCJ}&!hXBGbnqn9P z10V(wrc9Vd0vZW4sG!vSQKo>zG#LOI28N9Q27^yT2%2Gy#_O2Cy#Y9y+XOR&Yp-6q$c#rSw#n)Y$7j^|xlVA*N9?v&Sc zGuelA;@R0>h3>}FHBU**G28C095B^yPJiFK46^FEhXhAI%Qjf!2zh4F&!&oBC=pjT z?*3Pt8k+Cl^85PoGUD2qWrmts*0r|VZMNHO(9NV`6b-2kT0&0vLJ)|mB0zwLMo~c; zKoo^lK$ujK1w=@P)ZuNJxP}Q56(92L{REXQAn;MDco>@Uy={9)Iou^sXGk#ZcSyAG?orZpK;1x%jDmX#uI>{C-T!IO&k$xmlQmt>hnS6>y>-DTNJE{w)GKb*9; zoT+Ggi`cuijpmg3rhCp%Tt_(OhDf8p&eJ)={P;-wY-EW-a=@DmtW1w%ht~54tX)|90(JJ#BFnUsr9~*Jc zSp6QqTPIgPJ387MIku3?lxn-{U5!Z1uQ{~B>qXnSWVUo~g3z&3t)ty_wI32Sm)4@!>52?m)BSwvE?teqg9uXvRZFZZf9K@SmS<(&{?B4k>lm~4 zF2S;Bn4Rf|7A%yejqj(gR(E!L)5`w_4YN&*T+RvoOR!IBalH3s^)7QNq@11pr41?B z+S3jUk?vJes=2mGl5)fF#mUj;{AcQH(~8aJt8|jho)Svt!f9Bn_${82p2E!}Xy_V6 zG(^H+#+2l1Mk|RijACJhF&P(!ZVv~V?*osDar)&GIVUeaAC4?h);F?Ep|h(7ShU4= zM_Dsj^gFR%TY8k;Jxx=lY*Bfoht6DPr-QxWd?~?$^rk%~)lTBp?S&FY zESA5mOfIph-QjyBTK6up-OMj~v-^xZ7&P%&=HmS=+a+XrDcmyodN+KfI48zbi?+u% za`~`xlPfuVY&i9Jb(c%{EXIEuzRbE~_1`u8FPrXWvh3P3U9LG_3*(k7vsBINTIw@r zj#_Ddp?vgI*LRj~EZZ@h7^$}#HG1s395yv--N8-e*)M{oT-&XsZ8*2#`mJ0OJ~d`C z#nSATU7c~p#!{T{!)#ZVZsW;DE}XLAwe)1OgM(d_EzQGb>=*NBw}i@WR+=$s!M5t1 z*x=D>b#HejtFV5za`QQZlc!2kZT80nGk?(zP$T_1Sw8>$S`fJkAzzF97nx*!I$;7b zM70qAlmRQ*@i|?`SwKUaZF3yJydZ_3gdhp4R)<2O2E+k0k0J;_A;$dN8BhecR6w&^ zRi&(tOmqlmfI_)wDnK>ErzC?1_Q8Z#JwR#g6@zj!*&|Cn&T>eZ)j6ob7H=8_V1n$8=Oc zsmEBzgH%eN8D9xPOAy!PavOT^A(Z5iG$CmuQ4TH$nZ!suk|oqbqDX`4c_x$OfA({f zkq2;)RFHvk0G70&RQIlHO#0vZ=nDG|GN+XKmY&m|L@%th+q&5fEY%ZGGQ19Xf;v2Q}opa zfB~aG00000000RzsVFq{28}c{G|8Ye$kButjF~Y8Pf>y?DLo;k)MNkvG|)Xj01W^P zjWPvE5GFt*$$|ipsrr*Aq|;AR(t3I+`V;^fc1S^YNmU^R{3Ia2xxN|E0dJx$xC9a) zk^-dDNJ&Z}+jhqZNhA-&U(#C3Z;23XzHx|x5d!S+d6%L%%kfa_6~BGl3Oz zZohr!2Bz))kB9T=#LI(PWtJLgYi+jMZMNHO>1A~cK|s=w?I20)0t8A)2#QG{^jZQX z=w6~kg4Gm+6(ERyR2^uL7^FdgWRWIZa@~K@M~nFKUG=0&kjjJdd35hQ73eZmEa<3j z;U_4?I2)}DS{|!3Jws&4lG08OOqcsPVJ4G4^5j`Bgyl?W!<9;vU!ARzb5mw$&6Sg% zk!-&${nPk8n2*ihB($dop^PZcM_2D#1}|eb@MiC-qa)JheXq6luN%kPKJDA45`5S` z_N%^&zU#7|dvPr|Dtk`Voo^(wwd*-FUdmmD4!FHaQk0`8K!_lWU_<~p7~#}tTGGy= z$k>RDp)H{yiLJEU5U8?b?X%4D+>txIEFQJ56=&$zEZ1en3r~9rY{ElY9&=Z8=h-?=nVWIDipq}bx!bQCG?=W|{pZ=cevTX4-99gL zVLQ0dy6z2K)KeOase-0Y7k-r@Z0sqx%`jx(zA{tCnoF`wqbsk9>TI&?rI$uy9KT6v z@|a^D 
zg}19$KGibzzNZPajT$#u)%{P_zD8Ylx1DK?HYqI*w$QY-)|6p#WWr%g6qZs_r7B{W z%ZuQ`6v25+Oj8Xk#HI3N&5t$NNiHnlld*xT9CsF$mSHZXJle2`#Zv`^kCLUAdH2ov zm|U0vi#rRd8E8~++QBs@hEhVqWz1{uXJa)AgyyUjQtJxa^zUcKGqHEGvFu7+V-|iZ zuxgqnC!sLl#gdfKwe;}Z&hE#tJMZma*EH0{vVPTc#{Y&cE^jB&e&($>EZS~coR(_sl2;}ZO2ucjZ1j}zEYe0;vsu#< z3427S5iv$FFv6IIM9s|WnWvKO(}!|!{ZfgXlaH5oyA~+v8(Ai>)zO12T4KAK zq?xSuy5hd;dQ{mxEy>R|D7@0cklqVRj1Swm3tJ2ICvI z3#Ttr$-y);Ot`jU!almQs>TbmQmU2Wr1x>&8r!`ikk4VbHdB9!&7r}X7x6gWWEZSa%|IC!M6Y1YTlV}t1*@?mt?x^ z>xMQml;wsS@m^iChZPvP@ymX<(UQ&W?b@nalYXt3FXYi~*_7C=G-A_xU6nfHgGH&) zwcJ~mVEY|cnamtq`O=#0t~e>1{>X8GAL-7?^Z(_c3y`2!(fF@2$ot<-*uad@Ekr-+ z1ZDppg5!e70K7A~<TQHLKF|CnqVpqG*tV3DXpW8<8q| zrF&%wEDhfKkk`Wk8BQq^LKc!m1A}@daUu@%iFFX>kb~myO(($qna(m0bs`FogNz8_ yFLoCCxXfel@w92!##@|{IzWplAqDnJ%n%{3WB@|qA_s{7i@744C`cOezMz13AFq=D diff --git a/pcgrr/data/tcga_cohorts.rda b/pcgrr/data/tcga_cohorts.rda index 5b7656cc2279caef5a5a29069058ed12d0357f21..36ff684e9b6bd97a0da4d343f920d17294ffd4b6 100644 GIT binary patch delta 751 zcmVyj1xM)(Ptti!8ftom z0GMP(k5g?Vz<_8*fe|x8JfqYN00w{okN^gNaMMf>$Z4iVOduL)0|}!{m;gY~NTM{= zPxVbPC#mQfYH5Q|U?T!C0~#Yi?NJ4B-i0Y~qHQ2Q4VGkzNU&gbTbkgBv|HIF2!RFS z0Ds?yFCR8@lQ$fFCGKMjPdIL6iYPEMt+k3l8+Y^|fv|UA3kRGlX(gd~ z2_$uLKuWeC%TFJ-kA7O@+CN!fqk7j{oNKO1uScnjXxBZ*La1THJK}W`gU-kUwLe*K z=>n_0{+qh7+lzCj?1`AS7sU3(*(5dDI)Ah2?}HlIsn*hNH`_%hfL++ZgvvmnI}ac; z14E-VRZ#A;61KB3JQj!;OOGpfA6>QI!9u0s`8n*lWBT}1`rJE+w2GX)6m$e zSuK?3dj6gu-wc@wlcCa-`AL1rradv_i*1Epb=%YF-m{c*liC0tWB@U9l40m_pMP2Q zECOO-qfoyIS~V{eWjThNjWbE+Ol)n9v6%k4w%FM>^v;fK#o+!BLHl?IV$y~XL?iV8 zz}KvI8jUkEg=W(VMPbC?%&^n@u}u#Sy0~2Gs2@H|;?Y_RB1$sNUd=hkV;I|M z^j_Q~Ri8bqGS<4_1}0LJckk4IE|Fv;_MECwBQiCaG5YEkDIyyRgbARj3JtJ1@4TX?QeRMXZ=gLRgfp$^Z#0 zT4l3&N~=Qt+T(Bt9&{XSQVGW4Q;<%qWDL0b54&5eBMW*w3d`5v?|1bUMaxECfRqep h2GpFETT!4$4Cys;g9vwjyuk=>yVv6A_>QX{V7I zG}#c)0j8mmG$Yj1%5Nz3Mg%=W27~}`gejRA!9Jw_&)4IWVo3&j`Tg`&+MKMj@1B#}{pmqvUdEYYJa$_Rl2 z#D4+GALQ($-=V&yTs_#aGShD;1{rD7^aW1!Iua;Sl*HJ_yulukegOS_+} zZv$^X%X6=PuWx$qe~Kjqo17;jSlxK+!hgcBY!}qb5o=UyHd%@^Kr6<;$Wh>uvF9j% z90ON8FU;KpPcqi`eAej@?P#WMIMWr?g&vb0hMB8YoGX!Xwla}95;m+dOt8`4bZ6e4 zXd$I+`m8?9>-tKG%-w>VPT6fLa;G>Nw!u>CuJl!3o2+(|kS3xKCyfaoE<336Jb%`? 
z##uO=Eb5aynu`VaSvPx)iniD^*x1_oZ1?n4vd=eSS@W2K_(Bf1jCESEs3c(Pu<;13 zTrhMACRmsdsFt!Ye!%TlTYb?Bp>bxGE28bMdL^E4dG6?fA za>8y8zr_WmsS1Lw!Y?o&Gqk7#4}VHmAtlt#m=yh}T^0f$dSa8=hMVNX5J@DIKCHeF z!WPSWLt~J*0kTBrGfF`K4$&q>&vD%~B330sCdZay_ZJbguOPAsfI|k)hDbJm@LIC) z>Y8})9ov$$19pl^r9CvpAz+AnS`9DoG>_BO{g?94@ek8K*9|W(?J>(+BGyZ8UeKc0iXbCc@Z^FYHBu<^+%{_pa1{^ zKma6#4JqwWwH}bj00uxbVKe|PgI;w^1bx=%6+ZvgBCJjc1&sZC`e)-5vvg4%IQBX` z?fmV4oYnov5@S;$rZgv4MzWdiHK#a8d zwOYeJBD#dRgWM z29`wv8UidLLxSN!f-MB@z~4x)4jz|s2}vj!T0i|_8NCsz;K$Z#w zm@tm2?ip<{$r!i>8CDYbvU{rGL1hHOynytw$exj9ViU-%P{P60zO^bknzP0Ul-OfA zsMn3fPAK_seX#74t^`Z)?GH5 zjz+B09W#V&v|@`;IJpN5aV^bMEM^IrE9q8x5yd0n3K5(}{1}yo`%Q^w0lV9W4cn%j V`}1-4bAHG8yOJrwgo8s5xj;4!BsBm4 literal 694 zcmV;n0!jTsT4*^jL0KkKS>CtEh5!Ol?|}b*|4I-5zwiI>uBXia00BS(n{&Gg>ut9U zO;gm_BTWIOhJXOc=>tq49;c`pnrV|lO{jy^^gsY<>HsnTXbm(XYM#{EnreDYG%^O7 z0B8UJ1d<7pNjwuqNs0gf7|;=*7cjPQm;ndelpsiZ|5k!Lih&Rc%JXRSo~D63PTFwp zgtMNm_7tR7!2(cpo7B4v2w=^oDoRj>H=KbpG)zrw3xI$bB$FKOPkVq!J6I;j0t_X5 z;{N~BcO7?+X1uz*`)VX?Q+$K4)>ui-O5x$qK;&tHk`O2>$83X6S=CN!>DYiB3Q~^Y zs+f%@G}4s5s?aE`jk{G)9uYARb_pba#ReZWi?Y4AhSXM0t`r@?v>y#P@6xtbxjA5` zE#-(eb>qf#V__>JaRcG1WQM{!pM~l;3R z-tkpJSSk`xP~fuRjWo(wfKrwSW7b*csl^5w35ctJ^p$He;uS$j7Qd@OgWY;t5on96 z3gMbemlLxFk>T>iYAT6LCZUTJ5}-C5k8S}Df1KnIJq^H*AC1+F#0WN90UbHkd1$wg z=i%?r^;rgz!sfn8$6o^EjVLi8*dQVvAPlTFjP6@8B_)mg)kbtBIn@{VYO`VX{z&^z curH$+wYy%Y-5;lIK1cC)BvXY64eNY}VC|4JaR2}S diff --git a/pcgrr/inst/templates/pcgr_quarto_report/mutational_signature.qmd b/pcgrr/inst/templates/pcgr_quarto_report/mutational_signature.qmd index adbe6917..fe8cfd1d 100644 --- a/pcgrr/inst/templates/pcgr_quarto_report/mutational_signature.qmd +++ b/pcgrr/inst/templates/pcgr_quarto_report/mutational_signature.qmd @@ -4,7 +4,6 @@ ```{r setup1} #| echo: false -pcg_report <- readRDS(file="../../../tmp.rds") report_color <- pcgrr::color_palette$report_color$values[2] ``` @@ -13,8 +12,6 @@ Here, we apply the **MutationalPatterns** package [@Blokzijl2018-nc] to estimate ```{r setup_signature_display} #| echo: false -#pcg_report <- readRDS(file="tmp.rds") - msig_content <- pcg_report$content$mutational_signatures diff --git a/pcgrr/inst/templates/pcgr_quarto_report/snv_indel_main.qmd b/pcgrr/inst/templates/pcgr_quarto_report/snv_indel_main.qmd index 9cbcacb0..4e26bdb9 100644 --- a/pcgrr/inst/templates/pcgr_quarto_report/snv_indel_main.qmd +++ b/pcgrr/inst/templates/pcgr_quarto_report/snv_indel_main.qmd @@ -406,7 +406,9 @@ DT::datatable( noncoding_variant_set <- pcg_report$content$snv_indel$callset$variant_display |> - dplyr::filter(.data$CODING_STATUS == "noncoding") |> + dplyr::filter( + .data$CODING_STATUS == "noncoding" & + .data$CONSEQUENCE != "intergenic_variant") |> dplyr::select( dplyr::any_of( pcgrr::dt_display$snv_indel_tier3 @@ -496,8 +498,7 @@ DT::datatable( pcgrr::color_palette$cancer_assoc$breaks, pcgrr::color_palette$cancer_assoc$values ) - ) - + ) ``` diff --git a/pcgrr/inst/templates/pcgr_quarto_report/value_boxes.qmd b/pcgrr/inst/templates/pcgr_quarto_report/value_boxes.qmd deleted file mode 100644 index 79c39458..00000000 --- a/pcgrr/inst/templates/pcgr_quarto_report/value_boxes.qmd +++ /dev/null @@ -1,52 +0,0 @@ -## Main results - -
-TIER 1 - SNV/INDEL
-`r pcg_report[["content"]][["value_box"]][["tier1"]]`
-TIER 2 - SNV/INDEL
-`r pcg_report[["content"]][["value_box"]][["tier2"]]`
-sCNA
-`r pcg_report[["content"]][["value_box"]][["scna"]]`
-TUMOR PURITY
-`r pcg_report[["content"]][["value_box"]][["tumor_purity"]]`
-TUMOR PLOIDY
-`r pcg_report[["content"]][["value_box"]][["tumor_ploidy"]]`
-MSI STATUS
-`r pcg_report[["content"]][["value_box"]][["msi"]]`
-DOMINANT SIGNATURE
-`r pcg_report[["content"]][["value_box"]][["signatures"]]`
-MUTATIONAL BURDEN
-`r pcg_report[["content"]][["value_box"]][["tmb"]]`
-KATAEGIS EVENTS
-`r pcg_report[["content"]][["value_box"]][["kataegis"]]`
diff --git a/pcgrr/inst/templates/pcgr_quarto_report/value_boxes_tumor_only.qmd b/pcgrr/inst/templates/pcgr_quarto_report/value_boxes_tumor_only.qmd deleted file mode 100644 index f150b26d..00000000 --- a/pcgrr/inst/templates/pcgr_quarto_report/value_boxes_tumor_only.qmd +++ /dev/null @@ -1,52 +0,0 @@ -## Main results - -
-TIER 1 - SNV/INDEL
-`r pcg_report[["content"]][["value_box"]][["tier1"]]`
-TIER 2 - SNV/INDEL
-`r pcg_report[["content"]][["value_box"]][["tier2"]]`
-sCNA
-`r pcg_report[["content"]][["value_box"]][["scna"]]`
-TUMOR PURITY
-`r pcg_report[["content"]][["value_box"]][["tumor_purity"]]`
-TUMOR PLOIDY
-`r pcg_report[["content"]][["value_box"]][["tumor_ploidy"]]`
-MSI STATUS
-`r pcg_report[["content"]][["value_box"]][["msi"]]`
-DOMINANT SIGNATURE
-`r pcg_report[["content"]][["value_box"]][["signatures"]]`
-MUTATIONAL BURDEN
-`r pcg_report[["content"]][["value_box"]][["tmb"]]`
-KATAEGIS EVENTS
-`r pcg_report[["content"]][["value_box"]][["kataegis"]]`
diff --git a/scripts/cpsr_validate_input.py b/scripts/cpsr_validate_input.py index a3b80874..f9e839ed 100755 --- a/scripts/cpsr_validate_input.py +++ b/scripts/cpsr_validate_input.py @@ -20,13 +20,12 @@ def __main__(): parser = argparse.ArgumentParser(description='Verify input data for CPSR') - parser.add_argument('refdata_dir',help='Directory containing the reference data files for PCGR/CPSR') + parser.add_argument('refdata_assembly_dir',help='Assembly-specific directory containing the reference data files for PCGR/CPSR') parser.add_argument('input_vcf', help='VCF input file with query variants (SNVs/InDels)') parser.add_argument('validated_vcf',help="Name of VCF file with validated, decomposed query variants that overlap target genes (SNVs/InDels)") parser.add_argument('custom_target_tsv',help='Custom text/TSV file indicating user-defined target genes from panel 0 for screening and reporting') parser.add_argument('custom_target_bed',help='Name of BED file populated with regions associated with custom target genes defined by user') parser.add_argument('retained_info_tags',help='Comma-separated string of VCF INFO tags in query VCF to be retained for output') - parser.add_argument('genome_assembly',help='grch37 or grch38') parser.add_argument('sample_id',help='CPSR sample_name') parser.add_argument('virtual_panel_id',type=str,help='virtual panel identifier(s)') parser.add_argument('diagnostic_grade_only', type=int, default=0, choices=[0,1], help="Green virtual panels only (Genomics England PanelApp)") @@ -36,13 +35,12 @@ def __main__(): parser.add_argument("--debug", action="store_true", help="Print full commands to log") args = parser.parse_args() - ret = validate_cpsr_input(args.refdata_dir, + ret = validate_cpsr_input(args.refdata_assembly_dir, args.input_vcf, args.validated_vcf, args.custom_target_tsv, args.custom_target_bed, args.retained_info_tags, - args.genome_assembly, args.sample_id, args.virtual_panel_id, args.diagnostic_grade_only, @@ -54,7 +52,7 @@ def __main__(): sys.exit(1) -def get_valid_custom_genelist(genelist_fname, genelist_bed_fname, pcgr_dir, genome_assembly, +def get_valid_custom_genelist(genelist_fname, genelist_bed_fname, refdata_assembly_dir, gwas_findings, secondary_findings, logger, debug): """ Function that checks whether the custom genelist contains valid entries from the complete exploratory track @@ -64,10 +62,13 @@ def get_valid_custom_genelist(genelist_fname, genelist_bed_fname, pcgr_dir, geno genelist_reader = csv.DictReader(open(genelist_fname,'r'), delimiter='\n', fieldnames=['ensembl_gene_id']) virtualpanel_track_bed = os.path.join( - pcgr_dir, "data", genome_assembly, "gene","bed","gene_virtual_panel", "0.bed.gz") + refdata_assembly_dir, "gene","bed","gene_virtual_panel", "0.bed.gz") virtualpanel_track_tsv = os.path.join( - pcgr_dir, "data", genome_assembly, "gene","tsv","gene_virtual_panel", "gene_virtual_panel.tsv.gz") + refdata_assembly_dir, "gene","tsv","gene_virtual_panel", "gene_virtual_panel.tsv.gz") genelist_bed_fname_unsorted = f'{genelist_bed_fname}.{random_id}.unsorted.bed' + + check_file_exists(virtualpanel_track_bed, logger) + check_file_exists(virtualpanel_track_tsv, logger) customlist_identifiers = {} superpanel_track = [] @@ -135,7 +136,7 @@ def get_valid_custom_genelist(genelist_fname, genelist_bed_fname, pcgr_dir, geno return 0 -def simplify_vcf(input_vcf, validated_vcf, vcf, custom_bed, refdata_directory, genome_assembly, virtual_panel_id, +def simplify_vcf(input_vcf, validated_vcf, vcf, custom_bed, refdata_assembly_dir, 
virtual_panel_id, sample_id, diagnostic_grade_only, gwas_findings, secondary_findings, output_dir, logger, debug): """ @@ -225,11 +226,11 @@ def simplify_vcf(input_vcf, validated_vcf, vcf, custom_bed, refdata_directory, g for pid in set(panel_ids): ge_panel_identifier = "GE_PANEL_" + str(pid) target_bed_gz = os.path.join( - refdata_directory,'data',genome_assembly, 'gene','bed','gene_virtual_panel', str(pid) + ".bed.gz") + refdata_assembly_dir, 'gene','bed','gene_virtual_panel', str(pid) + ".bed.gz") if diagnostic_grade_only == 1 and virtual_panel_id != "0": logger.info('Considering diagnostic-grade only genes in panel ' + str(pid) + ' - (GREEN status in Genomics England PanelApp)') target_bed_gz = os.path.join( - refdata_directory, 'data', genome_assembly, 'gene','bed','gene_virtual_panel', str(pid) + ".GREEN.bed.gz") + refdata_assembly_dir, 'gene','bed','gene_virtual_panel', str(pid) + ".GREEN.bed.gz") check_file_exists(target_bed_gz, logger) ## awk command to ignore secondary finding records while keeping records that belong to target (and that can potentially @@ -281,13 +282,12 @@ def simplify_vcf(input_vcf, validated_vcf, vcf, custom_bed, refdata_directory, g logger.info('') exit(1) -def validate_cpsr_input(refdata_directory, +def validate_cpsr_input(refdata_assembly_dir, input_vcf, validated_vcf, custom_list_fname, custom_list_bed_fname, retained_info_tags, - genome_assembly, sample_id, virtual_panel_id, diagnostic_grade_only, @@ -313,23 +313,20 @@ def validate_cpsr_input(refdata_directory, custom_target_fname['bed'] = custom_list_bed_fname get_valid_custom_genelist(custom_target_fname['tsv'], custom_target_fname['bed'], - refdata_directory, - genome_assembly, + refdata_assembly_dir, gwas_findings, secondary_findings, logger, debug) - db_assembly_dir = os.path.join(refdata_directory, 'data', genome_assembly) - if not input_vcf == 'None': vcf_object = VCF(input_vcf) ## Check that VCF does not already contain INFO tags that will be appended through CPSR annotation ## - First add info tags generated by VEP, and those generated by CPSR - populated_infotags_other_fname = os.path.join(db_assembly_dir, 'vcf_infotags_other.tsv') - populated_infotags_vep_fname = os.path.join(db_assembly_dir, 'vcf_infotags_vep.tsv') + populated_infotags_other_fname = os.path.join(refdata_assembly_dir, 'vcf_infotags_other.tsv') + populated_infotags_vep_fname = os.path.join(refdata_assembly_dir, 'vcf_infotags_vep.tsv') tags_cpsr = read_infotag_file(populated_infotags_other_fname, scope = "cpsr") tags_vep = read_infotag_file(populated_infotags_vep_fname, scope = "vep") tags_cpsr.update(tags_vep) @@ -340,14 +337,14 @@ def validate_cpsr_input(refdata_directory, track_file_info['tags_fname'] = {} for variant_track in ['clinvar','tcga','gwas','dbmts','dbnsfp','gnomad_non_cancer']: track_file_info['tags_fname'][variant_track] = os.path.join( - db_assembly_dir,'variant','vcf', variant_track, f'{variant_track}.vcfanno.vcf_info_tags.txt') + refdata_assembly_dir,'variant','vcf', variant_track, f'{variant_track}.vcfanno.vcf_info_tags.txt') for bed_track in ['simplerepeat','winmsk','rmsk','gerp']: track_file_info['tags_fname'][bed_track] = os.path.join( - db_assembly_dir,'misc','bed', bed_track, f'{bed_track}.vcfanno.vcf_info_tags.txt') + refdata_assembly_dir,'misc','bed', bed_track, f'{bed_track}.vcfanno.vcf_info_tags.txt') track_file_info['tags_fname']['gene_transcript_xref'] = os.path.join( - db_assembly_dir,'gene','bed', 'gene_transcript_xref', 'gene_transcript_xref.vcfanno.vcf_info_tags.txt') + 
refdata_assembly_dir,'gene','bed', 'gene_transcript_xref', 'gene_transcript_xref.vcfanno.vcf_info_tags.txt') for track in track_file_info['tags_fname']: infotags_vcfanno = read_vcfanno_tag_file(track_file_info['tags_fname'][track], logger) @@ -374,8 +371,7 @@ def validate_cpsr_input(refdata_directory, validated_vcf, vcf_object, custom_target_fname['bed'], - refdata_directory, - genome_assembly, + refdata_assembly_dir, virtual_panel_id, sample_id, diagnostic_grade_only, diff --git a/scripts/pcgr_summarise.py b/scripts/pcgr_summarise.py index 48b3f383..c18ee854 100755 --- a/scripts/pcgr_summarise.py +++ b/scripts/pcgr_summarise.py @@ -29,7 +29,7 @@ def __main__(): parser.add_argument('tumortype', default='Any', help='Primary tumor type of query VCF') parser.add_argument('vep_pick_order', default="mane,canonical,appris,biotype,ccds,rank,tsl,length", help=f"Comma-separated string of ordered transcript/variant properties for selection of primary variant consequence") - parser.add_argument('pcgr_db_dir',help='PCGR data directory') + parser.add_argument('refdata_assembly_dir',help='Assembly-specific reference data directory, e.g. "pcgrdb/data/grch38"') parser.add_argument('--compress_output_vcf', action="store_true", default=False, help="Compress output VCF file") parser.add_argument('--cpsr',action="store_true",help="Aggregate cancer gene annotations for Cancer Predisposition Sequencing Reporter (CPSR)") parser.add_argument('--cpsr_yaml',dest="cpsr_yaml", default=None, help='YAML file with list of targeted genes by CPSR (custom, or panel-defined)') @@ -55,22 +55,22 @@ def extend_vcf_annotations(arg_dict, logger): Moreover, it performs two important matching procedures, using 5. Information from VEP's CSQ information (HGVSp/HGVSc) to match known mutation hotspots in cancer - 6. Information from VEP's CSQ information (Consequence, HGVSp/HGVSc) and genomic coordinates to match known biomarkers in cancer + 6. Information from VEP's CSQ information (Consequence, HGVSp/HGVSc, exon information, etc.) and genomic coordinates to match known biomarkers in cancer Finally, it assesses somatic variant oncogenicity, using 7. Gene annotations (tumor suppressor, oncogene) and variant annotations (loss-of-function, gnomAD variant frequencies, variant effect predictions).
Variant oncogenicity levels are provided for all variants using a recommended five-level scheme ("Oncogenic", "Likely oncogenic", "VUS", "Likely Benign", "Benign") - Recommended scoring scheme for variant oncogenicity classification outlined by VICC/ClinGen consortia (Horak et al., Genet Med, 2022) - List of VCF INFO tags appended by this procedure is defined by the 'infotags' files in the pcgr_db_dir + List of VCF INFO tags appended by this procedure is defined by the 'infotags' files in the refdata_assembly_dir """ vcf_infotags = {} - vcf_infotags['other'] = read_infotag_file(os.path.join(arg_dict['pcgr_db_dir'], 'vcf_infotags_other.tsv'), scope = "pcgr") + vcf_infotags['other'] = read_infotag_file(os.path.join(arg_dict['refdata_assembly_dir'], 'vcf_infotags_other.tsv'), scope = "pcgr") if arg_dict['cpsr'] is True: - vcf_infotags['other'] = read_infotag_file(os.path.join(arg_dict['pcgr_db_dir'], 'vcf_infotags_other.tsv'), scope = "cpsr") - vcf_infotags['vep'] = read_infotag_file(os.path.join(arg_dict['pcgr_db_dir'], 'vcf_infotags_vep.tsv'), scope = "vep") + vcf_infotags['other'] = read_infotag_file(os.path.join(arg_dict['refdata_assembly_dir'], 'vcf_infotags_other.tsv'), scope = "cpsr") + vcf_infotags['vep'] = read_infotag_file(os.path.join(arg_dict['refdata_assembly_dir'], 'vcf_infotags_vep.tsv'), scope = "vep") vcf_infotags['other'].update(vcf_infotags['vep']) vcf_info_metadata = vcf_infotags['other'] @@ -89,19 +89,19 @@ def extend_vcf_annotations(arg_dict, logger): cpsr_target_genes[str(g['entrezgene'])] = 1 - vcf_infotags['cpsr'] = read_infotag_file(os.path.join(arg_dict['pcgr_db_dir'], 'vcf_infotags_cpsr.tsv'), scope = "cpsr") + vcf_infotags['cpsr'] = read_infotag_file(os.path.join(arg_dict['refdata_assembly_dir'], 'vcf_infotags_cpsr.tsv'), scope = "cpsr") vcf_infotags['other'].update(vcf_infotags['cpsr']) vcf_info_metadata.update(vcf_infotags['cpsr']) gene_transcript_xref_map = read_genexref_namemap( - os.path.join(arg_dict['pcgr_db_dir'], 'gene','tsv','gene_transcript_xref', 'gene_transcript_xref_bedmap.tsv.gz'), logger) + os.path.join(arg_dict['refdata_assembly_dir'], 'gene','tsv','gene_transcript_xref', 'gene_transcript_xref_bedmap.tsv.gz'), logger) cancer_hotspots = load_mutation_hotspots( - os.path.join(arg_dict['pcgr_db_dir'], 'misc','tsv','hotspot', 'hotspot.tsv.gz'), logger) + os.path.join(arg_dict['refdata_assembly_dir'], 'misc','tsv','hotspot', 'hotspot.tsv.gz'), logger) biomarkers = {} for db in ['cgi','civic']: - variant_fname = os.path.join(arg_dict['pcgr_db_dir'], 'biomarker','tsv', f"{db}.variant.tsv.gz") - clinical_fname = os.path.join(arg_dict['pcgr_db_dir'], 'biomarker','tsv', f"{db}.clinical.tsv.gz") + variant_fname = os.path.join(arg_dict['refdata_assembly_dir'], 'biomarker','tsv', f"{db}.variant.tsv.gz") + clinical_fname = os.path.join(arg_dict['refdata_assembly_dir'], 'biomarker','tsv', f"{db}.clinical.tsv.gz") if arg_dict['cpsr'] is True: biomarkers[db] = load_biomarkers(logger, variant_fname, clinical_fname, biomarker_vartype = 'MUT', biomarker_variant_origin = 'Both') else: @@ -148,17 +148,19 @@ def extend_vcf_annotations(arg_dict, logger): ## For germline-called VCFs (single-sample), pull out sequencing depth and genotype using cyvcf2 API if arg_dict['cpsr'] is True: rec.INFO['DP_CONTROL'] = "-1" - if len(rec.gt_depths) == 1: - rec.INFO['DP_CONTROL'] = str(rec.gt_depths[0]) + if not rec.gt_depths is None: + if len(rec.gt_depths) == 1: + rec.INFO['DP_CONTROL'] = str(rec.gt_depths[0]) rec.INFO['GENOTYPE'] = 'undefined' - if len(rec.gt_types) == 1: - if 
int(rec.gt_types[0]) == 0: - rec.INFO['GENOTYPE'] = 'hom_ref' - else: - if int(rec.gt_types[0]) == 1: - rec.INFO['GENOTYPE'] = 'het' - if int(rec.gt_types[0]) == 2: - rec.INFO['GENOTYPE'] = 'hom_alt' + if not rec.gt_types is None: + if len(rec.gt_types) == 1: + if int(rec.gt_types[0]) == 0: + rec.INFO['GENOTYPE'] = 'hom_ref' + else: + if int(rec.gt_types[0]) == 1: + rec.INFO['GENOTYPE'] = 'het' + if int(rec.gt_types[0]) == 2: + rec.INFO['GENOTYPE'] = 'hom_alt' if current_chrom is None: current_chrom = str(rec.CHROM) diff --git a/scripts/pcgr_validate_input.py b/scripts/pcgr_validate_input.py index e7964f49..0f4e8cde 100755 --- a/scripts/pcgr_validate_input.py +++ b/scripts/pcgr_validate_input.py @@ -19,7 +19,7 @@ def __main__(): parser = argparse.ArgumentParser(description='Verify input data for PCGR') - parser.add_argument('refdata_dir',help='Directory with reference data for PCGR/CPSR') + parser.add_argument('refdata_assembly_dir',help='Assembly-specific directory with reference data for PCGR/CPSR') parser.add_argument('input_vcf', help='Somatic (tumor) query variants (SNVs/InDels) - VCF format') parser.add_argument('validated_vcf', help="Validated VCF file with somatic (tumor) query variants (SNVs/InDels)") parser.add_argument('input_cna', help='Somatic (tumor) copy number query segments (tab-separated values)') @@ -27,7 +27,6 @@ def __main__(): parser.add_argument('input_rna_exp', help='Tumor gene expression estimates (tab-separated values)') parser.add_argument('panel_normal_vcf',help="VCF file with germline calls from panel of normals") parser.add_argument('tumor_only',type=int, default=0,choices=[0,1],help="Tumor only sequencing") - parser.add_argument('genome_assembly',help='grch37 or grch38') parser.add_argument('sample_id',help='PCGR sample_name') parser.add_argument('retained_info_tags', help="Comma-separated string of custom VCF INFO tags to be kept in PCGR output") parser.add_argument('tumor_dp_tag', help='VCF INFO tag that denotes tumor sequencing depth') @@ -42,7 +41,7 @@ def __main__(): parser.add_argument("--debug", action="store_true", help="Print full commands to log") args = parser.parse_args() - ret = validate_pcgr_input(args.refdata_dir, + ret = validate_pcgr_input(args.refdata_assembly_dir, args.input_vcf, args.validated_vcf, args.input_cna, @@ -58,7 +57,6 @@ def __main__(): args.panel_normal_vcf, args.retained_info_tags, args.tumor_only, - args.genome_assembly, args.sample_id, args.keep_uncompressed, args.output_dir, @@ -245,7 +243,7 @@ def simplify_vcf(input_vcf, validated_vcf, vcf, output_dir, sample_id, keep_unco remove_file(bcftools_simplify_log) remove_file(vt_decompose_log) -def validate_pcgr_input(refdata_directory, +def validate_pcgr_input(refdata_assembly_dir, input_vcf, validated_vcf, input_cna, @@ -261,14 +259,13 @@ def validate_pcgr_input(refdata_directory, panel_normal_vcf, retained_info_tags, tumor_only, - genome_assembly, sample_id, keep_uncompressed, output_dir, debug): """ Function that reads the input files to PCGR (VCF file and Tab-separated values file with copy number segments) and performs the following checks: - 1. No INFO annotation tags in the query VCF coincides with those generated by PCGR + 1. No INFO annotation tags in the input VCF coincides with those generated by PCGR 2. Provided columns for tumor/normal coverage and allelic depths are found in VCF 3. Provided retained VCF INFO tags are present in VCF file 4. If VCF have variants with multiple alternative alleles (e.g. 
'A,T') run vt decompose @@ -282,8 +279,6 @@ def validate_pcgr_input(refdata_directory, # if panel_normal_vcf == "None" and tumor_only == 1 and config_options['tumor_only']['exclude_pon'] is True: # logger.warning('Panel-of-normals VCF is not present - exclusion of calls found in panel-of-normals will be ignored') - db_assembly_dir = os.path.join(refdata_directory, 'data', genome_assembly) - if not input_vcf == 'None': @@ -293,9 +288,9 @@ def validate_pcgr_input(refdata_directory, ## - First add info tags generated by VEP, and those generated by PCGR vcf_infotags = {} vcf_infotags['pcgr'] = read_infotag_file( - os.path.join(refdata_directory,'data',genome_assembly, 'vcf_infotags_other.tsv'), scope = "pcgr") + os.path.join(refdata_assembly_dir, 'vcf_infotags_other.tsv'), scope = "pcgr") vcf_infotags['vep'] = read_infotag_file( - os.path.join(refdata_directory,'data',genome_assembly, 'vcf_infotags_vep.tsv'), scope = "vep") + os.path.join(refdata_assembly_dir, 'vcf_infotags_vep.tsv'), scope = "vep") vcf_infotags['pcgr'].update(vcf_infotags['vep']) vcf_tags_pcgr = vcf_infotags['pcgr'] @@ -304,20 +299,20 @@ def validate_pcgr_input(refdata_directory, track_file_info['tags_fname'] = {} for variant_track in ['clinvar','tcga','gwas','dbnsfp']: track_file_info['tags_fname'][variant_track] = os.path.join( - db_assembly_dir,'variant','vcf', variant_track, f'{variant_track}.vcfanno.vcf_info_tags.txt') + refdata_assembly_dir,'variant','vcf', variant_track, f'{variant_track}.vcfanno.vcf_info_tags.txt') for bed_track in ['simplerepeat','winmsk','rmsk','gerp']: track_file_info['tags_fname'][bed_track] = os.path.join( - db_assembly_dir,'misc','bed', bed_track, f'{bed_track}.vcfanno.vcf_info_tags.txt') + refdata_assembly_dir,'misc','bed', bed_track, f'{bed_track}.vcfanno.vcf_info_tags.txt') track_file_info['tags_fname']['gene_transcript_xref'] = os.path.join( - db_assembly_dir,'gene','bed', 'gene_transcript_xref', 'gene_transcript_xref.vcfanno.vcf_info_tags.txt') + refdata_assembly_dir,'gene','bed', 'gene_transcript_xref', 'gene_transcript_xref.vcfanno.vcf_info_tags.txt') for track in track_file_info['tags_fname']: infotags_vcfanno = read_vcfanno_tag_file(track_file_info['tags_fname'][track], logger) vcf_tags_pcgr.update(infotags_vcfanno) - ## Check that no INFO annotation tags in the query VCF coincides with those generated by PCGR + ## Check that no INFO annotation tags in the input VCF coincide with those generated by PCGR tag_check = check_existing_vcf_info_tags(vcf_object, vcf_tags_pcgr, logger) if tag_check == -1: return -1 diff --git a/scripts/pcgrr.R b/scripts/pcgrr.R index 76886aca..62443b3a 100755 --- a/scripts/pcgrr.R +++ b/scripts/pcgrr.R @@ -40,7 +40,7 @@ pcgrr::write_report_quarto_html(report = pcg_report1) yaml_fname <- paste0("/Users/sigven/project_data/packages/", "package__pcgr/bundle_update_2023/", - "pcgr/examples/SAMPLE-T_N-001.pcgr_acmg.grch38.conf.yaml") + "pcgr/examples/SAMPLE-T_N-001.pcgr.grch38.conf.yaml") ## Generate report content
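For reference, the first check that pcgr_validate_input.py delegates to check_existing_vcf_info_tags() amounts to comparing the INFO tag IDs declared in the input VCF header against the set of tags that PCGR itself populates. A minimal, self-contained sketch of that comparison follows, not the shipped implementation; the PCGR_TAGS set and the example file name are illustrative only (the real tag set is assembled from the vcf_infotags_*.tsv and *.vcfanno.vcf_info_tags.txt files under refdata_assembly_dir, as shown in the hunks above).

from cyvcf2 import VCF

# Illustrative subset of INFO tags populated by PCGR (assumption: the real set is
# read from the reference data files under refdata_assembly_dir).
PCGR_TAGS = {'TUMOR_DP', 'TUMOR_AF', 'MUTATION_HOTSPOT', 'ONCOGENICITY'}

def find_colliding_info_tags(vcf_fname, reserved_tags):
    """Return INFO tag IDs in the input VCF header that coincide with reserved tags."""
    collisions = []
    vcf = VCF(vcf_fname)
    for e in vcf.header_iter():
        header_element = e.info()
        if 'ID' in header_element.keys() and 'HeaderType' in header_element.keys():
            if header_element['HeaderType'] == 'INFO' and header_element['ID'] in reserved_tags:
                collisions.append(header_element['ID'])
    return collisions

# Example usage (hypothetical file name):
# print(find_colliding_info_tags('tumor.vcf.gz', PCGR_TAGS))

Any tag returned by such a check would trigger the error path shown in pcgr/vcf.py earlier in this diff, asking the user to remove or rename the conflicting tag in the input VCF.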