From 52279c99384d17fb5cbcf90b2602aab0198aecc7 Mon Sep 17 00:00:00 2001 From: Sigve Nakken Date: Sat, 6 Apr 2024 11:10:19 +0200 Subject: [PATCH] code clean-up, arg checks --- pcgr/arg_checker.py | 448 +++++++----------- pcgr/cna.py | 26 +- pcgr/config.py | 58 +-- pcgr/cpsr.py | 78 ++- pcgr/expression.py | 55 ++- pcgr/main.py | 52 +- pcgr/variant.py | 46 +- pcgr/vcf.py | 20 +- pcgr/vep.py | 7 +- pcgrr/R/input_data.R | 21 + pcgrr/R/main.R | 28 +- pcgrr/R/reference_data.R | 3 +- pcgrr/R/report.R | 4 +- pcgrr/data/data_coltype_defs.rda | Bin 1912 -> 1922 bytes pcgrr/data/tcga_cohorts.rda | Bin 762 -> 769 bytes pcgrr/data/tsv_cols.rda | Bin 694 -> 687 bytes .../mutational_signature.qmd | 3 - .../pcgr_quarto_report/snv_indel_main.qmd | 7 +- .../pcgr_quarto_report/value_boxes.qmd | 52 -- .../value_boxes_tumor_only.qmd | 52 -- scripts/cpsr_validate_input.py | 42 +- scripts/pcgr_summarise.py | 44 +- scripts/pcgr_validate_input.py | 25 +- scripts/pcgrr.R | 2 +- 24 files changed, 417 insertions(+), 656 deletions(-) delete mode 100644 pcgrr/inst/templates/pcgr_quarto_report/value_boxes.qmd delete mode 100644 pcgrr/inst/templates/pcgr_quarto_report/value_boxes_tumor_only.qmd diff --git a/pcgr/arg_checker.py b/pcgr/arg_checker.py index cb2f8f70..ff87799f 100644 --- a/pcgr/arg_checker.py +++ b/pcgr/arg_checker.py @@ -5,32 +5,14 @@ import os -def check_args(arg_dict): +def verify_args(arg_dict): logger = getlogger("pcgr-validate-arguments-input-a") - # Check the existence of required arguments - if arg_dict['refdata_dir'] is None or not os.path.isdir(arg_dict['refdata_dir']): - err_msg = f"Required argument '--refdata_dir' does not exist ({arg_dict['refdata_dir']})." - error_message(err_msg, logger) - - if arg_dict['genome_assembly'] is None: - err_msg = f"Required argument '--genome_assembly' has no/undefined value ({arg_dict['genome_assembly']})." - error_message(err_msg, logger) - - if arg_dict['input_vcf'] is None: - err_msg = f"Required argument '--input_vcf' does not exist ({arg_dict['input_vcf']})." - error_message(err_msg, logger) - - if arg_dict['sample_id'] is None: - err_msg = f"Required argument '--sample_id' has no/undefined value ({arg_dict['sample_id']})." - error_message(err_msg, logger) - - if len(arg_dict['sample_id']) <= 2 or len(arg_dict['sample_id']) > 35: - err_msg = f"Sample name identifier ('--sample_id' = {arg_dict['sample_id']}) must be between 2 and 35 characters long" - error_message(err_msg, logger) + + # Check the existence of required arguments + verify_required_args(arg_dict, logger) # Optional arguments - if arg_dict['expression_sim'] == True and arg_dict['input_rna_exp'] is None: warn_msg = f"RNA expression similarity analysis can only be performed if --input_rna_exp is set (--input_rna_exp = {arg_dict['input_rna_exp']})." 
warn_message(warn_msg, logger) @@ -198,51 +180,25 @@ def check_args(arg_dict): err_msg = f"Minimum percent overlap between copy number segment and gene transcript ('--cna_overlap_pct' = {arg_dict['cna_overlap_pct']}) must be within (0, 100]" error_message(err_msg, logger) - # VEP options - if int(arg_dict['vep_n_forks']) <= int(pcgr_vars.VEP_MIN_FORKS) or int(arg_dict['vep_n_forks']) > int(pcgr_vars.VEP_MAX_FORKS): - err_msg = ( - f"Number of forks that VEP can use during annotation must be above {str(pcgr_vars.VEP_MIN_FORKS)} and not " - f"more than {str(pcgr_vars.VEP_MAX_FORKS)} (recommended is 4), current value is {arg_dict['vep_n_forks']}" - ) - error_message(err_msg, logger) + verify_vep_options(arg_dict, logger) - if int(arg_dict['vep_buffer_size']) < int(pcgr_vars.VEP_MIN_BUFFER_SIZE) or int(arg_dict['vep_buffer_size']) > int(pcgr_vars.VEP_MAX_BUFFER_SIZE): - err_msg = ( - f"Internal VEP buffer size, corresponding to the number of variants that are read in to memory simultaneously " - f"('--vep_buffer_size' = {arg_dict['vep_buffer_size']}), " - f"must be within [{str(pcgr_vars.VEP_MIN_BUFFER_SIZE)}, {str(pcgr_vars.VEP_MAX_BUFFER_SIZE)}]") - error_message(err_msg, logger) - - # Check that VEP pick criteria is formatted correctly - if not arg_dict['vep_pick_order'] is None: - values = str(arg_dict['vep_pick_order']).split(',') - permitted_sources = pcgr_vars.VEP_PICK_CRITERIA - num_permitted_sources = 0 - for v in values: - if v in permitted_sources: - num_permitted_sources += 1 - - if num_permitted_sources != len(pcgr_vars.VEP_PICK_CRITERIA): - err_msg = ( - f"Option '--vep_pick_order' = {str(arg_dict['vep_pick_order'])} is formatted incorrectly, should be " - f"a comma-separated string of the following values: '{' '.join(pcgr_vars.VEP_PICK_CRITERIA)}'" - ) - error_message(err_msg, logger) - - return + return(arg_dict) -def define_output_files(arg_dict): +def define_output_files(arg_dict, cpsr = False): """ Define output files """ - logger = getlogger("pcgr-output-files") + logger = getlogger("define-output-files") # create output folder (if not already exists) output_dir = utils.safe_makedir(os.path.abspath(arg_dict['output_dir'])) # Define general output prefix output_prefix = os.path.join( str(output_dir), f"{arg_dict['sample_id']}.pcgr.{arg_dict['genome_assembly']}") + if cpsr: + output_prefix = os.path.join( + str(output_dir), f"{arg_dict['sample_id']}.cpsr.{arg_dict['genome_assembly']}") # Append output files to arg_dict output_vcf = \ @@ -264,33 +220,35 @@ def define_output_files(arg_dict): err_msg = "Output files (e.g. " + str(output_vcf_tsv) + \ ") already exist - please specify different sample_id or add option --force_overwrite" error_message(err_msg, logger) - - # if annotated output cna segments exist and overwrite not set - if os.path.exists(output_cna) and arg_dict["force_overwrite"] is False: - err_msg = "Output files (e.g. " + str(output_cna) + \ - ") already exist - please specify different sample_id or add option --force_overwrite" - error_message(err_msg, logger) - - # if output annotated gene expression exist and overwrite not set - if os.path.exists(output_expression) and arg_dict["force_overwrite"] is False: - err_msg = "Output files (e.g. 
" + str(output_expression) + \ - ") already exist - please specify different sample_id or add option --force_overwrite" - error_message(err_msg, logger) + + if not cpsr: + # if annotated output cna segments exist and overwrite not set + if os.path.exists(output_cna) and arg_dict["force_overwrite"] is False: + err_msg = "Output files (e.g. " + str(output_cna) + \ + ") already exist - please specify different sample_id or add option --force_overwrite" + error_message(err_msg, logger) + + # if output annotated gene expression exist and overwrite not set + if os.path.exists(output_expression) and arg_dict["force_overwrite"] is False: + err_msg = "Output files (e.g. " + str(output_expression) + \ + ") already exist - please specify different sample_id or add option --force_overwrite" + error_message(err_msg, logger) output_data = {} output_data['prefix'] = output_prefix output_data['dir'] = output_dir output_data['vcf'] = output_vcf output_data['vcf2tsv'] = output_vcf_tsv - output_data['cna'] = output_cna - output_data['expression'] = output_expression + + if not cpsr: + output_data['cna'] = output_cna + output_data['expression'] = output_expression return output_data def verify_input_files(arg_dict): """ - 1. Checks existence of input files/dirs (arg_dict) - 2. Checks that the data bundle is of correct date + 1. Checks existence of all input files/dirs (arg_dict) """ logger = getlogger("pcgr-check-files") @@ -302,9 +260,6 @@ def verify_input_files(arg_dict): pon_vcf_dir = 'NA' db_dir = 'NA' base_dir = 'NA' - #output_vcf = 'NA' - #output_cna = 'NA' - #output_exp = 'NA' pon_vcf_basename = 'NA' input_vcf_basename = 'NA' input_cna_basename = 'NA' @@ -314,15 +269,7 @@ def verify_input_files(arg_dict): arg_dict['rna_fusion_tumor'] = None # create output folder (if not already exists) - output_dir_full = utils.safe_makedir(os.path.abspath(arg_dict['output_dir'])) - - # Append output files to arg_dict - #output_vcf = \ - # os.path.join(str(output_dir_full), f"{arg_dict['sample_id']}.pcgr.{arg_dict['genome_assembly']}.vcf.gz") - #output_cna = \ - # os.path.join(str(output_dir_full), f"{arg_dict['sample_id']}.pcgr.{arg_dict['genome_assembly']}.cna_segments.tsv.gz") - #output_exp = \ - # os.path.join(str(output_dir_full), f"{arg_dict['sample_id']}.pcgr.{arg_dict['genome_assembly']}.expression.tsv.gz") + #output_dir_full = utils.safe_makedir(os.path.abspath(arg_dict['output_dir'])) # check that either input vcf or cna segments exist if arg_dict['input_vcf'] is None and arg_dict['input_cna'] is None: @@ -367,11 +314,6 @@ def verify_input_files(arg_dict): input_vcf_basename = os.path.basename(str(arg_dict["input_vcf"])) input_vcf_dir = os.path.dirname(os.path.abspath(arg_dict["input_vcf"])) - # if output vcf exist and overwrite not set - #if os.path.exists(output_vcf) and arg_dict["force_overwrite"] is False: - # err_msg = f"Output files (e.g. {output_vcf}) already exist - please specify different sample_id or add option --force_overwrite" - # error_message(err_msg, logger) - # check if input cna segments exist if not arg_dict["input_cna"] is None: check_file_exists(os.path.abspath(arg_dict["input_cna"]), strict = True, logger = logger) @@ -384,25 +326,19 @@ def verify_input_files(arg_dict): input_cna_basename = os.path.basename(str(arg_dict["input_cna"])) input_cna_dir = os.path.dirname(os.path.abspath(arg_dict["input_cna"])) - # if output cna segments exist and overwrite not set - #if os.path.exists(output_cna) and arg_dict["force_overwrite"] is False: - # err_msg = "Output files (e.g. 
" + str(output_cna) + \ - # ") already exist - please specify different sample_id or add option --force_overwrite" - # error_message(err_msg, logger) - # check if input rna fusion variants exist - if not arg_dict["input_rna_fusion"] is None: - check_file_exists(os.path.abspath(arg_dict["input_rna_fusion"]), strict = True, logger = logger) + # if not arg_dict["input_rna_fusion"] is None: + # check_file_exists(os.path.abspath(arg_dict["input_rna_fusion"]), strict = True, logger = logger) - if not (os.path.abspath(arg_dict["input_rna_fusion"]).endswith(".tsv") or - os.path.abspath(arg_dict["input_rna_fusion"]).endswith(".txt")): - err_msg = "RNA fusion variants file (" + os.path.abspath( - arg_dict["input_rna_fusion"]) + ") does not have the correct file extension (.tsv or .txt)" - error_message(err_msg, logger) - input_rna_fusion_basename = os.path.basename( - str(arg_dict["input_rna_fusion"])) - input_rna_fusion_dir = os.path.dirname( - os.path.abspath(arg_dict["input_rna_fusion"])) + # if not (os.path.abspath(arg_dict["input_rna_fusion"]).endswith(".tsv") or + # os.path.abspath(arg_dict["input_rna_fusion"]).endswith(".txt")): + # err_msg = "RNA fusion variants file (" + os.path.abspath( + # arg_dict["input_rna_fusion"]) + ") does not have the correct file extension (.tsv or .txt)" + # error_message(err_msg, logger) + # input_rna_fusion_basename = os.path.basename( + # str(arg_dict["input_rna_fusion"])) + # input_rna_fusion_dir = os.path.dirname( + # os.path.abspath(arg_dict["input_rna_fusion"])) # check if input rna expression exist if not arg_dict["input_rna_exp"] is None: @@ -418,12 +354,6 @@ def verify_input_files(arg_dict): input_rna_expression_dir = os.path.dirname( os.path.abspath(arg_dict["input_rna_exp"])) - # if output annotated gene expression exist and overwrite not set - #if os.path.exists(output_exp) and arg_dict["force_overwrite"] is False: - # err_msg = "Output files (e.g. 
" + str(output_exp) + \ - # ") already exist - please specify different sample_id or add option --force_overwrite" - # error_message(err_msg, logger) - # check if input rna fusion variants exist if not arg_dict["cpsr_report"] is None: if not os.path.exists(os.path.abspath(arg_dict["cpsr_report"])): @@ -438,14 +368,84 @@ def verify_input_files(arg_dict): str(arg_dict["cpsr_report"])) input_cpsr_report_dir = os.path.dirname( os.path.abspath(arg_dict["cpsr_report"])) + + vep_dir = verify_vep_cache(arg_dict, logger) + refdata_assembly_dir = verify_refdata(arg_dict, logger, cpsr = True) + - # check the existence of base folder + input_data = { + "vcf_dir": input_vcf_dir, + "cna_dir": input_cna_dir, + "rna_fusion_dir": input_rna_fusion_dir, + "rna_expression_dir": input_rna_expression_dir, + "cpsr_report_dir": input_cpsr_report_dir, + "cpsr_report_basename": input_cpsr_report_basename, + "pon_vcf_dir": pon_vcf_dir, + "refdata_assembly_dir": refdata_assembly_dir, + "vep_dir": vep_dir, + "pon_vcf_basename": pon_vcf_basename, + "vcf_basename": input_vcf_basename, + "cna_basename": input_cna_basename, + "rna_fusion_basename": input_rna_fusion_basename, + "rna_expression_basename": input_rna_expression_basename + + } + + return input_data + + +def verify_refdata(arg_dict: dict, logger = None, cpsr = False): + + # check the existence of base refdata folder base_dir = os.path.abspath(arg_dict["refdata_dir"]) if not os.path.isdir(base_dir): err_msg = "Base directory (" + str(base_dir) + ") does not exist" error_message(err_msg, logger) - # check the existence of base folder + # check the existence of 'data' folder within the base folder + refdata_dir = os.path.join(os.path.abspath(arg_dict["refdata_dir"]), "data") + if not os.path.isdir(refdata_dir): + err_msg = "Data directory (" + str(refdata_dir) + ") does not exist" + error_message(err_msg, logger) + + # check the existence of specified assembly data folder within the base/data folder + refdata_assembly_dir = os.path.join(os.path.abspath( + arg_dict["refdata_dir"]), "data", arg_dict["genome_assembly"]) + if not os.path.isdir(refdata_assembly_dir): + err_msg = "Data directory for the specified genome assembly (" + str( + refdata_assembly_dir) + ") does not exist" + error_message(err_msg, logger) + + # check the existence of .PCGR_BUNDLE_VERSION (starting from 1.5.0) + rel_notes_file = os.path.join(refdata_assembly_dir, ".PCGR_BUNDLE_VERSION") + if not os.path.exists(rel_notes_file): + err_msg = "Could not find file ('" + str(rel_notes_file) + "') in the PCGR reference data directory - exiting." 
+ error_message(err_msg, logger) + + f_rel_notes = open(rel_notes_file,'r') + compliant_data_bundle = 0 + for line in f_rel_notes: + if pcgr_vars.DB_VERSION in line: + compliant_data_bundle = 1 + + if compliant_data_bundle == 0: + err_msg = ( + f'The PCGR data bundle is not compliant with the software version - please download the ' + f'latest software and data bundle (see https://github.com/sigven/cpsr for instructions)') + if cpsr is False: + err_msg = ( + f'The PCGR data bundle is outdated - please download the latest data bundle ', + f'(see https://sigven.github.io/pcgr for instructions)' + ) + error_message(err_msg,logger) + + f_rel_notes.close() + + return(refdata_assembly_dir) + +def verify_vep_cache(arg_dict: dict, logger = None): + + # check the existence of base VEP dir vep_dir = os.path.abspath(arg_dict["vep_dir"]) if not os.path.isdir(vep_dir): err_msg = "VEP directory ('" + str(vep_dir) + "') does not exist" @@ -456,99 +456,86 @@ def verify_input_files(arg_dict): err_msg = "VEP directory ('" + str(vep_human_dir) + "') does not exist" error_message(err_msg, logger) + ## check that the specified VEP version exists vep_human_version_dir = os.path.join( os.path.abspath(arg_dict["vep_dir"]), "homo_sapiens", f"{pcgr_vars.VEP_VERSION}_{pcgr_vars.VEP_ASSEMBLY[arg_dict['genome_assembly']]}") if not os.path.isdir(vep_human_version_dir): err_msg = "VEP directory ('" + str(vep_human_version_dir) + "') does not exist" error_message(err_msg, logger) + + return(vep_dir) - # check the existence of data folder within the base folder - db_dir = os.path.join(os.path.abspath(arg_dict["refdata_dir"]), "data") - if not os.path.isdir(db_dir): - err_msg = "Data directory (" + str(db_dir) + ") does not exist" - error_message(err_msg, logger) - - # check the existence of specified assembly data folder within the base folder - db_assembly_dir = os.path.join(os.path.abspath( - arg_dict["refdata_dir"]), "data", arg_dict["genome_assembly"]) - if not os.path.isdir(db_assembly_dir): - err_msg = "Data directory for the specified genome assembly (" + str( - db_assembly_dir) + ") does not exist" +def verify_vep_options(arg_dict: dict, logger = None): + + # VEP options + if int(arg_dict['vep_n_forks']) <= int(pcgr_vars.VEP_MIN_FORKS) or \ + int(arg_dict['vep_n_forks']) > int(pcgr_vars.VEP_MAX_FORKS): + err_msg = ( + f"Number of forks that VEP can use during annotation must be above {str(pcgr_vars.VEP_MIN_FORKS)} and not " + f"more than {str(pcgr_vars.VEP_MAX_FORKS)} (recommended is 4), current value is {arg_dict['vep_n_forks']}" + ) error_message(err_msg, logger) - # check the existence of .PCGR_BUNDLE_VERSION (starting from 1.5.0) - rel_notes_file = os.path.join(os.path.abspath( - arg_dict["refdata_dir"]), "data", arg_dict["genome_assembly"], ".PCGR_BUNDLE_VERSION") - if not os.path.exists(rel_notes_file): - err_msg = "The PCGR data bundle is outdated - please download the latest data bundle (see github.com/sigven/pcgr for instructions)" + if int(arg_dict['vep_buffer_size']) < int(pcgr_vars.VEP_MIN_BUFFER_SIZE) or \ + int(arg_dict['vep_buffer_size']) > int(pcgr_vars.VEP_MAX_BUFFER_SIZE): + err_msg = ( + f"Internal VEP buffer size, corresponding to the number of variants that are read in to memory simultaneously " + f"('--vep_buffer_size' = {arg_dict['vep_buffer_size']}), " + f"must be within [{str(pcgr_vars.VEP_MIN_BUFFER_SIZE)}, {str(pcgr_vars.VEP_MAX_BUFFER_SIZE)}]") error_message(err_msg, logger) - f_rel_not = open(rel_notes_file, "r") - compliant_data_bundle = 0 - for line in f_rel_not: - if 
pcgr_vars.DB_VERSION in line: - compliant_data_bundle = 1 + # Check that VEP pick criteria is formatted correctly + if not arg_dict['vep_pick_order'] is None: + values = str(arg_dict['vep_pick_order']).split(',') + permitted_sources = pcgr_vars.VEP_PICK_CRITERIA + num_permitted_sources = 0 + for v in values: + if v in permitted_sources: + num_permitted_sources += 1 - f_rel_not.close() + if num_permitted_sources != len(pcgr_vars.VEP_PICK_CRITERIA): + err_msg = ( + f"Option '--vep_pick_order' = {str(arg_dict['vep_pick_order'])} is formatted incorrectly, should be " + f"a comma-separated string of the following values: '{' '.join(pcgr_vars.VEP_PICK_CRITERIA)}'" + ) + error_message(err_msg, logger) + + +def verify_required_args(arg_dict: dict, logger = None): + + if arg_dict['refdata_dir'] is None or not os.path.isdir(arg_dict['refdata_dir']): + err_msg = f"Required argument '--refdata_dir' does not exist ({arg_dict['refdata_dir']})." + error_message(err_msg, logger) - if compliant_data_bundle == 0: - err_msg = "The PCGR data bundle is not compliant with the software version - please download the latest software and data bundle (see https://github.com/sigven/pcgr for instructions)" + if arg_dict['genome_assembly'] is None: + err_msg = f"Required argument '--genome_assembly' has no/undefined value ({arg_dict['genome_assembly']})." error_message(err_msg, logger) - input_data = { - "vcf_dir": input_vcf_dir, - "cna_dir": input_cna_dir, - "rna_fusion_dir": input_rna_fusion_dir, - "rna_expression_dir": input_rna_expression_dir, - "cpsr_report_dir": input_cpsr_report_dir, - "cpsr_report_basename": input_cpsr_report_basename, - "pon_vcf_dir": pon_vcf_dir, - "db_assembly_dir": db_assembly_dir, - "refdata_dir": base_dir, - "vep_dir": vep_dir, - #"output_dir": output_dir_full, - #"output_vcf": output_vcf, - #"output_cna": output_cna, - #"output_exp": output_exp, - "pon_vcf_basename": pon_vcf_basename, - "vcf_basename": input_vcf_basename, - "cna_basename": input_cna_basename, - "rna_fusion_basename": input_rna_fusion_basename, - "rna_expression_basename": input_rna_expression_basename - - } + if arg_dict['input_vcf'] is None or not os.path.exists(arg_dict['input_vcf']): + err_msg = f"Required argument '--input_vcf' does not exist ({arg_dict['input_vcf']})." + error_message(err_msg, logger) - return input_data + if arg_dict['sample_id'] is None: + err_msg = f"Required argument '--sample_id' has no/undefined value ({arg_dict['sample_id']})." + error_message(err_msg, logger) + if len(arg_dict['sample_id']) <= 2 or len(arg_dict['sample_id']) > 35: + err_msg = f"Sample name identifier ('--sample_id' = {arg_dict['sample_id']}) must be between 2 and 35 characters long" + error_message(err_msg, logger) + + return(0) + ##---- CPSR ----## -def check_args_cpsr(arg_dict): +def verify_args_cpsr(arg_dict): logger = getlogger('cpsr-validate-input-arguments-a') arg_dict['vep_regulatory'] = True ## Required arguments - ## Check that query VCF is set and exists - if arg_dict['input_vcf'] is None or not os.path.exists(arg_dict['input_vcf']): - err_msg = f"Required argument '--input_vcf' does not exist ({arg_dict['input_vcf']})." - error_message(err_msg,logger) - ## Check that PCGR directory (with data bundle) is provided and exists - if arg_dict['refdata_dir'] is None or not os.path.isdir(arg_dict['refdata_dir']): - err_msg = f"Required argument '--refdata_dir' does not exist ({arg_dict['refdata_dir']})." 
- error_message(err_msg,logger) - ## Check that genome assembly is set - if arg_dict['genome_assembly'] is None: - err_msg = f"Required argument '--genome_assembly' has no/undefined value ({arg_dict['genome_assembly']})." - error_message(err_msg,logger) - ## Check that sample identifier is set and is of appropriate length (minimum two characters) - if arg_dict['sample_id'] is None: - err_msg = f"Required argument '--sample_id' has no/undefined value ({arg_dict['sample_id']})." - error_message(err_msg,logger) - - if len(arg_dict['sample_id']) <= 2: - err_msg = f"Sample name identifier ('--sample_id') requires a name with more than two characters ({arg_dict['sample_id']})." - error_message(err_msg,logger) - + + verify_required_args(arg_dict, logger) + ### Optional arguments ## Provide virtual_panel_id or a custom list from panel 0 if arg_dict['virtual_panel_id'] == "-1" and not arg_dict['custom_list']: @@ -604,36 +591,8 @@ def check_args_cpsr(arg_dict): warn_message(warn_msg, logger) arg_dict['diagnostic_grade_only'] = False - ## VEP options - if arg_dict['vep_n_forks'] <= pcgr_vars.VEP_MIN_FORKS or arg_dict['vep_n_forks'] > pcgr_vars.VEP_MAX_FORKS: - err_msg = ( - f"Number of forks that VEP can use during annotation must be above 0 and not more " - f"than {pcgr_vars.VEP_MAX_FORKS} (recommended is 4), current value is {arg_dict['vep_n_forks']}") - error_message(err_msg,logger) - - if arg_dict['vep_buffer_size'] < pcgr_vars.VEP_MIN_BUFFER_SIZE or arg_dict['vep_buffer_size'] > pcgr_vars.VEP_MAX_BUFFER_SIZE: - err_msg = ( - f"Internal VEP buffer size, corresponding to the number of variants that are read in to memory simultaneously, " - f"must be above {pcgr_vars.VEP_MIN_BUFFER_SIZE} and not more than {pcgr_vars.VEP_MAX_BUFFER_SIZE}, " - f"current value is {arg_dict['vep_buffer_size']}" - ) - error_message(err_msg,logger) - - ## Check that VEP pick criteria is formatted correctly - if not arg_dict['vep_pick_order'] is None: - values = str(arg_dict['vep_pick_order']).split(',') - permitted_sources = pcgr_vars.VEP_PICK_CRITERIA - num_permitted_sources = 0 - for v in values: - if v in permitted_sources: - num_permitted_sources += 1 + verify_vep_options(arg_dict, logger) - if num_permitted_sources != len(pcgr_vars.VEP_PICK_CRITERIA): - err_msg = ( - f"Option '--vep_pick_order' = {str(arg_dict['vep_pick_order'])} is formatted incorrectly, should be " - f"a comma-separated string of the following values: '{' '.join(pcgr_vars.VEP_PICK_CRITERIA)}'" - ) - error_message(err_msg, logger) return(arg_dict) @@ -684,71 +643,15 @@ def verify_input_files_cpsr(arg_dict): err_msg = f"Output files (e.g. 
{output_vcf}) already exist - please specify different " + \ "sample_id or add option --force_overwrite" error_message(err_msg,logger) + - # check the existence of base folder - base_dir = os.path.abspath(arg_dict["refdata_dir"]) - if not os.path.isdir(base_dir): - err_msg = "Base directory (" + str(base_dir) + ") does not exist" - error_message(err_msg, logger) - - # check the existence of base folder - vep_dir = os.path.abspath(arg_dict["vep_dir"]) - if not os.path.isdir(vep_dir): - err_msg = "VEP directory ('" + str(vep_dir) + "') does not exist" - error_message(err_msg, logger) - - vep_human_dir = os.path.join(os.path.abspath(arg_dict["vep_dir"]), "homo_sapiens") - if not os.path.isdir(vep_human_dir): - err_msg = "VEP directory ('" + str(vep_human_dir) + "') does not exist" - error_message(err_msg, logger) - - vep_human_version_dir = os.path.join( - os.path.abspath(arg_dict["vep_dir"]), "homo_sapiens", - f"{pcgr_vars.VEP_VERSION}_{pcgr_vars.VEP_ASSEMBLY[arg_dict['genome_assembly']]}") - if not os.path.isdir(vep_human_version_dir): - err_msg = "VEP directory ('" + str(vep_human_version_dir) + "') does not exist" - error_message(err_msg, logger) - - # check the existence of data folder within the base folder - db_dir = os.path.join(os.path.abspath(arg_dict["refdata_dir"]), "data") - if not os.path.isdir(db_dir): - err_msg = "Data directory (" + str(db_dir) + ") does not exist" - error_message(err_msg, logger) - - # check the existence of specified assembly data folder within the base folder - db_assembly_dir = os.path.join(os.path.abspath( - arg_dict["refdata_dir"]), "data", arg_dict["genome_assembly"]) - if not os.path.isdir(db_assembly_dir): - err_msg = "Data directory for the specified genome assembly (" + str( - db_assembly_dir) + ") does not exist" - error_message(err_msg, logger) - - # check the existence of .PCGR_BUNDLE_VERSION (starting from 1.5.0) - rel_notes_file = os.path.join(os.path.abspath( - arg_dict["refdata_dir"]), "data", arg_dict["genome_assembly"], ".PCGR_BUNDLE_VERSION") - if not os.path.exists(rel_notes_file): - err_msg = "The PCGR data bundle is outdated - please download the latest data bundle (see github.com/sigven/pcgr for instructions)" - error_message(err_msg, logger) - - f_rel_not = open(rel_notes_file,'r') - compliant_data_bundle = 0 - for line in f_rel_not: - if pcgr_vars.DB_VERSION in line: - compliant_data_bundle = 1 - - f_rel_not.close() - - if compliant_data_bundle == 0: - err_msg = ( - f'The PCGR data bundle is not compliant with the software version - please download the ' - f'latest software and data bundle (see https://github.com/sigven/cpsr for instructions)') - error_message(err_msg,logger) + vep_dir = verify_vep_cache(arg_dict, logger) + refdata_assembly_dir = verify_refdata(arg_dict, logger, cpsr = True) cpsr_paths = { "input_vcf_dir": input_vcf_dir, "input_customlist_dir": input_customlist_dir, - "db_assembly_dir": db_assembly_dir, - "refdata_dir": base_dir, + "refdata_assembly_dir": refdata_assembly_dir, "vep_dir": vep_dir, "output_dir": output_dir_full, "output_vcf": output_vcf, @@ -757,3 +660,4 @@ def verify_input_files_cpsr(arg_dict): } return cpsr_paths + diff --git a/pcgr/cna.py b/pcgr/cna.py index d60f8bac..dcd4fd63 100644 --- a/pcgr/cna.py +++ b/pcgr/cna.py @@ -17,7 +17,7 @@ def annotate_cna_segments(output_fname: str, output_dir: str, cna_segment_file: str, - db_assembly_dir: str, + refdata_assembly_dir: str, build: str, sample_id: str, n_copy_amplifications: int = 5, @@ -30,7 +30,7 @@ def annotate_cna_segments(output_fname: str, 
output_fname (str): File name of the annotated output file. output_dir (str): Directory to save the annotated file. cna_segment_file (str): Path to the user-provided copy number aberrations segment file. - db_assembly_dir (str): Path to the build-specific PCGR database directory. + refdata_assembly_dir (str): Path to the build-specific PCGR database directory. build (str): Genome assembly build of input data. sample_id( (str): Sample identifier n_copy_amplifications (int, optional): Number of copies to consider as gains/amplifications. Defaults to 5. @@ -86,7 +86,7 @@ def annotate_cna_segments(output_fname: str, ## Check that 'End' of segments do not exceed chromosome lengths chromsizes_fname = \ - os.path.join(db_assembly_dir, 'chromsize.' + build + '.tsv') + os.path.join(refdata_assembly_dir, 'chromsize.' + build + '.tsv') check_file_exists(chromsizes_fname, logger) chromsizes = pd.read_csv(chromsizes_fname, sep="\t", header=None, names=['Chromosome', 'ChromLength']) @@ -110,14 +110,14 @@ def annotate_cna_segments(output_fname: str, temp_files.append(cna_query_segment_bed.fn) ## annotate segments with cytobands - cna_query_segment_df = annotate_cytoband(cna_query_segment_bed, output_dir, db_assembly_dir, logger) + cna_query_segment_df = annotate_cytoband(cna_query_segment_bed, output_dir, refdata_assembly_dir, logger) ## annotate with protein-coding transcripts cna_query_segment_bed = pybedtools.BedTool.from_dataframe(cna_query_segment_df) temp_files.append(cna_query_segment_bed.fn) cna_query_segment_df = annotate_transcripts( - cna_query_segment_bed, output_dir, db_assembly_dir, overlap_fraction=overlap_fraction, logger=logger) + cna_query_segment_bed, output_dir, refdata_assembly_dir, overlap_fraction=overlap_fraction, logger=logger) cna_query_segment_df['segment_length_mb'] = \ ((cna_query_segment_df['segment_end'] - cna_query_segment_df['segment_start']) / 1e6).astype(float).round(5) @@ -128,8 +128,8 @@ def annotate_cna_segments(output_fname: str, cna_actionable_dict = {} for db in ['cgi','civic']: - variant_fname = os.path.join(db_assembly_dir, 'biomarker','tsv', f"{db}.variant.tsv.gz") - clinical_fname = os.path.join(db_assembly_dir, 'biomarker','tsv', f"{db}.clinical.tsv.gz") + variant_fname = os.path.join(refdata_assembly_dir, 'biomarker','tsv', f"{db}.variant.tsv.gz") + clinical_fname = os.path.join(refdata_assembly_dir, 'biomarker','tsv', f"{db}.clinical.tsv.gz") logger.info(f"Loading copy-number biomarker evidence from {db} ..") biomarkers[db] = load_biomarkers( logger, variant_fname, clinical_fname, biomarker_vartype = 'CNA') @@ -197,14 +197,14 @@ def annotate_cna_segments(output_fname: str, return 0 -def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, db_assembly_dir: str, logger: logging.Logger) -> pd.DataFrame: +def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, refdata_assembly_dir: str, logger: logging.Logger) -> pd.DataFrame: pybedtools.set_tempdir(output_dir) temp_files = [] # BED file with cytoband annotations cytoband_bed_fname = \ - os.path.join(db_assembly_dir, 'misc','bed','cytoband', 'cytoband.bed.gz') + os.path.join(refdata_assembly_dir, 'misc','bed','cytoband', 'cytoband.bed.gz') cytoband_annotated_segments = pd.DataFrame() @@ -272,7 +272,7 @@ def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, db_assembly_dir def annotate_transcripts(cna_segments_bt: BedTool, output_dir: str, - db_assembly_dir: str, + refdata_assembly_dir: str, overlap_fraction: float, logger: logging.Logger) -> pd.DataFrame: @@ -282,7 +282,7 @@ def 
annotate_transcripts(cna_segments_bt: BedTool, Parameters: - cna_segments_bt: A BedTool object representing the CNA segments. - output_dir: A string specifying the output directory (for temporary files generation). - - db_assembly_dir: A string specifying the directory of the build-specific PCGR data bundle. + - refdata_assembly_dir: A string specifying the directory of the build-specific PCGR data bundle. - overlap_fraction: A float representing the fraction of overlap required for annotation. Returns: @@ -294,9 +294,9 @@ def annotate_transcripts(cna_segments_bt: BedTool, # BED file with protein-coding transcripts gene_transcript_bed_fname = \ - os.path.join(db_assembly_dir, 'gene','bed','gene_transcript_xref', 'gene_transcript_xref_pc_nopad.bed.gz') + os.path.join(refdata_assembly_dir, 'gene','bed','gene_transcript_xref', 'gene_transcript_xref_pc_nopad.bed.gz') gene_xref_tsv_fname = \ - os.path.join(db_assembly_dir, "gene", "tsv", "gene_transcript_xref", "gene_transcript_xref.tsv.gz") + os.path.join(refdata_assembly_dir, "gene", "tsv", "gene_transcript_xref", "gene_transcript_xref.tsv.gz") cna_segments_annotated = pd.DataFrame() diff --git a/pcgr/config.py b/pcgr/config.py index 7b5168f7..acefd5bd 100644 --- a/pcgr/config.py +++ b/pcgr/config.py @@ -17,7 +17,8 @@ def create_config(arg_dict, workflow = "PCGR"): 'sample_id': arg_dict['sample_id'], 'genome_assembly': arg_dict['genome_assembly'], 'debug': arg_dict['debug'], - 'pcgrr_conda': arg_dict['pcgrr_conda'] + 'pcgrr_conda': arg_dict['pcgrr_conda'], + 'output_prefix': "None" } conf_options['vep'] = { @@ -72,9 +73,9 @@ def create_config(arg_dict, workflow = "PCGR"): if not arg_dict['tumor_ploidy'] is None: conf_options['sample_properties']['tumor_ploidy'] = float(arg_dict['tumor_ploidy']) - conf_options['clinicaltrials'] = { - 'run': int(arg_dict['include_trials']) - } + #conf_options['clinicaltrials'] = { + # 'run': int(arg_dict['include_trials']) + #} conf_options['other']['vcf2maf'] = int(arg_dict['vcf2maf']) conf_options['somatic_cna'] = { 'cna_overlap_pct': float(arg_dict['cna_overlap_pct']), @@ -172,11 +173,12 @@ def create_config(arg_dict, workflow = "PCGR"): return conf_options -def populate_config_data(conf_options: dict, db_dir: str, workflow = "PCGR", logger=None): +def populate_config_data(conf_options: dict, refdata_assembly_dir: str, workflow = "PCGR", logger=None): conf_data = {} conf_data['sample_id'] = conf_options['sample_id'] conf_data['output_dir'] = conf_options['output_dir'] + conf_data['output_prefix'] = conf_options['output_prefix'] conf_data['workflow'] = workflow conf_data['genome_assembly'] = conf_options['genome_assembly'] conf_data['software'] = {} @@ -184,42 +186,17 @@ def populate_config_data(conf_options: dict, db_dir: str, workflow = "PCGR", log conf_data['software']['cpsr_version'] = pcgr_vars.PCGR_VERSION conf_data['molecular_data'] = conf_options['molecular_data'] - ## add paths to annotated files (VCF/TSV, CNA, TMB, EXPRESSION) - # conf_data['molecular_data'] = {} - # conf_data['molecular_data']['fname_mut_vcf'] = conf_options['annotated_vcf'] - # conf_data['molecular_data']['fname_mut_tsv'] = conf_options['annotated_tsv'] - # conf_data['molecular_data']['fname_cna_tsv'] = "None" - # conf_data['molecular_data']['fname_expression_tsv'] = "None" - # if workflow == "PCGR" and conf_options['annotated_cna'] != "None": - # conf_data['molecular_data']['fname_cna_tsv'] = conf_options['annotated_cna'] - # del conf_options['annotated_cna'] - # if workflow == "PCGR" and conf_options['annotated_exp'] != "None": 
- # conf_data['molecular_data']['fname_expression_tsv'] = conf_options['annotated_exp'] - # del conf_options['annotated_exp'] - # if workflow == "CPSR": - # del conf_data['molecular_data']['fname_cna_tsv'] - # del conf_data['molecular_data']['fname_expression_tsv'] - - # conf_data['molecular_data']['fname_tmb'] = "None" - # if workflow == "PCGR" and conf_options['fname_tmb'] != "None": - # conf_data['molecular_data']['fname_tmb'] = conf_options['fname_tmb'] - # del conf_options['fname_tmb'] - # if workflow == "CPSR": - # del conf_data['molecular_data']['fname_tmb'] - - genome_assembly = conf_options['genome_assembly'] del conf_options['sample_id'] del conf_options['genome_assembly'] del conf_options['output_dir'] del conf_options['molecular_data'] - #del conf_options['annotated_vcf'] - #del conf_options['annotated_tsv'] + conf_data['conf'] = conf_options conf_data['reference_data'] = {} conf_data['reference_data']['version'] = pcgr_vars.DB_VERSION conf_data['reference_data']['path'] = \ - os.path.join(db_dir, "data", genome_assembly) + refdata_assembly_dir metadata_pd = pd.DataFrame() conf_data['reference_data']['source_metadata'] = {} @@ -228,11 +205,12 @@ def populate_config_data(conf_options: dict, db_dir: str, workflow = "PCGR", log ## add metadata information for each data source cpsr_sources_regex = r'^(gepa|cpg_other|maxwell2016|acmg_sf|dbmts|woods_dnarepair|gerp|tcga_pancan_2018|gwas_catalog)' - pcgr_sources_regex = r'^(cytoband|mitelmandb|tcga|nci|intogen|opentargets|dgidb|pubchem|cosmic_mutsigs)' + pcgr_sources_regex = r'^(cytoband|mitelmandb|tcga|nci|intogen|opentargets|depmap|treehouse|dgidb|pubchem|cosmic_mutsigs)' + sources_skip_regex = r'^(illumina|foundation_one)' for dtype in ['gene','phenotype','biomarker','drug','gwas','hotspot','other']: metadata_fname = os.path.join( - db_dir, "data", conf_data['genome_assembly'], + refdata_assembly_dir, ".METADATA", "tsv", dtype + "_metadata.tsv") if check_file_exists(metadata_fname, logger): metadata_df = pd.read_csv(metadata_fname, sep="\t", na_values=".") @@ -240,11 +218,12 @@ def populate_config_data(conf_options: dict, db_dir: str, workflow = "PCGR", log metadata_df['wflow'] = 'pcgr_cpsr' metadata_df.loc[metadata_df['source_abbreviation'].str.match(cpsr_sources_regex), 'wflow'] = 'cpsr' metadata_df.loc[metadata_df['source_abbreviation'].str.match(pcgr_sources_regex), 'wflow'] = 'pcgr' + metadata_df.loc[metadata_df['source_abbreviation'].str.match(sources_skip_regex), 'wflow'] = 'skip' metadata_pd = metadata_pd._append(metadata_df, ignore_index=True) conf_data['reference_data']['source_metadata'] = metadata_pd.to_dict(orient='records') - oncotree_fname = os.path.join(db_dir, "data", genome_assembly, + oncotree_fname = os.path.join(refdata_assembly_dir, "phenotype","tsv","phenotype_onco.tsv.gz") if check_file_exists(oncotree_fname, logger): @@ -275,8 +254,7 @@ def populate_config_data(conf_options: dict, db_dir: str, workflow = "PCGR", log conf_data['conf']['gene_panel']['panel_genes'] = set_virtual_target_genes( conf_data['conf']['gene_panel']['panel_id'], - db_dir, - conf_data['genome_assembly'], + refdata_assembly_dir, conf_data['conf']['gene_panel']['diagnostic_grade_only'], conf_data['conf']['gene_panel']['custom_list_tsv'], bool(conf_data['conf']['variant_classification']['secondary_findings']), @@ -285,16 +263,16 @@ def populate_config_data(conf_options: dict, db_dir: str, workflow = "PCGR", log return(conf_data) -def set_virtual_target_genes(panel_id: str, db_dir: str, genome_assembly: str, diagnostic_grade_only: bool, +def 
set_virtual_target_genes(panel_id: str, refdata_assembly_dir: str, diagnostic_grade_only: bool, custom_list_tsv: str, secondary_findings: bool, logger=None): all_panels_fname = os.path.join( - db_dir, "data", genome_assembly, + refdata_assembly_dir, "gene","tsv","gene_virtual_panel", "gene_virtual_panel.tsv.gz") all_cpg_fname = os.path.join( - db_dir, "data", genome_assembly, + refdata_assembly_dir, "gene","tsv","gene_cpg", "gene_cpg.tsv.gz") diff --git a/pcgr/cpsr.py b/pcgr/cpsr.py index 4d1a08e4..8fe72dec 100755 --- a/pcgr/cpsr.py +++ b/pcgr/cpsr.py @@ -15,7 +15,7 @@ from glob import glob from argparse import RawTextHelpFormatter from pcgr import pcgr_vars, arg_checker, utils, config, variant -from pcgr.utils import check_subprocess, getlogger, error_message, warn_message, remove_file +from pcgr.utils import check_subprocess, getlogger, error_message, warn_message, remove_file, random_id_generator from pcgr.config import populate_config_data from pcgr.vep import get_vep_command @@ -80,81 +80,79 @@ def get_args(): def main(): arg_dict = get_args() # check parsed arguments - arg_checker.check_args_cpsr(arg_dict) + arg_dict_verified = arg_checker.verify_args_cpsr(arg_dict) # Verify existence of input files - cpsr_paths = arg_checker.verify_input_files_cpsr(arg_dict) + + input_data = arg_checker.verify_input_files_cpsr(arg_dict_verified) + output_data = arg_checker.define_output_files(arg_dict, cpsr = True) # create configuration/options conf_options = config.create_config(arg_dict, workflow = "CPSR") ## Run CPSR workflow - run_cpsr(conf_options, cpsr_paths) + run_cpsr(conf_options, input_data, output_data) -def run_cpsr(conf_options, cpsr_paths): +def run_cpsr(conf_options, input_data, output_data): """ Main function to run the CPSR workflow """ - prefix = os.path.join( - cpsr_paths['output_dir'], f'{conf_options["sample_id"]}.cpsr.{conf_options["genome_assembly"]}') - debug = conf_options['debug'] vep_skip_intergenic_set = 'ON' if conf_options['vep']['vep_no_intergenic'] == 1 else 'OFF' - output_vcf = 'None' - output_pass_vcf = 'None' - output_pass_tsv = 'None' + #output_vcf = 'None' + #output_pass_vcf = 'None' + #output_pass_tsv = 'None' uid = '' genome_assembly = str(conf_options['genome_assembly']) input_vcf = 'None' input_customlist = 'None' output_custom_bed = 'None' + output_dir = output_data['dir'] + output_prefix = output_data['prefix'] print('') logger = getlogger('cpsr-validate-input-arguments') logger.info("CPSR - STEP 0: Validate input data") - if cpsr_paths['input_vcf_basename'] != 'NA': - input_vcf = os.path.join(cpsr_paths['input_vcf_dir'], cpsr_paths['input_vcf_basename']) - if cpsr_paths['input_customlist_basename'] != 'NA' and cpsr_paths['input_customlist_dir'] != 'NA': + if input_data['input_vcf_basename'] != 'NA': + input_vcf = os.path.join(input_data['input_vcf_dir'], input_data['input_vcf_basename']) + if input_data['input_customlist_basename'] != 'NA' and input_data['input_customlist_dir'] != 'NA': input_customlist = os.path.join( - cpsr_paths['input_customlist_dir'], cpsr_paths['input_customlist_basename']) - output_custom_bed = f'{prefix}.custom_panel.bed' + input_data['input_customlist_dir'], input_data['input_customlist_basename']) + output_custom_bed = f'{output_prefix}.custom_panel.bed' if conf_options['gene_panel']['custom_list_name'] == "None": warn_msg = "No custom list name provided, use argument '--custom_list_name' to provide name for custom defined panel" warn_message(warn_msg, logger) if not input_vcf == 'None': - refdata_dir = cpsr_paths['refdata_dir'] 
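# --- Editor's note (illustrative sketch, not part of the patch) ---------------
# The cpsr.py hunks below move the CPSR intermediate files onto the shared
# output prefix plus a random suffix (via pcgr.utils.random_id_generator),
# mirroring what run_pcgr() already does. A minimal, self-contained sketch of
# that naming pattern; the stand-in random_id_generator() here is an assumption
# and may differ from the real helper in pcgr.utils.
import random
import string

def random_id_generator(size: int = 15) -> str:
    # Random alphanumeric suffix that keeps temporary files unique per run
    return ''.join(random.choices(string.ascii_lowercase + string.digits, k=size))

def cpsr_temp_files(output_prefix: str) -> dict:
    # Intermediate file names derived from <output_dir>/<sample_id>.cpsr.<assembly>
    rid = random_id_generator(15)
    return {
        'ready_vcf': f'{output_prefix}.{rid}.ready.vcf.gz',
        'vep_vcf': f'{output_prefix}.{rid}.vep.vcf',
        'vep_vcfanno_vcf': f'{output_prefix}.{rid}.vep.vcfanno.vcf',
    }

# Example: cpsr_temp_files('/work/out/SAMPLE01.cpsr.grch38')
# -----------------------------------------------------------------------------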
- db_assembly_dir = cpsr_paths['db_assembly_dir'] - output_dir = cpsr_paths['output_dir'] - check_subprocess(logger, f'mkdir -p {output_dir}', debug) ## Define input, output and temporary file names - input_vcf_validated = f'{conf_options["sample_id"]}.cpsr_ready_target.vcf.gz' - input_vcf_validated_uncompr = f'{conf_options["sample_id"]}.cpsr_ready_target.vcf' - vep_vcf = f'{conf_options["sample_id"]}.vep.vcf' - vep_vcfanno_vcf = f'{conf_options["sample_id"]}.vep.vcfanno.vcf' - vep_vcfanno_summarised_vcf = f'{conf_options["sample_id"]}.vep.vcfanno.summarised.vcf' - vep_vcfanno_summarised_pass_vcf = f'{conf_options["sample_id"]}.vep.vcfanno.summarised.pass.vcf' - output_vcf = f'{prefix}.vcf.gz' - output_pass_vcf = f'{prefix}.pass.vcf.gz' - output_pass_vcf2tsv = f'{prefix}.pass.vcf2tsv.tsv' + random_id = random_id_generator(15) + # Define temporary output files + input_vcf_validated = f'{output_prefix}.{random_id}.ready.vcf.gz' + input_vcf_validated_uncompr = f'{output_prefix}.{random_id}.ready.vcf' + vep_vcf = f'{output_prefix}.{random_id}.vep.vcf' + vep_vcfanno_vcf = f'{output_prefix}.{random_id}.vep.vcfanno.vcf' + vep_vcfanno_summarised_vcf = f'{output_prefix}.{random_id}.vep.vcfanno.summarised.vcf' + vep_vcfanno_summarised_pass_vcf = f'{output_prefix}.{random_id}.vep.vcfanno.summarised.pass.vcf' + output_vcf = f'{output_prefix}.vcf.gz' + output_pass_vcf = f'{output_prefix}.pass.vcf.gz' + output_pass_vcf2tsv = f'{output_prefix}.pass.vcf2tsv.tsv' output_pass_vcf2tsv_gz = f'{output_pass_vcf2tsv}.gz' - output_pass_tsv = f'{prefix}.pass.tsv' + output_pass_tsv = f'{output_prefix}.pass.tsv' output_pass_tsv_gz = f'{output_pass_tsv}.gz' - yaml_fname = f'{prefix}.conf.yaml' + yaml_fname = f'{output_prefix}.conf.yaml' ## CPSR|Validate input VCF - check formatting, non-overlap with CPSR INFO tags, and whether sample contains any variants in cancer predisposition loci vcf_validate_command = ( f'cpsr_validate_input.py ' - f'{refdata_dir} ' + f'{input_data["refdata_assembly_dir"]} ' f'{input_vcf} ' f'{input_vcf_validated_uncompr} ' f'{input_customlist} ' f'{output_custom_bed} ' f'{conf_options["other"]["retained_vcf_info_tags"]} ' - f'{conf_options["genome_assembly"]} ' f'{conf_options["sample_id"]} ' f'{conf_options["gene_panel"]["panel_id"]} ' f'{conf_options["gene_panel"]["diagnostic_grade_only"]} ' @@ -185,23 +183,22 @@ def run_cpsr(conf_options, cpsr_paths): logger.info(f"Genome assembly: {conf_options['genome_assembly']}") print('----') - #conf_options['annotated_tsv'] = output_pass_tsv_gz - #conf_options['annotated_vcf'] = output_vcf conf_options['molecular_data']['fname_mut_vcf'] = output_vcf conf_options['molecular_data']['fname_mut_tsv'] = output_pass_tsv_gz conf_options['output_dir'] = output_dir + conf_options['output_prefix'] = output_prefix conf_options['gene_panel']['custom_list_bed'] = "None" if not input_customlist == 'None': conf_options['gene_panel']['custom_list_bed'] = output_custom_bed ## Write YAML configuration file - settings, path to files, reference bundle etc - yaml_data = populate_config_data(conf_options, refdata_dir, workflow = "CPSR", logger = logger) + yaml_data = populate_config_data(conf_options, input_data["refdata_assembly_dir"], workflow = "CPSR", logger = logger) with open(yaml_fname, "w") as outfile: outfile.write(yaml.dump(yaml_data)) outfile.close() ## CPSR|VEP - run Variant Effect Predictor on query VCF with LoF and NearestExonJB plugins - vep_command = get_vep_command(file_paths = cpsr_paths, + vep_command = get_vep_command(file_paths = input_data, 
conf_options = yaml_data, input_vcf = input_vcf_validated, output_vcf = vep_vcf) @@ -236,8 +233,7 @@ def run_cpsr(conf_options, cpsr_paths): f'{"--debug" if debug else ""} ' f"--dbmts --gerp --tcga --gnomad_non_cancer --gene_transcript_xref " f"--gwas --rmsk {vep_vcf}.gz {vep_vcfanno_vcf} " - f"{db_assembly_dir}" - #f"{os.path.join(refdata_dir, 'data', str(yaml_data['genome_assembly']))}" + f"{input_data['refdata_assembly_dir']}" ) check_subprocess(logger, pcgr_vcfanno_command, debug) logger.info("Finished cpsr-vcfanno") @@ -250,7 +246,7 @@ def run_cpsr(conf_options, cpsr_paths): f'pcgr_summarise.py {vep_vcfanno_vcf}.gz {vep_vcfanno_summarised_vcf} 0 ' f'{yaml_data["conf"]["vep"]["vep_regulatory"]} 0 ' f'Any {yaml_data["conf"]["vep"]["vep_pick_order"]} ' - f'{cpsr_paths["db_assembly_dir"]} --compress_output_vcf ' + f'{input_data["refdata_assembly_dir"]} --compress_output_vcf ' f'--cpsr_yaml {yaml_fname} ' f'--cpsr {"--debug" if debug else ""}' ) @@ -283,7 +279,7 @@ def run_cpsr(conf_options, cpsr_paths): logger.info("Appending ClinVar traits, official gene names, and protein domain annotations") variant_set = \ variant.append_annotations( - output_pass_vcf2tsv_gz, db_assembly_dir = cpsr_paths["db_assembly_dir"], logger = logger) + output_pass_vcf2tsv_gz, refdata_assembly_dir = input_data["refdata_assembly_dir"], logger = logger) variant_set = variant.clean_annotations(variant_set, yaml_data, germline = True, logger = logger) ## If no genotypes are available, set conf['sample_properties']['genotypes_available'] = 1 diff --git a/pcgr/expression.py b/pcgr/expression.py index cce25e10..927ed964 100644 --- a/pcgr/expression.py +++ b/pcgr/expression.py @@ -12,14 +12,14 @@ def parse_expression(expression_fname_tsv: str, sample_id: str, - db_assembly_dir: str, + refdata_assembly_dir: str, logger: logging.Logger = None): """ Parse the expression data from the user-provided file and verify record identifiers with reference data. Aggregate transcript-level TPM values to generate gene-level TPM values. Args: expression_fname_tsv (str): Path to the user-provided gene expression file. - db_assembly_dir (str): Path to the build-specific PCGR database directory. + refdata_assembly_dir (str): Path to the build-specific PCGR reference data directory. logger (logging.Logger, optional): Logger. Defaults to None. 
""" ## Check that the expression file exists @@ -38,7 +38,7 @@ def parse_expression(expression_fname_tsv: str, ## gene symbols/aliases and identifiers (Ensembl) to Entrez gene identifers and ## biotype gene_index_fname_tsv = \ - os.path.join(db_assembly_dir, "gene", "tsv", "gene_transcript_xref", "gene_index.tsv.gz") + os.path.join(refdata_assembly_dir, "gene", "tsv", "gene_transcript_xref", "gene_index.tsv.gz") check_file_exists(gene_index_fname_tsv, logger) gene_index = pd.read_csv(gene_index_fname_tsv, sep = "\t", na_values = ".", low_memory = False) @@ -127,6 +127,12 @@ def parse_expression(expression_fname_tsv: str, ['ENSEMBL_GENE_ID','SYMBOL','ENTREZGENE','GENENAME','BIOTYPE']).agg({'TPM':'sum'}).reset_index() expression_map['gene'].columns = ['ENSEMBL_GENE_ID','SYMBOL','ENTREZGENE','GENENAME','BIOTYPE','TPM_GENE'] expression_map['gene'] = expression_map['gene'].drop_duplicates().sort_values(by='TPM_GENE', ascending=False) + + ## for transcripts with missing TPM's, use the minimum TPM value across all transcripts + expression_map['gene_min_tpm'] = transcript_expression_map.groupby( + ['ENSEMBL_GENE_ID','SYMBOL','ENTREZGENE','GENENAME','BIOTYPE']).agg({'TPM':'min'}).reset_index() + expression_map['gene_min_tpm'].columns = ['ENSEMBL_GENE_ID','SYMBOL','ENTREZGENE','GENENAME','BIOTYPE','TPM_MIN'] + expression_map['gene_min_tpm'] = expression_map['gene_min_tpm'].drop_duplicates().sort_values(by='TPM_MIN', ascending=False) return(expression_map) @@ -145,17 +151,22 @@ def integrate_variant_expression(variant_set: pd.DataFrame, """ if not expression_data is None: - if 'gene' in expression_data.keys(): - if not expression_data['gene'] is None: - logger.info("Integrating gene-level expression data from tumor into somatic variant set") - if expression_data['gene'].empty: - logger.warn('Expression file does not contain any gene-level expression data') - else: - if 'TPM_GENE' in variant_set.columns: - variant_set.drop('TPM_GENE', inplace=True, axis=1) - if {'TPM_GENE','ENSEMBL_GENE_ID'}.issubset(expression_data['gene'].columns) and \ - 'ENSEMBL_GENE_ID' in variant_set.columns: - variant_set = variant_set.merge(expression_data['gene'], on = 'ENSEMBL_GENE_ID', how = 'left') + for s in ['gene','gene_min_tpm']: + if s in expression_data.keys(): + if not expression_data[s] is None: + logger.info("Integrating gene-level expression data from tumor into somatic variant set") + if expression_data[s].empty: + logger.warn('Expression file does not contain any gene-level expression data') + else: + v = 'TPM_MIN' + if s == 'gene': + v = 'TPM_GENE' + if v in variant_set.columns: + variant_set.drop(v, inplace=True, axis=1) + if {v,'ENSEMBL_GENE_ID'}.issubset(expression_data[s].columns) and \ + 'ENSEMBL_GENE_ID' in variant_set.columns: + exp_gene = expression_data[s][[v,'ENSEMBL_GENE_ID']] + variant_set = variant_set.merge(exp_gene, on = 'ENSEMBL_GENE_ID', how = 'left') if 'transcript' in expression_data.keys() and 'transcript_identifier' in expression_data.keys(): if not expression_data['transcript'] is None: @@ -187,6 +198,7 @@ def integrate_variant_expression(variant_set: pd.DataFrame, if 'TPM' in variant_set.columns: variant_set.drop('TPM', inplace=True, axis=1) if {'TPM','VAR_ID','ENSEMBL_TRANSCRIPT_ID'}.issubset(var2exp.columns): + var2exp = var2exp[['TPM','VAR_ID','ENSEMBL_TRANSCRIPT_ID']] variant_set = variant_set.merge(var2exp, on = ['VAR_ID','ENSEMBL_TRANSCRIPT_ID'], how = 'left') else: variant_set['TPM'] = np.nan @@ -196,7 +208,7 @@ def integrate_variant_expression(variant_set: pd.DataFrame, def 
correlate_sample_expression(sample_expression_data: dict, yaml_data: dict, - db_assembly_dir: str, + refdata_assembly_dir: str, protein_coding_only: bool = True, logger: logging.Logger = None) -> pd.DataFrame: @@ -229,7 +241,7 @@ def correlate_sample_expression(sample_expression_data: dict, if 'tcga' in yaml_data['conf']['gene_expression']['similarity_db'].keys(): for cohort in yaml_data['conf']['gene_expression']['similarity_db']['tcga'].keys(): - exp_fname = os.path.join(db_assembly_dir, "expression", "tsv", + exp_fname = os.path.join(refdata_assembly_dir, "expression", "tsv", "tcga", "tcga_" + str(cohort).lower() + "_tpm.tsv.gz") if check_file_exists(exp_fname, strict = False, logger = logger): sample_comp_exp_mat = pd.read_csv( @@ -245,8 +257,8 @@ def correlate_sample_expression(sample_expression_data: dict, corr_mat = correlate_samples(sample_expression_data, sample_id, 'tcga-' + str(cohort).lower(), protein_coding_only, sample_comp_exp_mat, logger) ## Make sure sample IDs match patient barcodes (sample metadata is organized per patient) - corr_mat['EXT_SAMPLE_ID'] = corr_mat['EXT_SAMPLE_ID'].replace(regex=r'-[0-9][0-9][A-Z]$', value='') - metadata_fname = os.path.join(db_assembly_dir, "expression", "tsv", + #corr_mat['EXT_SAMPLE_ID'] = corr_mat['EXT_SAMPLE_ID'].replace(regex=r'-[0-9][0-9][A-Z]$', value='') + metadata_fname = os.path.join(refdata_assembly_dir, "expression", "tsv", 'tcga', "tcga_" + str(cohort).lower() + "_sample_metadata.tsv.gz") if check_file_exists(metadata_fname, strict = False, logger = logger): sample_metadata = pd.read_csv( @@ -266,7 +278,7 @@ def correlate_sample_expression(sample_expression_data: dict, if source == 'treehouse': source_verbose = 'samples from the Treehouse Childhood Cancer Initiative' if source in yaml_data['conf']['gene_expression']['similarity_db'].keys(): - exp_fname = os.path.join(db_assembly_dir, "expression", "tsv", + exp_fname = os.path.join(refdata_assembly_dir, "expression", "tsv", source, source + "_tpm.tsv.gz") if check_file_exists(exp_fname, strict = False, logger = logger): sample_comp_exp_mat = pd.read_csv( @@ -286,7 +298,7 @@ def correlate_sample_expression(sample_expression_data: dict, if 'CORRELATION' in sample_exp_similarity[source].columns: sample_exp_similarity[source] = \ sample_exp_similarity[source].sort_values(by=['CORRELATION'], ascending=False) - metadata_fname = os.path.join(db_assembly_dir, "expression", "tsv", + metadata_fname = os.path.join(refdata_assembly_dir, "expression", "tsv", source, source + "_sample_metadata.tsv.gz") if check_file_exists(metadata_fname, strict = False, logger = logger): sample_metadata = pd.read_csv( @@ -294,7 +306,8 @@ def correlate_sample_expression(sample_expression_data: dict, sample_exp_similarity[source] = sample_exp_similarity[source].merge( sample_metadata, on = ['EXT_SAMPLE_ID'], how = 'left') - + #else: + #print("BALLE") return(sample_exp_similarity) diff --git a/pcgr/main.py b/pcgr/main.py index 7ab94c7b..1414217a 100755 --- a/pcgr/main.py +++ b/pcgr/main.py @@ -42,7 +42,7 @@ def cli(): optional_allelic_support = parser.add_argument_group("Allelic support options") optional_other = parser.add_argument_group("Other options") - optional_rna.add_argument("--input_rna_fusion", dest = "input_rna_fusion", help = "File with RNA fusion transcripts detected in tumor (tab-separated values)") + #optional_rna.add_argument("--input_rna_fusion", dest = "input_rna_fusion", help = "File with RNA fusion transcripts detected in tumor (tab-separated values)") 
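# --- Editor's note (illustrative sketch, not part of the patch) ---------------
# The expression.py hunks above derive gene-level values from transcript-level
# TPMs: a per-gene sum (TPM_GENE) and, newly in this patch, a per-gene minimum
# (TPM_MIN in the 'gene_min_tpm' map). A minimal pandas sketch of that
# aggregation; column names follow the patch, the toy data is illustrative only.
import pandas as pd

transcripts = pd.DataFrame({
    'ENSEMBL_GENE_ID': ['ENSG01', 'ENSG01', 'ENSG02'],
    'SYMBOL': ['GENE_A', 'GENE_A', 'GENE_B'],
    'TPM': [5.0, 2.5, 10.0],
})

# Sum of transcript TPMs per gene (analogous to expression_map['gene'] / TPM_GENE)
gene_tpm = (transcripts
            .groupby(['ENSEMBL_GENE_ID', 'SYMBOL'])
            .agg({'TPM': 'sum'})
            .reset_index()
            .rename(columns={'TPM': 'TPM_GENE'}))

# Minimum transcript TPM per gene (analogous to the new 'gene_min_tpm' map / TPM_MIN)
gene_min_tpm = (transcripts
                .groupby(['ENSEMBL_GENE_ID', 'SYMBOL'])
                .agg({'TPM': 'min'})
                .reset_index()
                .rename(columns={'TPM': 'TPM_MIN'}))
# -----------------------------------------------------------------------------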
optional_rna.add_argument("--input_rna_expression", dest = "input_rna_exp", help = "File with bulk RNA expression counts (TPM) of transcripts in tumor (tab-separated values)") optional_rna.add_argument('--expression_sim', action='store_true', help="Compare expression profile of tumor sample to known expression profiles (default: %(default)s)") optional_rna.add_argument("--expression_sim_db", dest = "expression_sim_db", default="tcga,depmap,treehouse", help=f"Comma-separated string " + \ @@ -88,7 +88,7 @@ def cli(): optional_other.add_argument("--cpsr_report", dest="cpsr_report", help="CPSR report file (Gzipped JSON - file ending with 'cpsr..json.gz' - germline report of patient's blood/control sample") optional_other.add_argument("--vcf2maf", action="store_true", help="Generate a MAF file for input VCF using https://github.com/mskcc/vcf2maf (default: %(default)s)") optional_other.add_argument("--ignore_noncoding", action="store_true", help="Ignore non-coding (i.e. non protein-altering) variants in report, default: %(default)s") - optional_other.add_argument("--include_trials", action="store_true", help="(Beta) Include relevant ongoing or future clinical trials, focusing on studies with molecularly targeted interventions") + #optional_other.add_argument("--include_trials", action="store_true", help="(Beta) Include relevant ongoing or future clinical trials, focusing on studies with molecularly targeted interventions") optional_other.add_argument("--retained_info_tags", dest="retained_info_tags", default="None", help="Comma-separated string of VCF INFO tags from query VCF that should be kept in PCGR output TSV file") optional_other.add_argument("--force_overwrite", action="store_true", help="By default, the script will fail with an error if any output file already exists. 
You can force the overwrite of existing result files by using this flag, default: %(default)s") optional_other.add_argument("--version", action="version", version="%(prog)s " + str(pcgr_vars.PCGR_VERSION)) @@ -134,17 +134,17 @@ def cli(): args = parser.parse_args() arg_dict = vars(args) - # check parsed arguments - arg_checker.check_args(arg_dict) + # verify parsed arguments + arg_dict_verified = arg_checker.verify_args(arg_dict) # create config options - conf_options = create_config(arg_dict, workflow = "PCGR") + conf_options = create_config(arg_dict_verified, workflow = "PCGR") # Verify existence of input files, define and check existence of output files - input_data = arg_checker.verify_input_files(arg_dict) - output_data = arg_checker.define_output_files(arg_dict) + input_data = arg_checker.verify_input_files(arg_dict_verified) + output_data = arg_checker.define_output_files(arg_dict_verified) - # Run PCGR workflow (vep, vcfanno, summarise, vcf2tsvpy, html) + # Run PCGR workflow run_pcgr(input_data, output_data, conf_options) def run_pcgr(input_data, output_data,conf_options): @@ -155,7 +155,7 @@ def run_pcgr(input_data, output_data,conf_options): debug = conf_options['debug'] # set basic run commands - clinical_trials_set = 'ON' if conf_options['clinicaltrials']['run'] else 'OFF' + #clinical_trials_set = 'ON' if conf_options['clinicaltrials']['run'] else 'OFF' msi_prediction_set = 'ON' if conf_options['somatic_snv']['msi']['run'] else 'OFF' msig_estimation_set = 'ON' if conf_options['somatic_snv']['mutational_signatures']['run'] else 'OFF' tmb_estimation_set = 'ON' if conf_options['somatic_snv']['tmb']['run'] else 'OFF' @@ -201,11 +201,8 @@ def run_pcgr(input_data, output_data,conf_options): pon_vcf = os.path.join(input_data['pon_vcf_dir'], input_data['pon_vcf_basename']) if not input_vcf == 'None': - refdata_dir = input_data['refdata_dir'] output_dir = output_data['dir'] - output_prefix = output_data['prefix'] - vep_dir = input_data['vep_dir'] - + output_prefix = output_data['prefix'] check_subprocess(logger, f'mkdir -p {output_dir}', debug) random_id = random_id_generator(15) @@ -213,7 +210,7 @@ def run_pcgr(input_data, output_data,conf_options): input_vcf_validated = f'{output_prefix}.{random_id}.ready.vcf.gz' input_vcf_validated_uncompr = f'{output_prefix}.{random_id}.ready.vcf' vep_vcf = f'{output_prefix}.{random_id}.vep.vcf' - vep_vcfanno_vcf = f'{output_prefix}.{random_id}vep.vcfanno.vcf' + vep_vcfanno_vcf = f'{output_prefix}.{random_id}.vep.vcfanno.vcf' vep_vcfanno_summarised_vcf = f'{output_prefix}.{random_id}.vep.vcfanno.summarised.vcf' vep_vcfanno_summarised_pass_vcf = f'{output_prefix}.{random_id}.vep.vcfanno.summarised.pass.vcf' output_vcf = f'{output_prefix}.vcf.gz' @@ -236,7 +233,7 @@ def run_pcgr(input_data, output_data,conf_options): vcf_validate_command = ( f'pcgr_validate_input.py ' - f'{refdata_dir} ' + f'{input_data["refdata_assembly_dir"]} ' f'{input_vcf} ' f'{input_vcf_validated_uncompr} ' f'{input_cna} ' @@ -244,7 +241,6 @@ def run_pcgr(input_data, output_data,conf_options): f'{input_rna_expression} ' f'{pon_vcf} ' f'{conf_options["assay_properties"]["vcf_tumor_only"]} ' - f'{conf_options["genome_assembly"]} ' f'{conf_options["sample_id"]} ' f'{conf_options["other"]["retained_vcf_info_tags"]} ' f'{conf_options["somatic_snv"]["allelic_support"]["tumor_dp_tag"]} ' @@ -293,7 +289,7 @@ def run_pcgr(input_data, output_data,conf_options): logger.info(f'MSI classification: {msi_prediction_set}') logger.info(f'Mutational burden estimation: {tmb_estimation_set}') 
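# --- Editor's note (illustrative sketch, not part of the patch) ---------------
# In the run_pcgr()/run_cpsr() hunks below, both workflows now record the
# output prefix and the assembly-specific reference-data path in the run YAML
# before the report step. A reduced sketch of that pattern; this stand-in
# populate_config_data() is a simplification of pcgr.config.populate_config_data,
# which additionally attaches reference-data metadata and gene-panel settings.
import yaml

def populate_config_data(conf_options: dict, refdata_assembly_dir: str, workflow: str = "PCGR") -> dict:
    # Copy key run settings and point the config at the assembly-specific refdata dir
    return {
        'sample_id': conf_options['sample_id'],
        'workflow': workflow,
        'output_prefix': conf_options['output_prefix'],
        'reference_data': {'path': refdata_assembly_dir},
    }

conf_options = {'sample_id': 'SAMPLE01',
                'output_prefix': '/work/out/SAMPLE01.pcgr.grch38'}
yaml_data = populate_config_data(conf_options, '/refdata/data/grch38')
# The real workflow writes this to f"{output_prefix}.conf.yaml"; printed here for brevity
print(yaml.dump(yaml_data))
# -----------------------------------------------------------------------------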
logger.info(f'RNA expression similarity analysis: {rnaseq_sim_analysis_set}') - logger.info(f'Include molecularly targeted clinical trials (beta): {clinical_trials_set}') + #logger.info(f'Include molecularly targeted clinical trials (beta): {clinical_trials_set}') # PCGR|Generate YAML file - containing configuration options and paths to annotated molecular profile datasets # - VCF/TSV files (SNVs/InDels) @@ -305,6 +301,7 @@ def run_pcgr(input_data, output_data,conf_options): # update conf_options with paths to output files conf_options['output_dir'] = output_dir + conf_options['output_prefix'] = output_prefix conf_options['molecular_data']['fname_mut_vcf'] = output_vcf conf_options['molecular_data']['fname_mut_tsv'] = output_pass_tsv_gz if conf_options['somatic_snv']['tmb']['run'] == 1: @@ -318,7 +315,8 @@ def run_pcgr(input_data, output_data,conf_options): conf_options['molecular_data']['fname_expression_sim_' + source] = f'{output_prefix}.exp_sim_{source}.tsv.gz' # make YAML file - yaml_data = populate_config_data(conf_options, refdata_dir, workflow = "PCGR", logger = logger) + yaml_data = populate_config_data(conf_options, input_data["refdata_assembly_dir"], + workflow = "PCGR", logger = logger) genome_assembly = yaml_data['genome_assembly'] vep_command = get_vep_command(file_paths = input_data, @@ -392,7 +390,7 @@ def run_pcgr(input_data, output_data,conf_options): # PCGR|vcfanno - annotate VCF against a number of variant annotation tracks (BED/VCF) logger = getlogger("pcgr-vcfanno") pcgr_vcfanno_command = ( - f'pcgr_vcfanno.py {vep_vcf}.gz {vep_vcfanno_vcf} {input_data["db_assembly_dir"]} ' + f'pcgr_vcfanno.py {vep_vcf}.gz {vep_vcfanno_vcf} {input_data["refdata_assembly_dir"]} ' f'--num_processes {conf_options["other"]["vcfanno_n_proc"]} ' f'--dbnsfp --clinvar --rmsk --winmsk --simplerepeat ' f'--tcga --gene_transcript_xref --dbmts --gwas ' @@ -400,7 +398,7 @@ def run_pcgr(input_data, output_data,conf_options): ) vcfanno_db_src_msg1 = ( f"Annotation sources (vcfanno): {'Panel-of-Normals, ' if pon_vcf != 'None' else ''}ClinVar, dbNSFP, " - f"dbMTS, cancerhotspots.org, DoCM, TCGA, GWAS catalog" + f"dbMTS, TCGA, GWAS catalog" ) vcfanno_db_src_msg2 = \ f"Annotation sources (vcfanno): RepeatMasker, SimpleRepeats, WindowMaskerSDust, gnomAD non-cancer subset" @@ -420,7 +418,7 @@ def run_pcgr(input_data, output_data,conf_options): f'pcgr_summarise.py {vep_vcfanno_vcf}.gz {vep_vcfanno_summarised_vcf} {pon_annotation} ' f'{yaml_data["conf"]["vep"]["vep_regulatory"]} {oncogenicity_annotation} ' f'{yaml_data["conf"]["sample_properties"]["site2"]} {yaml_data["conf"]["vep"]["vep_pick_order"]} ' - f'{input_data["db_assembly_dir"]} --compress_output_vcf ' + f'{input_data["refdata_assembly_dir"]} --compress_output_vcf ' f'{"--debug" if debug else ""}' ) summarise_db_src_msg1 = \ @@ -466,7 +464,7 @@ def run_pcgr(input_data, output_data,conf_options): logger.info("Appending ClinVar traits, official gene names, and protein domain annotations") variant_set = \ append_annotations( - output_pass_vcf2tsv_gz, db_assembly_dir = input_data["db_assembly_dir"], logger = logger) + output_pass_vcf2tsv_gz, refdata_assembly_dir = input_data["refdata_assembly_dir"], logger = logger) variant_set = set_allelic_support(variant_set, allelic_support_tags = yaml_data["conf"]['somatic_snv']['allelic_support']) variant_set = clean_annotations(variant_set, yaml_data, germline = False, logger = logger) @@ -484,7 +482,7 @@ def run_pcgr(input_data, output_data,conf_options): ## Parse expression data, verify identifiers 
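        ## NOTE (assumption, inferred from the --input_rna_expression help text and the calls below):
        ## parse_expression() is expected to read the tab-separated file of per-transcript TPM counts
        ## for the tumor sample and to verify transcript identifiers against the gene/transcript
        ## cross-reference tables under refdata_assembly_dir before the downstream similarity analysis.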
expression_data = parse_expression( input_rna_expression, yaml_data["sample_id"], - input_data["db_assembly_dir"], logger = logger) + input_data["refdata_assembly_dir"], logger = logger) ## Write transcript-level expression data to TSV if 'transcript' in expression_data.keys(): if not expression_data['transcript'] is None: @@ -563,17 +561,17 @@ def run_pcgr(input_data, output_data,conf_options): if yaml_data['conf']['gene_expression']['similarity_analysis'] == 1 and expression_data is not None: logger = getlogger("pcgr-expression-similarity") logger.info('PCGR - STEP 4: Gene expression (RNA) similarity analysis') - + + ## Correlate sample expression with reference samples exp_similarity_results = correlate_sample_expression( expression_data, yaml_data, - input_data["db_assembly_dir"], + input_data["refdata_assembly_dir"], protein_coding_only = True, logger = logger) for source in exp_similarity_results.keys(): exp_similarity_results[source].fillna('.').to_csv( yaml_data['molecular_data']['fname_expression_sim_' + source], - #f'{output_prefix}.exp_sim_{source}.tsv.gz', sep = "\t", index = False) print('----') else: @@ -590,7 +588,7 @@ def run_pcgr(input_data, output_data,conf_options): cna_segment_file = input_cna, build = yaml_data['genome_assembly'], sample_id = yaml_data['sample_id'], - db_assembly_dir = input_data['db_assembly_dir'], + refdata_assembly_dir = input_data['refdata_assembly_dir'], n_copy_amplifications = yaml_data["conf"]['somatic_cna']['n_copy_gain'], overlap_fraction = 0.5, expression_data = expression_data, diff --git a/pcgr/variant.py b/pcgr/variant.py index bd480c2f..8595d080 100644 --- a/pcgr/variant.py +++ b/pcgr/variant.py @@ -10,49 +10,11 @@ warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning) -# def set_genotype(variant_set: pd.DataFrame, logger) -> pd.DataFrame: -# """ -# Set verbose genotype (homozygous, heterozygous) for each variant -# """ -# variant_set['GENOTYPE'] = '.' 
-# if {'GT'}.issubset(variant_set.columns): -# logger.info("Assignment of genotype (homozygous, heterozygous) for each variant based on 'GT' tag") -# heterozygous_states = [] -# ref_allele_index = 0 -# while ref_allele_index < 20: -# alt_allele_index = ref_allele_index + 1 -# while alt_allele_index <= 20: -# phased_gt_1 = str(ref_allele_index) + "|" + str(alt_allele_index) -# phased_gt_2 = str(alt_allele_index) + "|" + str(ref_allele_index) -# unphased_gt_1 = str(ref_allele_index) + "/" + str(alt_allele_index) -# unphased_gt_2 = str(alt_allele_index) + "/" + str(ref_allele_index) -# gts = [phased_gt_1, phased_gt_2, unphased_gt_1, unphased_gt_2] -# heterozygous_states.extend(gts) - -# alt_allele_index = alt_allele_index + 1 -# ref_allele_index = ref_allele_index + 1 - -# homozygous_states = [] -# hom_allele_index = 1 -# while hom_allele_index <= 20: -# phased_gt = str(hom_allele_index) + "|" + str(hom_allele_index) -# unphased_gt = str(hom_allele_index) + "/" + str(hom_allele_index) -# homozygous_states.extend([phased_gt, unphased_gt]) -# hom_allele_index = hom_allele_index + 1 - -# variant_set.loc[variant_set['GT'].isin(homozygous_states), "GENOTYPE"] = "homozygous" -# variant_set.loc[variant_set['GT'].isin(heterozygous_states),"GENOTYPE"] = "heterozygous" -# else: -# variant_set['GENOTYPE'] = "undefined" - -# variant_set = variant_set.astype({'GENOTYPE':'string'}) -# return(variant_set) - -def append_annotations(vcf2tsv_gz_fname: str, db_assembly_dir: str, logger): +def append_annotations(vcf2tsv_gz_fname: str, refdata_assembly_dir: str, logger): - clinvar_tsv_fname = os.path.join(db_assembly_dir, 'variant','tsv','clinvar', 'clinvar.tsv.gz') - protein_domain_tsv_fname = os.path.join(db_assembly_dir, 'misc','tsv','protein_domain', 'protein_domain.tsv.gz') - gene_xref_tsv_fname = os.path.join(db_assembly_dir, 'gene','tsv','gene_transcript_xref', 'gene_transcript_xref.tsv.gz') + clinvar_tsv_fname = os.path.join(refdata_assembly_dir, 'variant','tsv','clinvar', 'clinvar.tsv.gz') + protein_domain_tsv_fname = os.path.join(refdata_assembly_dir, 'misc','tsv','protein_domain', 'protein_domain.tsv.gz') + gene_xref_tsv_fname = os.path.join(refdata_assembly_dir, 'gene','tsv','gene_transcript_xref', 'gene_transcript_xref.tsv.gz') vcf2tsv_df = None clinvar_data_df = None diff --git a/pcgr/vcf.py b/pcgr/vcf.py index 80266699..c6b7f31b 100644 --- a/pcgr/vcf.py +++ b/pcgr/vcf.py @@ -45,15 +45,15 @@ def detect_reserved_info_tag(tag, tag_name, logger): def check_retained_vcf_info_tags(vcf: VCF, retained_info_tags: str, logger: logging.Logger) -> int: """ - Function that compares the INFO tags in the query VCF and retained INFO tags set by the user as retained for output - If any retained tag is not in query VCF, an error will be returned + Function that compares the INFO tags in the input VCF and retained INFO tags set by the user as retained for output + If any retained tag is not in input VCF, an error will be returned """ tags = str(retained_info_tags).split(',') info_elements_query_vcf = [] #vcf = VCF(input_vcf) - logger.info('Checking if existing INFO tags of query VCF file matches retained INFO tags set by the user') + logger.info('Checking if existing INFO tags of input VCF file matches retained INFO tags set by the user') ret = 1 for e in vcf.header_iter(): header_element = e.info() @@ -63,10 +63,10 @@ def check_retained_vcf_info_tags(vcf: VCF, retained_info_tags: str, logger: logg for t in tags: if not t in info_elements_query_vcf: - err_msg = "Retained INFO tag '" + str(t) + "' not found among INFO 
tags in query VCF - make sure retained VCF INFO tags are set correctly" + err_msg = "Retained INFO tag '" + str(t) + "' not found among INFO tags in input VCF - make sure retained VCF INFO tags are set correctly" error_message(err_msg, logger) else: - logger.info("Retained INFO tag '" + str(t) + "' detected among INFO tags in query VCF") + logger.info("Retained INFO tag '" + str(t) + "' detected among INFO tags in input VCF") return ret @@ -75,22 +75,22 @@ def check_retained_vcf_info_tags(vcf: VCF, retained_info_tags: str, logger: logg def check_existing_vcf_info_tags(vcf: VCF, populated_infotags: dict, logger) -> int: """ - Function that compares the INFO tags in the query VCF and the INFO tags generated by PCGR/CPSR + Function that compares the INFO tags in the input VCF and the INFO tags generated by PCGR/CPSR If any coinciding tags, an error will be returned """ - logger.info('Checking if existing INFO tags of query VCF file coincide with PCGR/CPSR INFO tags') + logger.info('Checking if existing INFO tags of input VCF file coincide with PCGR/CPSR INFO tags') ret = 1 for e in vcf.header_iter(): header_element = e.info() if 'ID' in header_element.keys() and 'HeaderType' in header_element.keys(): if header_element['HeaderType'] == 'INFO': if header_element['ID'] in populated_infotags.keys(): - err_msg = 'INFO tag ' + str(header_element['ID']) + ' in the query VCF coincides with a VCF ' + \ - 'annotation tag produced by PCGR/CPSR - please remove or rename this tag in your query VCF' + err_msg = 'INFO tag ' + str(header_element['ID']) + ' in the input VCF coincides with a VCF ' + \ + 'annotation tag produced by PCGR/CPSR - please remove or rename this tag in your input VCF' return error_message(err_msg, logger) - logger.info('No query VCF INFO tags coincide with populated INFO tags by PCGR/CPSR') + logger.info('No input VCF INFO tags coincide with populated INFO tags by PCGR/CPSR') return ret diff --git a/pcgr/vep.py b/pcgr/vep.py index 4d2c307f..a4dfd579 100644 --- a/pcgr/vep.py +++ b/pcgr/vep.py @@ -17,13 +17,16 @@ def get_vep_command(file_paths, conf_options, input_vcf, output_vcf, debug = Fal vep_dir = file_paths['vep_dir'] fasta_assembly = os.path.join( - file_paths['db_assembly_dir'], + file_paths['refdata_assembly_dir'], 'misc','fasta','assembly', f'Homo_sapiens.{pcgr_vars.VEP_ASSEMBLY[genome_assembly]}.dna.primary_assembly.fa.gz') ancestor_assembly = os.path.join( - file_paths['db_assembly_dir'], + file_paths['refdata_assembly_dir'], 'misc','fasta','ancestor', f'human_ancestor.fa.gz') + + utils.check_file_exists(fasta_assembly, logger = None) + utils.check_file_exists(ancestor_assembly, logger = None) plugins_in_use = "NearestExonJB, LoF" diff --git a/pcgrr/R/input_data.R b/pcgrr/R/input_data.R index ef68a77e..341cf764 100644 --- a/pcgrr/R/input_data.R +++ b/pcgrr/R/input_data.R @@ -791,3 +791,24 @@ load_dna_variants <- function( return(results) } + +load_expression_sim <- function(ref_data = NULL, + settings = NULL){ + + ## Load expression similarity results for input sample + ## against reference collections + expression_sim <- list() + + if(!is.null(settings$conf$gene_expression$similarity_db)){ + for(db in names(settings$conf$gene_expression$similarity_db)){ + fname_db <- paste0("fname_expression_sim_",db) + if(!is.null(settings$molecular_data[[fname_db]]) & + file.exists(settings$molecular_data[[fname_db]])){ + expression_sim[[db]] <- readr::read_tsv( + settings$molecular_data[[fname_db]], + show_col_types = F, na = ".") + } + } + } + return(expression_sim) +} diff --git 
a/pcgrr/R/main.R b/pcgrr/R/main.R index 211e69d6..420cf995 100644 --- a/pcgrr/R/main.R +++ b/pcgrr/R/main.R @@ -54,7 +54,7 @@ generate_report <- ## Retrieve relevant clinical trials for the tumor type in question - if (as.logical(settings$conf$clinicaltrials$run) == T) { + #if (as.logical(settings$conf$clinicaltrials$run) == T) { # pcg_report_trials <- # pcgrr::generate_report_data_trials( # ref_data = ref_data, @@ -63,7 +63,7 @@ generate_report <- # pcg_report <- # pcgrr::update_report(pcg_report, pcg_report_trials, # a_elem = "clinicaltrials") - } + #} if (NROW(callset_snv$variant) > 0) { @@ -514,9 +514,9 @@ write_report <- function(report, if (!is.null(report_strip$content$tmb)) { report_strip$content$tmb$tcga_tmb <- NULL } - if (!is.null(report_strip$content$clinicaltrials)) { - report_strip$content$clinicaltrials <- NULL - } + #if (!is.null(report_strip$content$clinicaltrials)) { + # report_strip$content$clinicaltrials <- NULL + #} if (!is.null(report_strip$content$msi)) { if (!is.null(report_strip$content$msi$prediction)) { report_strip$content$msi$prediction$tcga_dataset <- NULL @@ -775,17 +775,11 @@ write_report_quarto_html <- function(report = NULL){ settings <- report[['settings']] output_dir <- settings[['output_dir']] - sample_name <- settings[['sample_id']] - genome_assembly <- settings[['genome_assembly']] - - sample_fname_pattern <- - paste(sample_name, 'pcgr_acmg', genome_assembly, sep = ".") + output_prefix <- settings[['output_prefix']] output_format <- "html" fnames <- list() - fnames[["html"]] <- - file.path(output_dir, - paste0(sample_fname_pattern, ".html")) + fnames[["html"]] <- paste0(output_prefix, ".html") ## Path to PCGR reporting templates pcgr_rep_template_path <- @@ -836,7 +830,7 @@ write_report_quarto_html <- function(report = NULL){ replacement = rds_report_path) |> stringr::str_replace( pattern = "", - replacement = sample_name + replacement = settings[['sample_id']] ) |> stringr::str_replace( pattern = "", @@ -886,7 +880,11 @@ write_report_quarto_html <- function(report = NULL){ } -write_report_excel <- function(){ +#' Function that writes contents of PCGR report object to an Excel workbook +#' +#' @param report List object with all report data, settings etc.
+#' @export +write_report_excel <- function(report = NULL){ fname_xlsx <- "" diff --git a/pcgrr/R/reference_data.R b/pcgrr/R/reference_data.R index 78a653f5..0908f24a 100644 --- a/pcgrr/R/reference_data.R +++ b/pcgrr/R/reference_data.R @@ -654,7 +654,8 @@ load_reference_data <- function( "woods_dnarepair|gerp|tcga_pancan_2018|gwas_catalog)")) ~ "cpsr", stringr::str_detect( .data$source_abbreviation, - "^(cytoband|mitelmandb|tcga|nci|intogen|opentargets|dgidb|pubchem)$") ~ "pcgr", + paste0("^(cytoband|mitelmandb|tcga|nci|intogen|depmap|treehouse", + "|opentargets|dgidb|pubchem)$")) ~ "pcgr", TRUE ~ as.character("pcgr_cpsr") )) diff --git a/pcgrr/R/report.R b/pcgrr/R/report.R index ab5d9ad5..b88e3754 100644 --- a/pcgrr/R/report.R +++ b/pcgrr/R/report.R @@ -90,9 +90,9 @@ init_report <- function(yaml_fname = NULL, "rainfall", "kataegis", "expression", - "predisposition", + "predisposition")){ #"report_display_config", - "clinicaltrials")) { + #"clinicaltrials")) { report[["content"]][[a_elem]] <- list() report[["content"]][[a_elem]][["eval"]] <- FALSE diff --git a/pcgrr/data/data_coltype_defs.rda b/pcgrr/data/data_coltype_defs.rda index b25e5f8e310ecd4ca70581eea8271450c56e011c..ce1c2438db00683a4bc6093c9d904be79d6a55f4 100644 GIT binary patch literal 1922 zcmV-|2YvWLT4*^jL0KkKS+~*=k^mp=Z=nDG|GSU?KmY&mzwh6m2mk;9;0b?xK4EQC zIsgFBbN~YwSc6kaJwRv&lr#VU4F*Pl0005NFpMUch5-P`0fcFjCJ}&!hXBGbnqn9P z10V(wrc9Vd0vZW4sG!vSQKo>zG#LOI28N9Q27^yT2%2Gy#_O2Cy#Y9y+XOR&Yp-6q$c#rSw#n)Y$7j^|xlVA*N9?v&Sc zGuelA;@R0>h3>}FHBU**G28C095B^yPJiFK46^FEhXhAI%Qjf!2zh4F&!&oBC=pjT z?*3Pt8k+Cl^85PoGUD2qWrmts*0r|VZMNHO(9NV`6b-2kT0&0vLJ)|mB0zwLMo~c; zKoo^lK$ujK1w=@P)ZuNJxP}Q56(92L{REXQAn;MDco>@Uy={9)Iou^sXGk#ZcSyAG?orZpK;1x%jDmX#uI>{C-T!IO&k$xmlQmt>hnS6>y>-DTNJE{w)GKb*9; zoT+Ggi`cuijpmg3rhCp%Tt_(OhDf8p&eJ)={P;-wY-EW-a=@DmtW1w%ht~54tX)|90(JJ#BFnUsr9~*Jc zSp6QqTPIgPJ387MIku3?lxn-{U5!Z1uQ{~B>qXnSWVUo~g3z&3t)ty_wI32Sm)4@!>52?m)BSwvE?teqg9uXvRZFZZf9K@SmS<(&{?B4k>lm~4 zF2S;Bn4Rf|7A%yejqj(gR(E!L)5`w_4YN&*T+RvoOR!IBalH3s^)7QNq@11pr41?B z+S3jUk?vJes=2mGl5)fF#mUj;{AcQH(~8aJt8|jho)Svt!f9Bn_${82p2E!}Xy_V6 zG(^H+#+2l1Mk|RijACJhF&P(!ZVv~V?*osDar)&GIVUeaAC4?h);F?Ep|h(7ShU4= zM_Dsj^gFR%TY8k;Jxx=lY*Bfoht6DPr-QxWd?~?$^rk%~)lTBp?S&FY zESA5mOfIph-QjyBTK6up-OMj~v-^xZ7&P%&=HmS=+a+XrDcmyodN+KfI48zbi?+u% za`~`xlPfuVY&i9Jb(c%{EXIEuzRbE~_1`u8FPrXWvh3P3U9LG_3*(k7vsBINTIw@r zj#_Ddp?vgI*LRj~EZZ@h7^$}#HG1s395yv--N8-e*)M{oT-&XsZ8*2#`mJ0OJ~d`C z#nSATU7c~p#!{T{!)#ZVZsW;DE}XLAwe)1OgM(d_EzQGb>=*NBw}i@WR+=$s!M5t1 z*x=D>b#HejtFV5za`QQZlc!2kZT80nGk?(zP$T_1Sw8>$S`fJkAzzF97nx*!I$;7b zM70qAlmRQ*@i|?`SwKUaZF3yJydZ_3gdhp4R)<2O2E+k0k0J;_A;$dN8BhecR6w&^ zRi&(tOmqlmfI_)wDnK>ErzC?1_Q8Z#JwR#g6@zj!*&|Cn&T>eZ)j6ob7H=8_V1n$8=Oc zsmEBzgH%eN8D9xPOAy!PavOT^A(Z5iG$CmuQ4TH$nZ!suk|oqbqDX`4c_x$OfA({f zkq2;)RFHvk0G70&RQIlHO#0vZ=nDG|GN+XKmY&m|L@%th+q&5fEY%ZGGQ19Xf;v2Q}opa zfB~aG00000000RzsVFq{28}c{G|8Ye$kButjF~Y8Pf>y?DLo;k)MNkvG|)Xj01W^P zjWPvE5GFt*$$|ipsrr*Aq|;AR(t3I+`V;^fc1S^YNmU^R{3Ia2xxN|E0dJx$xC9a) zk^-dDNJ&Z}+jhqZNhA-&U(#C3Z;23XzHx|x5d!S+d6%L%%kfa_6~BGl3Oz zZohr!2Bz))kB9T=#LI(PWtJLgYi+jMZMNHO>1A~cK|s=w?I20)0t8A)2#QG{^jZQX z=w6~kg4Gm+6(ERyR2^uL7^FdgWRWIZa@~K@M~nFKUG=0&kjjJdd35hQ73eZmEa<3j z;U_4?I2)}DS{|!3Jws&4lG08OOqcsPVJ4G4^5j`Bgyl?W!<9;vU!ARzb5mw$&6Sg% zk!-&${nPk8n2*ihB($dop^PZcM_2D#1}|eb@MiC-qa)JheXq6luN%kPKJDA45`5S` z_N%^&zU#7|dvPr|Dtk`Voo^(wwd*-FUdmmD4!FHaQk0`8K!_lWU_<~p7~#}tTGGy= z$k>RDp)H{yiLJEU5U8?b?X%4D+>txIEFQJ56=&$zEZ1en3r~9rY{ElY9&=Z8=h-?=nVWIDipq}bx!bQCG?=W|{pZ=cevTX4-99gL zVLQ0dy6z2K)KeOase-0Y7k-r@Z0sqx%`jx(zA{tCnoF`wqbsk9>TI&?rI$uy9KT6v z@|a^D 
zg}19$KGibzzNZPajT$#u)%{P_zD8Ylx1DK?HYqI*w$QY-)|6p#WWr%g6qZs_r7B{W z%ZuQ`6v25+Oj8Xk#HI3N&5t$NNiHnlld*xT9CsF$mSHZXJle2`#Zv`^kCLUAdH2ov zm|U0vi#rRd8E8~++QBs@hEhVqWz1{uXJa)AgyyUjQtJxa^zUcKGqHEGvFu7+V-|iZ zuxgqnC!sLl#gdfKwe;}Z&hE#tJMZma*EH0{vVPTc#{Y&cE^jB&e&($>EZS~coR(_sl2;}ZO2ucjZ1j}zEYe0;vsu#< z3427S5iv$FFv6IIM9s|WnWvKO(}!|!{ZfgXlaH5oyA~+v8(Ai>)zO12T4KAK zq?xSuy5hd;dQ{mxEy>R|D7@0cklqVRj1Swm3tJ2ICvI z3#Ttr$-y);Ot`jU!almQs>TbmQmU2Wr1x>&8r!`ikk4VbHdB9!&7r}X7x6gWWEZSa%|IC!M6Y1YTlV}t1*@?mt?x^ z>xMQml;wsS@m^iChZPvP@ymX<(UQ&W?b@nalYXt3FXYi~*_7C=G-A_xU6nfHgGH&) zwcJ~mVEY|cnamtq`O=#0t~e>1{>X8GAL-7?^Z(_c3y`2!(fF@2$ot<-*uad@Ekr-+ z1ZDppg5!e70K7A~<TQHLKF|CnqVpqG*tV3DXpW8<8q| zrF&%wEDhfKkk`Wk8BQq^LKc!m1A}@daUu@%iFFX>kb~myO(($qna(m0bs`FogNz8_ yFLoCCxXfel@w92!##@|{IzWplAqDnJ%n%{3WB@|qA_s{7i@744C`cOezMz13AFq=D diff --git a/pcgrr/data/tcga_cohorts.rda b/pcgrr/data/tcga_cohorts.rda index 5b7656cc2279caef5a5a29069058ed12d0357f21..36ff684e9b6bd97a0da4d343f920d17294ffd4b6 100644 GIT binary patch delta 751 zcmVyj1xM)(Ptti!8ftom z0GMP(k5g?Vz<_8*fe|x8JfqYN00w{okN^gNaMMf>$Z4iVOduL)0|}!{m;gY~NTM{= zPxVbPC#mQfYH5Q|U?T!C0~#Yi?NJ4B-i0Y~qHQ2Q4VGkzNU&gbTbkgBv|HIF2!RFS z0Ds?yFCR8@lQ$fFCGKMjPdIL6iYPEMt+k3l8+Y^|fv|UA3kRGlX(gd~ z2_$uLKuWeC%TFJ-kA7O@+CN!fqk7j{oNKO1uScnjXxBZ*La1THJK}W`gU-kUwLe*K z=>n_0{+qh7+lzCj?1`AS7sU3(*(5dDI)Ah2?}HlIsn*hNH`_%hfL++ZgvvmnI}ac; z14E-VRZ#A;61KB3JQj!;OOGpfA6>QI!9u0s`8n*lWBT}1`rJE+w2GX)6m$e zSuK?3dj6gu-wc@wlcCa-`AL1rradv_i*1Epb=%YF-m{c*liC0tWB@U9l40m_pMP2Q zECOO-qfoyIS~V{eWjThNjWbE+Ol)n9v6%k4w%FM>^v;fK#o+!BLHl?IV$y~XL?iV8 zz}KvI8jUkEg=W(VMPbC?%&^n@u}u#Sy0~2Gs2@H|;?Y_RB1$sNUd=hkV;I|M z^j_Q~Ri8bqGS<4_1}0LJckk4IE|Fv;_MECwBQiCaG5YEkDIyyRgbARj3JtJ1@4TX?QeRMXZ=gLRgfp$^Z#0 zT4l3&N~=Qt+T(Bt9&{XSQVGW4Q;<%qWDL0b54&5eBMW*w3d`5v?|1bUMaxECfRqep h2GpFETT!4$4Cys;g9vwjyuk=>yVv6A_>QX{V7I zG}#c)0j8mmG$Yj1%5Nz3Mg%=W27~}`gejRA!9Jw_&)4IWVo3&j`Tg`&+MKMj@1B#}{pmqvUdEYYJa$_Rl2 z#D4+GALQ($-=V&yTs_#aGShD;1{rD7^aW1!Iua;Sl*HJ_yulukegOS_+} zZv$^X%X6=PuWx$qe~Kjqo17;jSlxK+!hgcBY!}qb5o=UyHd%@^Kr6<;$Wh>uvF9j% z90ON8FU;KpPcqi`eAej@?P#WMIMWr?g&vb0hMB8YoGX!Xwla}95;m+dOt8`4bZ6e4 zXd$I+`m8?9>-tKG%-w>VPT6fLa;G>Nw!u>CuJl!3o2+(|kS3xKCyfaoE<336Jb%`? 
z##uO=Eb5aynu`VaSvPx)iniD^*x1_oZ1?n4vd=eSS@W2K_(Bf1jCESEs3c(Pu<;13 zTrhMACRmsdsFt!Ye!%TlTYb?Bp>bxGE28bMdL^E4dG6?fA za>8y8zr_WmsS1Lw!Y?o&Gqk7#4}VHmAtlt#m=yh}T^0f$dSa8=hMVNX5J@DIKCHeF z!WPSWLt~J*0kTBrGfF`K4$&q>&vD%~B330sCdZay_ZJbguOPAsfI|k)hDbJm@LIC) z>Y8})9ov$$19pl^r9CvpAz+AnS`9DoG>_BO{g?94@ek8K*9|W(?J>(+BGyZ8UeKc0iXbCc@Z^FYHBu<^+%{_pa1{^ zKma6#4JqwWwH}bj00uxbVKe|PgI;w^1bx=%6+ZvgBCJjc1&sZC`e)-5vvg4%IQBX` z?fmV4oYnov5@S;$rZgv4MzWdiHK#a8d zwOYeJBD#dRgWM z29`wv8UidLLxSN!f-MB@z~4x)4jz|s2}vj!T0i|_8NCsz;K$Z#w zm@tm2?ip<{$r!i>8CDYbvU{rGL1hHOynytw$exj9ViU-%P{P60zO^bknzP0Ul-OfA zsMn3fPAK_seX#74t^`Z)?GH5 zjz+B09W#V&v|@`;IJpN5aV^bMEM^IrE9q8x5yd0n3K5(}{1}yo`%Q^w0lV9W4cn%j V`}1-4bAHG8yOJrwgo8s5xj;4!BsBm4 literal 694 zcmV;n0!jTsT4*^jL0KkKS>CtEh5!Ol?|}b*|4I-5zwiI>uBXia00BS(n{&Gg>ut9U zO;gm_BTWIOhJXOc=>tq49;c`pnrV|lO{jy^^gsY<>HsnTXbm(XYM#{EnreDYG%^O7 z0B8UJ1d<7pNjwuqNs0gf7|;=*7cjPQm;ndelpsiZ|5k!Lih&Rc%JXRSo~D63PTFwp zgtMNm_7tR7!2(cpo7B4v2w=^oDoRj>H=KbpG)zrw3xI$bB$FKOPkVq!J6I;j0t_X5 z;{N~BcO7?+X1uz*`)VX?Q+$K4)>ui-O5x$qK;&tHk`O2>$83X6S=CN!>DYiB3Q~^Y zs+f%@G}4s5s?aE`jk{G)9uYARb_pba#ReZWi?Y4AhSXM0t`r@?v>y#P@6xtbxjA5` zE#-(eb>qf#V__>JaRcG1WQM{!pM~l;3R z-tkpJSSk`xP~fuRjWo(wfKrwSW7b*csl^5w35ctJ^p$He;uS$j7Qd@OgWY;t5on96 z3gMbemlLxFk>T>iYAT6LCZUTJ5}-C5k8S}Df1KnIJq^H*AC1+F#0WN90UbHkd1$wg z=i%?r^;rgz!sfn8$6o^EjVLi8*dQVvAPlTFjP6@8B_)mg)kbtBIn@{VYO`VX{z&^z curH$+wYy%Y-5;lIK1cC)BvXY64eNY}VC|4JaR2}S diff --git a/pcgrr/inst/templates/pcgr_quarto_report/mutational_signature.qmd b/pcgrr/inst/templates/pcgr_quarto_report/mutational_signature.qmd index adbe6917..fe8cfd1d 100644 --- a/pcgrr/inst/templates/pcgr_quarto_report/mutational_signature.qmd +++ b/pcgrr/inst/templates/pcgr_quarto_report/mutational_signature.qmd @@ -4,7 +4,6 @@ ```{r setup1} #| echo: false -pcg_report <- readRDS(file="../../../tmp.rds") report_color <- pcgrr::color_palette$report_color$values[2] ``` @@ -13,8 +12,6 @@ Here, we apply the **MutationalPatterns** package [@Blokzijl2018-nc] to estimate ```{r setup_signature_display} #| echo: false -#pcg_report <- readRDS(file="tmp.rds") - msig_content <- pcg_report$content$mutational_signatures diff --git a/pcgrr/inst/templates/pcgr_quarto_report/snv_indel_main.qmd b/pcgrr/inst/templates/pcgr_quarto_report/snv_indel_main.qmd index 9cbcacb0..4e26bdb9 100644 --- a/pcgrr/inst/templates/pcgr_quarto_report/snv_indel_main.qmd +++ b/pcgrr/inst/templates/pcgr_quarto_report/snv_indel_main.qmd @@ -406,7 +406,9 @@ DT::datatable( noncoding_variant_set <- pcg_report$content$snv_indel$callset$variant_display |> - dplyr::filter(.data$CODING_STATUS == "noncoding") |> + dplyr::filter( + .data$CODING_STATUS == "noncoding" & + .data$CONSEQUENCE != "intergenic_variant") |> dplyr::select( dplyr::any_of( pcgrr::dt_display$snv_indel_tier3 @@ -496,8 +498,7 @@ DT::datatable( pcgrr::color_palette$cancer_assoc$breaks, pcgrr::color_palette$cancer_assoc$values ) - ) - + ) ``` diff --git a/pcgrr/inst/templates/pcgr_quarto_report/value_boxes.qmd b/pcgrr/inst/templates/pcgr_quarto_report/value_boxes.qmd deleted file mode 100644 index 79c39458..00000000 --- a/pcgrr/inst/templates/pcgr_quarto_report/value_boxes.qmd +++ /dev/null @@ -1,52 +0,0 @@ -## Main results - -
-TIER 1 - SNV/INDEL
-`r pcg_report[["content"]][["value_box"]][["tier1"]]`
-TIER 2 - SNV/INDEL
-`r pcg_report[["content"]][["value_box"]][["tier2"]]`
-sCNA
-`r pcg_report[["content"]][["value_box"]][["scna"]]`
-TUMOR PURITY
-`r pcg_report[["content"]][["value_box"]][["tumor_purity"]]`
-TUMOR PLOIDY
-`r pcg_report[["content"]][["value_box"]][["tumor_ploidy"]]`
-MSI STATUS
-`r pcg_report[["content"]][["value_box"]][["msi"]]`
-DOMINANT SIGNATURE
-`r pcg_report[["content"]][["value_box"]][["signatures"]]`
-MUTATIONAL BURDEN
-`r pcg_report[["content"]][["value_box"]][["tmb"]]`
-KATAEGIS EVENTS
-`r pcg_report[["content"]][["value_box"]][["kataegis"]]`
diff --git a/pcgrr/inst/templates/pcgr_quarto_report/value_boxes_tumor_only.qmd b/pcgrr/inst/templates/pcgr_quarto_report/value_boxes_tumor_only.qmd deleted file mode 100644 index f150b26d..00000000 --- a/pcgrr/inst/templates/pcgr_quarto_report/value_boxes_tumor_only.qmd +++ /dev/null @@ -1,52 +0,0 @@ -## Main results - -
-TIER 1 - SNV/INDEL
-`r pcg_report[["content"]][["value_box"]][["tier1"]]`
-TIER 2 - SNV/INDEL
-`r pcg_report[["content"]][["value_box"]][["tier2"]]`
-sCNA
-`r pcg_report[["content"]][["value_box"]][["scna"]]`
-TUMOR PURITY
-`r pcg_report[["content"]][["value_box"]][["tumor_purity"]]`
-TUMOR PLOIDY
-`r pcg_report[["content"]][["value_box"]][["tumor_ploidy"]]`
-MSI STATUS
-`r pcg_report[["content"]][["value_box"]][["msi"]]`
-DOMINANT SIGNATURE
-`r pcg_report[["content"]][["value_box"]][["signatures"]]`
-MUTATIONAL BURDEN
-`r pcg_report[["content"]][["value_box"]][["tmb"]]`
-KATAEGIS EVENTS
-`r pcg_report[["content"]][["value_box"]][["kataegis"]]`
diff --git a/scripts/cpsr_validate_input.py b/scripts/cpsr_validate_input.py index a3b80874..f9e839ed 100755 --- a/scripts/cpsr_validate_input.py +++ b/scripts/cpsr_validate_input.py @@ -20,13 +20,12 @@ def __main__(): parser = argparse.ArgumentParser(description='Verify input data for CPSR') - parser.add_argument('refdata_dir',help='Directory containing the reference data files for PCGR/CPSR') + parser.add_argument('refdata_assembly_dir',help='Assembly-specific directory containing the reference data files for PCGR/CPSR') parser.add_argument('input_vcf', help='VCF input file with query variants (SNVs/InDels)') parser.add_argument('validated_vcf',help="Name of VCF file with validated, decomposed query variants that overlap target genes (SNVs/InDels)") parser.add_argument('custom_target_tsv',help='Custom text/TSV file indicating user-defined target genes from panel 0 for screening and reporting') parser.add_argument('custom_target_bed',help='Name of BED file populated with regions associated with custom target genes defined by user') parser.add_argument('retained_info_tags',help='Comma-separated string of VCF INFO tags in query VCF to be retained for output') - parser.add_argument('genome_assembly',help='grch37 or grch38') parser.add_argument('sample_id',help='CPSR sample_name') parser.add_argument('virtual_panel_id',type=str,help='virtual panel identifier(s)') parser.add_argument('diagnostic_grade_only', type=int, default=0, choices=[0,1], help="Green virtual panels only (Genomics England PanelApp)") @@ -36,13 +35,12 @@ def __main__(): parser.add_argument("--debug", action="store_true", help="Print full commands to log") args = parser.parse_args() - ret = validate_cpsr_input(args.refdata_dir, + ret = validate_cpsr_input(args.refdata_assembly_dir, args.input_vcf, args.validated_vcf, args.custom_target_tsv, args.custom_target_bed, args.retained_info_tags, - args.genome_assembly, args.sample_id, args.virtual_panel_id, args.diagnostic_grade_only, @@ -54,7 +52,7 @@ def __main__(): sys.exit(1) -def get_valid_custom_genelist(genelist_fname, genelist_bed_fname, pcgr_dir, genome_assembly, +def get_valid_custom_genelist(genelist_fname, genelist_bed_fname, refdata_assembly_dir, gwas_findings, secondary_findings, logger, debug): """ Function that checks whether the custom genelist contains valid entries from the complete exploratory track @@ -64,10 +62,13 @@ def get_valid_custom_genelist(genelist_fname, genelist_bed_fname, pcgr_dir, geno genelist_reader = csv.DictReader(open(genelist_fname,'r'), delimiter='\n', fieldnames=['ensembl_gene_id']) virtualpanel_track_bed = os.path.join( - pcgr_dir, "data", genome_assembly, "gene","bed","gene_virtual_panel", "0.bed.gz") + refdata_assembly_dir, "gene","bed","gene_virtual_panel", "0.bed.gz") virtualpanel_track_tsv = os.path.join( - pcgr_dir, "data", genome_assembly, "gene","tsv","gene_virtual_panel", "gene_virtual_panel.tsv.gz") + refdata_assembly_dir, "gene","tsv","gene_virtual_panel", "gene_virtual_panel.tsv.gz") genelist_bed_fname_unsorted = f'{genelist_bed_fname}.{random_id}.unsorted.bed' + + check_file_exists(virtualpanel_track_bed, logger) + check_file_exists(virtualpanel_track_tsv, logger) customlist_identifiers = {} superpanel_track = [] @@ -135,7 +136,7 @@ def get_valid_custom_genelist(genelist_fname, genelist_bed_fname, pcgr_dir, geno return 0 -def simplify_vcf(input_vcf, validated_vcf, vcf, custom_bed, refdata_directory, genome_assembly, virtual_panel_id, +def simplify_vcf(input_vcf, validated_vcf, vcf, custom_bed, refdata_assembly_dir, 
virtual_panel_id, sample_id, diagnostic_grade_only, gwas_findings, secondary_findings, output_dir, logger, debug): """ @@ -225,11 +226,11 @@ def simplify_vcf(input_vcf, validated_vcf, vcf, custom_bed, refdata_directory, g for pid in set(panel_ids): ge_panel_identifier = "GE_PANEL_" + str(pid) target_bed_gz = os.path.join( - refdata_directory,'data',genome_assembly, 'gene','bed','gene_virtual_panel', str(pid) + ".bed.gz") + refdata_assembly_dir, 'gene','bed','gene_virtual_panel', str(pid) + ".bed.gz") if diagnostic_grade_only == 1 and virtual_panel_id != "0": logger.info('Considering diagnostic-grade only genes in panel ' + str(pid) + ' - (GREEN status in Genomics England PanelApp)') target_bed_gz = os.path.join( - refdata_directory, 'data', genome_assembly, 'gene','bed','gene_virtual_panel', str(pid) + ".GREEN.bed.gz") + refdata_assembly_dir, 'gene','bed','gene_virtual_panel', str(pid) + ".GREEN.bed.gz") check_file_exists(target_bed_gz, logger) ## awk command to ignore secondary finding records while keeping records that belong to target (and that can potentially @@ -281,13 +282,12 @@ def simplify_vcf(input_vcf, validated_vcf, vcf, custom_bed, refdata_directory, g logger.info('') exit(1) -def validate_cpsr_input(refdata_directory, +def validate_cpsr_input(refdata_assembly_dir, input_vcf, validated_vcf, custom_list_fname, custom_list_bed_fname, retained_info_tags, - genome_assembly, sample_id, virtual_panel_id, diagnostic_grade_only, @@ -313,23 +313,20 @@ def validate_cpsr_input(refdata_directory, custom_target_fname['bed'] = custom_list_bed_fname get_valid_custom_genelist(custom_target_fname['tsv'], custom_target_fname['bed'], - refdata_directory, - genome_assembly, + refdata_assembly_dir, gwas_findings, secondary_findings, logger, debug) - db_assembly_dir = os.path.join(refdata_directory, 'data', genome_assembly) - if not input_vcf == 'None': vcf_object = VCF(input_vcf) ## Check that VCF does not already contain INFO tags that will be appended through CPSR annotation ## - First add info tags generated by VEP, and those generated by CPSR - populated_infotags_other_fname = os.path.join(db_assembly_dir, 'vcf_infotags_other.tsv') - populated_infotags_vep_fname = os.path.join(db_assembly_dir, 'vcf_infotags_vep.tsv') + populated_infotags_other_fname = os.path.join(refdata_assembly_dir, 'vcf_infotags_other.tsv') + populated_infotags_vep_fname = os.path.join(refdata_assembly_dir, 'vcf_infotags_vep.tsv') tags_cpsr = read_infotag_file(populated_infotags_other_fname, scope = "cpsr") tags_vep = read_infotag_file(populated_infotags_vep_fname, scope = "vep") tags_cpsr.update(tags_vep) @@ -340,14 +337,14 @@ def validate_cpsr_input(refdata_directory, track_file_info['tags_fname'] = {} for variant_track in ['clinvar','tcga','gwas','dbmts','dbnsfp','gnomad_non_cancer']: track_file_info['tags_fname'][variant_track] = os.path.join( - db_assembly_dir,'variant','vcf', variant_track, f'{variant_track}.vcfanno.vcf_info_tags.txt') + refdata_assembly_dir,'variant','vcf', variant_track, f'{variant_track}.vcfanno.vcf_info_tags.txt') for bed_track in ['simplerepeat','winmsk','rmsk','gerp']: track_file_info['tags_fname'][bed_track] = os.path.join( - db_assembly_dir,'misc','bed', bed_track, f'{bed_track}.vcfanno.vcf_info_tags.txt') + refdata_assembly_dir,'misc','bed', bed_track, f'{bed_track}.vcfanno.vcf_info_tags.txt') track_file_info['tags_fname']['gene_transcript_xref'] = os.path.join( - db_assembly_dir,'gene','bed', 'gene_transcript_xref', 'gene_transcript_xref.vcfanno.vcf_info_tags.txt') + 
refdata_assembly_dir,'gene','bed', 'gene_transcript_xref', 'gene_transcript_xref.vcfanno.vcf_info_tags.txt') for track in track_file_info['tags_fname']: infotags_vcfanno = read_vcfanno_tag_file(track_file_info['tags_fname'][track], logger) @@ -374,8 +371,7 @@ def validate_cpsr_input(refdata_directory, validated_vcf, vcf_object, custom_target_fname['bed'], - refdata_directory, - genome_assembly, + refdata_assembly_dir, virtual_panel_id, sample_id, diagnostic_grade_only, diff --git a/scripts/pcgr_summarise.py b/scripts/pcgr_summarise.py index 48b3f383..c18ee854 100755 --- a/scripts/pcgr_summarise.py +++ b/scripts/pcgr_summarise.py @@ -29,7 +29,7 @@ def __main__(): parser.add_argument('tumortype', default='Any', help='Primary tumor type of query VCF') parser.add_argument('vep_pick_order', default="mane,canonical,appris,biotype,ccds,rank,tsl,length", help=f"Comma-separated string of ordered transcript/variant properties for selection of primary variant consequence") - parser.add_argument('pcgr_db_dir',help='PCGR data directory') + parser.add_argument('refdata_assembly_dir',help='Assembly-specific reference data directory, e.g. "pcgrdb/data/grch38"') parser.add_argument('--compress_output_vcf', action="store_true", default=False, help="Compress output VCF file") parser.add_argument('--cpsr',action="store_true",help="Aggregate cancer gene annotations for Cancer Predisposition Sequencing Reporter (CPSR)") parser.add_argument('--cpsr_yaml',dest="cpsr_yaml", default=None, help='YAML file with list of targeted genes by CPSR (custom, or panel-defined)') @@ -55,22 +55,22 @@ def extend_vcf_annotations(arg_dict, logger): Moreover, it performs two important matching procedures, using 5. Information from VEP's CSQ information (HGVSp/HGVSc) to match known mutation hotspots in cancer - 6. Information from VEP's CSQ information (Consequence, HGVSp/HGVSc) and genomic coordinates to match known biomarkers in cancer + 6. Information from VEP's CSQ information (Consequence, HGVSp/HGVSc, exon information, etc.) and genomic coordinates to match known biomarkers in cancer Finally, it assesses somatic variant oncogenicity, using 7. Gene annotations (tumor suppressor, oncogene) and variant annotations (loss-of-function, gnomAD variant frequencies, variant effect predictions).
Variant oncogenicity levels are provided for all variants using a recommended five-level scheme ("Oncogenic", "Likely oncogenic", "VUS", "Likely Benign", "Benign") - Recommended scoring scheme for variant oncogenicity classification outlined by VICC/ClinGen consortia (Horak et al., Genet Med, 2022) - List of VCF INFO tags appended by this procedure is defined by the 'infotags' files in the pcgr_db_dir + List of VCF INFO tags appended by this procedure is defined by the 'infotags' files in the refdata_assembly_dir """ vcf_infotags = {} - vcf_infotags['other'] = read_infotag_file(os.path.join(arg_dict['pcgr_db_dir'], 'vcf_infotags_other.tsv'), scope = "pcgr") + vcf_infotags['other'] = read_infotag_file(os.path.join(arg_dict['refdata_assembly_dir'], 'vcf_infotags_other.tsv'), scope = "pcgr") if arg_dict['cpsr'] is True: - vcf_infotags['other'] = read_infotag_file(os.path.join(arg_dict['pcgr_db_dir'], 'vcf_infotags_other.tsv'), scope = "cpsr") - vcf_infotags['vep'] = read_infotag_file(os.path.join(arg_dict['pcgr_db_dir'], 'vcf_infotags_vep.tsv'), scope = "vep") + vcf_infotags['other'] = read_infotag_file(os.path.join(arg_dict['refdata_assembly_dir'], 'vcf_infotags_other.tsv'), scope = "cpsr") + vcf_infotags['vep'] = read_infotag_file(os.path.join(arg_dict['refdata_assembly_dir'], 'vcf_infotags_vep.tsv'), scope = "vep") vcf_infotags['other'].update(vcf_infotags['vep']) vcf_info_metadata = vcf_infotags['other'] @@ -89,19 +89,19 @@ def extend_vcf_annotations(arg_dict, logger): cpsr_target_genes[str(g['entrezgene'])] = 1 - vcf_infotags['cpsr'] = read_infotag_file(os.path.join(arg_dict['pcgr_db_dir'], 'vcf_infotags_cpsr.tsv'), scope = "cpsr") + vcf_infotags['cpsr'] = read_infotag_file(os.path.join(arg_dict['refdata_assembly_dir'], 'vcf_infotags_cpsr.tsv'), scope = "cpsr") vcf_infotags['other'].update(vcf_infotags['cpsr']) vcf_info_metadata.update(vcf_infotags['cpsr']) gene_transcript_xref_map = read_genexref_namemap( - os.path.join(arg_dict['pcgr_db_dir'], 'gene','tsv','gene_transcript_xref', 'gene_transcript_xref_bedmap.tsv.gz'), logger) + os.path.join(arg_dict['refdata_assembly_dir'], 'gene','tsv','gene_transcript_xref', 'gene_transcript_xref_bedmap.tsv.gz'), logger) cancer_hotspots = load_mutation_hotspots( - os.path.join(arg_dict['pcgr_db_dir'], 'misc','tsv','hotspot', 'hotspot.tsv.gz'), logger) + os.path.join(arg_dict['refdata_assembly_dir'], 'misc','tsv','hotspot', 'hotspot.tsv.gz'), logger) biomarkers = {} for db in ['cgi','civic']: - variant_fname = os.path.join(arg_dict['pcgr_db_dir'], 'biomarker','tsv', f"{db}.variant.tsv.gz") - clinical_fname = os.path.join(arg_dict['pcgr_db_dir'], 'biomarker','tsv', f"{db}.clinical.tsv.gz") + variant_fname = os.path.join(arg_dict['refdata_assembly_dir'], 'biomarker','tsv', f"{db}.variant.tsv.gz") + clinical_fname = os.path.join(arg_dict['refdata_assembly_dir'], 'biomarker','tsv', f"{db}.clinical.tsv.gz") if arg_dict['cpsr'] is True: biomarkers[db] = load_biomarkers(logger, variant_fname, clinical_fname, biomarker_vartype = 'MUT', biomarker_variant_origin = 'Both') else: @@ -148,17 +148,19 @@ def extend_vcf_annotations(arg_dict, logger): ## For germline-called VCFs (single-sample), pull out sequencing depth and genotype using cyvcf2 API if arg_dict['cpsr'] is True: rec.INFO['DP_CONTROL'] = "-1" - if len(rec.gt_depths) == 1: - rec.INFO['DP_CONTROL'] = str(rec.gt_depths[0]) + if not rec.gt_depths is None: + if len(rec.gt_depths) == 1: + rec.INFO['DP_CONTROL'] = str(rec.gt_depths[0]) rec.INFO['GENOTYPE'] = 'undefined' - if len(rec.gt_types) == 1: - if 
int(rec.gt_types[0]) == 0: - rec.INFO['GENOTYPE'] = 'hom_ref' - else: - if int(rec.gt_types[0]) == 1: - rec.INFO['GENOTYPE'] = 'het' - if int(rec.gt_types[0]) == 2: - rec.INFO['GENOTYPE'] = 'hom_alt' + if not rec.gt_types is None: + if len(rec.gt_types) == 1: + if int(rec.gt_types[0]) == 0: + rec.INFO['GENOTYPE'] = 'hom_ref' + else: + if int(rec.gt_types[0]) == 1: + rec.INFO['GENOTYPE'] = 'het' + if int(rec.gt_types[0]) == 2: + rec.INFO['GENOTYPE'] = 'hom_alt' if current_chrom is None: current_chrom = str(rec.CHROM) diff --git a/scripts/pcgr_validate_input.py b/scripts/pcgr_validate_input.py index e7964f49..0f4e8cde 100755 --- a/scripts/pcgr_validate_input.py +++ b/scripts/pcgr_validate_input.py @@ -19,7 +19,7 @@ def __main__(): parser = argparse.ArgumentParser(description='Verify input data for PCGR') - parser.add_argument('refdata_dir',help='Directory with reference data for PCGR/CPSR') + parser.add_argument('refdata_assembly_dir',help='Assembly-specific directory with reference data for PCGR/CPSR') parser.add_argument('input_vcf', help='Somatic (tumor) query variants (SNVs/InDels) - VCF format') parser.add_argument('validated_vcf', help="Validated VCF file with somatic (tumor) query variants (SNVs/InDels)") parser.add_argument('input_cna', help='Somatic (tumor) copy number query segments (tab-separated values)') @@ -27,7 +27,6 @@ def __main__(): parser.add_argument('input_rna_exp', help='Tumor gene expression estimates (tab-separated values)') parser.add_argument('panel_normal_vcf',help="VCF file with germline calls from panel of normals") parser.add_argument('tumor_only',type=int, default=0,choices=[0,1],help="Tumor only sequencing") - parser.add_argument('genome_assembly',help='grch37 or grch38') parser.add_argument('sample_id',help='PCGR sample_name') parser.add_argument('retained_info_tags', help="Comma-separated string of custom VCF INFO tags to be kept in PCGR output") parser.add_argument('tumor_dp_tag', help='VCF INFO tag that denotes tumor sequencing depth') @@ -42,7 +41,7 @@ def __main__(): parser.add_argument("--debug", action="store_true", help="Print full commands to log") args = parser.parse_args() - ret = validate_pcgr_input(args.refdata_dir, + ret = validate_pcgr_input(args.refdata_assembly_dir, args.input_vcf, args.validated_vcf, args.input_cna, @@ -58,7 +57,6 @@ def __main__(): args.panel_normal_vcf, args.retained_info_tags, args.tumor_only, - args.genome_assembly, args.sample_id, args.keep_uncompressed, args.output_dir, @@ -245,7 +243,7 @@ def simplify_vcf(input_vcf, validated_vcf, vcf, output_dir, sample_id, keep_unco remove_file(bcftools_simplify_log) remove_file(vt_decompose_log) -def validate_pcgr_input(refdata_directory, +def validate_pcgr_input(refdata_assembly_dir, input_vcf, validated_vcf, input_cna, @@ -261,14 +259,13 @@ def validate_pcgr_input(refdata_directory, panel_normal_vcf, retained_info_tags, tumor_only, - genome_assembly, sample_id, keep_uncompressed, output_dir, debug): """ Function that reads the input files to PCGR (VCF file and Tab-separated values file with copy number segments) and performs the following checks: - 1. No INFO annotation tags in the query VCF coincides with those generated by PCGR + 1. No INFO annotation tags in the input VCF coincides with those generated by PCGR 2. Provided columns for tumor/normal coverage and allelic depths are found in VCF 3. Provided retained VCF INFO tags are present in VCF file 4. If VCF have variants with multiple alternative alleles (e.g. 
'A,T') run vt decompose @@ -282,8 +279,6 @@ def validate_pcgr_input(refdata_directory, # if panel_normal_vcf == "None" and tumor_only == 1 and config_options['tumor_only']['exclude_pon'] is True: # logger.warning('Panel-of-normals VCF is not present - exclusion of calls found in panel-of-normals will be ignored') - db_assembly_dir = os.path.join(refdata_directory, 'data', genome_assembly) - if not input_vcf == 'None': @@ -293,9 +288,9 @@ def validate_pcgr_input(refdata_directory, ## - First add info tags generated by VEP, and those generated by PCGR vcf_infotags = {} vcf_infotags['pcgr'] = read_infotag_file( - os.path.join(refdata_directory,'data',genome_assembly, 'vcf_infotags_other.tsv'), scope = "pcgr") + os.path.join(refdata_assembly_dir, 'vcf_infotags_other.tsv'), scope = "pcgr") vcf_infotags['vep'] = read_infotag_file( - os.path.join(refdata_directory,'data',genome_assembly, 'vcf_infotags_vep.tsv'), scope = "vep") + os.path.join(refdata_assembly_dir, 'vcf_infotags_vep.tsv'), scope = "vep") vcf_infotags['pcgr'].update(vcf_infotags['vep']) vcf_tags_pcgr = vcf_infotags['pcgr'] @@ -304,20 +299,20 @@ def validate_pcgr_input(refdata_directory, track_file_info['tags_fname'] = {} for variant_track in ['clinvar','tcga','gwas','dbnsfp']: track_file_info['tags_fname'][variant_track] = os.path.join( - db_assembly_dir,'variant','vcf', variant_track, f'{variant_track}.vcfanno.vcf_info_tags.txt') + refdata_assembly_dir,'variant','vcf', variant_track, f'{variant_track}.vcfanno.vcf_info_tags.txt') for bed_track in ['simplerepeat','winmsk','rmsk','gerp']: track_file_info['tags_fname'][bed_track] = os.path.join( - db_assembly_dir,'misc','bed', bed_track, f'{bed_track}.vcfanno.vcf_info_tags.txt') + refdata_assembly_dir,'misc','bed', bed_track, f'{bed_track}.vcfanno.vcf_info_tags.txt') track_file_info['tags_fname']['gene_transcript_xref'] = os.path.join( - db_assembly_dir,'gene','bed', 'gene_transcript_xref', 'gene_transcript_xref.vcfanno.vcf_info_tags.txt') + refdata_assembly_dir,'gene','bed', 'gene_transcript_xref', 'gene_transcript_xref.vcfanno.vcf_info_tags.txt') for track in track_file_info['tags_fname']: infotags_vcfanno = read_vcfanno_tag_file(track_file_info['tags_fname'][track], logger) vcf_tags_pcgr.update(infotags_vcfanno) - ## Check that no INFO annotation tags in the query VCF coincides with those generated by PCGR + ## Check that no INFO annotation tags in the input VCF coincide with those generated by PCGR tag_check = check_existing_vcf_info_tags(vcf_object, vcf_tags_pcgr, logger) if tag_check == -1: return -1 diff --git a/scripts/pcgrr.R b/scripts/pcgrr.R index 76886aca..62443b3a 100755 --- a/scripts/pcgrr.R +++ b/scripts/pcgrr.R @@ -40,7 +40,7 @@ pcgrr::write_report_quarto_html(report = pcg_report1) yaml_fname <- paste0("/Users/sigven/project_data/packages/", "package__pcgr/bundle_update_2023/", - "pcgr/examples/SAMPLE-T_N-001.pcgr_acmg.grch38.conf.yaml") + "pcgr/examples/SAMPLE-T_N-001.pcgr.grch38.conf.yaml") ## Generate report content
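For reference, the first check that pcgr_validate_input.py delegates to check_existing_vcf_info_tags() amounts to comparing the INFO tag IDs declared in the input VCF header against the set of tags that PCGR itself populates. A minimal, self-contained sketch of that comparison follows, not the shipped implementation; the PCGR_TAGS set and the example file name are illustrative only (the real tag set is assembled from the vcf_infotags_*.tsv and *.vcfanno.vcf_info_tags.txt files under refdata_assembly_dir, as shown in the hunks above).

from cyvcf2 import VCF

# Illustrative subset of INFO tags populated by PCGR (assumption: the real set is
# read from the reference data files under refdata_assembly_dir).
PCGR_TAGS = {'TUMOR_DP', 'TUMOR_AF', 'MUTATION_HOTSPOT', 'ONCOGENICITY'}

def find_colliding_info_tags(vcf_fname, reserved_tags):
    """Return INFO tag IDs in the input VCF header that coincide with reserved tags."""
    collisions = []
    vcf = VCF(vcf_fname)
    for e in vcf.header_iter():
        header_element = e.info()
        if 'ID' in header_element.keys() and 'HeaderType' in header_element.keys():
            if header_element['HeaderType'] == 'INFO' and header_element['ID'] in reserved_tags:
                collisions.append(header_element['ID'])
    return collisions

# Example usage (hypothetical file name):
# print(find_colliding_info_tags('tumor.vcf.gz', PCGR_TAGS))

Any tag returned by such a check would trigger the error path shown in pcgr/vcf.py earlier in this diff, asking the user to remove or rename the conflicting tag in the input VCF.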