From 3b7e992a43382e652b970952d76504d4c2ff9226 Mon Sep 17 00:00:00 2001 From: Sigve Nakken Date: Wed, 20 Dec 2023 15:25:13 +0100 Subject: [PATCH] assess genotype and biomarker resolution rename --- pcgr/config.py | 1 + pcgr/cpsr.py | 13 ++- pcgr/variant.py | 3 +- pcgrr/R/biomarkers.R | 3 + pcgrr/R/input_data.R | 41 ++++---- scripts/pcgrr.R | 233 ------------------------------------------- 6 files changed, 37 insertions(+), 257 deletions(-) diff --git a/pcgr/config.py b/pcgr/config.py index 85ae387c..cf0d476a 100644 --- a/pcgr/config.py +++ b/pcgr/config.py @@ -119,6 +119,7 @@ def create_config(arg_dict, workflow = "PCGR"): if workflow == "CPSR": conf_options['sample_properties']['phenotype'] = 'None' conf_options['sample_properties']['site'] = 'Hereditary (blood)' + conf_options['sample_properties']['genotypes_available'] = 0 conf_options['visual_reporting']['table_display'] = str(arg_dict['report_table_display']) conf_options['gene_panel'] = { 'panel_id': str(arg_dict['virtual_panel_id']), diff --git a/pcgr/cpsr.py b/pcgr/cpsr.py index 0a1caa43..c4da7831 100755 --- a/pcgr/cpsr.py +++ b/pcgr/cpsr.py @@ -193,11 +193,10 @@ def run_cpsr(conf_options, cpsr_paths): ## Write YAML configuration file - settings, path to files, reference bundle etc yaml_data = populate_config_data(conf_options, data_dir, workflow = "CPSR", logger = logger) - with open(yaml_fname, "w") as outfile: outfile.write(yaml.dump(yaml_data)) outfile.close() - + ## CPSR|VEP - run Variant Effect Predictor on query VCF with LoF and NearestExonJB plugins vep_command = vep.get_command(file_paths = cpsr_paths, conf_options = yaml_data, @@ -280,7 +279,15 @@ def run_cpsr(conf_options, cpsr_paths): variant_set = \ variant.append_annotations( output_pass_vcf2tsv_gz, pcgr_db_dir = cpsr_paths["db_dir"], logger = logger) - variant_set = variant.clean_annotations(variant_set, yaml_data, germline = True, logger = logger) + variant_set = variant.clean_annotations(variant_set, yaml_data, germline = True, logger = logger) + if {'GENOTYPE'}.issubset(variant_set.columns): + if variant_set.loc[variant_set['GENOTYPE'] == '.'].empty and variant_set.loc[variant_set['GENOTYPE'] == 'undefined'].empty: + yaml_data['conf']['sample_properties']['genotypes_available'] = 1 + + with open(yaml_fname, "w") as outfile: + outfile.write(yaml.dump(yaml_data)) + outfile.close() + variant_set.to_csv(output_pass_tsv_gz, sep="\t", compression="gzip", index=False) if not debug: utils.remove(output_pass_vcf2tsv_gz) diff --git a/pcgr/variant.py b/pcgr/variant.py index aeb5ccc7..1a415d42 100644 --- a/pcgr/variant.py +++ b/pcgr/variant.py @@ -44,7 +44,8 @@ def set_genotype(variant_set: pd.DataFrame, logger) -> pd.DataFrame: variant_set.loc[variant_set['GT'].isin(heterozygous_states),"GENOTYPE"] = "heterozygous" else: variant_set['GENOTYPE'] = "undefined" - + + variant_set = variant_set.astype({'GENOTYPE':'string'}) return(variant_set) def append_annotations(vcf2tsv_gz_fname: str, pcgr_db_dir: str, logger): diff --git a/pcgrr/R/biomarkers.R b/pcgrr/R/biomarkers.R index dbecc9ac..1887a793 100644 --- a/pcgrr/R/biomarkers.R +++ b/pcgrr/R/biomarkers.R @@ -1011,7 +1011,10 @@ expand_biomarker_items <- function( "GENOMIC_CHANGE", "GENOME_VERSION", "SAMPLE_ID", + "GENOTYPE", + "VARIANT_CLASS", "SYMBOL", + "GENENAME", "ENTREZGENE", "CONSEQUENCE", "PROTEIN_CHANGE", diff --git a/pcgrr/R/input_data.R b/pcgrr/R/input_data.R index 7c512ce4..94a2f63e 100644 --- a/pcgrr/R/input_data.R +++ b/pcgrr/R/input_data.R @@ -247,26 +247,27 @@ load_dna_variants <- function( "BIOMARKER_MATCHTYPE"), sep = "\\|" ) |> - dplyr::mutate(BIOMARKER_MAPPING = dplyr::case_when( - stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_genomic_coord") ~ "genomic", - !stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_genomic_coord") & - stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_hgvsp_principal") ~ "hgvsp", - !stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_genomic_coord") & - !stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_hgvsp_principal") & - stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_codon_principal") ~ "codon", - !stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_genomic_coord") & - !stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_hgvsp_principal") & - !stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_codon_principal") & - stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_hgvsp_nonprincipal")~ "hgvsp_nonprincipal", - !stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_genomic_coord") & - !stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_hgvsp_principal") & - !stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_codon_principal") & - stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_exon_") ~ "exon", - !stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_genomic_coord") & - !stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_hgvsp_") & - !stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_codon_") & - !stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_exon_") & - stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_gene_") ~ "gene", + dplyr::rename(BIOMARKER_MATCH = BIOMARKER_MATCHTYPE) |> + dplyr::mutate(BIOMARKER_RESOLUTION = dplyr::case_when( + stringr::str_detect(.data$BIOMARKER_MATCH,"by_genomic_coord") ~ "genomic", + !stringr::str_detect(.data$BIOMARKER_MATCH,"by_genomic_coord") & + stringr::str_detect(.data$BIOMARKER_MATCH,"by_hgvsp_principal") ~ "hgvsp", + !stringr::str_detect(.data$BIOMARKER_MATCH,"by_genomic_coord") & + !stringr::str_detect(.data$BIOMARKER_MATCH,"by_hgvsp_principal") & + stringr::str_detect(.data$BIOMARKER_MATCH,"by_codon_principal") ~ "codon", + !stringr::str_detect(.data$BIOMARKER_MATCH,"by_genomic_coord") & + !stringr::str_detect(.data$BIOMARKER_MATCH,"by_hgvsp_principal") & + !stringr::str_detect(.data$BIOMARKER_MATCH,"by_codon_principal") & + stringr::str_detect(.data$BIOMARKER_MATCH,"by_hgvsp_nonprincipal")~ "hgvsp_nonprincipal", + !stringr::str_detect(.data$BIOMARKER_MATCH,"by_genomic_coord") & + !stringr::str_detect(.data$BIOMARKER_MATCH,"by_hgvsp_principal") & + !stringr::str_detect(.data$BIOMARKER_MATCH,"by_codon_principal") & + stringr::str_detect(.data$BIOMARKER_MATCH,"by_exon_") ~ "exon", + !stringr::str_detect(.data$BIOMARKER_MATCH,"by_genomic_coord") & + !stringr::str_detect(.data$BIOMARKER_MATCH,"by_hgvsp_") & + !stringr::str_detect(.data$BIOMARKER_MATCH,"by_codon_") & + !stringr::str_detect(.data$BIOMARKER_MATCH,"by_exon_") & + stringr::str_detect(.data$BIOMARKER_MATCH,"by_gene_") ~ "gene", TRUE ~ as.character('other') )) |> tidyr::separate_rows( diff --git a/scripts/pcgrr.R b/scripts/pcgrr.R index bd8b1f25..0253b08e 100755 --- a/scripts/pcgrr.R +++ b/scripts/pcgrr.R @@ -4,234 +4,8 @@ suppressWarnings(suppressPackageStartupMessages(library(argparse))) suppressWarnings(suppressPackageStartupMessages(library(pcgrr))) -suppressWarnings(suppressPackageStartupMessages(library(GenomeInfoDb))) suppressWarnings(suppressPackageStartupMessages(library(stringr))) -##---- Argument Parsing ----## -p <- argparse::ArgumentParser(description='PCGR HTML generation step', prog='pcgrr') - -# required args -required_args <- c( - 'output_dir', 'query_vcf2tsv', 'query_cna', 'query_rna_fusion', - 'query_rna_expression', 'cpsr_report', 'sample_name', - 'pcgr_version', 'pcgr_bundle_version','genome_assembly', 'data_dir') -p$add_argument('output_dir') # 1 -p$add_argument('query_vcf2tsv') # 2 -p$add_argument('query_cna') # 3 -p$add_argument('query_rna_fusion') # 4 -p$add_argument('query_rna_expression') # 5 -p$add_argument('cpsr_report') # 6 -p$add_argument('sample_name') # 7 -p$add_argument('pcgr_version') # 8 -p$add_argument('pcgr_bundle_version') # 9 -p$add_argument('genome_assembly') # 10 -p$add_argument('data_dir') # 11 -# tumor props -t_props_args <- c('tumor_purity', 'tumor_ploidy', 'tumor_type') -p$add_argument('tumor_purity') # 12 -p$add_argument('tumor_ploidy') # 13 -p$add_argument('tumor_type') # 14 -# assay props -assay_props_args <- c('target_size_mb', 'type') -p$add_argument('target_size_mb', type='double') # 15 -p$add_argument('type') # 16 -# used to populate 'vcf_tumor_only' and 'mode' in the 'assay_props' list -p$add_argument('vcf_tumor_only1') # 17 -p$add_argument('mode1') # 18 -# tumor only args -tumor_only_args <- list( - onekg = c('maf_onekg_afr', 'maf_onekg_amr', 'maf_onekg_eas', - 'maf_onekg_eur', 'maf_onekg_sas', 'maf_onekg_global'), - gnomad = c('maf_gnomad_afr','maf_gnomad_amr','maf_gnomad_asj', - 'maf_gnomad_eas','maf_gnomad_fin','maf_gnomad_nfe', - 'maf_gnomad_other','maf_gnomad_sas','maf_gnomad_global'), - filter = c('exclude_pon', 'exclude_likely_het_germline', - 'exclude_likely_hom_germline', 'exclude_dbsnp_nonsomatic', - 'exclude_nonexonic')) -p$add_argument('maf_onekg_afr', type='double') # 19 -p$add_argument('maf_onekg_amr', type='double') # 20 -p$add_argument('maf_onekg_eas', type='double') # 21 -p$add_argument('maf_onekg_eur', type='double') # 22 -p$add_argument('maf_onekg_sas', type='double') # 23 -p$add_argument('maf_onekg_global', type='double') # 24 -p$add_argument('maf_gnomad_afr', type='double') # 25 -p$add_argument('maf_gnomad_amr', type='double') # 26 -p$add_argument('maf_gnomad_asj', type='double') # 27 -p$add_argument('maf_gnomad_eas', type='double') # 28 -p$add_argument('maf_gnomad_fin', type='double') # 29 -p$add_argument('maf_gnomad_nfe', type='double') # 30 -p$add_argument('maf_gnomad_other', type='double') # 31 -p$add_argument('maf_gnomad_sas', type='double') # 32 -p$add_argument('maf_gnomad_global', type='double') # 33 -p$add_argument('exclude_pon', type='integer') # 34 -p$add_argument('exclude_likely_het_germline', type='integer') # 35 -p$add_argument('exclude_likely_hom_germline', type='integer') # 36 -p$add_argument('exclude_dbsnp_nonsomatic', type='integer') # 37 -p$add_argument('exclude_nonexonic', type='integer') # 38 -# tmb args -p$add_argument('tmb_run', type='integer') # 39 -p$add_argument('tmb_algo') # 40 -# msi/msigs args -p$add_argument('msi_run', type='integer') # 41 -p$add_argument('msigs_run', type='integer') # 42 -p$add_argument('msigs_mut_lim', type='double') # 43 -p$add_argument('msigs_all_ref_sigs', type='integer') # 44 -p$add_argument('msigs_incl_art_sigs', type='integer') # 45 -p$add_argument('msigs_prevalence_ref_sigs', type='integer') # 46 -# cna args -cna_args <- c('log_r_homdel', 'log_r_gain', 'cna_overlap_pct') -p$add_argument('log_r_homdel', type='double') # 47 -p$add_argument('log_r_gain', type='double') # 48 -p$add_argument('min_copies_ampl', type='integer') # 49 -p$add_argument('cna_overlap_pct', type='double') # 50 -# allelic support args -allelic_support_args <- c( - 'tumor_af_min', 'tumor_dp_min', - 'control_dp_min', 'control_af_max', - 'tumor_af_tag', 'tumor_dp_tag', - 'control_af_tag', 'control_dp_tag', - 'call_conf_tag') -p$add_argument('tumor_af_min', type='double') # 51 -p$add_argument('tumor_dp_min', type='double') # 52 -p$add_argument('control_dp_min', type='double') # 53 -p$add_argument('control_af_max', type='double') # 54 -p$add_argument('tumor_af_tag') # 55 -p$add_argument('tumor_dp_tag') # 56 -p$add_argument('control_af_tag') # 57 -p$add_argument('control_dp_tag') # 58 -p$add_argument('call_conf_tag') # 59 -# clinicaltrials -p$add_argument('clinicaltrials_run', type='integer') # 60 -# other -p$add_argument('vep_n_forks', type='integer') # 61 -p$add_argument('vep_buffer_size', type='integer') # 62 -p$add_argument('vep_no_intergenic', type='integer') # 63 -p$add_argument('vep_pick_order') # 64 -p$add_argument('vep_regulatory', type='integer') # 65 -p$add_argument('vep_gencode_all', type='integer') # 66 -p$add_argument('vcf2maf', type='integer') # 67 -p$add_argument('list_noncoding', type='integer') # 68 -# retained_info_tags -p$add_argument('retained_info_tags') # 69 -# visual -p$add_argument('report_theme') # 70 -p$add_argument('nonfloating_toc', type='integer') # 71 -# other - -args <- p$parse_args() - -# main pcgr_config list (processed later) -pcgr_config <- list( - required_args = args[required_args], - t_props = args[t_props_args], - assay_props = args[assay_props_args], - tumor_only = args[unlist(tumor_only_args)], - tmb = list( - run = as.logical(args[['tmb_run']]), - algorithm = args[['tmb_algo']]), - msi = list(run = as.logical(args[['msi_run']])), - msigs = list( - run = as.logical(args[['msigs_run']]), - mutation_limit = args[['msigs_mut_lim']], - all_reference_signatures = as.logical(args[['msigs_all_ref_sigs']]), - include_artefact_signatures = as.logical(args[['msigs_incl_art_sigs']]), - prevalence_reference_signatures = as.integer(args[['msigs_prevalence_ref_sigs']]) - ), - cna = args[cna_args], - allelic_support = args[allelic_support_args], - clinicaltrials = list(run = as.logical(args[['clinicaltrials_run']])), - vep = list(vep_n_forks = args[['vep_n_forks']], - vep_buffer_size = args[['vep_buffer_size']], - vep_no_intergenic = as.logical(args[['vep_no_intergenic']]), - vep_pick_order = args[['vep_pick_order']], - vep_regulatory = as.logical(args[['vep_regulatory']]), - vep_gencode_all = as.logical(args[['vep_gencode_all']])), - other = list(vcf2maf = as.logical(args[['vcf2maf']]), - list_noncoding = as.logical(args[['list_noncoding']]), - vcf_no_validation = as.logical(args[['vcf_no_validation']]) - ), - retained_info_tags = args[['retained_info_tags']], - visual = list(report_theme = args[['report_theme']], - nonfloating_toc = as.logical(args[['nonfloating_toc']]) - ) -) - -##---- Argument Processing ----## -# required args -if (pcgr_config[['required_args']][['query_cna']] == "None"){ - pcgr_config[['required_args']][['query_cna']] <- NULL -} -if (pcgr_config[['required_args']][['query_rna_fusion']] == "None"){ - pcgr_config[['required_args']][['query_rna_fusion']] <- NULL -} -if (pcgr_config[['required_args']][['query_rna_expression']] == "None"){ - pcgr_config[['required_args']][['query_rna_expression']] <- NULL -} -if (pcgr_config[['required_args']][['cpsr_report']] == "None"){ - pcgr_config[['required_args']][['cpsr_report']] <- NULL -} - -# tumor props -# Handle case when 'NA' -purity <- pcgr_config[['t_props']][['tumor_purity']] -ploidy <- pcgr_config[['t_props']][['tumor_ploidy']] -pcgr_config[['t_props']][['tumor_purity']] <- - ifelse(purity == 'NA', NA_real_, as.numeric(purity)) -pcgr_config[['t_props']][['tumor_ploidy']] <- - ifelse(ploidy == 'NA', NA_real_, as.numeric(ploidy)) -pcgr_config[['t_props']][['tumor_type']] <- stringr::str_replace_all( - stringr::str_replace_all( - args[['tumor_type']], "_", " "), - "@", "/") -if (pcgr_config[['t_props']][['tumor_type']] == "Any") { - pcgr_config[['t_props']][['tumor_type']] <- "Cancer, NOS" -} - -### Sequencing assay properties (VCF) -### Target (WES/WGS/TARGETED), mode (tumor-control/tumor-only), coding target size -pcgr_config[['assay_props']][['mode']] <- 'Tumor-Control' -pcgr_config[['assay_props']][['vcf_tumor_only']] <- FALSE - -if (as.integer(args[['vcf_tumor_only1']]) == 1) { - pcgr_config[['assay_props']][['vcf_tumor_only']] <- TRUE - pcgr_config[['assay_props']][['mode']] <- 'Tumor-Only' - if (as.integer(args[['mode1']]) == 1){ - pcgr_config[['assay_props']][['mode']] <- 'Cell line (Tumor-Only)' - } -} - -# tumor_only maf filter -for (mf in tumor_only_args[['filter']]) { - pcgr_config[['tumor_only']][[mf]] <- as.logical(args[[mf]]) -} - -pcgr_config_rds_fname <- file.path( - pcgr_config[['required_args']][['output_dir']], - paste0(pcgr_config[['required_args']][['sample_name']], - ".pcgr_config.rds")) -pcgr_config_json_fname <- file.path( - pcgr_config[['required_args']][['output_dir']], - paste0(pcgr_config[['required_args']][['sample_name']], - ".pcgr_config.json")) -pcgr_config_json <- jsonlite::toJSON(pcgr_config) -saveRDS(pcgr_config, file = pcgr_config_rds_fname) -jsonlite::write_json(pcgr_config_json, path = pcgr_config_json_fname) - -### Arg processing END - -#pcgr_data <- pcgrr::load_reference_data( -# pcgr_db_dir = pcgr_config[['required_args']][['data_dir']], -# genome_assembly = pcgr_config[['required_args']][['genome_assembly']]) - - - -# if (pcgr_config[['other']][['vep_regulatory']] == F){ -# for (e in c('tier4_display','tier5_display','all','tsv')){ -# pcgr_data[['annotation_tags']][[e]] <- -# pcgr_data[['annotation_tags']][[e]][ -# pcgr_data[['annotation_tags']][[e]] != "REGULATORY_ANNOTATION"] -# } -# } # my_log4r_layout <- function(level, ...) { # paste0(format(Sys.time()), " - pcgr-report-generation - ", @@ -244,13 +18,6 @@ jsonlite::write_json(pcgr_config_json, path = pcgr_config_json_fname) # # this gets passed on to all the log4r_* functions inside the pkg # options("PCGRR_LOG4R_LOGGER" = log4r_logger) -# ## Clinical trials -# if (pcgr_config[['t_props']][['tumor_type']] == "Cancer, NOS"){ -# pcgrr:::log4r_info(paste0("Clinical trials will not be included in the report when primary site is not specified - skipping")) -# pcgr_config[['clinicaltrials']][['run']] <- F -# } - -# pcgrr:::log4r_info(paste0("Tumor primary site: ", pcgr_config[['t_props']][['tumor_type']])) # pcg_report <- NULL