Skip to content

Commit

Permalink
assess genotype and biomarker resolution rename
Browse files Browse the repository at this point in the history
  • Loading branch information
sigven committed Dec 20, 2023
1 parent a3796c6 commit 3b7e992
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 257 deletions.
1 change: 1 addition & 0 deletions pcgr/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ def create_config(arg_dict, workflow = "PCGR"):
if workflow == "CPSR":
conf_options['sample_properties']['phenotype'] = 'None'
conf_options['sample_properties']['site'] = 'Hereditary (blood)'
conf_options['sample_properties']['genotypes_available'] = 0
conf_options['visual_reporting']['table_display'] = str(arg_dict['report_table_display'])
conf_options['gene_panel'] = {
'panel_id': str(arg_dict['virtual_panel_id']),
Expand Down
13 changes: 10 additions & 3 deletions pcgr/cpsr.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,11 +193,10 @@ def run_cpsr(conf_options, cpsr_paths):

## Write YAML configuration file - settings, path to files, reference bundle etc
yaml_data = populate_config_data(conf_options, data_dir, workflow = "CPSR", logger = logger)

with open(yaml_fname, "w") as outfile:
outfile.write(yaml.dump(yaml_data))
outfile.close()

## CPSR|VEP - run Variant Effect Predictor on query VCF with LoF and NearestExonJB plugins
vep_command = vep.get_command(file_paths = cpsr_paths,
conf_options = yaml_data,
Expand Down Expand Up @@ -280,7 +279,15 @@ def run_cpsr(conf_options, cpsr_paths):
variant_set = \
variant.append_annotations(
output_pass_vcf2tsv_gz, pcgr_db_dir = cpsr_paths["db_dir"], logger = logger)
variant_set = variant.clean_annotations(variant_set, yaml_data, germline = True, logger = logger)
variant_set = variant.clean_annotations(variant_set, yaml_data, germline = True, logger = logger)
if {'GENOTYPE'}.issubset(variant_set.columns):
if variant_set.loc[variant_set['GENOTYPE'] == '.'].empty and variant_set.loc[variant_set['GENOTYPE'] == 'undefined'].empty:
yaml_data['conf']['sample_properties']['genotypes_available'] = 1

with open(yaml_fname, "w") as outfile:
outfile.write(yaml.dump(yaml_data))
outfile.close()

variant_set.to_csv(output_pass_tsv_gz, sep="\t", compression="gzip", index=False)
if not debug:
utils.remove(output_pass_vcf2tsv_gz)
Expand Down
3 changes: 2 additions & 1 deletion pcgr/variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ def set_genotype(variant_set: pd.DataFrame, logger) -> pd.DataFrame:
variant_set.loc[variant_set['GT'].isin(heterozygous_states),"GENOTYPE"] = "heterozygous"
else:
variant_set['GENOTYPE'] = "undefined"


variant_set = variant_set.astype({'GENOTYPE':'string'})
return(variant_set)

def append_annotations(vcf2tsv_gz_fname: str, pcgr_db_dir: str, logger):
Expand Down
3 changes: 3 additions & 0 deletions pcgrr/R/biomarkers.R
Original file line number Diff line number Diff line change
Expand Up @@ -1011,7 +1011,10 @@ expand_biomarker_items <- function(
"GENOMIC_CHANGE",
"GENOME_VERSION",
"SAMPLE_ID",
"GENOTYPE",
"VARIANT_CLASS",
"SYMBOL",
"GENENAME",
"ENTREZGENE",
"CONSEQUENCE",
"PROTEIN_CHANGE",
Expand Down
41 changes: 21 additions & 20 deletions pcgrr/R/input_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -247,26 +247,27 @@ load_dna_variants <- function(
"BIOMARKER_MATCHTYPE"),
sep = "\\|"
) |>
dplyr::mutate(BIOMARKER_MAPPING = dplyr::case_when(
stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_genomic_coord") ~ "genomic",
!stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_genomic_coord") &
stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_hgvsp_principal") ~ "hgvsp",
!stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_genomic_coord") &
!stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_hgvsp_principal") &
stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_codon_principal") ~ "codon",
!stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_genomic_coord") &
!stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_hgvsp_principal") &
!stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_codon_principal") &
stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_hgvsp_nonprincipal")~ "hgvsp_nonprincipal",
!stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_genomic_coord") &
!stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_hgvsp_principal") &
!stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_codon_principal") &
stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_exon_") ~ "exon",
!stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_genomic_coord") &
!stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_hgvsp_") &
!stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_codon_") &
!stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_exon_") &
stringr::str_detect(.data$BIOMARKER_MATCHTYPE,"by_gene_") ~ "gene",
dplyr::rename(BIOMARKER_MATCH = BIOMARKER_MATCHTYPE) |>
dplyr::mutate(BIOMARKER_RESOLUTION = dplyr::case_when(
stringr::str_detect(.data$BIOMARKER_MATCH,"by_genomic_coord") ~ "genomic",
!stringr::str_detect(.data$BIOMARKER_MATCH,"by_genomic_coord") &
stringr::str_detect(.data$BIOMARKER_MATCH,"by_hgvsp_principal") ~ "hgvsp",
!stringr::str_detect(.data$BIOMARKER_MATCH,"by_genomic_coord") &
!stringr::str_detect(.data$BIOMARKER_MATCH,"by_hgvsp_principal") &
stringr::str_detect(.data$BIOMARKER_MATCH,"by_codon_principal") ~ "codon",
!stringr::str_detect(.data$BIOMARKER_MATCH,"by_genomic_coord") &
!stringr::str_detect(.data$BIOMARKER_MATCH,"by_hgvsp_principal") &
!stringr::str_detect(.data$BIOMARKER_MATCH,"by_codon_principal") &
stringr::str_detect(.data$BIOMARKER_MATCH,"by_hgvsp_nonprincipal")~ "hgvsp_nonprincipal",
!stringr::str_detect(.data$BIOMARKER_MATCH,"by_genomic_coord") &
!stringr::str_detect(.data$BIOMARKER_MATCH,"by_hgvsp_principal") &
!stringr::str_detect(.data$BIOMARKER_MATCH,"by_codon_principal") &
stringr::str_detect(.data$BIOMARKER_MATCH,"by_exon_") ~ "exon",
!stringr::str_detect(.data$BIOMARKER_MATCH,"by_genomic_coord") &
!stringr::str_detect(.data$BIOMARKER_MATCH,"by_hgvsp_") &
!stringr::str_detect(.data$BIOMARKER_MATCH,"by_codon_") &
!stringr::str_detect(.data$BIOMARKER_MATCH,"by_exon_") &
stringr::str_detect(.data$BIOMARKER_MATCH,"by_gene_") ~ "gene",
TRUE ~ as.character('other')
)) |>
tidyr::separate_rows(
Expand Down
233 changes: 0 additions & 233 deletions scripts/pcgrr.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,234 +4,8 @@

suppressWarnings(suppressPackageStartupMessages(library(argparse)))
suppressWarnings(suppressPackageStartupMessages(library(pcgrr)))
suppressWarnings(suppressPackageStartupMessages(library(GenomeInfoDb)))
suppressWarnings(suppressPackageStartupMessages(library(stringr)))

##---- Argument Parsing ----##
p <- argparse::ArgumentParser(description='PCGR HTML generation step', prog='pcgrr')

# required args
required_args <- c(
'output_dir', 'query_vcf2tsv', 'query_cna', 'query_rna_fusion',
'query_rna_expression', 'cpsr_report', 'sample_name',
'pcgr_version', 'pcgr_bundle_version','genome_assembly', 'data_dir')
p$add_argument('output_dir') # 1
p$add_argument('query_vcf2tsv') # 2
p$add_argument('query_cna') # 3
p$add_argument('query_rna_fusion') # 4
p$add_argument('query_rna_expression') # 5
p$add_argument('cpsr_report') # 6
p$add_argument('sample_name') # 7
p$add_argument('pcgr_version') # 8
p$add_argument('pcgr_bundle_version') # 9
p$add_argument('genome_assembly') # 10
p$add_argument('data_dir') # 11
# tumor props
t_props_args <- c('tumor_purity', 'tumor_ploidy', 'tumor_type')
p$add_argument('tumor_purity') # 12
p$add_argument('tumor_ploidy') # 13
p$add_argument('tumor_type') # 14
# assay props
assay_props_args <- c('target_size_mb', 'type')
p$add_argument('target_size_mb', type='double') # 15
p$add_argument('type') # 16
# used to populate 'vcf_tumor_only' and 'mode' in the 'assay_props' list
p$add_argument('vcf_tumor_only1') # 17
p$add_argument('mode1') # 18
# tumor only args
tumor_only_args <- list(
onekg = c('maf_onekg_afr', 'maf_onekg_amr', 'maf_onekg_eas',
'maf_onekg_eur', 'maf_onekg_sas', 'maf_onekg_global'),
gnomad = c('maf_gnomad_afr','maf_gnomad_amr','maf_gnomad_asj',
'maf_gnomad_eas','maf_gnomad_fin','maf_gnomad_nfe',
'maf_gnomad_other','maf_gnomad_sas','maf_gnomad_global'),
filter = c('exclude_pon', 'exclude_likely_het_germline',
'exclude_likely_hom_germline', 'exclude_dbsnp_nonsomatic',
'exclude_nonexonic'))
p$add_argument('maf_onekg_afr', type='double') # 19
p$add_argument('maf_onekg_amr', type='double') # 20
p$add_argument('maf_onekg_eas', type='double') # 21
p$add_argument('maf_onekg_eur', type='double') # 22
p$add_argument('maf_onekg_sas', type='double') # 23
p$add_argument('maf_onekg_global', type='double') # 24
p$add_argument('maf_gnomad_afr', type='double') # 25
p$add_argument('maf_gnomad_amr', type='double') # 26
p$add_argument('maf_gnomad_asj', type='double') # 27
p$add_argument('maf_gnomad_eas', type='double') # 28
p$add_argument('maf_gnomad_fin', type='double') # 29
p$add_argument('maf_gnomad_nfe', type='double') # 30
p$add_argument('maf_gnomad_other', type='double') # 31
p$add_argument('maf_gnomad_sas', type='double') # 32
p$add_argument('maf_gnomad_global', type='double') # 33
p$add_argument('exclude_pon', type='integer') # 34
p$add_argument('exclude_likely_het_germline', type='integer') # 35
p$add_argument('exclude_likely_hom_germline', type='integer') # 36
p$add_argument('exclude_dbsnp_nonsomatic', type='integer') # 37
p$add_argument('exclude_nonexonic', type='integer') # 38
# tmb args
p$add_argument('tmb_run', type='integer') # 39
p$add_argument('tmb_algo') # 40
# msi/msigs args
p$add_argument('msi_run', type='integer') # 41
p$add_argument('msigs_run', type='integer') # 42
p$add_argument('msigs_mut_lim', type='double') # 43
p$add_argument('msigs_all_ref_sigs', type='integer') # 44
p$add_argument('msigs_incl_art_sigs', type='integer') # 45
p$add_argument('msigs_prevalence_ref_sigs', type='integer') # 46
# cna args
cna_args <- c('log_r_homdel', 'log_r_gain', 'cna_overlap_pct')
p$add_argument('log_r_homdel', type='double') # 47
p$add_argument('log_r_gain', type='double') # 48
p$add_argument('min_copies_ampl', type='integer') # 49
p$add_argument('cna_overlap_pct', type='double') # 50
# allelic support args
allelic_support_args <- c(
'tumor_af_min', 'tumor_dp_min',
'control_dp_min', 'control_af_max',
'tumor_af_tag', 'tumor_dp_tag',
'control_af_tag', 'control_dp_tag',
'call_conf_tag')
p$add_argument('tumor_af_min', type='double') # 51
p$add_argument('tumor_dp_min', type='double') # 52
p$add_argument('control_dp_min', type='double') # 53
p$add_argument('control_af_max', type='double') # 54
p$add_argument('tumor_af_tag') # 55
p$add_argument('tumor_dp_tag') # 56
p$add_argument('control_af_tag') # 57
p$add_argument('control_dp_tag') # 58
p$add_argument('call_conf_tag') # 59
# clinicaltrials
p$add_argument('clinicaltrials_run', type='integer') # 60
# other
p$add_argument('vep_n_forks', type='integer') # 61
p$add_argument('vep_buffer_size', type='integer') # 62
p$add_argument('vep_no_intergenic', type='integer') # 63
p$add_argument('vep_pick_order') # 64
p$add_argument('vep_regulatory', type='integer') # 65
p$add_argument('vep_gencode_all', type='integer') # 66
p$add_argument('vcf2maf', type='integer') # 67
p$add_argument('list_noncoding', type='integer') # 68
# retained_info_tags
p$add_argument('retained_info_tags') # 69
# visual
p$add_argument('report_theme') # 70
p$add_argument('nonfloating_toc', type='integer') # 71
# other

args <- p$parse_args()

# main pcgr_config list (processed later)
pcgr_config <- list(
required_args = args[required_args],
t_props = args[t_props_args],
assay_props = args[assay_props_args],
tumor_only = args[unlist(tumor_only_args)],
tmb = list(
run = as.logical(args[['tmb_run']]),
algorithm = args[['tmb_algo']]),
msi = list(run = as.logical(args[['msi_run']])),
msigs = list(
run = as.logical(args[['msigs_run']]),
mutation_limit = args[['msigs_mut_lim']],
all_reference_signatures = as.logical(args[['msigs_all_ref_sigs']]),
include_artefact_signatures = as.logical(args[['msigs_incl_art_sigs']]),
prevalence_reference_signatures = as.integer(args[['msigs_prevalence_ref_sigs']])
),
cna = args[cna_args],
allelic_support = args[allelic_support_args],
clinicaltrials = list(run = as.logical(args[['clinicaltrials_run']])),
vep = list(vep_n_forks = args[['vep_n_forks']],
vep_buffer_size = args[['vep_buffer_size']],
vep_no_intergenic = as.logical(args[['vep_no_intergenic']]),
vep_pick_order = args[['vep_pick_order']],
vep_regulatory = as.logical(args[['vep_regulatory']]),
vep_gencode_all = as.logical(args[['vep_gencode_all']])),
other = list(vcf2maf = as.logical(args[['vcf2maf']]),
list_noncoding = as.logical(args[['list_noncoding']]),
vcf_no_validation = as.logical(args[['vcf_no_validation']])
),
retained_info_tags = args[['retained_info_tags']],
visual = list(report_theme = args[['report_theme']],
nonfloating_toc = as.logical(args[['nonfloating_toc']])
)
)

##---- Argument Processing ----##
# required args
if (pcgr_config[['required_args']][['query_cna']] == "None"){
pcgr_config[['required_args']][['query_cna']] <- NULL
}
if (pcgr_config[['required_args']][['query_rna_fusion']] == "None"){
pcgr_config[['required_args']][['query_rna_fusion']] <- NULL
}
if (pcgr_config[['required_args']][['query_rna_expression']] == "None"){
pcgr_config[['required_args']][['query_rna_expression']] <- NULL
}
if (pcgr_config[['required_args']][['cpsr_report']] == "None"){
pcgr_config[['required_args']][['cpsr_report']] <- NULL
}

# tumor props
# Handle case when 'NA'
purity <- pcgr_config[['t_props']][['tumor_purity']]
ploidy <- pcgr_config[['t_props']][['tumor_ploidy']]
pcgr_config[['t_props']][['tumor_purity']] <-
ifelse(purity == 'NA', NA_real_, as.numeric(purity))
pcgr_config[['t_props']][['tumor_ploidy']] <-
ifelse(ploidy == 'NA', NA_real_, as.numeric(ploidy))
pcgr_config[['t_props']][['tumor_type']] <- stringr::str_replace_all(
stringr::str_replace_all(
args[['tumor_type']], "_", " "),
"@", "/")
if (pcgr_config[['t_props']][['tumor_type']] == "Any") {
pcgr_config[['t_props']][['tumor_type']] <- "Cancer, NOS"
}

### Sequencing assay properties (VCF)
### Target (WES/WGS/TARGETED), mode (tumor-control/tumor-only), coding target size
pcgr_config[['assay_props']][['mode']] <- 'Tumor-Control'
pcgr_config[['assay_props']][['vcf_tumor_only']] <- FALSE

if (as.integer(args[['vcf_tumor_only1']]) == 1) {
pcgr_config[['assay_props']][['vcf_tumor_only']] <- TRUE
pcgr_config[['assay_props']][['mode']] <- 'Tumor-Only'
if (as.integer(args[['mode1']]) == 1){
pcgr_config[['assay_props']][['mode']] <- 'Cell line (Tumor-Only)'
}
}

# tumor_only maf filter
for (mf in tumor_only_args[['filter']]) {
pcgr_config[['tumor_only']][[mf]] <- as.logical(args[[mf]])
}

pcgr_config_rds_fname <- file.path(
pcgr_config[['required_args']][['output_dir']],
paste0(pcgr_config[['required_args']][['sample_name']],
".pcgr_config.rds"))
pcgr_config_json_fname <- file.path(
pcgr_config[['required_args']][['output_dir']],
paste0(pcgr_config[['required_args']][['sample_name']],
".pcgr_config.json"))
pcgr_config_json <- jsonlite::toJSON(pcgr_config)
saveRDS(pcgr_config, file = pcgr_config_rds_fname)
jsonlite::write_json(pcgr_config_json, path = pcgr_config_json_fname)

### Arg processing END

#pcgr_data <- pcgrr::load_reference_data(
# pcgr_db_dir = pcgr_config[['required_args']][['data_dir']],
# genome_assembly = pcgr_config[['required_args']][['genome_assembly']])



# if (pcgr_config[['other']][['vep_regulatory']] == F){
# for (e in c('tier4_display','tier5_display','all','tsv')){
# pcgr_data[['annotation_tags']][[e]] <-
# pcgr_data[['annotation_tags']][[e]][
# pcgr_data[['annotation_tags']][[e]] != "REGULATORY_ANNOTATION"]
# }
# }

# my_log4r_layout <- function(level, ...) {
# paste0(format(Sys.time()), " - pcgr-report-generation - ",
Expand All @@ -244,13 +18,6 @@ jsonlite::write_json(pcgr_config_json, path = pcgr_config_json_fname)
# # this gets passed on to all the log4r_* functions inside the pkg
# options("PCGRR_LOG4R_LOGGER" = log4r_logger)

# ## Clinical trials
# if (pcgr_config[['t_props']][['tumor_type']] == "Cancer, NOS"){
# pcgrr:::log4r_info(paste0("Clinical trials will not be included in the report when primary site is not specified - skipping"))
# pcgr_config[['clinicaltrials']][['run']] <- F
# }

# pcgrr:::log4r_info(paste0("Tumor primary site: ", pcgr_config[['t_props']][['tumor_type']]))

# pcg_report <- NULL

Expand Down

0 comments on commit 3b7e992

Please sign in to comment.