Skip to content

Commit

Permalink
improve signature analysis for primary sites with no reference signat…
Browse files Browse the repository at this point in the history
…ures - new refbundle
  • Loading branch information
sigven committed Apr 27, 2024
1 parent c831770 commit 43277f4
Show file tree
Hide file tree
Showing 11 changed files with 109 additions and 83 deletions.
6 changes: 6 additions & 0 deletions pcgr/arg_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,12 @@ def verify_args(arg_dict):
err_msg = f"Minimum number of mutations required for mutational signature analysis ('--min_mutations_signatures' = {arg_dict['min_mutations_signatures']}) must be >= 100"
error_message(err_msg, logger)

# Validate the minimum tumor-type prevalence (in percent) of reference signatures.
# The bounds mirror the assertion in pcgrr::get_prevalent_site_signatures()
# (0.1 <= min_prevalence_pct <= 20), so an out-of-range value is rejected here
# with a clear message instead of failing later in the R step.
prevalence_reference_signatures = float(arg_dict['prevalence_reference_signatures'])
if not 0.1 <= prevalence_reference_signatures <= 20:
    # Adjacent string literals are concatenated into ONE string; separating them
    # with a comma would build a tuple and garble the logged message.
    err_msg = (
        "Prevalence of reference signatures must be between 0.1 and 20 "
        f"('--prevalence_reference_signatures' = {arg_dict['prevalence_reference_signatures']})"
    )
    error_message(err_msg, logger)

# if MSI status is to be estimated, mutational burden must be turned on
if arg_dict['estimate_msi'] is True and arg_dict['estimate_tmb'] is False:
err_msg = "Prediction of MSI status ('--estimate_msi') requires mutational burden analysis ('--estimate_tmb')"
Expand Down
6 changes: 2 additions & 4 deletions pcgr/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def create_config(arg_dict, workflow = "PCGR"):
'mutation_limit': int(arg_dict['min_mutations_signatures']),
'all_reference_signatures': int(arg_dict['all_reference_signatures']),
'include_artefact_signatures': int(arg_dict['include_artefact_signatures']),
'prevalence_reference_signatures': int(arg_dict['prevalence_reference_signatures'])
'prevalence_reference_signatures': float(arg_dict['prevalence_reference_signatures'])
}


Expand All @@ -139,9 +139,7 @@ def create_config(arg_dict, workflow = "PCGR"):
#conf_options['molecular_data']['fname_expression_csq_tsv'] = "None"
conf_options['molecular_data']['fname_expression_similarity_tsv'] = "None"
conf_options['molecular_data']['fname_tmb_tsv'] = "None"
#for source in ['tcga','treehouse','depmap']:
# conf_options['molecular_data']['fname_expression_sim_' + source] = "None"



if workflow == "CPSR":
conf_options['sample_properties']['phenotype'] = 'None'
Expand Down
2 changes: 1 addition & 1 deletion pcgr/expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def parse_expression(expression_fname_tsv: str,
sample_gene_expression.sort_values(by=['ID','TPM'], ascending=[True, False], inplace = True)
dup_ids = len(sample_gene_expression['ID']) - len(sample_gene_expression['ID'].drop_duplicates())
if dup_ids > 0:
logger.warn(f"Found N = {dup_ids} duplicate transcript identifiers - resolving duplicates by keeping the highest TPM value")
# Logger.warn() is a deprecated alias; warning() is the supported method.
logger.warning(f"Found N = {dup_ids} duplicate identifiers - resolving duplicates by keeping the highest TPM value")
sample_gene_expression = sample_gene_expression.drop_duplicates(subset = ['ID'])

## Read the gene identifier index - maps transcript identifiers (Ensembl/Refseq),
Expand Down
2 changes: 1 addition & 1 deletion pcgr/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def cli():
optional_signatures.add_argument("--min_mutations_signatures", type=int, default=200, dest="min_mutations_signatures", help="Minimum number of SNVs required for re-fitting of mutational signatures (SBS) (default: %(default)s, minimum n = 100)")
optional_signatures.add_argument("--all_reference_signatures", action="store_true", help="Use _all_ reference mutational signatures (SBS) during signature re-fitting rather than only those already attributed to the tumor type (default: %(default)s)")
optional_signatures.add_argument("--include_artefact_signatures", action="store_true", help="Include sequencing artefacts in the collection of reference signatures (default: %(default)s)")
optional_signatures.add_argument("--prevalence_reference_signatures", type=int, default=1, choices=[1,2,5,10,15,20], help="Minimum tumor-type prevalence (in percent) of reference signatures to be included in refitting procedure (default: %(default)s)")
optional_signatures.add_argument("--prevalence_reference_signatures", type=float, default=0.1, help="Minimum tumor-type prevalence (in percent) of reference signatures to be included in refitting procedure (default: %(default)s)")

optional_cna.add_argument("--input_cna", dest="input_cna", help="Somatic copy number alteration segments (tab-separated values)")
optional_cna.add_argument("--n_copy_gain", type=int, default=6, dest="n_copy_gain", help="Minimum number of total copy number for segments considered as gains/amplifications (default: %(default)s)")
Expand Down
2 changes: 1 addition & 1 deletion pcgr/pcgr_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pcgr._version import __version__

PCGR_VERSION = __version__
DB_VERSION = '20240412'
DB_VERSION = '20240426'

## MISCELLANEOUS
NCBI_BUILD_MAF = 'GRCh38'
Expand Down
95 changes: 40 additions & 55 deletions pcgrr/R/mutational_signatures.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@ generate_report_data_signatures <-
pcg_report_signatures <-
pcgrr::init_m_signature_content()

if(!is.null(variant_set) & !is.null(vstats) & !is.null(ref_data) & !is.null(settings)){
if(!is.null(variant_set) &
!is.null(vstats) &
!is.null(ref_data) &
!is.null(settings)){
pcgrr::log4r_info("------")
pcgrr::log4r_info("Identifying mutational signatures")
}else{
Expand Down Expand Up @@ -73,21 +76,33 @@ generate_report_data_signatures <-
vcf_fname = vcf_name_mutsig_analysis,
snv_only = F)

#pcg_report_signatures <-
# pcgrr::init_m_signature_content()

fit_signatures_to_ttype <- !as.logical(
sig_settings$all_reference_signatures
)

## Retrieve relevant signatures for the tumor in question


sites_with_sig_prevalence <-
unique(ref_data$misc$mutational_signature[["PRIMARY_SITE"]])

site_has_prevalence_data <- T
if(!(settings$conf$sample_properties$site %in%
sites_with_sig_prevalence)){
site_has_prevalence_data <- F
pcgrr::log4r_warn(
paste0("No signature prevalence data available for site '",
settings$conf$sample_properties$site,
"' - considering all signatures for analysis"))
}

prevalent_site_signatures <- NULL
if (fit_signatures_to_ttype == T) {
if (fit_signatures_to_ttype == T & site_has_prevalence_data == T) {
prevalent_site_signatures <-
pcgrr::get_prevalent_site_signatures(
site = settings$conf$sample_properties$site,
min_prevalence_pct =
sig_settings$prevalence_reference_signatures,
as.numeric(sig_settings$prevalence_reference_signatures),
ref_data = ref_data,
incl_poss_artifacts =
sig_settings$include_artefact_signatures)
Expand All @@ -96,7 +111,7 @@ generate_report_data_signatures <-
pcgrr::get_prevalent_site_signatures(
site = "Any",
min_prevalence_pct =
sig_settings$prevalence_reference_signatures,
as.numeric(sig_settings$prevalence_reference_signatures),
ref_data = ref_data,
incl_poss_artifacts =
sig_settings$include_artefact_signatures)
Expand Down Expand Up @@ -218,14 +233,16 @@ generate_report_data_signatures <-
) |>
#dplyr::mutate(SITE_SPECIFIC = "NOT_DEFINED") |>
dplyr::mutate(SITE_SPECIFIC = dplyr::if_else(
as.logical(fit_signatures_to_ttype) == TRUE,
as.logical(fit_signatures_to_ttype) == TRUE &
site_has_prevalence_data == TRUE,
"NO",
as.character("NOT_DEFINED")
)) |>
dplyr::mutate(SITE_SPECIFIC = dplyr::case_when(
SIGNATURE_ID %in%
unique(prevalent_site_signatures$aetiology$SIGNATURE_ID)
& as.logical(fit_signatures_to_ttype) == TRUE ~ "YES",
unique(prevalent_site_signatures$aetiology$SIGNATURE_ID) &
as.logical(fit_signatures_to_ttype) == TRUE &
as.logical(site_has_prevalence_data) == TRUE ~ "YES",
TRUE ~ as.character(SITE_SPECIFIC)))
}
}
Expand Down Expand Up @@ -454,6 +471,8 @@ generate_report_data_signatures <-
pcg_report_signatures[["result"]][["chromosomes"]] <- chromosomes
pcg_report_signatures[["result"]][["contributions"]] <- contributions
pcg_report_signatures[["result"]][["tsv"]] <- tsv_data
pcg_report_signatures[["result"]][["no_site_prevalence"]] <-
!site_has_prevalence_data
pcg_report_signatures[["result"]][["reference_data"]] <-
prevalent_site_signatures$aetiology
pcg_report_signatures[["result"]][["scale_fill_values"]] <- color_vec
Expand Down Expand Up @@ -498,7 +517,7 @@ get_prevalent_site_signatures <-
function(site = "Any",
custom_collection = NULL,
ref_data = NULL,
min_prevalence_pct = 5,
min_prevalence_pct = 0.1,
incl_poss_artifacts = T) {

cosmic_metadata <-
Expand Down Expand Up @@ -536,13 +555,9 @@ get_prevalent_site_signatures <-
msg = "Reference aetiologies must be of type data.frame()"))
invisible(
assertthat::assert_that(
min_prevalence_pct == 1 |
min_prevalence_pct == 2 |
min_prevalence_pct == 5 |
min_prevalence_pct == 10 |
min_prevalence_pct == 15 |
min_prevalence_pct == 20,
msg = "Argument 'min_prevalence_pct' must be any of '0, 2, 5, 10, 15 or 20'"))
min_prevalence_pct >= 0.1 &
min_prevalence_pct <= 20,
msg = "Argument 'min_prevalence_pct' must be more than 0.1 and less than 20"))

valid_signature_ids <-
unique(ref_data$misc$mutational_signature$SIGNATURE_ID)
Expand Down Expand Up @@ -587,11 +602,9 @@ get_prevalent_site_signatures <-

unique_sites_with_signature_prevalence <-
unique(ref_data$misc$mutational_signature[["PRIMARY_SITE"]])

## No primary site defined - 'Any'
if (!(site %in% unique_sites_with_signature_prevalence)) {
pcgrr::log4r_info(
paste0("Primary tumor site '", site, "' ",
"does not have any signatures with significant ",
"prevalence - considering all"))
signatures_prevalence <-
ref_data$misc$mutational_signature |>
dplyr::select(c("SIGNATURE_ID",
Expand All @@ -607,49 +620,21 @@ get_prevalent_site_signatures <-
dplyr::select(.data$SIGNATURE_ID,
.data$PRIMARY_SITE,
.data$PREVALENCE_PCT,
.data$PREVALENCE_ABOVE_5PCT,
.data$PREVALENCE_ABOVE_10PCT,
.data$PREVALENCE_ABOVE_15PCT,
.data$PREVALENCE_ABOVE_20PCT,
.data$AETIOLOGY_KEYWORD,
.data$AETIOLOGY,
.data$ASSOCIATED_SIGNATURES,
.data$COMMENTS) |>
dplyr::distinct()

if (min_prevalence_pct > 0) {
if (min_prevalence_pct == 5) {
signatures_prevalence <- signatures_prevalence |>
dplyr::filter(.data$PREVALENCE_ABOVE_5PCT == T |
is.na(.data$PREVALENCE_ABOVE_5PCT))
}else if (min_prevalence_pct == 10) {
signatures_prevalence <- signatures_prevalence |>
dplyr::filter(.data$PREVALENCE_ABOVE_10PCT == T |
is.na(.data$PREVALENCE_ABOVE_10PCT))
}
else if (min_prevalence_pct == 15) {
signatures_prevalence <- signatures_prevalence |>
dplyr::filter(.data$PREVALENCE_ABOVE_15PCT == T |
is.na(.data$PREVALENCE_ABOVE_15PCT))
}else if (min_prevalence_pct == 20) {
signatures_prevalence <- signatures_prevalence |>
dplyr::filter(.data$PREVALENCE_ABOVE_20PCT == T |
is.na(.data$PREVALENCE_ABOVE_20PCT))
}else if (min_prevalence_pct == 2 | min_prevalence_pct == 1) {
signatures_prevalence <- signatures_prevalence |>
dplyr::filter(!is.na(.data$PREVALENCE_PCT)) |>
dplyr::filter(.data$PREVALENCE_PCT >= min_prevalence_pct)
}
if (min_prevalence_pct > 0.1) {
signatures_prevalence <- signatures_prevalence |>
dplyr::filter(!is.na(.data$PREVALENCE_PCT)) |>
dplyr::filter(.data$PREVALENCE_PCT >= min_prevalence_pct)
}
signatures_prevalence <- signatures_prevalence |>
dplyr::select(-c(.data$PRIMARY_SITE,
.data$PREVALENCE_ABOVE_5PCT,
.data$PREVALENCE_ABOVE_10PCT,
.data$PREVALENCE_ABOVE_15PCT,
.data$PREVALENCE_ABOVE_20PCT)) |>
dplyr::distinct() |>
dplyr::arrange(dplyr::desc(.data$PREVALENCE_PCT)) |>
dplyr::select(-.data$PREVALENCE_PCT)
dplyr::select(-c("PREVALENCE_PCT","PRIMARY_SITE"))
}
}

Expand Down
1 change: 1 addition & 0 deletions pcgrr/R/report.R
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,7 @@ init_m_signature_content <- function() {
rep[["result"]][["vr"]] <- NULL
rep[["result"]][["mut_mat"]] <- NULL
rep[["result"]][["chromosomes"]] <- NULL
rep[["result"]][["no_site_prevalence"]] <- FALSE
rep[["result"]][["tsv"]] <- data.frame()
rep[["result"]][["contributions"]] <- list()
rep[["result"]][["contributions"]][["per_signature"]] <- data.frame()
Expand Down
2 changes: 2 additions & 0 deletions pcgrr/inst/templates/pcgr_quarto.css
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
/*height: 500px;*/
}



.value_box {
width: 250px;
height: 150px;
Expand Down
20 changes: 18 additions & 2 deletions pcgrr/inst/templates/pcgr_quarto_report/mutational_signature.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ Here, we apply the **MutationalPatterns** package [@Blokzijl2018-nc] to estimate
msig_content <-
pcg_report$content$mutational_signatures
no_site_prevalence <-
msig_content$result$no_site_prevalence
missing_data_no_refit <- F
if(msig_content$eval == F |
msig_content$missing_data == T){
Expand Down Expand Up @@ -45,7 +48,7 @@ cat("\n::: {.callout-warning}\n## Limited variant data - ",

```{r t1}
#| output: asis
#| eval: !expr missing_data_no_refit == FALSE & as.logical(msig_conf$all_reference_signatures) == FALSE
#| eval: !expr missing_data_no_refit == FALSE & as.logical(msig_conf$all_reference_signatures) == FALSE & as.logical(no_site_prevalence) == FALSE
cat('For <b>', conf$sample_properties$site,'</b> cancers, mutational signature identification uses the following reference collection of known signatures for the <i>refitting procedure</i> (i.e. previously attributed to the same type of cancer): <br>',sep="")
Expand Down Expand Up @@ -107,13 +110,26 @@ if("SIGNATURE_ID" %in% colnames(dat) &

```{r t3}
#| output: asis
#| eval: !expr missing_data_no_refit == FALSE & as.logical(msig_conf$all_reference_signatures) == TRUE
#| eval: !expr missing_data_no_refit == FALSE & (as.logical(no_site_prevalence) == FALSE & as.logical(msig_conf$all_reference_signatures) == TRUE)
cat("For this analysis, '--all_reference_signatures' have been turned on, which means that all reference signatures (n = 67) have been considered during refitting of the mutational profile with reference signatures",sep="\n")
htmltools::br()
```

```{r t4}
#| output: asis
#| eval: !expr missing_data_no_refit == FALSE & as.logical(no_site_prevalence) == TRUE

## Rendered only when the primary site has no signature prevalence data.
## The condition must not also fire when the site HAS prevalence data and
## '--all_reference_signatures' is off; in that case the site-specific
## text is shown instead, and this message would contradict it.
cat(paste0(
"The primary site of the input sample, '<b>",
conf$sample_properties$site,
"</b>', is not yet attributed with specific reference signatures, which means that all reference signatures (n = 67) have been considered during refitting of the mutational profile with reference signatures"),sep="\n")
htmltools::br()
```



```{r t5}
#| output: asis
#| eval: !expr missing_data_no_refit == FALSE
Expand Down
Loading

0 comments on commit 43277f4

Please sign in to comment.