diff --git a/assets/report/report.qmd b/assets/report/report.qmd index 33faf873..ef70594a 100644 --- a/assets/report/report.qmd +++ b/assets/report/report.qmd @@ -15,13 +15,21 @@ params: score_path: "" sampleset: "" run_ancestry: false + reference_panel_name: "" css: logo.css --- +::: {.callout-note} +See the online [documentation](https://pgsc-calc.readthedocs.io/en/latest/output.html#report) for additional +explanation of the terms and data presented in this report. +::: + ```{r setup, echo=FALSE, warning=FALSE, message=FALSE} library(jsonlite) library(dplyr) +library(tidyr) library(ggplot2) +library(data.table) ``` ```{r setup_logs, echo=FALSE} @@ -50,14 +58,10 @@ cat command.txt | fold -w 80 -s | awk -F ' ' 'NR==1 { print "$", $0} NR>1 { prin # Scoring file metadata -::: {.callout-note} -Additional [documentation](https://pgsc-calc.readthedocs.io/en/latest/output.html#report) is available that explains some of the terms used this report in more detail -::: - ## Scoring file summary ```{r load_scorefiles} -json_scorefiles <- read_json(params$log_scorefiles, simplifyVector=TRUE) +json_scorefiles <- read_json(params$log_scorefiles, simplifyVector=TRUE) link_traits <- function(trait_efo, mapped) { if (length(trait_efo) == 0) { @@ -159,6 +163,23 @@ DT::datatable( cat params.txt ``` +```{asis, echo = params$run_ancestry} +## Reference matching summary +``` + +```{r, echo = FALSE, message = FALSE, eval=params$run_ancestry} +intersect_stats <- scan("intersect_counts.txt", sep="\n", what=integer()) +n_target <- intersect_stats[1] +n_ref <- intersect_stats[2] +n_match <- intersect_stats[3] + +tribble( + ~'reference', ~'n target', ~'N (matched)', ~'N variants in panel', ~'% matched', + params$reference_panel_name, n_target, n_match, n_ref, n_match / n_ref +) %>% + DT::datatable() +``` + ## Summary ```{r setup_matches} @@ -207,34 +228,64 @@ compat %>% :::{.column-body-outset} ```{r match_table_detailed, echo = FALSE, warning=FALSE} -log_df %>% - group_by(sampleset, accession) %>% - count(match_status, ambiguous, is_multiallelic, duplicate_best_match, duplicate_ID, wt = count) %>% - rename(is_ambiguous = ambiguous) %>% - mutate(percent = round(n / sum(n) * 100, 2), - match_status = forcats::fct_relevel(match_status, "matched", "excluded", "unmatched")) %>% - arrange(accession, match_status) %>% - mutate(accession = stringr::str_replace_all(accession, "_", " ")) %>% - DT::datatable(rownames=FALSE, - extensions = 'Buttons', - options = list(dom = 'Bfrtip', - buttons = c('csv')), - colnames = c( - "Sampleset" = "sampleset", - "Scoring file" = "accession", - "Match type" = "match_status", - "Multiple potential matches" = "duplicate_best_match", - "Duplicated matched variants" = "duplicate_ID", - "Ambiguous" = "is_ambiguous", - "Multiallelic" = "is_multiallelic", - "%" = "percent" - )) +if(params$run_ancestry == TRUE){ + # Include match_IDs in the groupby to account for + log_df %>% + group_by(sampleset, accession) %>% + count(match_status, match_IDs, ambiguous, is_multiallelic, match_flipped, duplicate_best_match, duplicate_ID, wt = count) %>% + rename(is_ambiguous = ambiguous) %>% + mutate(percent = round(n / sum(n) * 100, 2), + match_status = forcats::fct_relevel(match_status, "matched", "excluded", "unmatched")) %>% + arrange(accession, match_status) %>% + mutate(accession = stringr::str_replace_all(accession, "_", " ")) %>% + DT::datatable(rownames=FALSE, + extensions = 'Buttons', + options = list(dom = 'Bfrtip', + buttons = c('csv')), + colnames = c( + "Sampleset" = "sampleset", + "Scoring file" = "accession", + "Match type" = "match_status", + "Variant in reference panel" = "match_IDs", + "Multiple potential matches" = "duplicate_best_match", + "Duplicated matched variants" = "duplicate_ID", + "Ambiguous" = "is_ambiguous", + "Multiallelic" = "is_multiallelic", + "Matches strand flip" = "match_flipped", + "%" = "percent" + )) +} else{ + log_df %>% + group_by(sampleset, accession) %>% + count(match_status, ambiguous, is_multiallelic,match_flipped, duplicate_best_match, duplicate_ID, wt = count) %>% + rename(is_ambiguous = ambiguous) %>% + mutate(percent = round(n / sum(n) * 100, 2), + match_status = forcats::fct_relevel(match_status, "matched", "excluded", "unmatched")) %>% + arrange(accession, match_status) %>% + mutate(accession = stringr::str_replace_all(accession, "_", " ")) %>% + DT::datatable(rownames=FALSE, + extensions = 'Buttons', + options = list(dom = 'Bfrtip', + buttons = c('csv')), + colnames = c( + "Sampleset" = "sampleset", + "Scoring file" = "accession", + "Match type" = "match_status", + "Multiple potential matches" = "duplicate_best_match", + "Duplicated matched variants" = "duplicate_ID", + "Ambiguous" = "is_ambiguous", + "Multiallelic" = "is_multiallelic", + "Matches strand flip" = "match_flipped", + "%" = "percent" + )) +} ``` ::: - +```{asis, echo = params$run_ancestry} # Genetic Ancestry +``` ```{r colour_palette, echo = FALSE, eval=params$run_ancestry} # source: https://github.com/PGScatalog/PGS_Catalog/blob/master/catalog/static/catalog/pgs.scss#L2493-L2520 @@ -244,27 +295,38 @@ thousand_genomes_colours <- c("#FFD900", "#E41A1C", "#B15928", "#4DAF4A", "#FF7F00", "#BBB", "#999") names(thousand_genomes_colours) <- c("AFR", "AMR", "ASN", "EAS", "EUR", "GME", "SAS", "MAE", "MAO", "NR", "OTH") -thousand_genomes_palette <- scale_colour_manual(name = "Most Similar Population", values = thousand_genomes_colours) +thousand_genomes_palette <- scale_colour_manual(name = "Populations", values = thousand_genomes_colours) ``` ```{r, echo = FALSE, message = FALSE, eval=params$run_ancestry} popsim <- readr::read_tsv(gsub("_pgs.", "_popsimilarity.", params$score_path)) head(popsim) +new_label_target = paste({params$sampleset}, '(Most Similar Population)') +new_label_ref = paste0('reference (', {params$reference_panel_name}, ' populations)') + +popsim$slabel <- new_label_ref +popsim[popsim$sampleset == {params$sampleset}, 'slabel'] <- new_label_target + +map_shapes <- c(1, 16) +map_shapes <- setNames(map_shapes, c(new_label_target, new_label_ref)) + # Placeholder figure: needs better legend labelling for(pc in seq.int(1,5,2)){ pcY = paste('PC', pc, sep='') pcX = paste('PC', pc+1, sep='') if (pcX %in% colnames(popsim)){ - p_pca <- ggplot(popsim[popsim$REFERENCE == TRUE,], aes(x=!!sym(pcX), y=!!sym(pcY))) + geom_point(aes(colour=SuperPop), alpha=0.25) - p_pca <- p_pca + geom_point(data=popsim[popsim$REFERENCE != TRUE,], aes(color=MostSimilarPop), shape=1) - p_pca <- p_pca + theme_bw() + thousand_genomes_palette + p_pca <- ggplot(popsim[popsim$REFERENCE == TRUE,], aes(x=!!sym(pcX), y=!!sym(pcY))) + geom_point(aes(colour=SuperPop, shape=slabel), alpha=0.25) + p_pca <- p_pca + geom_point(data=popsim[popsim$REFERENCE != TRUE,], aes(color=MostSimilarPop, shape=slabel)) + p_pca <- p_pca + theme_bw() + thousand_genomes_palette + scale_shape_manual(values=map_shapes, name='sampleset') print(p_pca) } } ``` +```{asis, echo = params$run_ancestry} ## Population similarity summary +``` ```{r, echo = FALSE, message = FALSE, eval=params$run_ancestry} popsim %>% @@ -292,9 +354,22 @@ pop_summary %>% # Scores ```{r, echo = FALSE, message = FALSE, eval=!params$run_ancestry} -scores <- readr::read_tsv(params$score_path) +# problem: aggregated_scores.txt.gz has a different structure to the ancestry outputs +# solution: pivot the table in the report, but fix in pgscatalog_utils in a future release +# problem: big scores can take a lot of memory +# use data.table as a temporary solution to pivoting datasets +scores <- data.table::fread(params$score_path) n_scores <- sum(grepl("*_SUM$", colnames(scores))) n_samples <- nrow(scores) + +id_cols <- c("sampleset", "IID") +melted_scores <- data.table::melt(scores, id.vars=id_cols, measure.vars=patterns(SUM="*_SUM$", DENOM="DENOM", AVG="*_AVG$"), variable.name = "PGS") + +# annoying fix for scores getting converted to a factor level integer +# fixed in data.table 1.14.9 but not released yet +score_names <- sub("_.*", "", colnames(scores)[grepl("_SUM$", colnames(scores))]) +setattr(melted_scores$PGS, "levels", score_names) +data.table::fwrite(melted_scores, params$score_path, sep="\t", compress="gzip") ``` ```{r, echo = FALSE, message = FALSE, eval=params$run_ancestry} @@ -338,41 +413,36 @@ In the future mean-imputation will be supported in small samplesets using ancest #### Score extract ::: {.callout-note} -Below is a summary of the aggregated scores, which might be useful for debugging. +Below is a summary of the aggregated scores, which might be useful for debugging. See here for an explanation of [plink2](https://www.cog-genomics.org/plink/2.0/formats#sscore) column names ::: ```{r, echo = FALSE} scores %>% - select(sampleset, IID, ends_with("SUM")) %>% tibble::as_tibble(.) ``` -::: {.callout-note} -See here for an explanation of [plink2](https://www.cog-genomics.org/plink/2.0/formats#sscore) column names -::: - -#### Density plot +#### Density plot(s) ::: {.callout-note} The summary density plots show up to six scoring files ::: ```{r density_ancestry, echo=FALSE, message=FALSE, warning=FALSE, eval=params$run_ancestry} -scores %>% - ungroup() %>% - select(IID, sampleset, PGS, SUM) %>% - group_by(sampleset, PGS) -> long_scores - -group_keys(long_scores) %>% - slice(1:6) -> score_keys - -long_scores %>% - inner_join(score_keys) %>% - ggplot(aes(x = SUM, fill = sampleset)) + - geom_density(alpha = 0.3) + - facet_wrap(~PGS, ncol = 2, scales = "free") + - theme_bw() + - labs(x = "PGS (SUM)", y = "Density", title = "PGS Distribution(s)") +# Select which PGS to plot +uscores <- unique(scores$PGS) +uscores_plot <- uscores[1:min(length(uscores), 6)] # plot max 6 PGS + +# Plot multiple adjustment methods at once per PGS +for(current_pgs in uscores_plot){ + long_scores <- scores %>% select(!percentile_MostSimilarPop) %>% filter(PGS == current_pgs) %>% gather(Method, score, -sampleset, -IID, -PGS) + long_scores %>% + ggplot(aes(x = score, fill = sampleset)) + + geom_density(alpha = 0.3) + + facet_wrap(~Method, scales = "free", nrow=1) + + theme_bw() + + labs(x = 'PGS', y = "Density", title = paste(current_pgs, '(adjusted distributions)')) -> p_dist + print(p_dist) +} ``` ```{r, echo = FALSE, message=FALSE, warning=FALSE, eval=!params$run_ancestry} diff --git a/bin/intersect_variants.sh b/bin/intersect_variants.sh index f4162b20..e8a0de74 100755 --- a/bin/intersect_variants.sh +++ b/bin/intersect_variants.sh @@ -77,6 +77,10 @@ fi join ref_variants.txt target_variants.txt |\ awk '{if (NR==1) print $0, "SAME_REF"; else print $0, ($3 == $8)}' > matched_variants.txt +wc -l < target_variants.txt | awk '{ print $0-1}' > intersect_counts.txt +wc -l < ref_variants.txt | awk '{ print $0-1}' >> intersect_counts.txt +wc -l < matched_variants.txt | awk '{ print $0-1}' >> intersect_counts.txt + # Current output columns are: #1 : CHR:POS:A0:A1 #2 : ID_REF diff --git a/conf/modules.config b/conf/modules.config index 20ee998c..7e3d8f80 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -76,12 +76,11 @@ process { } withLabel: fraposa { - // TODO: publish fraposa somewhere -// ext.conda = "$projectDir/environments/report/environment.yml" - ext.singularity = 'oras://dockerhub.ebi.ac.uk/gdp-public/fraposa_pgsc/singularity/pgsc_fraposa' - ext.singularity_version = ':dev' - ext.docker = 'dockerhub.ebi.ac.uk/gdp-public/fraposa_pgsc/pgsc_fraposa' - ext.docker_version = ':dev' + ext.conda = "$projectDir/environments/fraposa/environment.yml" + ext.singularity = 'oras://ghcr.io/pgscatalog/fraposa_pgsc' + ext.singularity_version = ':v0.1.0-singularity' + ext.docker = 'ghcr.io/pgscatalog/fraposa_pgsc' + ext.docker_version = ':v0.1.0' } // output configuration diff --git a/environments/fraposa/environment.yml b/environments/fraposa/environment.yml new file mode 100644 index 00000000..c0e641d9 --- /dev/null +++ b/environments/fraposa/environment.yml @@ -0,0 +1,6 @@ +name: fraposa +dependencies: +- python=3.10 +- pip +- pip: + - fraposa-pgsc==0.1.0 diff --git a/environments/report/Dockerfile b/environments/report/Dockerfile index e3c48fed..94693901 100644 --- a/environments/report/Dockerfile +++ b/environments/report/Dockerfile @@ -1,22 +1,16 @@ -FROM r-base:4.3.0 AS quarto_arm64 +ARG QUARTO_VERSION=1.3.433 -ONBUILD RUN apt-get update && \ - apt-get install -y gdebi wget +FROM r-base:4.3.0 -ONBUILD RUN wget https://github.com/quarto-dev/quarto-cli/releases/download/v1.3.302/quarto-1.3.302-linux-arm64.deb && \ - gdebi --non-interactive quarto-1.3.302-linux-arm64.deb && \ - rm quarto-1.3.302-linux-arm64.deb +ARG QUARTO_VERSION +ARG TARGETARCH -FROM r-base:4.3.0 AS quarto_amd64 - -ONBUILD RUN apt-get update && \ - apt-get install -y gdebi wget - -ONBUILD RUN wget https://github.com/quarto-dev/quarto-cli/releases/download/v1.3.302/quarto-1.3.302-linux-amd64.deb && \ - gdebi --non-interactive quarto-1.3.302-linux-amd64.deb && \ - rm quarto-1.3.302-linux-amd64.deb +RUN wget https://github.com/quarto-dev/quarto-cli/releases/download/v${QUARTO_VERSION}/quarto-${QUARTO_VERSION}-linux-${TARGETARCH}.deb && \ + dpkg -i quarto-${QUARTO_VERSION}-linux-${TARGETARCH}.deb && \ + rm quarto-${QUARTO_VERSION}-linux-${TARGETARCH}.deb -FROM quarto_${TARGETARCH} - -RUN R -e 'install.packages(c("DT", "purrr", "jsonlite", "dplyr", "ggplot2", "tidyr", "forcats", "readr"), clean=TRUE)' +RUN R -e 'install.packages(c("DT", "data.table", "purrr", "jsonlite", "dplyr", "ggplot2", "tidyr", "forcats", "readr", "R.utils"), clean=TRUE)' +RUN apt-get update \ + && apt-get install -y procps \ + && rm -rf /var/lib/apt/lists/* \ No newline at end of file diff --git a/environments/report/environment.yml b/environments/report/environment.yml index 25caf451..1d067810 100644 --- a/environments/report/environment.yml +++ b/environments/report/environment.yml @@ -11,3 +11,4 @@ dependencies: - r-readr - r-purrr - r-jsonlite + - r-datatable diff --git a/modules/local/ancestry/intersect_variants.nf b/modules/local/ancestry/intersect_variants.nf index 42d590c8..0b1999a1 100644 --- a/modules/local/ancestry/intersect_variants.nf +++ b/modules/local/ancestry/intersect_variants.nf @@ -19,6 +19,7 @@ process INTERSECT_VARIANTS { output: tuple val(id), path("${meta.id}_${meta.chrom}_matched.txt.gz"), emit: intersection + path "intersect_counts.txt", emit: intersect_count path "versions.yml", emit: versions script: @@ -36,7 +37,7 @@ process INTERSECT_VARIANTS { exit 1 else mv matched_variants.txt ${meta.id}_${meta.chrom}_matched.txt - gzip *.txt + gzip *_variants.txt *_matched.txt fi cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/score_report.nf b/modules/local/score_report.nf index d2015950..2250fb4c 100644 --- a/modules/local/score_report.nf +++ b/modules/local/score_report.nf @@ -14,6 +14,8 @@ process SCORE_REPORT { input: tuple val(meta), path(scorefile), path(score_log), path(match_summary), path(ancestry) + path intersect_count + val reference_panel_name output: // includeInputs to correctly use $meta.id in publishDir path @@ -42,7 +44,8 @@ process SCORE_REPORT { quarto render report.qmd -M "self-contained:true" \ -P score_path:$scorefile \ -P sampleset:$meta.id \ - -P run_ancestry:$run_ancestry + -P run_ancestry:$run_ancestry \ + -P reference_panel_name:$reference_panel_name cat <<-END_VERSIONS > versions.yml ${task.process.tokenize(':').last()}: diff --git a/subworkflows/local/ancestry/ancestry_project.nf b/subworkflows/local/ancestry/ancestry_project.nf index b2b692e1..9a4f99d5 100644 --- a/subworkflows/local/ancestry/ancestry_project.nf +++ b/subworkflows/local/ancestry/ancestry_project.nf @@ -259,6 +259,7 @@ workflow ANCESTRY_PROJECT { emit: intersection = INTERSECT_VARIANTS.out.intersection + intersect_count = INTERSECT_VARIANTS.out.intersect_count projections = ch_projections.combine(ch_ref_projections) ref_geno = ch_ref.geno ref_pheno = ch_ref.pheno diff --git a/subworkflows/local/report.nf b/subworkflows/local/report.nf index 41ac35a2..9bce0c96 100644 --- a/subworkflows/local/report.nf +++ b/subworkflows/local/report.nf @@ -10,6 +10,7 @@ workflow REPORT { log_scorefiles log_match run_ancestry_assign // bool + intersect_count main: ch_versions = Channel.empty() @@ -30,6 +31,10 @@ workflow REPORT { // ch_scores keeps calculated score file names consistent with --skip_ancestry ch_scores = Channel.empty() + // used in report rendering + // (theoretical support for custom reference panels) + reference_panel_name = 'NO_PANEL' + if (run_ancestry_assign) { scores .combine(ref_relatedness) @@ -50,6 +55,9 @@ workflow REPORT { .map { annotate_sampleset(it) } .groupTuple(size: 2) + // an unpleasant method of grabbing reference panel name + reference_panel_name = ch_ancestry_input.map{ it.tail().last().getBaseName().tokenize('_')[1] } + // ancestry_analysis: aggregated_scores.txt.gz -> {sampleset}_pgs.txt.gz ch_scores = ch_scores.mix(ANCESTRY_ANALYSIS.out.pgs.map{annotate_sampleset(it)}) ch_versions = ch_versions.mix(ANCESTRY_ANALYSIS.out.versions) @@ -74,7 +82,7 @@ workflow REPORT { .combine(log_scorefiles) // all samplesets have the same scorefile metadata .set { ch_report_input } - SCORE_REPORT( ch_report_input ) + SCORE_REPORT( ch_report_input, intersect_count, reference_panel_name ) ch_versions = ch_versions.mix(SCORE_REPORT.out.versions) // if this workflow runs, the report must be written diff --git a/workflows/pgscalc.nf b/workflows/pgscalc.nf index 74bf5935..60304a07 100644 --- a/workflows/pgscalc.nf +++ b/workflows/pgscalc.nf @@ -260,6 +260,8 @@ workflow PGSCALC { // reference allelic frequencies are optional inputs to scoring subworkflow ref_afreq = Channel.fromPath(file('NO_FILE')) + intersect_count = Channel.fromPath(file('NO_FILE_INTERSECT_COUNT')) + if (run_ancestry_assign) { intersection = Channel.empty() ref_geno = Channel.empty() @@ -279,6 +281,8 @@ workflow PGSCALC { ref_geno = ref_geno.mix(ANCESTRY_PROJECT.out.ref_geno) ref_pheno = ref_pheno.mix(ANCESTRY_PROJECT.out.ref_pheno) ref_var = ref_var.mix(ANCESTRY_PROJECT.out.ref_var) + intersect_count = ANCESTRY_PROJECT.out.intersect_count + if (params.load_afreq) { ref_afreq = ANCESTRY_PROJECT.out.ref_afreq } @@ -369,7 +373,8 @@ workflow PGSCALC { projections, INPUT_CHECK.out.log_scorefiles, MATCH.out.db, - run_ancestry_assign + run_ancestry_assign, + intersect_count ) }