Skip to content

Commit

Permalink
Report edits for v2 (#128)
Browse files Browse the repository at this point in the history
* fix action

* Label sampleset on PCA plots in report.qmd

* add reference panel name

* Plot distributions for multiple adjustment methods

* fix action o_O

* Small edits for clarity/simplicity

* More informative legends for PCA plots

* Move pointer to online documentation closer to the header.

* Add in match_flipped (Matches strand flip) to variant log table (was previously missing)

* Oops

* Oops (v2 double set of quotes)

* Add reference intersection statistics to report (#129)

* count intersection hit/miss

* add count to report

* Ancestry version of the report needs to indicate variants that were excluded because they do not match the reference

ToDo: replace these values with the % of target variants that are present in the reference panel

* fix intersect stats

* hide ancestry headers automatically

* fraposa dev -> fraposa v0.1.0

* fix container tags

* melt non-ancestry aggregated scores

* update conda requirements

* upgrade quarto to fix error

* fix dockerfile

* empty commit to trigger action with fresh cache

---------

Co-authored-by: Benjamin Wingfield <[email protected]>
  • Loading branch information
smlmbrt and nebfield authored Aug 1, 2023
1 parent 7743cdd commit 2d1a163
Show file tree
Hide file tree
Showing 11 changed files with 174 additions and 82 deletions.
180 changes: 125 additions & 55 deletions assets/report/report.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,21 @@ params:
score_path: ""
sampleset: ""
run_ancestry: false
reference_panel_name: ""
css: logo.css
---

::: {.callout-note}
See the online [documentation](https://pgsc-calc.readthedocs.io/en/latest/output.html#report) for additional
explanation of the terms and data presented in this report.
:::

```{r setup, echo=FALSE, warning=FALSE, message=FALSE}
# Attach packages used throughout the report.
# NOTE(review): later packages mask earlier ones (e.g. dplyr masks
# stats::filter), so keep the load order as-is.
library(jsonlite)   # parse scoring file / log JSON
library(dplyr)      # data manipulation pipelines
library(tidyr)      # reshaping long/wide score tables
library(ggplot2)    # PCA and density plots
library(data.table) # fast reading/melting of large aggregated score files
```

```{r setup_logs, echo=FALSE}
Expand Down Expand Up @@ -50,14 +58,10 @@ cat command.txt | fold -w 80 -s | awk -F ' ' 'NR==1 { print "$", $0} NR>1 { prin

# Scoring file metadata

::: {.callout-note}
Additional [documentation](https://pgsc-calc.readthedocs.io/en/latest/output.html#report) is available that explains some of the terms used in this report in more detail
:::

## Scoring file summary

```{r load_scorefiles}
json_scorefiles <- read_json(params$log_scorefiles, simplifyVector=TRUE)
json_scorefiles <- read_json(params$log_scorefiles, simplifyVector=TRUE)
link_traits <- function(trait_efo, mapped) {
if (length(trait_efo) == 0) {
Expand Down Expand Up @@ -159,6 +163,23 @@ DT::datatable(
cat params.txt
```

```{asis, echo = params$run_ancestry}
## Reference matching summary
```

```{r, echo = FALSE, message = FALSE, eval=params$run_ancestry}
# Counts written by intersect_variants.sh, one integer per line:
#   1: n variants in target, 2: n variants in reference, 3: n matched
intersect_stats <- scan("intersect_counts.txt", sep = "\n", what = integer())
n_target <- intersect_stats[1]
n_ref <- intersect_stats[2]
n_match <- intersect_stats[3]

# Fix: the '% matched' column previously showed a raw fraction
# (n_match / n_ref); scale to a percentage and round, consistent with the
# other match tables in this report.
# ToDo: report the % of *target* variants present in the reference instead
tribble(
  ~'reference', ~'n target', ~'N (matched)', ~'N variants in panel', ~'% matched',
  params$reference_panel_name, n_target, n_match, n_ref,
  round(n_match / n_ref * 100, 2)
) %>%
  DT::datatable()
```

## Summary

```{r setup_matches}
Expand Down Expand Up @@ -207,34 +228,64 @@ compat %>%
:::{.column-body-outset}

```{r match_table_detailed, echo = FALSE, warning=FALSE}
# Summarise variant match outcomes per sampleset / scoring file.
# When ancestry mode is enabled the match log additionally contains match_IDs
# (was the variant found in the reference panel?), so that column joins both
# the grouping and the rendered table. The two code paths previously
# duplicated ~25 lines each; build the column sets conditionally instead.
ancestry_mode <- isTRUE(params$run_ancestry)

# Columns tallied by count(); order fixes the column order of the table
count_cols <- c(
  "match_status",
  if (ancestry_mode) "match_IDs",
  "ambiguous", "is_multiallelic", "match_flipped",
  "duplicate_best_match", "duplicate_ID"
)

# Human-readable display names ("Label" = "column") passed to DT
table_colnames <- c(
  "Sampleset" = "sampleset",
  "Scoring file" = "accession",
  "Match type" = "match_status",
  if (ancestry_mode) c("Variant in reference panel" = "match_IDs"),
  "Multiple potential matches" = "duplicate_best_match",
  "Duplicated matched variants" = "duplicate_ID",
  "Ambiguous" = "is_ambiguous",
  "Multiallelic" = "is_multiallelic",
  "Matches strand flip" = "match_flipped",
  "%" = "percent"
)

log_df %>%
  group_by(sampleset, accession) %>%
  count(across(all_of(count_cols)), wt = count) %>%
  rename(is_ambiguous = ambiguous) %>%
  mutate(percent = round(n / sum(n) * 100, 2),
         match_status = forcats::fct_relevel(match_status, "matched", "excluded", "unmatched")) %>%
  arrange(accession, match_status) %>%
  mutate(accession = stringr::str_replace_all(accession, "_", " ")) %>%
  DT::datatable(rownames = FALSE,
                extensions = 'Buttons',
                options = list(dom = 'Bfrtip',
                               buttons = c('csv')),
                colnames = table_colnames)
```

:::


```{asis, echo = params$run_ancestry}
# Genetic Ancestry
```

```{r colour_palette, echo = FALSE, eval=params$run_ancestry}
# source: https://github.com/PGScatalog/PGS_Catalog/blob/master/catalog/static/catalog/pgs.scss#L2493-L2520
Expand All @@ -244,27 +295,38 @@ thousand_genomes_colours <- c("#FFD900", "#E41A1C", "#B15928", "#4DAF4A",
"#FF7F00", "#BBB", "#999")
names(thousand_genomes_colours) <- c("AFR", "AMR", "ASN", "EAS", "EUR", "GME",
"SAS", "MAE", "MAO", "NR", "OTH")
thousand_genomes_palette <- scale_colour_manual(name = "Most Similar Population", values = thousand_genomes_colours)
thousand_genomes_palette <- scale_colour_manual(name = "Populations", values = thousand_genomes_colours)
```

```{r, echo = FALSE, message = FALSE, eval=params$run_ancestry}
popsim <- readr::read_tsv(gsub("_pgs.", "_popsimilarity.", params$score_path))
head(popsim)
new_label_target = paste({params$sampleset}, '(Most Similar Population)')
new_label_ref = paste0('reference (', {params$reference_panel_name}, ' populations)')
popsim$slabel <- new_label_ref
popsim[popsim$sampleset == {params$sampleset}, 'slabel'] <- new_label_target
map_shapes <- c(1, 16)
map_shapes <- setNames(map_shapes, c(new_label_target, new_label_ref))
# Placeholder figure: needs better legend labelling
for(pc in seq.int(1,5,2)){
pcY = paste('PC', pc, sep='')
pcX = paste('PC', pc+1, sep='')
if (pcX %in% colnames(popsim)){
p_pca <- ggplot(popsim[popsim$REFERENCE == TRUE,], aes(x=!!sym(pcX), y=!!sym(pcY))) + geom_point(aes(colour=SuperPop), alpha=0.25)
p_pca <- p_pca + geom_point(data=popsim[popsim$REFERENCE != TRUE,], aes(color=MostSimilarPop), shape=1)
p_pca <- p_pca + theme_bw() + thousand_genomes_palette
p_pca <- ggplot(popsim[popsim$REFERENCE == TRUE,], aes(x=!!sym(pcX), y=!!sym(pcY))) + geom_point(aes(colour=SuperPop, shape=slabel), alpha=0.25)
p_pca <- p_pca + geom_point(data=popsim[popsim$REFERENCE != TRUE,], aes(color=MostSimilarPop, shape=slabel))
p_pca <- p_pca + theme_bw() + thousand_genomes_palette + scale_shape_manual(values=map_shapes, name='sampleset')
print(p_pca)
}
}
```

```{asis, echo = params$run_ancestry}
## Population similarity summary
```

```{r, echo = FALSE, message = FALSE, eval=params$run_ancestry}
popsim %>%
Expand Down Expand Up @@ -292,9 +354,22 @@ pop_summary %>%
# Scores

```{r, echo = FALSE, message = FALSE, eval=!params$run_ancestry}
scores <- readr::read_tsv(params$score_path)
# problem: aggregated_scores.txt.gz has a different structure to the ancestry outputs
# solution: pivot the table in the report, but fix in pgscatalog_utils in a future release
# problem: big scores can take a lot of memory
# use data.table as a temporary solution to pivoting datasets
scores <- data.table::fread(params$score_path)
n_scores <- sum(grepl("*_SUM$", colnames(scores)))
n_samples <- nrow(scores)
id_cols <- c("sampleset", "IID")
melted_scores <- data.table::melt(scores, id.vars=id_cols, measure.vars=patterns(SUM="*_SUM$", DENOM="DENOM", AVG="*_AVG$"), variable.name = "PGS")
# annoying fix for scores getting converted to a factor level integer
# fixed in data.table 1.14.9 but not released yet
score_names <- sub("_.*", "", colnames(scores)[grepl("_SUM$", colnames(scores))])
setattr(melted_scores$PGS, "levels", score_names)
data.table::fwrite(melted_scores, params$score_path, sep="\t", compress="gzip")
```

```{r, echo = FALSE, message = FALSE, eval=params$run_ancestry}
Expand Down Expand Up @@ -338,41 +413,36 @@ In the future mean-imputation will be supported in small samplesets using ancest
#### Score extract

::: {.callout-note}
Below is a summary of the aggregated scores, which might be useful for debugging.
Below is a summary of the aggregated scores, which might be useful for debugging. See the [plink2 documentation](https://www.cog-genomics.org/plink/2.0/formats#sscore) for an explanation of the column names.
:::

```{r, echo = FALSE}
# Preview the per-sample aggregated score sums (debugging aid)
sum_columns <- select(scores, sampleset, IID, ends_with("SUM"))
tibble::as_tibble(sum_columns)
```

::: {.callout-note}
See here for an explanation of [plink2](https://www.cog-genomics.org/plink/2.0/formats#sscore) column names
:::

#### Density plot
#### Density plot(s)

::: {.callout-note}
The summary density plots show up to six scoring files
:::

```{r density_ancestry, echo=FALSE, message=FALSE, warning=FALSE, eval=params$run_ancestry}
scores %>%
ungroup() %>%
select(IID, sampleset, PGS, SUM) %>%
group_by(sampleset, PGS) -> long_scores
group_keys(long_scores) %>%
slice(1:6) -> score_keys
long_scores %>%
inner_join(score_keys) %>%
ggplot(aes(x = SUM, fill = sampleset)) +
geom_density(alpha = 0.3) +
facet_wrap(~PGS, ncol = 2, scales = "free") +
theme_bw() +
labs(x = "PGS (SUM)", y = "Density", title = "PGS Distribution(s)")
# Select which PGS to plot
uscores <- unique(scores$PGS)
uscores_plot <- uscores[1:min(length(uscores), 6)] # plot max 6 PGS
# Plot multiple adjustment methods at once per PGS
for(current_pgs in uscores_plot){
long_scores <- scores %>% select(!percentile_MostSimilarPop) %>% filter(PGS == current_pgs) %>% gather(Method, score, -sampleset, -IID, -PGS)
long_scores %>%
ggplot(aes(x = score, fill = sampleset)) +
geom_density(alpha = 0.3) +
facet_wrap(~Method, scales = "free", nrow=1) +
theme_bw() +
labs(x = 'PGS', y = "Density", title = paste(current_pgs, '(adjusted distributions)')) -> p_dist
print(p_dist)
}
```

```{r, echo = FALSE, message=FALSE, warning=FALSE, eval=!params$run_ancestry}
Expand Down
4 changes: 4 additions & 0 deletions bin/intersect_variants.sh
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ fi
join ref_variants.txt target_variants.txt |\
awk '{if (NR==1) print $0, "SAME_REF"; else print $0, ($3 == $8)}' > matched_variants.txt

# Write intersection stats consumed by the report, one count per line:
#   1: target variants, 2: reference variants, 3: matched variants
# Each file carries a header line, so subtract one from the line count.
echo "$(( $(wc -l < target_variants.txt) - 1 ))" > intersect_counts.txt
echo "$(( $(wc -l < ref_variants.txt) - 1 ))" >> intersect_counts.txt
echo "$(( $(wc -l < matched_variants.txt) - 1 ))" >> intersect_counts.txt

# Current output columns are:
#1 : CHR:POS:A0:A1
#2 : ID_REF
Expand Down
11 changes: 5 additions & 6 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,11 @@ process {
}

withLabel: fraposa {
// TODO: publish fraposa somewhere
// ext.conda = "$projectDir/environments/report/environment.yml"
ext.singularity = 'oras://dockerhub.ebi.ac.uk/gdp-public/fraposa_pgsc/singularity/pgsc_fraposa'
ext.singularity_version = ':dev'
ext.docker = 'dockerhub.ebi.ac.uk/gdp-public/fraposa_pgsc/pgsc_fraposa'
ext.docker_version = ':dev'
ext.conda = "$projectDir/environments/fraposa/environment.yml"
ext.singularity = 'oras://ghcr.io/pgscatalog/fraposa_pgsc'
ext.singularity_version = ':v0.1.0-singularity'
ext.docker = 'ghcr.io/pgscatalog/fraposa_pgsc'
ext.docker_version = ':v0.1.0'
}

// output configuration
Expand Down
6 changes: 6 additions & 0 deletions environments/fraposa/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Conda environment for FRAPOSA (PCA-based ancestry projection).
# fraposa-pgsc is distributed on PyPI, so it is installed via pip
# inside the conda environment; version pinned to match the container tag.
name: fraposa
dependencies:
- python=3.10
- pip
- pip:
- fraposa-pgsc==0.1.0
28 changes: 11 additions & 17 deletions environments/report/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,22 +1,16 @@
FROM r-base:4.3.0 AS quarto_arm64
ARG QUARTO_VERSION=1.3.433

ONBUILD RUN apt-get update && \
apt-get install -y gdebi wget
FROM r-base:4.3.0

ONBUILD RUN wget https://github.com/quarto-dev/quarto-cli/releases/download/v1.3.302/quarto-1.3.302-linux-arm64.deb && \
gdebi --non-interactive quarto-1.3.302-linux-arm64.deb && \
rm quarto-1.3.302-linux-arm64.deb
ARG QUARTO_VERSION
ARG TARGETARCH

FROM r-base:4.3.0 AS quarto_amd64

ONBUILD RUN apt-get update && \
apt-get install -y gdebi wget

ONBUILD RUN wget https://github.com/quarto-dev/quarto-cli/releases/download/v1.3.302/quarto-1.3.302-linux-amd64.deb && \
gdebi --non-interactive quarto-1.3.302-linux-amd64.deb && \
rm quarto-1.3.302-linux-amd64.deb
RUN wget https://github.com/quarto-dev/quarto-cli/releases/download/v${QUARTO_VERSION}/quarto-${QUARTO_VERSION}-linux-${TARGETARCH}.deb && \
dpkg -i quarto-${QUARTO_VERSION}-linux-${TARGETARCH}.deb && \
rm quarto-${QUARTO_VERSION}-linux-${TARGETARCH}.deb

FROM quarto_${TARGETARCH}

RUN R -e 'install.packages(c("DT", "purrr", "jsonlite", "dplyr", "ggplot2", "tidyr", "forcats", "readr"), clean=TRUE)'
RUN R -e 'install.packages(c("DT", "data.table", "purrr", "jsonlite", "dplyr", "ggplot2", "tidyr", "forcats", "readr", "R.utils"), clean=TRUE)'

RUN apt-get update \
&& apt-get install -y procps \
&& rm -rf /var/lib/apt/lists/*
1 change: 1 addition & 0 deletions environments/report/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ dependencies:
- r-readr
- r-purrr
- r-jsonlite
- r-datatable
3 changes: 2 additions & 1 deletion modules/local/ancestry/intersect_variants.nf
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ process INTERSECT_VARIANTS {

output:
tuple val(id), path("${meta.id}_${meta.chrom}_matched.txt.gz"), emit: intersection
path "intersect_counts.txt", emit: intersect_count
path "versions.yml", emit: versions

script:
Expand All @@ -36,7 +37,7 @@ process INTERSECT_VARIANTS {
exit 1
else
mv matched_variants.txt ${meta.id}_${meta.chrom}_matched.txt
gzip *.txt
gzip *_variants.txt *_matched.txt
fi
cat <<-END_VERSIONS > versions.yml
Expand Down
5 changes: 4 additions & 1 deletion modules/local/score_report.nf
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ process SCORE_REPORT {

input:
tuple val(meta), path(scorefile), path(score_log), path(match_summary), path(ancestry)
path intersect_count
val reference_panel_name

output:
// includeInputs to correctly use $meta.id in publishDir path
Expand Down Expand Up @@ -42,7 +44,8 @@ process SCORE_REPORT {
quarto render report.qmd -M "self-contained:true" \
-P score_path:$scorefile \
-P sampleset:$meta.id \
-P run_ancestry:$run_ancestry
-P run_ancestry:$run_ancestry \
-P reference_panel_name:$reference_panel_name
cat <<-END_VERSIONS > versions.yml
${task.process.tokenize(':').last()}:
Expand Down
1 change: 1 addition & 0 deletions subworkflows/local/ancestry/ancestry_project.nf
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ workflow ANCESTRY_PROJECT {

emit:
intersection = INTERSECT_VARIANTS.out.intersection
intersect_count = INTERSECT_VARIANTS.out.intersect_count
projections = ch_projections.combine(ch_ref_projections)
ref_geno = ch_ref.geno
ref_pheno = ch_ref.pheno
Expand Down
Loading

0 comments on commit 2d1a163

Please sign in to comment.