From b939224d82c6a952b3364ea60ab785ff6869c303 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Canouil?= Date: Thu, 23 Jul 2020 14:27:04 +0200 Subject: [PATCH] data.table oriented workflow --- DESCRIPTION | 6 +- NAMESPACE | 1 + NEWS.md | 8 +- R/check_bin.R | 15 +- R/compute_pca.R | 324 ++++++++++++++++++++++++-------------- R/estimate_ethnicity.R | 5 +- R/pca_report.R | 22 ++- man/compute_pca.Rd | 10 +- man/estimate_ethnicity.Rd | 3 + 9 files changed, 254 insertions(+), 140 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3f1e936..03bbcb1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -24,8 +24,12 @@ Depends: Imports: data.table (>= 1.12.8), flashpcaR (>= 2.1), + fs (>= 1.4.1), + ggforce (>= 0.3.2), ggplot2 (>= 3.3.2), - gt (>= 0.2.1) + ggtext (>= 0.1.0), + gt (>= 0.2.1), + patchwork (>= 1.0.1) Suggests: roxygen2 (>= 7.1.0) Remotes: diff --git a/NAMESPACE b/NAMESPACE index beda532..709f9a5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,3 +6,4 @@ export(pca_report) import(data.table) import(ggplot2) import(gt) +import(patchwork) diff --git a/NEWS.md b/NEWS.md index 066ebc5..abc2327 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,9 +1,13 @@ # rain (development version) -## Breaking CHanges +## Breaking Changes * In `R/pca_report.R`, - + Code refactoring. + + Code refactoring using `data.table`. +* In `R/compute_pca.R`, + + Code refactoring using `data.table`. + + Tweak ethnicity figure. + + Add an ethnicity figure focussing on the studied cohort. # rain 0.4.1 diff --git a/R/check_bin.R b/R/check_bin.R index 76483d5..5824ba8 100644 --- a/R/check_bin.R +++ b/R/check_bin.R @@ -6,18 +6,17 @@ check_bin <- function(bin_path) { if (!all(sapply(bin_path, file.exists))) { message_prefix <- "[rain] " + text <- paste0( + names(bin_path[!sapply(bin_path, file.exists)]), + ' ("', bin_path[!sapply(bin_path, file.exists)], '")' + ) stop( message_prefix, paste0( "No binary found for the following tools: ", - glue::glue_collapse( - x = paste0( - names(bin_path[!sapply(bin_path, file.exists)]), - ' ("', bin_path[!sapply(bin_path, file.exists)], '")' - ), - sep = ", ", - last = " and " - ), + paste(text[-length(text)], collapse = ", "), + " and ", + text[length(text)], "!" ) ) diff --git a/R/compute_pca.R b/R/compute_pca.R index bfafe39..912503f 100644 --- a/R/compute_pca.R +++ b/R/compute_pca.R @@ -5,170 +5,254 @@ #' #' @return A `data.frame`. #' +#' @import data.table +#' @import ggplot2 +#' @import gt +#' @import patchwork +#' #' @export -compute_pca <- function(cohort_name, input_plink, output_directory, ref1kg_population) { +compute_pca <- function(cohort_name, input_plink, output_directory, ref1kg_population, n_comp = 10) { message_prefix <- "[rain] " - ###################### - ### Performing PCA ### - ###################### - message(message_prefix, "Performing PCA ...") + message(message_prefix, "Performing PCA ...", appendLF = TRUE) - fid_iid <- readr::read_delim( + fid_iid <- data.table::fread( file = paste0(input_plink, ".fam"), - delim = " ", - col_types = readr::cols_only( - X1 = readr::col_character(), - X2 = readr::col_character() - ), - col_names = FALSE + colClasses = c("V1" = "character", "V2" = "character"), + header = FALSE ) - pca_ndim <- 10 + pca_res <- flashpcaR::flashpca(input_plink, ndim = n_comp) + + pca_contrib <- sprintf( + fmt = "PC%02d (%s %%)", + 1:n_comp, + format(pca_res$values / sum(pca_res$values) * 100, digits = 2, nsmall = 2, trim = TRUE) + ) + names(pca_contrib) <- sprintf("PC%02d", 1:n_comp) - pca_res <- flashpcaR::flashpca(input_plink, ndim = pca_ndim) - pca_dfxy <- data.table::as.data.table(pca_res[["projection"]]) + pca_dfxy <- data.table::as.data.table(pca_res[["projection"]], keep.rownames = "iid") data.table::setnames( x = pca_dfxy, - old = setdiff(names(pca_dfxy), id_var), - new = sprintf("PC%02d", as.numeric(gsub("V", "", setdiff(names(pca_dfxy), id_var)))) + old = setdiff(names(pca_dfxy), "iid"), + new = sprintf("PC%02d", as.numeric(gsub("V", "", setdiff(names(pca_dfxy), "iid")))) ) if (all.equal(fid_iid[[1]], fid_iid[[2]])) { - pca_dfxy[["sample"]] <- fid_iid[[2]] + pca_dfxy[, "iid" := fid_iid[[2]]] } else { - pca_dfxy[["sample"]] <- paste(fid_iid[[1]], fid_iid[[2]], sep = "/") + pca_dfxy[, "iid" := paste(fid_iid[[1]], fid_iid[[2]], sep = "/")] } - pca_dfxy <- pca_dfxy %>% - dplyr::left_join( - y = suppressWarnings( - readr::read_tsv( - file = ref1kg_population, - col_types = readr::cols_only( - sample = readr::col_character(), - pop = readr::col_character(), - super_pop = readr::col_character() - ) - ) - ), - by = "sample" - ) %>% - dplyr::mutate( - cohort = ifelse(is.na(.data[["pop"]]), cohort_name, "1,000 Genomes"), - pop = ifelse(is.na(.data[["pop"]]), cohort_name, .data[["pop"]]), - super_pop = ifelse(is.na(.data[["super_pop"]]), cohort_name, .data[["super_pop"]]) - ) %>% - dplyr::mutate_at( - .vars = dplyr::vars(.data[["cohort"]], .data[["pop"]], .data[["super_pop"]]), - .funs = ~ stats::relevel(factor(.x), ref = cohort_name) - ) %>% - dplyr::select("sample", dplyr::everything()) - - pop_centre <- purrr::map_df(c("super_pop", "pop") , function(ipop) { - pca_dfxy %>% - dplyr::filter(.data[["cohort"]] != !!cohort_name) %>% - dplyr::select(-"cohort") %>% - dplyr::group_by(.data[[ipop]]) %>% - dplyr::summarise( - PC01_centre = mean(.data[["PC01"]]), - PC02_centre = mean(.data[["PC02"]]) - ) %>% - dplyr::mutate(pop_type = ipop) %>% - dplyr::rename("pop_closest" = !!ipop) %>% - dplyr::mutate_if(.predicate = is.factor, .funs = as.character) - }) + ref_pop_table <- data.table::fread( + file = ref1kg_population, + colClasses = "character", + header = FALSE, + col.names = c("iid", "pop", "super_pop", "sex") + ) - pca_gg_pred_best <- pca_dfxy %>% - dplyr::select(-c(.data[["pop"]], .data[["super_pop"]])) %>% - dplyr::filter(.data[["cohort"]] == !!cohort_name) %>% - dplyr::mutate(pop_centre = list(pop_centre)) %>% - tidyr::unnest("pop_centre") %>% - dplyr::group_by(.data[["sample"]], .data[["pop_type"]]) %>% - dplyr::mutate( - pop_dist = purrr::pmap_dbl( - .l = list( - .x = .data[["PC01"]], .y = .data[["PC02"]], - .x_centre = .data[["PC01_centre"]], .y_centre = .data[["PC02_centre"]] - ), - .f = function(.x, .y, .x_centre, .y_centre) { - sqrt((.x - .x_centre)^2 + (.y - .y_centre)^2) - } + pca_dfxy <- merge( + x = pca_dfxy, + y = ref_pop_table[, "cohort" := "1,000 Genomes"], + by = "iid", + all = TRUE + ) + + cohort <- NULL + pca_dfxy[!cohort %in% "1,000 Genomes", (c("pop", "super_pop", "cohort")) := list(cohort_name, cohort_name, cohort_name)] + pca_dfxy[, + (c("pop", "super_pop", "cohort")) := lapply( + X = .SD, + FUN = function(x) factor(x, levels = c(cohort_name, sort(setdiff(x, cohort_name)))) + ), + .SDcols = c("pop", "super_pop", "cohort") + ] + + pop_centre <- pca_dfxy[ + cohort %in% "1,000 Genomes", + lapply(.SD, mean), + .SDcols = sprintf("PC%02d", 1:2), + by = "pop" + ] + + PC01 <- PC02 <- dist <- which_closest <- pop <- super_pop <- NULL + pca_dfxy[, + "pop_closest" := (function(.x, .y) { + as.character( + pop_centre[, + list("dist" = sqrt((.x - PC01)^2 + (.y - PC02)^2)), + by = "pop" + ][, + "which_closest" := dist == min(dist) + ][ + (which_closest) + ][["pop"]] ) - ) %>% - dplyr::slice(which.min(.data[["pop_dist"]])) %>% - dplyr::ungroup() %>% - dplyr::select(-dplyr::ends_with("_centre"), -.data[["pop_dist"]]) %>% - tidyr::pivot_wider(names_from = "pop_type", values_from = "pop_closest") - - p <- purrr::map(.x = c("pop" = "pop", "super_pop" = "super_pop"), .f = function(ipop) { - ggplot2::ggplot( - data = pca_dfxy, - mapping = ggplot2::aes(x = .data[["PC01"]], y = .data[["PC02"]], colour = .data[[ipop]]) - ) + - ggplot2::theme_light(base_size = 12) + - ggplot2::geom_hline(yintercept = 0, linetype = 1, size = 0.5, na.rm = TRUE) + - ggplot2::geom_vline(xintercept = 0, linetype = 1, size = 0.5, na.rm = TRUE) + - ggforce::geom_mark_ellipse(mapping = ggplot2::aes(fill = .data[[ipop]]), con.cap = 0, alpha = 0.1) + - ggplot2::geom_point(mapping = ggplot2::aes(shape = .data[[ipop]]), na.rm = TRUE) + - ggplot2::scale_colour_viridis_d(na.translate = FALSE, drop = FALSE, end = 0.9) + - ggplot2::scale_fill_viridis_d(na.translate = FALSE, drop = FALSE, end = 0.9) + - ggplot2::scale_shape_manual(values = c(3, rep(1, length(unique(pca_dfxy[[ipop]]))))) + - ggplot2::labs( - shape = c("super_pop" = "Super Population", "pop" = "Population")[ipop], - colour = c("super_pop" = "Super Population", "pop" = "Population")[ipop], - fill = c("super_pop" = "Super Population", "pop" = "Population")[ipop] + })(PC01, PC02), + by = "iid" + ] + + pca_dfxy <- merge( + x = pca_dfxy, + y = unique(ref_pop_table[, list("pop_closest" = pop, "super_pop_closest" = super_pop)]), + by = "pop_closest", + all.x = TRUE + ) + + p <- lapply( + X = c("pop" = "pop", "super_pop" = "super_pop"), + FUN = function(ipop) { + ggplot2::ggplot( + data = pca_dfxy, + mapping = ggplot2::aes(x = .data[["PC01"]], y = .data[["PC02"]], colour = .data[[ipop]]) ) + - ggplot2::facet_grid(cols = ggplot2::vars(.data[["cohort"]])) + - ggplot2::guides( - colour = ggplot2::guide_legend( - ncol = 2, - label.theme = ggplot2::element_text(size = 8), - keyheight = ggplot2::unit(9, "point") + ggplot2::theme_light(base_size = 11) + + ggplot2::geom_hline(yintercept = 0, linetype = 2, size = 0.5, na.rm = TRUE) + + ggplot2::geom_vline(xintercept = 0, linetype = 2, size = 0.5, na.rm = TRUE) + + ggforce::geom_mark_ellipse(mapping = ggplot2::aes(fill = .data[[ipop]]), con.cap = 0, alpha = 0.1) + + ggplot2::geom_point(mapping = ggplot2::aes(shape = .data[[ipop]]), na.rm = TRUE) + + ggplot2::scale_colour_viridis_d(na.translate = FALSE, drop = FALSE, end = 0.9) + + ggplot2::scale_fill_viridis_d(na.translate = FALSE, drop = FALSE, end = 0.9) + + ggplot2::scale_shape_manual(values = c(3, rep(1, length(unique(pca_dfxy[[ipop]]))))) + + ggplot2::labs( + x = pca_contrib["PC01"], + y = pca_contrib["PC02"], + shape = c("super_pop" = "Super Population", "pop" = "Population")[ipop], + colour = c("super_pop" = "Super Population", "pop" = "Population")[ipop], + fill = c("super_pop" = "Super Population", "pop" = "Population")[ipop] + ) + + ggplot2::facet_grid(cols = ggplot2::vars(.data[["cohort"]])) + + ggplot2::guides( + colour = ggplot2::guide_legend( + ncol = 2, + label.theme = ggplot2::element_text(size = 8), + keyheight = ggplot2::unit(9, "point") + ) ) - ) }) - p_zoom <- purrr::map(p, ~ .x + ggplot2::coord_cartesian( - xlim = range(dplyr::filter(pca_dfxy, .data[["cohort"]] == !!cohort_name)[["PC01"]]), - ylim = range(dplyr::filter(pca_dfxy, .data[["cohort"]] == !!cohort_name)[["PC02"]]) - )) + p_zoom <- lapply( + X = p, + FUN = function(.p) { + .p + + ggplot2::coord_cartesian( + xlim = range(pca_dfxy[!cohort %in% "1,000 Genomes"][["PC01"]]), + ylim = range(pca_dfxy[!cohort %in% "1,000 Genomes"][["PC02"]]) + ) + } + ) + + p_subtitle <- paste0( + "Principal Component Analysis using ", + format(nrow(data.table::fread(paste0(input_plink, ".bim"))), big.mark = ",", digits = 0), + " SNPs
", + "With A) population level and B) super population level" + ) p_ethni <- patchwork::wrap_plots( - patchwork::wrap_plots(p[["super_pop"]], p_zoom[["super_pop"]]) + - patchwork::plot_layout(tag_level = "new", guides = "collect"), patchwork::wrap_plots(p[["pop"]], p_zoom[["pop"]]) + + patchwork::plot_layout(tag_level = "new", guides = "collect"), + patchwork::wrap_plots(p[["super_pop"]], p_zoom[["super_pop"]]) + patchwork::plot_layout(tag_level = "new", guides = "collect") ) + patchwork::plot_layout(nrow = 2, ncol = 1) + patchwork::plot_annotation( tag_levels = c("A", "1"), - title = "Estimated Ethnicity", - subtitle = paste( - "Based on Principal Component Analysis using", - scales::comma(nrow(data.table::fwrite(paste0(input_plink, ".bim")))), - "SNPs." + title = "Ethnicty Inference Based On 1,000 Genomes Project Data", + subtitle = gsub("
W", " w", p_subtitle), + theme = ggplot2::theme(plot.title.position = "plot", plot.subtitle = ggtext::element_markdown()) + ) + + p_cohort_base <- ggplot2::ggplot( + data = pca_dfxy[!cohort %in% "1,000 Genomes"], + mapping = ggplot2::aes(x = .data[["PC01"]], y = .data[["PC02"]]) + ) + + ggplot2::theme_light(base_size = 11) + + ggplot2::geom_hline(yintercept = 0, linetype = 2, size = 0.5, na.rm = TRUE) + + ggplot2::geom_vline(xintercept = 0, linetype = 2, size = 0.5, na.rm = TRUE) + + ggplot2::scale_colour_viridis_d(na.translate = FALSE, drop = FALSE, begin = 0.10, end = 0.90) + + ggplot2::scale_fill_viridis_d(na.translate = FALSE, drop = FALSE, begin = 0.10, end = 0.90) + + ggplot2::guides( + colour = ggplot2::guide_legend( + ncol = ifelse(nrow(unique(pca_dfxy[!cohort %in% "1,000 Genomes", "pop_closest"])) > 5, 2, 1) ) + ) + + ggplot2::theme(plot.title.position = "plot") + + ggplot2::coord_cartesian( + xlim = range(pca_dfxy[!cohort %in% "1,000 Genomes"][["PC01"]]), + ylim = range(pca_dfxy[!cohort %in% "1,000 Genomes"][["PC02"]]) ) + p_cohort <- patchwork::wrap_plots( + p_cohort_base + + ggplot2::geom_point(mapping = ggplot2::aes(colour = .data[["pop_closest"]]), shape = 1, na.rm = TRUE) + + ggforce::geom_mark_ellipse( + mapping = ggplot2::aes( + colour = .data[["pop_closest"]], + group = .data[["pop_closest"]] + ), + fill = "transparent", + alpha = 0.10 + ) + + ggplot2::labs( + x = pca_contrib["PC01"], + y = pca_contrib["PC02"], + shape = "Population", + colour = "Population", + fill = "Population" + ), + p_cohort_base + + ggplot2::geom_point(mapping = ggplot2::aes(colour = .data[["super_pop_closest"]]), shape = 1, na.rm = TRUE) + + ggforce::geom_mark_ellipse( + mapping = ggplot2::aes( + colour = .data[["super_pop_closest"]], + group = .data[["super_pop_closest"]] + ), + fill = "transparent", + alpha = 0.10 + ) + + ggplot2::labs( + x = pca_contrib["PC01"], + y = pca_contrib["PC02"], + shape = "Super Population", + colour = "Super Population", + fill = "Super Population" + ) + ) + + patchwork::plot_layout(nrow = 2, ncol = 1) + + patchwork::plot_annotation( + tag_levels = "A", + title = "Ethnicty Inference Based On 1,000 Genomes Project Data", + subtitle = p_subtitle, + theme = ggplot2::theme(plot.title.position = "plot", plot.subtitle = ggtext::element_markdown()) + ) - ################# - ### Exporting ### - ################# message(message_prefix, "Exporting ...") ggplot2::ggsave( - filename = file.path(output_directory, paste0(cohort_name, "_ethnicity.pdf")), + filename = file.path(output_directory, paste0(cohort_name, "_ethnicity_1kg.pdf")), plot = p_ethni, width = 29.7 - 5, height = 21 - 5, units = "cm", dpi = 120 ) + ggplot2::ggsave( + filename = file.path(output_directory, paste0(cohort_name, "_ethnicity.pdf")), + plot = p_cohort, + width = 16, + height = 16, + units = "cm", + dpi = 120 + ) invisible( - readr::write_csv( - x = pca_gg_pred_best, - path = file.path(output_directory, paste0(cohort_name, "_ethnicity.csv")) + data.table::fwrite( + x = pca_dfxy[ + !cohort %in% "1,000 Genomes", + .SD, + .SDcols = c("iid", sprintf("PC%02d", 1:n_comp), "cohort", "pop_closest", "super_pop_closest") + ], + file = file.path(output_directory, paste0(cohort_name, "_ethnicity.csv")) ) ) } diff --git a/R/estimate_ethnicity.R b/R/estimate_ethnicity.R index bbf184a..3928402 100755 --- a/R/estimate_ethnicity.R +++ b/R/estimate_ethnicity.R @@ -16,6 +16,7 @@ #' + 'haploid'/'h': Treat half-calls as haploid/homozygous (the PLINK 1 file format does not distinguish between the two). This maximizes similarity between the VCF and BCF2 parsers. #' + 'missing'/'m': Treat half-calls as missing (default). #' + 'reference'/'r': Treat the missing part as reference. +#' @param n_comp A `numeric`. The number of principal components to be computed. #' @param n_cores An `integer`. The number of CPUs to use to estimate the ethnicity. #' @param bin_path A `list(character)`. A list giving the binary path of #' `vcftools`, `bcftools`, `bgzip`, `tabix` and `plink`. @@ -35,6 +36,7 @@ estimate_ethnicity <- function( quality_threshold = 0.9, recode = "all", vcf_half_call = "missing", + n_comp = 10, n_cores = 6, bin_path = list( vcftools = "/usr/bin/vcftools", @@ -129,7 +131,8 @@ estimate_ethnicity <- function( cohort_name = cohort_name, input_plink = file.path(output_directory, "all"), output_directory = output_directory, - ref1kg_population = ref1kg_population + ref1kg_population = ref1kg_population, + n_comp = n_comp ) } diff --git a/R/pca_report.R b/R/pca_report.R index 98173d4..1413fdf 100755 --- a/R/pca_report.R +++ b/R/pca_report.R @@ -17,6 +17,7 @@ #' @import data.table #' @import ggplot2 #' @import gt +#' @import patchwork #' #' @examples #' @@ -101,14 +102,17 @@ pca_report <- function( x = paste0( .data[["x"]], "
(", - scales::percent_format(accuracy = 0.01, suffix = " %")(.data[["y"]]), - ")" + format(.data[["y"]] * 100, digits = 2, nsmall = 2), + " %)" ), y = .data[["y"]] ) ) + - ggplot2::geom_col(width = 1, colour = "white", fill = scales::viridis_pal(begin = 0.5, end = 0.5)(1), na.rm = TRUE) + - ggplot2::scale_y_continuous(labels = scales::percent_format(accuracy = 0.1, suffix = " %"), expand = ggplot2::expansion(mult = c(0, 0.05))) + + ggplot2::geom_col(width = 1, colour = "white", fill = "#21908CFF", na.rm = TRUE) + + ggplot2::scale_y_continuous( + labels = function(x) paste(format(x * 100, digits = 2, nsmall = 2), "%"), + expand = ggplot2::expansion(mult = c(0, 0.05)) + ) + ggplot2::labs( x = "Principal Components", y = "Contribution" @@ -146,7 +150,7 @@ pca_report <- function( label = gsub( pattern = "(.*)e([-+]*)0*(.*)", replacement = "\\1
×
10\\2\\3", - x = scales::scientific(.data[["Pr(>F)"]], digits = 2) + x = format(.data[["Pr(>F)"]], digits = 2, nsmall = 2, scientific = TRUE) ) ), colour = "white", @@ -162,8 +166,12 @@ pca_report <- function( labels = function(x) { paste0( x, "
(", - scales::percent_format(accuracy = 0.01, suffix = " %")((pca_res$values / sum(pca_res$values))[as.numeric(gsub("PC", "", x))]), - ")" + format( + x = (pca_res$values / sum(pca_res$values))[as.numeric(gsub("PC", "", x))] * 100, + digits = 2, + nsmall = 2 + ), + " %)" ) } ) + diff --git a/man/compute_pca.Rd b/man/compute_pca.Rd index d90f627..d551dbd 100644 --- a/man/compute_pca.Rd +++ b/man/compute_pca.Rd @@ -4,7 +4,13 @@ \alias{compute_pca} \title{Compute the genomic components (and some figures) for ethnicity based on VCF files.} \usage{ -compute_pca(cohort_name, input_plink, output_directory, ref1kg_population) +compute_pca( + cohort_name, + input_plink, + output_directory, + ref1kg_population, + n_comp = 10 +) } \arguments{ \item{cohort_name}{A \code{character}. A name to describe the studied population compared to 1,000 Genomes.} @@ -14,6 +20,8 @@ compute_pca(cohort_name, input_plink, output_directory, ref1kg_population) \item{output_directory}{A \code{character}. The path where the data and figures is written.} \item{ref1kg_population}{A \code{character}. A file which describe samples and their ethnicity.} + +\item{n_comp}{A \code{numeric}. The number of principal components to be computed.} } \value{ A \code{data.frame}. diff --git a/man/estimate_ethnicity.Rd b/man/estimate_ethnicity.Rd index ec1c81b..b29f980 100644 --- a/man/estimate_ethnicity.Rd +++ b/man/estimate_ethnicity.Rd @@ -17,6 +17,7 @@ estimate_ethnicity( quality_threshold = 0.9, recode = "all", vcf_half_call = "missing", + n_comp = 10, n_cores = 6, bin_path = list(vcftools = "/usr/bin/vcftools", bcftools = "/usr/bin/bcftools", bgzip = "/usr/bin/bgzip", tabix = "/usr/bin/tabix", plink = "/usr/bin/plink1.9") @@ -51,6 +52,8 @@ estimate_ethnicity( + 'missing'/'m': Treat half-calls as missing (default). + 'reference'/'r': Treat the missing part as reference.} +\item{n_comp}{A \code{numeric}. The number of principal components to be computed.} + \item{n_cores}{An \code{integer}. The number of CPUs to use to estimate the ethnicity.} \item{bin_path}{A \code{list(character)}. A list giving the binary path of