From b939224d82c6a952b3364ea60ab785ff6869c303 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micka=C3=ABl=20Canouil?= <mickael.canouil@cnrs.fr>
Date: Thu, 23 Jul 2020 14:27:04 +0200
Subject: [PATCH] data.table oriented workflow

---
 DESCRIPTION               |   6 +-
 NAMESPACE                 |   1 +
 NEWS.md                   |   8 +-
 R/check_bin.R             |  15 +-
 R/compute_pca.R           | 324 ++++++++++++++++++++++++--------------
 R/estimate_ethnicity.R    |   5 +-
 R/pca_report.R            |  22 ++-
 man/compute_pca.Rd        |  10 +-
 man/estimate_ethnicity.Rd |   3 +
 9 files changed, 254 insertions(+), 140 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 3f1e936..03bbcb1 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -24,8 +24,12 @@ Depends:
 Imports:
     data.table (>= 1.12.8),
     flashpcaR (>= 2.1),
+    fs (>= 1.4.1),
+    ggforce (>= 0.3.2),
     ggplot2 (>= 3.3.2),
-    gt (>= 0.2.1)
+    ggtext (>= 0.1.0),
+    gt (>= 0.2.1),
+    patchwork (>= 1.0.1)
 Suggests:
     roxygen2 (>= 7.1.0)
 Remotes: 
diff --git a/NAMESPACE b/NAMESPACE
index beda532..709f9a5 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -6,3 +6,4 @@ export(pca_report)
 import(data.table)
 import(ggplot2)
 import(gt)
+import(patchwork)
diff --git a/NEWS.md b/NEWS.md
index 066ebc5..abc2327 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,9 +1,13 @@
 # rain (development version)
 
-## Breaking CHanges
+## Breaking Changes
 
 * In `R/pca_report.R`,
-    + Code refactoring.
+    + Code refactoring using `data.table`.
+* In `R/compute_pca.R`,
+    + Code refactoring using `data.table`.
+    + Tweak ethnicity figure.
+    + Add an ethnicity figure focussing on the studied cohort.
 
 # rain 0.4.1
 
diff --git a/R/check_bin.R b/R/check_bin.R
index 76483d5..5824ba8 100644
--- a/R/check_bin.R
+++ b/R/check_bin.R
@@ -6,18 +6,17 @@
 check_bin <- function(bin_path) {
   if (!all(sapply(bin_path, file.exists))) {
     message_prefix <- "[rain] "
+    text <- paste0(
+      names(bin_path[!sapply(bin_path, file.exists)]),
+      ' ("', bin_path[!sapply(bin_path, file.exists)], '")'
+    )
     stop(
       message_prefix,
       paste0(
         "No binary found for the following tools: ",
-        glue::glue_collapse(
-          x = paste0(
-            names(bin_path[!sapply(bin_path, file.exists)]),
-            ' ("', bin_path[!sapply(bin_path, file.exists)], '")'
-          ),
-          sep = ", ",
-          last = " and "
-        ),
+        paste(text[-length(text)], collapse = ", "),
+        " and ",
+        text[length(text)],
         "!"
       )
     )
diff --git a/R/compute_pca.R b/R/compute_pca.R
index bfafe39..912503f 100644
--- a/R/compute_pca.R
+++ b/R/compute_pca.R
@@ -5,170 +5,254 @@
 #'
 #' @return A `data.frame`.
 #'
+#' @import data.table
+#' @import ggplot2
+#' @import gt
+#' @import patchwork
+#'
 #' @export
-compute_pca <- function(cohort_name, input_plink, output_directory, ref1kg_population) {
+compute_pca <- function(cohort_name, input_plink, output_directory, ref1kg_population, n_comp = 10) {
   message_prefix <- "[rain] "
 
-  ######################
-  ### Performing PCA ###
-  ######################
-  message(message_prefix, "Performing PCA ...")
+  message(message_prefix, "Performing PCA ...", appendLF = TRUE)
 
-  fid_iid <- readr::read_delim(
+  fid_iid <- data.table::fread(
     file = paste0(input_plink, ".fam"),
-    delim = " ",
-    col_types = readr::cols_only(
-      X1 = readr::col_character(),
-      X2 = readr::col_character()
-    ),
-    col_names = FALSE
+    colClasses = c("V1" = "character", "V2" = "character"),
+    header = FALSE
   )
 
-  pca_ndim <- 10
+  pca_res <- flashpcaR::flashpca(input_plink, ndim = n_comp)
+
+  pca_contrib <- sprintf(
+    fmt = "PC%02d (%s %%)",
+    1:n_comp,
+    format(pca_res$values / sum(pca_res$values) * 100, digits = 2, nsmall = 2, trim = TRUE)
+  )
+  names(pca_contrib) <- sprintf("PC%02d", 1:n_comp)
 
-  pca_res <- flashpcaR::flashpca(input_plink, ndim = pca_ndim)
-  pca_dfxy <- data.table::as.data.table(pca_res[["projection"]])
+  pca_dfxy <- data.table::as.data.table(pca_res[["projection"]], keep.rownames = "iid")
   data.table::setnames(
     x = pca_dfxy,
-    old = setdiff(names(pca_dfxy), id_var),
-    new = sprintf("PC%02d", as.numeric(gsub("V", "", setdiff(names(pca_dfxy), id_var))))
+    old = setdiff(names(pca_dfxy), "iid"),
+    new = sprintf("PC%02d", as.numeric(gsub("V", "", setdiff(names(pca_dfxy), "iid"))))
   )
 
   if (all.equal(fid_iid[[1]], fid_iid[[2]])) {
-    pca_dfxy[["sample"]] <- fid_iid[[2]]
+    pca_dfxy[, "iid" := fid_iid[[2]]]
   } else {
-    pca_dfxy[["sample"]] <- paste(fid_iid[[1]], fid_iid[[2]], sep = "/")
+    pca_dfxy[, "iid" := paste(fid_iid[[1]], fid_iid[[2]], sep = "/")]
   }
 
-  pca_dfxy <- pca_dfxy %>%
-    dplyr::left_join(
-      y = suppressWarnings(
-        readr::read_tsv(
-          file = ref1kg_population,
-          col_types = readr::cols_only(
-            sample = readr::col_character(),
-            pop = readr::col_character(),
-            super_pop = readr::col_character()
-          )
-        )
-      ),
-      by = "sample"
-    ) %>%
-    dplyr::mutate(
-      cohort = ifelse(is.na(.data[["pop"]]), cohort_name, "1,000 Genomes"),
-      pop = ifelse(is.na(.data[["pop"]]), cohort_name, .data[["pop"]]),
-      super_pop = ifelse(is.na(.data[["super_pop"]]), cohort_name, .data[["super_pop"]])
-    ) %>%
-    dplyr::mutate_at(
-      .vars = dplyr::vars(.data[["cohort"]], .data[["pop"]], .data[["super_pop"]]),
-      .funs = ~ stats::relevel(factor(.x), ref = cohort_name)
-    ) %>%
-    dplyr::select("sample", dplyr::everything())
-
-  pop_centre <- purrr::map_df(c("super_pop", "pop") , function(ipop) {
-    pca_dfxy %>%
-      dplyr::filter(.data[["cohort"]] != !!cohort_name) %>%
-      dplyr::select(-"cohort") %>%
-      dplyr::group_by(.data[[ipop]]) %>%
-      dplyr::summarise(
-        PC01_centre = mean(.data[["PC01"]]),
-        PC02_centre = mean(.data[["PC02"]])
-      ) %>%
-      dplyr::mutate(pop_type = ipop) %>%
-      dplyr::rename("pop_closest" = !!ipop) %>%
-      dplyr::mutate_if(.predicate = is.factor, .funs = as.character)
-  })
+  ref_pop_table <- data.table::fread(
+    file = ref1kg_population,
+    colClasses = "character",
+    header = FALSE,
+    col.names = c("iid", "pop", "super_pop", "sex")
+  )
 
-  pca_gg_pred_best <- pca_dfxy %>%
-    dplyr::select(-c(.data[["pop"]], .data[["super_pop"]])) %>%
-    dplyr::filter(.data[["cohort"]] == !!cohort_name) %>%
-    dplyr::mutate(pop_centre = list(pop_centre)) %>%
-    tidyr::unnest("pop_centre") %>%
-    dplyr::group_by(.data[["sample"]], .data[["pop_type"]]) %>%
-    dplyr::mutate(
-      pop_dist = purrr::pmap_dbl(
-        .l = list(
-          .x = .data[["PC01"]], .y = .data[["PC02"]],
-          .x_centre = .data[["PC01_centre"]], .y_centre = .data[["PC02_centre"]]
-        ),
-        .f = function(.x, .y, .x_centre, .y_centre) {
-          sqrt((.x - .x_centre)^2 + (.y - .y_centre)^2)
-        }
+  pca_dfxy <- merge(
+    x = pca_dfxy,
+    y = ref_pop_table[, "cohort" := "1,000 Genomes"],
+    by = "iid",
+    all = TRUE
+  )
+
+  cohort <- NULL
+  pca_dfxy[!cohort %in% "1,000 Genomes", (c("pop", "super_pop", "cohort")) := list(cohort_name, cohort_name, cohort_name)]
+  pca_dfxy[,
+    (c("pop", "super_pop", "cohort")) := lapply(
+      X = .SD,
+      FUN = function(x) factor(x, levels = c(cohort_name, sort(setdiff(x, cohort_name))))
+    ),
+    .SDcols = c("pop", "super_pop", "cohort")
+  ]
+
+  pop_centre <- pca_dfxy[
+    cohort %in% "1,000 Genomes",
+    lapply(.SD, mean),
+    .SDcols = sprintf("PC%02d", 1:2),
+    by = "pop"
+  ]
+
+  PC01 <- PC02 <- dist <- which_closest <- pop <- super_pop <- NULL
+  pca_dfxy[,
+    "pop_closest" := (function(.x, .y) {
+      as.character(
+        pop_centre[,
+          list("dist" = sqrt((.x - PC01)^2 + (.y - PC02)^2)),
+          by = "pop"
+        ][,
+          "which_closest" := dist == min(dist)
+        ][
+          (which_closest)
+        ][["pop"]]
       )
-    ) %>%
-    dplyr::slice(which.min(.data[["pop_dist"]])) %>%
-    dplyr::ungroup() %>%
-    dplyr::select(-dplyr::ends_with("_centre"), -.data[["pop_dist"]]) %>%
-    tidyr::pivot_wider(names_from = "pop_type", values_from = "pop_closest")
-
-  p <- purrr::map(.x = c("pop" = "pop", "super_pop" = "super_pop"), .f = function(ipop) {
-    ggplot2::ggplot(
-      data = pca_dfxy,
-      mapping = ggplot2::aes(x = .data[["PC01"]], y = .data[["PC02"]], colour = .data[[ipop]])
-    ) +
-      ggplot2::theme_light(base_size = 12) +
-    	ggplot2::geom_hline(yintercept = 0, linetype = 1, size = 0.5, na.rm = TRUE) +
-    	ggplot2::geom_vline(xintercept = 0, linetype = 1, size = 0.5, na.rm = TRUE) +
-      ggforce::geom_mark_ellipse(mapping = ggplot2::aes(fill = .data[[ipop]]), con.cap = 0, alpha = 0.1) +
-      ggplot2::geom_point(mapping = ggplot2::aes(shape = .data[[ipop]]), na.rm = TRUE) +
-    	ggplot2::scale_colour_viridis_d(na.translate = FALSE, drop = FALSE, end = 0.9) +
-      ggplot2::scale_fill_viridis_d(na.translate = FALSE, drop = FALSE, end = 0.9) +
-      ggplot2::scale_shape_manual(values = c(3, rep(1, length(unique(pca_dfxy[[ipop]]))))) +
-      ggplot2::labs(
-        shape = c("super_pop" = "Super Population", "pop" = "Population")[ipop],
-        colour = c("super_pop" = "Super Population", "pop" = "Population")[ipop],
-        fill = c("super_pop" = "Super Population", "pop" = "Population")[ipop]
+    })(PC01, PC02),
+    by = "iid"
+  ]
+
+  pca_dfxy <- merge(
+    x = pca_dfxy,
+    y = unique(ref_pop_table[, list("pop_closest" = pop, "super_pop_closest" = super_pop)]),
+    by = "pop_closest",
+    all.x = TRUE
+  )
+
+  p <- lapply(
+    X = c("pop" = "pop", "super_pop" = "super_pop"),
+    FUN = function(ipop) {
+      ggplot2::ggplot(
+        data = pca_dfxy,
+        mapping = ggplot2::aes(x = .data[["PC01"]], y = .data[["PC02"]], colour = .data[[ipop]])
       ) +
-      ggplot2::facet_grid(cols = ggplot2::vars(.data[["cohort"]])) +
-      ggplot2::guides(
-        colour = ggplot2::guide_legend(
-          ncol = 2,
-          label.theme = ggplot2::element_text(size = 8),
-          keyheight = ggplot2::unit(9, "point")
+        ggplot2::theme_light(base_size = 11) +
+      	ggplot2::geom_hline(yintercept = 0, linetype = 2, size = 0.5, na.rm = TRUE) +
+      	ggplot2::geom_vline(xintercept = 0, linetype = 2, size = 0.5, na.rm = TRUE) +
+        ggforce::geom_mark_ellipse(mapping = ggplot2::aes(fill = .data[[ipop]]), con.cap = 0, alpha = 0.1) +
+        ggplot2::geom_point(mapping = ggplot2::aes(shape = .data[[ipop]]), na.rm = TRUE) +
+      	ggplot2::scale_colour_viridis_d(na.translate = FALSE, drop = FALSE, end = 0.9) +
+        ggplot2::scale_fill_viridis_d(na.translate = FALSE, drop = FALSE, end = 0.9) +
+        ggplot2::scale_shape_manual(values = c(3, rep(1, length(unique(pca_dfxy[[ipop]]))))) +
+        ggplot2::labs(
+          x = pca_contrib["PC01"],
+          y = pca_contrib["PC02"],
+          shape = c("super_pop" = "Super Population", "pop" = "Population")[ipop],
+          colour = c("super_pop" = "Super Population", "pop" = "Population")[ipop],
+          fill = c("super_pop" = "Super Population", "pop" = "Population")[ipop]
+        ) +
+        ggplot2::facet_grid(cols = ggplot2::vars(.data[["cohort"]])) +
+        ggplot2::guides(
+          colour = ggplot2::guide_legend(
+            ncol = 2,
+            label.theme = ggplot2::element_text(size = 8),
+            keyheight = ggplot2::unit(9, "point")
+          )
         )
-      )
   })
 
-  p_zoom <- purrr::map(p, ~ .x + ggplot2::coord_cartesian(
-    xlim = range(dplyr::filter(pca_dfxy, .data[["cohort"]] == !!cohort_name)[["PC01"]]),
-    ylim = range(dplyr::filter(pca_dfxy, .data[["cohort"]] == !!cohort_name)[["PC02"]])
-  ))
+  p_zoom <- lapply(
+    X = p,
+    FUN = function(.p) {
+      .p +
+        ggplot2::coord_cartesian(
+          xlim = range(pca_dfxy[!cohort %in% "1,000 Genomes"][["PC01"]]),
+          ylim = range(pca_dfxy[!cohort %in% "1,000 Genomes"][["PC02"]])
+        )
+    }
+  )
+
+  p_subtitle <- paste0(
+    "Principal Component Analysis using ",
+    format(nrow(data.table::fread(paste0(input_plink, ".bim"))), big.mark = ",", digits = 0),
+    " SNPs<br>",
+    "With A) population level and B) super population level"
+  )
 
   p_ethni <- patchwork::wrap_plots(
-    patchwork::wrap_plots(p[["super_pop"]], p_zoom[["super_pop"]]) +
-      patchwork::plot_layout(tag_level = "new", guides = "collect"),
     patchwork::wrap_plots(p[["pop"]], p_zoom[["pop"]]) +
+      patchwork::plot_layout(tag_level = "new", guides = "collect"),
+    patchwork::wrap_plots(p[["super_pop"]], p_zoom[["super_pop"]]) +
       patchwork::plot_layout(tag_level = "new", guides = "collect")
   ) +
     patchwork::plot_layout(nrow = 2, ncol = 1) +
     patchwork::plot_annotation(
       tag_levels = c("A", "1"),
-      title = "Estimated Ethnicity",
-      subtitle = paste(
-        "Based on Principal Component Analysis using",
-        scales::comma(nrow(data.table::fwrite(paste0(input_plink, ".bim")))),
-        "SNPs."
+      title = "Ethnicty Inference Based On 1,000 Genomes Project Data",
+      subtitle = gsub("<br>W", " w", p_subtitle),
+      theme = ggplot2::theme(plot.title.position = "plot", plot.subtitle = ggtext::element_markdown())
+    )
+
+  p_cohort_base <- ggplot2::ggplot(
+    data = pca_dfxy[!cohort %in% "1,000 Genomes"],
+    mapping = ggplot2::aes(x = .data[["PC01"]], y = .data[["PC02"]])
+  ) +
+    ggplot2::theme_light(base_size = 11) +
+  	ggplot2::geom_hline(yintercept = 0, linetype = 2, size = 0.5, na.rm = TRUE) +
+  	ggplot2::geom_vline(xintercept = 0, linetype = 2, size = 0.5, na.rm = TRUE) +
+  	ggplot2::scale_colour_viridis_d(na.translate = FALSE, drop = FALSE, begin = 0.10, end = 0.90) +
+    ggplot2::scale_fill_viridis_d(na.translate = FALSE, drop = FALSE, begin = 0.10, end = 0.90) +
+    ggplot2::guides(
+      colour = ggplot2::guide_legend(
+        ncol = ifelse(nrow(unique(pca_dfxy[!cohort %in% "1,000 Genomes", "pop_closest"])) > 5, 2, 1)
       )
+    ) +
+    ggplot2::theme(plot.title.position = "plot") +
+    ggplot2::coord_cartesian(
+      xlim = range(pca_dfxy[!cohort %in% "1,000 Genomes"][["PC01"]]),
+      ylim = range(pca_dfxy[!cohort %in% "1,000 Genomes"][["PC02"]])
     )
 
+  p_cohort <- patchwork::wrap_plots(
+    p_cohort_base +
+      ggplot2::geom_point(mapping = ggplot2::aes(colour = .data[["pop_closest"]]), shape = 1, na.rm = TRUE) +
+      ggforce::geom_mark_ellipse(
+        mapping = ggplot2::aes(
+          colour = .data[["pop_closest"]],
+          group = .data[["pop_closest"]]
+        ),
+        fill = "transparent",
+        alpha = 0.10
+      ) +
+      ggplot2::labs(
+        x = pca_contrib["PC01"],
+        y = pca_contrib["PC02"],
+        shape = "Population",
+        colour = "Population",
+        fill = "Population"
+      ),
+    p_cohort_base +
+      ggplot2::geom_point(mapping = ggplot2::aes(colour = .data[["super_pop_closest"]]), shape = 1, na.rm = TRUE) +
+      ggforce::geom_mark_ellipse(
+        mapping = ggplot2::aes(
+          colour = .data[["super_pop_closest"]],
+          group = .data[["super_pop_closest"]]
+        ),
+        fill = "transparent",
+        alpha = 0.10
+      ) +
+      ggplot2::labs(
+        x = pca_contrib["PC01"],
+        y = pca_contrib["PC02"],
+        shape = "Super Population",
+        colour = "Super Population",
+        fill = "Super Population"
+      )
+  ) +
+    patchwork::plot_layout(nrow = 2, ncol = 1) +
+    patchwork::plot_annotation(
+      tag_levels = "A",
+      title = "Ethnicty Inference Based On 1,000 Genomes Project Data",
+      subtitle = p_subtitle,
+      theme = ggplot2::theme(plot.title.position = "plot", plot.subtitle = ggtext::element_markdown())
+    )
 
-  #################
-  ### Exporting ###
-  #################
   message(message_prefix, "Exporting ...")
   ggplot2::ggsave(
-    filename = file.path(output_directory, paste0(cohort_name, "_ethnicity.pdf")),
+    filename = file.path(output_directory, paste0(cohort_name, "_ethnicity_1kg.pdf")),
     plot = p_ethni,
     width = 29.7 - 5,
     height = 21 - 5,
     units = "cm",
     dpi = 120
   )
+  ggplot2::ggsave(
+    filename = file.path(output_directory, paste0(cohort_name, "_ethnicity.pdf")),
+    plot = p_cohort,
+    width = 16,
+    height = 16,
+    units = "cm",
+    dpi = 120
+  )
 
   invisible(
-    readr::write_csv(
-      x = pca_gg_pred_best,
-      path = file.path(output_directory, paste0(cohort_name, "_ethnicity.csv"))
+    data.table::fwrite(
+      x = pca_dfxy[
+        !cohort %in% "1,000 Genomes",
+        .SD,
+        .SDcols = c("iid", sprintf("PC%02d", 1:n_comp), "cohort", "pop_closest", "super_pop_closest")
+      ],
+      file = file.path(output_directory, paste0(cohort_name, "_ethnicity.csv"))
     )
   )
 }
diff --git a/R/estimate_ethnicity.R b/R/estimate_ethnicity.R
index bbf184a..3928402 100755
--- a/R/estimate_ethnicity.R
+++ b/R/estimate_ethnicity.R
@@ -16,6 +16,7 @@
 #'     + 'haploid'/'h': Treat half-calls as haploid/homozygous (the PLINK 1 file format does not distinguish between the two). This maximizes similarity between the VCF and BCF2 parsers.
 #'     + 'missing'/'m': Treat half-calls as missing (default).
 #'     + 'reference'/'r': Treat the missing part as reference.
+#' @param n_comp A `numeric`. The number of principal components to be computed.
 #' @param n_cores An `integer`. The number of CPUs to use to estimate the ethnicity.
 #' @param bin_path A `list(character)`. A list giving the binary path of
 #'   `vcftools`, `bcftools`, `bgzip`, `tabix` and `plink`.
@@ -35,6 +36,7 @@ estimate_ethnicity <- function(
   quality_threshold = 0.9,
   recode = "all",
   vcf_half_call = "missing",
+  n_comp = 10,
   n_cores = 6,
   bin_path = list(
     vcftools = "/usr/bin/vcftools",
@@ -129,7 +131,8 @@ estimate_ethnicity <- function(
     cohort_name = cohort_name,
     input_plink = file.path(output_directory, "all"),
     output_directory = output_directory,
-    ref1kg_population = ref1kg_population
+    ref1kg_population = ref1kg_population,
+    n_comp = n_comp
   )
 
 }
diff --git a/R/pca_report.R b/R/pca_report.R
index 98173d4..1413fdf 100755
--- a/R/pca_report.R
+++ b/R/pca_report.R
@@ -17,6 +17,7 @@
 #' @import data.table
 #' @import ggplot2
 #' @import gt
+#' @import patchwork
 #'
 #' @examples
 #'
@@ -101,14 +102,17 @@ pca_report <- function(
       x = paste0(
         .data[["x"]],
         "<br><i style='font-size:5pt;'>(",
-          scales::percent_format(accuracy = 0.01, suffix = " %")(.data[["y"]]),
-        ")</i>"
+          format(.data[["y"]] * 100, digits = 2, nsmall = 2),
+        " %)</i>"
       ),
       y = .data[["y"]]
     )
   ) +
-    ggplot2::geom_col(width = 1, colour = "white", fill = scales::viridis_pal(begin = 0.5, end = 0.5)(1), na.rm = TRUE) +
-    ggplot2::scale_y_continuous(labels = scales::percent_format(accuracy = 0.1, suffix = " %"), expand = ggplot2::expansion(mult = c(0, 0.05))) +
+    ggplot2::geom_col(width = 1, colour = "white", fill = "#21908CFF", na.rm = TRUE) +
+    ggplot2::scale_y_continuous(
+      labels = function(x) paste(format(x * 100, digits = 2, nsmall = 2), "%"),
+      expand = ggplot2::expansion(mult = c(0, 0.05))
+    ) +
     ggplot2::labs(
       x = "Principal Components",
       y = "Contribution"
@@ -146,7 +150,7 @@ pca_report <- function(
           label = gsub(
             pattern = "(.*)e([-+]*)0*(.*)",
             replacement = "\\1<br>&times;<br>10<sup>\\2\\3</sup>",
-            x = scales::scientific(.data[["Pr(>F)"]], digits = 2)
+            x = format(.data[["Pr(>F)"]], digits = 2, nsmall = 2, scientific = TRUE)
           )
         ),
         colour = "white",
@@ -162,8 +166,12 @@ pca_report <- function(
         labels = function(x) {
           paste0(
             x, "<br><i style='font-size:5pt;'>(",
-            scales::percent_format(accuracy = 0.01, suffix = " %")((pca_res$values / sum(pca_res$values))[as.numeric(gsub("PC", "", x))]),
-            ")</i>"
+            format(
+              x = (pca_res$values / sum(pca_res$values))[as.numeric(gsub("PC", "", x))] * 100,
+              digits = 2,
+              nsmall = 2
+            ),
+            " %)</i>"
           )
         }
       ) +
diff --git a/man/compute_pca.Rd b/man/compute_pca.Rd
index d90f627..d551dbd 100644
--- a/man/compute_pca.Rd
+++ b/man/compute_pca.Rd
@@ -4,7 +4,13 @@
 \alias{compute_pca}
 \title{Compute the genomic components (and some figures) for ethnicity based on VCF files.}
 \usage{
-compute_pca(cohort_name, input_plink, output_directory, ref1kg_population)
+compute_pca(
+  cohort_name,
+  input_plink,
+  output_directory,
+  ref1kg_population,
+  n_comp = 10
+)
 }
 \arguments{
 \item{cohort_name}{A \code{character}. A name to describe the studied population compared to 1,000 Genomes.}
@@ -14,6 +20,8 @@ compute_pca(cohort_name, input_plink, output_directory, ref1kg_population)
 \item{output_directory}{A \code{character}. The path where the data and figures is written.}
 
 \item{ref1kg_population}{A \code{character}. A file which describe samples and their ethnicity.}
+
+\item{n_comp}{A \code{numeric}. The number of principal components to be computed.}
 }
 \value{
 A \code{data.frame}.
diff --git a/man/estimate_ethnicity.Rd b/man/estimate_ethnicity.Rd
index ec1c81b..b29f980 100644
--- a/man/estimate_ethnicity.Rd
+++ b/man/estimate_ethnicity.Rd
@@ -17,6 +17,7 @@ estimate_ethnicity(
   quality_threshold = 0.9,
   recode = "all",
   vcf_half_call = "missing",
+  n_comp = 10,
   n_cores = 6,
   bin_path = list(vcftools = "/usr/bin/vcftools", bcftools = "/usr/bin/bcftools", bgzip
     = "/usr/bin/bgzip", tabix = "/usr/bin/tabix", plink = "/usr/bin/plink1.9")
@@ -51,6 +52,8 @@ estimate_ethnicity(
 + 'missing'/'m': Treat half-calls as missing (default).
 + 'reference'/'r': Treat the missing part as reference.}
 
+\item{n_comp}{A \code{numeric}. The number of principal components to be computed.}
+
 \item{n_cores}{An \code{integer}. The number of CPUs to use to estimate the ethnicity.}
 
 \item{bin_path}{A \code{list(character)}. A list giving the binary path of