Merge pull request #333 from immunomind/dev

Immunarch 0.9.0 release
immunomind · Dec 16, 2022 · ac2c840 · ac2c840
2 parents bc90fed + 09b0012
commit ac2c840
Show file tree

Hide file tree

Showing 16 changed files with 518 additions and 66 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: immunarch
 Type: Package
 Title: Bioinformatics Analysis of T-Cell and B-Cell Immune Repertoires
-Version: 0.8.0
+Version: 0.9.0
 Authors@R: c(
     person("Vadim I.", "Nazarov", , "[email protected]", c("aut", "cre")),
     person("Vasily O.", "Tsvetkov", , role = "aut"),
@@ -84,6 +84,6 @@ Suggests:
     rmarkdown
 VignetteBuilder: knitr
 Encoding: UTF-8
-RoxygenNote: 7.2.1
+RoxygenNote: 7.2.2
 LazyData: true
 LazyDataCompression: xz
diff --git a/NAMESPACE b/NAMESPACE
@@ -147,10 +147,12 @@ importFrom(dplyr,group_map)
 importFrom(dplyr,left_join)
 importFrom(dplyr,mutate)
 importFrom(dplyr,n)
+importFrom(dplyr,one_of)
 importFrom(dplyr,pull)
 importFrom(dplyr,rename)
 importFrom(dplyr,rowwise)
 importFrom(dplyr,select)
+importFrom(dplyr,select_)
 importFrom(dplyr,select_if)
 importFrom(dplyr,summarise)
 importFrom(dplyr,tally)
@@ -290,13 +292,13 @@ importFrom(tibble,tibble)
 importFrom(tidyr,drop_na)
 importFrom(tidyr,unite)
 importFrom(tidyr,unnest)
+importFrom(tidyselect,all_of)
 importFrom(tidyselect,any_of)
 importFrom(tidyselect,starts_with)
 importFrom(utils,capture.output)
 importFrom(utils,packageVersion)
 importFrom(utils,read.table)
 importFrom(utils,setTxtProgressBar)
-importFrom(utils,str)
 importFrom(utils,tail)
 importFrom(utils,txtProgressBar)
 importFrom(uuid,UUIDgenerate)

diff --git a/R/align_lineage.R b/R/align_lineage.R
@@ -9,7 +9,6 @@
 #' @importFrom plyr dlply .
 #' @importFrom purrr map_dfr
 #' @importFrom rlist list.remove
-#' @importFrom utils str
 #' @importFrom ape as.DNAbin clustal
 #' @importFrom doParallel registerDoParallel stopImplicitCluster
 #' @importFrom parallel mclapply
@@ -117,7 +116,7 @@ align_single_df <- function(data,
         "Found dataframe without required column ",
         required_column,
         ";\nexisting columns: ",
-        utils::str(colnames(data))
+        toString(colnames(data))
       )
     }
   }

diff --git a/R/diversity.R b/R/diversity.R
@@ -14,6 +14,7 @@ if (getRversion() >= "2.15.1") {
 #' @importFrom dplyr mutate group_by_at pull
 #' @importFrom stats qnorm
 #' @importFrom rlang sym
+#' @importFrom tidyselect all_of
 #'
 #' @description
 #' This is a utility function to estimate the diversity of species or objects in the given distribution.
@@ -188,15 +189,14 @@ repDiversity <- function(.data, .method = "chao1", .col = "aa", .max.q = 6, .min
       .data <- list(Sample = .data)
     }
 
-
     .col <- process_col_argument(.col) # ToDo: refactor this and the next branches
 
     vec <- lapply(.data, function(x) {
       if (has_class(x, "data.table")) {
         x <- x %>% lazy_dt()
       }
       x %>%
-        select(.col, IMMCOL$count) %>%
+        select(all_of(.col), IMMCOL$count) %>%
         group_by_at(vars(.col)) %>%
         summarise(Div.count = sum(!!sym(IMMCOL$count))) %>%
         pull(Div.count)
@@ -350,20 +350,20 @@ rarefaction <- function(.data, .step = NA, .quantile = c(.025, .975),
   if (is.na(.step)) {
     .step <- min(sapply(.data, function(x) sum(as.numeric(x)))) %/% 50.
   }
+  .step <- max(1, .step)
 
   if (is.na(.extrapolation)) {
     .extrapolation <- max(sapply(.data, function(x) sum(as.numeric(x)))) * 20
   }
 
   .alpha <- function(n, Xi, m) {
-    k <- Xi
     return((1 - m / n)^Xi)
   }
 
   if (.verbose) {
     pb <- set_pb(sum(sapply(seq_along(.data), function(i) {
       bc.vec <- .data[[i]]
-      bc.sum <- sum(.data[[i]])
+      bc.sum <- sum(bc.vec)
       sizes <- seq(.step, bc.sum, .step)
       if (sizes[length(sizes)] != bc.sum) {
         sizes <- c(sizes, bc.sum)
@@ -381,9 +381,7 @@ rarefaction <- function(.data, .step = NA, .quantile = c(.025, .975),
     }
     n <- sum(bc.vec)
     sizes <- seq(.step, n, .step)
-    # if (sizes[length(sizes)] != n) {
-    #   sizes <- c(sizes, n)
-    # }
+
     counts <- table(bc.vec)
     muc.res <- t(sapply(sizes, function(sz) {
       freqs <- as.numeric(names(counts))
@@ -413,7 +411,6 @@ rarefaction <- function(.data, .step = NA, .quantile = c(.025, .975),
     }))
 
     if (.extrapolation > 0) {
-      # sizes <- seq(sum(.data[[i]]), .extrapolation + max(sapply(.data, function (x) sum(x))), .step)
       sizes <- seq(
         tail(seq(.step, sum(.data[[i]]), .step), 1) + .step,
         .extrapolation,

diff --git a/R/io-parsers.R b/R/io-parsers.R
@@ -399,10 +399,10 @@ parse_mitcr <- function(.filename, .mode) {
   )
 }
 
-parse_mixcr <- function(.filename, .mode) {
+parse_mixcr <- function(.filename, .mode, .count = c("clonecount", "readcount")) {
   .filename <- .filename
   .id <- "cloneid"
-  .count <- "clonecount"
+  .count %<>% tolower()
   .sep <- "\t"
   .vd.insertions <- "VD.insertions"
   .dj.insertions <- "DJ.insertions"
@@ -677,16 +677,21 @@ parse_mixcr <- function(.filename, .mode) {
     df[[pos_extra_headers[["j3del"]]]] <- sapply(df[["refpoints"]], get_ref_point_position, 18)
   }
 
-  if (!(.count %in% table.colnames)) {
+  if (!any(.count %in% table.colnames)) {
     warn_msg <- c("  [!] Warning: can't find a column with clonal counts. Setting all clonal counts to 1.")
     warn_msg <- c(warn_msg, "\n      Did you apply repLoad to MiXCR file *_alignments.txt?")
     warn_msg <- c(warn_msg, " If so please consider moving all *.clonotypes.*.txt MiXCR files to")
     warn_msg <- c(warn_msg, " a separate folder and apply repLoad to the folder.")
     warn_msg <- c(warn_msg, "\n      Note: The *_alignments.txt file IS NOT a repertoire file suitable for any analysis.")
     message(warn_msg)
 
+    .count <- .count[1]
     df[[.count]] <- 1
+  } else if (length(.count) > 1) {
+    # if multiple column name options specified for .count, keep only the first valid
+    .count <- .count[.count %in% table.colnames][1]
   }
+
   .freq <- "Proportion"
   df$Proportion <- df[[.count]] / sum(df[[.count]], na.rm = TRUE)
 
@@ -829,8 +834,6 @@ parse_tcr <- function(.filename, .mode) {
 }
 
 parse_vdjtools <- function(.filename, .mode) {
-  skip <- 0
-
   # Check for different VDJtools outputs
   f <- file(.filename, "r")
   l <- readLines(f, 1)
@@ -959,19 +962,28 @@ parse_airr <- function(.filename, .mode) {
     .as_tsv() %>%
     airr::read_rearrangement()
 
-  df <- df %>%
-    select(
-      sequence, v_call, d_call, j_call, junction, junction_aa,
-      contains("v_germline_end"), contains("d_germline_start"), contains("d_germline_end"),
-      contains("j_germline_start"), contains("np1_length"), contains("np2_length"),
-      contains("duplicate_count")
+  df %<>%
+    select_(
+      "sequence", "v_call", "d_call", "j_call", "junction", "junction_aa",
+      ~contains("v_germline_end"), ~contains("d_germline_start"),
+      ~contains("d_germline_end"), ~contains("j_germline_start"),
+      ~contains("np1_length"), ~contains("np2_length"),
+      ~contains("duplicate_count"),
+      "cdr1", "cdr2", "cdr1_aa", "cdr2_aa", "fwr1", "fwr2", "fwr3", "fwr4",
+      "fwr1_aa", "fwr2_aa", "fwr3_aa", "fwr4_aa"
     )
 
   namekey <- c(
     duplicate_count = IMMCOL$count, junction = IMMCOL$cdr3nt, junction_aa = IMMCOL$cdr3aa,
     v_call = IMMCOL$v, d_call = IMMCOL$d, j_call = IMMCOL$j, v_germline_end = IMMCOL$ve,
     d_germline_start = IMMCOL$ds, d_germline_end = IMMCOL$de, j_germline_start = IMMCOL$js,
-    np1_length = "unidins", np2_length = IMMCOL$dnj, sequence = IMMCOL$seq
+    np1_length = "unidins", np2_length = IMMCOL$dnj, sequence = IMMCOL$seq,
+    cdr1 = IMMCOL_EXT$cdr1nt, cdr2 = IMMCOL_EXT$cdr2nt,
+    cdr1_aa = IMMCOL_EXT$cdr1aa, cdr2_aa = IMMCOL_EXT$cdr2aa,
+    fwr1 = IMMCOL_EXT$fr1nt, fwr2 = IMMCOL_EXT$fr2nt,
+    fwr3 = IMMCOL_EXT$fr3nt, fwr4 = IMMCOL_EXT$fr4nt,
+    fwr1_aa = IMMCOL_EXT$fr1aa, fwr2_aa = IMMCOL_EXT$fr2aa,
+    fwr3_aa = IMMCOL_EXT$fr3aa, fwr4_aa = IMMCOL_EXT$fr4aa
   )
 
   names(df) <- namekey[names(df)]
@@ -993,13 +1005,15 @@ parse_airr <- function(.filename, .mode) {
     }
   }
 
-  for (column in IMMCOL$order) {
+  order <- c(IMMCOL$order, IMMCOL_EXT$order[IMMCOL_EXT$order %in% namekey])
+
+  for (column in order) {
     if (!(column %in% colnames(df))) {
       df[column] <- NA
     }
   }
 
-  df <- df[IMMCOL$order]
+  df <- df[order]
   total <- sum(df$Clones)
   df[IMMCOL$prop] <- df[IMMCOL$count] / total
   df[IMMCOL$seq] <- stringr::str_remove_all(df[[IMMCOL$seq]], "N")
@@ -1039,21 +1053,50 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
     .vgenes = "v_gene", .jgenes = "j_gene", .dgenes = "d_gene",
     .vend = NA, .jstart = NA, .dstart = NA, .dend = NA,
     .vd.insertions = NA, .dj.insertions = NA, .total.insertions = NA,
-    .skip = 0, .sep = ",", # .add = c("chain", "raw_clonotype_id", "raw_consensus_id", "barcode", "contig_id")
-    .add = c("chain", "barcode", "raw_clonotype_id", "contig_id", "c_gene")
+    .skip = 0, .sep = ",",
+    .add = c(
+      "chain", "barcode", "raw_clonotype_id", "contig_id", "c_gene",
+      "cdr1_nt", "cdr1", "cdr2_nt", "cdr2",
+      "fwr1_nt", "fwr1", "fwr2_nt", "fwr2", "fwr3_nt", "fwr3", "fwr4_nt", "fwr4"
+    )
   )
 
+  setnames(df, "cdr1_nt", IMMCOL_EXT$cdr1nt)
+  setnames(df, "cdr2_nt", IMMCOL_EXT$cdr2nt)
+  setnames(df, "cdr1", IMMCOL_EXT$cdr1aa)
+  setnames(df, "cdr2", IMMCOL_EXT$cdr2aa)
+  setnames(df, "fwr1_nt", IMMCOL_EXT$fr1nt)
+  setnames(df, "fwr2_nt", IMMCOL_EXT$fr2nt)
+  setnames(df, "fwr3_nt", IMMCOL_EXT$fr3nt)
+  setnames(df, "fwr4_nt", IMMCOL_EXT$fr4nt)
+  setnames(df, "fwr1", IMMCOL_EXT$fr1aa)
+  setnames(df, "fwr2", IMMCOL_EXT$fr2aa)
+  setnames(df, "fwr3", IMMCOL_EXT$fr3aa)
+  setnames(df, "fwr4", IMMCOL_EXT$fr4aa)
+
   # Process 10xGenomics filtered contigs files - count barcodes, merge consensues ids, clonotype ids and contig ids
   df <- df[order(df$chain), ]
   setDT(df)
 
   if (.mode == "paired") {
-    df <- df %>%
+    df %<>%
       lazy_dt() %>%
-      group_by(barcode, raw_clonotype_id) %>%
+      group_by_colnames("barcode", "raw_clonotype_id") %>%
       summarise(
+        CDR1.nt = paste0(get("CDR1.nt"), collapse = IMMCOL_ADD$scsep),
+        CDR1.aa = paste0(get("CDR1.aa"), collapse = IMMCOL_ADD$scsep),
+        CDR2.nt = paste0(get("CDR2.nt"), collapse = IMMCOL_ADD$scsep),
+        CDR2.aa = paste0(get("CDR2.aa"), collapse = IMMCOL_ADD$scsep),
         CDR3.nt = paste0(get("CDR3.nt"), collapse = IMMCOL_ADD$scsep),
         CDR3.aa = paste0(get("CDR3.aa"), collapse = IMMCOL_ADD$scsep),
+        FR1.nt = paste0(get("FR1.nt"), collapse = IMMCOL_ADD$scsep),
+        FR1.aa = paste0(get("FR1.aa"), collapse = IMMCOL_ADD$scsep),
+        FR2.nt = paste0(get("FR2.nt"), collapse = IMMCOL_ADD$scsep),
+        FR2.aa = paste0(get("FR2.aa"), collapse = IMMCOL_ADD$scsep),
+        FR3.nt = paste0(get("FR3.nt"), collapse = IMMCOL_ADD$scsep),
+        FR3.aa = paste0(get("FR3.aa"), collapse = IMMCOL_ADD$scsep),
+        FR4.nt = paste0(get("FR4.nt"), collapse = IMMCOL_ADD$scsep),
+        FR4.aa = paste0(get("FR4.aa"), collapse = IMMCOL_ADD$scsep),
         V.name = paste0(get("V.name"), collapse = IMMCOL_ADD$scsep),
         J.name = paste0(get("J.name"), collapse = IMMCOL_ADD$scsep),
         D.name = paste0(get("D.name"), collapse = IMMCOL_ADD$scsep),
@@ -1067,23 +1110,46 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
       as.data.table()
   }
 
-  df <- df %>%
+  df %<>%
     lazy_dt() %>%
-    group_by(CDR3.nt, V.name, J.name) %>%
+    mutate(
+      CDR3.nt.sorted = sort_string(get("CDR3.nt"), IMMCOL_ADD$scsep),
+      V.name.sorted = sort_string(get("V.name"), IMMCOL_ADD$scsep),
+      J.name.sorted = sort_string(get("J.name"), IMMCOL_ADD$scsep)
+    ) %>%
+    group_by_colnames("CDR3.nt.sorted", "V.name.sorted", "J.name.sorted") %>%
     summarise(
       Clones = length(unique(get("barcode"))),
+      CDR3.nt = first(get("CDR3.nt")),
       CDR3.aa = first(get("CDR3.aa")),
+      V.name = first(get("V.name")),
       D.name = first(get("D.name")),
+      J.name = first(get("J.name")),
       chain = first(get("chain")),
       barcode = paste0(unique(get("barcode")), collapse = IMMCOL_ADD$scsep),
       raw_clonotype_id = gsub(
         "clonotype|None", "",
         paste0(unique(get("raw_clonotype_id")), collapse = IMMCOL_ADD$scsep)
       ),
       contig_id = paste0(get("contig_id"), collapse = IMMCOL_ADD$scsep),
-      c_gene = first(get("c_gene"))
+      c_gene = first(get("c_gene")),
+      CDR1.nt = first(get(IMMCOL_EXT$cdr1nt)),
+      CDR2.nt = first(get(IMMCOL_EXT$cdr2nt)),
+      CDR1.aa = first(get(IMMCOL_EXT$cdr1aa)),
+      CDR2.aa = first(get(IMMCOL_EXT$cdr2aa)),
+      FR1.nt = first(get(IMMCOL_EXT$fr1nt)),
+      FR2.nt = first(get(IMMCOL_EXT$fr2nt)),
+      FR3.nt = first(get(IMMCOL_EXT$fr3nt)),
+      FR4.nt = first(get(IMMCOL_EXT$fr4nt)),
+      FR1.aa = first(get(IMMCOL_EXT$fr1aa)),
+      FR2.aa = first(get(IMMCOL_EXT$fr2aa)),
+      FR3.aa = first(get(IMMCOL_EXT$fr3aa)),
+      FR4.aa = first(get(IMMCOL_EXT$fr4aa))
     ) %>%
-    as.data.table()
+    as.data.table() %>%
+    subset(
+      select = -c(get("CDR3.nt.sorted"), get("V.name.sorted"), get("J.name.sorted"))
+    )
 
   df$V.end <- NA
   df$J.start <- NA

diff --git a/R/io-utility.R b/R/io-utility.R
@@ -76,7 +76,7 @@
 
 
 .make_names <- function(.char) {
-  if (is.na(.char[1])) {
+  if (has_no_data(.char)) {
     NA
   } else {
     tolower(.char)
@@ -136,8 +136,8 @@
     .vend, .jstart, .dstart, .dend,
     .vd.insertions, .dj.insertions, .total.insertions
   ))
-  if (!is.na(.add[1])) {
-    swlist <- c(swlist, rep(col_guess(), length(.add)))
+  if (!has_no_data(.add)) {
+    swlist <- c(swlist, rep(list(col_guess()), length(.add)))
     names(swlist)[tail(seq_along(swlist), length(.add))] <- .add
   }