Skip to content

Commit

Permalink
Merge pull request #333 from immunomind/dev
Browse files Browse the repository at this point in the history
Immunarch 0.9.0 release
  • Loading branch information
Alexander230 authored Dec 16, 2022
2 parents bc90fed + 09b0012 commit ac2c840
Show file tree
Hide file tree
Showing 16 changed files with 518 additions and 66 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: immunarch
Type: Package
Title: Bioinformatics Analysis of T-Cell and B-Cell Immune Repertoires
Version: 0.8.0
Version: 0.9.0
Authors@R: c(
person("Vadim I.", "Nazarov", , "[email protected]", c("aut", "cre")),
person("Vasily O.", "Tsvetkov", , role = "aut"),
Expand Down Expand Up @@ -84,6 +84,6 @@ Suggests:
rmarkdown
VignetteBuilder: knitr
Encoding: UTF-8
RoxygenNote: 7.2.1
RoxygenNote: 7.2.2
LazyData: true
LazyDataCompression: xz
4 changes: 3 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -147,10 +147,12 @@ importFrom(dplyr,group_map)
importFrom(dplyr,left_join)
importFrom(dplyr,mutate)
importFrom(dplyr,n)
importFrom(dplyr,one_of)
importFrom(dplyr,pull)
importFrom(dplyr,rename)
importFrom(dplyr,rowwise)
importFrom(dplyr,select)
importFrom(dplyr,select_)
importFrom(dplyr,select_if)
importFrom(dplyr,summarise)
importFrom(dplyr,tally)
Expand Down Expand Up @@ -290,13 +292,13 @@ importFrom(tibble,tibble)
importFrom(tidyr,drop_na)
importFrom(tidyr,unite)
importFrom(tidyr,unnest)
importFrom(tidyselect,all_of)
importFrom(tidyselect,any_of)
importFrom(tidyselect,starts_with)
importFrom(utils,capture.output)
importFrom(utils,packageVersion)
importFrom(utils,read.table)
importFrom(utils,setTxtProgressBar)
importFrom(utils,str)
importFrom(utils,tail)
importFrom(utils,txtProgressBar)
importFrom(uuid,UUIDgenerate)
Expand Down
3 changes: 1 addition & 2 deletions R/align_lineage.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#' @importFrom plyr dlply .
#' @importFrom purrr map_dfr
#' @importFrom rlist list.remove
#' @importFrom utils str
#' @importFrom ape as.DNAbin clustal
#' @importFrom doParallel registerDoParallel stopImplicitCluster
#' @importFrom parallel mclapply
Expand Down Expand Up @@ -117,7 +116,7 @@ align_single_df <- function(data,
"Found dataframe without required column ",
required_column,
";\nexisting columns: ",
utils::str(colnames(data))
toString(colnames(data))
)
}
}
Expand Down
13 changes: 5 additions & 8 deletions R/diversity.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ if (getRversion() >= "2.15.1") {
#' @importFrom dplyr mutate group_by_at pull
#' @importFrom stats qnorm
#' @importFrom rlang sym
#' @importFrom tidyselect all_of
#'
#' @description
#' This is a utility function to estimate the diversity of species or objects in the given distribution.
Expand Down Expand Up @@ -188,15 +189,14 @@ repDiversity <- function(.data, .method = "chao1", .col = "aa", .max.q = 6, .min
.data <- list(Sample = .data)
}


.col <- process_col_argument(.col) # ToDo: refactor this and the next branches

vec <- lapply(.data, function(x) {
if (has_class(x, "data.table")) {
x <- x %>% lazy_dt()
}
x %>%
select(.col, IMMCOL$count) %>%
select(all_of(.col), IMMCOL$count) %>%
group_by_at(vars(.col)) %>%
summarise(Div.count = sum(!!sym(IMMCOL$count))) %>%
pull(Div.count)
Expand Down Expand Up @@ -350,20 +350,20 @@ rarefaction <- function(.data, .step = NA, .quantile = c(.025, .975),
if (is.na(.step)) {
.step <- min(sapply(.data, function(x) sum(as.numeric(x)))) %/% 50.
}
.step <- max(1, .step)

if (is.na(.extrapolation)) {
.extrapolation <- max(sapply(.data, function(x) sum(as.numeric(x)))) * 20
}

.alpha <- function(n, Xi, m) {
k <- Xi
return((1 - m / n)^Xi)
}

if (.verbose) {
pb <- set_pb(sum(sapply(seq_along(.data), function(i) {
bc.vec <- .data[[i]]
bc.sum <- sum(.data[[i]])
bc.sum <- sum(bc.vec)
sizes <- seq(.step, bc.sum, .step)
if (sizes[length(sizes)] != bc.sum) {
sizes <- c(sizes, bc.sum)
Expand All @@ -381,9 +381,7 @@ rarefaction <- function(.data, .step = NA, .quantile = c(.025, .975),
}
n <- sum(bc.vec)
sizes <- seq(.step, n, .step)
# if (sizes[length(sizes)] != n) {
# sizes <- c(sizes, n)
# }

counts <- table(bc.vec)
muc.res <- t(sapply(sizes, function(sz) {
freqs <- as.numeric(names(counts))
Expand Down Expand Up @@ -413,7 +411,6 @@ rarefaction <- function(.data, .step = NA, .quantile = c(.025, .975),
}))

if (.extrapolation > 0) {
# sizes <- seq(sum(.data[[i]]), .extrapolation + max(sapply(.data, function (x) sum(x))), .step)
sizes <- seq(
tail(seq(.step, sum(.data[[i]]), .step), 1) + .step,
.extrapolation,
Expand Down
110 changes: 88 additions & 22 deletions R/io-parsers.R
Original file line number Diff line number Diff line change
Expand Up @@ -399,10 +399,10 @@ parse_mitcr <- function(.filename, .mode) {
)
}

parse_mixcr <- function(.filename, .mode) {
parse_mixcr <- function(.filename, .mode, .count = c("clonecount", "readcount")) {
.filename <- .filename
.id <- "cloneid"
.count <- "clonecount"
.count %<>% tolower()
.sep <- "\t"
.vd.insertions <- "VD.insertions"
.dj.insertions <- "DJ.insertions"
Expand Down Expand Up @@ -677,16 +677,21 @@ parse_mixcr <- function(.filename, .mode) {
df[[pos_extra_headers[["j3del"]]]] <- sapply(df[["refpoints"]], get_ref_point_position, 18)
}

if (!(.count %in% table.colnames)) {
if (!any(.count %in% table.colnames)) {
warn_msg <- c(" [!] Warning: can't find a column with clonal counts. Setting all clonal counts to 1.")
warn_msg <- c(warn_msg, "\n Did you apply repLoad to MiXCR file *_alignments.txt?")
warn_msg <- c(warn_msg, " If so please consider moving all *.clonotypes.*.txt MiXCR files to")
warn_msg <- c(warn_msg, " a separate folder and apply repLoad to the folder.")
warn_msg <- c(warn_msg, "\n Note: The *_alignments.txt file IS NOT a repertoire file suitable for any analysis.")
message(warn_msg)

.count <- .count[1]
df[[.count]] <- 1
} else if (length(.count) > 1) {
# if multiple column name options specified for .count, keep only the first valid
.count <- .count[.count %in% table.colnames][1]
}

.freq <- "Proportion"
df$Proportion <- df[[.count]] / sum(df[[.count]], na.rm = TRUE)

Expand Down Expand Up @@ -829,8 +834,6 @@ parse_tcr <- function(.filename, .mode) {
}

parse_vdjtools <- function(.filename, .mode) {
skip <- 0

# Check for different VDJtools outputs
f <- file(.filename, "r")
l <- readLines(f, 1)
Expand Down Expand Up @@ -959,19 +962,28 @@ parse_airr <- function(.filename, .mode) {
.as_tsv() %>%
airr::read_rearrangement()

df <- df %>%
select(
sequence, v_call, d_call, j_call, junction, junction_aa,
contains("v_germline_end"), contains("d_germline_start"), contains("d_germline_end"),
contains("j_germline_start"), contains("np1_length"), contains("np2_length"),
contains("duplicate_count")
df %<>%
select_(
"sequence", "v_call", "d_call", "j_call", "junction", "junction_aa",
~contains("v_germline_end"), ~contains("d_germline_start"),
~contains("d_germline_end"), ~contains("j_germline_start"),
~contains("np1_length"), ~contains("np2_length"),
~contains("duplicate_count"),
"cdr1", "cdr2", "cdr1_aa", "cdr2_aa", "fwr1", "fwr2", "fwr3", "fwr4",
"fwr1_aa", "fwr2_aa", "fwr3_aa", "fwr4_aa"
)

namekey <- c(
duplicate_count = IMMCOL$count, junction = IMMCOL$cdr3nt, junction_aa = IMMCOL$cdr3aa,
v_call = IMMCOL$v, d_call = IMMCOL$d, j_call = IMMCOL$j, v_germline_end = IMMCOL$ve,
d_germline_start = IMMCOL$ds, d_germline_end = IMMCOL$de, j_germline_start = IMMCOL$js,
np1_length = "unidins", np2_length = IMMCOL$dnj, sequence = IMMCOL$seq
np1_length = "unidins", np2_length = IMMCOL$dnj, sequence = IMMCOL$seq,
cdr1 = IMMCOL_EXT$cdr1nt, cdr2 = IMMCOL_EXT$cdr2nt,
cdr1_aa = IMMCOL_EXT$cdr1aa, cdr2_aa = IMMCOL_EXT$cdr2aa,
fwr1 = IMMCOL_EXT$fr1nt, fwr2 = IMMCOL_EXT$fr2nt,
fwr3 = IMMCOL_EXT$fr3nt, fwr4 = IMMCOL_EXT$fr4nt,
fwr1_aa = IMMCOL_EXT$fr1aa, fwr2_aa = IMMCOL_EXT$fr2aa,
fwr3_aa = IMMCOL_EXT$fr3aa, fwr4_aa = IMMCOL_EXT$fr4aa
)

names(df) <- namekey[names(df)]
Expand All @@ -993,13 +1005,15 @@ parse_airr <- function(.filename, .mode) {
}
}

for (column in IMMCOL$order) {
order <- c(IMMCOL$order, IMMCOL_EXT$order[IMMCOL_EXT$order %in% namekey])

for (column in order) {
if (!(column %in% colnames(df))) {
df[column] <- NA
}
}

df <- df[IMMCOL$order]
df <- df[order]
total <- sum(df$Clones)
df[IMMCOL$prop] <- df[IMMCOL$count] / total
df[IMMCOL$seq] <- stringr::str_remove_all(df[[IMMCOL$seq]], "N")
Expand Down Expand Up @@ -1039,21 +1053,50 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
.vgenes = "v_gene", .jgenes = "j_gene", .dgenes = "d_gene",
.vend = NA, .jstart = NA, .dstart = NA, .dend = NA,
.vd.insertions = NA, .dj.insertions = NA, .total.insertions = NA,
.skip = 0, .sep = ",", # .add = c("chain", "raw_clonotype_id", "raw_consensus_id", "barcode", "contig_id")
.add = c("chain", "barcode", "raw_clonotype_id", "contig_id", "c_gene")
.skip = 0, .sep = ",",
.add = c(
"chain", "barcode", "raw_clonotype_id", "contig_id", "c_gene",
"cdr1_nt", "cdr1", "cdr2_nt", "cdr2",
"fwr1_nt", "fwr1", "fwr2_nt", "fwr2", "fwr3_nt", "fwr3", "fwr4_nt", "fwr4"
)
)

setnames(df, "cdr1_nt", IMMCOL_EXT$cdr1nt)
setnames(df, "cdr2_nt", IMMCOL_EXT$cdr2nt)
setnames(df, "cdr1", IMMCOL_EXT$cdr1aa)
setnames(df, "cdr2", IMMCOL_EXT$cdr2aa)
setnames(df, "fwr1_nt", IMMCOL_EXT$fr1nt)
setnames(df, "fwr2_nt", IMMCOL_EXT$fr2nt)
setnames(df, "fwr3_nt", IMMCOL_EXT$fr3nt)
setnames(df, "fwr4_nt", IMMCOL_EXT$fr4nt)
setnames(df, "fwr1", IMMCOL_EXT$fr1aa)
setnames(df, "fwr2", IMMCOL_EXT$fr2aa)
setnames(df, "fwr3", IMMCOL_EXT$fr3aa)
setnames(df, "fwr4", IMMCOL_EXT$fr4aa)

# Process 10xGenomics filtered contigs files - count barcodes, merge consensues ids, clonotype ids and contig ids
df <- df[order(df$chain), ]
setDT(df)

if (.mode == "paired") {
df <- df %>%
df %<>%
lazy_dt() %>%
group_by(barcode, raw_clonotype_id) %>%
group_by_colnames("barcode", "raw_clonotype_id") %>%
summarise(
CDR1.nt = paste0(get("CDR1.nt"), collapse = IMMCOL_ADD$scsep),
CDR1.aa = paste0(get("CDR1.aa"), collapse = IMMCOL_ADD$scsep),
CDR2.nt = paste0(get("CDR2.nt"), collapse = IMMCOL_ADD$scsep),
CDR2.aa = paste0(get("CDR2.aa"), collapse = IMMCOL_ADD$scsep),
CDR3.nt = paste0(get("CDR3.nt"), collapse = IMMCOL_ADD$scsep),
CDR3.aa = paste0(get("CDR3.aa"), collapse = IMMCOL_ADD$scsep),
FR1.nt = paste0(get("FR1.nt"), collapse = IMMCOL_ADD$scsep),
FR1.aa = paste0(get("FR1.aa"), collapse = IMMCOL_ADD$scsep),
FR2.nt = paste0(get("FR2.nt"), collapse = IMMCOL_ADD$scsep),
FR2.aa = paste0(get("FR2.aa"), collapse = IMMCOL_ADD$scsep),
FR3.nt = paste0(get("FR3.nt"), collapse = IMMCOL_ADD$scsep),
FR3.aa = paste0(get("FR3.aa"), collapse = IMMCOL_ADD$scsep),
FR4.nt = paste0(get("FR4.nt"), collapse = IMMCOL_ADD$scsep),
FR4.aa = paste0(get("FR4.aa"), collapse = IMMCOL_ADD$scsep),
V.name = paste0(get("V.name"), collapse = IMMCOL_ADD$scsep),
J.name = paste0(get("J.name"), collapse = IMMCOL_ADD$scsep),
D.name = paste0(get("D.name"), collapse = IMMCOL_ADD$scsep),
Expand All @@ -1067,23 +1110,46 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
as.data.table()
}

df <- df %>%
df %<>%
lazy_dt() %>%
group_by(CDR3.nt, V.name, J.name) %>%
mutate(
CDR3.nt.sorted = sort_string(get("CDR3.nt"), IMMCOL_ADD$scsep),
V.name.sorted = sort_string(get("V.name"), IMMCOL_ADD$scsep),
J.name.sorted = sort_string(get("J.name"), IMMCOL_ADD$scsep)
) %>%
group_by_colnames("CDR3.nt.sorted", "V.name.sorted", "J.name.sorted") %>%
summarise(
Clones = length(unique(get("barcode"))),
CDR3.nt = first(get("CDR3.nt")),
CDR3.aa = first(get("CDR3.aa")),
V.name = first(get("V.name")),
D.name = first(get("D.name")),
J.name = first(get("J.name")),
chain = first(get("chain")),
barcode = paste0(unique(get("barcode")), collapse = IMMCOL_ADD$scsep),
raw_clonotype_id = gsub(
"clonotype|None", "",
paste0(unique(get("raw_clonotype_id")), collapse = IMMCOL_ADD$scsep)
),
contig_id = paste0(get("contig_id"), collapse = IMMCOL_ADD$scsep),
c_gene = first(get("c_gene"))
c_gene = first(get("c_gene")),
CDR1.nt = first(get(IMMCOL_EXT$cdr1nt)),
CDR2.nt = first(get(IMMCOL_EXT$cdr2nt)),
CDR1.aa = first(get(IMMCOL_EXT$cdr1aa)),
CDR2.aa = first(get(IMMCOL_EXT$cdr2aa)),
FR1.nt = first(get(IMMCOL_EXT$fr1nt)),
FR2.nt = first(get(IMMCOL_EXT$fr2nt)),
FR3.nt = first(get(IMMCOL_EXT$fr3nt)),
FR4.nt = first(get(IMMCOL_EXT$fr4nt)),
FR1.aa = first(get(IMMCOL_EXT$fr1aa)),
FR2.aa = first(get(IMMCOL_EXT$fr2aa)),
FR3.aa = first(get(IMMCOL_EXT$fr3aa)),
FR4.aa = first(get(IMMCOL_EXT$fr4aa))
) %>%
as.data.table()
as.data.table() %>%
subset(
select = -c(get("CDR3.nt.sorted"), get("V.name.sorted"), get("J.name.sorted"))
)

df$V.end <- NA
df$J.start <- NA
Expand Down
6 changes: 3 additions & 3 deletions R/io-utility.R
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@


.make_names <- function(.char) {
if (is.na(.char[1])) {
if (has_no_data(.char)) {
NA
} else {
tolower(.char)
Expand Down Expand Up @@ -136,8 +136,8 @@
.vend, .jstart, .dstart, .dend,
.vd.insertions, .dj.insertions, .total.insertions
))
if (!is.na(.add[1])) {
swlist <- c(swlist, rep(col_guess(), length(.add)))
if (!has_no_data(.add)) {
swlist <- c(swlist, rep(list(col_guess()), length(.add)))
names(swlist)[tail(seq_along(swlist), length(.add))] <- .add
}

Expand Down
Loading

0 comments on commit ac2c840

Please sign in to comment.