Skip to content

Commit

Permalink
Merge pull request #326 from immunomind/new-bcr-input-formats
Browse files Browse the repository at this point in the history
Support for BCR columns in new formats
  • Loading branch information
Alexander230 authored Dec 14, 2022
2 parents a114d3c + 5a6adc2 commit 09b0012
Show file tree
Hide file tree
Showing 7 changed files with 80 additions and 21 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,6 @@ Suggests:
rmarkdown
VignetteBuilder: knitr
Encoding: UTF-8
RoxygenNote: 7.2.1
RoxygenNote: 7.2.2
LazyData: true
LazyDataCompression: xz
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -147,10 +147,12 @@ importFrom(dplyr,group_map)
importFrom(dplyr,left_join)
importFrom(dplyr,mutate)
importFrom(dplyr,n)
importFrom(dplyr,one_of)
importFrom(dplyr,pull)
importFrom(dplyr,rename)
importFrom(dplyr,rowwise)
importFrom(dplyr,select)
importFrom(dplyr,select_)
importFrom(dplyr,select_if)
importFrom(dplyr,summarise)
importFrom(dplyr,tally)
Expand Down Expand Up @@ -290,6 +292,7 @@ importFrom(tibble,tibble)
importFrom(tidyr,drop_na)
importFrom(tidyr,unite)
importFrom(tidyr,unnest)
importFrom(tidyselect,all_of)
importFrom(tidyselect,any_of)
importFrom(tidyselect,starts_with)
importFrom(utils,capture.output)
Expand Down
1 change: 1 addition & 0 deletions R/diversity.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ if (getRversion() >= "2.15.1") {
#' @importFrom dplyr mutate group_by_at pull
#' @importFrom stats qnorm
#' @importFrom rlang sym
#' @importFrom tidyselect all_of
#'
#' @description
#' This is a utility function to estimate the diversity of species or objects in the given distribution.
Expand Down
82 changes: 66 additions & 16 deletions R/io-parsers.R
Original file line number Diff line number Diff line change
Expand Up @@ -834,8 +834,6 @@ parse_tcr <- function(.filename, .mode) {
}

parse_vdjtools <- function(.filename, .mode) {
skip <- 0

# Check for different VDJtools outputs
f <- file(.filename, "r")
l <- readLines(f, 1)
Expand Down Expand Up @@ -964,19 +962,28 @@ parse_airr <- function(.filename, .mode) {
.as_tsv() %>%
airr::read_rearrangement()

df <- df %>%
select(
sequence, v_call, d_call, j_call, junction, junction_aa,
contains("v_germline_end"), contains("d_germline_start"), contains("d_germline_end"),
contains("j_germline_start"), contains("np1_length"), contains("np2_length"),
contains("duplicate_count")
df %<>%
select_(
"sequence", "v_call", "d_call", "j_call", "junction", "junction_aa",
~contains("v_germline_end"), ~contains("d_germline_start"),
~contains("d_germline_end"), ~contains("j_germline_start"),
~contains("np1_length"), ~contains("np2_length"),
~contains("duplicate_count"),
"cdr1", "cdr2", "cdr1_aa", "cdr2_aa", "fwr1", "fwr2", "fwr3", "fwr4",
"fwr1_aa", "fwr2_aa", "fwr3_aa", "fwr4_aa"
)

namekey <- c(
duplicate_count = IMMCOL$count, junction = IMMCOL$cdr3nt, junction_aa = IMMCOL$cdr3aa,
v_call = IMMCOL$v, d_call = IMMCOL$d, j_call = IMMCOL$j, v_germline_end = IMMCOL$ve,
d_germline_start = IMMCOL$ds, d_germline_end = IMMCOL$de, j_germline_start = IMMCOL$js,
np1_length = "unidins", np2_length = IMMCOL$dnj, sequence = IMMCOL$seq
np1_length = "unidins", np2_length = IMMCOL$dnj, sequence = IMMCOL$seq,
cdr1 = IMMCOL_EXT$cdr1nt, cdr2 = IMMCOL_EXT$cdr2nt,
cdr1_aa = IMMCOL_EXT$cdr1aa, cdr2_aa = IMMCOL_EXT$cdr2aa,
fwr1 = IMMCOL_EXT$fr1nt, fwr2 = IMMCOL_EXT$fr2nt,
fwr3 = IMMCOL_EXT$fr3nt, fwr4 = IMMCOL_EXT$fr4nt,
fwr1_aa = IMMCOL_EXT$fr1aa, fwr2_aa = IMMCOL_EXT$fr2aa,
fwr3_aa = IMMCOL_EXT$fr3aa, fwr4_aa = IMMCOL_EXT$fr4aa
)

names(df) <- namekey[names(df)]
Expand All @@ -998,13 +1005,15 @@ parse_airr <- function(.filename, .mode) {
}
}

for (column in IMMCOL$order) {
order <- c(IMMCOL$order, IMMCOL_EXT$order[IMMCOL_EXT$order %in% namekey])

for (column in order) {
if (!(column %in% colnames(df))) {
df[column] <- NA
}
}

df <- df[IMMCOL$order]
df <- df[order]
total <- sum(df$Clones)
df[IMMCOL$prop] <- df[IMMCOL$count] / total
df[IMMCOL$seq] <- stringr::str_remove_all(df[[IMMCOL$seq]], "N")
Expand Down Expand Up @@ -1044,21 +1053,50 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
.vgenes = "v_gene", .jgenes = "j_gene", .dgenes = "d_gene",
.vend = NA, .jstart = NA, .dstart = NA, .dend = NA,
.vd.insertions = NA, .dj.insertions = NA, .total.insertions = NA,
.skip = 0, .sep = ",", # .add = c("chain", "raw_clonotype_id", "raw_consensus_id", "barcode", "contig_id")
.add = c("chain", "barcode", "raw_clonotype_id", "contig_id", "c_gene")
.skip = 0, .sep = ",",
.add = c(
"chain", "barcode", "raw_clonotype_id", "contig_id", "c_gene",
"cdr1_nt", "cdr1", "cdr2_nt", "cdr2",
"fwr1_nt", "fwr1", "fwr2_nt", "fwr2", "fwr3_nt", "fwr3", "fwr4_nt", "fwr4"
)
)

setnames(df, "cdr1_nt", IMMCOL_EXT$cdr1nt)
setnames(df, "cdr2_nt", IMMCOL_EXT$cdr2nt)
setnames(df, "cdr1", IMMCOL_EXT$cdr1aa)
setnames(df, "cdr2", IMMCOL_EXT$cdr2aa)
setnames(df, "fwr1_nt", IMMCOL_EXT$fr1nt)
setnames(df, "fwr2_nt", IMMCOL_EXT$fr2nt)
setnames(df, "fwr3_nt", IMMCOL_EXT$fr3nt)
setnames(df, "fwr4_nt", IMMCOL_EXT$fr4nt)
setnames(df, "fwr1", IMMCOL_EXT$fr1aa)
setnames(df, "fwr2", IMMCOL_EXT$fr2aa)
setnames(df, "fwr3", IMMCOL_EXT$fr3aa)
setnames(df, "fwr4", IMMCOL_EXT$fr4aa)

# Process 10xGenomics filtered contigs files - count barcodes, merge consensus ids, clonotype ids and contig ids
df <- df[order(df$chain), ]
setDT(df)

if (.mode == "paired") {
df %<>%
lazy_dt() %>%
group_by(barcode, raw_clonotype_id) %>%
group_by_colnames("barcode", "raw_clonotype_id") %>%
summarise(
CDR1.nt = paste0(get("CDR1.nt"), collapse = IMMCOL_ADD$scsep),
CDR1.aa = paste0(get("CDR1.aa"), collapse = IMMCOL_ADD$scsep),
CDR2.nt = paste0(get("CDR2.nt"), collapse = IMMCOL_ADD$scsep),
CDR2.aa = paste0(get("CDR2.aa"), collapse = IMMCOL_ADD$scsep),
CDR3.nt = paste0(get("CDR3.nt"), collapse = IMMCOL_ADD$scsep),
CDR3.aa = paste0(get("CDR3.aa"), collapse = IMMCOL_ADD$scsep),
FR1.nt = paste0(get("FR1.nt"), collapse = IMMCOL_ADD$scsep),
FR1.aa = paste0(get("FR1.aa"), collapse = IMMCOL_ADD$scsep),
FR2.nt = paste0(get("FR2.nt"), collapse = IMMCOL_ADD$scsep),
FR2.aa = paste0(get("FR2.aa"), collapse = IMMCOL_ADD$scsep),
FR3.nt = paste0(get("FR3.nt"), collapse = IMMCOL_ADD$scsep),
FR3.aa = paste0(get("FR3.aa"), collapse = IMMCOL_ADD$scsep),
FR4.nt = paste0(get("FR4.nt"), collapse = IMMCOL_ADD$scsep),
FR4.aa = paste0(get("FR4.aa"), collapse = IMMCOL_ADD$scsep),
V.name = paste0(get("V.name"), collapse = IMMCOL_ADD$scsep),
J.name = paste0(get("J.name"), collapse = IMMCOL_ADD$scsep),
D.name = paste0(get("D.name"), collapse = IMMCOL_ADD$scsep),
Expand All @@ -1079,7 +1117,7 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
V.name.sorted = sort_string(get("V.name"), IMMCOL_ADD$scsep),
J.name.sorted = sort_string(get("J.name"), IMMCOL_ADD$scsep)
) %>%
group_by(CDR3.nt.sorted, V.name.sorted, J.name.sorted) %>%
group_by_colnames("CDR3.nt.sorted", "V.name.sorted", "J.name.sorted") %>%
summarise(
Clones = length(unique(get("barcode"))),
CDR3.nt = first(get("CDR3.nt")),
Expand All @@ -1094,7 +1132,19 @@ parse_10x_filt_contigs <- function(.filename, .mode) {
paste0(unique(get("raw_clonotype_id")), collapse = IMMCOL_ADD$scsep)
),
contig_id = paste0(get("contig_id"), collapse = IMMCOL_ADD$scsep),
c_gene = first(get("c_gene"))
c_gene = first(get("c_gene")),
CDR1.nt = first(get(IMMCOL_EXT$cdr1nt)),
CDR2.nt = first(get(IMMCOL_EXT$cdr2nt)),
CDR1.aa = first(get(IMMCOL_EXT$cdr1aa)),
CDR2.aa = first(get(IMMCOL_EXT$cdr2aa)),
FR1.nt = first(get(IMMCOL_EXT$fr1nt)),
FR2.nt = first(get(IMMCOL_EXT$fr2nt)),
FR3.nt = first(get(IMMCOL_EXT$fr3nt)),
FR4.nt = first(get(IMMCOL_EXT$fr4nt)),
FR1.aa = first(get(IMMCOL_EXT$fr1aa)),
FR2.aa = first(get(IMMCOL_EXT$fr2aa)),
FR3.aa = first(get(IMMCOL_EXT$fr3aa)),
FR4.aa = first(get(IMMCOL_EXT$fr4aa))
) %>%
as.data.table() %>%
subset(
Expand Down
6 changes: 3 additions & 3 deletions R/io-utility.R
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@


.make_names <- function(.char) {
if (is.na(.char[1])) {
if (has_no_data(.char)) {
NA
} else {
tolower(.char)
Expand Down Expand Up @@ -136,8 +136,8 @@
.vend, .jstart, .dstart, .dend,
.vd.insertions, .dj.insertions, .total.insertions
))
if (!is.na(.add[1])) {
swlist <- c(swlist, rep(col_guess(), length(.add)))
if (!has_no_data(.add)) {
swlist <- c(swlist, rep(list(col_guess()), length(.add)))
names(swlist)[tail(seq_along(swlist), length(.add))] <- .add
}

Expand Down
2 changes: 1 addition & 1 deletion R/io.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ if (getRversion() >= "2.15.1") {
#' @importFrom jsonlite read_json
#' @importFrom stringr str_split str_detect str_replace_all str_trim
#' @importFrom methods as
#' @importFrom dplyr contains first
#' @importFrom dplyr contains first select_ group_by_at one_of
#' @importFrom utils read.table
#' @importFrom data.table setDF
#'
Expand Down
5 changes: 5 additions & 0 deletions R/tools.R
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,11 @@ has_no_data <- function(.data) {
any(sapply(list(NA, NULL, NaN), identical, .data)) | all(is.na(.data))
}

# Group `.data` by columns that are named with character strings instead of
# bare symbols. Everything passed through `...` is forwarded unchanged to
# one_of(), and the resulting selection is handed to group_by_at().
group_by_colnames <- function(.data, ...) {
  # Build the column selection first; the quosure created by vars() is
  # captured in this function's environment, so `...` still resolves here.
  grouping_columns <- vars(one_of(...))
  group_by_at(.data, grouping_columns)
}

# apply function to .data if it's a single sample or to each sample if .data is a list of samples
apply_to_sample_or_list <- function(.data, .function, .with_names = FALSE, .validate = TRUE, ...) {
if (has_no_data(.data)) {
Expand Down

0 comments on commit 09b0012

Please sign in to comment.