-
Notifications
You must be signed in to change notification settings - Fork 1
/
aggr_beds.R
executable file
·200 lines (164 loc) · 7.35 KB
/
aggr_beds.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
#!/usr/bin/env Rscript
# Example: ./aggr/aggr_beds.R /home/data/IO_RNA/circRNA/EGA_IMmotion151 test2
library(data.table)
args = commandArgs(trailingOnly = TRUE)
stopifnot(length(args) > 0)
if (length(args) == 1) {
InDir = OutDir = args[1]
} else {
InDir = args[1]
OutDir = args[2]
}
GTF = if (file.exists("/home/data/reference/hg38_ek12/gencode.v34.annotation.gtf")) {
"/home/data/reference/hg38_ek12/gencode.v34.annotation.gtf"
} else "/home/zhou/t12b/reference/hg38_ek12/gencode.v34.annotation.gtf"
commonPy = "/home/circrna/circrna-pipeline/common/common.py"
stopifnot(file.exists(commonPy), file.exists(GTF), dir.exists(InDir))
if (!dir.exists(OutDir)) {
dir.create(OutDir, recursive = TRUE)
}
message("Reading GTF annotation data...")
gtf_data = fread(GTF, header = FALSE, sep = "\t", na.strings = c(".", "NA"))
gtf_data = gtf_data[V3 == "gene"]
message("Extracting gene and region information...")
# 提取gene_id和symbol信息
# 一般来说,GTF文件的attributes列是以"; "分隔的键值对,我们可以利用这个信息提取gene_id和symbol
# 这里假设gene_id和gene_symbol都是以"gene_id"和"gene_name"的形式出现在attributes列中
gtf_data[, c("gene_id", "gene") := {
attrs <- tstrsplit(V9, "; ")
gene_id <- gsub('"', '', attrs[grep("^gene_id", attrs)])
gene <- gsub('"', '', attrs[grep("^gene_name", attrs)])
list(gene_id = gene_id, gene = gene)
}, by = 1:nrow(gtf_data)]
gtf_data = unique(gtf_data[, list(chr = V1, start = V4, end = V5, gene_id = substr(sub("gene_id ", "", gene_id), 1, 15), gene = sub("gene_name ", "", gene))])
# 输出前几行结果
message("Checking gtf data...")
print(head(gtf_data))
message("Scanning result bed files and extracting sample IDs...")
methods = c("circexplorer2", "circRNA_finder", "CIRI", "find_circ")
fileList = list.files(InDir, pattern = paste0(paste0("(", paste(methods, collapse = ")|("), ")"), ".bed"))
fileListAll = list.files(InDir, pattern = ".bed", full.names = TRUE)
fileList = fileList[file.info(fileListAll)$size > 0]
# 去除 .bed 后缀
sample_ids <- gsub("\\.bed$", "", fileList)
# 使用gsub函数去掉指定的后缀
for (suffix in methods) {
sample_ids <- gsub(paste0("\\.", suffix, "$"), "", sample_ids)
}
sample_ids = unique(sample_ids)
stopifnot(length(sample_ids) > 0)
message(length(sample_ids), " sample(s) detected")
# Functions ------------------------------------------
overlaps <- function(x, y) {
## Overlaps genome regions from x and y
if (!is.data.frame(x)) {
stop("x must be a data.frame")
}
if (!is.data.frame(y)) {
stop("y must be a data.frame")
}
x <- data.table::as.data.table(x)
y <- data.table::as.data.table(y)
colnames(x)[1:3] <- colnames(y)[1:3] <- c("chr", "start", "end")
data.table::setkey(y, chr, start, end)
out <- data.table::foverlaps(x, y)[!is.na(start)]
out
}
aggr_circRNA_beds = function(sample, methods) {
bed_files = file.path(InDir, paste0(sample, ".", methods, ".bed"))
nonexists = !file.exists(bed_files)
len_nonexists = sum(nonexists)
if (len_nonexists > 0) {
warning(len_nonexists, " result bed file(s) doesn't exist for sample ", sample, immediate. = TRUE)
cat(sample, "\t", len_nonexists, "\n", file = file.path(OutDir, "non_exist_results.report.txt"), append = TRUE)
}
if (sum(!nonexists) > 1) {
# Get common regions
cmd = paste("cat", paste(bed_files[!nonexists], collapse = " "), "|", commonPy, ">", file.path(OutDir, paste0(sample, ".common.txt")))
system(cmd)
bed_common = fread(file.path(OutDir, paste0(sample, ".common.txt")), header = FALSE, sep = "\t")
file.remove(file.path(OutDir, paste0(sample, ".common.txt"))) # Remove temp common file
# Read all files and filter them
bed_list = parallel::mclapply(methods[!nonexists], function(x) {
if (x == "CIRI") {
d = fread(file.path(InDir, paste0(sample, ".", x, ".bed")), select = c(1:4, 7), header = FALSE, sep = "\t")
} else {
d = fread(file.path(InDir, paste0(sample, ".", x, ".bed")), select = 1:5, header = FALSE, sep = "\t")
}
if (nrow(d) == 0) {
warning(sprintf("void data detected for %s with method %s", sample, x), immediate. = TRUE)
return(d)
}
if (is.character(d[[5]])) {
warning(sprintf("invalid (character) count detected for %s with method %s", sample, x), immediate. = TRUE)
d[[5]] = as.integer(d[[5]])
}
d$tool = x
d
}, mc.cores = getOption("mc.cores", 4L))
bed_dt = rbindlist(bed_list, use.names = FALSE)
colnames(bed_dt) = c("chr", "start", "end", "strand", "count", "tool")
bed_dt = bed_dt[!is.na(bed_dt$count)]
# 增加样本水平 circRNA ID 的输出
# 方便后续分析不同方法的 overlap 以及统计总量
bed_dt_o = copy(bed_dt)
bed_dt_o$sample = sample
fwrite(x = bed_dt_o, file = file_to_all, sep = "\t", append = TRUE)
# 获取和分析 common circRNA
if (nrow(bed_common) == 0) {
warning("no common circRNA detected for sample ", sample, immediate. = TRUE)
return(invisible(NULL))
}
colnames(bed_common) = c("chr", "start", "end")
bed_common = bed_common[chr %in% paste0("chr", c(1:22, "X", "Y"))]
if (nrow(bed_common) == 0) {
warning("no common circRNA detected for sample in chr 1-22,X,Y", sample, immediate. = TRUE)
return(invisible(NULL))
}
bed_dt[, id := paste(chr, start, end, sep = "-")]
bed_common[, id := paste(chr, start, end, sep = "-")]
annot = unique(overlaps(bed_common, gtf_data)[
, .(id, gene_id, gene, ovp_len = fcase(
i.start <= start, i.end - start + 1,
i.end >= end, end - i.start + 1,
i.start > start, i.end - i.start + 1
))])
# Only keep symbol, as gene_id is not required in downstream
# 仅标记最大 overlap 的基因
annot = annot[ , list(gene = gene[which.max(ovp_len)]), by = .(id)]
# Some may have no annotation on gene, such cases will not be kept
#"chr4-42725521-42744676"
# Final output for one sample
bed_dt2 = merge(bed_dt, annot, by = "id", all.x = FALSE, all.y = TRUE)
solid_ids = bed_dt2[, .(N = sum(count >= 2)), by = .(id)][N >= 1]$id # id by at least two tools with ≥1 back-splice reads
if (length(solid_ids) > 0) {
message("\t=>", length(solid_ids), " solid circRNAs detected")
bed_dt2 = bed_dt2[id %in% solid_ids]
bed_dt2$id = NULL
rv = bed_dt2[, .(tool = paste(tool, collapse = ","),
count = mean(count, na.rm=TRUE)),
by = .(chr, start, end, strand, gene)] # Get average count
rv$sample = sample
fwrite(x = rv, file = file.path(OutDir, paste0(sample, ".aggr.txt")), sep = "\t")
} else {
warning("no solid circRNA detected for sample ", sample, immediate. = TRUE)
}
} else {
warning("no solid circRNA detected for sample ", sample, immediate. = TRUE)
}
}
# Processing in batch
message("Reading, joining & annotating circRNAs...")
file_to_all = file.path(OutDir, paste0(basename(InDir), ".circRNA_all.txt"))
if (file.exists(file_to_all)) invisible(file.remove(file_to_all))
invisible(
parallel::mclapply(sample_ids, function(sample) {
message("\thandling ", sample)
aggr_circRNA_beds(sample, methods)
}, mc.cores = min(parallel::detectCores(), 20L, length(sample_ids)))
)
# for (sample in sample_ids) {
# message("\thandling ", sample)
# aggr_circRNA_beds(sample, methods)
# }
message("Done. Please check result *.aggr.txt in ", OutDir)