-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathget_gtf_ensg_annots.R
73 lines (67 loc) · 2.32 KB
/
get_gtf_ensg_annots.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
suppressPackageStartupMessages({
library(AnnotationHub)
library(argparser)
library(ensembldb)
library(httr)
library(rtracklayer)
library(stringr)
library(tools)
})
argp <- arg_parser("Get GDC GENCODE gtf and latest Ensembl annotations")
argp <- add_argument(
argp, "--data-dir", type="character", default="data", help="Data directory"
)
argp <- add_argument(
argp, "--ensdb-version", default=98, help="Ensembl release version"
)
args <- parse_args(argp)
gdc_data_base_url <- "https://api.gdc.cancer.gov/data/"
gdc_gtf_url <- paste0(gdc_data_base_url, "25aa497c-e615-4cb7-8751-71f744f9691f")
gdc_gtf_filename <- "gencode.v22.annotation.gtf.gz"
gdc_gtf_file <- paste(args$data_dir, gdc_gtf_filename, sep="/")
if (file.exists(gdc_gtf_file)) {
cat("Using existing", gdc_gtf_file, "\n")
} else {
cat("Downloading", gdc_gtf_file, "\n")
download.file(gdc_gtf_url, gdc_gtf_file)
}
if (md5sum(gdc_gtf_file) != "291330bdcff1094bc4d5645de35e0871")
stop(gdc_gtf_filename, "md5 doesn't match")
cat("Loading", basename(gdc_gtf_file), "\n")
gencode_gtf <- import(gdc_gtf_file)
gene_ids <- sort(unique(gencode_gtf$gene_id))
gene_ids_no_version <- unlist(
strsplit(gene_ids, ".", fixed=TRUE)
)[2 * (seq_len(length(gene_ids))) - 1]
ah_query_pattern <- c("Homo sapiens", "EnsDb", args$ensdb_version)
cat("Getting", ah_query_pattern, "\n")
suppressWarnings(ah <- AnnotationHub())
ahdb <- query(ah, pattern=ah_query_pattern)
edb <- ahdb[[1]]
cat("Processing annotations", "\n")
edb_gene_id_annots <- select(
edb, keys=gene_ids_no_version, keytype="GENEID",
columns=c("GENEID", "SYMBOL"), order.by="GENEID"
)
edb_gene_id_annots <- data.frame(
Symbol=edb_gene_id_annots$SYMBOL, row.names=edb_gene_id_annots$GENEID
)
gene_id_symbols <- sapply(
gene_ids_no_version,
function(gene_id) {
if (gene_id %in% row.names(edb_gene_id_annots))
as.character(edb_gene_id_annots[gene_id, "Symbol"])
else
""
}
)
out_filename <- paste(
"gencode", str_match(gdc_gtf_filename, "\\.(v\\d+)\\.")[1, 2],
"ensg", paste0("v", args$ensdb_version), "annots.tsv", sep="_"
)
out_file <- paste(args$data_dir, out_filename, sep="/")
cat("Writing", out_file, "\n")
write.table(
data.frame(ID_REF=gene_ids, Symbol=gene_id_symbols),
file=out_file, quote=FALSE, row.names=FALSE, sep="\t"
)