-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathliftOver.R
92 lines (68 loc) · 2.88 KB
/
liftOver.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#this should really be more or less a wrapper function for the c code from the
#Kent Source if we're going to release it as a BioC module. We'll need to check up on
#license issues though.
#Should also really be a module with the various chain files as data or as sep bioc modules.
#create your C or C++ or Fortran code; you may need to include the header #include R.h;
#R CMD SHLIB file.C
#R
#dyn.load(file.so)
#myfun=function(n, x){
# out <- NULL
# .C(myfun,as.integer(n),as.double(x),as.double(out))[[3]] # returns out (the third element of the list returned by .C)
#}
#however, for now:
liftOver<-function(data, chain.file, ucsc.format=T, chr.col="chr", start.col="start",end.col="end"){
#data should be a matrix or dataframe with cols for chr, start and end
#TODO: Or a RangedData / IRange object,
this<-data.frame(chr=as.character(data[,chr.col]),start=as.numeric(data[,start.col]),end=as.numeric(data[,end.col]), stringsAsFactors=F)
#Normal counting specifies ranges in 1-based, fully closed form.
#UCSC specifies ranges in 0-based, half open
if (!ucsc.format){
this$start<-this$start-1
}
#use the chrstartend as an id so we can map back to the original data
ids = paste(as.character(data[,chr.col]),data[,start.col],data[,end.col], sep=".")
this <- cbind(this, ids)
#If we have duplicate positions, remove them for the liftOver
this <- unique(this)
#we also need to chuck out anything that doesn't have positional data.
no.chr <- which(is.na(this[,1]))
no.start <- which(is.na(this[,2]))
no.end <- which(is.na(this[3]))
inds <- unique(c(no.chr,no.start,no.end))
if( length(inds)>0 ){ this <- this[-1*inds,] }
##all this stuff should be a .C() call but I don't have time to make it work just now.
in.bed <- tempfile()
#need to watch we don't get scientific notation printed out
options(scipen=10)
write.table(this, file=in.bed, sep="\t", row.names=F, col.names=F, quote=F)
out.bed <- tempfile()
out.um.bed <- tempfile()
lo.cmd <- paste("liftOver", in.bed, chain.file, out.bed, out.um.bed)
system(lo.cmd)
try(
new.bed<-read.table(out.bed, sep="\t")
,silent=T)
try(
new.um <- read.table(out.um.bed, sep="\t")
,silent=T)
#throw away the files
unlink(c(in.bed, out.bed, out.um.bed))
if (!exists('new.bed')){stop("No successful mappings")}
#use the ids as rownames chuck out the column
#order in the same order as our original data
#which should stick NAs in for anything that can't be mapped
rownames(new.bed) <- new.bed[,4]
new.bed <- new.bed[,-4]
new.bed <- new.bed[ids,]
if(!ucsc.format){
#put the data back to 1-based
new.bed[,2] <- new.bed[,2]+1
}
#replace the new positions in the original dataset
data[,chr.col] <- new.bed[,1]
data[,start.col] <- new.bed[,2]
data[,end.col] <- new.bed[,3]
#TODO: return some information about the data that won't map
return(data)
}