forked from jvhaarst/A50-plot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcontigStats.R
363 lines (307 loc) · 18.2 KB
/
contigStats.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
###########################################################################################
## Compute N50 values for Assembly Results and Plot their Cumulative Length Distribution ##
###########################################################################################
## Author: Thomas Girke
## Last update: January 28, 2010
## Utilities: Compute N50 and plot cumulative length distribution.
## More details can be found here: http://manuals.bioinformatics.ucr.edu/home/ht-seq#TOC-Analyzing-Assembly-Results
## Originally at http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/contigStats.R
## Changed by : [email protected]
## http://github.com/jvhaarst/A50-plot
##
## Overview:
## The N50 is a weighted median statistic such that 50% of the entire assembly
## is contained in contigs equal to or larger than this value. It can be calculated
## as follows: store the lengths of all contigs in a vector L of positive integers.
## Create a vector Lr where each integer is repeated as many times as defined by
## the sum its values in L. After this the N50 value is computed as the median of Lr.
## For example: if L = c(2, 2, 3, 3, 5, 7), then Lr consists of four 2's, six 3's,
## five 5's, and seven 7's; the N50 of L is the median of Lr, which is 5.
## In R the N50 according to this definition can be computed like this:
## L <- c(2, 2, 3, 3, 5, 7)
## Lr <- unlist(tapply(L, L, function(x) rep(x[1], sum(x))))
## median(Lr)
##
## With very large contig sizes the above implementation for computing the N50
## value can become inefficient. A much more efficient approach is to sort L
## decreasingly, compute for it the cumulative sum and then identify the length
## value for which the cumulative sum covers at least 50% of the combined length
## of all contigs or the length of a known target/reference sequence. Note,
## comparisons among N50 values from different assemblies are only meaningful
## when using for their calculation the same combined length value. Thus, a known
## target length value can be often a good solution for comparing assembly results.
## The following contigStats function calculates the N50 values according to the more
## efficient second method. In addition, it creates a distribution plot of the
## cumulative contig lengths, which is a very effective way for comparing assembly
## results.
#contigStats <- function(N=N, reflength, style="ggplot2", pch=20, xlab="Percentage of Assembly Covered by Contigs of Size >=Y", ylab="Contig Size [bp]", main="Cumulative Length of Contigs", sizetitle=14, sizex=12, sizey=12, sizelegend=9, xlim, ylim) {
# ## Compute cumulative length vectors for contig sets
# Nl <- lapply(names(N), function(x) rev(sort(N[[x]]))); names(Nl) <- names(N)
# Nlcum <- lapply(names(Nl), function(x) cumsum(Nl[[x]])); names(Nlcum) <- names(Nl)
#
# ## Compute N50 values
# N50 <- sapply(seq(along=N), function(x) Nl[[x]][which(Nlcum[[x]] - reflength[x]/2 >= 0)[1]])
# names(N50) <- names(N)
#
# ## Return only data (no plot)
# if(style=="data") {
# N90 <- sapply(seq(along=N), function(x) Nl[[x]][which(Nlcum[[x]] - reflength[x] * 0.90 >= 0)[1]]); names(N50) <- names(N)
# N75 <- sapply(seq(along=N), function(x) Nl[[x]][which(Nlcum[[x]] - reflength[x] * 0.75 >= 0)[1]]); names(N50) <- names(N)
# N25 <- sapply(seq(along=N), function(x) Nl[[x]][which(Nlcum[[x]] - reflength[x] * 0.25 >= 0)[1]]); names(N50) <- names(N)
# stats <- cbind(N25, N50, N75, N90, Longest=sapply(N, max), Mean=sapply(N, mean), Median=sapply(N, median), Shortest=sapply(N, min), N_Contigs=sapply(N, length))
# return(c(Nlcum, Contig_Stats=list(stats)))
# }
# ## Plot cumulative contig length with base graphics, only necessary when ggplot is unavailable
# if(style=="base") {
# if(missing(xlim)) xlim <- c(0, 100)
# if(missing(ylim)) ylim <- c(0, max(unlist(N)))
# split.screen(c(1,1))
# for(i in seq(along=Nl)) {
# if(i==1) {
# plot(Nlcum[[i]]/reflength[[i]] * 100, Nl[[i]], col=i, pch=pch, xlim=xlim, ylim=ylim, xlab=xlab, ylab=ylab, main=main)
# }
# screen(1, new=FALSE)
# plot(Nlcum[[i]]/reflength[[i]] * 100, Nl[[i]], col=i, pch=pch, xlim=xlim, ylim=ylim, xaxt="n", yaxt="n", ylab="", xlab="", main="", bty="n")
# }
# legend("bottomright", legend=paste(names(N50), ": N50 = ", N50, sep=""), cex=1.2, bty="n", pch=15, pt.cex=1.8, col=seq(along=Nl))
# close.screen(all=TRUE)
# }
# ## Plot cumulative contig length with ggplot2
# ## Note: ggplot2 plotting options can be looked up with theme_get()
# if(style=="ggplot2") {
# require("ggplot2")
# plotdf <- data.frame(Samples=rep(names(Nlcum), sapply(Nlcum, length)), length=unlist(Nl), perc=unlist(lapply(names(Nlcum), function(x) Nlcum[[x]]/reflength[[x]]*100)))
# counts <- table(plotdf[,1]); counts <- counts[names(N50)]
# N50rep <- paste(plotdf[,1], ": N50=", unlist(lapply(as.character(unique(plotdf[,1])), function(x) rep(N50[x], counts[names(N50[x])]))), sep="")
# plotdf[,1] <- N50rep
# ggplot(plotdf, aes(perc, length, color=Samples)) +
# geom_point() +
# geom_line() +
# scale_x_continuous(xlab) +
# scale_y_continuous(ylab) +
# labs(title = main) +
# theme(plot.title = element_text(size = sizetitle)) +
# theme(axis.title.x = element_text(size = sizex)) +
# theme(axis.title.y = element_text(size = sizey, angle = 90)) +
# theme(legend.text = element_text(size = sizelegend)) +
# theme(legend.title = element_text(size = 0))
# }
#}
## TODO: SHARE DATA BETWEEN MODES
require("plotmath", warn.conflicts=FALSE)
require("sitools" , warn.conflicts=FALSE)
require("gdata" , warn.conflicts=FALSE)
source('./rbind.na.R')
fmtnum<-function(num) {
fmtNums<-formatC(num, format="d", big.mark=",")
strmax<-max(unlist(lapply(seq(along=fmtNums), function(x) nchar(fmtNums))))
fmtNumsP<-formatC(fmtNums, width=strmax)
return(fmtNumsP)
}
contigStatsFlipped <- function(N=N, reflength, style="ggplot2", pch=20, xlab="Percentage of Assembly Covered by Contigs of Size >=Y", ylab="Contig Size [bp]", main="Cumulative Length of Contigs", sizetitle=14, sizex=12, sizey=12, sizelegend=9, trimSize=25000, xunity=1e3, doLookup=FALSE, yunity=1e7, xlim, ylim, outBaseName="Rplots") {
## Compute cumulative length vectors for contig sets, trimming at TRIMSIZE
cat("Trimming histograms\n")
NlTrim<- lapply(names(N ), function(x) {
nN<-rev(sort(N[[x]]))
trimSizel<-trimSize
if ( length(nN) < trimSize ) {
cat(paste("reducing trim size to" , length(nN), "\n"))
return(nN);
} else {
cat(paste("keeping trim size. size", length(nN), "\n"))
}
nN <- trimSum(nN, trimSizel, right=TRUE, na.rm=FALSE)
return(nN);
}); names(NlTrim) <- names(N)
numEls<-length(names(N))
cat(paste("number of elements:", numEls, "\n"))
Nl <- lapply(names(N), function(x) rev(sort(N[[x]])) ); names(Nl ) <- names(N)
Nlcum <- lapply(names(N), function(x) cumsum(Nl[[x]]) ); names(Nlcum ) <- names(N)
NlTrimCum <- lapply(names(N), function(x) cumsum(NlTrim[[x]])); names(NlTrimCum) <- names(N)
## Compute N50 values for use on graph
N50 <- sapply(seq(along=N), function(x) {Nl[[x]][which(Nlcum[[x]] - reflength[x]/2 >= 0)[1]]}); names(N50) <- names(N)
I50 <- sapply(seq(along=N), function(x) { which(Nlcum[[x]] - reflength[x]/2 >= 0)[1] }); names(I50) <- names(N)
Ns <- sapply(seq(along=N), function(x) length(N[[x]])) ; names(Ns ) <- names(N)
cat("N50\n")
cat(paste(N50, "\n"))
cat("I50\n")
cat(paste(I50, "\n"))
#add tick values every 5th part of the X length
maxx<-trimSize
maxx<-round(maxx/xunity)*xunity
if(missing(xlim)) xlim <- c(0, max(unlist(N )))
tickvaluesX<-seq(0, maxx, by=maxx/5)
cat("Tick values X\n" )
cat(paste( tickvaluesX , "\n"))
cat(paste(f2si(tickvaluesX), "\n"))
#find lookup value
lookup_max<-unlist(lapply(seq(along=Nlcum), function(x) Nlcum[[x]][which.max(abs(Nlcum[[x]]))]))
lookup_mean<-floor(1/mean(1/lookup_max))
cat(paste("Nlookup: ", lookup_mean , "\n"))
lookup_mean75<-floor( lookup_mean * .75)
cat(paste("Nlookup75:", lookup_mean75, "\n"))
Nlookup_mean75 <- sapply(seq(along=N), function(x) {Nl[[x]][which(Nlcum[[x]] - lookup_mean75 >= 0)[1]]}); names(Nlookup_mean75) <- names(N)
Ilookup_mean75 <- sapply(seq(along=N), function(x) { which(Nlcum[[x]] - lookup_mean75 >= 0)[1] }); names(Ilookup_mean75) <- names(N)
Ilookup_mean75Trim <- sapply(seq(along=N), function(x) {if ( (! is.na(Ilookup_mean75[x])) && (Ilookup_mean75[x] > trimSize) ) { return(trimSize); } else { return(Ilookup_mean75[x]); }} ); names(Ilookup_mean75Trim) <- names(N)
cat(paste(names(N), "Nlookup_mean75", Nlookup_mean75, "Ilookup_mean75", Ilookup_mean75, "\n"))
print(Ilookup_mean75Trim)
#pad names
strmax<-max(unlist(lapply(seq(along=names(N)), function(x) nchar(names(N)[x]))))
cat(paste("strmax", strmax, "\n"))
fmtNames<-lapply(seq(along=names(N)), function(x) { sname<-names(N)[x]; mlen<-strmax - nchar(names(N)[x]); dots<-paste(rep(".", mlen), collapse=''); paste(sname, dots, sep="")} ); names(fmtNames) <- names(N)
cat(paste("strfmt", fmtNames, "\n"))
## Return only data (no plot)
if(style=="data") {
outTsvCumm = paste(outBaseName, ".cumm.tsv", sep="")
outTsvStat = paste(outBaseName, ".stat.tsv", sep="")
N90 <- sapply(seq(along=N), function(x) Nl[[x]][which(Nlcum[[x]] - reflength[x] * 0.90 >= 0)[1]]); names(N90) <- names(N)
I90 <- sapply(seq(along=N), function(x) which(Nlcum[[x]] - reflength[x] * 0.90 >= 0)[1] ); names(I90) <- names(N)
N75 <- sapply(seq(along=N), function(x) Nl[[x]][which(Nlcum[[x]] - reflength[x] * 0.75 >= 0)[1]]); names(N75) <- names(N)
I75 <- sapply(seq(along=N), function(x) which(Nlcum[[x]] - reflength[x] * 0.75 >= 0)[1] ); names(I75) <- names(N)
N25 <- sapply(seq(along=N), function(x) Nl[[x]][which(Nlcum[[x]] - reflength[x] * 0.25 >= 0)[1]]); names(N25) <- names(N)
I25 <- sapply(seq(along=N), function(x) which(Nlcum[[x]] - reflength[x] * 0.25 >= 0)[1] ); names(I25) <- names(N)
stats <- cbind(
#N25, I25, N50, I50, N75, I75, N90, I90, qmaxmax, qmaxmin, qmaxmax50, qmaxmin50, Nqmaxmax50, Iqmaxmax50, Nqmaxmin50, Iqmaxmin50,
N25, I25, N50, I50, N75, I75, N90, I90, lookup_mean, lookup_mean75, Nlookup_mean75, Ilookup_mean75,
Longest=sapply(N, max), Shortest=sapply(N, min),
Mean=round(sapply(N, mean)), Median=round(sapply(N, median)),
N_Contigs=sapply(N, length), Total_length=sapply(N, sum)
)
alldata<-do.call(rbind.na, NlTrimCum)
#print(names(alldata)); quit()
cat("Contig Number\t" , file=outTsvCumm, append=FALSE)
cat(names(Ns) , file=outTsvCumm, append=TRUE , sep="\t" )
cat("\n" , file=outTsvCumm, append=TRUE )
write.table(t(alldata), file=outTsvCumm, sep="\t", na="\"\"", col.names=FALSE, append=TRUE )
cat("\"Name\"\t", file=outTsvStat, append=FALSE)
write.table(stats , file=outTsvStat, sep="\t", append=TRUE )
return(Contig_Stats=list(stats))
}
## Plot cumulative contig length with base graphics, only necessary when ggplot is unavailable
if(style=="base") {
outPng = paste(outBaseName, ".png" , sep="")
png(filename=outPng, width=2048, height=2048, bg="transparent", type="cairo", pointsize=48)
#add tick values every 5th part of the X length
maxy<-max(unlist(NlTrimCum))
maxy<-round(maxy/yunity)*yunity
if(missing(ylim)) ylim <- c(0, max(unlist(NlTrimCum)))
tickvaluesY<-seq(0, maxy, by=maxy/5)
cat("Tick values Y\n" )
cat(paste( tickvaluesY , "\n"))
cat(paste(f2si(tickvaluesY), "\n"))
par(mar=c(4,4,5,1)+.1)
palette(rainbow(numEls))
split.screen(c(1,1))
for(i in seq(along=NlTrim)) {
# TODO: IF I == J50, CHANGE SYMBOL?
if(i==1) {
plot(y=NlTrimCum[[i]], x=seq_along(NlTrimCum[[i]]),col=i, pch=pch, xlim=xlim, ylim=ylim, xaxt="n", yaxt="n", xlab=xlab, ylab=ylab, main=main, type='o', bty="n")
box(lwd=6)
}
screen(1, new=FALSE)
plot(y=NlTrimCum[[i]], x=seq_along(NlTrimCum[[i]]), col=i, pch=pch, xlim=xlim, ylim=ylim, xaxt="n", yaxt="n", ylab="", xlab="", main="", bty="n", ann=FALSE, type='o')
}
axis(1, at=tickvaluesX, labels=f2si(tickvaluesX))
axis(2, at=tickvaluesY, labels=f2si(tickvaluesY))
legendTxt = paste( fmtNames, ":",
" N50=" , fmtnum(N50),
" I50=" , fmtnum(I50),
" Size=", fmtnum(Ns ), sep="")
if ( doLookup ) {
axis(3, at=tickvaluesX, labels=f2si(tickvaluesX))
maxylim=max(ylim)
minylim=min(ylim)
maxxlim=max(xlim)
yblock =maxylim/80
xblock =maxxlim/350
screen(1, new=FALSE)
for (nPos in seq(1:length(Ilookup_mean75))) {
rect(Ilookup_mean75Trim[nPos]-xblock, maxylim+(yblock*1), Ilookup_mean75Trim[nPos]+xblock, maxylim+(yblock*3), col=nPos, border=NA)
}
#qmaxtxt = paste("Lookup=", fmtnum(lookup_mean75), sep="")
qmaxtxt = paste("Lookup=", f2si(round(lookup_mean75/yunity, digits=1)*yunity), sep="")
text(trimSize *.85, lookup_mean75 * 1.05, qmaxtxt)
abline(h=lookup_mean75, lty=5, lwd=8)
legendTxt = paste( fmtNames, ":",
" N50=" , fmtnum(N50),
" I50=" , fmtnum(I50),
" NL=" , fmtnum(Nlookup_mean75),
" IL=" , fmtnum(Ilookup_mean75),
" Size=", fmtnum(Ns), sep="")
}
#print legend
op <- par(family="mono", font=2)
#acex <- 0.3 + ( 1 / sqrt(strmax) )
#acex <- 1-(strmax/140)
#acex <- (5-(log10(strmax)^2))/5
#acex <- (17-sinh(log10(strmax))^2)/16
#a1 <- strmax/(130*1)
#b1 <- a1*(pi/2)
#acex <- 1-sin(b1)
#acex <- .9-sin(b1)+sin(b1/3)
#acex <- 0.74367*exp(-0.011*strmax)
#acex <- ((0.0000015464*(strmax^3))*-1)+(0.0002515167*(strmax^2))+((0.0154842134*strmax)*-1)+0.8296025387
a <- 0.0000368
b <- -0.0090173
c <- 0.8486444
acex <- (a*(strmax^2))+(b*strmax)+c
if (acex < 0.2) { acex <- 0.2 }
cat(paste("ACEX ", acex , "\n"))
legend("bottomright", legend=legendTxt, cex=acex, bty="n", pch=15, pt.cex=1, y.intersp=.8, col=seq(along=Nl), xjust=1 )
par(op)
close.screen(all=TRUE)
}
if(style=="baseperc") {
maxy<-100
if(missing(ylim)) ylim <- c(0, 100)
split.screen(c(1,1))
for(i in seq(along=NlTrim)) {
if(i==1) {
plot(y=NlTrimCum[[i]]/reflength[[i]] * 100, x=seq_along(NlTrimCum[[i]]),col=i, pch=pch, xlim=xlim, xaxt="n", yaxt="n", ylim=ylim, xlab=xlab, ylab=ylab, main=main, type='o')
}
screen(1, new=FALSE)
plot(y=NlTrimCum[[i]]/reflength[[i]] * 100, x=seq_along(NlTrimCum[[i]]), col=i, pch=pch, xlim=xlim, ylim=ylim, xaxt="n", yaxt="n", ylab="", xlab="", main="", bty="n", ann=FALSE, type='o')
axis(1, at=tickvalues, labels=f2si(tickvalues))
axis(2, at=tickvalues, labels=f2si(tickvalues))
}
legend("bottomright", legend=paste(names(N50), ": N50 = ", N50, " Size = ", Ns, sep=""), cex=0.6, bty="n", pch=15, pt.cex=0.8, col=seq(along=Nl), xjust=1 )
close.screen(all=TRUE)
}
## Plot cumulative contig length with ggplot2
## Note: ggplot2 plotting options can be looked up with theme_get()
if(style=="ggplot2") {
require("ggplot2")
plotdf <- data.frame(Samples=rep(names(Nlcum), sapply(Nlcum, length)), length=unlist(Nl), perc=unlist(lapply(names(Nlcum), function(x) Nlcum[[x]]/reflength[[x]]*100)))
counts <- table(plotdf[,1]); counts <- counts[names(N50)]
N50rep <- paste(plotdf[,1], ": N50=", unlist(lapply(as.character(unique(plotdf[,1])), function(x) rep(N50[x], counts[names(N50[x])]))), sep="")
plotdf[,1] <- N50rep
ggplot(plotdf, aes(perc, length, color=Samples)) +
geom_point() +
geom_line() +
scale_x_continuous(xlab) +
scale_y_continuous(ylab) +
labs(title = main) +
theme(plot.title = element_text(size = sizetitle)) +
theme(axis.title.x = element_text(size = sizex)) +
theme(axis.title.y = element_text(size = sizey, angle = 90)) +
theme(legend.text = element_text(size = sizelegend)) +
theme(legend.title = element_text(size = 0))
}
}
## Usage of contigStats function
## Test sample of simulated contig length values provided in list
#
#N1 <- sample(1:500, 100)
#
#N2 <- sample(1:480, 80)
#
#N <- list(N1=N1, N2=N2)
## Run contigStats function
#
#reflength <- sapply(N, sum)
#max_ref <- max(reflength)
#reflength <- sapply(N, function(x) x <- max_ref)
#
#N50 <- contigStats(N=N, reflength=reflength, pch=20, xlab="Percentage of Assembly Covered by Contigs of Size >=Y", ylab="Contig Size", main="Cumulative Plot of N Statistic")
#N50 <- contigStats(style="data",N=N, reflength=reflength, pch=20, xlab="Percentage of Assembly Covered by Contigs of Size >=Y", ylab="Contig Size", main="Cumulative Plot of N Statistic")