-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path02_fastQC.R
123 lines (98 loc) · 3.83 KB
/
02_fastQC.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File: 02_fastQC.R
# Auth: [email protected]
# DESC: quality checks on the fastq files before trimming
# Date: 16/01/2017
## set variables and source libraries
source('header.R')
library(downloader)
url = 'https://raw.githubusercontent.com/uhkniazi/CFastqQuality/master/CFastqQuality.R'
download(url, 'CFastqQuality.R')
# load the required packages
source('CFastqQuality.R')
# delete the file after source
unlink('CFastqQuality.R')
## connect to mysql database to get sample information
library('RMySQL')
##### connect to mysql database to get samples
db = dbConnect(MySQL(), user='rstudio', password='12345', dbname='Projects', host='127.0.0.1')
dbListTables(db)
dbListFields(db, 'Sample')
# another way to get the query, preferred
g_did
dfSample = dbGetQuery(db, "select title, group1, group2 from Sample where idData=11;")
# close connection after getting data
dbDisconnect(db)
# remove any whitespace from the names
dfSample$title = gsub(" ", "", dfSample$title, fixed = T)
head(dfSample)
#### get the names of the fastq files for first sequencing run
setwd('Data_external/Fastq/')
csFiles = list.files('.', pattern = '*.gz')
# match the files to samples by title
i = sapply(dfSample$title, function(x) grep(x, csFiles))
# index of list with no matching files
i2 = sapply(i, length)
# skip this step if i2 has no 0 values
# dfSample = dfSample[which(i2 != 0),]
# match the files to their respective index in the table
lFilesIndex = lapply(dfSample$title, function(x) grep(x, csFiles))
names(lFilesIndex) = dfSample$title
## perform the analysis one sample at a time
## function to write the qa files
write.qa = function(fls, indir, title){
wd = getwd()
setwd(indir)
ob = CFastqQuality(fls, title)
setwd(wd)
cat(paste('done', title, '\n'))
return(ob)
}
ivFilesIndex = unlist(lFilesIndex)
lOb = lapply(ivFilesIndex, function(x){
write.qa(csFiles[x], getwd(), csFiles[x])
})
setwd(gcswd)
n = make.names(paste('CFastqQuality organoids rna_seq rds'))
lOb$meta = dfSample
lOb$desc = paste('CFastqQuality object from organoids rna_seq run', date())
n2 = paste0('~/Data/MetaData/', n)
save(lOb, file=n2)
## note: comment out as this entry has been made in db
# library('RMySQL')
# db = dbConnect(MySQL(), user='rstudio', password='12345', dbname='Projects', host='127.0.0.1')
# dbListTables(db)
# dbListFields(db, 'MetaFile')
# df = data.frame(idData=g_did, name=n, type='rds', location='~/Data/MetaData/', comment='organoids project FASTQ file quality data')
# dbWriteTable(db, name = 'MetaFile', value=df, append=T, row.names=F)
# dbDisconnect(db)
### create the plots of interest
getwd()
lOb$desc = NULL
lOb$meta = NULL
pdf(file='Results/qa.fastq.pdf')
iReadCount = sapply(lOb, CFastqQuality.getReadCount)
iReadCount = iReadCount/1e+6
barplot(iReadCount, las=2, main='Pre-Trim Read Count', ylab = 'No. of Reads in Millions', cex.names =0.6, col=grey.colors(2))
mQuality = sapply(lOb, function(x){
m = mGetReadQualityByCycle(x)
m = colMeans(m, na.rm = T)
return(m)
})
matplot(mQuality, type='l', main='Pre-trim base quality', ylab = 'Mean Score', xlab='Position in Read')
lReadWidth = lapply(lOb, iGetReadWidth)
boxplot(lReadWidth, las=2, main='Pre-trim Read Width', ylab = 'Read Width', col=grey.colors(2), outline=F, xaxt='n')
axis(1, at=1:length(lReadWidth), labels = names(lReadWidth), cex.axis=0.7, las=2)
dev.off(dev.cur())
## some samples may have quality drops which can be seen by clustering
hc = hclust(dist(t(mQuality)))
plot(hc, main='Clustering of Pre-trim data based on Per base quality', cex=0.6, xlab='', sub='')
abline(h = 18, col=2)
c = cutree(hc, h=18)
table(c)
# draw for each of the clusters
sapply(unique(c), function(x){
m = mQuality[,c == x]
matplot(m, type='l', main='Pre-trim Per base quality Clusters', ylab = 'Mean Score', xlab='Position in Read', col=1:ncol(m))
legend('bottomleft', legend = c(colnames(m)), fill = 1:ncol(m))
})
#n = colnames(m)