-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path06_samtools_array_job.R
142 lines (121 loc) · 4.95 KB
/
06_samtools_array_job.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# File: 06_samtools_array_job.R
# Auth: [email protected]
# DESC: create a parameter file and shell script to run array job on hpc
# Date: 17/01/2017
## set variables and source libraries
source('header.R')
## connect to mysql database to get sample information
library('RMySQL')
##### connect to mysql database to get samples
db = dbConnect(MySQL(), user='rstudio', password='12345', dbname='Projects', host='127.0.0.1')
dbListTables(db)
# check how many files each sample has
g_did
q = paste0('select count(File.idSample) as files, Sample.idData, Sample.title, Sample.id as SampleID from File, Sample
where (Sample.idData = 11 and File.idSample = Sample.id) group by File.idSample')
dfQuery = dbGetQuery(db, q)
dfQuery$title = gsub(" ", "", dfQuery$title, fixed = T)
dfQuery
# get the count of files
q = paste0('select File.*, Sample.idData from File, Sample
where (Sample.idData = 11) and (File.idSample = Sample.id) and (File.type = "fastq")')
dfCounts = dbGetQuery(db, q)
head(dfCounts)
nrow(dfCounts)
# for each sample id, get the corresponding files
cvQueries = paste0('select File.*, Sample.title from File, Sample
where (Sample.idData = 11 and Sample.id =', dfQuery$SampleID, ') and (File.idSample = Sample.id) and
(File.type = "fastq")')
# set header variables
cvShell = '#!/bin/bash'
cvShell.2 = '#$ -S /bin/bash'
cvProcessors = '#$ -pe smp 1'
cvWorkingDir = '#$ -cwd'
cvJobName = '#$ -N samtools-array'
cvStdout = '#$ -j y'
cvMemoryReserve = '#$ -l h_vmem=19G'
cvArrayJob = paste0('#$ -t 1-', nrow(dfCounts)/2)
# using high memory queue with one slot and 19 Gigs of memory
# set the directory names
cvInput = 'input/'
cvSam = '/opt/apps/bioinformatics/samtools/1.3.1/bin/samtools'
# create a parameter file and shell script
dir.create('AutoScripts')
oFile.param = file('AutoScripts/samtools_param.txt', 'wt')
temp = lapply(cvQueries, function(x){
# get the file names
dfFiles = dbGetQuery(db, x)
# check for null return
if (nrow(dfFiles) == 0) return();
# remove white space from title
dfFiles$title = gsub(" ", "", dfFiles$title, fixed=T)
# split the file names into paired end 1 and 2, identified by R1 and R2 in the file name
f = dfFiles$name
d = grepl('_R1_', f)
d = as.character(d)
d[d == 'TRUE'] = 'R1'
d[d == 'FALSE'] = 'R2'
lf = split(f, d)
# write samtools command variables
in.s1 = paste0(cvInput, lf[[1]], '.sam')
# output file in bam format
s2 = paste0(cvInput, lf[[1]], '.bam')
# remove low quality reads below 10
s3 = paste0(cvInput, lf[[1]], '_q10.bam')
# sort the file
s4 = paste0(cvInput, lf[[1]], '_q10_sort.bam')
# remove duplicates
s5 = paste0(cvInput, lf[[1]], '_q10_sort_rd.bam')
p1 = paste(in.s1, s2, s3, s4, s5, sep=' ')
writeLines(p1, oFile.param)
return(data.frame(idSample=dfFiles$idSample[1], name=c(s2, s4, s5), type=c('original bam', 'quality 10 sorted bam',
'quality 10 sorted bam duplicates removed'),
group1=dfFiles$group1[1]))
})
close(oFile.param)
temp[sapply(temp, is.null)] = NULL
dfNewData = do.call(rbind, temp)
rownames(dfNewData) = NULL
# remove the word input/ from file name
dfNewData$name = gsub('input/(\\w+)', '\\1', dfNewData$name, perl=T)
oFile = file('AutoScripts/samtools.sh', 'wt')
writeLines(c('# Autogenerated script from write_samtools_script.R', paste('# date', date())), oFile)
writeLines(c('# make sure directory paths exist before running script'), oFile)
writeLines(c(cvShell, cvShell.2, cvProcessors, cvWorkingDir, cvJobName, cvStdout, cvMemoryReserve, cvArrayJob), oFile)
writeLines('\n\n', oFile)
# module load
writeLines(c('module load bioinformatics/samtools/1.3.1'), oFile)
writeLines('\n\n', oFile)
## write array job lines
writeLines("# Parse parameter file to get variables.
number=$SGE_TASK_ID
paramfile=samtools_param.txt
ins1=`sed -n ${number}p $paramfile | awk '{print $1}'`
bamfile=`sed -n ${number}p $paramfile | awk '{print $2}'`
bamq10=`sed -n ${number}p $paramfile | awk '{print $3}'`
bamq10sort=`sed -n ${number}p $paramfile | awk '{print $4}'`
bamrd=`sed -n ${number}p $paramfile | awk '{print $5}'`
# 9. Run the program.", oFile)
# sam to bam
p1 = paste('samtools view -b -S', '$ins1', '>', '$bamfile', sep=' ')
com1 = paste(p1)
# remove low quality reads
p1 = paste('samtools view -b -q 10', '$bamfile', '>', '$bamq10', sep=' ')
com2 = paste(p1)
# sort the file
p1 = paste('samtools sort -o', '$bamq10sort', '$bamq10', sep=' ')
com3 = paste(p1)
# remove duplicates, for paired end reads
p1 = paste('samtools rmdup', '$bamq10sort', '$bamrd', sep=' ')
com4 = paste(p1)
# create index
p1 = paste('samtools index', '$bamrd', sep=' ')
com5 = paste(p1)
writeLines(c(com1, com2, com3, com4, com5), oFile)
writeLines('\n\n', oFile)
close(oFile)
dbDisconnect(db)
### update database with the file names
# dfNewData$group1 = 'Trimmomatic strict criteria'
# dbWriteTable(db, name='File', value = dfNewData, append=T, row.names=F)
# dbDisconnect(db)