06_samtools_array_job.R

# File: 06_samtools_array_job.R
# Auth: umar.niazi@kcl.as.uk
# DESC: create a parameter file and shell script to run array job on hpc
# Date: 17/01/2017


## set variables and source libraries
source('header.R')

## connect to mysql database to get sample information
library('RMySQL')

##### connect to mysql database to get samples
db = dbConnect(MySQL(), user='rstudio', password='12345', dbname='Projects', host='127.0.0.1')
dbListTables(db)

# check how many files each sample has
g_did
q = paste0('select count(File.idSample) as files, Sample.idData, Sample.title, Sample.id as SampleID from File, Sample
           where (Sample.idData = 11 and File.idSample = Sample.id) group by File.idSample')
dfQuery = dbGetQuery(db, q)
dfQuery$title = gsub(" ", "", dfQuery$title, fixed = T)
dfQuery
# get the count of files
q = paste0('select File.*, Sample.idData from File, Sample 
           where (Sample.idData = 11) and (File.idSample = Sample.id) and (File.type = "fastq")')
dfCounts = dbGetQuery(db, q)
head(dfCounts)
nrow(dfCounts)

# for each sample id, get the corresponding files
cvQueries = paste0('select File.*, Sample.title from File, Sample 
                   where (Sample.idData = 11 and Sample.id =', dfQuery$SampleID, ') and (File.idSample = Sample.id) and 
                    (File.type = "fastq")')

# set header variables 
cvShell = '#!/bin/bash'
cvShell.2 = '#$ -S /bin/bash'
cvProcessors = '#$ -pe smp 1'
cvWorkingDir = '#$ -cwd'
cvJobName = '#$ -N samtools-array'
cvStdout = '#$ -j y'
cvMemoryReserve = '#$ -l h_vmem=19G'
cvArrayJob = paste0('#$ -t 1-', nrow(dfCounts)/2)
# using high memory queue with one slot and 19 Gigs of memory

# set the directory names
cvInput = 'input/'
cvSam = '/opt/apps/bioinformatics/samtools/1.3.1/bin/samtools'

# create a parameter file and shell script
dir.create('AutoScripts')
oFile.param = file('AutoScripts/samtools_param.txt', 'wt')

temp = lapply(cvQueries, function(x){
  # get the file names
  dfFiles = dbGetQuery(db, x)
  # check for null return
  if (nrow(dfFiles) == 0) return();
  # remove white space from title
  dfFiles$title = gsub(" ", "", dfFiles$title, fixed=T)
  # split the file names into paired end 1 and 2, identified by R1 and R2 in the file name
  f = dfFiles$name
  d = grepl('_R1_', f)
  d = as.character(d)
  d[d == 'TRUE'] = 'R1'
  d[d == 'FALSE'] = 'R2'
  lf = split(f, d)
  # write samtools command variables
  in.s1 = paste0(cvInput, lf[[1]], '.sam')
  # output file in bam format
  s2 = paste0(cvInput, lf[[1]], '.bam')
  # remove low quality reads below 10
  s3 = paste0(cvInput, lf[[1]], '_q10.bam')
  # sort the file
  s4 = paste0(cvInput, lf[[1]], '_q10_sort.bam')
  # remove duplicates
  s5 = paste0(cvInput, lf[[1]], '_q10_sort_rd.bam')
  p1 = paste(in.s1, s2, s3, s4, s5, sep=' ')
  writeLines(p1, oFile.param)
  return(data.frame(idSample=dfFiles$idSample[1], name=c(s2, s4, s5), type=c('original bam', 'quality 10 sorted bam',
                                                                             'quality 10 sorted bam duplicates removed'), 
                    group1=dfFiles$group1[1]))
})

close(oFile.param)
temp[sapply(temp, is.null)] = NULL
dfNewData = do.call(rbind, temp)
rownames(dfNewData) = NULL
# remove the word input/ from file name
dfNewData$name = gsub('input/(\\w+)', '\\1', dfNewData$name, perl=T)

oFile = file('AutoScripts/samtools.sh', 'wt')

writeLines(c('# Autogenerated script from write_samtools_script.R', paste('# date', date())), oFile)
writeLines(c('# make sure directory paths exist before running script'), oFile)
writeLines(c(cvShell, cvShell.2, cvProcessors, cvWorkingDir, cvJobName, cvStdout, cvMemoryReserve, cvArrayJob), oFile)
writeLines('\n\n', oFile)

# module load
writeLines(c('module load bioinformatics/samtools/1.3.1'), oFile)
writeLines('\n\n', oFile)

## write array job lines
writeLines("# Parse parameter file to get variables.
number=$SGE_TASK_ID
paramfile=samtools_param.txt
 
ins1=`sed -n ${number}p $paramfile | awk '{print $1}'`
bamfile=`sed -n ${number}p $paramfile | awk '{print $2}'`
bamq10=`sed -n ${number}p $paramfile | awk '{print $3}'`
bamq10sort=`sed -n ${number}p $paramfile | awk '{print $4}'`
bamrd=`sed -n ${number}p $paramfile | awk '{print $5}'`

# 9. Run the program.", oFile)

# sam to bam
p1 = paste('samtools view -b -S', '$ins1', '>', '$bamfile', sep=' ')
com1 = paste(p1)
# remove low quality reads
p1 = paste('samtools view -b -q 10', '$bamfile', '>', '$bamq10', sep=' ')
com2 = paste(p1)
# sort the file
p1 = paste('samtools sort -o', '$bamq10sort', '$bamq10', sep=' ')
com3 = paste(p1)
# remove duplicates, for paired end reads
p1 = paste('samtools rmdup', '$bamq10sort', '$bamrd', sep=' ')
com4 = paste(p1)
# create index
p1 = paste('samtools index', '$bamrd', sep=' ')
com5 = paste(p1)

writeLines(c(com1, com2, com3, com4, com5), oFile)
writeLines('\n\n', oFile)

close(oFile)
dbDisconnect(db)

### update database with the file names
# dfNewData$group1 = 'Trimmomatic strict criteria'
# dbWriteTable(db, name='File', value = dfNewData, append=T, row.names=F)
# dbDisconnect(db)