-
Notifications
You must be signed in to change notification settings - Fork 1
/
fq_smarttrim
executable file
·102 lines (73 loc) · 2.46 KB
/
fq_smarttrim
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/bin/env python2.7
def exit(str):
    """Report a usage error, show the help text and terminate the script.

    Relies on the module-level names ``parser`` and ``sys`` that the script
    entry point creates before calling this function.

    Args:
        str: message describing what is wrong with the command line. (The
            name shadows the builtin but is kept so existing callers that
            pass it by keyword keep working.)

    Raises:
        SystemExit: always, with status 2 (the status argparse itself uses
            for command-line usage errors).
    """
    # Diagnostics go to stderr so they never end up inside FASTA output that
    # is being written to stdout.
    sys.stderr.write('%s\n' % (str,))
    parser.print_help(sys.stderr)
    # A bare sys.exit() would report success (status 0) to the calling shell.
    sys.exit(2)
def trim(string, length, overlap):
    """Split one sequence line into chunks of at most ``length`` characters.

    A trailing line terminator on ``string`` is ignored, so the function can
    be handed a raw line from a file or an already stripped sequence.
    Consecutive chunks start ``length - overlap`` characters apart, so each
    chunk repeats the last ``overlap`` characters of the previous one.

    Args:
        string: the sequence, optionally terminated by a newline.
        length: maximum number of characters per chunk; must be at least 1.
        overlap: number of characters shared by consecutive chunks; must be
            smaller than ``length``.

    Returns:
        A list of chunks without line terminators. Every chunk except
        possibly the last is exactly ``length`` characters long. An empty
        sequence yields an empty list.

    Raises:
        ValueError: if ``length`` is smaller than 1 or ``overlap`` is not
            smaller than ``length`` (the window would never advance).
    """
    if length < 1:
        raise ValueError('length must be at least 1')
    step = length - overlap
    if step < 1:
        raise ValueError('overlap must be smaller than length')

    # Work on the bare sequence so no chunk can carry the line terminator and
    # no real character is dropped when the line has no terminator at all.
    sequence = string.rstrip('\r\n')
    total = len(sequence)

    trimmedlines = []
    start = 0
    while start < total:
        trimmedlines.append(sequence[start:start + length])
        # Stop as soon as a window reaches the end of the sequence; a further
        # window would only repeat characters that were already emitted.
        if start + length >= total:
            break
        start += step
    return trimmedlines
if __name__ == "__main__":
    # Script entry point: copy every sequence line of the input to the output
    # as its own FASTA record, splitting lines longer than --max-length.
    # `sys` and `parser` are deliberately module-level names because exit()
    # looks them up as globals.
    import sys
    import argparse

    parser = argparse.ArgumentParser(description='Trims a fasta/fastq file by length, splitting too long reads into smaller ones',
                                     prog='fq_smarttrim',
                                     add_help=False)
    required = parser.add_argument_group('Required Arguments')
    required.add_argument('-i', '--input', default='', help='input file')
    required.add_argument('-l', '--max-length', type=int, default=0, help='maximum read length')
    optional = parser.add_argument_group('Optional Arguments')
    optional.add_argument('-o', '--output', default='', help='output file (def: stdout)')
    optional.add_argument('-Q', '--fastq', action='store_true', help='input is fastq')
    optional.add_argument('-ovl', '--overlap', default=0, type=int, help='overlap when splitting reads (0)')
    optional.add_argument('-m', '--min-length', default=1, type=int, help='minimum read lenght')
    args = vars(parser.parse_args())

    # Verify input validity (compare by value; `is ''` tests object identity)
    if args['input'] == '':
        exit('Must provide input file')
    if args['max_length'] <= 0:
        exit('Must provide valid maximum read length')
    if args['overlap'] >= args['max_length']:
        # The splitting window would never advance with such an overlap.
        exit('Overlap must be smaller than maximum read length')

    max_length = args['max_length']
    min_length = args['min_length']

    # Define output
    if args['output'] == '':
        out = sys.stdout
    else:
        out = open(args['output'], 'w')

    try:
        # The context manager closes the input file on every exit path.
        with open(args['input'], 'r') as infile:
            if args['fastq']:
                sys.stderr.write('Fastq not supported yet' + '\n')
            else:
                i = 0
                for line in infile:
                    # Header and separator lines are never reads, however
                    # long they are; fresh headers are generated below.
                    if line.startswith(('>', '@', '+')):
                        continue

                    # Measure the sequence itself, not the line terminator.
                    sequence = line.rstrip('\r\n')
                    if len(sequence) > max_length:
                        # trim() is handed a newline-terminated line; any
                        # terminator left on a chunk is stripped below.
                        pieces = trim(sequence + '\n', max_length, args['overlap'])
                    else:
                        pieces = [sequence]

                    for piece in pieces:
                        piece = piece.rstrip('\r\n')
                        # The length filter applies to each emitted read, and
                        # an empty read is never a valid record.
                        if not piece or len(piece) < min_length:
                            continue
                        header = '>' + args['input'] + '_' + str(i)
                        i += 1
                        out.write(header + '\n')
                        out.write(piece + '\n')
    except IOError as e:
        # Report on stderr so the message cannot end up inside the FASTA
        # output, and signal the failure through the exit status.
        sys.stderr.write('Error: File: ' + args['input'] + ' not found' + '\n')
        sys.stderr.write('%s\n' % (e,))
        sys.exit(1)
    finally:
        # Only close what this script opened; stdout is left alone.
        if args['output'] != '':
            out.close()