-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathemboss_read.py
55 lines (52 loc) · 2.24 KB
/
emboss_read.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#defining a general function with arg: sitefile in .restrict, seqfile in .fasta
def getsites(sitefile,seqfile):
fh=open(sitefile,'r')
headers=[]
sites=[]
#processing the data
for line in fh.readlines():
#reading valid lines for lines not starting from '#'
if not line.startswith('#'):
if headers:
#stripping and splitting values for valid headers
values=line.strip().split()
#creating dictionaries for valid values
if values:
new_data=zip(headers,values)
new_dict=dict(new_data)
#creating dicitionaries of integer values for selected headers
for h in ['Start','End','3prime','5prime']:
new_dict[h]=int(new_dict[h])
#appending new dictionary to sites
sites.append(new_dict)
else:
#for line starting with space, finding first valid header
if line.replace(' ','').startswith('Start'):
headers=line.strip().split()
fh=open(seqfile,'r')
#reading sequence as one line
seq=''.join(fh.readlines()[1:]).replace('\n','')
gaps=0
esites={}
for e in sites:
#calculating and defining new start sites
while len(seq[:e['Start'] + gaps].replace('-','')) < e['Start']:
gaps +=e['Start'] - len(seq[:e['Start'] + gaps].replace('-',''))
e['gappedstart']=e['Start']+gaps
try:
esites[e['Enzyme_name']].append(e)
except:
esites[e['Enzyme_name']]=[e]
fh.close()
# add a fake 'last site' so the matching doesn't break
sites.append({'Start': len(seq), 'End':len(seq),
'3prime':len(seq),'5prime':len(seq),
'Restriction_site':'END', 'gappedstart':len(seq),
'Enzyme_name':'END'})
for e in esites:
esites[e].append({'Start': len(seq), 'End':len(seq),
'3prime':len(seq),'5prime':len(seq),
'Restriction_site':'END', 'gappedstart':len(seq),
'Enzyme_name':'END'})
#change this to return list of sites by enzyme
return esites