-
Notifications
You must be signed in to change notification settings - Fork 2
/
BA11_G - PSM Search.py
120 lines (74 loc) · 3.04 KB
/
BA11_G - PSM Search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 6 23:32:58 2020
@author: jasonmoggridge
PSM-search algorithm -> find best matching peptides given spectra, proteome
PSMSearch(SpectralVectors, Proteome, threshold).
PSMSet ← an empty set
for each vector Spectrum' in SpectralVectors
Peptide ← PeptideIdentification(Spectrum', Proteome)
if Score(Peptide, Spectrum) ≥ threshold
add the PSM (Peptide, Spectrum') to PSMSet
return PSMSet
#"""
def getMass_Tables():
#look-up tables for (AA:mass)
mass_aa = {}; aa_mass = {}
with open('data/integer_mass_table.txt', 'r') as file:
for line in file.readlines():
[aa,mass] = line.strip().split()
mass_aa[int(mass)] = str(aa)
aa_mass[str(aa)] = int(mass)
return mass_aa, aa_mass
mass_aa, aa_mass = getMass_Tables()
#
def Score(peptide, spectrum):
# Computes dot-product of peptide vector and spectrum
prefixes = [sum(aa_mass[aa] for aa in peptide[:i]) for i in range(1,len(peptide)+1)]
vector = [0 for _ in range(max(prefixes)+1)]
for prefix in prefixes:
vector[prefix] = 1
return sum([vector[i]*spectrum[i] for i in range(len(vector))])
def PSM_Search(spectrum, proteome, threshold):
""" Finds best scoring candidate peptide against spectrum from proteome"""
candidates = []
# set bounds for candidate peptide size from min/max AA masses
low = (len(spectrum) - 1) // max(mass_aa.keys()) + 1
hi = (len(spectrum) - 1) // min(mass_aa.keys()) + 1
# sliding window samples peptide substrings from proteome
for i in range(len(proteome)-low+2):
for j in range(i + low, i+hi+1):
peptide = proteome[i:j]
# verify peptide mass matches spectrum mass, add to list
if len(spectrum)-1 == sum(aa_mass[aa] for aa in peptide):
if peptide not in candidates:
candidates.append(peptide)
# Evauluate candidates, ret best scoring peptide if score> threshold
best_score = threshold
match =''
for peptide in candidates:
if Score(peptide, spectrum) > best_score:
best_score = Score(peptide, spectrum)
match = peptide
if match:
return match
#
# MAIN -----------------
with open('data/dataset_11866_5.txt','r') as infile:
# data is lines of >spectra, line of >proteome, line of >threshold
lines = [line.strip() for line in infile.readlines()]
threshold = int(lines.pop())
proteome = lines.pop()
# **add a root node to all spectra**
spectra = [[0]+[int(i) for i in line.split()] for line in lines]
del(lines)
with open("results/dataset_11866_psm.txt",'w') as file:
# Call PSM search for each spectrum from input set of spectra
results = []
for spectrum in spectra:
results.append(PSM_Search(spectrum, proteome, threshold))
for result in results:
if result:
file.write(result+'\n')
print(result)