"""Pure-python protein expression string tokenization utilities for biomedical NLP"""
import heapq
import itertools
# Positive and negative expression indicators
VOCAB_SIGN_POS = ['+', 'pos', 'positive', 'high', 'hi', 'bright']
VOCAB_SIGN_NEG = ['-', '–', '—', '−', 'neg', 'negative', 'lo', 'low', 'dim']


def match(string, words, max_unmatched_len):
    """Recursively break string input using known vocabulary (with unknown substrings of maximum size)

    Generates tuples with values so far for (in this order):
    - count of unmatched characters
    - count of unmatched words
    - count of matched words
    - list of (substring, is_known) tuples where substring is a word to be broken out and is_known
      is a boolean indicator of whether or not that word was present in the provided word list
    """
    if len(string) == 0:
        yield 0, 0, 0, []
        return
    # Loop through oov offsets up to at most `max_unmatched_len`
    n = min(max_unmatched_len + 1, len(string))
    matched = False
    for u in range(n):
        # Ignore u characters at beginning of current candidate and run
        # prefix search for all substrings, taking the largest one possible
        substr = string[u:]
        for i in range(1, len(substr) + 1):
            prefix = substr[:i]
            tail = None if i >= len(substr) else substr[:(i + 1)]
            if prefix not in words or (tail is not None and tail in words):
                continue
            # Recursion for substring starting at index u + i
            for ctcu, ctwu, ctwm, tokens in match(substr[i:], words, max_unmatched_len):
                matched = True
                tknu, tknm = ([] if u == 0 else [(string[:u], False)]), [(prefix, True)]
                yield ctcu + u, ctwu + len(tknu), ctwm + len(tknm), tknu + tknm + tokens
        # Break when any oov offset results in a prefix hit
        # (this minimizes the length of oov substrings)
        if matched:
            break
    # Return trailing unmatched substring, if necessary
    if not matched:
        yield len(string), 1, 0, [(string, False)]
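
# A minimal illustration of `match` (hypothetical inputs, not part of the original module):
# with words=['CD4', '+'] and max_unmatched_len=1, match('CD4+x', ['CD4', '+'], 1) yields
# a single tuple (1, 1, 2, [('CD4', True), ('+', True), ('x', False)]), i.e. one unmatched
# character, one unmatched word ('x'), and two matched words.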


def split(string, words, max_unmatched_len=0, n_results=1, mode='most_words'):
    """Partition a string into words using a controlled vocabulary

    This is a solution to the Word Break Problem that includes:
    - Unknown words of at most length `max_unmatched_len`
    - Greedy maximum-length known word partitioning
    - Greedy minimum-length unknown word partitioning

    Example:
        words = ["this", "is", "the", "famous", "Word", "break", "b", "r", "e", "a", "k", "br", "problem"]
        string = "WordbreakINGproblem"
        results = split(string, words, max_unmatched_len=3, mode='most_words', n_results=1)[0]
        print([w if known else '|' + w + '|' for w, known in results[-1]])
        >>> ['Word', 'br', 'e', 'a', 'k', '|ING|', 'problem']
        results = split(string, words, max_unmatched_len=3, mode='least_words', n_results=1)[0]
        print([w if known else '|' + w + '|' for w, known in results[-1]])
        >>> ['Word', 'break', '|ING|', 'problem']

    Returns:
        A list of tuples with values (in this order):
        - count of unmatched characters
        - count of unmatched words
        - count of matched words
        - count of total words (matched + unmatched)
        - list of (substring, is_known) tuples where substring is a word to be broken out and is_known
          is a boolean indicator of whether or not that word was present in the provided word list
    """
    if mode not in ['most_words', 'least_words']:
        raise ValueError(f'Mode must be one of "most_words" or "least_words", not "{mode}"')
    # Limit max unmatched length to length of input string (default to 0 if None)
    max_unmatched_len = min(max(max_unmatched_len or 0, 0), len(string))
    mode = (-1, -1) if mode == 'least_words' else (-1, 1)
    res = []
    # Get unmatched char count, unmatched word count, matched word count, and token list (in that order)
    for ctcu, ctwu, ctwm, tokens in match(string, words, max_unmatched_len):
        ctwt = ctwu + ctwm
        # Minimize number of unmatched characters; maximize or minimize
        # number of total tokens based on mode
        key = (ctcu * mode[0], ctwt * mode[1])
        item = (key, ctcu, ctwu, ctwm, ctwt, tokens)
        # Push onto bounded heap
        (heapq.heappush if len(res) < n_results else heapq.heappushpop)(res, item)
    # Return all requested result fields except the key used for sorting heap entries
    return [r[1:] for r in heapq.nlargest(n_results, res)]
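
# A small sketch of the greedy prefix rule (hypothetical vocabulary, for illustration only):
# with words=['ab', 'a', 'b'], split('ab', words) returns only the single-token partition
# [('ab', True)]; the ('a', 'b') partition is never generated because `match` skips any
# prefix whose one-character extension is also a known word.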


class ProteinToken(object):

    SIGN_POS_CHAR = '⁺'
    SIGN_NEG_CHAR = '⁻'

    def __init__(self, token_text, sign_text, sign_value, metadata):
        self.token_text = token_text
        self.sign_text = sign_text
        self.sign_value = sign_value
        self.text = (token_text or '') + (sign_text or '')
        self.metadata = metadata

    @property
    def sign_value_text(self):
        """Normalized sign as text

        For example, tokens like CD69hi and CD69+++ would both return '⁺' for this property.
        To change the normalized sign character, set ProteinToken.SIGN_POS_CHAR or ProteinToken.SIGN_NEG_CHAR.
        """
        sign = ''
        if self.sign_value == 1:
            sign = ProteinToken.SIGN_POS_CHAR
        elif self.sign_value == -1:
            sign = ProteinToken.SIGN_NEG_CHAR
        return sign

    def to_string(self, show_sign_text=True):
        return ''.join([
            self.token_text or '',
            self.sign_value_text or '',
            '(' + self.sign_text + ')' if self.sign_text and show_sign_text else ''
        ])

    def __repr__(self):
        return self.to_string(show_sign_text=True)
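
# Illustration of sign normalization (hypothetical values, for clarity): a token constructed
# as ProteinToken('CD69', 'hi', 1, None) renders as 'CD69⁺(hi)' via to_string() and as
# 'CD69⁺' via to_string(show_sign_text=False); its `text` attribute remains 'CD69hi'.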


class ProteinTokenizer(object):

    def __init__(self, vocab_pr, vocab_sign_pos=VOCAB_SIGN_POS, vocab_sign_neg=VOCAB_SIGN_NEG):
        """Tokenizer for extracting tokens with semantics common to protein expression strings in biomedical texts

        This is helpful for resolving expressions in noun phrases, typically in the context of single cell assays,
        such as (all of the below are from PMC articles):
        - CD4+CD25+FOXP3+
        - CD4+CD45RO/RBbright
        - CD3CD4CD8alow
        - CD4-CD8+CD3+RORγt+
        - CD3CD4CD99+IL-12
        - CD4+CD45RA+CD45RO−CD62L+CCR7+CD127+CD27+CD28+CD95+CD122+

        Tokenization of such strings is not possible based purely on text patterns, so this functionality
        exists as a way to incorporate prior knowledge of known proteins and aliases.
        A dictionary must be provided whose keys are protein names/aliases; extracted tokens
        will be associated with the values stored under those keys.

        Example:
            # Create the protein vocabulary
            string = 'CD4+CD45RA+CD45RO-4-1BB-CD62L+++CCR7loCD127posCD27positiveCD28hiCD95+CD122+'
            vocab_pr = [
                # 'CCR7', 'CD27' --> intentionally omitted to show that surrounding context is still enough
                'CD4', 'CD45', 'CD45RA', 'CD45RO', 'CD62L', '4-1BB',
                'CD127', 'CD28', 'CD122', 'CD95']
            vocab_pr = {k: dict(name=k) for k in vocab_pr}
            tokenizer = ProteinTokenizer(vocab_pr)
            for t in tokenizer.tokenize(string):
                print(f'{t.text} [term={t.token_text}, sign={t.sign_text}, value={t.sign_value}, metadata={t.metadata}]')

            # Note that "+++" is resolved to indicating positive expression, but no more so than "+"
            CD4+ [term=CD4, sign=+, value=1, metadata={'name': 'CD4'}]
            CD45RA+ [term=CD45RA, sign=+, value=1, metadata={'name': 'CD45RA'}]
            CD45RO- [term=CD45RO, sign=-, value=-1, metadata={'name': 'CD45RO'}]
            4-1BB- [term=4-1BB, sign=-, value=-1, metadata={'name': '4-1BB'}]
            CD62L+++ [term=CD62L, sign=+++, value=1, metadata={'name': 'CD62L'}]
            CCR7 [term=CCR7, sign=None, value=0, metadata=None]
            CD127pos [term=CD127, sign=pos, value=1, metadata={'name': 'CD127'}]
            CD27 [term=CD27, sign=None, value=0, metadata=None]
            CD28hi [term=CD28, sign=hi, value=1, metadata={'name': 'CD28'}]
            CD95+ [term=CD95, sign=+, value=1, metadata={'name': 'CD95'}]
            CD122+ [term=CD122, sign=+, value=1, metadata={'name': 'CD122'}]

        Args:
            - vocab_pr: dict-like with keys containing protein string names and values that are any arbitrary
              metadata to be associated with matches to those names
            - vocab_sign_pos: strings indicating positive expression
            - vocab_sign_neg: strings indicating negative expression
        """
        self.vocab = vocab_pr
        self.tokens = list(vocab_pr.keys()) + vocab_sign_pos + vocab_sign_neg
        self.signs = {**{v: 1 for v in vocab_sign_pos}, **{v: -1 for v in vocab_sign_neg}}

    def tokenize(self, string, max_unmatched_len=16):
        """Generate ProteinToken instances from provided string

        Args:
            - string: input string (e.g. "CD4+CD25+FOXP3+")
            - max_unmatched_len: maximum length of any OOV substring; this represents a trade-off between
              accuracy and efficiency, as the default value (16) may not be enough to catch long
              strings not in the provided dictionary, but increasing it further enlarges
              the recursive search space
        Returns:
            ProteinToken generator
        """
        splits = split(string, self.tokens, max_unmatched_len=max_unmatched_len, n_results=1)
        if not splits:
            yield ProteinToken(string, None, 0, None)
            return
        # Get last item in first result (i.e. list of tokens)
        tokens = splits[0][-1]
        # Create groups to collapse tokens by using an integer id assigned
        # to each span of tokens, where consecutive sign tokens are collapsed
        # and consecutive non-sign tokens are not
        keys = [v[0] not in self.signs for v in tokens]  # Flag true when not a sign token
        keys = itertools.accumulate([int(k) for k in keys])  # Cumulative sum
        keys = list(enumerate(keys))  # Add token indexes
        # Yield collapsed tokens
        # k -> token group id
        # g -> list of tuples like (token index, token group id)
        for k, g in itertools.groupby(keys, lambda v: v[1]):
            sn_txt, pr_txt, sn_val = None, None, 0
            # Get tokens for this group (as strings)
            strs = [tokens[v[0]][0] for v in g]
            # Determine which tokens are signs
            mask = [t in self.signs for t in strs]
            # Handle sign-only group
            if mask[0]:
                sn_txt = strs
            else:
                pr_txt = [strs[i] for i in range(len(strs)) if not mask[i]]
                sn_txt = [strs[i] for i in range(len(strs)) if mask[i]]
            # Resolve sign string to integer value
            if sn_txt:
                signs = set([self.signs.get(v, 0) for v in sn_txt]) - set([0])
                # Only set sign if no conflicts exist
                if len(signs) == 1:
                    sn_val = list(signs)[0]
            # Convert token lists to strings
            pr_txt = ''.join(pr_txt) if pr_txt else None
            sn_txt = ''.join(sn_txt) if sn_txt else None
            # Fetch metadata and ignore signs for OOV terms
            metadata = None
            if pr_txt:
                metadata = self.vocab.get(pr_txt, None)
                if metadata is None:
                    sn_txt, sn_val = None, 0
            yield ProteinToken(pr_txt, sn_txt, sn_val, metadata)
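

if __name__ == '__main__':
    # Minimal smoke test (added for illustration, not part of the original module);
    # it reproduces the example from the ProteinTokenizer docstring above.
    string = 'CD4+CD45RA+CD45RO-4-1BB-CD62L+++CCR7loCD127posCD27positiveCD28hiCD95+CD122+'
    vocab_pr = {k: dict(name=k) for k in [
        'CD4', 'CD45', 'CD45RA', 'CD45RO', 'CD62L', '4-1BB',
        'CD127', 'CD28', 'CD122', 'CD95']}
    tokenizer = ProteinTokenizer(vocab_pr)
    for t in tokenizer.tokenize(string):
        print(f'{t.text} [term={t.token_text}, sign={t.sign_text}, value={t.sign_value}, metadata={t.metadata}]')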