-
Notifications
You must be signed in to change notification settings - Fork 183
/
Copy pathphone_numbers.py
427 lines (359 loc) · 20.7 KB
/
phone_numbers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
# coding=utf-8
import regex
import number_homoglyphs
from helpers import get_only_digits, remove_end_regex_comments
# The NUMBER_REGEXes are used to obtain strings within a post which are considered to be a single "number". While
# it would be nice to be able to just use a single regular expression like:
# r'(?:[(+{[]{1,2}\d|\d(?<=[^\d(+{[]\d|^\d))[\W_]*+(?:\d[\W_]*+){7,18}\d(?=\D|$)'
# Doing so won't get us all the possible matches of different lengths which start from the same character, even
# when using the regex package's overlapped=True option. In order to get all different possible lengths,
# we use multiple regular expressions, with each specifying an explicit length within the range in which we're
# interested and then combine the results.
# In order to make it more efficient, we combine those into a single regular expression using lookaheads and
# capture groups, which will put all of the different possibilities into capture groups, along with empty strings
# for each length which didn't match.
# The use of separate Unicode and ASCII flagged versions of the regexes is also because they can result in different
# start and end points for the numbers. We continue to keep that separation for the NUMBER_REGEX,
# NUMBER_REGEX_START, and NUMBER_REGEX_END in order to not have a separate source for a combined regex. This
# does result in our CI testing being a bit slower, but is a trade-off for not using two separate regexes, which
# would reduce maintainability.
# Fewest digits a sequence may have and still be considered a "number":
NUMBER_REGEX_MINIMUM_DIGITS = 7
# Most digits a sequence may have and still be considered a "number":
NUMBER_REGEX_MAXIMUM_DIGITS = 20
# The start and end fragments below each consume one digit, so the quantifier applied to the middle fragment
# runs two lower than the total digit count.
NUMBER_REGEX_RANGE_LOW, NUMBER_REGEX_RANGE_HIGH = NUMBER_REGEX_MINIMUM_DIGITS - 2, NUMBER_REGEX_MAXIMUM_DIGITS - 2
# Non-digit characters which are accepted at the very start of a number.
VALID_NON_DIGIT_START_CHARACTERS = r'(+{['
# What may sit between two digits of the same number; shared by the start and middle fragments.
_NUMBER_REGEX_SEPARATOR_TEXT = r'(?:[\W_]*+|\D(?:(?=\d)|(?<=\d\D)))'
# First digit (optionally preceded by one or two valid start characters), then separators.
NUMBER_REGEX_START_TEXT = ''.join((
    r'(?:[',
    VALID_NON_DIGIT_START_CHARACTERS,
    r']{1,2}\d|\d(?<=[^\d',
    VALID_NON_DIGIT_START_CHARACTERS,
    r']\d|^\d))',
    _NUMBER_REGEX_SEPARATOR_TEXT,
))
# A digit followed by separators, repeated. The trailing '{{{}}}' is a str.format() placeholder which becomes
# the '{quantifier}' for the group.
NUMBER_REGEX_MIDDLE_TEXT = r'(?:\d' + _NUMBER_REGEX_SEPARATOR_TEXT + r'){{{}}}'
# Final digit, which must be followed by a non-digit or the end of the text.
NUMBER_REGEX_END_TEXT = r'\d(?=\D|$)'
def get_number_regex_with_quantfier(quantifier):
    """
    Build the text of a number regex whose middle fragment is repeated per the given quantifier.

    :param quantifier: What to place between the braces of the middle fragment's quantifier, e.g. 5 or "5,18"
    :return: The start, quantified middle, and end fragments joined into one regex text
    """
    quantified_middle = NUMBER_REGEX_MIDDLE_TEXT.format(quantifier)
    return ''.join((NUMBER_REGEX_START_TEXT, quantified_middle, NUMBER_REGEX_END_TEXT))
# Quantifier text spanning every permitted count of "middle" digits (both ends inclusive).
NUMBER_REGEX_RANGE_TEXT = "{},{}".format(NUMBER_REGEX_RANGE_LOW, NUMBER_REGEX_RANGE_HIGH)
NUMBER_REGEXTEXT_WITH_RANGE = get_number_regex_with_quantfier(NUMBER_REGEX_RANGE_TEXT)
# Starting the regex with a pattern for the entire range limits the rest of the overall regex to only being tested
# on characters where there's going to be a match.
NUMBER_REGEX_TEXT = r'(?={})'.format(NUMBER_REGEXTEXT_WITH_RANGE)
# One capture group per explicit length. The regex quantifier range above includes NUMBER_REGEX_RANGE_HIGH, but
# range() excludes its stop value, so + 1 is needed for numbers with NUMBER_REGEX_MAXIMUM_DIGITS digits to
# also get their own capture group.
for number_regex_length in range(NUMBER_REGEX_RANGE_LOW, NUMBER_REGEX_RANGE_HIGH + 1):
    # These lookaheads all have an empty pattern as a second option. This causes all of them to
    # always match, which results in the capture group having the capture and not causing evaluation
    # of the regex to stop.
    NUMBER_REGEX_TEXT += r'(?=({})|)'.format(get_number_regex_with_quantfier(number_regex_length))
# The NUMBER_REGEX is to verify that a pattern will be able to make an exact match to text strings which are
# selected by the NUMBER_REGEXes. It should be used as a test to verify patterns for number watches and
# blacklists.
# Each dict holds the same pattern compiled twice, because the Unicode and ASCII flags can give different
# start and end points for a number.
NUMBER_REGEX = {
    'unicode': regex.compile(NUMBER_REGEX_TEXT, flags=regex.UNICODE),
    'ascii': regex.compile(NUMBER_REGEX_TEXT, flags=regex.ASCII)
}
# Anchored at the start of the text: does the text begin the way a number begins?
NUMBER_REGEX_START = {
    'unicode': regex.compile(r'^' + NUMBER_REGEX_START_TEXT, flags=regex.UNICODE),
    'ascii': regex.compile(r'^' + NUMBER_REGEX_START_TEXT, flags=regex.ASCII)
}
# Anchored at the end of the text: does the text end the way a number ends?
NUMBER_REGEX_END = {
    'unicode': regex.compile(NUMBER_REGEX_END_TEXT + r'$', flags=regex.UNICODE),
    'ascii': regex.compile(NUMBER_REGEX_END_TEXT + r'$', flags=regex.ASCII)
}
def matches_regex_ascii_or_unicode(regex_dict, pattern):
    """
    Search pattern with the ASCII-flagged regex and, only if that finds nothing, with the Unicode-flagged one.

    :param regex_dict: dict with compiled regexes under the keys 'ascii' and 'unicode'
    :param pattern: The text to search
    :return: The first successful search result, or the (falsy) result of the Unicode search
    """
    ascii_result = regex_dict['ascii'].search(pattern)
    if ascii_result:
        return ascii_result
    return regex_dict['unicode'].search(pattern)
def matches_number_regex(pattern):
    """Search pattern with NUMBER_REGEX (ASCII first, then Unicode) and return the search result."""
    return matches_regex_ascii_or_unicode(regex_dict=NUMBER_REGEX, pattern=pattern)
def matches_number_regex_start(pattern):
    """Search pattern with NUMBER_REGEX_START (ASCII first, then Unicode) and return the search result."""
    return matches_regex_ascii_or_unicode(regex_dict=NUMBER_REGEX_START, pattern=pattern)
def matches_number_regex_end(pattern):
    """Search pattern with NUMBER_REGEX_END (ASCII first, then Unicode) and return the search result."""
    return matches_regex_ascii_or_unicode(regex_dict=NUMBER_REGEX_END, pattern=pattern)
def is_digit_count_in_number_regex_range(digit_count):
    """
    Report whether digit_count is a permitted number of digits for a "number".

    Both limits are inclusive: NUMBER_REGEX_MINIMUM_DIGITS and NUMBER_REGEX_MAXIMUM_DIGITS are themselves
    valid digit counts, which is also how get_candidates() decides which candidates to promote.

    :param digit_count: The number of digits to check
    :return: True if the count is within the inclusive minimum/maximum range
    """
    return NUMBER_REGEX_MINIMUM_DIGITS <= digit_count <= NUMBER_REGEX_MAXIMUM_DIGITS
def normalize(number):
    """Return the normalized form of number, which is whatever get_only_digits() produces for it."""
    digits_only = get_only_digits(number)
    return digits_only
def normalize_set(numbers):
    """Normalize every entry of numbers and return the results as a set."""
    return set(map(normalize, numbers))
def normalize_list_only_changed(numbers):
    """
    Return the normalized form of only those entries which normalization actually changed.

    :param numbers: Iterable of number texts
    :return: list of normalized entries, in input order, for the entries which differ from their normalized form
    """
    # An entry is kept whenever normalizing changed it, even if the normalized result is identical to some
    # other entry which was itself left out for being unchanged.
    #   Example: original: ['12a34', '1234']
    #                        ^want    ^don't want
    source_and_normalized = ((source, normalize(source)) for source in numbers)
    return [after for before, after in source_and_normalized if after != before]
def normalize_list(numbers):
    """Normalize every entry of numbers and return the results as a list, in input order."""
    return list(map(normalize, numbers))
def get_candidate_set_with_start_characters(candidate):
    """
    Produce every variant of candidate with zero, one, or two valid non-digit start characters in front.

    :param candidate: A number candidate, which may already begin with start characters
    :return: set containing the candidate stripped of leading start characters, plus that stripped text
             prefixed with each single start character and each ordered pair of start characters
    """
    leading_start_characters = r'^[{}]+'.format(VALID_NON_DIGIT_START_CHARACTERS)
    stripped = regex.sub(leading_start_characters, '', candidate)
    variants = {stripped}
    variants.update(one + stripped for one in VALID_NON_DIGIT_START_CHARACTERS)
    variants.update(
        one + two + stripped
        for one in VALID_NON_DIGIT_START_CHARACTERS
        for two in VALID_NON_DIGIT_START_CHARACTERS
    )
    return variants
def get_all_candidates(text):
    """
    Gather the number candidates found in text, both verbatim and after deobfuscation.

    :param text: Text from which to extract number candidates
    :return: Three sets: the unprocessed candidates, the normalized candidates which get_candidates() reports
             for them, and the normalized form of the deobfuscated candidates which remain after one copy has
             been discarded for each unprocessed candidate
    """
    verbatim, verbatim_normalized = get_candidates(text, True)
    remaining_deobfuscated = get_deobfuscated_candidates(text)
    # Everything on the verbatim list is expected to show up on the deobfuscated list too, and those
    # identical entries are of no interest. Deobfuscation can, however, create an additional identical entry.
    # Discarding exactly one deobfuscated copy per verbatim entry keeps such extras: 2 verbatim copies and
    # 3 deobfuscated copies of a number leave 1 copy behind.
    for found in verbatim:
        if found in remaining_deobfuscated:
            remaining_deobfuscated.remove(found)
    # Deobfuscated numbers are only ever handled in normalized form, and all of them are wanted, whether or
    # not normalizing changes them.
    return set(verbatim), set(verbatim_normalized), set(normalize_list(remaining_deobfuscated))
def get_candidates(text, also_normalized=False):
    """
    Scan text one character at a time and collect the digit sequences, along with the separators found
    between their digits, which have a digit count acceptable for a "number".

    :param text: Text from which to extract number candidates
    :param also_normalized: Also return the normalized list
    :return: candidate_list or candidate_list, normalized_list
    """
    # The differences between this implementation and the original get_candidates(), which was based on a
    # regex implementation, are:
    #   1. This doesn't have the same potential for catastrophic CPU usage based on input text.
    #   2. When the first character in the candidate is not a digit, this returns only one candidate.
    #      For example "+(123..." will return ["+(123..."]. The regex version returns two candidates, but not
    #      the version without the non-digit start characters (i.e. it returns ["+(123...", "(123..."]).
    #      The characters other than digits which are valid at the start are in VALID_NON_DIGIT_START_CHARACTERS.
    #      The intent at that time was to generate more verbatim matches, but it's better to just have the one
    #      result. In the meantime, normalized matching has been improved and more emphasis placed on it.
    #   3. The regex version routinely returned duplicate entries. This implementation only returns duplicate
    #      entries if there are duplicates in the input text.
    candidates = []
    candidates_normalized = []
    # The three in_process* lists are parallel: the same index in each describes one partially built candidate
    # (its digits-only text, its verbatim text, and how many digits it holds so far).
    in_process_normalized = []
    in_process = []
    in_process_digit_counts = []
    # Non-digit characters seen since the last digit which have not yet been appended to the in_process entries.
    non_digits = ''
    # The last two non-digit characters seen before the current run of digits ('' when there was none).
    prev_non_digit = ''
    prev_prev_non_digit = ''
    # The current run of consecutive digits.
    digits = ''
    # alpha_count is, primarily, the number of alpha characters encountered since the last digit. However, it's
    # also used as a flag, by setting alpha_count = max_alpha + 1, to indicate that some other criteria has
    # been reached which should cause the same behavior.
    # Specifically, it's used for when len_digits > NUMBER_REGEX_MAXIMUM_DIGITS or when
    # len(non_digits) > max_non_digits.
    alpha_count = 0
    max_alpha = 1
    # max_non_digits is moderately high, but is intended to account for potential zalgo text, and/or
    # combining characters, which would leave the number still readable by humans.
    max_non_digits = 50

    def promote_any_in_process_with_appropriate_digit_count():
        # Copy every in-process entry whose digit count is within the inclusive minimum/maximum onto the
        # result lists. The entry stays in process, so it can be promoted again after it grows.
        for index in range(len(in_process)):
            cur_count = in_process_digit_counts[index]
            if cur_count >= NUMBER_REGEX_MINIMUM_DIGITS and cur_count <= NUMBER_REGEX_MAXIMUM_DIGITS:
                candidates.append(in_process[index])
                if in_process_normalized[index][0] != 'z':
                    # The 'z' at the start is used as a flag that this isn't a valid normalized entry.
                    # NOTE(review): nothing within this function stores an entry starting with 'z'.
                    candidates_normalized.append(in_process_normalized[index])

    def evict_any_in_process_with_too_many_digits():
        # Walk the indexes from the end, so deleting an entry doesn't shift the ones still to be checked.
        for index in reversed(range(len(in_process))):
            if in_process_digit_counts[index] > NUMBER_REGEX_MAXIMUM_DIGITS:
                del in_process[index]
                del in_process_normalized[index]
                del in_process_digit_counts[index]

    def clear_in_process_if_more_than_limit_alpha():
        nonlocal in_process
        nonlocal in_process_normalized
        nonlocal in_process_digit_counts
        if in_process and alpha_count > max_alpha:
            # No sequences continue past the limit of alpha characters
            in_process_normalized = []
            in_process = []
            in_process_digit_counts = []

    def if_digits_add_digits_to_all_in_process_and_promote():
        # Flush the current run of digits, if any: extend every in-process entry with it, start a new entry
        # which begins at this run, promote whatever now qualifies, and drop whatever has grown too long.
        nonlocal in_process
        nonlocal in_process_normalized
        nonlocal in_process_digit_counts
        nonlocal digits
        nonlocal alpha_count
        nonlocal prev_non_digit
        nonlocal prev_prev_non_digit
        if digits:
            len_digits = len(digits)
            if len_digits > NUMBER_REGEX_MAXIMUM_DIGITS:
                # Too many digits. No need to try adding them, nor remembering the next alpha chars
                alpha_count = max_alpha + 1
                clear_in_process_if_more_than_limit_alpha()
            else:
                in_process = [to_add + digits for to_add in in_process]
                in_process_normalized = [to_add + digits for to_add in in_process_normalized]
                in_process_digit_counts = [to_add + len_digits for to_add in in_process_digit_counts]
                # The original regex was written so that if a sequence started with '+(123...', then
                # both '+(123...' and '(123...' ended up as candidates.
                # Here, only one new entry is started, with up to two valid start characters in front.
                # An empty prev_non_digit/prev_prev_non_digit also passes the "in" test, as '' is a
                # substring of every string, and then contributes nothing to the new entry's text.
                if prev_non_digit in VALID_NON_DIGIT_START_CHARACTERS:
                    if prev_prev_non_digit in VALID_NON_DIGIT_START_CHARACTERS:
                        in_process.append(prev_prev_non_digit + prev_non_digit + digits)
                    else:
                        in_process.append(prev_non_digit + digits)
                else:
                    in_process.append(digits)
                in_process_normalized.append(digits)
                in_process_digit_counts.append(len_digits)
                promote_any_in_process_with_appropriate_digit_count()
                evict_any_in_process_with_too_many_digits()
            digits = ''
            prev_non_digit = ''
            prev_prev_non_digit = ''

    for char in text:
        if char >= '0' and char <= '9':
            # It's a digit (only the ASCII digits 0-9 are treated as digits here)
            digits += char
            alpha_count = 0
            if non_digits:
                # Pending separators are only attached to the verbatim entries once another digit follows,
                # so a candidate never ends with non-digit characters.
                in_process = [to_add + non_digits for to_add in in_process]
                non_digits = ''
        else:
            # Not a digit
            if_digits_add_digits_to_all_in_process_and_promote()
            prev_prev_non_digit = prev_non_digit
            prev_non_digit = char
            if (char >= 'A' and char <= 'Z') or (char >= 'a' and char <= 'z'):
                alpha_count += 1
                clear_in_process_if_more_than_limit_alpha()
            if alpha_count > max_alpha:
                non_digits = ''
            else:
                non_digits += char
                if len(non_digits) > max_non_digits:
                    alpha_count = max_alpha + 1  # Secondary use is as a flag that all in_process should end.
                    clear_in_process_if_more_than_limit_alpha()
                    non_digits = ''
    # Flush a run of digits which extends to the end of the text.
    if_digits_add_digits_to_all_in_process_and_promote()
    # We can look at returning the normalized in a bit
    if also_normalized:
        return candidates, candidates_normalized
    return candidates
def get_normalized_candidates(text):
    """Return the set of normalized forms of the number candidates found in text."""
    found = get_candidates(text)
    return normalize_set(found)
def get_normalized_deobfuscated_candidates(text):
    """Return the set of normalized number candidates found in text after number_homoglyphs.normalize()."""
    deobfuscated_text = number_homoglyphs.normalize(text)
    found = get_candidates(deobfuscated_text)
    return normalize_set(found)
def get_deobfuscated_candidates(text):
    """Return the list of number candidates found in text after number_homoglyphs.normalize()."""
    deobfuscated_text = number_homoglyphs.normalize(text)
    return get_candidates(deobfuscated_text)
# North American phone numbers with fairly strict formatting
# The goal here is to be sure about identification, even if that leaves ones which are not identified.

# The last seven digits: a three digit group starting with 2-9, then a four digit group, at the end of the text.
# The strict version requires a non-digit immediately before each of the two groups. The loose version doesn't.
NA_NUMBER_CENTRAL_OFFICE_AND_LINE_REGEX = r'(?<=\D)[2-9]\d{2}(?:[\W_]*+|\D(?=\d))(?<=\D)\d{4}$'
NA_NUMBER_CENTRAL_OFFICE_AND_LINE_LOOSE = r'[2-9]\d{2}(?:[\W_]*+|\D(?=\d))\d{4}$'

# Without a 1. It must have a separator between the 334 groupings, like \d{3}\D\d{3}\D\d{4}, but with more
# than just a single \D permitted. The start can be our normal mix.
NA_NUMBER_WITHOUT_ONE_REGEX_START = ''.join((
    r'^(?:[',
    VALID_NON_DIGIT_START_CHARACTERS,
    r']{1,2}[2-9]|[2-9](?<=[^\d',
    VALID_NON_DIGIT_START_CHARACTERS,
    r'][2-9]|^[2-9]))\d{2}',
    r'(?:[\W_]*+|\D(?:(?=\d)|(?<=\d\D)))',
))
NA_NUMBER_WITHOUT_ONE_REGEX = NA_NUMBER_WITHOUT_ONE_REGEX_START + NA_NUMBER_CENTRAL_OFFICE_AND_LINE_REGEX
NA_NUMBER_WITHOUT_ONE_LOOSE = NA_NUMBER_WITHOUT_ONE_REGEX_START + NA_NUMBER_CENTRAL_OFFICE_AND_LINE_LOOSE

# With a 1. It must have a separator between the 334 groupings, like 1\d{3}\D\d{3}\D\d{4}, but with more
# than just a single \D permitted and a separator is permitted after the 1. The start can be our normal mix.
NA_NUMBER_WITH_ONE_REGEX_START = ''.join((
    r'^(?:[',
    VALID_NON_DIGIT_START_CHARACTERS,
    r']{1,2}1|1(?<=[^\d',
    VALID_NON_DIGIT_START_CHARACTERS,
    r']1|^1))(?:[\W_]*+|\D(?=\d))',
    r'[2-9]\d{2}(?:[\W_]*+|\D(?=\d))',
))
# There's a trend to using a straight format of "+12345678900", which should be considered a NA number.
NA_NUMBER_WITH_ONE_NO_SEPARATORS_REGEX = r'^\+?1[2-9]\d{2}[2-9]\d{2}\d{4}$'
# The same, but with up to two non-digits permitted on each side of the area code.
NA_NUMBER_WITH_ONE_AREA_CODE_SHORT_SEPARATORS_REGEX = r'^\+?1\D{0,2}[2-9]\d{2}\D{0,2}[2-9]\d{2}\d{4}$'
NA_NUMBER_WITH_ONE_REGEX = NA_NUMBER_WITH_ONE_REGEX_START + NA_NUMBER_CENTRAL_OFFICE_AND_LINE_REGEX
NA_NUMBER_WITH_ONE_LOOSE = NA_NUMBER_WITH_ONE_REGEX_START + NA_NUMBER_CENTRAL_OFFICE_AND_LINE_LOOSE
# Either the strict with-1 format or the short-separator format.
NA_NUMBER_WITH_ONE_OR_ONE_NO_SEPARATORS_REGEX = '(?:{}|{})'.format(
    NA_NUMBER_WITH_ONE_REGEX,
    NA_NUMBER_WITH_ONE_AREA_CODE_SHORT_SEPARATORS_REGEX,
)
def is_north_american_phone_number_with_one(text):
    """Return True if text matches NA_NUMBER_WITH_ONE_OR_ONE_NO_SEPARATORS_REGEX (strict, with a leading 1)."""
    found = regex.match(NA_NUMBER_WITH_ONE_OR_ONE_NO_SEPARATORS_REGEX, text)
    return found is not None
def is_north_american_phone_number_without_one(text):
    """Return True if text matches NA_NUMBER_WITHOUT_ONE_REGEX (strict, without a leading 1)."""
    found = regex.match(NA_NUMBER_WITHOUT_ONE_REGEX, text)
    return found is not None
def is_north_american_phone_number_with_one_loose(text):
    """Return True if text matches NA_NUMBER_WITH_ONE_LOOSE (loose, with a leading 1)."""
    found = regex.match(NA_NUMBER_WITH_ONE_LOOSE, text)
    return found is not None
def is_north_american_phone_number_without_one_loose(text):
    """Return True if text matches NA_NUMBER_WITHOUT_ONE_LOOSE (loose, without a leading 1)."""
    found = regex.match(NA_NUMBER_WITHOUT_ONE_LOOSE, text)
    return found is not None
def deobfuscate(text):
    """Return text as processed by number_homoglyphs.normalize()."""
    deobfuscated_text = number_homoglyphs.normalize(text)
    return deobfuscated_text
def get_north_american_with_separators_from_normalized(normalized):
    """
    Format the last ten characters of normalized as 3-3-4 groups separated by '-'.

    :param normalized: A normalized (digits only) number
    :return: The grouped text, prefixed with '1-' whenever normalized is longer than ten characters
    """
    groups = (normalized[-10:-7], normalized[-7:-4], normalized[-4:])
    grouped = '-'.join(groups)
    if len(normalized) > 10:
        return '1-' + grouped
    return grouped
def get_maybe_north_american_not_in_normalized_but_in_all(pattern, normalized, all_normalized=None):
    """
    Get the loosely-matched North American alternate for pattern, if it's both new and known.

    :param pattern: A number pattern, which may have regex comments at its end
    :param normalized: Container of normalized numbers; an alternate already in it is not returned
    :param all_normalized: Optional container of normalized numbers; when given, the alternate is only
                           returned if it's in this container
    :return: The "maybe" North American alternate, or '' when it doesn't meet the above conditions
    """
    without_comments, _comments = split_processed_and_comments(pattern)
    digits_only = normalize(deobfuscate(without_comments))
    _extra, _add_type, maybe_extra = get_north_american_alternate_normalized(digits_only, force=True)
    if maybe_extra in normalized:
        return ''
    if all_normalized is not None and maybe_extra not in all_normalized:
        return ''
    return maybe_extra
def get_north_american_alternate_normalized(non_normalized, normalized=None, force=False):
    """
    Work out the alternate normalized form (with or without the leading 1) of a North American number.

    :param non_normalized: The number text, prior to normalization
    :param normalized: The normalized form of the number; computed from non_normalized when falsy
    :param force: When true, the format checks are applied to the normalized form rather than to non_normalized
    :return: Tuple of (north_american_extra, north_american_add_type, maybe_north_american_extra).
             A strict format match fills north_american_extra, while a loose format match fills
             maybe_north_american_extra. The add type is 'non-1' when the alternate has the leading 1 removed,
             'add-1' when it has a 1 added, and '' when nothing matched.
    """
    if not normalized:
        normalized = normalize(non_normalized)
    to_classify = normalized if force else non_normalized
    # The strict formats are tried before the loose ones; the first format which matches decides the result.
    if is_north_american_phone_number_with_one(to_classify):
        return normalized[1:], 'non-1', ''
    if is_north_american_phone_number_without_one(to_classify):
        return '1' + normalized, 'add-1', ''
    if is_north_american_phone_number_with_one_loose(to_classify):
        return '', 'non-1', normalized[1:]
    if is_north_american_phone_number_without_one_loose(to_classify):
        return '', 'add-1', '1' + normalized
    return '', '', ''
def split_processed_and_comments(pattern):
    """
    Separate pattern into the text which remove_end_regex_comments() keeps and whatever else there is.

    :param pattern: A pattern, which may have regex comments at its end
    :return: Tuple of (pattern without its end comments, pattern with that comment-free text removed)
    """
    processed = remove_end_regex_comments(pattern)
    return processed, pattern.replace(processed, '')
def check_comments_for_north_american_directive(comments):
    """
    Look through comment text for directives about treating a number as North American.

    The "noram" directives are matched in any letter case. The short "IS NA"/"NO NA" directives must be
    upper case.

    :param comments: The comment text to inspect
    :return: Tuple of (force_is_north_american, force_no_north_american)
    """
    lowered = comments.lower()
    forced_on = 'is noram' in lowered or 'IS NA' in comments
    forced_off = 'no noram' in lowered or 'NO NA' in comments
    return forced_on, forced_off
def get_north_american_forced_or_no_from_pattern(pattern):
    """
    Get the North American directives which are in the comments of pattern.

    :param pattern: A pattern, which may have regex comments at its end
    :return: Tuple of (force_is_north_american, force_no_north_american)
    """
    _without_comments, comment_text = split_processed_and_comments(pattern)
    directives = check_comments_for_north_american_directive(comment_text)
    return directives
def process_numlist(numlist, processed=None, normalized=None):
    """
    Process a list of number entries into their comment-free and normalized forms.

    :param numlist: Iterable of entries, each of which may have regex comments at its end
    :param processed: Optional set to which each entry's comment-free text is added (a new set when None)
    :param normalized: Optional set which accumulates every normalized form (a new set when None)
    :return: Tuple of (full_list, processed, normalized), where full_list maps each original entry to a tuple
             of (entry without comments, set of normalized forms for that entry)
    """
    # The normalized list does contain any processed item which is also normalized.
    processed = processed if processed is not None else set()
    normalized = normalized if normalized is not None else set()
    full_list = {}
    for entry in numlist:
        without_comments, comments = split_processed_and_comments(entry)
        processed.add(without_comments)
        force_is_north_american, force_no_north_american = check_comments_for_north_american_directive(comments)
        # normalized to only digits
        this_entry_normalized = {normalize(without_comments)}
        deobfuscated = deobfuscate(without_comments)
        # deobfuscated and normalized: We don't look for the non-normalized deobfuscated
        normalized_deobfuscated = normalize(deobfuscated)
        this_entry_normalized.add(normalized_deobfuscated)
        if not force_no_north_american:
            north_american_extra, _add_type, maybe_north_american_extra = \
                get_north_american_alternate_normalized(deobfuscated, normalized_deobfuscated,
                                                        force_is_north_american)
            # A loose ("maybe") match is only accepted when the entry's comments force North American handling.
            if maybe_north_american_extra and force_is_north_american:
                north_american_extra = maybe_north_american_extra
            if north_american_extra:
                this_entry_normalized.add(north_american_extra)
        # The normalized list *does* contain the processed string, if it's also normalized, as we need it to test
        # against obfuscated
        normalized |= this_entry_normalized
        full_list[entry] = (without_comments, this_entry_normalized)
    return full_list, processed, normalized