-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathlangDefaults.py
135 lines (119 loc) · 6.48 KB
/
langDefaults.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from string import ascii_uppercase
import re
from random import choice
from collections import defaultdict
from datetime import datetime, timedelta
import dateutil.parser
'''
Default handling of language-dependent entities.
'''
class LangDefaults:
    """Default handling of language-dependent entities.

    The methods below read several attributes that this class never assigns:
    the lexicons ``female``, ``male``, ``family``, ``org``, ``street`` and
    ``city`` (indexable by the key returned from ``sgFile.getMapForChar``),
    and the date settings ``dateParserInfo``, ``dateFormatsAlpha``,
    ``dateFormatsNr``, ``dateReplMonths`` and ``dateStdFormat``.
    NOTE(review): presumably language-specific subclasses provide them --
    confirm against the callers.

    Throughout, ``sgFile`` is an object exposing ``sub`` (label -> mapping of
    original token -> surrogate), ``dateShift`` (a number of days),
    ``getMapForChar(label, char, lex)`` and ``addSpellings(...)``; ``token``
    exposes ``text``, ``normCase``, ``label`` and ``setNormCase(value)``.
    """

    def __init__(self):
        """Set every per-category letter map to a single A-Z / A-Z pair."""
        # NOTE(review): presumably (source initials, surrogate initials)
        # pairs consumed by sgFile.getMapForChar -- confirm.
        self.freqMapFemale = [(ascii_uppercase, ascii_uppercase)]
        self.freqMapMale = [(ascii_uppercase, ascii_uppercase)]
        self.freqMapFamily = [(ascii_uppercase, ascii_uppercase)]
        self.freqMapOrg = [(ascii_uppercase, ascii_uppercase)]
        self.freqMapStreet = [(ascii_uppercase, ascii_uppercase)]
        self.freqMapCity = [(ascii_uppercase, ascii_uppercase)]

    def subFemale(self, sgFile, token):
        """Substitute a female first name using the ``female`` lexicon."""
        return self.getSurrogateName(sgFile, token.text, token.normCase, token.label, self.female)

    def subMale(self, sgFile, token):
        """Substitute a male first name using the ``male`` lexicon."""
        return self.getSurrogateName(sgFile, token.text, token.normCase, token.label, self.male)

    def subFamily(self, sgFile, token):
        """Substitute a family name using the ``family`` lexicon."""
        return self.getSurrogateName(sgFile, token.text, token.normCase, token.label, self.family)

    def subOrg(self, sgFile, token):
        """Substitute an organization name using the ``org`` lexicon."""
        return self.getSurrogateName(sgFile, token.text, token.normCase, token.label, self.org)

    def subStreet(self, sgFile, token):
        """Substitute a street name using the ``street`` lexicon."""
        return self.getSurrogateName(sgFile, token.text, token.normCase, token.label, self.street)

    def subCity(self, sgFile, token):
        """Substitute a city name using the ``city`` lexicon."""
        return self.getSurrogateName(sgFile, token.text, token.normCase, token.label, self.city)

    def subDate(self, sgFile, token):
        """Substitute a date by shifting it ``sgFile.dateShift`` days.

        The token text is parsed with ``dateutil``; the shifted date is then
        rendered in the first configured format that reproduces the original
        token, keeping the original separators. Dates containing ASCII
        letters are matched against ``dateFormatsAlpha`` (trying every month
        spelling in ``dateReplMonths``), all others against ``dateFormatsNr``.
        If parsing fails or no format matches, ``getRandomDate`` is used.

        Returns the surrogate string, which is also recorded in
        ``sgFile.sub[token.label]``.
        """
        try:
            # Insert a space after a dot that is directly followed by a word
            # character (e.g. "1.Mai" -> "1. Mai") before parsing.
            tokenPars = dateutil.parser.parse(
                re.sub(r'\.(?=\w)', '. ', token.text),
                parserinfo=self.dateParserInfo)
            newTokenPars = tokenPars + timedelta(days=sgFile.dateShift)
        except Exception:
            # Best effort: anything unparsable gets the fallback date.
            return self.getRandomDate(sgFile, token)

        # Alternating word / non-word pieces, and the word pieces alone.
        pieces = re.findall(r'\W+|\w+', token.text)
        parts = re.findall(r'\w+', token.text)

        if re.search('[a-zA-Z]+', token.text):
            month = tokenPars.strftime('%B')
            for form in self.dateFormatsAlpha:
                try:
                    shifted = self._shiftAlphaDate(
                        form, month, tokenPars, newTokenPars, pieces, parts)
                    if shifted is None:
                        continue
                    sgFile.addSpellings(
                        token.text, shifted, token.normCase,
                        self.normalizeTokenCase(shifted), token.label)
                    return sgFile.sub[token.label][token.text]
                except Exception:
                    # A format that cannot be applied is skipped; `pieces`
                    # is never modified, so the next format starts clean.
                    continue
        else:
            for form in self.dateFormatsNr:
                try:
                    if re.findall(r'\w+', tokenPars.strftime(form)) != parts:
                        continue
                    replacements = re.findall(r'\w+', newTokenPars.strftime(form))
                    shifted = self._refillToken(
                        pieces, replacements, lambda piece: piece.isdigit())
                    sgFile.sub[token.label][token.text] = shifted
                    return shifted
                except Exception:
                    continue
        return self.getRandomDate(sgFile, token)

    def _shiftAlphaDate(self, form, month, tokenPars, newTokenPars, pieces, parts):
        """Render the shifted date in ``form`` if ``form`` matches the token.

        Returns the rebuilt token string, or ``None`` when no month spelling
        makes ``form`` reproduce the word parts of the original token.
        """
        rendered = tokenPars.strftime(form)
        # Month names are plain text, so substitute them literally.
        matches = [
            idx for idx, variant in enumerate(self.dateReplMonths[month])
            if parts == re.findall(r'\w+', rendered.replace(month, variant))
        ]
        if not matches:
            return None
        newMonth = newTokenPars.strftime('%B')
        newVariants = self.dateReplMonths[newMonth]
        # Use the same spelling slot for the new month when it has one,
        # otherwise fall back to its first spelling.
        variantIdx = matches[0] if len(newVariants) > matches[0] else 0
        newRendered = newTokenPars.strftime(form).replace(newMonth, newVariants[variantIdx])
        return self._refillToken(
            pieces, re.findall(r'\w+', newRendered), lambda piece: piece.isalnum())

    @staticmethod
    def _refillToken(pieces, replacements, isSlot):
        """Join ``pieces``, replacing each piece accepted by ``isSlot``.

        Replacements are consumed in order. ``pieces`` is left unmodified.
        Raises ``IndexError`` when there are fewer replacements than slots.
        """
        rebuilt = []
        used = 0
        for piece in pieces:
            if isSlot(piece):
                rebuilt.append(replacements[used])
                used += 1
            else:
                rebuilt.append(piece)
        return ''.join(rebuilt)

    def getSurrogateName(self, sgFile, token, tokenNormCase, label, lex):
        """Pick a random surrogate name for ``token`` and register it.

        The candidates are ``lex[key]`` where ``key`` is what
        ``sgFile.getMapForChar`` returns for the upper-cased first character
        of ``token``. The choice is registered through ``sgFile.addSpellings``
        and the value then stored in ``sgFile.sub[label][token]`` is returned.
        """
        newToken = choice(lex[sgFile.getMapForChar(label, token[0].upper(), lex)])
        sgFile.addSpellings(token, newToken, tokenNormCase, self.normalizeTokenCase(newToken), label)
        return sgFile.sub[label][token]

    def getSurrogateAbbreviation(self, sgFile, token, label, lex):
        """Return a surrogate for an abbreviated token, or ``None``.

        Handled are single-character tokens and tokens of at most four
        characters that end in a dot. The surrogate is the value returned by
        ``sgFile.getMapForChar`` for the upper-cased first character (plus a
        trailing dot for dotted tokens), lower-cased when ``token`` is lower
        case. It is stored in ``sgFile.sub[label][token]``. Any other token,
        including the empty string, yields ``None``.
        """
        if len(token) == 1:
            dotted = False
        elif token.endswith('.') and len(token) <= 4:
            dotted = True
        else:
            return None
        newToken = sgFile.getMapForChar(label, token[0].upper(), lex)
        if dotted:
            newToken = newToken + '.'
        sgFile.sub[label][token] = newToken.lower() if token.islower() else newToken
        return sgFile.sub[label][token]

    def getCoSurrogate(self, sgFile, token):
        """Return the surrogate already recorded for the same entity, if any.

        Sets the token's normalized case first, then looks the token up in
        ``sgFile.sub[token.label]`` by exact text and, failing that, by its
        case-normalized form. Returns ``None`` when neither is present.
        """
        token.setNormCase(self.normalizeTokenCase(token.text))
        return sgFile.sub[token.label].get(token.text) or sgFile.sub[token.label].get(token.normCase)

    def getRandomDate(self, sgFile, token):
        """Return today's date shifted by ``sgFile.dateShift`` days.

        The date is formatted with ``dateStdFormat`` and stored in
        ``sgFile.sub[token.label][token.text]``.
        """
        surrogate = datetime.today() + timedelta(days=sgFile.dateShift)
        surrogate = surrogate.strftime(self.dateStdFormat)
        sgFile.sub[token.label][token.text] = surrogate
        return surrogate

    def normalizeTokenCase(self, token):
        """Return ``token`` with every word run in title case.

        Each run of word characters gets an upper-case first character and a
        lower-case remainder; runs of non-word characters are kept.
        """
        return ''.join(t[0].upper() + t[1:].lower() for t in re.findall(r'\W+|\w+', token))

    def readSubstituteLists(self, lexicon):
        """Read a substitute list file with one entry per line.

        Returns a ``defaultdict(list)`` that groups the right-stripped
        entries by their upper-cased first character. Blank lines are
        skipped so that no empty surrogate is recorded.
        """
        names = defaultdict(list)
        with open(lexicon) as handle:
            for line in handle:
                entry = line.rstrip()
                if not entry:
                    continue
                names[entry[0].upper()].append(entry)
        return names