Skip to content
This repository has been archived by the owner on Sep 2, 2024. It is now read-only.

Commit

Permalink
Added code to replicate the pre-processing undertaken by the scala beta.
Browse files Browse the repository at this point in the history
Signed-off-by: James Hoskisson <[email protected]>
  • Loading branch information
James Hoskisson authored and James Hoskisson committed Apr 10, 2017
1 parent 48a2f5a commit 517445e
Show file tree
Hide file tree
Showing 4 changed files with 107 additions and 3 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,5 @@ project/plugins/project/
.DS_Store
/batch/src/main/resources/application.conf
DataScience/ProbabilisticParser/training/model*.txt
*.pyc
*.crfsuite
93 changes: 91 additions & 2 deletions DataScience/ProbabilisticParser/common/tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
MODEL_FILE = 'addressCRF.crfsuite'
directory = os.path.dirname(__file__)  # for relative path definitions
MODEL_PATH = os.path.join(directory, '../training/')
# Lookup tables used for input pre/post processing. The location is resolved
# relative to this file instead of one developer's home directory so that the
# module can be imported from any checkout.
# NOTE(review): assumes this file lives three levels below the repository root
# (DataScience/ProbabilisticParser/common/) next to the `parsers` module - confirm.
LUT_PATH = os.path.join(directory, '../../../parsers/src/main/resources/input_pre_post_processing')

# set labels - token names expected in the training file
LABELS = ['OrganisationName',
Expand Down Expand Up @@ -67,12 +68,38 @@
# Ordinal words and their numeric abbreviations, first through eighth.
Ordinal = {'FIRST', '1ST', 'SECOND', '2ND', 'THIRD', '3RD', 'FOURTH', '4TH',
           'FIFTH', '5TH', 'SIXTH', '6TH', 'SEVENTH', '7TH', 'EIGHTH', '8TH'}

# Read in the files required for tokenization pre-processing.
def _readLookupLines(name):
    """
    Read one lookup file from LUT_PATH.

    :param name: file name inside the LUT_PATH directory.
    :type name: str
    :return: the lines of the file, without line endings and without blank lines.
    :rtype: list of str
    """
    with open(os.path.join(LUT_PATH, name)) as f:
        # Blank lines carry no entry; if kept they would later turn into empty
        # regex alternatives (counties) or unparsable synonym rows.
        return [line for line in f.read().splitlines() if line.strip()]


county = _readLookupLines('county')
nonCountyIdentification = _readLookupLines('non_county_identification')
synonyms = _readLookupLines('synonym')

# Create a dictionary for the synonyms. Each row is 'token,replacement'; split
# on the first comma only and ignore rows without a comma so that a single
# malformed line cannot abort the import with a ValueError.
synonym_LUT = dict(line.split(',', 1) for line in synonyms if ',' in line)

# Extra lookup information used to identify tokens: the known postcode
# districts (outcodes) and the post towns linked to them.
df = pd.read_csv(os.path.join(directory, '../../data/', 'postcode_district_to_town.csv'))
OUTCODES = set(df['postcode'].values)
POSTTOWNS = set(df['town'].values)
# county?

def synonym(token):
    """
    Map a token to its replacement from the synonym lookup table.

    :param token: The token to synonymize.
    :type token: string
    :return token_out: The synonym of the token passed in, or the token itself
                       when the lookup table has no entry for it.
    :type: string
    """
    # Only a missing key should fall back to the input token; a bare `except`
    # would also hide unrelated failures (e.g. the lookup table not being loaded).
    return synonym_LUT.get(token, token)

def _stripFormatting(collection):
"""
Expand Down Expand Up @@ -222,6 +249,43 @@ def tokens2features(tokens):

return feature_sequence

def replaceSynonyms(tokens):
    """
    Replace every token that appears in the synonym list with its synonym.

    :param tokens: the tokens to synonymize.
    :type tokens: list of strings
    :return: the result of mapping `synonym` over the tokens, in the same order.
    :rtype: whatever the built-in `map` returns for the running Python version
    """
    return map(synonym, tokens)

def removeCounties(in_string, counties=None, suffixes=None):
    """
    Remove any county from the string unless it is directly followed by one of
    the non-county suffixes (in which case the word is not acting as a county).

    :param in_string: the string from which to remove the counties.
    :type in_string: str
    :param counties: county names to remove; defaults to the module-level `county` list.
    :type counties: list of str
    :param suffixes: words that, when following a county name, prevent its removal;
                     defaults to the module-level `nonCountyIdentification` list.
    :type suffixes: list of str
    :return out_string: the input string with the counties replaced by a single space.
    :type out_string: str
    """
    if counties is None:
        counties = county
    if suffixes is None:
        suffixes = nonCountyIdentification

    # Blank entries would become empty regex alternatives that match everywhere.
    counties = [entry for entry in counties if entry.strip()]
    suffixes = [entry for entry in suffixes if entry.strip()]

    # Without any county there is nothing to remove; an empty alternation
    # would otherwise match at every position of the string.
    if not counties:
        return in_string

    # NOTE(review): entries are inserted into the pattern unescaped, as before -
    # confirm that the lookup files never contain regex metacharacters.
    countiesRegex = r'(?:\b|\s)*({sepCounties})(?:\s|\Z)*'.format(sepCounties='|'.join(counties))

    if suffixes:
        # The suffix alternatives must actually be interpolated into the
        # lookahead. The leading \s* keeps the lookahead effective when the
        # engine backtracks over the optional whitespace after the county.
        suffixesRegex = r'(?!\s*(?:{sepSuffixes}))'.format(sepSuffixes='|'.join(suffixes))
    else:
        # An empty negative lookahead `(?!)` can never succeed, which would
        # stop every county from being removed.
        suffixesRegex = ''

    # regexp takes counties that don't have suffixes after them.
    regexp = re.compile(countiesRegex + suffixesRegex)
    return regexp.sub(' ', in_string)

def tokenize(raw_string):
"""
Expand All @@ -241,8 +305,33 @@ def tokenize(raw_string):
except:
raw_string = str(raw_string)

# Normalize the input string according to the pre-processing in the beta.

# Convert to uppercase.
upperInput = raw_string.upper()
inputWithoutCounties = removeCounties(upperInput)

# Do the regular expression replacements as per the scala parsing.
regex1 = re.compile("(\\d+[A-Z]?) *- *(\\d+[A-Z]?)")
tokens = regex1.sub("\g<1>-\g<2>", inputWithoutCounties)

regex2 = re.compile("(\\d+)/(\\d+)")
tokens = regex2.sub("\g<1>-\g<2>", tokens)

regex3 = re.compile("(\\d+) *TO *(\\d+)")
tokens = regex3.sub("\g<1>-\g<2>", tokens)

# Do the non-regular expression replacements.
tokens = tokens.replace(" IN ", " ").replace(" CO ", " ").replace(" - ", " ").replace(",", " ").replace("\\", " ")

# Now split the string by whitespace.
tokens = tokens.split()

# Replace any synonyms from the synonyms list and remove any counties that are now discoverable.
preprocessed_string = removeCounties(' '.join(replaceSynonyms(tokens)))

re_tokens = re.compile(r"\(*\b[^\s,;#&()]+[.,;)\n]* | [#&]", re.VERBOSE | re.UNICODE)
tokens = re_tokens.findall(raw_string)
tokens = re_tokens.findall(preprocessed_string)

if not tokens:
return []
Expand Down
5 changes: 4 additions & 1 deletion DataScience/ProbabilisticParser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,9 @@ def test(raw_string='ONS LIMITED FLAT 1 12 OXFORD STREET STREET ST1 2FW', verbos
print('\nCRFsuite call results:')
os.system('crfsuite tag -pit -m training/addressCRF.crfsuite training/test.txt')

def test_county(raw_string='7 Gate Reach, Exeter, Berks, EX2 6GA'):
    """
    Print an address string and the result of tagging it.

    :param raw_string: the address to tag; the default contains the county abbreviation 'Berks'.
    :type raw_string: str
    """
    print('Input string:', raw_string)
    print('Python Results:', tag(raw_string))

# When executed as a script, run the default parser check followed by the
# county check.
if __name__ == "__main__":
    test()
    test_county()
10 changes: 10 additions & 0 deletions DataScience/ProbabilisticParser/training/test.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
OrganisationName business:0.0 company:0.0 digits\:no_digits:1.0 directional:0.0 endsinpunc:0.0 flat:0.0 has.vowels:1.0 hyphenations:0.0 length\:w\:3:1.0 locational:0.0 next\:business:0.0 next\:company:1.0 next\:digits\:no_digits:1.0 next\:directional:0.0 next\:endsinpunc:0.0 next\:flat:0.0 next\:has.vowels:1.0 next\:hyphenations:0.0 next\:length\:w\:7:1.0 next\:locational:0.0 next\:ordinal:0.0 next\:outcode:0.0 next\:posttown:0.0 next\:residential:0.0 next\:road:0.0 next\:word\:LIMITED:1.0 ordinal:0.0 outcode:0.0 posttown:0.0 rawstring.start:1.0 residential:0.0 road:0.0 word\:ONS:1.0
OrganisationName business:0.0 company:1.0 digits\:no_digits:1.0 directional:0.0 endsinpunc:0.0 flat:0.0 has.vowels:1.0 hyphenations:0.0 length\:w\:7:1.0 locational:0.0 next\:business:0.0 next\:company:0.0 next\:digits\:no_digits:1.0 next\:directional:0.0 next\:endsinpunc:0.0 next\:flat:1.0 next\:has.vowels:1.0 next\:hyphenations:0.0 next\:length\:w\:4:1.0 next\:locational:0.0 next\:ordinal:0.0 next\:outcode:0.0 next\:posttown:0.0 next\:residential:0.0 next\:road:0.0 next\:word\:FLAT:1.0 ordinal:0.0 outcode:0.0 posttown:0.0 previous\:business:0.0 previous\:company:0.0 previous\:digits\:no_digits:1.0 previous\:directional:0.0 previous\:endsinpunc:0.0 previous\:flat:0.0 previous\:has.vowels:1.0 previous\:hyphenations:0.0 previous\:length\:w\:3:1.0 previous\:locational:0.0 previous\:ordinal:0.0 previous\:outcode:0.0 previous\:posttown:0.0 previous\:rawstring.start:1.0 previous\:residential:0.0 previous\:road:0.0 previous\:word\:ONS:1.0 residential:0.0 road:0.0 word\:LIMITED:1.0
SubBuildingName business:0.0 company:0.0 digits\:no_digits:1.0 directional:0.0 endsinpunc:0.0 flat:1.0 has.vowels:1.0 hyphenations:0.0 length\:w\:4:1.0 locational:0.0 next\:business:0.0 next\:company:0.0 next\:digits\:all_digits:1.0 next\:directional:0.0 next\:endsinpunc:0.0 next\:flat:0.0 next\:has.vowels:0.0 next\:hyphenations:0.0 next\:length\:d\:1:1.0 next\:locational:0.0 next\:ordinal:0.0 next\:outcode:0.0 next\:posttown:0.0 next\:residential:0.0 next\:road:0.0 next\:word:0.0 ordinal:0.0 outcode:0.0 posttown:0.0 previous\:business:0.0 previous\:company:1.0 previous\:digits\:no_digits:1.0 previous\:directional:0.0 previous\:endsinpunc:0.0 previous\:flat:0.0 previous\:has.vowels:1.0 previous\:hyphenations:0.0 previous\:length\:w\:7:1.0 previous\:locational:0.0 previous\:ordinal:0.0 previous\:outcode:0.0 previous\:posttown:0.0 previous\:residential:0.0 previous\:road:0.0 previous\:word\:LIMITED:1.0 residential:0.0 road:0.0 word\:FLAT:1.0
SubBuildingName business:0.0 company:0.0 digits\:all_digits:1.0 directional:0.0 endsinpunc:0.0 flat:0.0 has.vowels:0.0 hyphenations:0.0 length\:d\:1:1.0 locational:0.0 next\:business:0.0 next\:company:0.0 next\:digits\:all_digits:1.0 next\:directional:0.0 next\:endsinpunc:0.0 next\:flat:0.0 next\:has.vowels:0.0 next\:hyphenations:0.0 next\:length\:d\:2:1.0 next\:locational:0.0 next\:ordinal:0.0 next\:outcode:0.0 next\:posttown:0.0 next\:residential:0.0 next\:road:0.0 next\:word:0.0 ordinal:0.0 outcode:0.0 posttown:0.0 previous\:business:0.0 previous\:company:0.0 previous\:digits\:no_digits:1.0 previous\:directional:0.0 previous\:endsinpunc:0.0 previous\:flat:1.0 previous\:has.vowels:1.0 previous\:hyphenations:0.0 previous\:length\:w\:4:1.0 previous\:locational:0.0 previous\:ordinal:0.0 previous\:outcode:0.0 previous\:posttown:0.0 previous\:residential:0.0 previous\:road:0.0 previous\:word\:FLAT:1.0 residential:0.0 road:0.0 word:0.0
BuildingNumber business:0.0 company:0.0 digits\:all_digits:1.0 directional:0.0 endsinpunc:0.0 flat:0.0 has.vowels:0.0 hyphenations:0.0 length\:d\:2:1.0 locational:0.0 next\:business:0.0 next\:company:0.0 next\:digits\:no_digits:1.0 next\:directional:0.0 next\:endsinpunc:0.0 next\:flat:0.0 next\:has.vowels:1.0 next\:hyphenations:0.0 next\:length\:w\:6:1.0 next\:locational:0.0 next\:ordinal:0.0 next\:outcode:0.0 next\:posttown:1.0 next\:residential:0.0 next\:road:0.0 next\:word\:OXFORD:1.0 ordinal:0.0 outcode:0.0 posttown:0.0 previous\:business:0.0 previous\:company:0.0 previous\:digits\:all_digits:1.0 previous\:directional:0.0 previous\:endsinpunc:0.0 previous\:flat:0.0 previous\:has.vowels:0.0 previous\:hyphenations:0.0 previous\:length\:d\:1:1.0 previous\:locational:0.0 previous\:ordinal:0.0 previous\:outcode:0.0 previous\:posttown:0.0 previous\:residential:0.0 previous\:road:0.0 previous\:word:0.0 residential:0.0 road:0.0 word:0.0
StreetName business:0.0 company:0.0 digits\:no_digits:1.0 directional:0.0 endsinpunc:0.0 flat:0.0 has.vowels:1.0 hyphenations:0.0 length\:w\:6:1.0 locational:0.0 next\:business:0.0 next\:company:0.0 next\:digits\:no_digits:1.0 next\:directional:0.0 next\:endsinpunc:0.0 next\:flat:0.0 next\:has.vowels:1.0 next\:hyphenations:0.0 next\:length\:w\:6:1.0 next\:locational:0.0 next\:ordinal:0.0 next\:outcode:0.0 next\:posttown:1.0 next\:residential:0.0 next\:road:1.0 next\:word\:STREET:1.0 ordinal:0.0 outcode:0.0 posttown:1.0 previous\:business:0.0 previous\:company:0.0 previous\:digits\:all_digits:1.0 previous\:directional:0.0 previous\:endsinpunc:0.0 previous\:flat:0.0 previous\:has.vowels:0.0 previous\:hyphenations:0.0 previous\:length\:d\:2:1.0 previous\:locational:0.0 previous\:ordinal:0.0 previous\:outcode:0.0 previous\:posttown:0.0 previous\:residential:0.0 previous\:road:0.0 previous\:word:0.0 residential:0.0 road:0.0 word\:OXFORD:1.0
StreetName business:0.0 company:0.0 digits\:no_digits:1.0 directional:0.0 endsinpunc:0.0 flat:0.0 has.vowels:1.0 hyphenations:0.0 length\:w\:6:1.0 locational:0.0 next\:business:0.0 next\:company:0.0 next\:digits\:no_digits:1.0 next\:directional:0.0 next\:endsinpunc:0.0 next\:flat:0.0 next\:has.vowels:1.0 next\:hyphenations:0.0 next\:length\:w\:6:1.0 next\:locational:0.0 next\:ordinal:0.0 next\:outcode:0.0 next\:posttown:1.0 next\:residential:0.0 next\:road:1.0 next\:word\:STREET:1.0 ordinal:0.0 outcode:0.0 posttown:1.0 previous\:business:0.0 previous\:company:0.0 previous\:digits\:no_digits:1.0 previous\:directional:0.0 previous\:endsinpunc:0.0 previous\:flat:0.0 previous\:has.vowels:1.0 previous\:hyphenations:0.0 previous\:length\:w\:6:1.0 previous\:locational:0.0 previous\:ordinal:0.0 previous\:outcode:0.0 previous\:posttown:1.0 previous\:residential:0.0 previous\:road:0.0 previous\:word\:OXFORD:1.0 residential:0.0 road:1.0 word\:STREET:1.0
StreetName business:0.0 company:0.0 digits\:no_digits:1.0 directional:0.0 endsinpunc:0.0 flat:0.0 has.vowels:1.0 hyphenations:0.0 length\:w\:6:1.0 locational:0.0 next\:business:0.0 next\:company:0.0 next\:digits\:some_digits:1.0 next\:directional:0.0 next\:endsinpunc:0.0 next\:flat:0.0 next\:has.vowels:0.0 next\:hyphenations:0.0 next\:length\:w\:3:1.0 next\:locational:0.0 next\:ordinal:0.0 next\:outcode:1.0 next\:posttown:0.0 next\:residential:0.0 next\:road:0.0 next\:word\:ST1:1.0 ordinal:0.0 outcode:0.0 posttown:1.0 previous\:business:0.0 previous\:company:0.0 previous\:digits\:no_digits:1.0 previous\:directional:0.0 previous\:endsinpunc:0.0 previous\:flat:0.0 previous\:has.vowels:1.0 previous\:hyphenations:0.0 previous\:length\:w\:6:1.0 previous\:locational:0.0 previous\:ordinal:0.0 previous\:outcode:0.0 previous\:posttown:1.0 previous\:residential:0.0 previous\:road:1.0 previous\:word\:STREET:1.0 residential:0.0 road:1.0 word\:STREET:1.0
Postcode business:0.0 company:0.0 digits\:some_digits:1.0 directional:0.0 endsinpunc:0.0 flat:0.0 has.vowels:0.0 hyphenations:0.0 length\:w\:3:1.0 locational:0.0 next\:business:0.0 next\:company:0.0 next\:digits\:some_digits:1.0 next\:directional:0.0 next\:endsinpunc:0.0 next\:flat:0.0 next\:has.vowels:0.0 next\:hyphenations:0.0 next\:length\:w\:3:1.0 next\:locational:0.0 next\:ordinal:0.0 next\:outcode:0.0 next\:posttown:0.0 next\:rawstring.end:1.0 next\:residential:0.0 next\:road:0.0 next\:word\:2FW:1.0 ordinal:0.0 outcode:1.0 posttown:0.0 previous\:business:0.0 previous\:company:0.0 previous\:digits\:no_digits:1.0 previous\:directional:0.0 previous\:endsinpunc:0.0 previous\:flat:0.0 previous\:has.vowels:1.0 previous\:hyphenations:0.0 previous\:length\:w\:6:1.0 previous\:locational:0.0 previous\:ordinal:0.0 previous\:outcode:0.0 previous\:posttown:1.0 previous\:residential:0.0 previous\:road:1.0 previous\:word\:STREET:1.0 residential:0.0 road:0.0 word\:ST1:1.0
Postcode business:0.0 company:0.0 digits\:some_digits:1.0 directional:0.0 endsinpunc:0.0 flat:0.0 has.vowels:0.0 hyphenations:0.0 length\:w\:3:1.0 locational:0.0 ordinal:0.0 outcode:0.0 posttown:0.0 previous\:business:0.0 previous\:company:0.0 previous\:digits\:some_digits:1.0 previous\:directional:0.0 previous\:endsinpunc:0.0 previous\:flat:0.0 previous\:has.vowels:0.0 previous\:hyphenations:0.0 previous\:length\:w\:3:1.0 previous\:locational:0.0 previous\:ordinal:0.0 previous\:outcode:1.0 previous\:posttown:0.0 previous\:residential:0.0 previous\:road:0.0 previous\:word\:ST1:1.0 rawstring.end:1.0 residential:0.0 road:0.0 word\:2FW:1.0

0 comments on commit 517445e

Please sign in to comment.