Added code to replicate the pre-processing undertaken by the scala beta.

Signed-off-by: James Hoskisson <[email protected]>
ONSdigital · Apr 10, 2017 · 517445e · 517445e
1 parent 48a2f5a
commit 517445e
Show file tree

Hide file tree

Showing 4 changed files with 107 additions and 3 deletions.
diff --git a/.gitignore b/.gitignore
@@ -18,3 +18,5 @@ project/plugins/project/
 .DS_Store
 /batch/src/main/resources/application.conf
 DataScience/ProbabilisticParser/training/model*.txt
+*.pyc
+*.crfsuite
diff --git a/DataScience/ProbabilisticParser/common/tokens.py b/DataScience/ProbabilisticParser/common/tokens.py
@@ -37,6 +37,7 @@
 MODEL_FILE = 'addressCRF.crfsuite'
 directory = os.path.dirname(__file__)  # for relative path definitions
 MODEL_PATH = os.path.join(directory, '../training/')
+# Directory holding the pre/post-processing lookup files (county, synonym, ...). Resolved
+# relative to this file, like MODEL_PATH, rather than through one developer's home directory.
+# NOTE(review): assumes this file sits three levels below the repository root - confirm.
+LUT_PATH = os.path.join(directory, '../../../parsers/src/main/resources/input_pre_post_processing')
 
 # set labels - token names expected in the training file
 LABELS = ['OrganisationName',
@@ -67,12 +68,38 @@
 Ordinal = {'FIRST', '1ST', 'SECOND', '2ND', 'THIRD', '3RD', 'FOURTH', '4TH',
            'FIFTH', '5TH', 'SIXTH', '6TH', 'SEVENTH', '7TH', 'EIGHTH', '8TH'}
 
+# Read in the files required for tokenization pre-processing; each file is read as one entry per line.
+with open(os.path.join(LUT_PATH, 'county')) as f:
+    county = f.read().splitlines()
+with open(os.path.join(LUT_PATH, 'non_county_identification')) as f:
+    nonCountyIdentification = f.read().splitlines()
+with open(os.path.join(LUT_PATH, 'synonym')) as f:
+    synonyms = f.read().splitlines()
+# Create a dictionary for the synonyms; each line is split into 'token,replacement'.
+# Blank lines are skipped and only the first comma splits the line, so a stray empty
+# line or an extra comma no longer aborts the import with a ValueError from dict().
+# A non-blank line without any comma is still rejected.
+synonym_LUT = dict(line.split(',', 1) for line in synonyms if line.strip())
+
 # get some extra info - possible incodes and the linked post towns, used to identify tokens
 df = pd.read_csv(os.path.join(directory, '../../data/') + 'postcode_district_to_town.csv')
 OUTCODES = set(df['postcode'].values)
 POSTTOWNS = set(df['town'].values)
-# county?
 
+def synonym(token):
+    """
+    Map a token to its synonym as defined in the synonyms file.
+
+    :param token: The token to synonymize.
+    :type token: string
+
+    :return token_out: The synonym of the token passed in, or the token itself if it has no synonym.
+    :type: string
+    """
+
+    # a lookup with a default replaces the bare except, which also hid unrelated errors
+    token_out = synonym_LUT.get(token, token)
+
+    return token_out
 
 def _stripFormatting(collection):
     """
@@ -222,6 +249,43 @@ def tokens2features(tokens):
 
     return feature_sequence
 
+def replaceSynonyms(tokens):
+    """
+    Replace every token that has an entry in the synonym list with its synonym.
+
+    Tokens without a synonym are passed through unchanged and the order is preserved.
+
+    :param tokens: the list of tokens to replace with synonyms.
+    :type tokens: list of strings.
+
+    :return tokens: the synonymized list.
+    :type tokens: list of strings.
+    """
+
+    # build a real list: on Python 3 map() returns a one-shot iterator, not the documented list
+    tokens = [synonym(token) for token in tokens]
+
+    return tokens
+
+def removeCounties(in_string):
+    """
+    Remove any county from the county list, unless the next word starts with one of the
+    non-county identification suffixes.
+
+    Each removed county, together with the whitespace around it, is replaced by a single space.
+
+    :param in_string: the string from which to remove the counties.
+    :type in_string: str
+
+    :return out_string: the input string with the counties removed.
+    :type out_string: str
+    """
+
+    # drop empty entries: an empty alternative would match at every position of the input
+    counties = [name for name in county if name]
+    if not counties:
+        return in_string
+    suffixes = [suffix for suffix in nonCountyIdentification if suffix]
+
+    separatedCounties = '|'.join(counties)
+    countiesRegex = r'(?:\b|\s)*({sepCounties})(?:\s|\Z)*'.format(sepCounties=separatedCounties)
+    separatedSuffixes = '|'.join(suffixes)
+    # The suffix list must be interpolated with a {} placeholder; the previous '$sepSuffixes'
+    # text was matched literally, so the look-ahead never rejected anything.
+    # '(?!\s)' forces the greedy whitespace run above to be consumed in full; without it the
+    # engine backtracks to a shorter match and the suffix look-ahead never sees the next word.
+    # NOTE(review): the trailing '&' is carried over from the original pattern and binds only to
+    # the last alternative - confirm against the Scala implementation whether it is intended.
+    suffixesRegex = r'(?!\s)(?!{sepSuffixes}&)'.format(sepSuffixes=separatedSuffixes)
+
+    # regexp takes counties that don't have suffixes after them.
+    regexp = re.compile(countiesRegex + suffixesRegex)
+    out_string = regexp.sub(' ', in_string)
+
+    return out_string
 
 def tokenize(raw_string):
     """
@@ -241,8 +305,33 @@ def tokenize(raw_string):
         except:
             raw_string = str(raw_string)
 
+    # Normalize the input string according to the pre-processing in the beta.
+
+    # Convert to uppercase.
+    upperInput = raw_string.upper()
+    inputWithoutCounties = removeCounties(upperInput)
+
+    # Do the regular expression replacements as per the scala parsing.
+    regex1 = re.compile("(\\d+[A-Z]?) *- *(\\d+[A-Z]?)")
+    tokens = regex1.sub("\g<1>-\g<2>", inputWithoutCounties)
+
+    regex2 = re.compile("(\\d+)/(\\d+)")
+    tokens = regex2.sub("\g<1>-\g<2>", tokens)
+
+    regex3 = re.compile("(\\d+) *TO *(\\d+)")
+    tokens = regex3.sub("\g<1>-\g<2>", tokens)
+
+    # Do the non-regular expression replacements.
+    tokens = tokens.replace(" IN ", " ").replace(" CO ", " ").replace(" - ", " ").replace(",", " ").replace("\\", " ")
+
+    # Now split the string by whitespace.
+    tokens = tokens.split()
+
+    # Replace any synonyms from the synonyms list and remove any counties that are now discoverable.
+    preprocessed_string = removeCounties(' '.join(replaceSynonyms(tokens)))
+
     re_tokens = re.compile(r"\(*\b[^\s,;#&()]+[.,;)\n]* | [#&]", re.VERBOSE | re.UNICODE)
-    tokens = re_tokens.findall(raw_string)
+    tokens = re_tokens.findall(preprocessed_string)
 
     if not tokens:
         return []

diff --git a/DataScience/ProbabilisticParser/parser.py b/DataScience/ProbabilisticParser/parser.py
@@ -201,6 +201,9 @@ def test(raw_string='ONS LIMITED FLAT 1 12 OXFORD STREET STREET ST1 2FW', verbos
     print('\nCRFsuite call results:')
     os.system('crfsuite tag -pit -m training/addressCRF.crfsuite training/test.txt')
 
+def test_county(raw_string = '7 Gate Reach, Exeter, Berks, EX2 6GA'):
+    """
+    Print an address followed by the result of tagging it.
+
+    :param raw_string: the address string to tag.
+    :type raw_string: str
+    """
+    print('Input string:', raw_string)
+    tagged = tag(raw_string)
+    print('Python Results:', tagged)
 
 if __name__ == "__main__":
-    test()
+    test_county()
diff --git a/DataScience/ProbabilisticParser/training/test.txt b/DataScience/ProbabilisticParser/training/test.txt
@@ -0,0 +1,10 @@
+OrganisationName	business:0.0	company:0.0	digits\:no_digits:1.0	directional:0.0	endsinpunc:0.0	flat:0.0	has.vowels:1.0	hyphenations:0.0	length\:w\:3:1.0	locational:0.0	next\:business:0.0	next\:company:1.0	next\:digits\:no_digits:1.0	next\:directional:0.0	next\:endsinpunc:0.0	next\:flat:0.0	next\:has.vowels:1.0	next\:hyphenations:0.0	next\:length\:w\:7:1.0	next\:locational:0.0	next\:ordinal:0.0	next\:outcode:0.0	next\:posttown:0.0	next\:residential:0.0	next\:road:0.0	next\:word\:LIMITED:1.0	ordinal:0.0	outcode:0.0	posttown:0.0	rawstring.start:1.0	residential:0.0	road:0.0	word\:ONS:1.0	
+OrganisationName	business:0.0	company:1.0	digits\:no_digits:1.0	directional:0.0	endsinpunc:0.0	flat:0.0	has.vowels:1.0	hyphenations:0.0	length\:w\:7:1.0	locational:0.0	next\:business:0.0	next\:company:0.0	next\:digits\:no_digits:1.0	next\:directional:0.0	next\:endsinpunc:0.0	next\:flat:1.0	next\:has.vowels:1.0	next\:hyphenations:0.0	next\:length\:w\:4:1.0	next\:locational:0.0	next\:ordinal:0.0	next\:outcode:0.0	next\:posttown:0.0	next\:residential:0.0	next\:road:0.0	next\:word\:FLAT:1.0	ordinal:0.0	outcode:0.0	posttown:0.0	previous\:business:0.0	previous\:company:0.0	previous\:digits\:no_digits:1.0	previous\:directional:0.0	previous\:endsinpunc:0.0	previous\:flat:0.0	previous\:has.vowels:1.0	previous\:hyphenations:0.0	previous\:length\:w\:3:1.0	previous\:locational:0.0	previous\:ordinal:0.0	previous\:outcode:0.0	previous\:posttown:0.0	previous\:rawstring.start:1.0	previous\:residential:0.0	previous\:road:0.0	previous\:word\:ONS:1.0	residential:0.0	road:0.0	word\:LIMITED:1.0	
+SubBuildingName	business:0.0	company:0.0	digits\:no_digits:1.0	directional:0.0	endsinpunc:0.0	flat:1.0	has.vowels:1.0	hyphenations:0.0	length\:w\:4:1.0	locational:0.0	next\:business:0.0	next\:company:0.0	next\:digits\:all_digits:1.0	next\:directional:0.0	next\:endsinpunc:0.0	next\:flat:0.0	next\:has.vowels:0.0	next\:hyphenations:0.0	next\:length\:d\:1:1.0	next\:locational:0.0	next\:ordinal:0.0	next\:outcode:0.0	next\:posttown:0.0	next\:residential:0.0	next\:road:0.0	next\:word:0.0	ordinal:0.0	outcode:0.0	posttown:0.0	previous\:business:0.0	previous\:company:1.0	previous\:digits\:no_digits:1.0	previous\:directional:0.0	previous\:endsinpunc:0.0	previous\:flat:0.0	previous\:has.vowels:1.0	previous\:hyphenations:0.0	previous\:length\:w\:7:1.0	previous\:locational:0.0	previous\:ordinal:0.0	previous\:outcode:0.0	previous\:posttown:0.0	previous\:residential:0.0	previous\:road:0.0	previous\:word\:LIMITED:1.0	residential:0.0	road:0.0	word\:FLAT:1.0	
+SubBuildingName	business:0.0	company:0.0	digits\:all_digits:1.0	directional:0.0	endsinpunc:0.0	flat:0.0	has.vowels:0.0	hyphenations:0.0	length\:d\:1:1.0	locational:0.0	next\:business:0.0	next\:company:0.0	next\:digits\:all_digits:1.0	next\:directional:0.0	next\:endsinpunc:0.0	next\:flat:0.0	next\:has.vowels:0.0	next\:hyphenations:0.0	next\:length\:d\:2:1.0	next\:locational:0.0	next\:ordinal:0.0	next\:outcode:0.0	next\:posttown:0.0	next\:residential:0.0	next\:road:0.0	next\:word:0.0	ordinal:0.0	outcode:0.0	posttown:0.0	previous\:business:0.0	previous\:company:0.0	previous\:digits\:no_digits:1.0	previous\:directional:0.0	previous\:endsinpunc:0.0	previous\:flat:1.0	previous\:has.vowels:1.0	previous\:hyphenations:0.0	previous\:length\:w\:4:1.0	previous\:locational:0.0	previous\:ordinal:0.0	previous\:outcode:0.0	previous\:posttown:0.0	previous\:residential:0.0	previous\:road:0.0	previous\:word\:FLAT:1.0	residential:0.0	road:0.0	word:0.0	
+BuildingNumber	business:0.0	company:0.0	digits\:all_digits:1.0	directional:0.0	endsinpunc:0.0	flat:0.0	has.vowels:0.0	hyphenations:0.0	length\:d\:2:1.0	locational:0.0	next\:business:0.0	next\:company:0.0	next\:digits\:no_digits:1.0	next\:directional:0.0	next\:endsinpunc:0.0	next\:flat:0.0	next\:has.vowels:1.0	next\:hyphenations:0.0	next\:length\:w\:6:1.0	next\:locational:0.0	next\:ordinal:0.0	next\:outcode:0.0	next\:posttown:1.0	next\:residential:0.0	next\:road:0.0	next\:word\:OXFORD:1.0	ordinal:0.0	outcode:0.0	posttown:0.0	previous\:business:0.0	previous\:company:0.0	previous\:digits\:all_digits:1.0	previous\:directional:0.0	previous\:endsinpunc:0.0	previous\:flat:0.0	previous\:has.vowels:0.0	previous\:hyphenations:0.0	previous\:length\:d\:1:1.0	previous\:locational:0.0	previous\:ordinal:0.0	previous\:outcode:0.0	previous\:posttown:0.0	previous\:residential:0.0	previous\:road:0.0	previous\:word:0.0	residential:0.0	road:0.0	word:0.0	
+StreetName	business:0.0	company:0.0	digits\:no_digits:1.0	directional:0.0	endsinpunc:0.0	flat:0.0	has.vowels:1.0	hyphenations:0.0	length\:w\:6:1.0	locational:0.0	next\:business:0.0	next\:company:0.0	next\:digits\:no_digits:1.0	next\:directional:0.0	next\:endsinpunc:0.0	next\:flat:0.0	next\:has.vowels:1.0	next\:hyphenations:0.0	next\:length\:w\:6:1.0	next\:locational:0.0	next\:ordinal:0.0	next\:outcode:0.0	next\:posttown:1.0	next\:residential:0.0	next\:road:1.0	next\:word\:STREET:1.0	ordinal:0.0	outcode:0.0	posttown:1.0	previous\:business:0.0	previous\:company:0.0	previous\:digits\:all_digits:1.0	previous\:directional:0.0	previous\:endsinpunc:0.0	previous\:flat:0.0	previous\:has.vowels:0.0	previous\:hyphenations:0.0	previous\:length\:d\:2:1.0	previous\:locational:0.0	previous\:ordinal:0.0	previous\:outcode:0.0	previous\:posttown:0.0	previous\:residential:0.0	previous\:road:0.0	previous\:word:0.0	residential:0.0	road:0.0	word\:OXFORD:1.0	
+StreetName	business:0.0	company:0.0	digits\:no_digits:1.0	directional:0.0	endsinpunc:0.0	flat:0.0	has.vowels:1.0	hyphenations:0.0	length\:w\:6:1.0	locational:0.0	next\:business:0.0	next\:company:0.0	next\:digits\:no_digits:1.0	next\:directional:0.0	next\:endsinpunc:0.0	next\:flat:0.0	next\:has.vowels:1.0	next\:hyphenations:0.0	next\:length\:w\:6:1.0	next\:locational:0.0	next\:ordinal:0.0	next\:outcode:0.0	next\:posttown:1.0	next\:residential:0.0	next\:road:1.0	next\:word\:STREET:1.0	ordinal:0.0	outcode:0.0	posttown:1.0	previous\:business:0.0	previous\:company:0.0	previous\:digits\:no_digits:1.0	previous\:directional:0.0	previous\:endsinpunc:0.0	previous\:flat:0.0	previous\:has.vowels:1.0	previous\:hyphenations:0.0	previous\:length\:w\:6:1.0	previous\:locational:0.0	previous\:ordinal:0.0	previous\:outcode:0.0	previous\:posttown:1.0	previous\:residential:0.0	previous\:road:0.0	previous\:word\:OXFORD:1.0	residential:0.0	road:1.0	word\:STREET:1.0	
+StreetName	business:0.0	company:0.0	digits\:no_digits:1.0	directional:0.0	endsinpunc:0.0	flat:0.0	has.vowels:1.0	hyphenations:0.0	length\:w\:6:1.0	locational:0.0	next\:business:0.0	next\:company:0.0	next\:digits\:some_digits:1.0	next\:directional:0.0	next\:endsinpunc:0.0	next\:flat:0.0	next\:has.vowels:0.0	next\:hyphenations:0.0	next\:length\:w\:3:1.0	next\:locational:0.0	next\:ordinal:0.0	next\:outcode:1.0	next\:posttown:0.0	next\:residential:0.0	next\:road:0.0	next\:word\:ST1:1.0	ordinal:0.0	outcode:0.0	posttown:1.0	previous\:business:0.0	previous\:company:0.0	previous\:digits\:no_digits:1.0	previous\:directional:0.0	previous\:endsinpunc:0.0	previous\:flat:0.0	previous\:has.vowels:1.0	previous\:hyphenations:0.0	previous\:length\:w\:6:1.0	previous\:locational:0.0	previous\:ordinal:0.0	previous\:outcode:0.0	previous\:posttown:1.0	previous\:residential:0.0	previous\:road:1.0	previous\:word\:STREET:1.0	residential:0.0	road:1.0	word\:STREET:1.0	
+Postcode	business:0.0	company:0.0	digits\:some_digits:1.0	directional:0.0	endsinpunc:0.0	flat:0.0	has.vowels:0.0	hyphenations:0.0	length\:w\:3:1.0	locational:0.0	next\:business:0.0	next\:company:0.0	next\:digits\:some_digits:1.0	next\:directional:0.0	next\:endsinpunc:0.0	next\:flat:0.0	next\:has.vowels:0.0	next\:hyphenations:0.0	next\:length\:w\:3:1.0	next\:locational:0.0	next\:ordinal:0.0	next\:outcode:0.0	next\:posttown:0.0	next\:rawstring.end:1.0	next\:residential:0.0	next\:road:0.0	next\:word\:2FW:1.0	ordinal:0.0	outcode:1.0	posttown:0.0	previous\:business:0.0	previous\:company:0.0	previous\:digits\:no_digits:1.0	previous\:directional:0.0	previous\:endsinpunc:0.0	previous\:flat:0.0	previous\:has.vowels:1.0	previous\:hyphenations:0.0	previous\:length\:w\:6:1.0	previous\:locational:0.0	previous\:ordinal:0.0	previous\:outcode:0.0	previous\:posttown:1.0	previous\:residential:0.0	previous\:road:1.0	previous\:word\:STREET:1.0	residential:0.0	road:0.0	word\:ST1:1.0	
+Postcode	business:0.0	company:0.0	digits\:some_digits:1.0	directional:0.0	endsinpunc:0.0	flat:0.0	has.vowels:0.0	hyphenations:0.0	length\:w\:3:1.0	locational:0.0	ordinal:0.0	outcode:0.0	posttown:0.0	previous\:business:0.0	previous\:company:0.0	previous\:digits\:some_digits:1.0	previous\:directional:0.0	previous\:endsinpunc:0.0	previous\:flat:0.0	previous\:has.vowels:0.0	previous\:hyphenations:0.0	previous\:length\:w\:3:1.0	previous\:locational:0.0	previous\:ordinal:0.0	previous\:outcode:1.0	previous\:posttown:0.0	previous\:residential:0.0	previous\:road:0.0	previous\:word\:ST1:1.0	rawstring.end:1.0	residential:0.0	road:0.0	word\:2FW:1.0