diff --git a/addok_france/__init__.py b/addok_france/__init__.py index 2b20ccb..c1ae1b7 100644 --- a/addok_france/__init__.py +++ b/addok_france/__init__.py @@ -14,6 +14,7 @@ extract_address = yielder(utils.extract_address) glue_ordinal = utils.glue_ordinal fold_ordinal = yielder(utils.fold_ordinal) +glue_initials = utils.glue_initials flag_housenumber = utils.flag_housenumber make_labels = utils.make_labels remove_leading_zeros = yielder(utils.remove_leading_zeros) diff --git a/addok_france/utils.py b/addok_france/utils.py index 779ab89..c7d001c 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -50,10 +50,15 @@ def clean_query(q): - q = re.sub(r'([\d]{5})', r' \1 ', q, flags=re.IGNORECASE) + q = re.sub(r'(^| )(boite postale|b\.?p\.?|cs|tsa|cidex) *(n(o|°|) *|)[\d]+ *', + r'\1', q, flags=re.IGNORECASE) + q = re.sub(r'([\d]{2})[\d]{3}(.*)c(e|é)dex ?[\d]*', r'\1\2', + q, flags=re.IGNORECASE) + q = re.sub(r'([^\d ])([\d]{5})([^\d]|$)', r'\1 \2 ', + q, flags=re.IGNORECASE) q = re.sub('c(e|é)dex ?[\d]*', '', q, flags=re.IGNORECASE) - q = re.sub(r'\b(bp|cs|tsa|cidex) *[\d]*', '', q, flags=re.IGNORECASE) q = re.sub('\d{,2}(e|[eè]me) ([eé]tage)', '', q, flags=re.IGNORECASE) + q = re.sub(r'((fax|t[eé]l|t[eé]l[eé]copieur)[ :,\.]*|)(\d{10}|[0-9][0-9][ -\./]\d\d[-\./ ]\d\d[-\./ ]\d\d[-\./ ]\d\d)', '', q, flags=re.IGNORECASE) q = re.sub(' {2,}', ' ', q, flags=re.IGNORECASE) q = re.sub('[ -]s/[ -]', ' sur ', q, flags=re.IGNORECASE) q = re.sub('[ -]s/s[ -]', ' sous ', q, flags=re.IGNORECASE) @@ -108,7 +113,7 @@ def flag_housenumber(tokens): found = False for previous, token, next_ in neighborhood(tokens): if ((token.is_first or (next_ and TYPES_PATTERN.match(next_))) - and NUMBER_PATTERN.match(token) and not found): + and NUMBER_PATTERN.match(token) and not found): token.kind = 'housenumber' found = True yield token @@ -123,10 +128,28 @@ def fold_ordinal(s): pass else: s = s.update('{}{}'.format(number, - FOLD.get(ordinal.lower(), ordinal))) + 
def _merge_initials(run):
    """Return `run` glued into a single token when it holds 3+ initials.

    Shorter runs are returned unchanged so that isolated letters (or a pair
    of letters) are left as separate tokens.
    """
    if len(run) > 2:
        # Build the glued value through the first token's own `update`
        # method instead of emitting a bare str.
        # NOTE(review): presumably `update` returns a new token that keeps
        # the first initial's metadata -- confirm against the Token class.
        return [run[0].update("".join(run))]
    return run


def glue_initials(tokens):
    """Glue runs of single-letter tokens into one token ('F F I' => 'FFI').

    A token counts as an initial when it is exactly one alphabetic
    character. Three or more consecutive initials are merged into a single
    token; runs of one or two initials are yielded untouched, as is every
    other token. Relative order is preserved.

    The run is buffered and flushed when a non-initial token is met or when
    the input ends, so no look-ahead on the next token is needed and an
    empty iterable simply yields nothing.

    :param tokens: iterable of str-like tokens exposing `update(value)`.
    :return: generator of tokens.
    """
    run = []
    for token in tokens:
        if len(token) == 1 and token.isalpha():
            run.append(token)
            continue
        # A non-initial token closes the current run (possibly empty).
        for glued in _merge_initials(run):
            yield glued
        run = []
        yield token
    # Flush a run that reaches the very end of the input.
    for glued in _merge_initials(run):
        yield glued
N 7015 31068 TOULOUSE", + "2 allée Jules Guesde 31068 TOULOUSE"), ("BP 80111 159, avenue Jacques-Douzans 31604 Muret", "159, avenue Jacques-Douzans 31604 Muret"), ("12, place de l'Hôtel-de-Ville BP 46 02150 Sissonne", @@ -27,7 +31,7 @@ ("6, rue Winston-Churchill CS 40055 60321 Compiègne", "6, rue Winston-Churchill 60321 Compiègne"), ("BP 80111 159, avenue Jacques-Douzans 31604 Muret Cedex", - "159, avenue Jacques-Douzans 31604 Muret"), + "159, avenue Jacques-Douzans 31 Muret"), ("BP 20169 Cite administrative - 8e étage Rue Gustave-Delory 59017 Lille", "Cite administrative - Rue Gustave-Delory 59017 Lille"), ("12e étage Rue Gustave-Delory 59017 Lille", @@ -52,9 +56,27 @@ ("32bis Rue des Vosges93290", "32bis Rue des Vosges 93290"), ("20 avenue de Ségur TSA 30719 75334 Paris Cedex 07", - "20 avenue de Ségur 75334 Paris"), + "20 avenue de Ségur 75 Paris"), + ("20 avenue de Ségur TSA No30719 75334 Paris Cedex 07", + "20 avenue de Ségur 75 Paris"), + ("20 avenue de Ségur TSA N 30719 75334 Paris Cedex 07", + "20 avenue de Ségur 75 Paris"), ("20 rue saint germain CIDEX 304 89110 Poilly-sur-tholon", "20 rue saint germain 89110 Poilly-sur-tholon"), + ("20 rue saint germain CIDEX N°304 89110 Poilly-sur-tholon", + "20 rue saint germain 89110 Poilly-sur-tholon"), + ("20 rue saint germain 89110 Poilly-sur-tholon 01.23.45.67.89", + "20 rue saint germain 89110 Poilly-sur-tholon"), + ("32bis Rue des Vosges93290 fax: 0123456789", + "32bis Rue des Vosges 93290"), + ("32bis Rue des Vosges 93290 tel 01 23 45 67 89", + "32bis Rue des Vosges 93290"), + ("32bis Rue des Vosges 93290 telecopieur. 
@pytest.mark.parametrize("inputs,expected", [
    (['allee', 'a', 'b', 'c', 'toto'], ['allee', 'abc', 'toto']),
    (['allee', 'a', 'b', 'c', 'toto', 'd', 'e', 'f'],
     ['allee', 'abc', 'toto', 'def']),
    (['allee', 'a', '2', 'c', 'toto'], ['allee', 'a', '2', 'c', 'toto']),
    (['allee', 'a', 'b', 'c'], ['allee', 'abc']),
    (['allee', 'a', 'b', 'c', 'd'], ['allee', 'abcd']),
    (['allee', 'a', 'b', 'c', 'd', 'e'], ['allee', 'abcde']),
])
def test_glue_initials(inputs, expected):
    """Check glue_initials on runs of letters, mid-query and at the end.

    Covers runs of three to five single-letter tokens, two separate runs in
    one query, and a digit sitting between single letters.
    """
    wrapped = list(map(Token, inputs))
    glued = list(glue_initials(wrapped))
    assert glued == expected