Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Glue initials #10

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
1 change: 1 addition & 0 deletions addok_france/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
extract_address = yielder(utils.extract_address)
glue_ordinal = utils.glue_ordinal
fold_ordinal = yielder(utils.fold_ordinal)
glue_initials = utils.glue_initials
flag_housenumber = utils.flag_housenumber
make_labels = utils.make_labels
remove_leading_zeros = yielder(utils.remove_leading_zeros)
31 changes: 27 additions & 4 deletions addok_france/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,15 @@


def clean_query(q):
q = re.sub(r'([\d]{5})', r' \1 ', q, flags=re.IGNORECASE)
q = re.sub(r'(^| )(boite postale|b\.?p\.?|cs|tsa|cidex) *(n(o|°|) *|)[\d]+ *',
r'\1', q, flags=re.IGNORECASE)
q = re.sub(r'([\d]{2})[\d]{3}(.*)c(e|é)dex ?[\d]*', r'\1\2',
q, flags=re.IGNORECASE)
q = re.sub(r'([^\d ])([\d]{5})([^\d]|$)', r'\1 \2 ',
q, flags=re.IGNORECASE)
q = re.sub('c(e|é)dex ?[\d]*', '', q, flags=re.IGNORECASE)
q = re.sub(r'\b(bp|cs|tsa|cidex) *[\d]*', '', q, flags=re.IGNORECASE)
q = re.sub('\d{,2}(e|[eè]me) ([eé]tage)', '', q, flags=re.IGNORECASE)
q = re.sub(r'((fax|t[eé]l|t[eé]l[eé]copieur)[ :,\.]*|)(\d{10}|[0-9][0-9][ -\./]\d\d[-\./ ]\d\d[-\./ ]\d\d[-\./ ]\d\d)', '', q, flags=re.IGNORECASE)
q = re.sub(' {2,}', ' ', q, flags=re.IGNORECASE)
q = re.sub('[ -]s/[ -]', ' sur ', q, flags=re.IGNORECASE)
q = re.sub('[ -]s/s[ -]', ' sous ', q, flags=re.IGNORECASE)
Expand Down Expand Up @@ -108,7 +113,7 @@ def flag_housenumber(tokens):
found = False
for previous, token, next_ in neighborhood(tokens):
if ((token.is_first or (next_ and TYPES_PATTERN.match(next_)))
and NUMBER_PATTERN.match(token) and not found):
and NUMBER_PATTERN.match(token) and not found):
token.kind = 'housenumber'
found = True
yield token
Expand All @@ -123,10 +128,28 @@ def fold_ordinal(s):
pass
else:
s = s.update('{}{}'.format(number,
FOLD.get(ordinal.lower(), ordinal)))
FOLD.get(ordinal.lower(), ordinal)))
return s


def _flush_initials(run):
    """Yield the pending run of initials, glued if it is long enough.

    A run of three or more single-letter tokens is emitted as one token,
    obtained by calling ``update`` on the first token of the run with the
    concatenated letters. Shorter runs are emitted unchanged, one token at
    a time, so that isolated letters are not merged.
    """
    if len(run) > 2:
        yield run[0].update("".join(run))
    else:
        yield from run


def glue_initials(tokens):
    """Glue runs of single-letter tokens: 'F F I' becomes 'FFI'.

    Consecutive tokens made of exactly one alphabetic character are
    collected; when the run ends (a different kind of token shows up or the
    stream is exhausted) it is emitted glued if it holds at least three
    letters, untouched otherwise. All other tokens pass through unchanged
    and in order. An empty token stream yields nothing.
    """
    run = []
    for token in tokens:
        if len(token) == 1 and token.isalpha():
            run.append(token)
            continue
        # A non-initial token closes the current run: emit the run first so
        # the original token order is preserved.
        yield from _flush_initials(run)
        run = []
        yield token
    # The stream may end in the middle of a run.
    yield from _flush_initials(run)


def remove_leading_zeros(s):
"""0003 => 3."""
# Limit digits from 1 to 3 in order to avoid processing postcodes.
Expand Down
53 changes: 47 additions & 6 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,30 @@
from addok.helpers.text import Token
from addok_france.utils import (clean_query, extract_address, flag_housenumber,
fold_ordinal, glue_ordinal, make_labels,
remove_leading_zeros)
remove_leading_zeros, glue_initials)


@pytest.mark.parametrize("input,expected", [
("2 allée Jules Guesde 31068 TOULOUSE CEDEX 7",
"2 allée Jules Guesde 31068 TOULOUSE"),
"2 allée Jules Guesde 31 TOULOUSE"),
("7, avenue Léon-Blum 31507 Toulouse Cedex 5",
"7, avenue Léon-Blum 31507 Toulouse"),
"7, avenue Léon-Blum 31 Toulouse"),
("159, avenue Jacques-Douzans 31604 Muret Cedex",
"159, avenue Jacques-Douzans 31604 Muret"),
"159, avenue Jacques-Douzans 31 Muret"),
("2 allée Jules Guesde BP 7015 31068 TOULOUSE",
"2 allée Jules Guesde 31068 TOULOUSE"),
("2 allée Jules Guesde B.P. 7015 31068 TOULOUSE",
"2 allée Jules Guesde 31068 TOULOUSE"),
("2 allée Jules Guesde B.P. N 7015 31068 TOULOUSE",
"2 allée Jules Guesde 31068 TOULOUSE"),
("BP 80111 159, avenue Jacques-Douzans 31604 Muret",
"159, avenue Jacques-Douzans 31604 Muret"),
("12, place de l'Hôtel-de-Ville BP 46 02150 Sissonne",
"12, place de l'Hôtel-de-Ville 02150 Sissonne"),
("6, rue Winston-Churchill CS 40055 60321 Compiègne",
"6, rue Winston-Churchill 60321 Compiègne"),
("BP 80111 159, avenue Jacques-Douzans 31604 Muret Cedex",
"159, avenue Jacques-Douzans 31604 Muret"),
"159, avenue Jacques-Douzans 31 Muret"),
("BP 20169 Cite administrative - 8e étage Rue Gustave-Delory 59017 Lille",
"Cite administrative - Rue Gustave-Delory 59017 Lille"),
("12e étage Rue Gustave-Delory 59017 Lille",
Expand All @@ -52,9 +56,27 @@
("32bis Rue des Vosges93290",
"32bis Rue des Vosges 93290"),
("20 avenue de Ségur TSA 30719 75334 Paris Cedex 07",
"20 avenue de Ségur 75334 Paris"),
"20 avenue de Ségur 75 Paris"),
("20 avenue de Ségur TSA No30719 75334 Paris Cedex 07",
"20 avenue de Ségur 75 Paris"),
("20 avenue de Ségur TSA N 30719 75334 Paris Cedex 07",
"20 avenue de Ségur 75 Paris"),
("20 rue saint germain CIDEX 304 89110 Poilly-sur-tholon",
"20 rue saint germain 89110 Poilly-sur-tholon"),
("20 rue saint germain CIDEX N°304 89110 Poilly-sur-tholon",
"20 rue saint germain 89110 Poilly-sur-tholon"),
("20 rue saint germain 89110 Poilly-sur-tholon 01.23.45.67.89",
"20 rue saint germain 89110 Poilly-sur-tholon"),
("32bis Rue des Vosges93290 fax: 0123456789",
"32bis Rue des Vosges 93290"),
("32bis Rue des Vosges 93290 tel 01 23 45 67 89",
"32bis Rue des Vosges 93290"),
("32bis Rue des Vosges 93290 telecopieur. 01/23/45/67/89",
"32bis Rue des Vosges 93290"),
("32bis Rue des Vosges 93290 télécopieur, 01-23-45-67-89",
"32bis Rue des Vosges 93290"),
("10 BLD DES F F I 85300 CHALLANS",
"10 BLD DES F F I 85300 CHALLANS"), # done by glue_initials
])
def test_clean_query(input, expected):
assert clean_query(input) == expected
Expand Down Expand Up @@ -331,3 +353,22 @@ def test_make_municipality_labels(config):
'59000 Lille',
'Lille 59000',
]


@pytest.mark.parametrize("inputs,expected", [
    # Three or more consecutive initials are glued into a single token.
    (['allee', 'a', 'b', 'c', 'toto'],
     ['allee', 'abc', 'toto']),
    (['allee', 'a', 'b', 'c', 'toto', 'd', 'e', 'f'],
     ['allee', 'abc', 'toto', 'def']),
    # A non-alphabetic single character breaks the run.
    (['allee', 'a', '2', 'c', 'toto'],
     ['allee', 'a', '2', 'c', 'toto']),
    # Runs ending with the token stream are glued too.
    (['allee', 'a', 'b', 'c'],
     ['allee', 'abc']),
    (['allee', 'a', 'b', 'c', 'd'],
     ['allee', 'abcd']),
    (['allee', 'a', 'b', 'c', 'd', 'e'],
     ['allee', 'abcde']),
    # Fewer than three consecutive initials are left untouched.
    (['allee', 'a', 'b', 'toto'],
     ['allee', 'a', 'b', 'toto']),
    (['allee', 'a'],
     ['allee', 'a']),
])
def test_glue_initials(inputs, expected):
    """Check which runs of single-letter tokens glue_initials merges."""
    tokens = [Token(input_) for input_ in inputs]
    assert list(glue_initials(tokens)) == expected