Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] feat: Extend location insight #146

Open
wants to merge 18 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions data/ocr/location/city_blacklist_fr.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
aucun
bay
bio
bois
bonne
bouchon
branches
by
cannelle
chemin
choux
corps
cultures
don
doux
eu
four
frais
gas
gras
grasse
hours
issus
jury
long
loue
mace
mer
mures
ondes
our
oz
place
plats
poissons
racines
rance
ri
riche
ris
rouge
rue
sales
son
tasse
teneur
this
ur
vert
vieux
y
209 changes: 162 additions & 47 deletions robotoff/insights/ocr/location.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,14 @@ def load_cities_fr(source: Union[Path, BinaryIO, None] = None) -> Set[City]:
return set(cities)


def load_city_blacklist_fr(source: Union[Path, BinaryIO, None] = None) -> List[str]:
    """Load the blacklist of French city names.

    The blacklist is a plain-text file with one city name per line. Surrounding
    whitespace is stripped from every line and blank lines are ignored, so that
    the returned list never contains empty names.

    Args:
        source (Path or BinaryIO or None): Path of the blacklist file, or an
            already-open stream to read it from. Defaults to
            `settings.OCR_CITY_BLACKLIST_FR_PATH` when None.

    Returns:
        list of str: The blacklisted city names, in file order.
    """
    if source is None:
        source = settings.OCR_CITY_BLACKLIST_FR_PATH

    if hasattr(source, "read"):
        # Already-open stream: `open()` cannot take it, so read it directly.
        content = source.read()
        if isinstance(content, bytes):
            content = content.decode("utf-8")
    else:
        # Explicit encoding: do not depend on the platform default.
        with open(source, "rt", encoding="utf-8") as blacklist_file:
            content = blacklist_file.read()

    stripped_lines = (line.strip() for line in content.splitlines())
    return [name for name in stripped_lines if name]


class AddressExtractor:
"""Text processor to extract French addresses based on city name and postal code.

Expand All @@ -92,11 +100,11 @@ class AddressExtractor:
* The text is prepared by converting it to lower case, removing accents, and replacing
the characters ' and - with " " (space), as city names must follow this format.
* City names are searched for in the text.
* For each city name found, its corresponding postal code is searched for in the
surrounding text, at a maximum distance of `postal_code_search_distance`.
* If the postal code is found, the match is added to the list of returned
addresses, along with an extract of the text surrounding the address,
at a maximum distance of `text_extract_distance`.
* For each city name found:
* A postal code is searched for in the surrounding text, at a maximum distance
of `postal_code_search_distance`.
* Marker words are searched for at the left of the city name, at a maximum
distance of `marker_search_distance`.

Args:
cities (iterable of City): Set of cities to search for.
Expand All @@ -106,20 +114,29 @@ class AddressExtractor:
detected address to extract for returning.
"""

marker_words = ["transforme", "elabore", "produit"]

def __init__(
self,
cities: Iterable[City],
city_blacklist: Iterable[str],
postal_code_search_distance: int = 10,
marker_search_distance: int = 60,
text_extract_distance: int = 30,
):
self.cities = cities
self.cities = list(cities)
self.city_blacklist = list(city_blacklist)
self.postal_code_search_distance = postal_code_search_distance
self.marker_search_distance = marker_search_distance
self.text_extract_distance = text_extract_distance

self.cities_processor = KeywordProcessor()
for city in self.cities:
self.cities_processor.add_keyword(city.name, city)

self.marker_processor = KeywordProcessor()
self.marker_processor.add_keywords_from_list(self.marker_words)

def extract_addresses(self, content: Union[str, OCRResult]) -> List[RawInsight]:
"""Extract addresses from the given OCR result.

Expand All @@ -138,29 +155,44 @@ def extract_addresses(self, content: Union[str, OCRResult]) -> List[RawInsight]:

text = self.normalize_text(text)
city_matches = self.find_city_names(text)
# language = self.get_language(ocr_result)

locations = []
for city, city_start, city_end in city_matches:
pc_match = self.find_nearby_postal_code(text, city, city_start, city_end)
if pc_match is None:
continue
for city, blacklisted, city_start, city_end in city_matches:
location = {
# "language": language,
"country_code": "fr",
"city": {
"name": city.name,
"blacklisted": blacklisted,
},
"postal_code": None,
"markers": None,
}
location_start = city_start
location_end = city_end

pc, pc_start, pc_end = pc_match
address_start = min(city_start, pc_start) - self.text_extract_distance
address_end = max(city_end, pc_end) + self.text_extract_distance
text_extract = text[max(0, address_start) : min(len(text), address_end)]

locations.append(
RawInsight(
type=InsightType.location,
data={
"country_code": "fr",
"city_name": city.name,
"postal_code": city.postal_code,
"text_extract": text_extract,
},
)
)
pc_match = self.find_nearby_postal_code(text, city, city_start, city_end)
if pc_match is not None:
match_level, pc, pc_start, pc_end = pc_match
location["postal_code"] = {
"match_level": match_level, "value": pc
}
location_start = min(location_start, pc_start)
location_end = max(location_end, pc_end)

markers = self.find_nearby_markers(text, city_start, city_end)
if markers:
location["markers"] = [m[0] for m in markers]
location_start = min(location_start, *[m[1] for m in markers])
location_end = max(location_end, *[m[2] for m in markers])

text_extract = text[
max(0, location_start - self.text_extract_distance)
: min(len(text), location_end + self.text_extract_distance)
]
location["text_extract"] = text_extract
locations.append(RawInsight(type=InsightType.location, data=location))

return locations

Expand Down Expand Up @@ -188,30 +220,71 @@ def normalize_text(text: str) -> str:
text = strip_accents_ascii(text)
return text.replace("'", " ").replace("-", " ")

def find_city_names(self, text: str) -> List[Tuple[City, int, int]]:
@staticmethod
def get_language(ocr_result: OCRResult) -> Optional[str]:
    """Return the most probable language for the text of the `OCRResult`.

    The language with the highest value in the mapping returned by
    `ocr_result.get_languages()` is selected. The "words" entry is never a
    candidate (presumably it is a total word count rather than a language;
    confirm against `OCRResult.get_languages`). The mapping itself is left
    untouched.

    Args:
        ocr_result (OCRResult): The OCR result to process.

    Returns:
        str or None: The 2-letter language code of the most probable language
        detected for the text, or None if none could be detected (no language
        information, no language entry, or most frequent entry is "null").
    """
    languages = ocr_result.get_languages()
    if not languages:
        return None

    # Filter into a new dict instead of popping "words": the caller's mapping
    # is not mutated, and a missing "words" key does not raise.
    candidates = {
        language: count
        for language, count in languages.items()
        if language != "words"
    }
    if not candidates:
        return None

    # `max` returns the first maximal entry, like a stable descending sort.
    most_frequent = max(candidates, key=candidates.get)
    return None if most_frequent == "null" else most_frequent

def find_city_names(self, text: str) -> List[Tuple[City, bool, int, int]]:
"""Find all cities from the search set in the text.

Args:
text (str): Text to search city names in.

Returns:
list of (City, int, int): The list of `City`s which name was found in the
text, with the start and end indices of their names locations in the
text. Empty list if none found.
list of (City, bool, int, int): City matches found in the text, as a list
of tuples with items:

* `City` object for which the name was found in the text
* blacklist indicator: True if the city name is blacklisted, False otherwise
* start index of the city name in the text
* end index of the city name in the text

Empty list if no city name found.
"""
return self.cities_processor.extract_keywords(text, span_info=True)
city_matches = self.cities_processor.extract_keywords(text, span_info=True)
return [
(m[0], m[0].name in self.city_blacklist,) + m[1:]
for m in city_matches
]

def find_nearby_postal_code(
self, text: str, city: City, city_start: int, city_end: int
) -> Optional[Tuple[str, int, int]]:
) -> Optional[Tuple[str, str, int, int]]:
"""Search for a city's postal code close to its name in the text.

The postal code is searched at a maximum distance of
`postal_code_search_distance` from the city name.
The postal code is searched at a maximum distance (including the postal code
itself) of `postal_code_search_distance` from the city name.

Assumes digit-only postal code. Allows a non-digit directly next to it: for
the city "paris" with postal code "75000", "75000 paris" and "fr75000\n
paris" will match, "750006" will not. Allows a space between the department part
(first 2 digits) and the rest of the postal code: "75000 paris" and "75 000
paris" will match.

A postal code is searched for with multiple levels of specificity, in that
order:

Assumes digit-only postal code, allows non-digit directly next to it. For
example, for the city "paris" with postal code "75000", "75000 paris" and
"fr75000 paris" will match.
* exact: an exact postal code, e.g. 75000
* department: a postal code with only the department part (first 2 digits)
matching that of the city's postal code, e.g. 75xxx
* general: a sequence of 5 digits, e.g. xxxxx

Only the most specific match level is returned.

Args:
text (str): The OCR result text.
Expand All @@ -220,31 +293,73 @@ def find_nearby_postal_code(
city_end (int): End index of the city name match in `text`.

Returns:
(str, int, int) or None: If the `City`'s postal code was found close to
the city name match, it is returned along with its start and end indices
in the text. If it was not found, returns None.
(str, str, int, int) or None: If a postal code was found close to the city
name, a tuple with the following items:

* match level: "exact", "department" or "general"
* postal code match value, normalized (no spaces)
* start index of the postal code match in the text
* end index of the postal code match in the text

If no postal code match was found, returns None.
"""
if not city.postal_code.isdigit():
logger = get_logger(
"{}.{}".format(self.__module__, self.__class__.__name__)
)
logger.error("postal code contains non-digit characters: %s", city)
return None
pattern = r"(?:[^0-9]|^)({})(?:[^0-9]|$)".format(city.postal_code)

# Postal codes can be of the form "12345" or "12 345"
pc_patterns = [
("exact", "{}[ ]?{}".format(city.postal_code[:2], city.postal_code[2:])),
("department", "{}[ ]?[0-9]{{3}}".format(city.postal_code[:2])),
("general", "[0-9]{2}[ ]?[0-9]{3}"),
]
pattern = r"(?:[^0-9]|^)({})(?:[^0-9]|$)"

sub_start = max(0, city_start - self.postal_code_search_distance)
sub_end = min(len(text), city_end + self.postal_code_search_distance)
sub_text = text[sub_start:sub_end]

match = re.search(pattern, sub_text)
if match is None:
return None
else:
return match.group(1), sub_start + match.start(1), sub_start + match.end(1)
for match_level, pc_pattern in pc_patterns:
match = re.search(pattern.format(pc_pattern), sub_text)
if match is None:
continue
return (
match_level,
match.group(1).replace(" ", ""),
sub_start + match.start(1),
sub_start + match.end(1),
)
return None

def find_nearby_markers(
    self, text: str, city_start: int, city_end: int
) -> List[Tuple[str, int, int]]:
    """Search for marker words near a city name.

    Only the text on the left of the city name is searched, at a maximum
    distance of `marker_search_distance`, as markers are expected in patterns
    like "élaboré à quimper".

    Args:
        text (str): The OCR result text.
        city_start (int): Start index of the city name match.
        city_end (int): End index of the city name. Currently unused, as the
            right side of the city name is not searched.

    Returns:
        list of (str, int, int): Marker words found close to the city name,
        as a list of tuples with the marker word, and its start and end indices
        in the text.
    """
    sub_start = max(0, city_start - self.marker_search_distance)
    sub_text = text[sub_start:city_start]
    matches = self.marker_processor.extract_keywords(sub_text, span_info=True)

    # The window is cut at a fixed character offset, so it may begin in the
    # middle of a word (e.g. "re|produit"). The keyword processor only sees
    # the window, so it would take the cut as a word boundary and report the
    # word's suffix as a marker. Discard such a match at the window start.
    previous_char = text[sub_start - 1] if sub_start > 0 else ""
    window_cuts_word = previous_char.isalnum() or previous_char == "_"

    return [
        (marker, sub_start + start, sub_start + end)
        for marker, start, end in matches
        if not (window_cuts_word and start == 0)
    ]


ADDRESS_EXTRACTOR_STORE = CachedStore(
lambda: AddressExtractor(load_cities_fr()), expiration_interval=None
lambda: AddressExtractor(load_cities_fr(), load_city_blacklist_fr()),
expiration_interval=None,
)


Expand Down
3 changes: 2 additions & 1 deletion robotoff/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,8 @@
OCR_IMAGE_FLAG_MISCELLANEOUS_PATH = OCR_DATA_DIR / "image_flag_miscellaneous.txt"
OCR_PACKAGING_DATA_PATH = OCR_DATA_DIR / "packaging.txt"
OCR_TRACE_ALLERGEN_DATA_PATH = OCR_DATA_DIR / "trace_allergen.txt"
OCR_CITIES_FR_PATH = OCR_DATA_DIR / "cities_laposte_hexasmal.json.gz"
OCR_CITIES_FR_PATH = OCR_DATA_DIR / "location" / "cities_laposte_hexasmal.json.gz"
OCR_CITY_BLACKLIST_FR_PATH = OCR_DATA_DIR / "location" / "city_blacklist_fr.txt"


BRAND_PREFIX_PATH = DATA_DIR / "brand_prefix.json"
Expand Down