diff --git a/data/ocr/cities_laposte_hexasmal.json.gz b/data/ocr/location/cities_laposte_hexasmal.json.gz similarity index 100% rename from data/ocr/cities_laposte_hexasmal.json.gz rename to data/ocr/location/cities_laposte_hexasmal.json.gz diff --git a/data/ocr/location/city_blacklist_fr.txt b/data/ocr/location/city_blacklist_fr.txt new file mode 100644 index 0000000000..e5eb2ff917 --- /dev/null +++ b/data/ocr/location/city_blacklist_fr.txt @@ -0,0 +1,51 @@ +aucun +bay +bio +bois +bonne +bouchon +branches +by +cannelle +chemin +choux +corps +cultures +don +doux +eu +four +frais +gas +gras +grasse +hours +issus +jury +long +loue +mace +mer +mures +ondes +our +oz +place +plats +poissons +racines +rance +ri +riche +ris +rouge +rue +sales +son +tasse +teneur +this +ur +vert +vieux +y diff --git a/robotoff/insights/ocr/location.py b/robotoff/insights/ocr/location.py index 447d83d20a..ce52ade346 100644 --- a/robotoff/insights/ocr/location.py +++ b/robotoff/insights/ocr/location.py @@ -83,6 +83,14 @@ def load_cities_fr(source: Union[Path, BinaryIO, None] = None) -> Set[City]: return set(cities) +def load_city_blacklist_fr(source: Union[Path, BinaryIO, None] = None) -> List[str]: + if source is None: + source = settings.OCR_CITY_BLACKLIST_FR_PATH + + with open(source, "rt") as blacklist_file: + return [line.strip() for line in blacklist_file.readlines()] + + class AddressExtractor: """Text processor to extract French addresses based on city name and postal code. @@ -92,11 +100,11 @@ class AddressExtractor: * The text is prepared by taking it lower case, removing accents, and replacing the characters ' and - with " " (space), as city names must follow this format. * City names are searched for in the text. - * For each city name found, its corresponding postal code is searched for in the - surrounding text, at a maximum distance of `postal_code_search_distance`. 
- * If the postal code is found, the match is added to the list of returned - addresses, along with an extract of the text surrounding the address, - at a maximum distance of `text_extract_distance`. + * For each city name found: + * A postal code is searched for in the surrounding text, at a maximum distance + of `postal_code_search_distance`. + * Marker words are searched for at the left of the city name, at a maximum + distance of `marker_search_distance`. Args: cities (iterable of City): Set of cities to search for. @@ -106,20 +114,29 @@ class AddressExtractor: detected address to extract for returning. """ + marker_words = ["transforme", "elabore", "produit"] + def __init__( self, cities: Iterable[City], + city_blacklist: Iterable[str], postal_code_search_distance: int = 10, + marker_search_distance: int = 60, text_extract_distance: int = 30, ): - self.cities = cities + self.cities = list(cities) + self.city_blacklist = list(city_blacklist) self.postal_code_search_distance = postal_code_search_distance + self.marker_search_distance = marker_search_distance self.text_extract_distance = text_extract_distance self.cities_processor = KeywordProcessor() for city in self.cities: self.cities_processor.add_keyword(city.name, city) + self.marker_processor = KeywordProcessor() + self.marker_processor.add_keywords_from_list(self.marker_words) + def extract_addresses(self, content: Union[str, OCRResult]) -> List[RawInsight]: """Extract addresses from the given OCR result. 
@@ -138,29 +155,44 @@ def extract_addresses(self, content: Union[str, OCRResult]) -> List[RawInsight]: text = self.normalize_text(text) city_matches = self.find_city_names(text) + # language = self.get_language(ocr_result) locations = [] - for city, city_start, city_end in city_matches: - pc_match = self.find_nearby_postal_code(text, city, city_start, city_end) - if pc_match is None: - continue + for city, blacklisted, city_start, city_end in city_matches: + location = { + # "language": language, + "country_code": "fr", + "city": { + "name": city.name, + "blacklisted": blacklisted, + }, + "postal_code": None, + "markers": None, + } + location_start = city_start + location_end = city_end - pc, pc_start, pc_end = pc_match - address_start = min(city_start, pc_start) - self.text_extract_distance - address_end = max(city_end, pc_end) + self.text_extract_distance - text_extract = text[max(0, address_start) : min(len(text), address_end)] - - locations.append( - RawInsight( - type=InsightType.location, - data={ - "country_code": "fr", - "city_name": city.name, - "postal_code": city.postal_code, - "text_extract": text_extract, - }, - ) - ) + pc_match = self.find_nearby_postal_code(text, city, city_start, city_end) + if pc_match is not None: + match_level, pc, pc_start, pc_end = pc_match + location["postal_code"] = { + "match_level": match_level, "value": pc + } + location_start = min(location_start, pc_start) + location_end = max(location_end, pc_end) + + markers = self.find_nearby_markers(text, city_start, city_end) + if markers: + location["markers"] = [m[0] for m in markers] + location_start = min(location_start, *[m[1] for m in markers]) + location_end = max(location_end, *[m[2] for m in markers]) + + text_extract = text[ + max(0, location_start - self.text_extract_distance) + : min(len(text), location_end + self.text_extract_distance) + ] + location["text_extract"] = text_extract + locations.append(RawInsight(type=InsightType.location, data=location)) return locations 
@@ -188,30 +220,71 @@ def normalize_text(text: str) -> str: text = strip_accents_ascii(text) return text.replace("'", " ").replace("-", " ") - def find_city_names(self, text: str) -> List[Tuple[City, int, int]]: + @staticmethod + def get_language(ocr_result: OCRResult) -> Optional[str]: + """Return the most probable language for the text of the `OCRResult`. + + Args: + ocr_result (OCRResult): The OCR result to process. + + Returns: + str or None: The 2-letter language code of the most probable language + detected for the text, or None if none could be detected. + """ + languages = ocr_result.get_languages() + if languages is None: + return None + languages.pop("words") + sorted_languages = sorted(languages.items(), key=lambda x: x[1], reverse=True) + most_frequent = sorted_languages[0][0] + return None if most_frequent == "null" else most_frequent + + def find_city_names(self, text: str) -> List[Tuple[City, bool, int, int]]: """Find all cities from the search set in the text. Args: text (str): Text to search city names in. Returns: - list of (City, int, int): The list of `City`s which name was found in the - text, with the start and end indices of their names locations in the - text. Empty list if none found. + list of (City, bool, int, int): City matches found in the text, as a list + of tuples with items: + + * `City` object for which the name was found in the text + * blacklist indicator: True if the city name is blacklisted, False otherwise + * start index of the city name in the text + * end index of the city name in the text + + Empty list if no city name found.
""" - return self.cities_processor.extract_keywords(text, span_info=True) + city_matches = self.cities_processor.extract_keywords(text, span_info=True) + return [ + (m[0], m[0].name in self.city_blacklist,) + m[1:] + for m in city_matches + ] def find_nearby_postal_code( self, text: str, city: City, city_start: int, city_end: int - ) -> Optional[Tuple[str, int, int]]: + ) -> Optional[Tuple[str, str, int, int]]: """Search for a city's postal code close to its name in the text. - The postal code is searched at a maximum distance of - `postal_code_search_distance` from the city name. + The postal code is searched at a maximum distance (including the postal code + itself) of `postal_code_search_distance` from the city name. + + Assumes digit-only postal code. Allows a non-digit directly next to it: for + the city "paris" with postal code "75000", "75000 paris" and "fr75000\n + paris" will match, "750006" will not. Allows a space between the department part + (first 2 digits) and the rest of the postal code: "75000 paris" and "75 000 + paris" will match. + + A postal code is searched for with multiple levels of specificity, in that + order: - Assumes digit-only postal code, allows non-digit directly next to it. For - example, for the city "paris" with postal code "75000", "75000 paris" and - "fr75000 paris" will match. + * exact: an exact postal code, e.g. 75000 + * department: a postal code with only the department part (first 2 digits) + matching the city one's, e.g. 75xxx + * general: a sequence of 5 digits, e.g. xxxxx + + Only the most specific match level is returned. Args: text (str): The OCR result text. @@ -220,9 +293,15 @@ def find_nearby_postal_code( city_end (int): End index of the city name match in `text`. Returns: - (str, int, int) or None: If the `City`'s postal code was found close to - the city name match, it is returned along with its start and end indices - in the text. If it was not found, returns None. 
+ (str, str, int, int) or None: If a postal code was found close to the city + name, a tuple with the following items: + + * match level: "exact", "department" or "general" + * postal code match value, normalized (no spaces) + * start index of the postal code match in the text + * end index of the postal code match in the text + + If no postal code match was found, returns None. """ if not city.postal_code.isdigit(): logger = get_logger( @@ -230,21 +309,57 @@ def find_nearby_postal_code( ) logger.error("postal code contains non-digit characters: %s", city) return None - pattern = r"(?:[^0-9]|^)({})(?:[^0-9]|$)".format(city.postal_code) + + # Postal codes can be of the form "12345" or "12 345" + pc_patterns = [ + ("exact", "{}[ ]?{}".format(city.postal_code[:2], city.postal_code[2:])), + ("department", "{}[ ]?[0-9]{{3}}".format(city.postal_code[:2])), + ("general", "[0-9]{2}[ ]?[0-9]{3}"), + ] + pattern = r"(?:[^0-9]|^)({})(?:[^0-9]|$)" sub_start = max(0, city_start - self.postal_code_search_distance) sub_end = min(len(text), city_end + self.postal_code_search_distance) sub_text = text[sub_start:sub_end] - match = re.search(pattern, sub_text) - if match is None: - return None - else: - return match.group(1), sub_start + match.start(1), sub_start + match.end(1) + for match_level, pc_pattern in pc_patterns: + match = re.search(pattern.format(pc_pattern), sub_text) + if match is None: + continue + return ( + match_level, + match.group(1).replace(" ", ""), + sub_start + match.start(1), + sub_start + match.end(1), + ) + return None + + def find_nearby_markers( + self, text: str, city_start: int, city_end: int + ) -> List[Tuple[str, int, int]]: + """Search for marker words near a city name. + + Search only on the left, as the marker always comes before the city name, e.g. "élaboré à quimper". + + Args: + text (str): The OCR result text. + city_start (int): Start index of the city name match. + city_end (int): End index of the city name.
+ + Returns: + list of (str, int, int): Marker words found close to the city name, + as a list of tuples with the marker word, and its start and end indices + in the text. + """ + sub_start = max(0, city_start - self.marker_search_distance) + sub_text = text[sub_start:city_start] + matches = self.marker_processor.extract_keywords(sub_text, span_info=True) + return [(m[0], sub_start + m[1], sub_start + m[2]) for m in matches] ADDRESS_EXTRACTOR_STORE = CachedStore( - lambda: AddressExtractor(load_cities_fr()), expiration_interval=None + lambda: AddressExtractor(load_cities_fr(), load_city_blacklist_fr()), + expiration_interval=None, ) diff --git a/robotoff/settings.py b/robotoff/settings.py index 38c9750ffb..598597d3b6 100644 --- a/robotoff/settings.py +++ b/robotoff/settings.py @@ -98,7 +98,8 @@ OCR_IMAGE_FLAG_MISCELLANEOUS_PATH = OCR_DATA_DIR / "image_flag_miscellaneous.txt" OCR_PACKAGING_DATA_PATH = OCR_DATA_DIR / "packaging.txt" OCR_TRACE_ALLERGEN_DATA_PATH = OCR_DATA_DIR / "trace_allergen.txt" -OCR_CITIES_FR_PATH = OCR_DATA_DIR / "cities_laposte_hexasmal.json.gz" +OCR_CITIES_FR_PATH = OCR_DATA_DIR / "location" / "cities_laposte_hexasmal.json.gz" +OCR_CITY_BLACKLIST_FR_PATH = OCR_DATA_DIR / "location" / "city_blacklist_fr.txt" BRAND_PREFIX_PATH = DATA_DIR / "brand_prefix.json"