From cc197e33bd6fae2b8388ee04bdfe8b1631cf96e8 Mon Sep 17 00:00:00 2001 From: Hugo Perrier Date: Thu, 5 Dec 2024 17:23:53 +0100 Subject: [PATCH 01/10] :recycle: Update MelusineRegex tutorial --- .../pipelines/preprocessing_pipeline.yaml | 3 + melusine/conf/processors/refined_tagger.yaml | 2 + melusine/message.py | 56 ++++++-- melusine/processors.py | 124 +++++++++++------- 4 files changed, 122 insertions(+), 63 deletions(-) create mode 100644 melusine/conf/processors/refined_tagger.yaml diff --git a/melusine/conf/pipelines/preprocessing_pipeline.yaml b/melusine/conf/pipelines/preprocessing_pipeline.yaml index cec8380..b9bf8bf 100644 --- a/melusine/conf/pipelines/preprocessing_pipeline.yaml +++ b/melusine/conf/pipelines/preprocessing_pipeline.yaml @@ -12,6 +12,9 @@ preprocessing_pipeline: - class_name: ContentTagger config_key: content_tagger module: melusine.processors + - class_name: RefinedTagger + config_key: refined_tagger + module: melusine.processors - class_name: TransferredEmailProcessor config_key: transferred_email_processor module: melusine.processors diff --git a/melusine/conf/processors/refined_tagger.yaml b/melusine/conf/processors/refined_tagger.yaml new file mode 100644 index 0000000..1f3bd7b --- /dev/null +++ b/melusine/conf/processors/refined_tagger.yaml @@ -0,0 +1,2 @@ +content_tagger: + default_tag: BODY diff --git a/melusine/message.py b/melusine/message.py index e58e09f..2fc30a0 100644 --- a/melusine/message.py +++ b/melusine/message.py @@ -8,7 +8,7 @@ import re from datetime import datetime -from typing import Iterable, List, Optional, Tuple +from typing import Any, Dict, Iterable, List, Optional, Tuple from melusine import config @@ -20,6 +20,8 @@ class Message: DEFAULT_STR_LINE_LENGTH = 120 DEFAULT_STR_TAG_NAME_LENGTH = 22 + MAIN_TAG_TYPE = "refined_tag" + FALLBACK_TAG_TYPE = "base_tag" def __init__( self, @@ -29,7 +31,7 @@ def __init__( date: Optional[datetime] = None, text_from: str = "", text_to: Optional[str] = None, - tags: 
Optional[List[Tuple[str, str]]] = None, + tags: Optional[List[Dict[str, Any]]] = None, ): """ Attributes initialization. @@ -84,8 +86,11 @@ def str_line_length(self) -> int: return config["message"].get("str_line_length", self.DEFAULT_STR_LINE_LENGTH) def extract_parts( - self, target_tags: Optional[Iterable[str]] = None, stop_at: Optional[Iterable[str]] = None - ) -> List[Tuple[str, str]]: + self, + target_tags: Optional[Iterable[str]] = None, + stop_at: Optional[Iterable[str]] = None, + tag_type: str = MAIN_TAG_TYPE, + ) -> List[Dict[str, Any]]: """ Function to extract target tags from the message. @@ -95,17 +100,23 @@ def extract_parts( Tags to be extracted. stop_at: Tags for which extraction should stop. + tag_type: + Type of tags to consider. Returns ------- - _: List[Tuple[str, str]] - List of extracted tags. + _: List of extracted tags. """ if not self.tags: return [] # List of tags in the message - tag_name_list: List[str] = [x[0] for x in self.tags] + try: + tag_name_list: List[str] = [x[tag_type] for x in self.tags] + # If tag_type is not available, fall back on base_tag + except KeyError: + tag_type = self.FALLBACK_TAG_TYPE + tag_name_list: List[str] = [x[tag_type] for x in self.tags] if target_tags is None: target_tags = tag_name_list @@ -122,11 +133,14 @@ def extract_parts( else: effective_tags = self.tags - return [x for x in effective_tags if x[0] in target_tags] + return [x for x in effective_tags if x[tag_type] in target_tags] def extract_last_body( - self, target_tags: Iterable[str] = ("BODY",), stop_at: Iterable[str] = ("GREETINGS",) - ) -> List[Tuple[str, str]]: + self, + target_tags: Iterable[str] = ("BODY",), + stop_at: Iterable[str] = ("GREETINGS",), + tag_type: str = MAIN_TAG_TYPE + ) -> List[Dict[str, Any]]: """ Extract the BODY parts of the last message in the email. @@ -134,17 +148,19 @@ def extract_last_body( ---------- target_tags: Iterable[str] stop_at: Iterable[str] + tag_type: Type of tags to consider. 
Returns ------- _: List[Tuple[str, str]] """ - return self.extract_parts(target_tags=target_tags, stop_at=stop_at) + return self.extract_parts(target_tags=target_tags, stop_at=stop_at, tag_type=tag_type) def has_tags( self, target_tags: Iterable[str] = ("BODY",), stop_at: Optional[Iterable[str]] = None, + tag_type: str = MAIN_TAG_TYPE, ) -> bool: """ Function to check if input tags are present in the message. @@ -155,6 +171,8 @@ def has_tags( Tags of interest. stop_at: Tags for which extraction should stop. + tag_type: + Type of tags to consider. Returns ------- @@ -168,7 +186,12 @@ def has_tags( stop_at = set() found: bool = False - for tag, _ in self.tags: + for tag_data in self.tags: + try: + tag = tag_data[tag_type] + except KeyError: + tag = tag_data[self.FALLBACK_TAG_TYPE] + # Check if tag in tags of interest if tag in target_tags: found = True @@ -180,7 +203,7 @@ def has_tags( return found - def format_tags(self) -> str: + def format_tags(self, tag_type: str = MAIN_TAG_TYPE) -> str: """ Create a pretty formatted representation of text and their associated tags. 
@@ -192,7 +215,12 @@ def format_tags(self) -> str: else: tag_text_length = self.str_line_length - self.str_tag_name_length text = "" - for tag_name, tag_text in self.tags: + for tag_data in self.tags: + try: + tag_name = tag_data[tag_type] + except KeyError: + tag_name = tag_data[self.FALLBACK_TAG_TYPE] + tag_text = tag_data["base_text"] text += tag_text.ljust(tag_text_length, ".") + tag_name.rjust(self.str_tag_name_length, ".") + "\n" return text.strip() diff --git a/melusine/processors.py b/melusine/processors.py index c40e49b..325d16f 100644 --- a/melusine/processors.py +++ b/melusine/processors.py @@ -746,7 +746,6 @@ def __init__( default_tag: str = "BODY", valid_part_regex: str = r"[a-z0-9?]", default_regex_flag: int = re.IGNORECASE, - post_process: bool = True, text_attribute: str = "text", ): """ @@ -784,9 +783,6 @@ def __init__( # Set text attribute self.text_attribute = text_attribute - # Activate post-processing - self.post_process = post_process - # Pattern to split text into sentences (=parts) self.split_pattern = self.compile_split_pattern() @@ -933,8 +929,6 @@ def compile_tag_regex(self, tag: str) -> re.Pattern: regex = re.compile(regex, flags=self.default_regex_flag) except re.error: raise ValueError(f"Invalid regex for tag {tag}:\n{regex}") - elif isinstance(regex, re.Pattern): - pass else: raise ValueError( f"Tag {tag} does not return any of the supported types : " @@ -965,10 +959,6 @@ def tag_text(self, text: str) -> list[tuple[str, str]]: for part in parts: tags.append(self.tag_part(part)) - # Post process tags - if self.post_process: - tags = self.post_process_tags(tags) - return tags def split_text(self, text: str) -> list[str]: @@ -1045,7 +1035,7 @@ def clean_up_after_split(parts: list[str | None]) -> list[str]: return clean_parts - def tag_part(self, part: str) -> tuple[str, str]: + def tag_part(self, part: str) -> dict[str, Any]: """ Method to apply tagging on a text chunk (sentence/part). 
@@ -1056,20 +1046,39 @@ def tag_part(self, part: str) -> tuple[str, str]: Returns ------- - match_tag: str - Output tag - part: str - Original text + _: tag data such as text, base_tag_list or base_tag """ - match_tag = self.default_tag - + match_tag_list = [] for tag, regex in self.regex_dict.items(): match = regex.match(part) if match: - match_tag = tag - break + match_tag_list.append(tag) + + if not match_tag_list: + match_tag_list.append(self.default_tag) - return match_tag, part + return { + "base_text": part, + "base_tag_list": match_tag_list, + "base_tag": self.get_base_tag(match_tag_list), + } + + def get_base_tag(self, match_tag_list: list[str]) -> str: + """ + Given a list of tags, return the base tag using the hierarchy from the tag_list attribute. + + Parameters + ---------- + match_tag_list: List of tags found in the text. + + Returns + ------- + _: Base tag + """ + for tag in self.tag_list: + if tag in match_tag_list: + return tag + return self.default_tag @staticmethod def word_block(n_words: int, word_character_only: bool = False) -> str: @@ -1153,21 +1162,6 @@ def find_matching_regex_patterns(self, part: str, regex: TagPattern) -> list[str return matching_regex_list - @abstractmethod - def post_process_tags(self, tags: list[tuple[str, str]]) -> list[tuple[str, str]]: - """ - Method to apply tagging rules posterior to the standard regex tagging. 
- - Parameters - ---------- - tags: list[tuple[str, str]] - Original tags - - Returns - ------- - _: list[tuple[str, str]] - Post-processed tags - """ class ContentTagger(BaseContentTagger): @@ -1554,39 +1548,70 @@ def SIGNATURE(self) -> str | list[str] | re.Pattern: r"^[A-Za-z]+(?: [A-Za-z]+)*, le \d{1,2} [A-Za-z]+ \d{4}.{,3}$", ] - def post_process_tags(self, tags: list[tuple[str, str]]) -> list[tuple[str, str]]: + +class RefinedTagger(MelusineTransformer): + BASE_TAG_KEY = "base_tag" + REFINED_TAG_KEY = "refined_tag" + + def __init__( + self, + input_columns: str = "messages", + output_columns: str = "messages", + default_tag: str = "BODY" + ): + """ + Parameters + ---------- + input_columns: str + Input columns for the transform operation + output_columns: str + Outputs columns for the transform operation + default_tag: str + Default tag to apply to untagged text + """ + self.default_tag = default_tag + + super().__init__( + input_columns=input_columns, + output_columns=output_columns, + func=self.post_process_messages, + ) + + def post_process_messages(self, messages: list[Message]) -> list[Message]: """ Method to apply tagging rules posterior to the standard regex tagging. 
Parameters ---------- - tags: list[tuple[str, str]] - Original tags + messages: list of messages Returns ------- - _: list[tuple[str, str]] - Post-processed tags + messages: list of messages post-processed """ + for message in messages: + message.tags = self.post_process_tags(message.tags) + + return messages + + def post_process_tags(self, tags: list[dict[str, Any]]) -> list[dict[str, Any]]: # Signature lines containing first/last name tags = self.detect_name_signature(tags) return tags - def detect_name_signature(self, tags: list[tuple[str, str]]) -> list[tuple[str, str]]: + def detect_name_signature(self, tags: list[dict[str, Any]]) -> list[dict[str, Any]]: """ Method to detect lines containing First name / Surname Ex: Mr Joe Dupond Parameters ---------- - tags: list[tuple[str, str]] - Original tags + tags: Original tags Returns ------- - _: list[tuple[str, str]] - Post processed tags + _: Post processed tags """ # First name / Last name Signatures capitalized_words: str = r"[A-Z][-'A-za-zÀ-ÿ]{,10}" @@ -1599,18 +1624,19 @@ def detect_name_signature(self, tags: list[tuple[str, str]]) -> list[tuple[str, # Forbidden words (lowercase) forbidden_words: set[str] = {"urgent", "attention"} - new_tags: list[tuple[str, str]] = list() - for tag, text in tags: + for tag_data in tags: + tag = tag_data[self.BASE_TAG_KEY] if tag == self.default_tag: + text = tag_data[self.BASE_TAG_KEY] match = re.match(line_with_name, text) has_forbidden_words: bool = bool(forbidden_words.intersection(text.lower().split())) if match and not has_forbidden_words: tag = "SIGNATURE_NAME" - new_tags.append((tag, text)) + tag_data[self.REFINED_TAG_KEY] = tag - return new_tags + return tags class TransferredEmailProcessor(MelusineTransformer): From 314e92772b10fdedc3092cc96c1cf03febc28b70 Mon Sep 17 00:00:00 2001 From: Hugo Perrier Date: Thu, 5 Dec 2024 17:30:14 +0100 Subject: [PATCH 02/10] :recycle: Refactor ContentTagger mechanism --- melusine/message.py | 2 +- melusine/processors.py | 8 +------- 2 
files changed, 2 insertions(+), 8 deletions(-) diff --git a/melusine/message.py b/melusine/message.py index 2fc30a0..bcac579 100644 --- a/melusine/message.py +++ b/melusine/message.py @@ -139,7 +139,7 @@ def extract_last_body( self, target_tags: Iterable[str] = ("BODY",), stop_at: Iterable[str] = ("GREETINGS",), - tag_type: str = MAIN_TAG_TYPE + tag_type: str = MAIN_TAG_TYPE, ) -> List[Dict[str, Any]]: """ Extract the BODY parts of the last message in the email. diff --git a/melusine/processors.py b/melusine/processors.py index 325d16f..5bd8f4f 100644 --- a/melusine/processors.py +++ b/melusine/processors.py @@ -1163,7 +1163,6 @@ def find_matching_regex_patterns(self, part: str, regex: TagPattern) -> list[str return matching_regex_list - class ContentTagger(BaseContentTagger): """ Class to add tags to a text. @@ -1553,12 +1552,7 @@ class RefinedTagger(MelusineTransformer): BASE_TAG_KEY = "base_tag" REFINED_TAG_KEY = "refined_tag" - def __init__( - self, - input_columns: str = "messages", - output_columns: str = "messages", - default_tag: str = "BODY" - ): + def __init__(self, input_columns: str = "messages", output_columns: str = "messages", default_tag: str = "BODY"): """ Parameters ---------- From 673c071a0bb50f41757dc682c65462f38e2bdd98 Mon Sep 17 00:00:00 2001 From: Hugo Perrier Date: Mon, 9 Dec 2024 14:09:25 +0100 Subject: [PATCH 03/10] :recycle: Refactor ContentTagger mechanism --- melusine/base.py | 2 +- melusine/conf/pipelines/demo_pipeline.yaml | 3 + melusine/conf/processors/refined_tagger.yaml | 2 +- melusine/detectors.py | 22 +- melusine/{io => io_mixin}/__init__.py | 2 +- melusine/{io => io_mixin}/_classes.py | 4 - melusine/message.py | 56 +- melusine/pipeline.py | 2 +- melusine/processors.py | 87 +- tests/base/test_message.py | 56 +- tests/detectors/test_thanks_detector.py | 30 +- .../detectors/test_vacation_reply_detector.py | 46 +- tests/functional/test_emails_fixtures.py | 109 ++- tests/{io => io_mixin}/__init__.py | 0 tests/{io => 
io_mixin}/test_io_mixin.py | 4 +- .../processors/test_content_refined_tagger.py | 819 ++++++++++++++++++ tests/processors/test_content_tagger.py | 787 ----------------- tests/processors/test_processors.py | 105 ++- 18 files changed, 1162 insertions(+), 974 deletions(-) rename melusine/{io => io_mixin}/__init__.py (66%) rename melusine/{io => io_mixin}/_classes.py (93%) rename tests/{io => io_mixin}/__init__.py (100%) rename tests/{io => io_mixin}/test_io_mixin.py (93%) create mode 100644 tests/processors/test_content_refined_tagger.py delete mode 100644 tests/processors/test_content_tagger.py diff --git a/melusine/base.py b/melusine/base.py index 36c347a..b7cd199 100644 --- a/melusine/base.py +++ b/melusine/base.py @@ -25,7 +25,7 @@ from sklearn.base import BaseEstimator, TransformerMixin from melusine.backend import backend -from melusine.io import IoMixin +from melusine.io_mixin import IoMixin logger = logging.getLogger(__name__) diff --git a/melusine/conf/pipelines/demo_pipeline.yaml b/melusine/conf/pipelines/demo_pipeline.yaml index 0b13b1f..3da0c72 100644 --- a/melusine/conf/pipelines/demo_pipeline.yaml +++ b/melusine/conf/pipelines/demo_pipeline.yaml @@ -12,6 +12,9 @@ demo_pipeline: - class_name: ContentTagger config_key: content_tagger module: melusine.processors + - class_name: RefinedTagger + config_key: refined_tagger + module: melusine.processors - class_name: TextExtractor config_key: text_extractor module: melusine.processors diff --git a/melusine/conf/processors/refined_tagger.yaml b/melusine/conf/processors/refined_tagger.yaml index 1f3bd7b..321a912 100644 --- a/melusine/conf/processors/refined_tagger.yaml +++ b/melusine/conf/processors/refined_tagger.yaml @@ -1,2 +1,2 @@ -content_tagger: +refined_tagger: default_tag: BODY diff --git a/melusine/detectors.py b/melusine/detectors.py index 96bf51a..905c0c0 100644 --- a/melusine/detectors.py +++ b/melusine/detectors.py @@ -6,7 +6,7 @@ """ -from typing import Any, Dict, List, Tuple +from typing import 
Any, Dict, List from melusine.base import MelusineDetector, MelusineItem, MelusineRegex from melusine.message import Message @@ -95,19 +95,12 @@ def pre_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineIte target_tags={self.BODY_PART}, stop_at={self.GREETINGS_PART} ) - # Extract the THANKS part in the last message - thanks_parts: List[Tuple[str, str]] = row[self.messages_column][0].extract_parts(target_tags={self.THANKS_PART}) - - # Compute THANKS text - if not thanks_parts: - thanks_text: str = "" - else: - thanks_text = "\n".join(x[1] for x in thanks_parts) + # Extract the THANKS text in the last message + thanks_text = row[self.messages_column][0].extract_text(target_tags={self.THANKS_PART}) # Save debug data if debug_mode: debug_dict = { - self.THANKS_PARTS_COL: thanks_parts, self.THANKS_TEXT_COL: thanks_text, self.HAS_BODY: has_body, } @@ -236,20 +229,13 @@ def pre_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineIte """ # Last message body last_message: Message = row[self.messages_column][0] - body_parts = last_message.extract_last_body() - - if body_parts: - row[self.CONST_TEXT_COL_NAME] = "\n".join(text for tag, text in body_parts) - else: - row[self.CONST_TEXT_COL_NAME] = "" + row[self.CONST_TEXT_COL_NAME] = last_message.extract_text(target_tags=("BODY",), stop_at=("GREETINGS",)) # Prepare and save debug data if debug_mode: debug_dict: Dict[str, Any] = { self.CONST_DEBUG_TEXT_KEY: row[self.CONST_TEXT_COL_NAME], } - if self.messages_column: - debug_dict[self.CONST_DEBUG_PARTS_KEY] = body_parts row[self.debug_dict_col].update(debug_dict) return row diff --git a/melusine/io/__init__.py b/melusine/io_mixin/__init__.py similarity index 66% rename from melusine/io/__init__.py rename to melusine/io_mixin/__init__.py index 8c9e89a..1249eeb 100644 --- a/melusine/io/__init__.py +++ b/melusine/io_mixin/__init__.py @@ -2,6 +2,6 @@ The melusine.io module includes classes for input/output data. 
""" -from melusine.io._classes import IoMixin +from melusine.io_mixin._classes import IoMixin __all__ = ["IoMixin"] diff --git a/melusine/io/_classes.py b/melusine/io_mixin/_classes.py similarity index 93% rename from melusine/io/_classes.py rename to melusine/io_mixin/_classes.py index e7c6c24..f5128ba 100644 --- a/melusine/io/_classes.py +++ b/melusine/io_mixin/_classes.py @@ -27,10 +27,6 @@ class IoMixin: Defines generic load methods. """ - def __init__(self, **kwargs: Any): - """Initialize attribute.""" - self.json_exclude_list: list[str] = ["_func", "json_exclude_list"] - @classmethod def from_config( cls: type[T], diff --git a/melusine/message.py b/melusine/message.py index bcac579..4f4b112 100644 --- a/melusine/message.py +++ b/melusine/message.py @@ -8,7 +8,7 @@ import re from datetime import datetime -from typing import Any, Dict, Iterable, List, Optional, Tuple +from typing import Any, Dict, Iterable, List, Optional from melusine import config @@ -22,6 +22,7 @@ class Message: DEFAULT_STR_TAG_NAME_LENGTH = 22 MAIN_TAG_TYPE = "refined_tag" FALLBACK_TAG_TYPE = "base_tag" + MAIN_TEXT_TYPE = "base_text" def __init__( self, @@ -65,6 +66,9 @@ def __init__( self.clean_header: str = "" self.clean_text: str = "" + self.effective_tag_key = "base_tag" + self.effective_text_key = "base_text" + @property def str_tag_name_length(self) -> int: """ @@ -89,7 +93,7 @@ def extract_parts( self, target_tags: Optional[Iterable[str]] = None, stop_at: Optional[Iterable[str]] = None, - tag_type: str = MAIN_TAG_TYPE, + tag_type: Optional[str] = None, ) -> List[Dict[str, Any]]: """ Function to extract target tags from the message. 
@@ -110,13 +114,11 @@ def extract_parts( if not self.tags: return [] + if tag_type is None: + tag_type = self.effective_tag_key + # List of tags in the message - try: - tag_name_list: List[str] = [x[tag_type] for x in self.tags] - # If tag_type is not available, fall back on base_tag - except KeyError: - tag_type = self.FALLBACK_TAG_TYPE - tag_name_list: List[str] = [x[tag_type] for x in self.tags] + tag_name_list: List[str] = [x[tag_type] for x in self.tags] if target_tags is None: target_tags = tag_name_list @@ -135,11 +137,42 @@ def extract_parts( return [x for x in effective_tags if x[tag_type] in target_tags] + def extract_text( + self, + target_tags: Optional[Iterable[str]] = None, + stop_at: Optional[Iterable[str]] = None, + tag_type: Optional[str] = None, + text_type: str = MAIN_TEXT_TYPE, + separator: str = "\n", + ) -> str: + """ + Function to extract target tags from the message. + + Parameters + ---------- + target_tags: + Tags to be extracted. + stop_at: + Tags for which extraction should stop. + tag_type: + Type of tags to consider. + text_type: + Type of text to consider + separator: + Separator to join the extracted texts. + + Returns + ------- + _: List of extracted tags. + """ + parts = self.extract_parts(target_tags=target_tags, stop_at=stop_at, tag_type=tag_type) + return separator.join([x[text_type] for x in parts]) + def extract_last_body( self, target_tags: Iterable[str] = ("BODY",), stop_at: Iterable[str] = ("GREETINGS",), - tag_type: str = MAIN_TAG_TYPE, + tag_type: Optional[str] = None, ) -> List[Dict[str, Any]]: """ Extract the BODY parts of the last message in the email. @@ -160,7 +193,7 @@ def has_tags( self, target_tags: Iterable[str] = ("BODY",), stop_at: Optional[Iterable[str]] = None, - tag_type: str = MAIN_TAG_TYPE, + tag_type: Optional[str] = None, ) -> bool: """ Function to check if input tags are present in the message. 
@@ -182,6 +215,9 @@ def has_tags( if self.tags is None: return False + if tag_type is None: + tag_type = self.effective_tag_key + if not stop_at: stop_at = set() diff --git a/melusine/pipeline.py b/melusine/pipeline.py index 7c37dfb..bead973 100644 --- a/melusine/pipeline.py +++ b/melusine/pipeline.py @@ -16,7 +16,7 @@ from melusine.backend import backend from melusine.backend.base_backend import Any from melusine.base import MelusineTransformer -from melusine.io import IoMixin +from melusine.io_mixin import IoMixin T = TypeVar("T") diff --git a/melusine/processors.py b/melusine/processors.py index 5bd8f4f..0e989f4 100644 --- a/melusine/processors.py +++ b/melusine/processors.py @@ -639,16 +639,19 @@ def extract(self, message_list: list[Message]) -> str: # Message has been tagged if message.tags is not None: if self.include_tags: - tags = message.extract_parts(target_tags=self.include_tags, stop_at=self.stop_at) - message_text_list = [x[1] for x in tags] + extracted_text = message.extract_text( + target_tags=self.include_tags, stop_at=self.stop_at, separator=self.sep + ) elif self.exclude_tags: tags = message.extract_parts(target_tags=None, stop_at=self.stop_at) - message_text_list = [part for tag, part in tags if tag not in self.exclude_tags] + message_text_list = [ + tag_data[message.effective_text_key] + for tag_data in tags + if tag_data[message.effective_tag_key] not in self.exclude_tags + ] + extracted_text = self.sep.join(message_text_list) else: - message_text_list = [part for tag, part in message.tags] - - # Join message text list - extracted_text = self.sep.join(message_text_list) + extracted_text = message.extract_text(target_tags=None, stop_at=self.stop_at, separator=self.sep) # Message has not been tagged else: @@ -929,6 +932,8 @@ def compile_tag_regex(self, tag: str) -> re.Pattern: regex = re.compile(regex, flags=self.default_regex_flag) except re.error: raise ValueError(f"Invalid regex for tag {tag}:\n{regex}") + elif isinstance(regex, re.Pattern): + 
pass else: raise ValueError( f"Tag {tag} does not return any of the supported types : " @@ -940,7 +945,7 @@ def compile_tag_regex(self, tag: str) -> re.Pattern: return regex - def tag_text(self, text: str) -> list[tuple[str, str]]: + def tag_text(self, text: str) -> list[dict[str, Any]]: """ Method to apply content tagging on a text. @@ -951,8 +956,7 @@ def tag_text(self, text: str) -> list[tuple[str, str]]: Returns ------- - _: list[tuple[str, str]] - List of tag/text couples (ex: [("HELLO", "bonjour")]) + _: List of tag/text couples """ parts = self.split_text(text) tags = list() @@ -1188,7 +1192,6 @@ def __init__( default_tag: str = "BODY", valid_part_regex: str = r"[a-z0-9?]", default_regex_flag: int = re.IGNORECASE | re.MULTILINE, - post_process: bool = True, text_attribute: str = "text", ): """ @@ -1212,7 +1215,6 @@ def __init__( default_tag=default_tag, valid_part_regex=valid_part_regex, default_regex_flag=default_regex_flag, - post_process=post_process, text_attribute=text_attribute, ) @@ -1517,13 +1519,15 @@ def SIGNATURE(self) -> str | list[str] | re.Pattern: return [ # Phone / Fax - r"(?:^.{,3}(?:T[ée]l(?:[ée]phone)?\.?|mobile|phone|num[ée]ro|ligne).{,20}(?: *(?:\n+|$)))", ( r"^(.{,10}:? ?\(?((?:\+|00)\(?33\)?(?: ?\(0\))?|0)\s*[1-" r"9]([\s.-]*\d{2}){4}.{,10}){,3}" + rf"({email_address_regex}.{{,10}})?" 
"( *(\n+|$))" ), - r"^.{,3}(T[ée]l[ée]?(phone|copie)?|Fax|mobile|phone|num[ée]ro|ligne).{,20}$", + # Make sure there are at least 6 digits + r"^.{,3}(T[ée]l[ée]?(phone|copie)?|Fax|mobile|phone|num[ée]ro|ligne).{,20}\d{2}[ .-]?\d{2}[ .-]?\d{2}.{,20} *(?:\n+|$)", + # Phone number on separate line + r"^.{,3}(T[ée]l[ée]?(phone|copie)?|Fax|mobile|phone|num[ée]ro|ligne).{,3} *(?:\n+|$)", r"^.{,3}Appel non surtax[ée].{,3}$", # Street / Address / Post code street_address_regex, @@ -1549,10 +1553,18 @@ def SIGNATURE(self) -> str | list[str] | re.Pattern: class RefinedTagger(MelusineTransformer): - BASE_TAG_KEY = "base_tag" - REFINED_TAG_KEY = "refined_tag" - - def __init__(self, input_columns: str = "messages", output_columns: str = "messages", default_tag: str = "BODY"): + """ + Post-processing class to refine initial tags. + """ + def __init__( + self, + input_columns: str = "messages", + output_columns: str = "messages", + default_tag: str = "BODY", + tag_key: str = "base_tag", + text_key: str = "base_text", + refined_tag_key: str = "refined_tag", + ): """ Parameters ---------- @@ -1562,8 +1574,14 @@ def __init__(self, input_columns: str = "messages", output_columns: str = "messa Outputs columns for the transform operation default_tag: str Default tag to apply to untagged text + tag_key: input tag jey + text_key: input text key + refined_tag_key: output tag key """ self.default_tag = default_tag + self.base_tag_key = tag_key + self.base_text_key = text_key + self.refined_tag_key = refined_tag_key super().__init__( input_columns=input_columns, @@ -1585,10 +1603,25 @@ def post_process_messages(self, messages: list[Message]) -> list[Message]: """ for message in messages: message.tags = self.post_process_tags(message.tags) + message.effective_tag_key = self.refined_tag_key return messages - def post_process_tags(self, tags: list[dict[str, Any]]) -> list[dict[str, Any]]: + def post_process_tags(self, tags: list[dict[str, Any]] | None) -> list[dict[str, Any]] | None: + """ + 
Method to post-process tags. + + Parameters + ---------- + tags: Initial tags + + Returns + ------- + _: Refined tags + """ + if tags is None: + return None + # Signature lines containing first/last name tags = self.detect_name_signature(tags) @@ -1619,16 +1652,16 @@ def detect_name_signature(self, tags: list[dict[str, Any]]) -> list[dict[str, An forbidden_words: set[str] = {"urgent", "attention"} for tag_data in tags: - tag = tag_data[self.BASE_TAG_KEY] + tag = tag_data[self.base_tag_key] if tag == self.default_tag: - text = tag_data[self.BASE_TAG_KEY] + text = tag_data[self.base_text_key] match = re.match(line_with_name, text) has_forbidden_words: bool = bool(forbidden_words.intersection(text.lower().split())) if match and not has_forbidden_words: tag = "SIGNATURE_NAME" - tag_data[self.REFINED_TAG_KEY] = tag + tag_data[self.refined_tag_key] = tag return tags @@ -1664,7 +1697,6 @@ def __init__( ) self.tags_to_ignore = tuple(tags_to_ignore) - self.json_exclude_list.append("input_columns") @property def email_pattern(self) -> str: @@ -1757,7 +1789,14 @@ def filter_message_list(self, message_list: list[Message]) -> list[Message]: top_message = message_list[0] parts = top_message.extract_parts() - contains_only_tags_to_ignore = all([tag.startswith(self.tags_to_ignore) for tag, _ in parts]) + try: + contains_only_tags_to_ignore = all( + [tag_data[Message.MAIN_TAG_TYPE].startswith(self.tags_to_ignore) for tag_data in parts] + ) + except KeyError: + contains_only_tags_to_ignore = all( + [tag_data[Message.FALLBACK_TAG_TYPE].startswith(self.tags_to_ignore) for tag_data in parts] + ) if contains_only_tags_to_ignore and (len(message_list) > 1): message_list = message_list[1:] diff --git a/tests/base/test_message.py b/tests/base/test_message.py index 3db9d60..49b1e07 100644 --- a/tests/base/test_message.py +++ b/tests/base/test_message.py @@ -19,9 +19,9 @@ def test_message_repr(): def test_message_has_tags(): message = Message(text="Hello") message.tags = [ - ("HELLO", 
"Bonjour"), - ("BODY", "Pouvez-vous"), - ("GREETINGS", "Cordialement"), + {"base_text": "Bonjour", "base_tag": "HELLO"}, + {"base_text": "Pouvez-vous", "base_tag": "BODY"}, + {"base_text": "Cordialement", "base_tag": "GREETINGS"}, ] assert not message.has_tags(target_tags=["FOOTER"]) @@ -32,9 +32,9 @@ def test_message_has_tags(): def test_message_has_tags_stop_at(): message = Message(text="Hello") message.tags = [ - ("HELLO", "Bonjour"), - ("GREETINGS", "Cordialement"), - ("BODY", "Blah Blah Blah"), + {"base_text": "Bonjour", "base_tag": "HELLO"}, + {"base_text": "Cordialement", "base_tag": "GREETINGS"}, + {"base_text": "Blah Blah Blah", "base_tag": "BODY"}, ] assert not message.has_tags(target_tags=["BODY"], stop_at=["GREETINGS"]) @@ -49,25 +49,25 @@ def test_message_has_tags_no_tags(): def test_message_extract_parts(): message = Message(text="Hello") message.tags = [ - ("HELLO", "Bonjour"), - ("BODY", "Pouvez-vous"), - ("GREETINGS", "Cordialement"), + {"base_text": "Bonjour", "base_tag": "HELLO"}, + {"base_text": "Pouvez-vous", "base_tag": "BODY"}, + {"base_text": "Cordialement", "base_tag": "GREETINGS"}, ] - assert message.extract_parts(target_tags={"BODY"}) == [("BODY", "Pouvez-vous")] + assert message.extract_parts(target_tags={"BODY"}) == [{"base_text": "Pouvez-vous", "base_tag": "BODY"}] assert message.extract_parts(target_tags=["GREETINGS", "HELLO"]) == [ - ("HELLO", "Bonjour"), - ("GREETINGS", "Cordialement"), + {"base_text": "Bonjour", "base_tag": "HELLO"}, + {"base_text": "Cordialement", "base_tag": "GREETINGS"}, ] def test_message_extract_parts_stop(): message = Message(text="Hello") message.tags = [ - ("HELLO", "Bonjour"), - ("FOOTER", "Envoyé depuis mon Iphone"), - ("GREETINGS", "Cordialement"), - ("BODY", "Blah Blah Blah"), + {"base_text": "Bonjour", "base_tag": "HELLO"}, + {"base_text": "Envoyé depuis mon Iphone", "base_tag": "FOOTER"}, + {"base_text": "Cordialement", "base_tag": "GREETINGS"}, + {"base_text": "Blah Blah Blah", "base_tag": "BODY"}, ] 
extracted = message.extract_parts(target_tags=["BODY"], stop_at=["FOOTER", "GREETINGS"]) @@ -84,21 +84,21 @@ def test_message_extract_parts_no_tags(): def test_message_extract_last_body(): message = Message(text="Hello") message.tags = [ - ("HELLO", "Bonjour"), - ("BODY", "Pouvez-vous"), - ("GREETINGS", "Cordialement"), + {"base_text": "Bonjour", "base_tag": "HELLO"}, + {"base_text": "Pouvez-vous", "base_tag": "BODY"}, + {"base_text": "Cordialement", "base_tag": "GREETINGS"}, ] - assert message.extract_last_body() == [("BODY", "Pouvez-vous")] + assert message.extract_last_body() == [{"base_text": "Pouvez-vous", "base_tag": "BODY"}] def test_str(): # Arrange message = Message(meta="Test\nmeta", text="Hello") message.tags = [ - ("TAG", "ABC"), - ("TAAG", "ABCD"), - ("TAAAG", "ABCDE"), + {"base_text": "ABC", "base_tag": "TAG"}, + {"base_text": "ABCD", "base_tag": "TAAG"}, + {"base_text": "ABCDE", "base_tag": "TAAAG"}, ] expected_list = [ @@ -126,9 +126,9 @@ def test_str_no_meta(): # Arrange message = Message(text="Hello") message.tags = [ - ("TAG", "ABC"), - ("TAAG", "ABCD"), - ("TAAAG", "ABCDE"), + {"base_text": "ABC", "base_tag": "TAG"}, + {"base_text": "ABCD", "base_tag": "TAAG"}, + {"base_text": "ABCDE", "base_tag": "TAAAG"}, ] expected_list = [ @@ -175,6 +175,6 @@ def test_str_no_tags(): def test_str_no_conf(reset_melusine_config): config.reset({"Test": "Test"}) - message = Message(text="test", tags=[("TEST TAG", "TEST TEXT")]) + message = Message(text="test", tags=[{"base_text": "TEST TEXT", "base_tag": "TEST TAG"}]) print(message) - assert True + assert message.__str__() diff --git a/tests/detectors/test_thanks_detector.py b/tests/detectors/test_thanks_detector.py index 7f6e41c..da50073 100644 --- a/tests/detectors/test_thanks_detector.py +++ b/tests/detectors/test_thanks_detector.py @@ -17,8 +17,8 @@ def thanks_detector_df(): m0 = Message("") m0.tags = [ - ("HELLO", "Bonjour"), - ("THANKS", "Merci beaucoup"), + {"base_text": "Bonjour", "base_tag": "HELLO"}, + 
{"base_text": "Merci beaucoup", "base_tag": "THANKS"}, ] m0_messages = [m0] m0_expected = True @@ -31,8 +31,8 @@ def thanks_detector_df(): m1 = Message("") m1.tags = [ - ("HELLO", "Bonjour"), - ("THANKS", "Merci, j'attends une reponse"), + {"base_text": "Bonjour", "base_tag": "HELLO"}, + {"base_text": "Merci, j'attends une reponse", "base_tag": "THANKS"}, ] m1_messages = [m1] m1_expected = False @@ -57,7 +57,6 @@ def thanks_detector_df(): def test_thanks_detector(thanks_detector_df): """Unit test of the debug mode.""" df = thanks_detector_df - df_copy = df.copy() detector = ThanksDetector( name="thanks", ) @@ -96,23 +95,26 @@ def test_thanks_detector_missing_field(thanks_detector_df): [ ( [ - ("HELLO", "Bonjour madame"), - ("BODY", "Voici le dossier"), - ("THANKS", "Merci a vous"), + {"base_text": "Bonjour madame", "base_tag": "HELLO"}, + {"base_text": "Voici le dossier", "base_tag": "BODY"}, + {"base_text": "Merci a vous", "base_tag": "THANKS"}, ], True, "Merci a vous", - [("THANKS", "Merci a vous")], + [{"base_text": "Merci a vous", "base_tag": "THANKS"}], ), ( [ - ("HELLO", "Bonjour madame"), - ("THANKS", "Merci"), - ("THANKS", "Merci a vous"), + {"base_text": "Bonjour madame", "base_tag": "HELLO"}, + {"base_text": "Merci", "base_tag": "THANKS"}, + {"base_text": "Merci a vous", "base_tag": "THANKS"}, ], False, "Merci\nMerci a vous", - [("THANKS", "Merci"), ("THANKS", "Merci a vous")], + [ + {"base_text": "Merci", "base_tag": "THANKS"}, + {"base_text": "Merci a vous", "base_tag": "THANKS"} + ], ), ], ) @@ -136,8 +138,6 @@ def test_thanks_detector_debug(tags, has_body, thanks_text, thanks_parts): assert "debug_thanks" in data assert "has_body" in data["debug_thanks"] assert "thanks_text" in data["debug_thanks"] - assert "thanks_parts" in data["debug_thanks"] assert data["debug_thanks"]["has_body"] == has_body assert data["debug_thanks"]["thanks_text"] == thanks_text - assert data["debug_thanks"]["thanks_parts"] == thanks_parts diff --git 
a/tests/detectors/test_vacation_reply_detector.py b/tests/detectors/test_vacation_reply_detector.py index 40a625f..7594522 100644 --- a/tests/detectors/test_vacation_reply_detector.py +++ b/tests/detectors/test_vacation_reply_detector.py @@ -32,12 +32,12 @@ def test_instanciation(): text="Bonjour, je vous confirme l'annulation du rdv du 01/01/2022 " + "à 16h. Bien cordialement, John Smith.", tags=[ - ("HELLO", "Bonjour,"), - ( - "BODY", - "je vous confirme l'annulation du rdv du 01/01/2022 à 16h.", - ), - ("GREETINGS", "Bien cordialement, John Smith."), + {"base_tag": "HELLO", "base_text": "Bonjour,"}, + { + "base_tag": "BODY", + "base_text": "je vous confirme l'annulation du rdv du 01/01/2022 à 16h." + }, + {"base_tag": "GREETINGS", "base_text": "Bien cordialement, John Smith."}, ], ) ] @@ -55,12 +55,12 @@ def test_instanciation(): text="Bonjour, \nActuellement en conge je prendrai connaissance" + " de votre message ulterieurement.\nCordialement,", tags=[ - ("HELLO", "Bonjour,"), - ( - "BODY", - "Actuellement en conge je prendrai connaissance de votre message ulterieurement.", - ), - ("GREETINGS", "Cordialement, "), + {"base_tag": "HELLO", "base_text": "Bonjour,"}, + { + "base_tag": "BODY", + "base_text": "Actuellement en conge je prendrai connaissance de votre message ulterieurement." 
+ }, + {"base_tag": "GREETINGS", "base_text": "Cordialement, "}, ], ) ] @@ -73,8 +73,6 @@ def test_instanciation(): ) def test_transform(df, good_result): """Unit test of the transform() method.""" - df_copy = df.copy() - message_column = "messages" detector = VacationReplyDetector( @@ -100,12 +98,12 @@ def test_transform(df, good_result): text="Bonjour, \nActuellement en conge je prendrai connaissance" + " de votre message ulterieurement.\nCordialement,", tags=[ - ("HELLO", "Bonjour,"), - ( - "BODY", - "Actuellement en conge je prendrai connaissance de votre message ulterieurement.", - ), - ("GREETINGS", "Cordialement, "), + {"base_tag": "HELLO", "base_text": "Bonjour,"}, + { + "base_tag": "BODY", + "base_text": "Actuellement en conge je prendrai connaissance de votre message ulterieurement." + }, + {"base_tag": "GREETINGS", "base_text": "Cordialement, "}, ], ) ] @@ -114,12 +112,6 @@ def test_transform(df, good_result): ), True, { - "parts": [ - ( - "BODY", - "Actuellement en conge je prendrai connaissance de votre message ulterieurement.", - ) - ], "text": "Actuellement en conge je prendrai connaissance de votre message ulterieurement.", "VacationReplyRegex": { "match_result": True, @@ -138,8 +130,6 @@ def test_transform(df, good_result): ) def test_transform_debug_mode(df, good_detection_result, good_debug_info): """Unit test of the debug mode.""" - df_copy = df.copy() - messages_column = "messages" detector = VacationReplyDetector( diff --git a/tests/functional/test_emails_fixtures.py b/tests/functional/test_emails_fixtures.py index 58f719b..934a730 100644 --- a/tests/functional/test_emails_fixtures.py +++ b/tests/functional/test_emails_fixtures.py @@ -53,11 +53,22 @@ content_tagger_expected={ "messages.tags": [ [ - ("HELLO", "BonJour wORLD"), - ("BODY", "L'orem"), - ("BODY", "Ip-sum"), - ("BODY", "Lo_rem"), - ("BODY", "ip.sum."), + {"base_text": "BonJour wORLD", "base_tag": "HELLO", "base_tag_list": ["HELLO"]}, + {"base_text": "L'orem", "base_tag": "BODY", 
"base_tag_list": ["BODY"]}, + {"base_text": "Ip-sum", "base_tag": "BODY", "base_tag_list": ["BODY"]}, + {"base_text": "Lo_rem", "base_tag": "BODY", "base_tag_list": ["BODY"]}, + {"base_text": "ip.sum.", "base_tag": "BODY", "base_tag_list": ["BODY"]}, + ], + ], + }, + refined_tagger_expected={ + "messages.tags": [ + [ + {"base_text": "BonJour wORLD", "base_tag": "HELLO", "base_tag_list": ["HELLO"], "refined_tag": "HELLO"}, + {"base_text": "L'orem", "base_tag": "BODY", "base_tag_list": ["BODY"], "refined_tag": "BODY"}, + {"base_text": "Ip-sum", "base_tag": "BODY", "base_tag_list": ["BODY"], "refined_tag": "BODY"}, + {"base_text": "Lo_rem", "base_tag": "BODY", "base_tag_list": ["BODY"], "refined_tag": "BODY"}, + {"base_text": "ip.sum.", "base_tag": "BODY", "base_tag_list": ["BODY"], "refined_tag": "BODY"}, ], ], }, @@ -116,19 +127,47 @@ content_tagger_expected={ "messages.tags": [ [ - ("HELLO", "Bonjour,"), - ("BODY", "Vous trouverez ci-joint l'attestation"), - ("BODY", "Merci de me confirmer la bonne réception de ce message."), - ("THANKS", "Vous en remerciant par avance."), - ("GREETINGS", "Cordialement,"), - ("SIGNATURE_NAME", "Jean Dupont"), + {"base_text": "Bonjour,", "base_tag": "HELLO", "base_tag_list": ["HELLO"]}, + {"base_text": "Vous trouverez ci-joint l'attestation", "base_tag": "BODY", "base_tag_list": ["BODY"]}, + {"base_text": "Merci de me confirmer la bonne réception de ce message.", "base_tag": "BODY", "base_tag_list": ["BODY"]}, + {"base_text": "Vous en remerciant par avance.", "base_tag": "THANKS", "base_tag_list": ["THANKS"]}, + {"base_text": "Cordialement,", "base_tag": "GREETINGS", "base_tag_list": ["GREETINGS"]}, + {"base_text": "Jean Dupont", "base_tag": "BODY", "base_tag_list": ["BODY"]}, ], [ - ("HELLO", "Bonjour,"), - ("BODY", "Veuillez trouver ci-jointe la lettre"), - ("FOOTER", "La visualisation des fichiers PDF nécessite Adobe Reader."), - ("GREETINGS", "Sentiments mutualistes."), - ("SIGNATURE_NAME", "La MAIF"), + {"base_text": 
"Bonjour,", "base_tag": "HELLO", "base_tag_list": ["HELLO"]}, + {"base_text": "Veuillez trouver ci-jointe la lettre", "base_tag": "BODY", "base_tag_list": ["BODY"]}, + {"base_text": "La visualisation des fichiers PDF nécessite Adobe Reader.", "base_tag": "FOOTER", "base_tag_list": ["FOOTER"]}, + {"base_text": "Sentiments mutualistes.", "base_tag": "GREETINGS", "base_tag_list": ["GREETINGS"]}, + {"base_text": "La MAIF", "base_tag": "BODY", "base_tag_list": ["BODY"]}, + ], + ], + }, + refined_tagger_expected={ + "messages.tags": [ + [ + {"base_text": "Bonjour,", "base_tag": "HELLO", "base_tag_list": ["HELLO"], "refined_tag": "HELLO"}, + {"base_text": "Vous trouverez ci-joint l'attestation", "base_tag": "BODY", "base_tag_list": ["BODY"], + "refined_tag": "BODY"}, + {"base_text": "Merci de me confirmer la bonne réception de ce message.", "base_tag": "BODY", + "base_tag_list": ["BODY"], "refined_tag": "BODY"}, + {"base_text": "Vous en remerciant par avance.", "base_tag": "THANKS", "base_tag_list": ["THANKS"], + "refined_tag": "THANKS"}, + {"base_text": "Cordialement,", "base_tag": "GREETINGS", "base_tag_list": ["GREETINGS"], + "refined_tag": "GREETINGS"}, + {"base_text": "Jean Dupont", "base_tag": "BODY", "base_tag_list": ["BODY"], + "refined_tag": "SIGNATURE_NAME"}, + ], + [ + {"base_text": "Bonjour,", "base_tag": "HELLO", "base_tag_list": ["HELLO"], "refined_tag": "HELLO"}, + {"base_text": "Veuillez trouver ci-jointe la lettre", "base_tag": "BODY", "base_tag_list": ["BODY"], + "refined_tag": "BODY"}, + {"base_text": "La visualisation des fichiers PDF nécessite Adobe Reader.", "base_tag": "FOOTER", + "base_tag_list": ["FOOTER"], "refined_tag": "FOOTER"}, + {"base_text": "Sentiments mutualistes.", "base_tag": "GREETINGS", "base_tag_list": ["GREETINGS"], + "refined_tag": "GREETINGS"}, + {"base_text": "La MAIF", "base_tag": "BODY", "base_tag_list": ["BODY"], + "refined_tag": "SIGNATURE_NAME"}, ], ], }, @@ -162,16 +201,30 @@ content_tagger_expected={ "messages.tags": [ [ - 
("HELLO", "Bonjour"), - ( - "BODY", - "Pouvez-vous me transmettre deux attestations au nom de mes enfants", - ), - ("BODY", "- Jane Dupond"), - ("BODY", "- Joe Dupond"), - ("THANKS", "Merci par avance"), - ("GREETINGS", "Cordialement"), - ("SIGNATURE_NAME", "Mr Jean Dupond"), + {"base_text": "Bonjour", "base_tag": "HELLO", "base_tag_list": ["HELLO"]}, + {"base_text": "Pouvez-vous me transmettre deux attestations au nom de mes enfants", "base_tag": "BODY", "base_tag_list": ["BODY"]}, + {"base_text": "- Jane Dupond", "base_tag": "BODY", "base_tag_list": ["BODY"]}, + {"base_text": "- Joe Dupond", "base_tag": "BODY", "base_tag_list": ["BODY"]}, + {"base_text": "Merci par avance", "base_tag": "THANKS", "base_tag_list": ["THANKS"]}, + {"base_text": "Cordialement", "base_tag": "GREETINGS", "base_tag_list": ["GREETINGS"]}, + {"base_text": "Mr Jean Dupond", "base_tag": "BODY", "base_tag_list": ["BODY"]}, + ] + ], + }, + refined_tagger_expected={ + "messages.tags": [ + [ + {"base_text": "Bonjour", "base_tag": "HELLO", "base_tag_list": ["HELLO"], "refined_tag": "HELLO"}, + {"base_text": "Pouvez-vous me transmettre deux attestations au nom de mes enfants", "base_tag": "BODY", + "base_tag_list": ["BODY"], "refined_tag": "BODY"}, + {"base_text": "- Jane Dupond", "base_tag": "BODY", "base_tag_list": ["BODY"], "refined_tag": "BODY"}, + {"base_text": "- Joe Dupond", "base_tag": "BODY", "base_tag_list": ["BODY"], "refined_tag": "BODY"}, + {"base_text": "Merci par avance", "base_tag": "THANKS", "base_tag_list": ["THANKS"], + "refined_tag": "THANKS"}, + {"base_text": "Cordialement", "base_tag": "GREETINGS", "base_tag_list": ["GREETINGS"], + "refined_tag": "GREETINGS"}, + {"base_text": "Mr Jean Dupond", "base_tag": "BODY", "base_tag_list": ["BODY"], + "refined_tag": "SIGNATURE_NAME"}, ] ], }, @@ -276,8 +329,8 @@ content_tagger_expected={ "messages.tags": [ [ - ("THANKS", "Bonjour et merci"), - ("GREETINGS", "Cordialement"), + {"base_text": "Bonjour et merci", "base_tag": "THANKS", 
"base_tag_list": ["THANKS", "HELLO"]}, + {"base_text": "Cordialement", "base_tag": "GREETINGS", "base_tag_list": ["GREETINGS"]}, ] ], }, diff --git a/tests/io/__init__.py b/tests/io_mixin/__init__.py similarity index 100% rename from tests/io/__init__.py rename to tests/io_mixin/__init__.py diff --git a/tests/io/test_io_mixin.py b/tests/io_mixin/test_io_mixin.py similarity index 93% rename from tests/io/test_io_mixin.py rename to tests/io_mixin/test_io_mixin.py index e0eb1bf..070bc0c 100644 --- a/tests/io/test_io_mixin.py +++ b/tests/io_mixin/test_io_mixin.py @@ -1,8 +1,8 @@ import pytest from melusine import config -from melusine.io import IoMixin -from melusine.io._classes import InitError +from melusine.io_mixin import IoMixin +from melusine.io_mixin._classes import InitError class FakeClass(IoMixin): diff --git a/tests/processors/test_content_refined_tagger.py b/tests/processors/test_content_refined_tagger.py new file mode 100644 index 0000000..0091417 --- /dev/null +++ b/tests/processors/test_content_refined_tagger.py @@ -0,0 +1,819 @@ +import re + +import pytest + +from melusine.message import Message +from melusine.processors import BaseContentTagger, ContentTagger, Tag, RefinedTagger + + +def test_content_tagger(): + # Text segments (= individual messages in an email conversation) + text_segments = [ + "Envoye de mon iphone", + ("Bonjour Mme X,\nSuite a blh blah blah\nBien cordialement\nJane Dupond\n(See attached file: flex.jpg)"), + ( + "Bonjour,\nVeuillez trouver ci-joint blah\n" + "Merci d'avance,\nCordialement,\n" + "Toute modification, edition, utilisation ou diffusion non autorisee est interdite" + ), + ] + + # Expected tags + expected_tags = [ + [ + {"base_text": "Envoye de mon iphone", "base_tag": "FOOTER"}, + ], + [ + {"base_text": "Bonjour Mme X,", "base_tag": "HELLO"}, + {"base_text": "Suite a blh blah blah", "base_tag": "BODY"}, + {"base_text": "Bien cordialement", "base_tag": "GREETINGS"}, + {"base_text": "Jane Dupond", "base_tag": "BODY"}, + 
{"base_text": "(See attached file: flex.jpg)", "base_tag": "PJ"}, + ], + [ + {"base_text": "Bonjour,", "base_tag": "HELLO"}, + {"base_text": "Veuillez trouver ci-joint blah", "base_tag": "BODY"}, + {"base_text": "Merci d'avance,", "base_tag": "THANKS"}, + {"base_text": "Cordialement,", "base_tag": "GREETINGS"}, + {"base_text": "Toute modification, edition, utilisation ou diffusion non autorisee est interdite", + "base_tag": "FOOTER"}, + ], + ] + + # Mock the output of a Segmenter (List of Message object) + messages = [Message(text=segment) for segment in text_segments] + + # Instantiate and apply the Tagger + tagger = ContentTagger() + output_messages = tagger.tag_email(messages) + + # Test output tags (an expected base_tag_list defaults to [base_tag] when not given) + for tag_data_list in expected_tags: + for tag_data in tag_data_list: + if "base_tag_list" not in tag_data: + tag_data["base_tag_list"] = [tag_data["base_tag"]] + + for i, message in enumerate(output_messages): + for j, tag_data in enumerate(message.tags): + assert tag_data == expected_tags[i][j] + + +def test_tag_null_message(): + messages = None + + # Instantiate and apply the Tagger + tagger = ContentTagger() + output_messages = tagger.tag_email(messages) + + assert output_messages is None + + +@pytest.mark.parametrize( + "text, expected_parts", + [ + ( + "Bonjour, merci pour votre message!\nComment-allez vous?! 
Je suis satisfait!!!\n" + "Bien cordialement\n\n\n\nJane Dupond\n", + [ + "Bonjour,", + "merci pour votre message!", + "Comment-allez vous?!", + "Je suis satisfait!!!", + "Bien cordialement", + "Jane Dupond", + ], + ), + ], +) +def test_content_tagger_split_text(text, expected_parts): + # Instantiate and apply the Tagger + tagger = ContentTagger() + output_parts = tagger.split_text(text) + + assert output_parts == expected_parts + + +@pytest.mark.parametrize( + "text, expected_tags", + [ + ( + "Bonjour Mme X,\nSuite a blh blah blah.\n" + "Bien cordialement\nJane Dupond\n" + "(See attached file: flex.jpg)", + [ + {"base_text": "Bonjour Mme X,", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "Suite a blh blah blah.", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Bien cordialement", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "Jane Dupond", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, + {"base_text": "(See attached file: flex.jpg)", "base_tag": "PJ", "refined_tag": "PJ"}, + ], + ), + ( + "Bonjour, je confirme le rdv. 
Cordialement, John Smith", + [ + {"base_text": "Bonjour,", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "je confirme le rdv.", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Cordialement, John Smith", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + ], + ), + ( + ( + "Bonjour,\nSuite a notre intervention du 16.02.22 , un taux d'humidité de 50% a été relevé.\n" + "Cordialement.\n177, rue de la fée - 75000 Paris.\n" + "Horaires : du lundi au jeudi de 08h00 à 16h30 et le vendredi de 08h00 à 16h00.\n" + "Tel : 01.45.53.11.33" + ), + [ + {"base_text": "Bonjour,", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "Suite a notre intervention du 16.02.22 , un taux d'humidité de 50% a été relevé.", + "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Cordialement.", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "177, rue de la fée - 75000 Paris.", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "Horaires : du lundi au jeudi de 08h00 à 16h30 et le vendredi de 08h00 à 16h00.", + "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Tel : 01.45.53.11.33", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + ], + ), + ( + ( + "bonjour\n" + "15 jours après les premières réparations, un défaut a été détecté. " + "Bien à vous\n" + "Britney Spears" + ), + [ + {"base_text": "bonjour", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "15 jours après les premières réparations, un défaut a été détecté.", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Bien à vous", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "Britney Spears", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, + ], + ), + ( + ( + "Bonjour monsieur Smith\n" + "merci. 
Bien à vous\n" + "Britney Spears\n" + "22 hollywood boulevard\n" + "79000 Niort\n" + ), + [ + {"base_text": "Bonjour monsieur Smith", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "merci.", "base_tag": "THANKS", "refined_tag": "THANKS"}, + {"base_text": "Bien à vous", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "Britney Spears", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, + {"base_text": "22 hollywood boulevard", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "79000 Niort", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + ], + ), + ( + ( + "Merci de me faire suivre les docs à ma nouvelle adresse qui est 0 rue du parc, 75000 Paris. " + "Merci d'avance. \nAcceptez notre salutation," + ), + [ + { + "base_text": "Merci de me faire suivre les docs à ma nouvelle adresse qui est 0 rue du parc, 75000 Paris.", + "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Merci d'avance.", "base_tag": "THANKS", "refined_tag": "THANKS"}, + {"base_text": "Acceptez notre salutation,", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + ], + ), + ( + ( + "Bonjour\n" + "Je vous relance concernant ma télévision avec le devis en PJ.\n" + "Désolé pour la qualité.\n" + "Je l'ai envoyé à partir de mon ordi.\n" + "Excellente journée à vous,\n" + "Bon we\n" + "Votre bien dévoué\n" + "amicalement votre\n" + "Cordiales salutations.\n" + "Françoise-Bénédicte Dupond\n" + "Envoyé à partir de \nCourrier \npour Windows" + ), + [ + {"base_text": "Bonjour", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "Je vous relance concernant ma télévision avec le devis en PJ.", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Désolé pour la qualité.", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Je l'ai envoyé à partir de mon ordi.", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Excellente journée à vous,", "base_tag": "HELLO", "refined_tag": "HELLO"}, + 
{"base_text": "Bon we", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "Votre bien dévoué", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "amicalement votre", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "Cordiales salutations.", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "Françoise-Bénédicte Dupond", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, + {"base_text": "Envoyé à partir de", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + {"base_text": "Courrier", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + {"base_text": "pour Windows", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + ], + ), + ( + "C'est bien note, merci beaucoup.\nSentiments dévoués.\nTélécharger \nOutlook pour Android", + [ + {"base_text": "C'est bien note, merci beaucoup.", "base_tag": "THANKS", "refined_tag": "THANKS"}, + {"base_text": "Sentiments dévoués.", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "Télécharger", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + {"base_text": "Outlook pour Android", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + ], + ), + ( + "Impeccable, je vous remercie beaucoup pour votre rapidité.\nObtenir\nOutlook pour Android", + [ + {"base_text": "Impeccable, je vous remercie beaucoup pour votre rapidité.", "base_tag": "THANKS", "refined_tag": "THANKS"}, + {"base_text": "Obtenir", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + {"base_text": "Outlook pour Android", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + ], + ), + ( + ( + "Cher Monsieur,\nJe vous confirme la bonne réception de votre précédent email.\n" + "Je vous en remercie.\nBien cordialement,\nJohn Smith" + ), + [ + {"base_text": "Cher Monsieur,", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "Je vous confirme la bonne réception de votre précédent email.", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Je vous en remercie.", "base_tag": "THANKS", 
"refined_tag": "THANKS"}, + {"base_text": "Bien cordialement,", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "John Smith", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, + ], + ), + ( + ( + "chère madame,\n" + "URGENT URGENT\n" + "Merci de me faire suivre les docs à ma nouvelle adresse qui est 0 rue du parc, 75000 Paris. " + "Merci d'avance. \nRecevez nos salutations,\nVous en souhaitant bonne réception" + ), + [ + {"base_text": "chère madame,", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "URGENT URGENT", "base_tag": "BODY", "refined_tag": "BODY"}, + { + "base_text": "Merci de me faire suivre les docs à ma nouvelle adresse qui est 0 rue du parc, 75000 Paris.", + "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Merci d'avance.", "base_tag": "THANKS", "refined_tag": "THANKS"}, + {"base_text": "Recevez nos salutations,", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "Vous en souhaitant bonne réception", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + ], + ), + pytest.param( + "Un témoignage sous X\nEnvoyé depuis mon téléphone Orange", + [ + {"base_text": "Un témoignage sous X", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Envoyé depuis mon téléphone Orange", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + ], + id="Edge case where a line ends with an isolated character", + ), + pytest.param( + " ??\n !??!", + [ + {"base_text": "??!??!", "base_tag": "BODY", "refined_tag": "BODY"}, + ], + id="Edge case where the two first lines are missing word characters", + ), + ( + "Bonjour Mme X,\nSuite a blh blah blah.\n" + "Bien cordialement\nJane Dupond\n" + "(See attached file: flex.jpg)", + [ + {"base_text": "Bonjour Mme X,", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "Suite a blh blah blah.", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Bien cordialement", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "Jane 
Dupond", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, + {"base_text": "(See attached file: flex.jpg)", "base_tag": "PJ", "refined_tag": "PJ"}, + ], + ), + ( + "\nChère Madame\n\nC'est bien noté, merci\nBien reçu\nJ.Smith\n\n", + [ + {"base_text": "Chère Madame", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "C'est bien noté, merci", "base_tag": "THANKS", "refined_tag": "THANKS"}, + {"base_text": "Bien reçu", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "J.Smith", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, + ], + ), + ( + "\nBonjour Monsieur, ceci n'est pas un hello\nBonne fin de journee\nsalutations", + [ + {"base_text": "Bonjour Monsieur, ceci n'est pas un hello", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Bonne fin de journee", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "salutations", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + ], + ), + ( + "\nBonjour Monsieur Stanislas von den hoeggenboord\n\nbien à toi\nJ. Smith\nChargé de clientèle", + [ + {"base_text": "Bonjour Monsieur Stanislas von den hoeggenboord", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "bien à toi", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "J. 
Smith", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, + {"base_text": "Chargé de clientèle", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + ], + ), + ( + ( + "\n1 rdv à 18h\n\n2 ème message laissé à la locataire\n3je m'en vais au bois\n" + "4 allée des iris\n 5bis rue Patrick Sebastien\n6-8 cours mirabeau\n 7 ter place du dahu\n" + "8 de la rue très longue qui ne doit pas être taggée signature" + ), + [ + {"base_text": "1 rdv à 18h", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "2 ème message laissé à la locataire", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "3je m'en vais au bois", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "4 allée des iris", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "5bis rue Patrick Sebastien", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "6-8 cours mirabeau", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "7 ter place du dahu", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "8 de la rue très longue qui ne doit pas être taggée signature", "base_tag": "BODY", "refined_tag": "BODY"}, + ], + ), + ( + ( + "à L'attention de M Bob,\n" + "Bonjour,\n" + "Je vous informe que je vais accepter la proposition de L , à savoir le paiement d'une indemnité forfaitaire de résiliation du CCMI de 4000 € TTC pour clore cette affaire.\n" + "Cordialement.\n" + "Bob Smith" + ), + [ + {"base_text": "à L'attention de M Bob,", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + {"base_text": "Bonjour,", "base_tag": "HELLO", "refined_tag": "HELLO"}, + { + "base_text": "Je vous informe que je vais accepter la proposition de L , à savoir le paiement d'une indemnité forfaitaire de résiliation du CCMI de 4000 € TTC pour clore cette affaire.", + "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Cordialement.", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "Bob Smith", "base_tag": 
"BODY", "refined_tag": "SIGNATURE_NAME"}, + ], + ), + ( + ( + "Monsieur Bob Smith\n" + "Adresse mail : BobSmith90@gmail.com\n" + "Lucy Ange\n\n" + "Bonjour Monsieur,\n" + "Suite à notre entretien téléphonique de ce matin, et au message que vous m'avez envoyé sur ma messagerie, je voudrais effectuer la réparation du véhicule Renault Twingo dans un garage partenaire de la Maif situé, si c'est possible.\n" + "Dans l'attente de votre réponse et en vous remerciant par avance,\n\n\n" + "Monsieur Bob Smith\n\n\n" + "Envoyé à partir de\n" + "Courrier\npour Windows\n\n\n\n" + "Sans virus.\nwww.avast.com" + ), + [ + {"base_text": "Monsieur Bob Smith", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "Adresse mail : BobSmith90@gmail.com", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "Lucy Ange", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, + {"base_text": "Bonjour Monsieur,", "base_tag": "HELLO", "refined_tag": "HELLO"}, + { + "base_text": "Suite à notre entretien téléphonique de ce matin, et au message que vous m'avez envoyé sur ma messagerie, je voudrais effectuer la réparation du véhicule Renault Twingo dans un garage partenaire de la Maif situé, si c'est possible.", + "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Dans l'attente de votre réponse et en vous remerciant par avance,", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Monsieur Bob Smith", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "Envoyé à partir de", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + {"base_text": "Courrier", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + {"base_text": "pour Windows", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + {"base_text": "Sans virus.", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + {"base_text": "www.avast.com", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + ], + ), + ( + ( + "Bob Smith\n\n\n" + "A l’attention de Madame Lucy Ange,\n\n\n\n\n\n" + "Bonjour Madame 
Ange,\n\n\n\n\n\n\n\n\n" + "J’espère que vous allez bien.\n\n\n\n\n\n" + "Pour faire suite à mon mail du 21 février 2023, je me permets de revenir vers vous pour avoir votre avis sur le devis que j’ai demandé auprès d’un enquêteur.\n\n\n\n" + "Voici son retour :\n\n\n\n\n\n" + "Qu’en pensez-vous svp ?\n\n\n\n\n\n" + "Je reste à votre disposition pour tout complément d’information et vous remercie de l’intérêt que vous porterez à ma demande,\n\n\n\n\n\n" + "Bien Cordialement,\n\n\n\n\n\n" + "Bob Smith\n\n\n" + "Tél. 06.83.22.95.94" + ), + [ + {"base_text": "Bob Smith", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, + {"base_text": "A l’attention de Madame Lucy Ange,", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + {"base_text": "Bonjour Madame Ange,", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "J’espère que vous allez bien.", "base_tag": "BODY", "refined_tag": "BODY"}, + { + "base_text": "Pour faire suite à mon mail du 21 février 2023, je me permets de revenir vers vous pour avoir votre avis sur le devis que j’ai demandé auprès d’un enquêteur.", + "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Voici son retour :", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Qu’en pensez-vous svp ?", "base_tag": "BODY", "refined_tag": "BODY"}, + { + "base_text": "Je reste à votre disposition pour tout complément d’information et vous remercie de l’intérêt que vous porterez à ma demande,", + "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Bien Cordialement,", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "Bob Smith", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, + {"base_text": "Tél.", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "06.83.22.95.94", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + ], + ), + pytest.param( + ( + "cordialement\nContact e-mail\n\n\nContact téléphone\n\n01 23 45 67 89 / abcabc@hotmail.fr\n" + "Torroella de Montgri, le 5 
avril 2023\nLes formats de fichiers acceptés sont : PDF, DOC, DOCX, JPEG, " + "JPG, TIFF, TXT, ODT, XLS, XLSX\nTout autre format de fichiers ne sera pas transmis au dossier" + ), + [ + {"base_text": "cordialement", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "Contact e-mail", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "Contact téléphone", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "01 23 45 67 89 / abcabc@hotmail.fr", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "Torroella de Montgri, le 5 avril 2023", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + { + "base_text": "Les formats de fichiers acceptés sont : PDF, DOC, DOCX, JPEG, JPG, TIFF, TXT, ODT, XLS, XLSX", + "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + {"base_text": "Tout autre format de fichiers ne sera pas transmis au dossier", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + ], + id="diverse_signature_patterns", + ), + pytest.param( + ( + "bonjour\nmon body\nJ. Smith\n\n01 23 45 67 89\nSecrétaire en charge des avions\n" + "Business Analyst – Tribu Sinistres – Squad Flux Entrants\n" + "Société nationale des chemins de fer\nConseiller MAIF\nGestionnaire sinistre - C99G\n" + "Service des lettres anonymes\nTechnicienne de gestion - EQUIPE ABC\n" + ), + [ + {"base_text": "bonjour", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "mon body", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "J. 
Smith", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, + {"base_text": "01 23 45 67 89", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "Secrétaire en charge des avions", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "Business Analyst – Tribu Sinistres – Squad Flux Entrants", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "Société nationale des chemins de fer", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "Conseiller MAIF", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "Gestionnaire sinistre - C99G", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "Service des lettres anonymes", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + {"base_text": "Technicienne de gestion - EQUIPE ABC", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + ], + id="signature_jobs", + ), + pytest.param( + ( + "bonjour\nmon body\nCordialement\n\n" + "analyste -------------------------------------- test test test test test test test\n" + ), + [ + {"base_text": "bonjour", "base_tag": "HELLO", "refined_tag": "HELLO"}, + {"base_text": "mon body", "base_tag": "BODY", "refined_tag": "BODY"}, + {"base_text": "Cordialement", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + {"base_text": "analyste -------------------------------------- test test test test test test test", + "base_tag": "BODY", "refined_tag": "BODY"}, + ], + id="check_catastrophic_backtracking", + ), + ], +) +def test_tag_text_generic(text, expected_tags): + # Arrange + tagger = ContentTagger() + refined_tagger = RefinedTagger() + + # Act + base_tags = tagger.tag_text(text) + refined_tags = refined_tagger.post_process_tags(base_tags) + + # Assert + for tag_data in expected_tags: + if "base_tag_list" not in tag_data: + tag_data["base_tag_list"] = [tag_data["base_tag"]] + assert refined_tags == expected_tags + + +@pytest.mark.parametrize( + "text, expected_tags", + [ + 
pytest.param( + ( + "Merci\n" + "Je vous remercie\n" + "Merci d'avance\n" + "Je vous remercie par avance\n" + "Vous en remerciant par avance.\n" + ), + [ + {"base_text": "Merci", "base_tag": "THANKS"}, + {"base_text": "Je vous remercie", "base_tag": "THANKS"}, + {"base_text": "Merci d'avance", "base_tag": "THANKS"}, + {"base_text": "Je vous remercie par avance", "base_tag": "THANKS"}, + {"base_text": "Vous en remerciant par avance.", "base_tag": "THANKS"}, + ], + id="french thanks patterns", + ), + ], +) +def test_tag_text_french(text, expected_tags): + # Arrange + tagger = ContentTagger() + + # Act + output_tags = tagger.tag_text(text) + + # Assert + for tag_data in expected_tags: + if "base_tag_list" not in tag_data: + tag_data["base_tag_list"] = [tag_data["base_tag"]] + assert output_tags == expected_tags + + +@pytest.mark.parametrize( + "text, expected_tags", + [ + pytest.param( + ( + "Thank you so much\n" + "thanks\n" + "thx Joanna\n" + "thanks but you forgot bla\n" + "Thx however I still need the document\n" + ), + [ + {"base_text": "Thank you so much", "base_tag": "THANKS"}, + {"base_text": "thanks", "base_tag": "THANKS"}, + {"base_text": "thx Joanna", "base_tag": "THANKS"}, + {"base_text": "thanks but you forgot bla", "base_tag": "BODY"}, + {"base_text": "Thx however I still need the document", "base_tag": "BODY"}, + ], + id="english thanks patterns", + ), + pytest.param( + ( + "Best\n" + "warm Wishes\n" + "regards\n" + "best regards\n" + "cheers\n" + "yours\n" + "yours truly\n" + "Sincerely\n" + "see you soon\n" + "Speak to you soon\n" + "talk soon\n" + "Take care\n" + "Catch you later\n" + "Have a fantastic day\n" + "Looking forward to your reply\n" + "I am looking forward to hearing from you\n" + "Hoping to hear from you\n" + ), + [ + {"base_text": "Best", "base_tag": "GREETINGS"}, + {"base_text": "warm Wishes", "base_tag": "GREETINGS"}, + {"base_text": "regards", "base_tag": "GREETINGS"}, + {"base_text": "best regards", "base_tag": "GREETINGS"}, + 
{"base_text": "cheers", "base_tag": "GREETINGS"}, + {"base_text": "yours", "base_tag": "GREETINGS"}, + {"base_text": "yours truly", "base_tag": "GREETINGS"}, + {"base_text": "Sincerely", "base_tag": "GREETINGS"}, + {"base_text": "see you soon", "base_tag": "GREETINGS"}, + {"base_text": "Speak to you soon", "base_tag": "GREETINGS"}, + {"base_text": "talk soon", "base_tag": "GREETINGS"}, + {"base_text": "Take care", "base_tag": "GREETINGS"}, + {"base_text": "Catch you later", "base_tag": "GREETINGS"}, + {"base_text": "Have a fantastic day", "base_tag": "GREETINGS"}, + {"base_text": "Looking forward to your reply", "base_tag": "GREETINGS"}, + {"base_text": "I am looking forward to hearing from you", "base_tag": "GREETINGS"}, + {"base_text": "Hoping to hear from you", "base_tag": "GREETINGS"}, + ], + id="english greetings", + ), + pytest.param( + ( + "Hello John\n" + "hi\n" + "Hi there\n" + "good to hear from you\n" + "it is good to hear from you\n" + "I hope you are having a great week\n" + "how are you doing\n" + "how are you positioned about the matter\n" + "i hope you are doing well\n" + "Good Morning Joanna\n" + "good Afternoon\n" + "Dear Jacky\n" + "Sir\n" + "Dear Madam\n" + "Dear Mr\n" + "Dear Ms.\n" + "Dear miss\n" + "Dear mrs.\n" + "Dear sir or madam\n" + "To whom it may concern\n" + ), + [ + {"base_text": "Hello John", "base_tag": "HELLO"}, + {"base_text": "hi", "base_tag": "HELLO"}, + {"base_text": "Hi there", "base_tag": "HELLO"}, + {"base_text": "good to hear from you", "base_tag": "HELLO"}, + {"base_text": "it is good to hear from you", "base_tag": "HELLO"}, + {"base_text": "I hope you are having a great week", "base_tag": "HELLO"}, + {"base_text": "how are you doing", "base_tag": "HELLO"}, + {"base_text": "how are you positioned about the matter", "base_tag": "BODY"}, + {"base_text": "i hope you are doing well", "base_tag": "HELLO"}, + {"base_text": "Good Morning Joanna", "base_tag": "HELLO"}, + {"base_text": "good Afternoon", "base_tag": "HELLO"}, + 
{"base_text": "Dear Jacky", "base_tag": "HELLO"}, + {"base_text": "Sir", "base_tag": "HELLO"}, + {"base_text": "Dear Madam", "base_tag": "HELLO"}, + {"base_text": "Dear Mr", "base_tag": "HELLO"}, + {"base_text": "Dear Ms.", "base_tag": "HELLO"}, + {"base_text": "Dear miss", "base_tag": "HELLO"}, + {"base_text": "Dear mrs.", "base_tag": "HELLO"}, + {"base_text": "Dear sir or madam", "base_tag": "HELLO"}, + {"base_text": "To whom it may concern", "base_tag": "HELLO"}, + ], + id="english hello", + ), + pytest.param( + ( + "VP of Data Science\n" + "Chief of staff\n" + "CTO at TestMelusine\n" + "CEOABC test\n" + "Lead business developer\n" + ), + [ + {"base_text": "VP of Data Science", "base_tag": "SIGNATURE"}, + {"base_text": "Chief of staff", "base_tag": "SIGNATURE"}, + {"base_text": "CTO at TestMelusine", "base_tag": "SIGNATURE"}, + {"base_text": "CEOABC test", "base_tag": "BODY"}, + {"base_text": "Lead business developer", "base_tag": "SIGNATURE"}, + ], + id="english job signature patterns", + ), + pytest.param( + ( + "9 downing street\n" + "4-6 Beverly Hill\n" + "4 Abbey road W24RA\n" + "3 Ocean Rd.\n" + "5th avenue\n" + "221b Baker St.\n" + "6bis River ln.\n" + "7 Winter lane\n" + ), + [ + {"base_text": "9 downing street", "base_tag": "SIGNATURE"}, + {"base_text": "4-6 Beverly Hill", "base_tag": "SIGNATURE"}, + {"base_text": "4 Abbey road W24RA", "base_tag": "SIGNATURE"}, + {"base_text": "3 Ocean Rd.", "base_tag": "SIGNATURE"}, + {"base_text": "5th avenue", "base_tag": "SIGNATURE"}, + {"base_text": "221b Baker St.", "base_tag": "SIGNATURE"}, + {"base_text": "6bis River ln.", "base_tag": "SIGNATURE"}, + {"base_text": "7 Winter lane", "base_tag": "SIGNATURE"}, + ], + id="english adsress signature patterns", + ), + ], +) +def test_tag_text_english(text, expected_tags): + # Arrange + tagger = ContentTagger() + + # Act + output_tags = tagger.tag_text(text) + + # Assert + for tag_data in expected_tags: + if "base_tag_list" not in tag_data: + tag_data["base_tag_list"] = 
[tag_data["base_tag"]] + assert output_tags == expected_tags + + +def test_tag_list(): + # Arrange + # Limit tags to "HELLO" and the default tag ("BODY") + tag_list = ["HELLO"] + + # Text segment (= individual message in an email conversation) + text = "bonjour\nblah blah blah\nmerci\ncordialement" + + # Expected tags + expected_tags = [ + {"base_text": "bonjour", "base_tag": "HELLO"}, + {"base_text": "blah blah blah", "base_tag": "BODY"}, + {"base_text": "merci", "base_tag": "BODY"}, + {"base_text": "cordialement", "base_tag": "BODY"}, + ] + + # Instantiate and apply the Tagger + tagger = ContentTagger(tag_list=tag_list) + + # Act + output_tags = tagger.tag_text(text) + + # Assert + for tag_data in expected_tags: + if "base_tag_list" not in tag_data: + tag_data["base_tag_list"] = [tag_data["base_tag"]] + assert output_tags == expected_tags + + +def test_undefined_tag(): + unknown_tag = "UNKNOWN_TAG" + + # Setup an unknown tag + tag_list = [unknown_tag] + + # Instantiate Tagger + with pytest.raises(ValueError, match=rf".*{unknown_tag}.*"): + _ = ContentTagger(tag_list=tag_list) + + +def test_unsupported_type(): + class MyClass(ContentTagger): + """Test class""" + + @Tag + def TEST_TAG(self): + """Test method""" + return 3.3 + + with pytest.raises(ValueError, match="supported types"): + _ = MyClass() + + +def test_compiled_pattern(): + class MyClass(ContentTagger): + """Test class""" + + @Tag + def TEST_TAG(self): + """Test method""" + return re.compile(r"cool_pattern") + + tagger = MyClass() + subtext, tag, match = tagger("cool_pattern is what I am looking for")[0] + + # Check tag result + assert tag == "TEST_TAG" + + +def test_str_pattern(): + class MyClass(ContentTagger): + """Test class""" + + @Tag + def TEST_TAG(self): + """Test method""" + return r"cool_pattern" + + tagger = MyClass() + subtext, tag, match = tagger("cool_pattern is what I am looking for")[0] + + # Check tag result + assert tag == "TEST_TAG" + + +def test_malformed_regex(): + from 
melusine.processors import Tag + + malformed_regex = r"[*." + + # Create a tagger containing an ill defined Tag (malformed regex) + class CustomTagger(ContentTagger): + """Test class""" + + @Tag + def HELLO(self): + """Test method""" + return malformed_regex + + # Instantiate Tagger + with pytest.raises(ValueError, match=rf"Invalid regex"): + _ = CustomTagger() + + +def test_direct_tagging(): + tagger = ContentTagger() + match = tagger["HELLO"].match("Bonjour") + + assert bool(match) + + +def test_call_method(): + tagger = ContentTagger() + + match_list = tagger("Bonjour a tous") + subtext, tag, regex = match_list[0] + + assert tag == "HELLO" + + +@pytest.mark.parametrize( + "text, n_words, word_character_only, expected_match", + [ + pytest.param("Hello you", 4, False, True, id="4 words match"), + pytest.param("Hello how are you today", 4, False, False, id="4 words no match"), + pytest.param("Hello! you?", 4, False, True, id="4 words match with special characters"), + pytest.param( + "Hello! 
you?", 4, True, False, id="4 words match with special characters (word character only True)" + ), + ], +) +def test_word_blocks(text, n_words, word_character_only, expected_match): + regex = BaseContentTagger.word_block(n_words, word_character_only=word_character_only) + + search_regex = r"^" + regex + r"$" + match = bool(re.search(search_regex, text)) + assert match == expected_match diff --git a/tests/processors/test_content_tagger.py b/tests/processors/test_content_tagger.py deleted file mode 100644 index 94dc535..0000000 --- a/tests/processors/test_content_tagger.py +++ /dev/null @@ -1,787 +0,0 @@ -import re - -import pytest - -from melusine.message import Message -from melusine.processors import BaseContentTagger, ContentTagger, Tag - - -def test_content_tagger(): - # Text segments (= individual messages in an email conversation) - text_segments = [ - "Envoye de mon iphone", - ("Bonjour Mme X,\nSuite a blh blah blah\n" "Bien cordialement\nJane Dupond\n" "(See attached file: flex.jpg)"), - ( - "Bonjour,\nVeuillez trouver ci-joint blah\n" - "Merci d'avance,\nCordialement,\n" - "Toute modification, edition, utilisation ou diffusion non autorisee est interdite" - ), - ] - - # Expected tags - expected_tags = [ - [ - ("FOOTER", "Envoye de mon iphone"), - ], - [ - ("HELLO", "Bonjour Mme X,"), - ("BODY", "Suite a blh blah blah"), - ("GREETINGS", "Bien cordialement"), - ("SIGNATURE_NAME", "Jane Dupond"), - ("PJ", "(See attached file: flex.jpg)"), - ], - [ - ("HELLO", "Bonjour,"), - ("BODY", "Veuillez trouver ci-joint blah"), - ("THANKS", "Merci d'avance,"), - ("GREETINGS", "Cordialement,"), - ( - "FOOTER", - "Toute modification, edition, utilisation ou diffusion non autorisee est interdite", - ), - ], - ] - - # Mock the output of a Segmenter (List of Message object) - messages = [Message(text=segment) for segment in text_segments] - - # Instantiate and apply the Tagger - tagger = ContentTagger() - output_messages = tagger.tag_email(messages) - - # Test output tags - 
output_tags = [x.tags for x in output_messages] - assert output_tags == expected_tags - - -def test_tag_null_message(): - messages = None - - # Instantiate and apply the Tagger - tagger = ContentTagger() - output_messages = tagger.tag_email(messages) - - assert output_messages is None - - -@pytest.mark.parametrize( - "text, expected_parts", - [ - ( - "Bonjour, merci pour votre message!\nComment-allez vous?! Je suis satisfait!!!\n" - "Bien cordialement\n\n\n\nJane Dupond\n", - [ - "Bonjour,", - "merci pour votre message!", - "Comment-allez vous?!", - "Je suis satisfait!!!", - "Bien cordialement", - "Jane Dupond", - ], - ), - ], -) -def test_content_tagger_split_text(text, expected_parts): - # Instantiate and apply the Tagger - tagger = ContentTagger() - output_parts = tagger.split_text(text) - - assert output_parts == expected_parts - - -@pytest.mark.parametrize( - "text, expected_tags", - [ - ( - "Bonjour Mme X,\nSuite a blh blah blah.\n" - "Bien cordialement\nJane Dupond\n" - "(See attached file: flex.jpg)", - [ - ("HELLO", "Bonjour Mme X,"), - ("BODY", "Suite a blh blah blah."), - ("GREETINGS", "Bien cordialement"), - ("SIGNATURE_NAME", "Jane Dupond"), - ("PJ", "(See attached file: flex.jpg)"), - ], - ), - ( - "Bonjour, je confirme le rdv. 
Cordialement, John Smith", - [ - ("HELLO", "Bonjour,"), - ("BODY", "je confirme le rdv."), - ("GREETINGS", "Cordialement, John Smith"), - ], - ), - ( - ( - "Bonjour,\nSuite a notre intervention du 16.02.22 , un taux d'humidité de 50% a été relevé.\n" - "Cordialement.\n177, rue de la fée - 75000 Paris.\n" - "Horaires : du lundi au jeudi de 08h00 à 16h30 et le vendredi de 08h00 à 16h00.\n" - "Tel : 01.45.53.11.33" - ), - [ - ("HELLO", "Bonjour,"), - ("BODY", "Suite a notre intervention du 16.02.22 , un taux d'humidité de 50% a été relevé."), - ("GREETINGS", "Cordialement."), - ("SIGNATURE", "177, rue de la fée - 75000 Paris."), - ("BODY", "Horaires : du lundi au jeudi de 08h00 à 16h30 et le vendredi de 08h00 à 16h00."), - ("SIGNATURE", "Tel : 01.45.53.11.33"), - ], - ), - ( - ( - "bonjour\n" - "15 jours après les premières réparations, un défaut a été détecté. " - "Bien à vous\n" - "Britney Spears" - ), - [ - ("HELLO", "bonjour"), - ("BODY", "15 jours après les premières réparations, un défaut a été détecté."), - ("GREETINGS", "Bien à vous"), - ("SIGNATURE_NAME", "Britney Spears"), - ], - ), - ( - ( - "Bonjour monsieur Smith\n" - "merci. Bien à vous\n" - "Britney Spears\n" - "22 hollywood boulevard\n" - "79000 Niort\n" - ), - [ - ("HELLO", "Bonjour monsieur Smith"), - ("THANKS", "merci."), - ("GREETINGS", "Bien à vous"), - ("SIGNATURE_NAME", "Britney Spears"), - ("SIGNATURE", "22 hollywood boulevard"), - ("SIGNATURE", "79000 Niort"), - ], - ), - ( - ( - "Merci de me faire suivre les docs à ma nouvelle adresse qui est 0 rue du parc, 75000 Paris. " - "Merci d'avance. 
\nAcceptez notre salutation," - ), - [ - ("BODY", "Merci de me faire suivre les docs à ma nouvelle adresse qui est 0 rue du parc, 75000 Paris."), - ("THANKS", "Merci d'avance."), - ("GREETINGS", "Acceptez notre salutation,"), - ], - ), - ( - ( - "Bonjour\n" - "Je vous relance concernant ma télévision avec le devis en PJ.\n" - "Désolé pour la qualité.\n" - "Je l'ai envoyé à partir de mon ordi.\n" - "Excellente journée à vous,\n" - "Bon we\n" - "Votre bien dévoué\n" - "amicalement votre\n" - "Cordiales salutations.\n" - "Françoise-Bénédicte Dupond\n" - "Envoyé à partir de \nCourrier \npour Windows" - ), - [ - ("HELLO", "Bonjour"), - ("BODY", "Je vous relance concernant ma télévision avec le devis en PJ."), - ("BODY", "Désolé pour la qualité."), - ("BODY", "Je l'ai envoyé à partir de mon ordi."), - ("HELLO", "Excellente journée à vous,"), - ("HELLO", "Bon we"), - ("GREETINGS", "Votre bien dévoué"), - ("GREETINGS", "amicalement votre"), - ("GREETINGS", "Cordiales salutations."), - ("SIGNATURE_NAME", "Françoise-Bénédicte Dupond"), - ("FOOTER", "Envoyé à partir de"), - ("FOOTER", "Courrier"), - ("FOOTER", "pour Windows"), - ], - ), - ( - "C'est bien note, merci beaucoup.\nSentiments dévoués.\nTélécharger \nOutlook pour Android", - [ - ("THANKS", "C'est bien note, merci beaucoup."), - ("GREETINGS", "Sentiments dévoués."), - ("FOOTER", "Télécharger"), - ("FOOTER", "Outlook pour Android"), - ], - ), - ( - "Impeccable, je vous remercie beaucoup pour votre rapidité.\nObtenir\nOutlook pour Android", - [ - ("THANKS", "Impeccable, je vous remercie beaucoup pour votre rapidité."), - ("FOOTER", "Obtenir"), - ("FOOTER", "Outlook pour Android"), - ], - ), - ( - ( - "Cher Monsieur,\nJe vous confirme la bonne réception de votre précédent email.\n" - "Je vous en remercie.\nBien cordialement,\nJohn Smith" - ), - [ - ("HELLO", "Cher Monsieur,"), - ("BODY", "Je vous confirme la bonne réception de votre précédent email."), - ("THANKS", "Je vous en remercie."), - ("GREETINGS", "Bien 
cordialement,"), - ("SIGNATURE_NAME", "John Smith"), - ], - ), - ( - ( - "chère madame,\n" - "URGENT URGENT\n" - "Merci de me faire suivre les docs à ma nouvelle adresse qui est 0 rue du parc, 75000 Paris. " - "Merci d'avance. \nRecevez nos salutations,\nVous en souhaitant bonne réception" - ), - [ - ("HELLO", "chère madame,"), - ("BODY", "URGENT URGENT"), - ("BODY", "Merci de me faire suivre les docs à ma nouvelle adresse qui est 0 rue du parc, 75000 Paris."), - ("THANKS", "Merci d'avance."), - ("GREETINGS", "Recevez nos salutations,"), - ("GREETINGS", "Vous en souhaitant bonne réception"), - ], - ), - pytest.param( - "Un témoignage sous X\nEnvoyé depuis mon téléphone Orange", - [ - ("BODY", "Un témoignage sous X"), - ("FOOTER", "Envoyé depuis mon téléphone Orange"), - ], - id="Edge case where a line ends with an isolated character", - ), - pytest.param( - " ??\n !??!", - [ - ("BODY", "??!??!"), - ], - id="Edge case where the two first lines are missing word characters", - ), - ( - "Bonjour Mme X,\nSuite a blh blah blah.\n" - "Bien cordialement\nJane Dupond\n" - "(See attached file: flex.jpg)", - [ - ("HELLO", "Bonjour Mme X,"), - ("BODY", "Suite a blh blah blah."), - ("GREETINGS", "Bien cordialement"), - ("SIGNATURE_NAME", "Jane Dupond"), - ("PJ", "(See attached file: flex.jpg)"), - ], - ), - ( - "\nChère Madame\n\nC'est bien noté, merci\nBien reçu\nJ.Smith\n\n", - [ - ("HELLO", "Chère Madame"), - ("THANKS", "C'est bien noté, merci"), - ("BODY", "Bien reçu"), - ("SIGNATURE_NAME", "J.Smith"), - ], - ), - ( - "\nBonjour Monsieur, ceci n'est pas un hello\nBonne fin de journee\nsalutations", - [ - ("BODY", "Bonjour Monsieur, ceci n'est pas un hello"), - ("HELLO", "Bonne fin de journee"), - ("GREETINGS", "salutations"), - ], - ), - ( - "\nBonjour Monsieur Stanislas von den hoeggenboord\n\nbien à toi\nJ. Smith\nChargé de clientèle", - [ - ("HELLO", "Bonjour Monsieur Stanislas von den hoeggenboord"), - ("GREETINGS", "bien à toi"), - ("SIGNATURE_NAME", "J. 
Smith"), - ("SIGNATURE", "Chargé de clientèle"), - ], - ), - ( - ( - "\n1 rdv à 18h\n\n2 ème message laissé à la locataire\n3je m'en vais au bois\n" - "4 allée des iris\n 5bis rue Patrick Sebastien\n6-8 cours mirabeau\n 7 ter place du dahu\n" - "8 de la rue très longue qui ne doit pas être taggée signature" - ), - [ - ("BODY", "1 rdv à 18h"), - ("BODY", "2 ème message laissé à la locataire"), - ("BODY", "3je m'en vais au bois"), - ("SIGNATURE", "4 allée des iris"), - ("SIGNATURE", "5bis rue Patrick Sebastien"), - ("SIGNATURE", "6-8 cours mirabeau"), - ("SIGNATURE", "7 ter place du dahu"), - ("BODY", "8 de la rue très longue qui ne doit pas être taggée signature"), - ], - ), - ( - ( - "à L'attention de M Bob,\n" - "Bonjour,\n" - "Je vous informe que je vais accepter la proposition de L , à savoir le paiement d'une indemnité forfaitaire de résiliation du CCMI de 4000 € TTC pour clore cette affaire.\n" - "Cordialement.\n" - "Bob Smith" - ), - [ - ("FOOTER", "à L'attention de M Bob,"), - ("HELLO", "Bonjour,"), - ( - "BODY", - "Je vous informe que je vais accepter la proposition de L , à savoir le paiement d'une indemnité forfaitaire de résiliation du CCMI de 4000 € TTC pour clore cette affaire.", - ), - ("GREETINGS", "Cordialement."), - ("SIGNATURE_NAME", "Bob Smith"), - ], - ), - ( - ( - "Monsieur Bob Smith\n" - "Adresse mail : BobSmith90@gmail.com\n" - "Lucy Ange\n\n" - "Bonjour Monsieur,\n" - "Suite à notre entretien téléphonique de ce matin, et au message que vous m'avez envoyé sur ma messagerie, je voudrais effectuer la réparation du véhicule Renault Twingo dans un garage partenaire de la Maif situé, si c'est possible.\n" - "Dans l'attente de votre réponse et en vous remerciant par avance,\n\n\n" - "Monsieur Bob Smith\n\n\n" - "Envoyé à partir de\n" - "Courrier\npour Windows\n\n\n\n" - "Sans virus.\nwww.avast.com" - ), - [ - ("HELLO", "Monsieur Bob Smith"), - ("SIGNATURE", "Adresse mail : BobSmith90@gmail.com"), - ("SIGNATURE_NAME", "Lucy Ange"), - ("HELLO", 
"Bonjour Monsieur,"), - ( - "BODY", - "Suite à notre entretien téléphonique de ce matin, et au message que vous m'avez envoyé sur ma messagerie, je voudrais effectuer la réparation du véhicule Renault Twingo dans un garage partenaire de la Maif situé, si c'est possible.", - ), - ("BODY", "Dans l'attente de votre réponse et en vous remerciant par avance,"), - ("HELLO", "Monsieur Bob Smith"), - ("FOOTER", "Envoyé à partir de"), - ("FOOTER", "Courrier"), - ("FOOTER", "pour Windows"), - ("FOOTER", "Sans virus."), - ("FOOTER", "www.avast.com"), - ], - ), - ( - ( - "Bob Smith\n\n\n" - "A l’attention de Madame Lucy Ange,\n\n\n\n\n\n" - "Bonjour Madame Ange,\n\n\n\n\n\n\n\n\n" - "J’espère que vous allez bien.\n\n\n\n\n\n" - "Pour faire suite à mon mail du 21 février 2023, je me permets de revenir vers vous pour avoir votre avis sur le devis que j’ai demandé auprès d’un enquêteur.\n\n\n\n" - "Voici son retour :\n\n\n\n\n\n" - "Qu’en pensez-vous svp ?\n\n\n\n\n\n" - "Je reste à votre disposition pour tout complément d’information et vous remercie de l’intérêt que vous porterez à ma demande,\n\n\n\n\n\n" - "Bien Cordialement,\n\n\n\n\n\n" - "Bob Smith\n\n\n" - "Tél. 
06.83.22.95.94" - ), - [ - ("SIGNATURE_NAME", "Bob Smith"), - ("FOOTER", "A l’attention de Madame Lucy Ange,"), - ("HELLO", "Bonjour Madame Ange,"), - ("BODY", "J’espère que vous allez bien."), - ( - "BODY", - "Pour faire suite à mon mail du 21 février 2023, je me permets de revenir vers vous pour avoir votre avis sur le devis que j’ai demandé auprès d’un enquêteur.", - ), - ("BODY", "Voici son retour :"), - ("BODY", "Qu’en pensez-vous svp ?"), - ( - "BODY", - "Je reste à votre disposition pour tout complément d’information et vous remercie de l’intérêt que vous porterez à ma demande,", - ), - ("GREETINGS", "Bien Cordialement,"), - ("SIGNATURE_NAME", "Bob Smith"), - ("SIGNATURE", "Tél."), - ("SIGNATURE", "06.83.22.95.94"), - ], - ), - pytest.param( - ( - "cordialement\nContact e-mail\n\n\nContact téléphone\n\n01 23 45 67 89 / abcabc@hotmail.fr\n" - "Torroella de Montgri, le 5 avril 2023\nLes formats de fichiers acceptés sont : PDF, DOC, DOCX, JPEG, " - "JPG, TIFF, TXT, ODT, XLS, XLSX\nTout autre format de fichiers ne sera pas transmis au dossier" - ), - [ - ("GREETINGS", "cordialement"), - ("SIGNATURE", "Contact e-mail"), - ("SIGNATURE", "Contact téléphone"), - ("SIGNATURE", "01 23 45 67 89 / abcabc@hotmail.fr"), - ("SIGNATURE", "Torroella de Montgri, le 5 avril 2023"), - ( - "FOOTER", - "Les formats de fichiers acceptés sont : PDF, DOC, DOCX, JPEG, JPG, TIFF, TXT, ODT, XLS, XLSX", - ), - ("FOOTER", "Tout autre format de fichiers ne sera pas transmis au dossier"), - ], - id="diverse_signature_patterns", - ), - pytest.param( - ( - "bonjour\nmon body\nJ. Smith\n\n01 23 45 67 89\nSecrétaire en charge des avions\n" - "Business Analyst – Tribu Sinistres – Squad Flux Entrants\n" - "Société nationale des chemins de fer\nConseiller MAIF\nGestionnaire sinistre - C99G\n" - "Service des lettres anonymes\nTechnicienne de gestion - EQUIPE ABC\n" - ), - [ - ("HELLO", "bonjour"), - ("BODY", "mon body"), - ("SIGNATURE_NAME", "J. 
Smith"), - ("SIGNATURE", "01 23 45 67 89"), - ("SIGNATURE", "Secrétaire en charge des avions"), - ("SIGNATURE", "Business Analyst – Tribu Sinistres – Squad Flux Entrants"), - ("SIGNATURE", "Société nationale des chemins de fer"), - ("SIGNATURE", "Conseiller MAIF"), - ("SIGNATURE", "Gestionnaire sinistre - C99G"), - ("SIGNATURE", "Service des lettres anonymes"), - ("SIGNATURE", "Technicienne de gestion - EQUIPE ABC"), - ], - id="signature_jobs", - ), - pytest.param( - ( - "bonjour\nmon body\nCordialement\n\n" - "analyste -------------------------------------- test test test test test test test\n" - ), - [ - ("HELLO", "bonjour"), - ("BODY", "mon body"), - ("GREETINGS", "Cordialement"), - ("BODY", "analyste -------------------------------------- test test test test test test test"), - ], - id="check_catastrophic_backtracking", - ), - ], -) -def test_tag_text_generic(text, expected_tags): - # Instantiate and apply the Tagger - tagger = ContentTagger() - output_tags = tagger.tag_text(text) - # Test output tags - assert output_tags == expected_tags - - -@pytest.mark.parametrize( - "text, expected_tags", - [ - pytest.param( - ( - "Merci\n" - "Je vous remercie\n" - "Merci d'avance\n" - "Je vous remercie par avance\n" - "Vous en remerciant par avance.\n" - ), - [ - ("THANKS", "Merci"), - ("THANKS", "Je vous remercie"), - ("THANKS", "Merci d'avance"), - ("THANKS", "Je vous remercie par avance"), - ("THANKS", "Vous en remerciant par avance."), - ], - id="french thanks patterns", - ), - ], -) -def test_tag_text_french(text, expected_tags): - # Instantiate and apply the Tagger - tagger = ContentTagger() - output_tags = tagger.tag_text(text) - # Test output tags - assert output_tags == expected_tags - - -@pytest.mark.parametrize( - "text, expected_tags", - [ - pytest.param( - ( - "Thank you so much\n" - "thanks\n" - "thx Joanna\n" - "thanks but you forgot bla\n" - "Thx however I still need the document\n" - ), - [ - ("THANKS", "Thank you so much"), - ("THANKS", "thanks"), - 
("THANKS", "thx Joanna"), - ("BODY", "thanks but you forgot bla"), - ("BODY", "Thx however I still need the document"), - ], - id="english thanks patterns", - ), - pytest.param( - ( - "Best\n" - "warm Wishes\n" - "regards\n" - "best regards\n" - "cheers\n" - "yours\n" - "yours truly\n" - "Sincerely\n" - "see you soon\n" - "Speak to you soon\n" - "talk soon\n" - "Take care\n" - "Catch you later\n" - "Have a fantastic day\n" - "Looking forward to your reply\n" - "I am looking forward to hearing from you\n" - "Hoping to hear from you\n" - ), - [ - ("GREETINGS", "Best"), - ("GREETINGS", "warm Wishes"), - ("GREETINGS", "regards"), - ("GREETINGS", "best regards"), - ("GREETINGS", "cheers"), - ("GREETINGS", "yours"), - ("GREETINGS", "yours truly"), - ("GREETINGS", "Sincerely"), - ("GREETINGS", "see you soon"), - ("GREETINGS", "Speak to you soon"), - ("GREETINGS", "talk soon"), - ("GREETINGS", "Take care"), - ("GREETINGS", "Catch you later"), - ("GREETINGS", "Have a fantastic day"), - ("GREETINGS", "Looking forward to your reply"), - ("GREETINGS", "I am looking forward to hearing from you"), - ("GREETINGS", "Hoping to hear from you"), - ], - id="english greetings", - ), - pytest.param( - ( - "Hello John\n" - "hi\n" - "Hi there\n" - "good to hear from you\n" - "it is good to hear from you\n" - "I hope you are having a great week\n" - "how are you doing\n" - "how are you positioned about the matter\n" - "i hope you are doing well\n" - "Good Morning Joanna\n" - "good Afternoon\n" - "Dear Jacky\n" - "Sir\n" - "Dear Madam\n" - "Dear Mr\n" - "Dear Ms.\n" - "Dear miss\n" - "Dear mrs.\n" - "Dear sir or madam\n" - "To whom it may concern\n" - ), - [ - ("HELLO", "Hello John"), - ("HELLO", "hi"), - ("HELLO", "Hi there"), - ("HELLO", "good to hear from you"), - ("HELLO", "it is good to hear from you"), - ("HELLO", "I hope you are having a great week"), - ("HELLO", "how are you doing"), - ("BODY", "how are you positioned about the matter"), - ("HELLO", "i hope you are doing well"), - 
("HELLO", "Good Morning Joanna"), - ("HELLO", "good Afternoon"), - ("HELLO", "Dear Jacky"), - ("HELLO", "Sir"), - ("HELLO", "Dear Madam"), - ("HELLO", "Dear Mr"), - ("HELLO", "Dear Ms."), - ("HELLO", "Dear miss"), - ("HELLO", "Dear mrs."), - ("HELLO", "Dear sir or madam"), - ("HELLO", "To whom it may concern"), - ], - id="english hello", - ), - pytest.param( - ( - "VP of Data Science\n" - "Chief of staff\n" - "CTO at TestMelusine\n" - "CEOABC test\n" - "Lead business developer\n" - ), - [ - ("SIGNATURE", "VP of Data Science"), - ("SIGNATURE", "Chief of staff"), - ("SIGNATURE", "CTO at TestMelusine"), - ("BODY", "CEOABC test"), - ("SIGNATURE", "Lead business developer"), - ], - id="english job signature patterns", - ), - pytest.param( - ( - "9 downing street\n" - "4-6 Beverly Hill\n" - "4 Abbey road W24RA\n" - "3 Ocean Rd.\n" - "5th avenue\n" - "221b Baker St.\n" - "6bis River ln.\n" - "7 Winter lane\n" - ), - [ - ("SIGNATURE", "9 downing street"), - ("SIGNATURE", "4-6 Beverly Hill"), - ("SIGNATURE", "4 Abbey road W24RA"), - ("SIGNATURE", "3 Ocean Rd."), - ("SIGNATURE", "5th avenue"), - ("SIGNATURE", "221b Baker St."), - ("SIGNATURE", "6bis River ln."), - ("SIGNATURE", "7 Winter lane"), - ], - id="english adsress signature patterns", - ), - ], -) -def test_tag_text_english(text, expected_tags): - # Instantiate and apply the Tagger - tagger = ContentTagger() - output_tags = tagger.tag_text(text) - # Test output tags - assert output_tags == expected_tags - - -def test_tag_list(): - # Limit tags to "HELLO" and the default tag ("BODY") - tag_list = ["HELLO"] - - # Text segment (= individual message in an email conversation) - text = "bonjour\nblah blah blah\nmerci\ncordialement" - - # Expected tags - expected_tags = [ - ("HELLO", "bonjour"), - ("BODY", "blah blah blah"), - ("BODY", "merci"), - ("BODY", "cordialement"), - ] - - # Instantiate and apply the Tagger - tagger = ContentTagger(tag_list=tag_list) - output_tags = tagger.tag_text(text) - - # Test output tags - 
assert expected_tags == output_tags - - -def test_undefined_tag(): - unknown_tag = "UNKNOWN_TAG" - - # Setup an unknown tag - tag_list = [unknown_tag] - - # Instantiate Tagger - with pytest.raises(ValueError, match=rf".*{unknown_tag}.*"): - _ = ContentTagger(tag_list=tag_list) - - -def test_unsupported_type(): - class MyClass(ContentTagger): - """Test class""" - - @Tag - def TEST_TAG(self): - """Test method""" - return 3.3 - - with pytest.raises(ValueError, match="supported types"): - _ = MyClass() - - -def test_compiled_pattern(): - class MyClass(ContentTagger): - """Test class""" - - @Tag - def TEST_TAG(self): - """Test method""" - return re.compile(r"cool_pattern") - - tagger = MyClass() - subtext, tag, match = tagger("cool_pattern is what I am looking for")[0] - - # Check tag result - assert tag == "TEST_TAG" - - -def test_str_pattern(): - class MyClass(ContentTagger): - """Test class""" - - @Tag - def TEST_TAG(self): - """Test method""" - return r"cool_pattern" - - tagger = MyClass() - subtext, tag, match = tagger("cool_pattern is what I am looking for")[0] - - # Check tag result - assert tag == "TEST_TAG" - - -def test_malformed_regex(): - from melusine.processors import Tag - - malformed_regex = r"[*." 
- - # Create a tagger containing an ill defined Tag (malformed regex) - class CustomTagger(ContentTagger): - """Test class""" - - @Tag - def HELLO(self): - """Test method""" - return malformed_regex - - # Instantiate Tagger - with pytest.raises(ValueError, match=rf"Invalid regex"): - _ = CustomTagger() - - -def test_direct_tagging(): - tagger = ContentTagger() - match = tagger["HELLO"].match("Bonjour") - - assert bool(match) - - -def test_call_method(): - tagger = ContentTagger() - - match_list = tagger("Bonjour a tous") - subtext, tag, regex = match_list[0] - - assert tag == "HELLO" - - -@pytest.mark.parametrize( - "text, n_words, word_character_only, expected_match", - [ - pytest.param("Hello you", 4, False, True, id="4 words match"), - pytest.param("Hello how are you today", 4, False, False, id="4 words no match"), - pytest.param("Hello! you?", 4, False, True, id="4 words match with special characters"), - pytest.param( - "Hello! you?", 4, True, False, id="4 words match with special characters (word character only True)" - ), - ], -) -def test_word_blocks(text, n_words, word_character_only, expected_match): - regex = BaseContentTagger.word_block(n_words, word_character_only=word_character_only) - - search_regex = r"^" + regex + r"$" - match = bool(re.search(search_regex, text)) - assert match == expected_match diff --git a/tests/processors/test_processors.py b/tests/processors/test_processors.py index c6aaae5..03cac2a 100644 --- a/tests/processors/test_processors.py +++ b/tests/processors/test_processors.py @@ -161,7 +161,7 @@ def test_segmenter(input_text, expected_messages): ), ( [ - Message(meta="", text="Merci", tags=[("THANKS", "Merci")]), + Message(meta="", text="Merci", tags=[{"base_tag": "THANKS", "base_text": "Merci"}]), ], "Merci", ), @@ -184,9 +184,21 @@ def test_text_extractor_error(): def test_text_extractor_multiple_messages(): """Unit test""" message_list = [ - Message(meta="", text="", tags=[("BODY", "A"), ("GREETINGS", "G"), ("BODY", "A")]), - 
Message(meta="", text="", tags=[("BODY", "B"), ("BODY", "B"), ("BODY", "B")]), - Message(meta="", text="", tags=[("GREETINGS", "G"), ("BODY", "C"), ("BODY", "C")]), + Message(meta="", text="", tags=[ + {"base_text": "A", "base_tag": "BODY"}, + {"base_text": "G", "base_tag": "GREETINGS"}, + {"base_text": "A", "base_tag": "BODY"}, + ]), + Message(meta="", text="", tags=[ + {"base_text": "B", "base_tag": "BODY"}, + {"base_text": "B", "base_tag": "BODY"}, + {"base_text": "B", "base_tag": "BODY"}, + ]), + Message(meta="", text="", tags=[ + {"base_text": "G", "base_tag": "GREETINGS"}, + {"base_text": "C", "base_tag": "BODY"}, + {"base_text": "C", "base_tag": "BODY"}, + ]), ] expected_output = "A\nB\nB\nB" @@ -206,8 +218,15 @@ def test_text_extractor_with_tags(): Message(meta="", text="Bonjour\nblahblah\nMerci"), Message(meta="", text="Bonjour2\nMerci2"), ] - input_message_list[0].tags = [("HELLO", "Bonjour"), ("CUSTOM_TAG", "blahblah"), ("THANKS", "Merci")] - input_message_list[1].tags = [("HELLO", "Bonjour2"), ("THANKS", "Merci2")] + input_message_list[0].tags = [ + {"base_text": "Bonjour", "base_tag": "HELLO"}, + {"base_text": "blahblah", "base_tag": "CUSTOM_TAG"}, + {"base_text": "Merci", "base_tag": "THANKS"}, + ] + input_message_list[1].tags = [ + {"base_text": "Bonjour2", "base_tag": "HELLO"}, + {"base_text": "Merci2", "base_tag": "THANKS"}, + ] extractor = TextExtractor( output_columns="text", @@ -331,7 +350,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: "A: avocats@test.fr; BOB Morane \n" "Objet: dossier Test ,\n", text="bla bla bla", - tags=[("BODY", "bla bla bla")], + tags=[ + {"base_text": "bla bla bla", "base_tag": "BODY"}, + ], ) ], "('FOOTER', 'SIGNATURE')", @@ -345,7 +366,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: Message( meta="", text="Envoyé depuis mon Iphone", - tags=[("FOOTER", "Envoyé depuis mon Iphone")], + tags=[ + {"base_text": "Envoyé depuis mon Iphone", "base_tag": "FOOTER"}, + 
] ), Message( meta="De: test.test@test.fr \n" @@ -353,7 +376,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: "A: avocats@test.fr; BOB Morane \n" "Objet: dossier Test ,\n", text="bla bla bla", - tags=[("BODY", "bla bla bla")], + tags=[ + {"base_text": "bla bla bla", "base_tag": "BODY"}, + ], ), ], "('FOOTER', 'SIGNATURE')", @@ -368,14 +393,16 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: meta="", text="Jane Doe\n4 rue des oliviers 75001 Ville", tags=[ - ("SIGNATURE", "4 rue des oliviers 75001 Ville"), + {"base_text": "4 rue des oliviers 75001 Ville", "base_tag": "SIGNATURE"}, ], ), Message( meta="De :\ntest.test42@test.fr\nEnvoyé :\nvendredi 03 mars 2023 14:28\nÀ :" "\nana@test.fr\nObjet :\nTEST", text="bla bla bla", - tags=[("BODY", "bla bla bla")], + tags=[ + {"base_text": "bla bla bla", "base_tag": "BODY"}, + ], ), ], "('FOOTER', 'SIGNATURE')", @@ -390,15 +417,17 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: meta="", text="Jane Doe\n4 rue des oliviers 75001 Ville", tags=[ - ("SIGNATURE_NAME", "Jane Doe"), - ("SIGNATURE", "4 rue des oliviers 75001 Ville"), + {"base_text": "Jane Doe", "base_tag": "SIGNATURE_NAME"}, + {"base_text": "4 rue des oliviers 75001 Ville", "base_tag": "SIGNATURE"}, ], ), Message( meta="De :\ntest.test42@test.fr\nEnvoyé :\nvendredi 03 mars 2023 14:28\nÀ :" "\nana@test.fr\nObjet :\nTEST", text="bla bla bla", - tags=[("BODY", "bla bla bla")], + tags=[ + {"base_text": "bla bla bla", "base_tag": "BODY"}, + ], ), ], "('FOOTER', 'SIGNATURE')", @@ -415,7 +444,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: "À:\nbob@test.fr\nObjet:\nTR : 1223456" ), text="bla bla bla", - tags=[("BODY", "bla bla bla")], + tags=[ + {"base_text": "bla bla bla", "base_tag": "BODY"}, + ], ), ], "('FOOTER', 'SIGNATURE')", @@ -432,7 +463,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: "À:\nbob@test.fr\nObjet:\nTR : 
1223456" ), text="bla bla bla", - tags=[("BODY", "bla bla bla")], + tags=[ + {"base_text": "bla bla bla", "base_tag": "BODY"}, + ], ), ], "('FOOTER', 'SIGNATURE')", @@ -446,7 +479,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: Message( meta=("Le 2 mars 2023 à 18:18, Bob a écrit :"), text="bla bla bla", - tags=[("BODY", "bla bla bla")], + tags=[ + {"base_text": "bla bla bla", "base_tag": "BODY"}, + ], ), ], "('FOOTER', 'SIGNATURE')", @@ -460,7 +495,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: Message( meta=("Le 01/01/2001 11:14, test.test.test@test.fr a écrit :"), text="bla bla bla", - tags=[("BODY", "bla bla bla")], + tags=[ + {"base_text": "bla bla bla", "base_tag": "BODY"}, + ], ), ], "('FOOTER', 'SIGNATURE')", @@ -474,7 +511,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: Message( meta=("Le 01/01/2001 11:14, test.test.test@test.fr a écrit :"), text="bla bla bla", - tags=[("BODY", "bla bla bla")], + tags=[ + {"base_text": "bla bla bla", "base_tag": "BODY"}, + ], ), ], "('FOOTER', 'SIGNATURE')", @@ -488,7 +527,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: Message( meta="", text="Jane Doe\n4 rue des oliviers 75001 Ville", - tags=[("SIGNATURE", "Jane Doe\n4 rue des oliviers 75001 Ville")], + tags=[ + {"base_text": "Jane Doe\n4 rue des oliviers 75001 Ville", "base_tag": "SIGNATURE"}, + ], ), Message( meta="De: test.test@test.fr \n" @@ -496,7 +537,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: "A: avocats@test.fr; BOB Morane \n" "Objet: dossier Test ,\n", text="bla bla bla", - tags=[("BODY", "bla bla bla")], + tags=[ + {"base_text": "bla bla bla", "base_tag": "BODY"}, + ], ), ], "('FOOTER',)", @@ -510,7 +553,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: Message( meta="", text="J'entends le loup, le renard et la belette", - tags=[("BODY", "J'entends le loup, le renard et la 
belette")], + tags=[ + {"base_text": "J'entends le loup, le renard et la belette", "base_tag": "BODY"}, + ], ), Message( meta="De: test.test@test.fr \n" @@ -518,7 +563,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: "A: avocats@test.fr; BOB Morane \n" "Objet: dossier Test ,\n", text="bla bla bla", - tags=[("BODY", "bla bla bla")], + tags=[ + {"base_text": "bla bla bla", "base_tag": "BODY"}, + ], ), ], "('FOOTER', 'SIGNATURE')", @@ -532,7 +579,9 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: Message( meta="", text="", - tags=[("BODY", "")], + tags=[ + {"base_text": "", "base_tag": "BODY"}, + ], ), ], "('FOOTER', 'SIGNATURE')", @@ -546,12 +595,16 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: Message( meta="", text="Envoyé de mon iPhone", - tags=[("FOOTER", "bla 1")], + tags=[ + {"base_text": "bla 1", "base_tag": "FOOTER"}, + ], ), Message( meta="Nothing useful", text="bla 2", - tags=[("BODY", "bla 2")], + tags=[ + {"base_text": "bla 2", "base_tag": "BODY"}, + ], ), ], "('FOOTER', 'SIGNATURE')", From cc11fd1b1577b9b92b5bfce65bba14ec81482f70 Mon Sep 17 00:00:00 2001 From: Hugo Perrier Date: Mon, 9 Dec 2024 14:10:15 +0100 Subject: [PATCH 04/10] :white_check_mark: Refactor connectors testing --- pyproject.toml | 3 +- tests/conftest.py | 1 - tests/fixtures/connectors.py | 55 --------------------- tests/{connectors => gmail}/__init__.py | 0 tests/{connectors => gmail}/test_gmail.py | 60 ++++++++++++++++++++--- tox.ini | 12 +++-- 6 files changed, 64 insertions(+), 67 deletions(-) delete mode 100644 tests/fixtures/connectors.py rename tests/{connectors => gmail}/__init__.py (100%) rename tests/{connectors => gmail}/test_gmail.py (83%) diff --git a/pyproject.toml b/pyproject.toml index 085cb93..7a04fa8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,8 @@ dynamic = ["version"] dev = ["tox", "pre-commit", "black", "flake8", "isort", "mypy", "pytest", "coverage", "build", 
"ruff"] test = ["pytest", "coverage", "pytest-cov", "google-auth-oauthlib", "google-api-python-client"] transformers = ["transformers>4"] -connectors = ["exchangelib", "google-auth-oauthlib", "google-api-python-client"] +connectors = ["exchangelib"] +gmail = ["google-auth-oauthlib", "google-api-python-client"] docs = ["mkdocs", "markdown", "mkdocs-material", "mdx-include"] [tool.setuptools.packages.find] diff --git a/tests/conftest.py b/tests/conftest.py index 678606f..bfab74c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,7 +11,6 @@ pytest_plugins = [ "tests.fixtures.backend", "tests.fixtures.basic_emails", - "tests.fixtures.connectors", "tests.fixtures.docs", "tests.fixtures.pipelines", "tests.fixtures.processors", diff --git a/tests/fixtures/connectors.py b/tests/fixtures/connectors.py deleted file mode 100644 index c556d91..0000000 --- a/tests/fixtures/connectors.py +++ /dev/null @@ -1,55 +0,0 @@ -from unittest.mock import MagicMock, patch - -import pytest -from google.oauth2.credentials import Credentials -from googleapiclient.http import HttpRequestMock - -from melusine.connectors.gmail import GmailConnector - - -def return_value(resp, content): - return content - - -@pytest.fixture -def mocked_gc(): - with patch("melusine.connectors.gmail.build") as mock_build: - with patch("melusine.connectors.gmail.Credentials.from_authorized_user_file") as mock_creds_from_file: - with patch("melusine.connectors.gmail.os.path.exists") as mock_exists: - mock_exists.return_value = True - mock_service = MagicMock() - mock_service.users().getProfile.return_value = HttpRequestMock( - None, {"emailAddress": "test@example.com"}, return_value - ) - mock_service.users().labels().list.return_value = HttpRequestMock( - None, - { - "labels": [ - {"id": "INBOX", "name": "INBOX", "type": "system"}, - { - "id": "TRASH", - "name": "TRASH", - "messageListVisibility": "hide", - "labelListVisibility": "labelHide", - "type": "system", - }, - {"id": "UNREAD", "name": "UNREAD", 
"type": "system"}, - ] - }, - return_value, - ) - mock_build.return_value = mock_service - mock_creds_from_file.return_value = Credentials("dummy") - - return GmailConnector(token_json_path="token.json", done_label="TRASH", target_column="target") - - -@pytest.fixture -def fake_image(): - image_data = b"" - width = height = 100 - for _ in range(height): - row_data = b"\xff" * (width * 3) - image_data += row_data - - return image_data diff --git a/tests/connectors/__init__.py b/tests/gmail/__init__.py similarity index 100% rename from tests/connectors/__init__.py rename to tests/gmail/__init__.py diff --git a/tests/connectors/test_gmail.py b/tests/gmail/test_gmail.py similarity index 83% rename from tests/connectors/test_gmail.py rename to tests/gmail/test_gmail.py index 390bfa2..05d23e5 100644 --- a/tests/connectors/test_gmail.py +++ b/tests/gmail/test_gmail.py @@ -1,16 +1,64 @@ -import base64 import logging import os -from unittest.mock import MagicMock, patch +import pytest import pandas as pd -import pytest -from google.oauth2.credentials import Credentials -from googleapiclient.http import HttpRequestMock +from unittest.mock import MagicMock, patch + +HttpRequestMock = pytest.importorskip('googleapiclient.http.HttpRequestMock') +from google.oauth2.credentials import Credentials from melusine.connectors.gmail import GmailConnector +def return_value(resp, content): + return content + + +@pytest.fixture +def mocked_gc(): + with patch("melusine.connectors.gmail.build") as mock_build: + with patch("melusine.connectors.gmail.Credentials.from_authorized_user_file") as mock_creds_from_file: + with patch("melusine.connectors.gmail.os.path.exists") as mock_exists: + mock_exists.return_value = True + mock_service = MagicMock() + mock_service.users().getProfile.return_value = HttpRequestMock( + None, {"emailAddress": "test@example.com"}, return_value + ) + mock_service.users().labels().list.return_value = HttpRequestMock( + None, + { + "labels": [ + {"id": "INBOX", "name": 
"INBOX", "type": "system"}, + { + "id": "TRASH", + "name": "TRASH", + "messageListVisibility": "hide", + "labelListVisibility": "labelHide", + "type": "system", + }, + {"id": "UNREAD", "name": "UNREAD", "type": "system"}, + ] + }, + return_value, + ) + mock_build.return_value = mock_service + mock_creds_from_file.return_value = Credentials("dummy") + + return GmailConnector(token_json_path="token.json", done_label="TRASH", target_column="target") + + +@pytest.fixture +def fake_image(): + image_data = b"" + width = height = 100 + for _ in range(height): + row_data = b"\xff" * (width * 3) + image_data += row_data + + return image_data + + def return_value(resp, content): return content @@ -266,4 +314,4 @@ def test_gc_send_email(mocked_gc, fake_image, caplog): {"attachment.jpg": fake_image}, ) - assert "Email sent to melusine_testing@yopmail.com, Message Id: 12456" + assert "Email sent to melusine_testing@yopmail.com, Message Id: 12456" in caplog.text diff --git a/tox.ini b/tox.ini index a94c341..ca64067 100644 --- a/tox.ini +++ b/tox.ini @@ -1,11 +1,11 @@ [tox] requires = tox>=4 -env_list = clean, core38, core310, transformers, report +env_list = clean, core38, core310, transformers, gmail, report [gh-actions] python = - 3.8: clean, core38, transformers + 3.8: clean, core38, transformers, gmail 3.10: core310 [testenv] @@ -13,8 +13,6 @@ commands = pytest --cov --cov-append --cov-report xml deps = pytest pytest-cov - google-auth-oauthlib - google-api-python-client depends = {core38,transformers}: clean report: core38,transformers @@ -38,6 +36,12 @@ deps={[testenv]deps} commands = pytest tests/huggingface --cov --cov-append --cov-report xml extras = transformers +[testenv:gmail] +description = run unit tests with the gmail dependencies +deps={[testenv]deps} +commands = pytest tests/gmail --cov --cov-append --cov-report xml +extras = gmail + [testenv:report] deps = coverage[toml] skip_install = true From 8f4e3739982b26c0c9ba1bcceb24835b1d7de6c0 Mon Sep 17 00:00:00 2001 
From: Hugo Perrier Date: Mon, 9 Dec 2024 15:49:59 +0100 Subject: [PATCH 05/10] :rotating_light: Code linting --- docs/tutorials/08_MelusineRegex.md | 4 - melusine/processors.py | 1 + tests/detectors/test_thanks_detector.py | 5 +- .../detectors/test_vacation_reply_detector.py | 6 +- tests/functional/test_emails_fixtures.py | 122 ++++++++++---- tests/gmail/test_gmail.py | 3 +- .../processors/test_content_refined_tagger.py | 149 ++++++++++++++---- tests/processors/test_processors.py | 44 ++++-- 8 files changed, 244 insertions(+), 90 deletions(-) diff --git a/docs/tutorials/08_MelusineRegex.md b/docs/tutorials/08_MelusineRegex.md index 46a5105..e974b79 100644 --- a/docs/tutorials/08_MelusineRegex.md +++ b/docs/tutorials/08_MelusineRegex.md @@ -17,7 +17,6 @@ from melusine.base import MelusineRegex class AnnoyingEmailsRegex(MelusineRegex): - @property def positive(self) -> Union[str, Dict[str, str]]: return dict( @@ -65,7 +64,6 @@ from melusine.base import MelusineRegex class AnnoyingEmailsRegex(MelusineRegex): - @property def positive(self) -> Union[str, Dict[str, str]]: return dict( @@ -192,7 +190,6 @@ from melusine.base import MelusineRegex class AnnoyingEmailsRegex(MelusineRegex): - @property def positive(self) -> Union[str, Dict[str, str]]: return dict( @@ -237,7 +234,6 @@ That is were neutral regex can be of use. Whenever a neutral regex is matched, i ```python class IfritAlertRegex(MelusineRegex): - @property def positive(self) -> Union[str, Dict[str, str]]: return dict( diff --git a/melusine/processors.py b/melusine/processors.py index 0e989f4..06d54bd 100644 --- a/melusine/processors.py +++ b/melusine/processors.py @@ -1556,6 +1556,7 @@ class RefinedTagger(MelusineTransformer): """ Post-processing class to refine initial tags. 
""" + def __init__( self, input_columns: str = "messages", diff --git a/tests/detectors/test_thanks_detector.py b/tests/detectors/test_thanks_detector.py index da50073..fb6d16f 100644 --- a/tests/detectors/test_thanks_detector.py +++ b/tests/detectors/test_thanks_detector.py @@ -111,10 +111,7 @@ def test_thanks_detector_missing_field(thanks_detector_df): ], False, "Merci\nMerci a vous", - [ - {"base_text": "Merci", "base_tag": "THANKS"}, - {"base_text": "Merci a vous", "base_tag": "THANKS"} - ], + [{"base_text": "Merci", "base_tag": "THANKS"}, {"base_text": "Merci a vous", "base_tag": "THANKS"}], ), ], ) diff --git a/tests/detectors/test_vacation_reply_detector.py b/tests/detectors/test_vacation_reply_detector.py index 7594522..998d264 100644 --- a/tests/detectors/test_vacation_reply_detector.py +++ b/tests/detectors/test_vacation_reply_detector.py @@ -35,7 +35,7 @@ def test_instanciation(): {"base_tag": "HELLO", "base_text": "Bonjour,"}, { "base_tag": "BODY", - "base_text": "je vous confirme l'annulation du rdv du 01/01/2022 à 16h." + "base_text": "je vous confirme l'annulation du rdv du 01/01/2022 à 16h.", }, {"base_tag": "GREETINGS", "base_text": "Bien cordialement, John Smith."}, ], @@ -58,7 +58,7 @@ def test_instanciation(): {"base_tag": "HELLO", "base_text": "Bonjour,"}, { "base_tag": "BODY", - "base_text": "Actuellement en conge je prendrai connaissance de votre message ulterieurement." + "base_text": "Actuellement en conge je prendrai connaissance de votre message ulterieurement.", }, {"base_tag": "GREETINGS", "base_text": "Cordialement, "}, ], @@ -101,7 +101,7 @@ def test_transform(df, good_result): {"base_tag": "HELLO", "base_text": "Bonjour,"}, { "base_tag": "BODY", - "base_text": "Actuellement en conge je prendrai connaissance de votre message ulterieurement." 
+ "base_text": "Actuellement en conge je prendrai connaissance de votre message ulterieurement.", }, {"base_tag": "GREETINGS", "base_text": "Cordialement, "}, ], diff --git a/tests/functional/test_emails_fixtures.py b/tests/functional/test_emails_fixtures.py index 934a730..0fd662d 100644 --- a/tests/functional/test_emails_fixtures.py +++ b/tests/functional/test_emails_fixtures.py @@ -129,7 +129,11 @@ [ {"base_text": "Bonjour,", "base_tag": "HELLO", "base_tag_list": ["HELLO"]}, {"base_text": "Vous trouverez ci-joint l'attestation", "base_tag": "BODY", "base_tag_list": ["BODY"]}, - {"base_text": "Merci de me confirmer la bonne réception de ce message.", "base_tag": "BODY", "base_tag_list": ["BODY"]}, + { + "base_text": "Merci de me confirmer la bonne réception de ce message.", + "base_tag": "BODY", + "base_tag_list": ["BODY"], + }, {"base_text": "Vous en remerciant par avance.", "base_tag": "THANKS", "base_tag_list": ["THANKS"]}, {"base_text": "Cordialement,", "base_tag": "GREETINGS", "base_tag_list": ["GREETINGS"]}, {"base_text": "Jean Dupont", "base_tag": "BODY", "base_tag_list": ["BODY"]}, @@ -137,7 +141,11 @@ [ {"base_text": "Bonjour,", "base_tag": "HELLO", "base_tag_list": ["HELLO"]}, {"base_text": "Veuillez trouver ci-jointe la lettre", "base_tag": "BODY", "base_tag_list": ["BODY"]}, - {"base_text": "La visualisation des fichiers PDF nécessite Adobe Reader.", "base_tag": "FOOTER", "base_tag_list": ["FOOTER"]}, + { + "base_text": "La visualisation des fichiers PDF nécessite Adobe Reader.", + "base_tag": "FOOTER", + "base_tag_list": ["FOOTER"], + }, {"base_text": "Sentiments mutualistes.", "base_tag": "GREETINGS", "base_tag_list": ["GREETINGS"]}, {"base_text": "La MAIF", "base_tag": "BODY", "base_tag_list": ["BODY"]}, ], @@ -147,27 +155,63 @@ "messages.tags": [ [ {"base_text": "Bonjour,", "base_tag": "HELLO", "base_tag_list": ["HELLO"], "refined_tag": "HELLO"}, - {"base_text": "Vous trouverez ci-joint l'attestation", "base_tag": "BODY", "base_tag_list": ["BODY"], 
- "refined_tag": "BODY"}, - {"base_text": "Merci de me confirmer la bonne réception de ce message.", "base_tag": "BODY", - "base_tag_list": ["BODY"], "refined_tag": "BODY"}, - {"base_text": "Vous en remerciant par avance.", "base_tag": "THANKS", "base_tag_list": ["THANKS"], - "refined_tag": "THANKS"}, - {"base_text": "Cordialement,", "base_tag": "GREETINGS", "base_tag_list": ["GREETINGS"], - "refined_tag": "GREETINGS"}, - {"base_text": "Jean Dupont", "base_tag": "BODY", "base_tag_list": ["BODY"], - "refined_tag": "SIGNATURE_NAME"}, + { + "base_text": "Vous trouverez ci-joint l'attestation", + "base_tag": "BODY", + "base_tag_list": ["BODY"], + "refined_tag": "BODY", + }, + { + "base_text": "Merci de me confirmer la bonne réception de ce message.", + "base_tag": "BODY", + "base_tag_list": ["BODY"], + "refined_tag": "BODY", + }, + { + "base_text": "Vous en remerciant par avance.", + "base_tag": "THANKS", + "base_tag_list": ["THANKS"], + "refined_tag": "THANKS", + }, + { + "base_text": "Cordialement,", + "base_tag": "GREETINGS", + "base_tag_list": ["GREETINGS"], + "refined_tag": "GREETINGS", + }, + { + "base_text": "Jean Dupont", + "base_tag": "BODY", + "base_tag_list": ["BODY"], + "refined_tag": "SIGNATURE_NAME", + }, ], [ {"base_text": "Bonjour,", "base_tag": "HELLO", "base_tag_list": ["HELLO"], "refined_tag": "HELLO"}, - {"base_text": "Veuillez trouver ci-jointe la lettre", "base_tag": "BODY", "base_tag_list": ["BODY"], - "refined_tag": "BODY"}, - {"base_text": "La visualisation des fichiers PDF nécessite Adobe Reader.", "base_tag": "FOOTER", - "base_tag_list": ["FOOTER"], "refined_tag": "FOOTER"}, - {"base_text": "Sentiments mutualistes.", "base_tag": "GREETINGS", "base_tag_list": ["GREETINGS"], - "refined_tag": "GREETINGS"}, - {"base_text": "La MAIF", "base_tag": "BODY", "base_tag_list": ["BODY"], - "refined_tag": "SIGNATURE_NAME"}, + { + "base_text": "Veuillez trouver ci-jointe la lettre", + "base_tag": "BODY", + "base_tag_list": ["BODY"], + "refined_tag": 
"BODY", + }, + { + "base_text": "La visualisation des fichiers PDF nécessite Adobe Reader.", + "base_tag": "FOOTER", + "base_tag_list": ["FOOTER"], + "refined_tag": "FOOTER", + }, + { + "base_text": "Sentiments mutualistes.", + "base_tag": "GREETINGS", + "base_tag_list": ["GREETINGS"], + "refined_tag": "GREETINGS", + }, + { + "base_text": "La MAIF", + "base_tag": "BODY", + "base_tag_list": ["BODY"], + "refined_tag": "SIGNATURE_NAME", + }, ], ], }, @@ -202,7 +246,11 @@ "messages.tags": [ [ {"base_text": "Bonjour", "base_tag": "HELLO", "base_tag_list": ["HELLO"]}, - {"base_text": "Pouvez-vous me transmettre deux attestations au nom de mes enfants", "base_tag": "BODY", "base_tag_list": ["BODY"]}, + { + "base_text": "Pouvez-vous me transmettre deux attestations au nom de mes enfants", + "base_tag": "BODY", + "base_tag_list": ["BODY"], + }, {"base_text": "- Jane Dupond", "base_tag": "BODY", "base_tag_list": ["BODY"]}, {"base_text": "- Joe Dupond", "base_tag": "BODY", "base_tag_list": ["BODY"]}, {"base_text": "Merci par avance", "base_tag": "THANKS", "base_tag_list": ["THANKS"]}, @@ -215,16 +263,32 @@ "messages.tags": [ [ {"base_text": "Bonjour", "base_tag": "HELLO", "base_tag_list": ["HELLO"], "refined_tag": "HELLO"}, - {"base_text": "Pouvez-vous me transmettre deux attestations au nom de mes enfants", "base_tag": "BODY", - "base_tag_list": ["BODY"], "refined_tag": "BODY"}, + { + "base_text": "Pouvez-vous me transmettre deux attestations au nom de mes enfants", + "base_tag": "BODY", + "base_tag_list": ["BODY"], + "refined_tag": "BODY", + }, {"base_text": "- Jane Dupond", "base_tag": "BODY", "base_tag_list": ["BODY"], "refined_tag": "BODY"}, {"base_text": "- Joe Dupond", "base_tag": "BODY", "base_tag_list": ["BODY"], "refined_tag": "BODY"}, - {"base_text": "Merci par avance", "base_tag": "THANKS", "base_tag_list": ["THANKS"], - "refined_tag": "THANKS"}, - {"base_text": "Cordialement", "base_tag": "GREETINGS", "base_tag_list": ["GREETINGS"], - "refined_tag": "GREETINGS"}, 
- {"base_text": "Mr Jean Dupond", "base_tag": "BODY", "base_tag_list": ["BODY"], - "refined_tag": "SIGNATURE_NAME"}, + { + "base_text": "Merci par avance", + "base_tag": "THANKS", + "base_tag_list": ["THANKS"], + "refined_tag": "THANKS", + }, + { + "base_text": "Cordialement", + "base_tag": "GREETINGS", + "base_tag_list": ["GREETINGS"], + "refined_tag": "GREETINGS", + }, + { + "base_text": "Mr Jean Dupond", + "base_tag": "BODY", + "base_tag_list": ["BODY"], + "refined_tag": "SIGNATURE_NAME", + }, ] ], }, diff --git a/tests/gmail/test_gmail.py b/tests/gmail/test_gmail.py index 05d23e5..f4498db 100644 --- a/tests/gmail/test_gmail.py +++ b/tests/gmail/test_gmail.py @@ -3,11 +3,10 @@ import pytest import pandas as pd -from unittest.mock import MagicMock, patch - HttpRequestMock = pytest.importorskip('googleapiclient.http.HttpRequestMock') from google.oauth2.credentials import Credentials +from unittest.mock import MagicMock, patch from melusine.connectors.gmail import GmailConnector diff --git a/tests/processors/test_content_refined_tagger.py b/tests/processors/test_content_refined_tagger.py index 0091417..5fe58a4 100644 --- a/tests/processors/test_content_refined_tagger.py +++ b/tests/processors/test_content_refined_tagger.py @@ -3,14 +3,14 @@ import pytest from melusine.message import Message -from melusine.processors import BaseContentTagger, ContentTagger, Tag, RefinedTagger +from melusine.processors import BaseContentTagger, ContentTagger, RefinedTagger, Tag def test_content_tagger(): # Text segments (= individual messages in an email conversation) text_segments = [ "Envoye de mon iphone", - ("Bonjour Mme X,\nSuite a blh blah blah\nBien cordialement\nJane Dupond\n(See attached file: flex.jpg)"), + "Bonjour Mme X,\nSuite a blh blah blah\nBien cordialement\nJane Dupond\n(See attached file: flex.jpg)", ( "Bonjour,\nVeuillez trouver ci-joint blah\n" "Merci d'avance,\nCordialement,\n" @@ -35,8 +35,10 @@ def test_content_tagger(): {"base_text": "Veuillez trouver 
ci-joint blah", "base_tag": "BODY"}, {"base_text": "Merci d'avance,", "base_tag": "THANKS"}, {"base_text": "Cordialement,", "base_tag": "GREETINGS"}, - {"base_text": "Toute modification, edition, utilisation ou diffusion non autorisee est interdite", - "base_tag": "FOOTER"}, + { + "base_text": "Toute modification, edition, utilisation ou diffusion non autorisee est interdite", + "base_tag": "FOOTER", + }, ], ] @@ -125,12 +127,18 @@ def test_content_tagger_split_text(text, expected_parts): ), [ {"base_text": "Bonjour,", "base_tag": "HELLO", "refined_tag": "HELLO"}, - {"base_text": "Suite a notre intervention du 16.02.22 , un taux d'humidité de 50% a été relevé.", - "base_tag": "BODY", "refined_tag": "BODY"}, + { + "base_text": "Suite a notre intervention du 16.02.22 , un taux d'humidité de 50% a été relevé.", + "base_tag": "BODY", + "refined_tag": "BODY", + }, {"base_text": "Cordialement.", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, {"base_text": "177, rue de la fée - 75000 Paris.", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, - {"base_text": "Horaires : du lundi au jeudi de 08h00 à 16h30 et le vendredi de 08h00 à 16h00.", - "base_tag": "BODY", "refined_tag": "BODY"}, + { + "base_text": "Horaires : du lundi au jeudi de 08h00 à 16h30 et le vendredi de 08h00 à 16h00.", + "base_tag": "BODY", + "refined_tag": "BODY", + }, {"base_text": "Tel : 01.45.53.11.33", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, ], ), @@ -143,7 +151,11 @@ def test_content_tagger_split_text(text, expected_parts): ), [ {"base_text": "bonjour", "base_tag": "HELLO", "refined_tag": "HELLO"}, - {"base_text": "15 jours après les premières réparations, un défaut a été détecté.", "base_tag": "BODY", "refined_tag": "BODY"}, + { + "base_text": "15 jours après les premières réparations, un défaut a été détecté.", + "base_tag": "BODY", + "refined_tag": "BODY", + }, {"base_text": "Bien à vous", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, {"base_text": "Britney Spears", 
"base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, ], @@ -173,7 +185,9 @@ def test_content_tagger_split_text(text, expected_parts): [ { "base_text": "Merci de me faire suivre les docs à ma nouvelle adresse qui est 0 rue du parc, 75000 Paris.", - "base_tag": "BODY", "refined_tag": "BODY"}, + "base_tag": "BODY", + "refined_tag": "BODY", + }, {"base_text": "Merci d'avance.", "base_tag": "THANKS", "refined_tag": "THANKS"}, {"base_text": "Acceptez notre salutation,", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, ], @@ -194,7 +208,11 @@ def test_content_tagger_split_text(text, expected_parts): ), [ {"base_text": "Bonjour", "base_tag": "HELLO", "refined_tag": "HELLO"}, - {"base_text": "Je vous relance concernant ma télévision avec le devis en PJ.", "base_tag": "BODY", "refined_tag": "BODY"}, + { + "base_text": "Je vous relance concernant ma télévision avec le devis en PJ.", + "base_tag": "BODY", + "refined_tag": "BODY", + }, {"base_text": "Désolé pour la qualité.", "base_tag": "BODY", "refined_tag": "BODY"}, {"base_text": "Je l'ai envoyé à partir de mon ordi.", "base_tag": "BODY", "refined_tag": "BODY"}, {"base_text": "Excellente journée à vous,", "base_tag": "HELLO", "refined_tag": "HELLO"}, @@ -220,7 +238,11 @@ def test_content_tagger_split_text(text, expected_parts): ( "Impeccable, je vous remercie beaucoup pour votre rapidité.\nObtenir\nOutlook pour Android", [ - {"base_text": "Impeccable, je vous remercie beaucoup pour votre rapidité.", "base_tag": "THANKS", "refined_tag": "THANKS"}, + { + "base_text": "Impeccable, je vous remercie beaucoup pour votre rapidité.", + "base_tag": "THANKS", + "refined_tag": "THANKS", + }, {"base_text": "Obtenir", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, {"base_text": "Outlook pour Android", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, ], @@ -232,7 +254,11 @@ def test_content_tagger_split_text(text, expected_parts): ), [ {"base_text": "Cher Monsieur,", "base_tag": "HELLO", "refined_tag": "HELLO"}, - {"base_text": "Je 
vous confirme la bonne réception de votre précédent email.", "base_tag": "BODY", "refined_tag": "BODY"}, + { + "base_text": "Je vous confirme la bonne réception de votre précédent email.", + "base_tag": "BODY", + "refined_tag": "BODY", + }, {"base_text": "Je vous en remercie.", "base_tag": "THANKS", "refined_tag": "THANKS"}, {"base_text": "Bien cordialement,", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, {"base_text": "John Smith", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, @@ -250,10 +276,16 @@ def test_content_tagger_split_text(text, expected_parts): {"base_text": "URGENT URGENT", "base_tag": "BODY", "refined_tag": "BODY"}, { "base_text": "Merci de me faire suivre les docs à ma nouvelle adresse qui est 0 rue du parc, 75000 Paris.", - "base_tag": "BODY", "refined_tag": "BODY"}, + "base_tag": "BODY", + "refined_tag": "BODY", + }, {"base_text": "Merci d'avance.", "base_tag": "THANKS", "refined_tag": "THANKS"}, {"base_text": "Recevez nos salutations,", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, - {"base_text": "Vous en souhaitant bonne réception", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, + { + "base_text": "Vous en souhaitant bonne réception", + "base_tag": "GREETINGS", + "refined_tag": "GREETINGS", + }, ], ), pytest.param( @@ -303,7 +335,11 @@ def test_content_tagger_split_text(text, expected_parts): ( "\nBonjour Monsieur Stanislas von den hoeggenboord\n\nbien à toi\nJ. Smith\nChargé de clientèle", [ - {"base_text": "Bonjour Monsieur Stanislas von den hoeggenboord", "base_tag": "HELLO", "refined_tag": "HELLO"}, + { + "base_text": "Bonjour Monsieur Stanislas von den hoeggenboord", + "base_tag": "HELLO", + "refined_tag": "HELLO", + }, {"base_text": "bien à toi", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, {"base_text": "J. 
Smith", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, {"base_text": "Chargé de clientèle", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, @@ -323,7 +359,11 @@ def test_content_tagger_split_text(text, expected_parts): {"base_text": "5bis rue Patrick Sebastien", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, {"base_text": "6-8 cours mirabeau", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, {"base_text": "7 ter place du dahu", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, - {"base_text": "8 de la rue très longue qui ne doit pas être taggée signature", "base_tag": "BODY", "refined_tag": "BODY"}, + { + "base_text": "8 de la rue très longue qui ne doit pas être taggée signature", + "base_tag": "BODY", + "refined_tag": "BODY", + }, ], ), ( @@ -339,7 +379,9 @@ def test_content_tagger_split_text(text, expected_parts): {"base_text": "Bonjour,", "base_tag": "HELLO", "refined_tag": "HELLO"}, { "base_text": "Je vous informe que je vais accepter la proposition de L , à savoir le paiement d'une indemnité forfaitaire de résiliation du CCMI de 4000 € TTC pour clore cette affaire.", - "base_tag": "BODY", "refined_tag": "BODY"}, + "base_tag": "BODY", + "refined_tag": "BODY", + }, {"base_text": "Cordialement.", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, {"base_text": "Bob Smith", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, ], @@ -359,13 +401,23 @@ def test_content_tagger_split_text(text, expected_parts): ), [ {"base_text": "Monsieur Bob Smith", "base_tag": "HELLO", "refined_tag": "HELLO"}, - {"base_text": "Adresse mail : BobSmith90@gmail.com", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + { + "base_text": "Adresse mail : BobSmith90@gmail.com", + "base_tag": "SIGNATURE", + "refined_tag": "SIGNATURE", + }, {"base_text": "Lucy Ange", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, {"base_text": "Bonjour Monsieur,", "base_tag": "HELLO", "refined_tag": "HELLO"}, { "base_text": "Suite à notre entretien téléphonique de 
ce matin, et au message que vous m'avez envoyé sur ma messagerie, je voudrais effectuer la réparation du véhicule Renault Twingo dans un garage partenaire de la Maif situé, si c'est possible.", - "base_tag": "BODY", "refined_tag": "BODY"}, - {"base_text": "Dans l'attente de votre réponse et en vous remerciant par avance,", "base_tag": "BODY", "refined_tag": "BODY"}, + "base_tag": "BODY", + "refined_tag": "BODY", + }, + { + "base_text": "Dans l'attente de votre réponse et en vous remerciant par avance,", + "base_tag": "BODY", + "refined_tag": "BODY", + }, {"base_text": "Monsieur Bob Smith", "base_tag": "HELLO", "refined_tag": "HELLO"}, {"base_text": "Envoyé à partir de", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, {"base_text": "Courrier", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, @@ -395,12 +447,16 @@ def test_content_tagger_split_text(text, expected_parts): {"base_text": "J’espère que vous allez bien.", "base_tag": "BODY", "refined_tag": "BODY"}, { "base_text": "Pour faire suite à mon mail du 21 février 2023, je me permets de revenir vers vous pour avoir votre avis sur le devis que j’ai demandé auprès d’un enquêteur.", - "base_tag": "BODY", "refined_tag": "BODY"}, + "base_tag": "BODY", + "refined_tag": "BODY", + }, {"base_text": "Voici son retour :", "base_tag": "BODY", "refined_tag": "BODY"}, {"base_text": "Qu’en pensez-vous svp ?", "base_tag": "BODY", "refined_tag": "BODY"}, { "base_text": "Je reste à votre disposition pour tout complément d’information et vous remercie de l’intérêt que vous porterez à ma demande,", - "base_tag": "BODY", "refined_tag": "BODY"}, + "base_tag": "BODY", + "refined_tag": "BODY", + }, {"base_text": "Bien Cordialement,", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, {"base_text": "Bob Smith", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, {"base_text": "Tél.", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, @@ -417,12 +473,26 @@ def test_content_tagger_split_text(text, expected_parts): {"base_text": 
"cordialement", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, {"base_text": "Contact e-mail", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, {"base_text": "Contact téléphone", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, - {"base_text": "01 23 45 67 89 / abcabc@hotmail.fr", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, - {"base_text": "Torroella de Montgri, le 5 avril 2023", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + { + "base_text": "01 23 45 67 89 / abcabc@hotmail.fr", + "base_tag": "SIGNATURE", + "refined_tag": "SIGNATURE", + }, + { + "base_text": "Torroella de Montgri, le 5 avril 2023", + "base_tag": "SIGNATURE", + "refined_tag": "SIGNATURE", + }, { "base_text": "Les formats de fichiers acceptés sont : PDF, DOC, DOCX, JPEG, JPG, TIFF, TXT, ODT, XLS, XLSX", - "base_tag": "FOOTER", "refined_tag": "FOOTER"}, - {"base_text": "Tout autre format de fichiers ne sera pas transmis au dossier", "base_tag": "FOOTER", "refined_tag": "FOOTER"}, + "base_tag": "FOOTER", + "refined_tag": "FOOTER", + }, + { + "base_text": "Tout autre format de fichiers ne sera pas transmis au dossier", + "base_tag": "FOOTER", + "refined_tag": "FOOTER", + }, ], id="diverse_signature_patterns", ), @@ -439,12 +509,24 @@ def test_content_tagger_split_text(text, expected_parts): {"base_text": "J. 
Smith", "base_tag": "BODY", "refined_tag": "SIGNATURE_NAME"}, {"base_text": "01 23 45 67 89", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, {"base_text": "Secrétaire en charge des avions", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, - {"base_text": "Business Analyst – Tribu Sinistres – Squad Flux Entrants", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, - {"base_text": "Société nationale des chemins de fer", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + { + "base_text": "Business Analyst – Tribu Sinistres – Squad Flux Entrants", + "base_tag": "SIGNATURE", + "refined_tag": "SIGNATURE", + }, + { + "base_text": "Société nationale des chemins de fer", + "base_tag": "SIGNATURE", + "refined_tag": "SIGNATURE", + }, {"base_text": "Conseiller MAIF", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, {"base_text": "Gestionnaire sinistre - C99G", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, {"base_text": "Service des lettres anonymes", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, - {"base_text": "Technicienne de gestion - EQUIPE ABC", "base_tag": "SIGNATURE", "refined_tag": "SIGNATURE"}, + { + "base_text": "Technicienne de gestion - EQUIPE ABC", + "base_tag": "SIGNATURE", + "refined_tag": "SIGNATURE", + }, ], id="signature_jobs", ), @@ -457,8 +539,11 @@ def test_content_tagger_split_text(text, expected_parts): {"base_text": "bonjour", "base_tag": "HELLO", "refined_tag": "HELLO"}, {"base_text": "mon body", "base_tag": "BODY", "refined_tag": "BODY"}, {"base_text": "Cordialement", "base_tag": "GREETINGS", "refined_tag": "GREETINGS"}, - {"base_text": "analyste -------------------------------------- test test test test test test test", - "base_tag": "BODY", "refined_tag": "BODY"}, + { + "base_text": "analyste -------------------------------------- test test test test test test test", + "base_tag": "BODY", + "refined_tag": "BODY", + }, ], id="check_catastrophic_backtracking", ), diff --git a/tests/processors/test_processors.py 
b/tests/processors/test_processors.py index 03cac2a..00c1ca0 100644 --- a/tests/processors/test_processors.py +++ b/tests/processors/test_processors.py @@ -184,21 +184,33 @@ def test_text_extractor_error(): def test_text_extractor_multiple_messages(): """Unit test""" message_list = [ - Message(meta="", text="", tags=[ - {"base_text": "A", "base_tag": "BODY"}, - {"base_text": "G", "base_tag": "GREETINGS"}, - {"base_text": "A", "base_tag": "BODY"}, - ]), - Message(meta="", text="", tags=[ - {"base_text": "B", "base_tag": "BODY"}, - {"base_text": "B", "base_tag": "BODY"}, - {"base_text": "B", "base_tag": "BODY"}, - ]), - Message(meta="", text="", tags=[ - {"base_text": "G", "base_tag": "GREETINGS"}, - {"base_text": "C", "base_tag": "BODY"}, - {"base_text": "C", "base_tag": "BODY"}, - ]), + Message( + meta="", + text="", + tags=[ + {"base_text": "A", "base_tag": "BODY"}, + {"base_text": "G", "base_tag": "GREETINGS"}, + {"base_text": "A", "base_tag": "BODY"}, + ], + ), + Message( + meta="", + text="", + tags=[ + {"base_text": "B", "base_tag": "BODY"}, + {"base_text": "B", "base_tag": "BODY"}, + {"base_text": "B", "base_tag": "BODY"}, + ], + ), + Message( + meta="", + text="", + tags=[ + {"base_text": "G", "base_tag": "GREETINGS"}, + {"base_text": "C", "base_tag": "BODY"}, + {"base_text": "C", "base_tag": "BODY"}, + ], + ), ] expected_output = "A\nB\nB\nB" @@ -368,7 +380,7 @@ def test_date_processor(date_str: str, expected_iso_format: str) -> None: text="Envoyé depuis mon Iphone", tags=[ {"base_text": "Envoyé depuis mon Iphone", "base_tag": "FOOTER"}, - ] + ], ), Message( meta="De: test.test@test.fr \n" From c57aa5612163f2a44b4bd392a5defd3fa58a0371 Mon Sep 17 00:00:00 2001 From: Hugo Perrier Date: Mon, 9 Dec 2024 17:02:31 +0100 Subject: [PATCH 06/10] :rotating_light: Code linting --- tests/gmail/test_gmail.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/gmail/test_gmail.py b/tests/gmail/test_gmail.py index f4498db..25a84cf 100644 --- 
a/tests/gmail/test_gmail.py +++ b/tests/gmail/test_gmail.py @@ -1,12 +1,13 @@ import logging import os -import pytest import pandas as pd +import pytest HttpRequestMock = pytest.importorskip('googleapiclient.http.HttpRequestMock') -from google.oauth2.credentials import Credentials +Credentials = pytest.importorskip('google.oauth2.credentials.Credentials') from unittest.mock import MagicMock, patch + from melusine.connectors.gmail import GmailConnector @@ -313,4 +314,4 @@ def test_gc_send_email(mocked_gc, fake_image, caplog): {"attachment.jpg": fake_image}, ) - assert "Email sent to melusine_testing@yopmail.com, Message Id: 12456" in caplog.text + assert "Email sent to melusine_testing@yopmail.com, Message Id: 12456" in caplog.text From 084fbe7b5c294ff1163bcd64989c355553345075 Mon Sep 17 00:00:00 2001 From: Hugo Perrier Date: Tue, 10 Dec 2024 10:57:29 +0100 Subject: [PATCH 07/10] :rotating_light: Code linting --- tests/gmail/test_gmail.py | 48 +++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/tests/gmail/test_gmail.py b/tests/gmail/test_gmail.py index 25a84cf..d074684 100644 --- a/tests/gmail/test_gmail.py +++ b/tests/gmail/test_gmail.py @@ -4,8 +4,8 @@ import pandas as pd import pytest -HttpRequestMock = pytest.importorskip('googleapiclient.http.HttpRequestMock') -Credentials = pytest.importorskip('google.oauth2.credentials.Credentials') +googleapiclient = pytest.importorskip("googleapiclient") +from google.oauth2.credentials import Credentials from unittest.mock import MagicMock, patch from melusine.connectors.gmail import GmailConnector @@ -22,10 +22,10 @@ def mocked_gc(): with patch("melusine.connectors.gmail.os.path.exists") as mock_exists: mock_exists.return_value = True mock_service = MagicMock() - mock_service.users().getProfile.return_value = HttpRequestMock( + mock_service.users().getProfile.return_value = googleapiclient.http.HttpRequestMock( None, {"emailAddress": "test@example.com"}, return_value ) - 
mock_service.users().labels().list.return_value = HttpRequestMock( + mock_service.users().labels().list.return_value = googleapiclient.http.HttpRequestMock( None, { "labels": [ @@ -70,10 +70,10 @@ def test_init(mock_exists, mock_creds_from_file, mock_build, caplog): # Mocking necessary objects and methods mock_exists.return_value = True mock_service = MagicMock() - mock_service.users().getProfile.return_value = HttpRequestMock( + mock_service.users().getProfile.return_value = googleapiclient.http.HttpRequestMock( None, {"emailAddress": "test@example.com"}, return_value ) - mock_service.users().labels().list.return_value = HttpRequestMock( + mock_service.users().labels().list.return_value = googleapiclient.http.HttpRequestMock( None, { "labels": [ @@ -113,10 +113,10 @@ def test_init(mock_exists, mock_creds_from_file, mock_build, caplog): def test_init_without_creds(mock_flow, mock_build, caplog): # Mocking necessary objects and methods mock_service = MagicMock() - mock_service.users().getProfile.return_value = HttpRequestMock( + mock_service.users().getProfile.return_value = googleapiclient.http.HttpRequestMock( None, {"emailAddress": "test@example.com"}, return_value ) - mock_service.users().labels().list.return_value = HttpRequestMock( + mock_service.users().labels().list.return_value = googleapiclient.http.HttpRequestMock( None, { "labels": [ @@ -154,10 +154,10 @@ def test_init_without_creds(mock_flow, mock_build, caplog): def test_gc_get_emails(mocked_gc, simple_email_raw, caplog): - mocked_gc.service.users().messages().list.return_value = HttpRequestMock( + mocked_gc.service.users().messages().list.return_value = googleapiclient.http.HttpRequestMock( None, {"messages": [{"id": "123"}]}, return_value ) - mocked_gc.service.users().messages().get.return_value = HttpRequestMock( + mocked_gc.service.users().messages().get.return_value = googleapiclient.http.HttpRequestMock( None, { "id": "123", @@ -188,10 +188,10 @@ def test_gc_get_emails(mocked_gc, 
simple_email_raw, caplog): def test_gc_get_emails_complex_mail(mocked_gc, complex_email_raw, caplog): - mocked_gc.service.users().messages().list.return_value = HttpRequestMock( + mocked_gc.service.users().messages().list.return_value = googleapiclient.http.HttpRequestMock( None, {"messages": [{"id": "123"}]}, return_value ) - mocked_gc.service.users().messages().get.return_value = HttpRequestMock( + mocked_gc.service.users().messages().get.return_value = googleapiclient.http.HttpRequestMock( None, { "id": "123", @@ -222,7 +222,9 @@ def test_gc_get_emails_complex_mail(mocked_gc, complex_email_raw, caplog): def test_gc_get_emails_none(mocked_gc, simple_email_raw, caplog): - mocked_gc.service.users().messages().list.return_value = HttpRequestMock(None, {}, return_value) + mocked_gc.service.users().messages().list.return_value = googleapiclient.http.HttpRequestMock( + None, {}, return_value + ) with caplog.at_level(logging.DEBUG): df = mocked_gc.get_emails(1, None, "2024/01/01", "2024/05/03") @@ -236,7 +238,7 @@ def test_gc_get_emails_none(mocked_gc, simple_email_raw, caplog): @patch("builtins.input", side_effect=["y", "n"]) def test_gc_check_or_create_label(mock_input, mocked_gc, caplog): - mocked_gc.service.users().labels().create.return_value = HttpRequestMock( + mocked_gc.service.users().labels().create.return_value = googleapiclient.http.HttpRequestMock( None, { "id": "Label_3", @@ -262,7 +264,9 @@ def test_gc_check_or_create_label(mock_input, mocked_gc, caplog): def test_gc_move_to_done(mocked_gc, caplog): - mocked_gc.service.users().messages().modify.return_value = HttpRequestMock(None, {}, return_value) + mocked_gc.service.users().messages().modify.return_value = googleapiclient.http.HttpRequestMock( + None, {}, return_value + ) with caplog.at_level(logging.DEBUG): mocked_gc.move_to_done(["dummy_id"]) @@ -282,7 +286,9 @@ def test_gc_move_to_error(mocked_gc, caplog): def test_gc_route_emails(mocked_gc, caplog): - 
mocked_gc.service.users().messages().modify.return_value = HttpRequestMock(None, {}, return_value) + mocked_gc.service.users().messages().modify.return_value = googleapiclient.http.HttpRequestMock( + None, {}, return_value + ) df = pd.DataFrame( { @@ -304,14 +310,18 @@ def test_gc_route_emails(mocked_gc, caplog): def test_gc_send_email(mocked_gc, fake_image, caplog): - mocked_gc.service.users().messages().send.return_value = HttpRequestMock(None, {"id": "12456"}, return_value) + mocked_gc.service.users().messages().send.return_value = googleapiclient.http.HttpRequestMock( + None, {"id": "12456"}, return_value + ) with caplog.at_level(logging.DEBUG): mocked_gc.send_email( - "melusine_testing.yopmail.com", + "melusine_testing@yopmail.com", "Testing Header", "Testing Body", {"attachment.jpg": fake_image}, ) - assert "Email sent to melusine_testing@yopmail.com, Message Id: 12456" in caplog.text + assert "12456" in caplog.text + assert "Email sent to" in caplog.text + assert "melusine_testing@yopmail.com" in caplog.text From 4529ca1297cc2204068655298913074507740e5f Mon Sep 17 00:00:00 2001 From: Hugo Perrier Date: Tue, 10 Dec 2024 17:17:44 +0100 Subject: [PATCH 08/10] :alien: Add a fit method to MelusineTransformer and BaseMelusineDetector --- melusine/base.py | 34 ++++++++++++++++++++++++++++++++++ tests/gmail/test_gmail.py | 11 ++++++----- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/melusine/base.py b/melusine/base.py index b7cd199..016a910 100644 --- a/melusine/base.py +++ b/melusine/base.py @@ -99,6 +99,23 @@ def parse_column_list(columns: str | Iterable[str]) -> list[str]: columns = [columns] return list(columns) + def fit(self, X: MelusineDataset, y: Any = None) -> MelusineTransformer: + """A reference implementation of a fitting function. + + Parameters + ---------- + X : The training input samples. + + y : The target values (class labels in classification, real numbers in + regression). + + Returns + ------- + self : object + Returns self. 
+ """ + return self + def transform(self, data: MelusineDataset) -> MelusineDataset: """ Transform input data. @@ -196,6 +213,23 @@ def transform_methods(self) -> list[Callable]: List of methods to be called by the transform method. """ + def fit(self, X: MelusineDataset, y: Any = None) -> MelusineTransformer: + """A reference implementation of a fitting function. + + Parameters + ---------- + X : The training input samples. + + y : The target values (class labels in classification, real numbers in + regression). + + Returns + ------- + self : object + Returns self. + """ + return self + def transform(self, df: MelusineDataset) -> MelusineDataset: """ Re-definition of super().transform() => specific detector's implementation diff --git a/tests/gmail/test_gmail.py b/tests/gmail/test_gmail.py index d074684..107f038 100644 --- a/tests/gmail/test_gmail.py +++ b/tests/gmail/test_gmail.py @@ -4,10 +4,11 @@ import pandas as pd import pytest -googleapiclient = pytest.importorskip("googleapiclient") -from google.oauth2.credentials import Credentials from unittest.mock import MagicMock, patch +google = pytest.importorskip("google") +googleapiclient = pytest.importorskip("googleapiclient") + from melusine.connectors.gmail import GmailConnector @@ -43,7 +44,7 @@ def mocked_gc(): return_value, ) mock_build.return_value = mock_service - mock_creds_from_file.return_value = Credentials("dummy") + mock_creds_from_file.return_value = google.oauth2.credentials.Credentials("dummy") return GmailConnector(token_json_path="token.json", done_label="TRASH", target_column="target") @@ -91,7 +92,7 @@ def test_init(mock_exists, mock_creds_from_file, mock_build, caplog): return_value, ) mock_build.return_value = mock_service - mock_creds_from_file.return_value = Credentials("dummy") + mock_creds_from_file.return_value = google.oauth2.credentials.Credentials("dummy") # Creating an instance of GmailConnector with caplog.at_level(logging.DEBUG): @@ -134,7 +135,7 @@ def 
test_init_without_creds(mock_flow, mock_build, caplog): return_value, ) mock_build.return_value = mock_service - mock_flow.return_value.run_local_server.return_value = Credentials("dummy") + mock_flow.return_value.run_local_server.return_value = google.oauth2.credentials.Credentials("dummy") # Creating an instance of GmailConnector with caplog.at_level(logging.DEBUG): From 241c47af26e9fe4ffe6af77745f8a06c6045fbda Mon Sep 17 00:00:00 2001 From: Hugo Perrier Date: Tue, 10 Dec 2024 17:25:14 +0100 Subject: [PATCH 09/10] =?UTF-8?q?=F0=9F=9A=A8=20Fix=20Lint=20error?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/gmail/test_gmail.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/gmail/test_gmail.py b/tests/gmail/test_gmail.py index 107f038..b8cc145 100644 --- a/tests/gmail/test_gmail.py +++ b/tests/gmail/test_gmail.py @@ -1,11 +1,10 @@ import logging import os +from unittest.mock import MagicMock, patch import pandas as pd import pytest -from unittest.mock import MagicMock, patch - google = pytest.importorskip("google") googleapiclient = pytest.importorskip("googleapiclient") From 8f3d7022fca438f0ef112441ec42c61db24e497b Mon Sep 17 00:00:00 2001 From: Hugo Perrier Date: Fri, 13 Dec 2024 16:35:23 +0100 Subject: [PATCH 10/10] :recycle: Improve content tagging --- melusine/message.py | 27 +++++++++++++-------------- melusine/processors.py | 11 +++-------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/melusine/message.py b/melusine/message.py index 4f4b112..b472831 100644 --- a/melusine/message.py +++ b/melusine/message.py @@ -20,9 +20,6 @@ class Message: DEFAULT_STR_LINE_LENGTH = 120 DEFAULT_STR_TAG_NAME_LENGTH = 22 - MAIN_TAG_TYPE = "refined_tag" - FALLBACK_TAG_TYPE = "base_tag" - MAIN_TEXT_TYPE = "base_text" def __init__( self, @@ -142,7 +139,7 @@ def extract_text( target_tags: Optional[Iterable[str]] = None, stop_at: Optional[Iterable[str]] = None, tag_type: Optional[str] = 
None, - text_type: str = MAIN_TEXT_TYPE, + text_type: Optional[str] = None, separator: str = "\n", ) -> str: """ @@ -165,6 +162,8 @@ def extract_text( ------- _: List of extracted tags. """ + if text_type is None: + text_type = self.effective_text_key parts = self.extract_parts(target_tags=target_tags, stop_at=stop_at, tag_type=tag_type) return separator.join([x[text_type] for x in parts]) @@ -223,10 +222,7 @@ def has_tags( found: bool = False for tag_data in self.tags: - try: - tag = tag_data[tag_type] - except KeyError: - tag = tag_data[self.FALLBACK_TAG_TYPE] + tag = tag_data[tag_type] # Check if tag in tags of interest if tag in target_tags: @@ -239,24 +235,27 @@ def has_tags( return found - def format_tags(self, tag_type: str = MAIN_TAG_TYPE) -> str: + def format_tags(self, tag_type: Optional[str] = None, text_type: Optional[str] = None) -> str: """ Create a pretty formatted representation of text and their associated tags. Returns: _: Pretty formatted representation of the tags and texts. 
""" + if tag_type is None: + tag_type = self.effective_tag_key + + if text_type is None: + text_type = self.effective_text_key + if self.tags is None: return self.text else: tag_text_length = self.str_line_length - self.str_tag_name_length text = "" for tag_data in self.tags: - try: - tag_name = tag_data[tag_type] - except KeyError: - tag_name = tag_data[self.FALLBACK_TAG_TYPE] - tag_text = tag_data["base_text"] + tag_name = tag_data[tag_type] + tag_text = tag_data[text_type] text += tag_text.ljust(tag_text_length, ".") + tag_name.rjust(self.str_tag_name_length, ".") + "\n" return text.strip() diff --git a/melusine/processors.py b/melusine/processors.py index 06d54bd..fd06302 100644 --- a/melusine/processors.py +++ b/melusine/processors.py @@ -1790,14 +1790,9 @@ def filter_message_list(self, message_list: list[Message]) -> list[Message]: top_message = message_list[0] parts = top_message.extract_parts() - try: - contains_only_tags_to_ignore = all( - [tag_data[Message.MAIN_TAG_TYPE].startswith(self.tags_to_ignore) for tag_data in parts] - ) - except KeyError: - contains_only_tags_to_ignore = all( - [tag_data[Message.FALLBACK_TAG_TYPE].startswith(self.tags_to_ignore) for tag_data in parts] - ) + contains_only_tags_to_ignore = all( + [tag_data[top_message.effective_tag_key].startswith(self.tags_to_ignore) for tag_data in parts] + ) if contains_only_tags_to_ignore and (len(message_list) > 1): message_list = message_list[1:]