Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ Update Melusine Content Tagger #186

Merged
merged 10 commits into from
Dec 16, 2024
4 changes: 0 additions & 4 deletions docs/tutorials/08_MelusineRegex.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ from melusine.base import MelusineRegex


class AnnoyingEmailsRegex(MelusineRegex):

@property
def positive(self) -> Union[str, Dict[str, str]]:
return dict(
Expand Down Expand Up @@ -65,7 +64,6 @@ from melusine.base import MelusineRegex


class AnnoyingEmailsRegex(MelusineRegex):

@property
def positive(self) -> Union[str, Dict[str, str]]:
return dict(
Expand Down Expand Up @@ -192,7 +190,6 @@ from melusine.base import MelusineRegex


class AnnoyingEmailsRegex(MelusineRegex):

@property
def positive(self) -> Union[str, Dict[str, str]]:
return dict(
Expand Down Expand Up @@ -237,7 +234,6 @@ That is where neutral regex can be of use. Whenever a neutral regex is matched, i

```python
class IfritAlertRegex(MelusineRegex):

@property
def positive(self) -> Union[str, Dict[str, str]]:
return dict(
Expand Down
36 changes: 35 additions & 1 deletion melusine/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from sklearn.base import BaseEstimator, TransformerMixin

from melusine.backend import backend
from melusine.io import IoMixin
from melusine.io_mixin import IoMixin

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -99,6 +99,23 @@ def parse_column_list(columns: str | Iterable[str]) -> list[str]:
columns = [columns]
return list(columns)

def fit(self, X: MelusineDataset, y: Any = None) -> MelusineTransformer:
    """
    Fitting hook that learns nothing.

    Both arguments are accepted and ignored; the instance is returned as-is.

    Parameters
    ----------
    X: MelusineDataset
        Training input samples (unused).
    y: Any
        Target values, e.g. class labels or regression targets (unused).

    Returns
    -------
    _: MelusineTransformer
        The instance itself, unchanged.
    """
    return self

def transform(self, data: MelusineDataset) -> MelusineDataset:
"""
Transform input data.
Expand Down Expand Up @@ -196,6 +213,23 @@ def transform_methods(self) -> list[Callable]:
List of methods to be called by the transform method.
"""

def fit(self, X: MelusineDataset, y: Any = None) -> MelusineTransformer:
    """
    Fitting hook that learns nothing.

    Both arguments are accepted and ignored; the instance is returned as-is.

    Parameters
    ----------
    X: MelusineDataset
        Training input samples (unused).
    y: Any
        Target values, e.g. class labels or regression targets (unused).

    Returns
    -------
    _: MelusineTransformer
        The instance itself, unchanged.
    """
    return self

def transform(self, df: MelusineDataset) -> MelusineDataset:
"""
Re-definition of super().transform() => specific detector's implementation
Expand Down
3 changes: 3 additions & 0 deletions melusine/conf/pipelines/demo_pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ demo_pipeline:
- class_name: ContentTagger
config_key: content_tagger
module: melusine.processors
- class_name: RefinedTagger
config_key: refined_tagger
module: melusine.processors
- class_name: TextExtractor
config_key: text_extractor
module: melusine.processors
Expand Down
3 changes: 3 additions & 0 deletions melusine/conf/pipelines/preprocessing_pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ preprocessing_pipeline:
- class_name: ContentTagger
config_key: content_tagger
module: melusine.processors
- class_name: RefinedTagger
config_key: refined_tagger
module: melusine.processors
- class_name: TransferredEmailProcessor
config_key: transferred_email_processor
module: melusine.processors
Expand Down
2 changes: 2 additions & 0 deletions melusine/conf/processors/refined_tagger.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
refined_tagger:
default_tag: BODY
22 changes: 4 additions & 18 deletions melusine/detectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

"""

from typing import Any, Dict, List, Tuple
from typing import Any, Dict, List

from melusine.base import MelusineDetector, MelusineItem, MelusineRegex
from melusine.message import Message
Expand Down Expand Up @@ -95,19 +95,12 @@ def pre_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineIte
target_tags={self.BODY_PART}, stop_at={self.GREETINGS_PART}
)

# Extract the THANKS part in the last message
thanks_parts: List[Tuple[str, str]] = row[self.messages_column][0].extract_parts(target_tags={self.THANKS_PART})

# Compute THANKS text
if not thanks_parts:
thanks_text: str = ""
else:
thanks_text = "\n".join(x[1] for x in thanks_parts)
# Extract the THANKS text in the last message
thanks_text = row[self.messages_column][0].extract_text(target_tags={self.THANKS_PART})

# Save debug data
if debug_mode:
debug_dict = {
self.THANKS_PARTS_COL: thanks_parts,
self.THANKS_TEXT_COL: thanks_text,
self.HAS_BODY: has_body,
}
Expand Down Expand Up @@ -236,20 +229,13 @@ def pre_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineIte
"""
# Last message body
last_message: Message = row[self.messages_column][0]
body_parts = last_message.extract_last_body()

if body_parts:
row[self.CONST_TEXT_COL_NAME] = "\n".join(text for tag, text in body_parts)
else:
row[self.CONST_TEXT_COL_NAME] = ""
row[self.CONST_TEXT_COL_NAME] = last_message.extract_text(target_tags=("BODY",), stop_at=("GREETINGS",))

# Prepare and save debug data
if debug_mode:
debug_dict: Dict[str, Any] = {
self.CONST_DEBUG_TEXT_KEY: row[self.CONST_TEXT_COL_NAME],
}
if self.messages_column:
debug_dict[self.CONST_DEBUG_PARTS_KEY] = body_parts
row[self.debug_dict_col].update(debug_dict)

return row
Expand Down
2 changes: 1 addition & 1 deletion melusine/io/__init__.py → melusine/io_mixin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@
The melusine.io_mixin module includes classes for input/output data.
"""

from melusine.io._classes import IoMixin
from melusine.io_mixin._classes import IoMixin

__all__ = ["IoMixin"]
4 changes: 0 additions & 4 deletions melusine/io/_classes.py → melusine/io_mixin/_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,6 @@ class IoMixin:
Defines generic load methods.
"""

def __init__(self, **kwargs: Any):
"""Initialize attribute."""
self.json_exclude_list: list[str] = ["_func", "json_exclude_list"]

@classmethod
def from_config(
cls: type[T],
Expand Down
91 changes: 77 additions & 14 deletions melusine/message.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import re
from datetime import datetime
from typing import Iterable, List, Optional, Tuple
from typing import Any, Dict, Iterable, List, Optional

from melusine import config

Expand All @@ -29,7 +29,7 @@ def __init__(
date: Optional[datetime] = None,
text_from: str = "",
text_to: Optional[str] = None,
tags: Optional[List[Tuple[str, str]]] = None,
tags: Optional[List[Dict[str, Any]]] = None,
):
"""
Attributes initialization.
Expand Down Expand Up @@ -63,6 +63,9 @@ def __init__(
self.clean_header: str = ""
self.clean_text: str = ""

self.effective_tag_key = "base_tag"
self.effective_text_key = "base_text"

@property
def str_tag_name_length(self) -> int:
"""
Expand All @@ -84,8 +87,11 @@ def str_line_length(self) -> int:
return config["message"].get("str_line_length", self.DEFAULT_STR_LINE_LENGTH)

def extract_parts(
self, target_tags: Optional[Iterable[str]] = None, stop_at: Optional[Iterable[str]] = None
) -> List[Tuple[str, str]]:
self,
target_tags: Optional[Iterable[str]] = None,
stop_at: Optional[Iterable[str]] = None,
tag_type: Optional[str] = None,
) -> List[Dict[str, Any]]:
"""
Function to extract target tags from the message.

Expand All @@ -95,17 +101,21 @@ def extract_parts(
Tags to be extracted.
stop_at:
Tags for which extraction should stop.
tag_type:
Type of tags to consider.

Returns
-------
_: List[Tuple[str, str]]
List of extracted tags.
_: List of extracted tags.
"""
if not self.tags:
return []

if tag_type is None:
tag_type = self.effective_tag_key

# List of tags in the message
tag_name_list: List[str] = [x[0] for x in self.tags]
tag_name_list: List[str] = [x[tag_type] for x in self.tags]

if target_tags is None:
target_tags = tag_name_list
Expand All @@ -122,29 +132,67 @@ def extract_parts(
else:
effective_tags = self.tags

return [x for x in effective_tags if x[0] in target_tags]
return [x for x in effective_tags if x[tag_type] in target_tags]

def extract_text(
    self,
    target_tags: Optional[Iterable[str]] = None,
    stop_at: Optional[Iterable[str]] = None,
    tag_type: Optional[str] = None,
    text_type: Optional[str] = None,
    separator: str = "\n",
) -> str:
    """
    Extract the text of the parts matching the target tags and join it into a single string.

    Parameters
    ----------
    target_tags:
        Tags to be extracted (forwarded to `extract_parts`).
    stop_at:
        Tags for which extraction should stop (forwarded to `extract_parts`).
    tag_type:
        Type of tags to consider (forwarded to `extract_parts`).
    text_type:
        Key of the text to read in each extracted part.
        Defaults to `self.effective_text_key` when None.
    separator:
        Separator to join the extracted texts.

    Returns
    -------
    _: Extracted texts joined with `separator` (empty string when no part is extracted).
    """
    if text_type is None:
        text_type = self.effective_text_key

    # Part selection (target tags, stop tags, tag type) is fully delegated to extract_parts
    parts = self.extract_parts(target_tags=target_tags, stop_at=stop_at, tag_type=tag_type)
    return separator.join(part[text_type] for part in parts)

def extract_last_body(
self, target_tags: Iterable[str] = ("BODY",), stop_at: Iterable[str] = ("GREETINGS",)
) -> List[Tuple[str, str]]:
self,
target_tags: Iterable[str] = ("BODY",),
stop_at: Iterable[str] = ("GREETINGS",),
tag_type: Optional[str] = None,
) -> List[Dict[str, Any]]:
"""
Extract the BODY parts of the last message in the email.

Parameters
----------
target_tags: Iterable[str]
stop_at: Iterable[str]
tag_type: Type of tags to consider.

Returns
-------
_: List[Dict[str, Any]]
"""
return self.extract_parts(target_tags=target_tags, stop_at=stop_at)
return self.extract_parts(target_tags=target_tags, stop_at=stop_at, tag_type=tag_type)

def has_tags(
self,
target_tags: Iterable[str] = ("BODY",),
stop_at: Optional[Iterable[str]] = None,
tag_type: Optional[str] = None,
) -> bool:
"""
Function to check if input tags are present in the message.
Expand All @@ -155,6 +203,8 @@ def has_tags(
Tags of interest.
stop_at:
Tags for which extraction should stop.
tag_type:
Type of tags to consider.

Returns
-------
Expand All @@ -164,11 +214,16 @@ def has_tags(
if self.tags is None:
return False

if tag_type is None:
tag_type = self.effective_tag_key

if not stop_at:
stop_at = set()

found: bool = False
for tag, _ in self.tags:
for tag_data in self.tags:
tag = tag_data[tag_type]

# Check if tag in tags of interest
if tag in target_tags:
found = True
Expand All @@ -180,19 +235,27 @@ def has_tags(

return found

def format_tags(self) -> str:
def format_tags(self, tag_type: Optional[str] = None, text_type: Optional[str] = None) -> str:
"""
Create a pretty formatted representation of text and their associated tags.

Returns:
_: Pretty formatted representation of the tags and texts.
"""
if tag_type is None:
tag_type = self.effective_tag_key

if text_type is None:
text_type = self.effective_text_key

if self.tags is None:
return self.text
else:
tag_text_length = self.str_line_length - self.str_tag_name_length
text = ""
for tag_name, tag_text in self.tags:
for tag_data in self.tags:
tag_name = tag_data[tag_type]
tag_text = tag_data[text_type]
text += tag_text.ljust(tag_text_length, ".") + tag_name.rjust(self.str_tag_name_length, ".") + "\n"

return text.strip()
Expand Down
2 changes: 1 addition & 1 deletion melusine/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from melusine.backend import backend
from melusine.backend.base_backend import Any
from melusine.base import MelusineTransformer
from melusine.io import IoMixin
from melusine.io_mixin import IoMixin

T = TypeVar("T")

Expand Down
Loading
Loading