From 983954f252c2c935fd9eb5804f7179842cba4a30 Mon Sep 17 00:00:00 2001 From: Ron Rademaker Date: Wed, 16 May 2018 11:25:28 +0200 Subject: [PATCH 1/4] Increase flexibility of bruteforce signature extration --- talon/signature/bruteforce.py | 140 ++------------------- talon/signature/constants.py | 34 +++++ talon/signature/extractor.py | 195 +++++++++++++++++++++++++++++ tests/signature/bruteforce_test.py | 4 +- 4 files changed, 240 insertions(+), 133 deletions(-) create mode 100644 talon/signature/extractor.py diff --git a/talon/signature/bruteforce.py b/talon/signature/bruteforce.py index e502bab8..c352cfc6 100644 --- a/talon/signature/bruteforce.py +++ b/talon/signature/bruteforce.py @@ -1,64 +1,8 @@ from __future__ import absolute_import - -import logging - -import regex as re +from talon.signature.extractor import BruteForceExtractor from talon.signature.constants import (SIGNATURE_MAX_LINES, TOO_LONG_SIGNATURE_LINE) -from talon.utils import get_delimiter - -log = logging.getLogger(__name__) - -# regex to fetch signature based on common signature words -RE_SIGNATURE = re.compile(r''' - ( - (?: - ^[\s]*--*[\s]*[a-z \.]*$ - | - ^thanks[\s,!]*$ - | - ^regards[\s,!]*$ - | - ^cheers[\s,!]*$ - | - ^best[ a-z]*[\s,!]*$ - ) - .* - ) - ''', re.I | re.X | re.M | re.S) - -# signatures appended by phone email clients -RE_PHONE_SIGNATURE = re.compile(r''' - ( - (?: - ^sent[ ]{1}from[ ]{1}my[\s,!\w]*$ - | - ^sent[ ]from[ ]Mailbox[ ]for[ ]iPhone.*$ - | - ^sent[ ]([\S]*[ ])?from[ ]my[ ]BlackBerry.*$ - | - ^Enviado[ ]desde[ ]mi[ ]([\S]+[ ]){0,2}BlackBerry.*$ - ) - .* - ) - ''', re.I | re.X | re.M | re.S) - -# see _mark_candidate_indexes() for details -# c - could be signature line -# d - line starts with dashes (could be signature or list item) -# l - long line -RE_SIGNATURE_CANDIDATE = re.compile(r''' - (?Pc+d)[^d] - | - (?Pc+d)$ - | - (?Pc+) - | - (?Pd)[^d] - | - (?Pd)$ -''', re.I | re.X | re.M | re.S) def extract_signature(msg_body): @@ -73,46 +17,8 @@ def extract_signature(msg_body): >>> extract_signature('Hey man!') ('Hey man!', None) ''' - try: - # identify line delimiter first - delimiter = get_delimiter(msg_body) - - # make an assumption - stripped_body = msg_body.strip() - phone_signature = None - - # strip off phone signature - phone_signature = RE_PHONE_SIGNATURE.search(msg_body) - if phone_signature: - stripped_body = stripped_body[:phone_signature.start()] - phone_signature = phone_signature.group() - - # decide on signature candidate - lines = stripped_body.splitlines() - candidate = get_signature_candidate(lines) - candidate = delimiter.join(candidate) - - # try to extract signature - signature = RE_SIGNATURE.search(candidate) - if not signature: - return (stripped_body.strip(), phone_signature) - else: - signature = signature.group() - # when we splitlines() and then join them - # we can lose a new line at the end - # we did it when identifying a candidate - # so we had to do it for stripped_body now - stripped_body = delimiter.join(lines) - stripped_body = stripped_body[:-len(signature)] - - if phone_signature: - signature = delimiter.join([signature, phone_signature]) - - return (stripped_body.strip(), - signature.strip()) - except Exception: - log.exception('ERROR extracting signature') - return (msg_body, None) + brute_force_extractor = BruteForceExtractor(max_lines=SIGNATURE_MAX_LINES, max_line_length=TOO_LONG_SIGNATURE_LINE) + return brute_force_extractor.extract_signature(msg_body) def get_signature_candidate(lines): @@ -126,26 +32,8 @@ def get_signature_candidate(lines): * not include more than one line that starts with dashes """ # non empty lines indexes - non_empty = [i for i, line in enumerate(lines) if line.strip()] - - # if message is empty or just one line then there is no signature - if len(non_empty) <= 1: - return [] - - # we don't expect signature to start at the 1st line - candidate = non_empty[1:] - # signature shouldn't be longer then SIGNATURE_MAX_LINES - candidate = candidate[-SIGNATURE_MAX_LINES:] - - markers = _mark_candidate_indexes(lines, candidate) - candidate = _process_marked_candidate_indexes(candidate, markers) - - # get actual lines for the candidate instead of indexes - if candidate: - candidate = lines[candidate[0]:] - return candidate - - return [] + brute_force_extractor = BruteForceExtractor(max_lines=SIGNATURE_MAX_LINES, max_line_length=TOO_LONG_SIGNATURE_LINE) + return brute_force_extractor._get_signature_candidate(lines) def _mark_candidate_indexes(lines, candidate): @@ -161,18 +49,8 @@ def _mark_candidate_indexes(lines, candidate): 'cdc' """ # at first consider everything to be potential signature lines - markers = list('c' * len(candidate)) - - # mark lines starting from bottom up - for i, line_idx in reversed(list(enumerate(candidate))): - if len(lines[line_idx].strip()) > TOO_LONG_SIGNATURE_LINE: - markers[i] = 'l' - else: - line = lines[line_idx].strip() - if line.startswith('-') and line.strip("-"): - markers[i] = 'd' - - return "".join(markers) + brute_force_extractor = BruteForceExtractor(max_lines=SIGNATURE_MAX_LINES, max_line_length=TOO_LONG_SIGNATURE_LINE) + return brute_force_extractor._mark_candidate_indexes(lines, candidate) def _process_marked_candidate_indexes(candidate, markers): @@ -183,5 +61,5 @@ def _process_marked_candidate_indexes(candidate, markers): >>> _process_marked_candidate_indexes([9, 12, 14, 15, 17], 'clddc') [15, 17] """ - match = RE_SIGNATURE_CANDIDATE.match(markers[::-1]) - return candidate[-match.end('candidate'):] if match else [] + brute_force_extractor = BruteForceExtractor(max_lines=SIGNATURE_MAX_LINES, max_line_length=TOO_LONG_SIGNATURE_LINE) + return brute_force_extractor._process_marked_candidate_indexes(candidate, markers) diff --git a/talon/signature/constants.py b/talon/signature/constants.py index 14f2006c..66d72bfa 100644 --- a/talon/signature/constants.py +++ b/talon/signature/constants.py @@ -1,2 +1,36 @@ +import regex as re + SIGNATURE_MAX_LINES = 11 TOO_LONG_SIGNATURE_LINE = 60 + +# signatures appended by phone email clients +RE_PHONE_SIGNATURE = re.compile(r''' + ( + (?: + ^sent[ ]{1}from[ ]{1}my[\s,!\w]*$ + | + ^sent[ ]from[ ]Mailbox[ ]for[ ]iPhone.*$ + | + ^sent[ ]([\S]*[ ])?from[ ]my[ ]BlackBerry.*$ + | + ^Enviado[ ]desde[ ]mi[ ]([\S]+[ ]){0,2}BlackBerry.*$ + ) + .* + ) + ''', re.I | re.X | re.M | re.S) + +# see _mark_candidate_indexes() for details +# c - could be signature line +# d - line starts with dashes (could be signature or list item) +# l - long line +RE_SIGNATURE_CANDIDATE = re.compile(r''' + (?Pc+d)[^d] + | + (?Pc+d)$ + | + (?Pc+) + | + (?Pd)[^d] + | + (?Pd)$ +''', re.I | re.X | re.M | re.S) \ No newline at end of file diff --git a/talon/signature/extractor.py b/talon/signature/extractor.py new file mode 100644 index 00000000..f3b143c3 --- /dev/null +++ b/talon/signature/extractor.py @@ -0,0 +1,195 @@ +""" +Module with object oriented approach to +signature extractions. Built to be more +flexible and to support more languages. +""" +from __future__ import absolute_import +import re +import logging + +from abc import ABC, abstractmethod +from talon.utils import get_delimiter +from talon.signature.constants import (SIGNATURE_MAX_LINES, + TOO_LONG_SIGNATURE_LINE, + RE_SIGNATURE_CANDIDATE, + RE_PHONE_SIGNATURE) + +log = logging.getLogger(__name__) + +# Defaults taken from bruteforce.py +DEFAULT_GREETINGS = ( + '[\s]*--*[\s]*[a-z \.]', + 'thanks[\s,!]', + 'regards[\s,!]', + 'cheers[\s,!]', + 'best[ a-z]*[\s,!]' +) + + +class AbstractExtractor(ABC): + """ + Abstract base class for + signature extractors. + """ + + @abstractmethod + def extract_signature(self, message: str): + """ + Extract the signature from + message and return the + text and signature + + :param message: str + :return: (text: str, signature: str) + """ + pass + + +class BruteForceExtractor(AbstractExtractor): + """ + Brute force signature extractor. + More flexible OO approach to + talon.signatures.bruteforce.extract_signature + """ + + def __init__(self, max_lines=SIGNATURE_MAX_LINES, max_line_length=TOO_LONG_SIGNATURE_LINE, + greetings=DEFAULT_GREETINGS): + """ + Create a new brute force extractor. Allows override + max signature length, max signature line length and + common greetings (allows multi language support). + """ + self.max_lines = max_lines + self.max_line_length = max_line_length + self._compile_greetings(greetings) + + def extract_signature(self, msg_body: str): + """ + Use brute force to extract the + signature (ie. regex and + string matching) + + :param message: str + :return: (text: str, signature: str) + """ + try: + # identify line delimiter first + delimiter = get_delimiter(msg_body) + + # make an assumption + stripped_body = msg_body.strip() + phone_signature = None + + # strip off phone signature + phone_signature = RE_PHONE_SIGNATURE.search(msg_body) + if phone_signature: + stripped_body = stripped_body[:phone_signature.start()] + phone_signature = phone_signature.group() + + # decide on signature candidate + lines = stripped_body.splitlines() + candidate = self._get_signature_candidate(lines) + candidate = delimiter.join(candidate) + + # try to extract signature + signature = self.re_signature.search(candidate) + if not signature: + return (stripped_body.strip(), phone_signature) + else: + signature = signature.group() + # when we splitlines() and then join them + # we can lose a new line at the end + # we did it when identifying a candidate + # so we had to do it for stripped_body now + stripped_body = delimiter.join(lines) + stripped_body = stripped_body[:-len(signature)] + + if phone_signature: + signature = delimiter.join([signature, phone_signature]) + + return (stripped_body.strip(), + signature.strip()) + except Exception: + log.exception('ERROR extracting signature') + return (msg_body, None) + + def _compile_greetings(self, greetings): + """ + Init the regex to detect the + greeting based on the passed + greetings + + :param greetings: + """ + greetings = ['^{}*$'.format(greeting) for greeting in greetings] + greetings = '|'.join(greetings) + self.re_signature = re.compile(r'((?:{}).*)'.format(greetings), re.I | re.X | re.M | re.S) + + def _get_signature_candidate(self, lines): + """Return lines that could hold signature + + The lines should: + + * be among last SIGNATURE_MAX_LINES non-empty lines. + * not include first line + * be shorter than TOO_LONG_SIGNATURE_LINE + * not include more than one line that starts with dashes + """ + # non empty lines indexes + non_empty = [i for i, line in enumerate(lines) if line.strip()] + + # if message is empty or just one line then there is no signature + if len(non_empty) <= 1: + return [] + + # we don't expect signature to start at the 1st line + candidate = non_empty[1:] + # signature shouldn't be longer then SIGNATURE_MAX_LINES + candidate = candidate[-self.max_lines:] + + markers = self._mark_candidate_indexes(lines, candidate) + candidate = self._process_marked_candidate_indexes(candidate, markers) + + # get actual lines for the candidate instead of indexes + if candidate: + candidate = lines[candidate[0]:] + return candidate + + return [] + + def _mark_candidate_indexes(self, lines, candidate): + """Mark candidate indexes with markers + + Markers: + + * c - line that could be a signature line + * l - long line + * d - line that starts with dashes but has other chars as well + + >>> _mark_candidate_lines(['Some text', '', '-', 'Bob'], [0, 2, 3]) + 'cdc' + """ + # at first consider everything to be potential signature lines + markers = list('c' * len(candidate)) + + # mark lines starting from bottom up + for i, line_idx in reversed(list(enumerate(candidate))): + if len(lines[line_idx].strip()) > self.max_line_length: + markers[i] = 'l' + else: + line = lines[line_idx].strip() + if line.startswith('-') and line.strip("-"): + markers[i] = 'd' + + return "".join(markers) + + def _process_marked_candidate_indexes(self, candidate, markers): + """ + Run regexes against candidate's marked indexes to strip + signature candidate. + + >>> _process_marked_candidate_indexes([9, 12, 14, 15, 17], 'clddc') + [15, 17] + """ + match = RE_SIGNATURE_CANDIDATE.match(markers[::-1]) + return candidate[-match.end('candidate'):] if match else [] diff --git a/tests/signature/bruteforce_test.py b/tests/signature/bruteforce_test.py index 382615bb..0d3221ff 100644 --- a/tests/signature/bruteforce_test.py +++ b/tests/signature/bruteforce_test.py @@ -4,7 +4,7 @@ from .. import * from talon.signature import bruteforce - +from talon.signature import extractor def test_empty_body(): eq_(('', None), bruteforce.extract_signature('')) @@ -135,7 +135,7 @@ def test_blackberry_signature(): bruteforce.extract_signature(msg_body)) -@patch.object(bruteforce, 'get_delimiter', Mock(side_effect=Exception())) +@patch.object(extractor, 'get_delimiter', Mock(side_effect=Exception())) def test_crash_in_extract_signature(): msg_body = '''Hey! -roman''' From 7f00af73d1f00d2c4b0cc617973dea59bfd688b3 Mon Sep 17 00:00:00 2001 From: Ron Rademaker Date: Tue, 14 Aug 2018 10:10:48 +0200 Subject: [PATCH 2/4] Fix syntax errors --- talon/signature/data/classifier | Bin 608 -> 729 bytes talon/signature/extractor.py | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/talon/signature/data/classifier b/talon/signature/data/classifier index 1c3a4b0865f3e951b1b3b17fb31bacc48d8d005b..c5c8a706120a2093339a412699fb8f177c981cc3 100644 GIT binary patch literal 729 zcmZ`%O>Yx15Z!ds4>m2dd{fFtp+E}MY6?O^LIM<|N-XT5NX0kHyK%Y(uXo4xhUO3n zF3?}akKu&GkAR62azXfD&Cbkw^WJzrhnfAT1UH)4K|wObtra$UBQ@sU9d;uKw!c7Q z1aocL41z$F)NHZa$P`TZAL7~r8xcuffzEX?MJV<(K}%90Od zs$@!ZT~tF>$qa?CtSl<-qU6;}WBBmB;Z-F}1dFI*X#}U*p4HZ=Cx$5V;7IrQxIKcF zulJ=Anim3=>DpfrvM+^UyB(sdA*|FXUBt>jn6$J4PRD`olp3{VqH@e;T%j>I6F0o^ zUdDa6j)=vD)f*&)9XgK@!XXweb|7LJ*B9dgK5u~VRaYvwz5jgu$&X(f z-5olQZCNS$47T#|i`*H(-F6=3t^D6b@A*aF!$n7knyrgL+e5cUaQ}bMM;`hBp<`WM lE&C+oQSRVjLRT@zravvi2eh)(utOIxy-PYzNe4)sl0TCt-i!bM delta 426 zcmZ9IyG{a85Qg_gSQk(+5sl&<6`~SvF?MP&p&(L_Ol(Znvv6QfvbQ;V7Nsz1qqc+x z@eOTMuQH4y15#h9D+RW=v)IhFaqIz18 zMg@BRdY2jX^E1v#?~;=spxnS@=@Ioi94mqzMpn}y@hYR~MQ+eDycQW|4NQUFW)Ag; zOEC@EQHr%0<>u~Q*$aA@i7ZHVDYyKP;&@Gh>|jEk-Uh7|5eFw_&576YT{9F!P+YBpDtrgmswZ Date: Tue, 14 Aug 2018 10:15:23 +0200 Subject: [PATCH 3/4] Skip default parameters --- talon/signature/bruteforce.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/talon/signature/bruteforce.py b/talon/signature/bruteforce.py index c352cfc6..b61a0ea1 100644 --- a/talon/signature/bruteforce.py +++ b/talon/signature/bruteforce.py @@ -17,7 +17,7 @@ def extract_signature(msg_body): >>> extract_signature('Hey man!') ('Hey man!', None) ''' - brute_force_extractor = BruteForceExtractor(max_lines=SIGNATURE_MAX_LINES, max_line_length=TOO_LONG_SIGNATURE_LINE) + brute_force_extractor = BruteForceExtractor() return brute_force_extractor.extract_signature(msg_body) @@ -32,7 +32,7 @@ def get_signature_candidate(lines): * not include more than one line that starts with dashes """ # non empty lines indexes - brute_force_extractor = BruteForceExtractor(max_lines=SIGNATURE_MAX_LINES, max_line_length=TOO_LONG_SIGNATURE_LINE) + brute_force_extractor = BruteForceExtractor() return brute_force_extractor._get_signature_candidate(lines) @@ -49,7 +49,7 @@ def _mark_candidate_indexes(lines, candidate): 'cdc' """ # at first consider everything to be potential signature lines - brute_force_extractor = BruteForceExtractor(max_lines=SIGNATURE_MAX_LINES, max_line_length=TOO_LONG_SIGNATURE_LINE) + brute_force_extractor = BruteForceExtractor() return brute_force_extractor._mark_candidate_indexes(lines, candidate) @@ -61,5 +61,5 @@ def _process_marked_candidate_indexes(candidate, markers): >>> _process_marked_candidate_indexes([9, 12, 14, 15, 17], 'clddc') [15, 17] """ - brute_force_extractor = BruteForceExtractor(max_lines=SIGNATURE_MAX_LINES, max_line_length=TOO_LONG_SIGNATURE_LINE) + brute_force_extractor = BruteForceExtractor() return brute_force_extractor._process_marked_candidate_indexes(candidate, markers) From be8097c9a64d7139cedccd01f1999b5c0eb3730c Mon Sep 17 00:00:00 2001 From: Ron Rademaker Date: Tue, 14 Aug 2018 10:17:37 +0200 Subject: [PATCH 4/4] Comment line lenghts --- talon/signature/extractor.py | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/talon/signature/extractor.py b/talon/signature/extractor.py index 2280b474..d93d846c 100644 --- a/talon/signature/extractor.py +++ b/talon/signature/extractor.py @@ -1,6 +1,5 @@ """ -Module with object oriented approach to -signature extractions. Built to be more +Module with object oriented approach to signature extractions. Built to be more flexible and to support more languages. """ from __future__ import absolute_import @@ -28,16 +27,13 @@ class AbstractExtractor(ABC): """ - Abstract base class for - signature extractors. + Abstract base class for signature extractors. """ @abstractmethod def extract_signature(self, message): """ - Extract the signature from - message and return the - text and signature + Extract the signature from message and return the text and signature :param message: str :return: (text: str, signature: str) @@ -55,9 +51,8 @@ class BruteForceExtractor(AbstractExtractor): def __init__(self, max_lines=SIGNATURE_MAX_LINES, max_line_length=TOO_LONG_SIGNATURE_LINE, greetings=DEFAULT_GREETINGS): """ - Create a new brute force extractor. Allows override - max signature length, max signature line length and - common greetings (allows multi language support). + Create a new brute force extractor. Allows override max signature length, + max signature line length and common greetings (allows multi language support). """ self.max_lines = max_lines self.max_line_length = max_line_length @@ -65,9 +60,7 @@ def __init__(self, max_lines=SIGNATURE_MAX_LINES, max_line_length=TOO_LONG_SIGNA def extract_signature(self, msg_body): """ - Use brute force to extract the - signature (ie. regex and - string matching) + Use brute force to extract the signature (ie. regex and string matching) :param message: str :return: (text: str, signature: str) @@ -97,10 +90,8 @@ def extract_signature(self, msg_body): return (stripped_body.strip(), phone_signature) else: signature = signature.group() - # when we splitlines() and then join them - # we can lose a new line at the end - # we did it when identifying a candidate - # so we had to do it for stripped_body now + # when we splitlines() and then join them we can lose a new line at the end + # we did it when identifying a candidate so we had to do it for stripped_body now stripped_body = delimiter.join(lines) stripped_body = stripped_body[:-len(signature)] @@ -185,8 +176,7 @@ def _mark_candidate_indexes(self, lines, candidate): def _process_marked_candidate_indexes(self, candidate, markers): """ - Run regexes against candidate's marked indexes to strip - signature candidate. + Run regexes against candidate's marked indexes to strip signature candidate. >>> _process_marked_candidate_indexes([9, 12, 14, 15, 17], 'clddc') [15, 17]