Introduce get_boundaries API

wikimedia · Nov 15, 2023 · c5e4827 · c5e4827
1 parent bf1ba36
commit c5e4827
Show file tree

Hide file tree

Showing 2 changed files with 91 additions and 37 deletions.
diff --git a/sentencex/__init__.py b/sentencex/__init__.py
@@ -1,7 +1,7 @@
 from typing import List
 
 from . import languages
-from .base import Language, Languages
+from .base import Language, Languages, SentenceBoundary
 from .fallbacks import LANGUAGE_FALLBACKS
 
 
@@ -30,4 +30,4 @@ def segment(language, text: str) -> List[str]:
     return get_language_class(language)().segment(text)
 
 
-__all__ = ["languages", "segment"]
+__all__ = ["languages", "segment", "SentenceBoundary"]
diff --git a/sentencex/base.py b/sentencex/base.py
@@ -1,9 +1,47 @@
 import re
-from typing import Dict, Iterator, Tuple
+from typing import Dict, Iterator, Tuple, List
+from dataclasses import dataclass, field
 
 from .terminators import GLOBAL_SENTENCE_TERMINATORS
 
 
+@dataclass
+class SentenceBoundary:
+    """Span of one sentence (``text[start:end]``) plus the terminator that ends it."""
+
+    start: int = 0  # The start of the boundary region
+    end: int = 0  # The end of the boundary region
+    term_index: int = 0  # The index of the terminator
+    terminator: str = ""  # Terminator character
+    ambiguous: bool = field(init=False)  # Derived from terminator in __post_init__
+
+    def __post_init__(self):
+        self.ambiguous = self.is_ambiguous()
+
+    def is_ambiguous(self) -> bool:
+        """Return whether the terminator is one of the ambiguous full-stop characters."""
+        return self.terminator in (
+            "\u002E",  # FULL STOP
+            "\u2024",  # ONE DOT LEADER
+            "\uFE52",  # SMALL FULL STOP
+            "\uFF0E",  # FULLWIDTH FULL STOP
+        )
+
+    def length(self) -> int:
+        """Return the number of characters between ``start`` and ``end``."""
+        return self.end - self.start
+
+    def apply_offset(self, offset: int) -> None:
+        """Shift ``start``, ``end`` and ``term_index`` by ``offset``, in place."""
+        self.start += offset
+        self.end += offset
+        self.term_index += offset
+
+    def get_sentence(self, text):
+        """Return the slice of ``text`` covered by this boundary."""
+        return text[self.start : self.end]
+
+
 class Languages(type):
     REGISTRY: Dict[str, type] = {}
 
@@ -89,32 +127,32 @@ def is_exclamation_word(self, head: str, tail: str) -> bool:
     def get_lastword(self, text: str):
         return re.split(r"[\s\.]+", text)[-1]
 
-    def findBoundary(self, text, match):
-        tail = text[match.start() + 1 :]
-        head = text[: match.start()]
+    def findBoundary(self, text, match) -> SentenceBoundary:
+        """
+        Decide whether a sentence-break match really ends a sentence.
+
+        Args:
+            text: The text that was searched.
+            match: A regex match object for a candidate terminator in ``text``.
+
+        Returns:
+            A SentenceBoundary with ``term_index``, ``end`` and ``terminator``
+            set (``start`` is left at its default), or None when the match is
+            rejected by the continuation, abbreviation or exclamation-word checks.
+
+        """
+        # NOTE(review): the annotation says SentenceBoundary, but None is
+        # returned on the rejection paths below.
+        [match_start_index, match_end_index] = match.span()
+        end = match_end_index
 
-        # Trailing non-final punctuation: not a sentence boundary
-        # if re.match(r"^[,;:]", tail):
-        #     return None
+        tail = text[match_start_index + 1 :]
+        head = text[:match_start_index]
 
         # If next word is numbered reference, expand boundary to that.'
         number_ref_match = self.numbered_reference_regex.match(tail)
 
         if number_ref_match:
-            return match.start() + 1 + len(number_ref_match.group(0))
+            end = match_end_index + len(number_ref_match.group(0))
 
         # Next character is number or lower-case: not a sentence boundary
-        if self.continue_in_next_word(tail):
+        if self.continue_in_next_word(tail) and not number_ref_match:
             return None
         if self.is_abbreviation(head, tail, match.group(0)):
             return None
         if self.is_exclamation_word(head, tail):
             return None
 
-        # Include any closing punctuation and trailing space
-        match_len = len(match.group(0))
-        # print(match_len)
-        return match.start() + match_len
+        # Extend the boundary over any whitespace found at the start of tail.
+        continuing_white_spaces = re.match(r"^\s+", tail)
+        if continuing_white_spaces:
+            end = end + len(continuing_white_spaces.group(0))
+
+        return SentenceBoundary(term_index=match_start_index, end=end, terminator=match.group(0))
 
     def continue_in_next_word(self, text_after_boundary) -> bool:
         return re.match(r"^[0-9a-z]", text_after_boundary)
@@ -126,44 +164,48 @@ def get_skippable_ranges(self, text) -> Tuple[int, int]:
         skippable_ranges += [match.span() for match in self.email_regex.finditer(text)]
         return skippable_ranges
 
-    def segment(self, text: str) -> Iterator[str]:
+    def get_boundaries(self, text: str) -> Iterator[SentenceBoundary]:
         """
-        Splits the given input text into sentences.
+        Get sentence boundaries in the given input text.
 
         Args:
             text (str): The input text to be segmented into sentences.
 
         Yields:
-            Iterator[str]: An iterator that yields each sentence from the input text.
+            Iterator[SentenceBoundary]: Boundaries whose indices are absolute positions in `text`.
 
         """
+
         # Split the text into paragraphs using consecutive newlines as delimiters.
-        paragraphs = re.split(r"(\n{2,})", text)
+        # The capturing group keeps each delimiter as its own list element.
+        paragraphs: List[str] = re.split(r"([\n]{2})", text)
 
-        # Iterate over each paragraph.
-        for paragraph in paragraphs:
-            # Initialize a list to store the boundaries of sentences.
-            boundaries = [0]
+        # Index in `text` at which the current paragraph starts.
+        offset: int = 0
 
+        paragraph: str
+        for paragraph in paragraphs:
             # Find all matches of sentence breaks in the paragraph.
             matches = self.sentence_break_regex.finditer(paragraph)
             skippable_ranges = self.get_skippable_ranges(paragraph)
 
+            prev_end = 0  # paragraph-local; boundary.apply_offset() shifts it to absolute
             # Iterate over each match of sentence breaks.
             for match in matches:
-                # Find the boundary of the sentence.
-                boundary = self.findBoundary(paragraph, match)
+                boundary: SentenceBoundary = self.findBoundary(paragraph, match)
 
                 # If boundary is None, skip to the next match.
                 if boundary is None:
                     continue
+                boundary.start = prev_end
 
                 # Check if the boundary is inside a skippable range (quote, parentheses, or email).
                 in_range = False
                 for qstart, qend in skippable_ranges:
-                    if boundary > qstart and boundary < qend:
-                        if boundary + 1 == qend and self.is_punctuation_between_quotes():
-                            boundary = qend
+                    if boundary.end > qstart and boundary.end < qend:
+                        if boundary.end + 1 == qend and self.is_punctuation_between_quotes():
+                            boundary.end = qend
+                            boundary.close = paragraph[qend - 1]
                             in_range = False
                         else:
                             in_range = True
@@ -174,17 +216,29 @@ def segment(self, text: str) -> Iterator[str]:
                     continue
 
                 # Add the boundary to the boundaries list.
-                boundaries.append(boundary)
+                prev_end = boundary.end
+                boundary.apply_offset(offset)
+                yield boundary
 
-            # Iterate over each pair of boundaries.
-            for i, j in zip(boundaries, boundaries[1:] + [None]):
-                # Slice the paragraph using the boundaries to get the sentence.
-                sentence = paragraph[i:j]
+            if prev_end != len(paragraph):
+                yield SentenceBoundary(start=prev_end + offset, end=len(paragraph) + offset)
 
-                # If the sentence has a length, yield the sentence
-                # stripped of leading/trailing spaces.
-                if len(sentence):
-                    yield sentence.strip(" ")
+            offset += len(paragraph)
+
+    def segment(self, text: str) -> Iterator[str]:
+        """
+        Break the input text into its sentences.
+
+        Args:
+            text (str): The text to split.
+
+        Yields:
+            Iterator[str]: One sentence per boundary produced by `get_boundaries`,
+            with leading and trailing space characters removed.
+
+        """
+        for sentence_span in self.get_boundaries(text):
+            yield sentence_span.get_sentence(text).strip(" ")
 
     def is_punctuation_between_quotes(self) -> bool:
         return False