Introduce get_boundaries API

wikimedia · Nov 14, 2023 · 069127f · 069127f
1 parent bf1ba36
commit 069127f
Showing 1 changed file with 100 additions and 61 deletions.
diff --git a/sentencex/base.py b/sentencex/base.py
@@ -1,9 +1,35 @@
 import re
-from typing import Dict, Iterator, Tuple
+from typing import Dict, Iterator, Tuple, List
+from dataclasses import dataclass
 
 from .terminators import GLOBAL_SENTENCE_TERMINATORS
 
 
+@dataclass
+class SentenceBoundary:
+    """Class for keeping track of a sentence boundary.
+
+    All fields have defaults, so a boundary can be created empty and filled
+    in later.
+    """
+
+    start: int = 0  # The start of the boundary region
+    end: int = 0  # The end of the boundary region
+    term_index: int = 0  # The index of the terminator
+    terminator: str = ""  # Terminator character
+
+    def is_ambiguous(self) -> bool:
+        """Return whether the sentence terminating punctuation is ambiguous."""
+        # The commas matter: adjacent string literals without commas are
+        # concatenated into one string, which no single terminator equals.
+        return self.terminator in (
+            "\u002E",  # ATerm # Po       FULL STOP
+            "\u2024",  # ATerm # Po       ONE DOT LEADER
+            "\uFE52",  # ATerm # Po       SMALL FULL STOP
+            "\uFF0E",  # ATerm # Po       FULLWIDTH FULL STOP
+        )
+
+    def length(self) -> int:
+        """Return the size of the boundary region, i.e. ``end - start``."""
+        # start and end are ints, so the size is their difference; calling
+        # len() on that difference would raise TypeError.
+        return self.end - self.start
+
+
 class Languages(type):
     REGISTRY: Dict[str, type] = {}
 
@@ -89,32 +115,32 @@ def is_exclamation_word(self, head: str, tail: str) -> bool:
     def get_lastword(self, text: str):
         return re.split(r"[\s\.]+", text)[-1]
 
-    def findBoundary(self, text, match):
-        tail = text[match.start() + 1 :]
-        head = text[: match.start()]
+    def findBoundary(self, text, match) -> SentenceBoundary:
+        """
+        Decide whether a sentence-break match is a sentence boundary.
+
+        Args:
+            text: The text that was searched.
+            match: A regex match object for a candidate terminator in text.
+
+        Returns:
+            A SentenceBoundary with term_index, end and terminator filled in
+            (start keeps its default), or None when the next word continues
+            the sentence (and no numbered reference follows), or when the
+            match is judged to be an abbreviation or an exclamation word.
+
+        """
+        [match_start_index, match_end_index] = match.span()
+        end = match_end_index
 
-        # Trailing non-final punctuation: not a sentence boundary
-        # if re.match(r"^[,;:]", tail):
-        #     return None
+        # tail starts one character after the start of the match; head is
+        # everything before the match.
+        tail = text[match_start_index + 1 :]
+        head = text[:match_start_index]
 
         # If next word is numbered reference, expand boundary to that.'
         number_ref_match = self.numbered_reference_regex.match(tail)
 
         if number_ref_match:
-            return match.start() + 1 + len(number_ref_match.group(0))
+            end = match_end_index + len(number_ref_match.group(0))
 
         # Next character is number or lower-case: not a sentence boundary
-        if self.continue_in_next_word(tail):
+        if self.continue_in_next_word(tail) and not number_ref_match:
             return None
         if self.is_abbreviation(head, tail, match.group(0)):
             return None
         if self.is_exclamation_word(head, tail):
             return None
 
-        # Include any closing punctuation and trailing space
-        match_len = len(match.group(0))
-        # print(match_len)
-        return match.start() + match_len
+        # Extend the boundary over any whitespace at the start of tail.
+        continuing_white_spaces = re.match(r"^\s+", tail)
+        if continuing_white_spaces:
+            end = end + len(continuing_white_spaces.group(0))
+
+        return SentenceBoundary(term_index=match_start_index, end=end, terminator=match.group(0))
 
     def continue_in_next_word(self, text_after_boundary) -> bool:
         return re.match(r"^[0-9a-z]", text_after_boundary)
@@ -126,6 +152,63 @@ def get_skippable_ranges(self, text) -> Tuple[int, int]:
         skippable_ranges += [match.span() for match in self.email_regex.finditer(text)]
         return skippable_ranges
 
+    def get_boundaries(self, text: str) -> List[SentenceBoundary]:
+        """
+        Get sentence boundaries in the given input text.
+
+        Args:
+            text (str): The input text to be segmented into sentences.
+
+        Returns:
+            List[SentenceBoundary]: The accepted boundaries, in the order their
+            terminators are matched. Each boundary starts where the previous
+            accepted one ended; if text remains after the last accepted
+            boundary, a final boundary without a terminator covers it up to
+            the end of the text.
+
+        """
+
+        # FIXME Need chunking here. For example, work at paragraphs. Otherwise
+        # regexes need to work on huge text.
+
+        # Initialize a list to store the boundaries of sentences.
+        boundaries = []
+        text_length = len(text)
+
+        # Find all matches of sentence breaks in the text.
+        matches = self.sentence_break_regex.finditer(text)
+        skippable_ranges = self.get_skippable_ranges(text)
+
+        prev_end = 0
+        # Iterate over each match of sentence breaks.
+        for match in matches:
+            # findBoundary returns None when the match is not a boundary.
+            boundary = self.findBoundary(text, match)
+            if boundary is None:
+                continue
+            boundary.start = prev_end
+
+            # Check if the boundary is inside a skippable range (quote, parentheses, or email).
+            in_range = False
+            for qstart, qend in skippable_ranges:
+                if qstart < boundary.end < qend:
+                    if boundary.end + 1 == qend and self.is_punctuation_between_quotes():
+                        # The boundary stops one character short of the end
+                        # of the range: extend it to the end of the range.
+                        boundary.end = qend
+                        # qend is an exclusive end offset, so it equals
+                        # len(text) when the range closes the text; there is
+                        # no following character to record in that case.
+                        boundary.close = text[qend] if qend < text_length else ""
+                    else:
+                        in_range = True
+                    break
+
+            # If in_range is True, skip to the next match.
+            if in_range:
+                continue
+
+            # Add the boundary to the boundaries list.
+            boundaries.append(boundary)
+            prev_end = boundary.end
+
+        # Cover whatever text is left after the last accepted boundary.
+        if prev_end != text_length:
+            boundaries.append(SentenceBoundary(start=prev_end, end=text_length))
+
+        return boundaries
+
     def segment(self, text: str) -> Iterator[str]:
         """
         Splits the given input text into sentences.
@@ -137,54 +220,10 @@ def segment(self, text: str) -> Iterator[str]:
             Iterator[str]: An iterator that yields each sentence from the input text.
 
         """
-        # Split the text into paragraphs using consecutive newlines as delimiters.
-        paragraphs = re.split(r"(\n{2,})", text)
-
-        # Iterate over each paragraph.
-        for paragraph in paragraphs:
-            # Initialize a list to store the boundaries of sentences.
-            boundaries = [0]
-
-            # Find all matches of sentence breaks in the paragraph.
-            matches = self.sentence_break_regex.finditer(paragraph)
-            skippable_ranges = self.get_skippable_ranges(paragraph)
-
-            # Iterate over each match of sentence breaks.
-            for match in matches:
-                # Find the boundary of the sentence.
-                boundary = self.findBoundary(paragraph, match)
-
-                # If boundary is None, skip to the next match.
-                if boundary is None:
-                    continue
-
-                # Check if the boundary is inside a skippable range (quote, parentheses, or email).
-                in_range = False
-                for qstart, qend in skippable_ranges:
-                    if boundary > qstart and boundary < qend:
-                        if boundary + 1 == qend and self.is_punctuation_between_quotes():
-                            boundary = qend
-                            in_range = False
-                        else:
-                            in_range = True
-                        break
-
-                # If in_range is True, skip to the next match.
-                if in_range:
-                    continue
-
-                # Add the boundary to the boundaries list.
-                boundaries.append(boundary)
-
-            # Iterate over each pair of boundaries.
-            for i, j in zip(boundaries, boundaries[1:] + [None]):
-                # Slice the paragraph using the boundaries to get the sentence.
-                sentence = paragraph[i:j]
-
-                # If the sentence has a length, yield the sentence
-                # stripped of leading/trailing spaces.
-                if len(sentence):
-                    yield sentence.strip(" ")
+        boundaries = self.get_boundaries(text)
+        # Each sentence is the text between a boundary's start and end, stripped of surrounding whitespace.
+        for boundary in boundaries:
+            yield text[boundary.start : boundary.end].strip()
 
     def is_punctuation_between_quotes(self) -> bool:
         return False