diff --git a/sentencex/__init__.py b/sentencex/__init__.py
index dc83513..3e699f9 100644
--- a/sentencex/__init__.py
+++ b/sentencex/__init__.py
@@ -1,7 +1,7 @@
 from typing import List
 
 from . import languages
-from .base import Language, Languages
+from .base import Language, Languages, SentenceBoundary
 from .fallbacks import LANGUAGE_FALLBACKS
 
 
@@ -30,4 +30,4 @@ def segment(language, text: str) -> List[str]:
     return get_language_class(language)().segment(text)
 
 
-__all__ = ["languages", "segment"]
+__all__ = ["languages", "segment", "SentenceBoundary"]
diff --git a/sentencex/base.py b/sentencex/base.py
index a2218e6..a9f015d 100644
--- a/sentencex/base.py
+++ b/sentencex/base.py
@@ -1,9 +1,47 @@
 import re
-from typing import Dict, Iterator, Tuple
+from dataclasses import dataclass, field
+from typing import Dict, Iterator, List, Optional, Tuple
 
 from .terminators import GLOBAL_SENTENCE_TERMINATORS
 
 
+@dataclass
+class SentenceBoundary:
+    """Class for keeping track of a sentence boundary."""
+
+    start: int = 0  # The start of the boundary region
+    end: int = 0  # The end of the boundary region
+    term_index: int = 0  # The index of the terminator
+    terminator: str = ""  # Terminator character
+    ambiguous: bool = field(init=False)  # Derived from `terminator` in __post_init__
+
+    def __post_init__(self):
+        self.ambiguous = self.is_ambiguous()
+
+    def is_ambiguous(self) -> bool:
+        """whether the sentence terminating punctuation is ambiguous"""
+        return self.terminator in (
+            "\u002E",  # FULL STOP
+            "\u2024",  # ONE DOT LEADER
+            "\uFE52",  # SMALL FULL STOP
+            "\uFF0E",  # FULLWIDTH FULL STOP
+        )
+
+    def length(self) -> int:
+        """Number of characters between `start` and `end`."""
+        return self.end - self.start
+
+    def apply_offset(self, offset: int):
+        """Shift `start`, `end` and `term_index` by `offset`."""
+        self.start += offset
+        self.end += offset
+        self.term_index += offset
+
+    def get_sentence(self, text):
+        """Return the slice of `text` covered by this boundary."""
+        return text[self.start : self.end]
+
+
 class Languages(type):
     REGISTRY: Dict[str, type] = {}
 
@@ -89,32 +127,33 @@ def is_exclamation_word(self, head: str, tail: str) -> bool:
     def get_lastword(self, text: str):
         return re.split(r"[\s\.]+", text)[-1]
 
-    def findBoundary(self, text, match):
-        tail = text[match.start() + 1 :]
-        head = text[: match.start()]
+    def findBoundary(self, text, match) -> Optional[SentenceBoundary]:
+        """Return the boundary for `match`, or None when it does not end a sentence."""
+        [match_start_index, match_end_index] = match.span()
+        end = match_end_index
 
-        # Trailing non-final punctuation: not a sentence boundary
-        # if re.match(r"^[,;:]", tail):
-        #     return None
+        tail = text[match_start_index + 1 :]
+        head = text[:match_start_index]
 
         # If next word is numbered reference, expand boundary to that.'
         number_ref_match = self.numbered_reference_regex.match(tail)
 
         if number_ref_match:
-            return match.start() + 1 + len(number_ref_match.group(0))
+            end = match_end_index + len(number_ref_match.group(0))
 
         # Next character is number or lower-case: not a sentence boundary
-        if self.continue_in_next_word(tail):
+        if self.continue_in_next_word(tail) and not number_ref_match:
             return None
         if self.is_abbreviation(head, tail, match.group(0)):
             return None
         if self.is_exclamation_word(head, tail):
             return None
 
-        # Include any closing punctuation and trailing space
-        match_len = len(match.group(0))
-        # print(match_len)
-        return match.start() + match_len
+        continuing_white_spaces = re.match(r"^\s+", tail)
+        if continuing_white_spaces:
+            end = end + len(continuing_white_spaces.group(0))
+
+        return SentenceBoundary(term_index=match_start_index, end=end, terminator=match.group(0))
 
     def continue_in_next_word(self, text_after_boundary) -> bool:
         return re.match(r"^[0-9a-z]", text_after_boundary)
@@ -126,44 +165,48 @@ def get_skippable_ranges(self, text) -> Tuple[int, int]:
         skippable_ranges += [match.span() for match in self.email_regex.finditer(text)]
         return skippable_ranges
 
-    def segment(self, text: str) -> Iterator[str]:
+    def get_boundaries(self, text: str) -> Iterator[SentenceBoundary]:
         """
-        Splits the given input text into sentences.
+        Get sentence boundaries in the given input text.
 
         Args:
             text (str): The input text to be segmented into sentences.
 
         Yields:
-            Iterator[str]: An iterator that yields each sentence from the input text.
+            Iterator[SentenceBoundary]: An iterator that yields `SentenceBoundary`.
 
         """
+
         # Split the text into paragraphs using consecutive newlines as delimiters.
-        paragraphs = re.split(r"(\n{2,})", text)
+        # The result will have the delimiter as member of array.
+        paragraphs: List[str] = re.split(r"([\n]{2})", text)
 
-        # Iterate over each paragraph.
-        for paragraph in paragraphs:
-            # Initialize a list to store the boundaries of sentences.
-            boundaries = [0]
+        # paragraph offset
+        offset: int = 0
+        paragraph: str
 
+        for paragraph in paragraphs:
             # Find all matches of sentence breaks in the paragraph.
             matches = self.sentence_break_regex.finditer(paragraph)
             skippable_ranges = self.get_skippable_ranges(paragraph)
 
+            prev_end = 0  # end of the previous sentence, relative to this paragraph
             # Iterate over each match of sentence breaks.
             for match in matches:
-                # Find the boundary of the sentence.
-                boundary = self.findBoundary(paragraph, match)
+                boundary: Optional[SentenceBoundary] = self.findBoundary(paragraph, match)
 
                 # If boundary is None, skip to the next match.
                 if boundary is None:
                     continue
 
+                boundary.start = prev_end
                 # Check if the boundary is inside a skippable range (quote, parentheses, or email).
                 in_range = False
                 for qstart, qend in skippable_ranges:
-                    if boundary > qstart and boundary < qend:
-                        if boundary + 1 == qend and self.is_punctuation_between_quotes():
-                            boundary = qend
+                    if boundary.end > qstart and boundary.end < qend:
+                        if boundary.end + 1 == qend and self.is_punctuation_between_quotes():
+                            boundary.end = qend
+                            boundary.close = paragraph[qend - 1]
                             in_range = False
                         else:
                             in_range = True
@@ -174,17 +217,29 @@ def segment(self, text: str) -> Iterator[str]:
                     continue
 
                 # Add the boundary to the boundaries list.
-                boundaries.append(boundary)
+                prev_end = boundary.end  # record before shifting to whole-text coordinates
+                boundary.apply_offset(offset)
+                yield boundary
 
-            # Iterate over each pair of boundaries.
-            for i, j in zip(boundaries, boundaries[1:] + [None]):
-                # Slice the paragraph using the boundaries to get the sentence.
-                sentence = paragraph[i:j]
+            if prev_end != len(paragraph):
+                yield SentenceBoundary(start=prev_end + offset, end=len(paragraph) + offset)
 
-                # If the sentence has a length, yield the sentence
-                # stripped of leading/trailing spaces.
-                if len(sentence):
-                    yield sentence.strip(" ")
+            offset += len(paragraph)
+
+    def segment(self, text: str) -> Iterator[str]:
+        """
+        Splits the given input text into sentences.
+
+        Args:
+            text (str): The input text to be segmented into sentences.
+
+        Yields:
+            Iterator[str]: An iterator that yields each sentence from the input text.
+
+        """
+        boundaries = self.get_boundaries(text)
+        for boundary in boundaries:
+            yield boundary.get_sentence(text).strip(" ")
 
     def is_punctuation_between_quotes(self) -> bool:
         return False