diff --git a/sentencex/base.py b/sentencex/base.py
index a2218e6..7aa2265 100644
--- a/sentencex/base.py
+++ b/sentencex/base.py
@@ -1,9 +1,36 @@
 import re
-from typing import Dict, Iterator, Tuple
+from dataclasses import dataclass
+from typing import Dict, Iterator, List, Optional, Tuple
 
 from .terminators import GLOBAL_SENTENCE_TERMINATORS
 
 
+@dataclass
+class SentenceBoundary:
+    """Class for keeping track of a sentence boundary."""
+
+    start: int = 0  # The start of the boundary region
+    end: int = 0  # The end of the boundary region
+    term_index: int = 0  # The index of the terminator
+    terminator: str = ""  # Terminator character
+    close: str = ""  # Character right after an absorbed skippable range
+
+    def is_ambiguous(self) -> bool:
+        """Whether the sentence terminating punctuation is ambiguous."""
+        # The commas matter: adjacent string literals are concatenated, which
+        # would collapse these four entries into one four-character string.
+        return self.terminator in (
+            "\u002E",  # ATerm # Po FULL STOP
+            "\u2024",  # ATerm # Po ONE DOT LEADER
+            "\uFE52",  # ATerm # Po SMALL FULL STOP
+            "\uFF0E",  # ATerm # Po FULLWIDTH FULL STOP
+        )
+
+    def length(self) -> int:
+        """Number of characters between ``start`` and ``end``."""
+        return self.end - self.start
+
+
 class Languages(type):
     REGISTRY: Dict[str, type] = {}
 
@@ -89,32 +116,33 @@ def is_exclamation_word(self, head: str, tail: str) -> bool:
     def get_lastword(self, text: str):
         return re.split(r"[\s\.]+", text)[-1]
 
-    def findBoundary(self, text, match):
-        tail = text[match.start() + 1 :]
-        head = text[: match.start()]
+    def findBoundary(self, text, match) -> Optional[SentenceBoundary]:
+        """Build the SentenceBoundary for ``match``; None if the match is rejected."""
+        match_start_index, match_end_index = match.span()
+        end = match_end_index
 
-        # Trailing non-final punctuation: not a sentence boundary
-        # if re.match(r"^[,;:]", tail):
-        #     return None
+        tail = text[match_start_index + 1 :]
+        head = text[:match_start_index]
 
         # If next word is numbered reference, expand boundary to that.'
         number_ref_match = self.numbered_reference_regex.match(tail)
 
         if number_ref_match:
-            return match.start() + 1 + len(number_ref_match.group(0))
+            end = match_end_index + len(number_ref_match.group(0))
 
         # Next character is number or lower-case: not a sentence boundary
-        if self.continue_in_next_word(tail):
+        if self.continue_in_next_word(tail) and not number_ref_match:
             return None
         if self.is_abbreviation(head, tail, match.group(0)):
             return None
         if self.is_exclamation_word(head, tail):
             return None
 
-        # Include any closing punctuation and trailing space
-        match_len = len(match.group(0))
-        # print(match_len)
-        return match.start() + match_len
+        continuing_white_spaces = re.match(r"^\s+", tail)
+        if continuing_white_spaces:
+            end = end + len(continuing_white_spaces.group(0))
+
+        return SentenceBoundary(term_index=match_start_index, end=end, terminator=match.group(0))
 
     def continue_in_next_word(self, text_after_boundary) -> bool:
         return re.match(r"^[0-9a-z]", text_after_boundary)
@@ -126,6 +154,64 @@ def get_skippable_ranges(self, text) -> Tuple[int, int]:
         skippable_ranges += [match.span() for match in self.email_regex.finditer(text)]
         return skippable_ranges
 
+    def get_boundaries(self, text: str) -> List[SentenceBoundary]:
+        """
+        Get sentence boundaries in the given input text.
+
+        Args:
+            text (str): The input text to be segmented into sentences.
+
+        Returns:
+            List[SentenceBoundary]: The accepted boundaries in text order, plus one for any trailing text.
+
+        """
+
+        # FIXME Need chunking here. For example, work at paragraphs. Otherwise
+        # regexes need to work on huge text.
+
+        # Initialize a list to store the boundaries of sentences.
+        boundaries = []
+
+        # Find all matches of sentence breaks in the text.
+        matches = self.sentence_break_regex.finditer(text)
+        skippable_ranges = self.get_skippable_ranges(text)
+
+        prev_end = 0
+        # Iterate over each match of sentence breaks.
+        for match in matches:
+            boundary: Optional[SentenceBoundary] = self.findBoundary(text, match)
+
+            # If boundary is None, skip to the next match.
+            if boundary is None:
+                continue
+            boundary.start = prev_end
+
+            # Check if the boundary is inside a skippable range (quote, parentheses, or email).
+            in_range = False
+            for qstart, qend in skippable_ranges:
+                if boundary.end > qstart and boundary.end < qend:
+                    if boundary.end + 1 == qend and self.is_punctuation_between_quotes():
+                        boundary.end = qend
+                        # Slice rather than index: qend can equal len(text).
+                        boundary.close = text[qend : qend + 1]
+                        in_range = False
+                    else:
+                        in_range = True
+                        break
+
+            # If in_range is True, skip to the next match.
+            if in_range:
+                continue
+
+            # Add the boundary to the boundaries list.
+            boundaries.append(boundary)
+            prev_end = boundary.end
+
+        if prev_end != len(text):
+            boundaries.append(SentenceBoundary(start=prev_end, end=len(text)))
+
+        return boundaries
+
     def segment(self, text: str) -> Iterator[str]:
         """
         Splits the given input text into sentences.
@@ -137,54 +223,8 @@ def segment(self, text: str) -> Iterator[str]:
             Iterator[str]: An iterator that yields each sentence from the input text.
 
         """
-        # Split the text into paragraphs using consecutive newlines as delimiters.
-        paragraphs = re.split(r"(\n{2,})", text)
-
-        # Iterate over each paragraph.
-        for paragraph in paragraphs:
-            # Initialize a list to store the boundaries of sentences.
-            boundaries = [0]
-
-            # Find all matches of sentence breaks in the paragraph.
-            matches = self.sentence_break_regex.finditer(paragraph)
-            skippable_ranges = self.get_skippable_ranges(paragraph)
-
-            # Iterate over each match of sentence breaks.
-            for match in matches:
-                # Find the boundary of the sentence.
-                boundary = self.findBoundary(paragraph, match)
-
-                # If boundary is None, skip to the next match.
-                if boundary is None:
-                    continue
-
-                # Check if the boundary is inside a skippable range (quote, parentheses, or email).
-                in_range = False
-                for qstart, qend in skippable_ranges:
-                    if boundary > qstart and boundary < qend:
-                        if boundary + 1 == qend and self.is_punctuation_between_quotes():
-                            boundary = qend
-                            in_range = False
-                        else:
-                            in_range = True
-                            break
-
-                # If in_range is True, skip to the next match.
-                if in_range:
-                    continue
-
-                # Add the boundary to the boundaries list.
-                boundaries.append(boundary)
-
-            # Iterate over each pair of boundaries.
-            for i, j in zip(boundaries, boundaries[1:] + [None]):
-                # Slice the paragraph using the boundaries to get the sentence.
-                sentence = paragraph[i:j]
-
-                # If the sentence has a length, yield the sentence
-                # stripped of leading/trailing spaces.
-                if len(sentence):
-                    yield sentence.strip(" ")
+        for boundary in self.get_boundaries(text):
+            yield text[boundary.start : boundary.end].strip()
 
     def is_punctuation_between_quotes(self) -> bool:
         return False