Skip to content

Commit

Permalink
Introduce get_boundaries API
Browse files Browse the repository at this point in the history
  • Loading branch information
santhoshtr committed Nov 15, 2023
1 parent bf1ba36 commit c5e4827
Show file tree
Hide file tree
Showing 2 changed files with 91 additions and 37 deletions.
4 changes: 2 additions & 2 deletions sentencex/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import List

from . import languages
from .base import Language, Languages
from .base import Language, Languages, SentenceBoundary
from .fallbacks import LANGUAGE_FALLBACKS


Expand Down Expand Up @@ -30,4 +30,4 @@ def segment(language, text: str) -> List[str]:
return get_language_class(language)().segment(text)


__all__ = ["languages", "segment"]
__all__ = ["languages", "segment", "SentenceBoundary"]
124 changes: 89 additions & 35 deletions sentencex/base.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,47 @@
import re
from typing import Dict, Iterator, Tuple
from typing import Dict, Iterator, Tuple, List
from dataclasses import dataclass, field

from .terminators import GLOBAL_SENTENCE_TERMINATORS


@dataclass
class SentenceBoundary:
    """Class for keeping track of a sentence boundary.

    All positions are integer indices into the text the boundary was
    computed from; `start`/`end` are used as a half-open slice
    (`text[start:end]`).
    """

    # Full-stop style terminators that `is_ambiguous` reports as ambiguous.
    # Deliberately left un-annotated so the dataclass does not treat it as a field.
    _AMBIGUOUS_TERMINATORS = frozenset(
        {
            "\u002E",  # FULL STOP
            "\u2024",  # ONE DOT LEADER
            "\uFE52",  # SMALL FULL STOP
            "\uFF0E",  # FULLWIDTH FULL STOP
        }
    )

    start: int = 0  # The start of the boundary region
    end: int = 0  # The end of the boundary region
    term_index: int = 0  # The index of the terminator
    terminator: str = ""  # Terminator character
    ambiguous: bool = field(init=False)  # Derived from `terminator` in __post_init__

    def __post_init__(self):
        # `ambiguous` is not an __init__ argument; compute it once from the terminator.
        self.ambiguous = self.is_ambiguous()

    def is_ambiguous(self) -> bool:
        """Whether the sentence terminating punctuation is ambiguous.

        Returns True when `terminator` is exactly one of the full-stop
        characters (FULL STOP, ONE DOT LEADER, SMALL FULL STOP,
        FULLWIDTH FULL STOP), False otherwise.
        """
        # The candidates must be separate elements: adjacent string literals
        # without commas would silently concatenate into a single string.
        return self.terminator in self._AMBIGUOUS_TERMINATORS

    def length(self) -> int:
        """Return the number of characters covered by the region (`end - start`)."""
        return self.end - self.start

    def apply_offset(self, offset: int):
        """Shift `start`, `end` and `term_index` in place by `offset`."""
        self.start += offset
        self.end += offset
        self.term_index += offset

    def get_sentence(self, text):
        """Return the slice `text[start:end]` for this boundary."""
        return text[self.start : self.end]


class Languages(type):
REGISTRY: Dict[str, type] = {}

Expand Down Expand Up @@ -89,32 +127,32 @@ def is_exclamation_word(self, head: str, tail: str) -> bool:
def get_lastword(self, text: str):
    """Return the final piece of `text` after splitting on runs of whitespace and periods.

    The result is an empty string when `text` is empty or ends with
    whitespace or a period.
    """
    pieces = re.split(r"[\s\.]+", text)
    return pieces[-1]

def findBoundary(self, text, match) -> SentenceBoundary:
    """
    Decide whether a sentence-break match is a real sentence boundary.
    Args:
        text (str): The text the match was found in.
        match: Regex match object for a candidate sentence terminator in `text`.
    Returns:
        SentenceBoundary: A boundary whose `term_index` is the match start, whose
            `terminator` is the matched text and whose `end` covers the match plus
            any numbered reference / leading whitespace found in the tail.
            `start` is left at its default. Returns None (despite the annotation)
            when the match is rejected as a boundary.
    """
    match_start_index, match_end_index = match.span()
    end = match_end_index

    # tail: everything from one character after the match start;
    # head: everything before the match.
    tail = text[match_start_index + 1 :]
    head = text[:match_start_index]

    # If the next word is a numbered reference, expand the boundary to include it.
    number_ref_match = self.numbered_reference_regex.match(tail)
    if number_ref_match:
        end = match_end_index + len(number_ref_match.group(0))

    # Next character is a digit or lower-case letter: not a sentence boundary,
    # unless what follows was recognised as a numbered reference above.
    if self.continue_in_next_word(tail) and not number_ref_match:
        return None
    if self.is_abbreviation(head, tail, match.group(0)):
        return None
    if self.is_exclamation_word(head, tail):
        return None

    # Include any whitespace at the very start of the tail in the boundary.
    continuing_white_spaces = re.match(r"^\s+", tail)
    if continuing_white_spaces:
        end = end + len(continuing_white_spaces.group(0))

    return SentenceBoundary(term_index=match_start_index, end=end, terminator=match.group(0))

def continue_in_next_word(self, text_after_boundary) -> bool:
    """Return True when `text_after_boundary` starts with an ASCII digit or lower-case letter.

    Args:
        text_after_boundary (str): The text following a candidate terminator.
    Returns:
        bool: True if the first character matches `[0-9a-z]`, False otherwise
            (including for an empty string).
    """
    # re.match returns a Match object or None; convert so the declared
    # `bool` return type is actually honoured. Truthiness is unchanged.
    return re.match(r"^[0-9a-z]", text_after_boundary) is not None
Expand All @@ -126,44 +164,48 @@ def get_skippable_ranges(self, text) -> Tuple[int, int]:
skippable_ranges += [match.span() for match in self.email_regex.finditer(text)]
return skippable_ranges

def segment(self, text: str) -> Iterator[str]:
def get_boundaries(self, text: str) -> Iterator[SentenceBoundary]:
"""
Splits the given input text into sentences.
Get sentence boundaries in the given input text.
Args:
text (str): The input text to be segmented into sentences.
Yields:
Iterator[str]: An iterator that yields each sentence from the input text.
Iterator[SentenceBoundary]: An iterator that yields `SentenceBoundary` from the input text.
"""

# Split the text into paragraphs using consecutive newlines as delimiters.
paragraphs = re.split(r"(\n{2,})", text)
# The result will have the delimiter as member of array.
paragraphs: List[str] = re.split(r"([\n]{2})", text)

# Iterate over each paragraph.
for paragraph in paragraphs:
# Initialize a list to store the boundaries of sentences.
boundaries = [0]
# paragraph offset
offset: int = 0

paragraph: str
for paragraph in paragraphs:
# Find all matches of sentence breaks in the paragraph.
matches = self.sentence_break_regex.finditer(paragraph)
skippable_ranges = self.get_skippable_ranges(paragraph)

prev_end = offset
# Iterate over each match of sentence breaks.
for match in matches:
# Find the boundary of the sentence.
boundary = self.findBoundary(paragraph, match)
boundary: SentenceBoundary = self.findBoundary(paragraph, match)

# If boundary is None, skip to the next match.
if boundary is None:
continue
boundary.start = prev_end

# Check if the boundary is inside a skippable range (quote, parentheses, or email).
in_range = False
for qstart, qend in skippable_ranges:
if boundary > qstart and boundary < qend:
if boundary + 1 == qend and self.is_punctuation_between_quotes():
boundary = qend
if boundary.end > qstart and boundary.end < qend:
if boundary.end + 1 == qend and self.is_punctuation_between_quotes():
boundary.end = qend
boundary.close = paragraph[qend - 1]
in_range = False
else:
in_range = True
Expand All @@ -174,17 +216,29 @@ def segment(self, text: str) -> Iterator[str]:
continue

# Add the boundary to the boundaries list.
boundaries.append(boundary)
boundary.apply_offset(offset)
yield boundary
prev_end = boundary.end

# Iterate over each pair of boundaries.
for i, j in zip(boundaries, boundaries[1:] + [None]):
# Slice the paragraph using the boundaries to get the sentence.
sentence = paragraph[i:j]
if prev_end != len(paragraph):
yield SentenceBoundary(start=prev_end, end=len(paragraph) + offset)

# If the sentence has a length, yield the sentence
# stripped of leading/trailing spaces.
if len(sentence):
yield sentence.strip(" ")
offset += len(paragraph)

def segment(self, text: str) -> Iterator[str]:
    """
    Split the input text into sentences.
    Args:
        text (str): The text to cut into sentences.
    Yields:
        Iterator[str]: One sentence per boundary reported by `get_boundaries`,
            with leading/trailing space characters removed.
    """
    for span in self.get_boundaries(text):
        sentence = span.get_sentence(text)
        # Only plain spaces are stripped; other whitespace is preserved.
        yield sentence.strip(" ")

def is_punctuation_between_quotes(self) -> bool:
    """Return whether a boundary just inside the end of a skippable range may be extended to that end.

    This implementation always answers False.
    """
    return False

0 comments on commit c5e4827

Please sign in to comment.