Skip to content

Commit

Permalink
Introduce get_boundaries API
Browse files Browse the repository at this point in the history
  • Loading branch information
santhoshtr committed Nov 14, 2023
1 parent bf1ba36 commit 069127f
Showing 1 changed file with 100 additions and 61 deletions.
161 changes: 100 additions & 61 deletions sentencex/base.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,35 @@
import re
from dataclasses import dataclass
from typing import Dict, Iterator, List, Optional, Tuple

from .terminators import GLOBAL_SENTENCE_TERMINATORS


@dataclass
class SentenceBoundary:
    """Class for keeping track of a sentence boundary.

    ``start`` and ``end`` are offsets into the segmented text; the slice
    ``text[start:end]`` is the sentence that this boundary closes.
    """

    start: int = 0  # The start of the boundary region
    end: int = 0  # The end of the boundary region
    term_index: int = 0  # The index of the terminator
    terminator: str = ""  # Terminator character

    def is_ambiguous(self) -> bool:
        """Return whether the sentence terminating punctuation is ambiguous.

        Only an exact match against one of the full-stop characters below
        counts; an empty terminator is never ambiguous.
        """
        # The items must be comma separated: adjacent string literals are
        # concatenated by the parser, which would leave a single
        # four-character entry that no single terminator can equal.
        return self.terminator in (
            "\u002E",  # ATerm # Po FULL STOP
            "\u2024",  # ATerm # Po ONE DOT LEADER
            "\uFE52",  # ATerm # Po SMALL FULL STOP
            "\uFF0E",  # ATerm # Po FULLWIDTH FULL STOP
        )

    def length(self) -> int:
        """Return the number of characters between ``start`` and ``end``."""
        return self.end - self.start


class Languages(type):
REGISTRY: Dict[str, type] = {}

Expand Down Expand Up @@ -89,32 +115,32 @@ def is_exclamation_word(self, head: str, tail: str) -> bool:
def get_lastword(self, text: str):
    """Return the last piece of ``text`` after splitting on whitespace/full stops.

    The text is split on runs of whitespace and ``.`` characters, so the
    result is an empty string when ``text`` is empty or ends with such a run.
    """
    pieces = re.split(r"[\s\.]+", text)
    return pieces[-1]

def findBoundary(self, text, match) -> Optional[SentenceBoundary]:
    """Decide whether a sentence-terminator match is a real sentence boundary.

    Args:
        text: The text that was searched.
        match: Regex match object for a candidate terminator inside ``text``.

    Returns:
        A SentenceBoundary with ``term_index``, ``end`` and ``terminator``
        filled in (``start`` keeps its default so the caller can set it), or
        None when the match is not a boundary: the following text continues
        the sentence, or the terminator belongs to an abbreviation or an
        exclamation word.
    """
    match_start_index, match_end_index = match.span()
    end = match_end_index

    tail = text[match_start_index + 1 :]
    head = text[:match_start_index]

    # If next word is numbered reference, expand boundary to that.
    number_ref_match = self.numbered_reference_regex.match(tail)
    if number_ref_match:
        end = match_end_index + len(number_ref_match.group(0))

    # Next character is number or lower-case: not a sentence boundary
    if self.continue_in_next_word(tail) and not number_ref_match:
        return None
    if self.is_abbreviation(head, tail, match.group(0)):
        return None
    if self.is_exclamation_word(head, tail):
        return None

    # Absorb whitespace that follows the boundary. Probe at ``end`` rather
    # than at the start of ``tail``, so whitespace after a numbered
    # reference is attached to this sentence as well.
    continuing_white_spaces = re.match(r"\s+", text[end:])
    if continuing_white_spaces:
        end += len(continuing_white_spaces.group(0))

    return SentenceBoundary(
        term_index=match_start_index,
        end=end,
        terminator=match.group(0),
    )

def continue_in_next_word(self, text_after_boundary) -> bool:
    """Return True when the text after a terminator continues the sentence.

    The sentence is taken to continue when ``text_after_boundary`` starts
    with an ASCII digit or an ASCII lower-case letter.
    """
    # Convert the match object to a real bool so the result agrees with the
    # declared return type; truthiness for existing callers is unchanged.
    return re.match(r"^[0-9a-z]", text_after_boundary) is not None
Expand All @@ -126,6 +152,63 @@ def get_skippable_ranges(self, text) -> Tuple[int, int]:
skippable_ranges += [match.span() for match in self.email_regex.finditer(text)]
return skippable_ranges

def get_boundaries(self, text: str) -> List[SentenceBoundary]:
    """Get sentence boundaries in the given input text.

    Args:
        text (str): The input text to be segmented into sentences.

    Returns:
        List[SentenceBoundary]: The accepted boundaries in text order. Each
        boundary's ``start`` is the ``end`` of the previously accepted
        boundary (0 for the first). If text remains after the last accepted
        boundary, a final entry covering it up to ``len(text)`` is appended
        with default ``term_index`` and ``terminator``.
    """
    # FIXME Need chunking here. For example, work at paragraphs. Otherwise
    # regexes need to work on huge text.

    boundaries = []
    skippable_ranges = self.get_skippable_ranges(text)
    text_length = len(text)

    prev_end = 0
    for match in self.sentence_break_regex.finditer(text):
        boundary = self.findBoundary(text, match)

        # Not a real sentence boundary: try the next candidate.
        if boundary is None:
            continue
        boundary.start = prev_end

        # Check if the boundary is inside a skippable range (quote,
        # parentheses, or email). Only the first containing range counts.
        in_range = False
        for qstart, qend in skippable_ranges:
            if qstart < boundary.end < qend:
                if boundary.end + 1 == qend and self.is_punctuation_between_quotes():
                    # The boundary sits right before the end of the range:
                    # stretch it to the end of the range instead of dropping it.
                    boundary.end = qend
                    # ``qend`` is an exclusive offset, so it equals
                    # ``len(text)`` when the range closes the text; guard the
                    # lookup to avoid an IndexError in that case.
                    # NOTE(review): ``close`` is not a declared field of
                    # SentenceBoundary and holds the character *after* the
                    # range - confirm this is the intended value.
                    boundary.close = text[qend] if qend < text_length else ""
                else:
                    in_range = True
                break

        # Boundaries inside a skippable range do not end a sentence.
        if in_range:
            continue

        boundaries.append(boundary)
        prev_end = boundary.end

    # Whatever follows the last accepted boundary forms the final sentence.
    if prev_end != text_length:
        boundaries.append(SentenceBoundary(start=prev_end, end=text_length))

    return boundaries

def segment(self, text: str) -> Iterator[str]:
    """Split the given input text into sentences.

    Args:
        text (str): The input text to be segmented into sentences.

    Yields:
        str: For every boundary returned by ``get_boundaries``, the slice
        ``text[boundary.start:boundary.end]`` with leading and trailing
        whitespace stripped.
    """
    # The boundaries are computed over the whole text; nothing is written to
    # stdout (a leftover debug print of the boundary list was removed).
    for boundary in self.get_boundaries(text):
        yield text[boundary.start : boundary.end].strip()

def is_punctuation_between_quotes(self) -> bool:
    """Tell ``get_boundaries`` how to treat a boundary just inside a skippable range.

    ``get_boundaries`` calls this when a boundary ends one character before
    the end of a skippable range: a True result moves the boundary to the end
    of that range, a False result discards the boundary. This implementation
    always answers False.
    """
    return False

0 comments on commit 069127f

Please sign in to comment.