Skip to content

Commit

Permalink
Reduce some calls to re.sub (#50)
Browse files Browse the repository at this point in the history
Reduce some calls to re.sub
  • Loading branch information
nipunsadvilkar authored Nov 13, 2019
2 parents 16e8683 + 1ba5f71 commit f7c640f
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 25 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,7 @@
# v0.2.2

- 🐛 Fix unbalanced parenthesis - \#47

# v0.2.3

- 🐛 Performance improvement in `abbreviation_replacer` - \#50
21 changes: 21 additions & 0 deletions examples/test_timing_script.py

Large diffs are not rendered by default.

71 changes: 47 additions & 24 deletions pysbd/abbreviation_replacer.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,63 @@
# -*- coding: utf-8 -*-
import re
from pysbd.utils import Text

# TODO: SENTENCE_STARTERS should be lang specific
from pysbd.lang.standard import Abbreviation, SENTENCE_STARTERS
from pysbd.lang.common.numbers import (Common, SingleLetterAbbreviationRules,
AmPmRules)
from pysbd.lang.common.numbers import Common, SingleLetterAbbreviationRules, AmPmRules


def replace_pre_number_abbr(txt, abbr):
    """Protect the period of an abbreviation that introduces a number.

    The "." directly after ``abbr`` is replaced with "∯" when the
    abbreviation is preceded by whitespace (or starts the text) and is
    followed either by one whitespace character and a digit ("No. 5") or
    by whitespace and an opening parenthesis ("p. (12)").

    Args:
        txt: Text to scan.
        abbr: Abbreviation as it appears in the text; surrounding
            whitespace is stripped before use.

    Returns:
        ``txt`` with the matching periods replaced by "∯".
    """
    # The abbreviation is literal text: escape it so characters such as "."
    # in "a.b" are not interpreted as regex wildcards.
    escaped = re.escape(abbr.strip())
    pattern = r"(?<=\s{abbr})\.(?=(\s\d|\s+\())".format(abbr=escaped)
    # Prepending a space lets the single "\s" look-behind also cover an
    # abbreviation at the very start of the text; the pattern only ever
    # rewrites "." characters, so the first character is still that space
    # and can simply be sliced off again.
    return re.sub(pattern, "∯", " " + txt)[1:]


def replace_prepositive_abbr(txt, abbr):
    """Protect the period of an abbreviation that precedes another word.

    The "." directly after ``abbr`` is replaced with "∯" when the
    abbreviation is preceded by whitespace (or starts the text) and is
    followed either by a whitespace character ("Mr. Smith") or by a colon
    and digits ("Gen.:12").

    Args:
        txt: Text to scan.
        abbr: Abbreviation as it appears in the text; surrounding
            whitespace is stripped before use.

    Returns:
        ``txt`` with the matching periods replaced by "∯".
    """
    # The abbreviation is literal text: escape it so regex metacharacters
    # inside it are matched verbatim.
    escaped = re.escape(abbr.strip())
    pattern = r"(?<=\s{abbr})\.(?=(\s|:\d+))".format(abbr=escaped)
    # A temporary leading space makes the "\s" look-behind work for an
    # abbreviation at the start of the text; it is removed again afterwards.
    return re.sub(pattern, "∯", " " + txt)[1:]


def replace_period_of_abbr(txt, abbr):
    """Protect the period of an abbreviation that does not end a sentence.

    The "." directly after ``abbr`` is replaced with "∯" when the
    abbreviation is preceded by whitespace (or starts the text) and the
    period is followed by one of:

    * another ".", ":", "-", "?" or ",";
    * a whitespace character and then a lowercase ASCII letter, a digit,
      "(", "I'm", "I'll", or "I" plus whitespace.

    Args:
        txt: Text to scan.
        abbr: Abbreviation as it appears in the text; surrounding
            whitespace is stripped before use.

    Returns:
        ``txt`` with the matching periods replaced by "∯".
    """
    # The abbreviation is literal text: escape it so "e.g" does not also
    # match unrelated strings such as "exg".
    escaped = re.escape(abbr.strip())
    pattern = (
        r"(?<=\s{abbr})\.(?=((\.|\:|-|\?|,)|(\s([a-z]|I\s|I'm|I'll|\d|\())))"
    ).format(abbr=escaped)
    # A temporary leading space makes the "\s" look-behind work for an
    # abbreviation at the start of the text; it is removed again afterwards.
    return re.sub(pattern, "∯", " " + txt)[1:]


def replace_abbreviation_as_sentence_boundary(txt):
    """Restore the period after selected abbreviations before a sentence starter.

    For the abbreviations hard-coded in the pattern below, a trailing "∯"
    is turned back into "." when the abbreviation is followed by
    whitespace, one of the words in ``SENTENCE_STARTERS``, and whitespace
    again.

    Args:
        txt: Text in which abbreviation periods were replaced by "∯".

    Returns:
        ``txt`` with the matching "∯" characters turned back into ".".
    """
    # With no starters the alternation would be empty and the look-ahead
    # would no longer require a starter word, so leave the text untouched.
    if not SENTENCE_STARTERS:
        return txt
    # Escape each starter so it is matched literally, then test all of them
    # in a single look-ahead instead of running one substitution per word.
    starters = "|".join(re.escape(word) for word in SENTENCE_STARTERS)
    # NOTE(review): the dots in "i.v" / "I.V" are unescaped and match any
    # character; presumably this is relied on to match "i∯v" too — confirm
    # before tightening.
    regex = (
        r"(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯(?=\s(?:{})\s)"
    ).format(starters)
    return re.sub(regex, "\\1.", txt)


class AbbreviationReplacer(object):

def __init__(self, text, language="en"):
    """Store the text to process and its language identifier.

    Args:
        text: Text in which abbreviations will be replaced.
        language: Language identifier; defaults to "en".
    """
    self.text = text
    self.language = language

def replace(self):
self.text = Text(self.text).apply(Common.PossessiveAbbreviationRule,
Common.KommanditgesellschaftRule,
*SingleLetterAbbreviationRules.All)
self.text = Text(self.text).apply(
Common.PossessiveAbbreviationRule,
Common.KommanditgesellschaftRule,
*SingleLetterAbbreviationRules.All
)
self.text = self.search_for_abbreviations_in_string()
self.replace_multi_period_abbreviations()
self.text = Text(self.text).apply(*AmPmRules.All)
Expand All @@ -52,9 +67,15 @@ def replace(self):
def replace_multi_period_abbreviations(self):
    """Replace the periods inside multi-period abbreviations with "∯".

    Every case-insensitive match of
    ``Common.MULTI_PERIOD_ABBREVIATION_REGEX`` in ``self.text`` has each
    "." swapped for "∯"; the result is stored back on ``self.text``.
    """
    def mpa_replace(match):
        # A literal one-character swap needs no regex.
        return match.group().replace(".", "∯")

    self.text = re.sub(
        Common.MULTI_PERIOD_ABBREVIATION_REGEX,
        mpa_replace,
        self.text,
        flags=re.IGNORECASE,
    )

def search_for_abbreviations_in_string(self):
original = self.text
Expand All @@ -64,25 +85,27 @@ def search_for_abbreviations_in_string(self):
if stripped not in lowered:
continue
abbrev_match = re.findall(
r'(?:^|\s|\r|\n){}'.format(stripped), original,
flags=re.IGNORECASE)
r"(?:^|\s|\r|\n){}".format(stripped), original, flags=re.IGNORECASE
)
if not abbrev_match:
continue
next_word_start = r"(?<={" + str(re.escape(stripped)) + "} ).{1}"
char_array = re.findall(next_word_start, self.text)
for ind, match in enumerate(abbrev_match):
self.text = self.scan_for_replacements(self.text, match, ind, char_array)
self.text = self.scan_for_replacements(
self.text, match, ind, char_array
)
return self.text

def scan_for_replacements(self, txt, am, ind, char_array):
try:
char = char_array[ind]
except IndexError:
char = ''
char = ""
prepositive = Abbreviation.PREPOSITIVE_ABBREVIATIONS
number_abbr = Abbreviation.NUMBER_ABBREVIATIONS
upper = str(char).isupper()
if (not upper or am.strip().lower() in prepositive):
if not upper or am.strip().lower() in prepositive:
if am.strip().lower() in prepositive:
txt = replace_prepositive_abbr(txt, am)
elif am.strip().lower() in number_abbr:
Expand Down
2 changes: 1 addition & 1 deletion pysbd/about.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/

__title__ = "pysbd"
__version__ = "0.2.2"
__version__ = "0.2.3"
__summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages."
__uri__ = "http://nipunsadvilkar.github.io/"
__author__ = "Nipun Sadvilkar"
Expand Down

0 comments on commit f7c640f

Please sign in to comment.