Reduce some calls to re.sub (#50)

Reduce some calls to re.sub
nipunsadvilkar · Nov 13, 2019 · f7c640f · f7c640f
2 parents 16e8683 + 1ba5f71
commit f7c640f
Show file tree

Hide file tree

Showing 4 changed files with 73 additions and 25 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -40,3 +40,7 @@
 # v0.2.2
 
 -   🐛 Fix unbalanced parenthesis - \#47
+
+# v0.2.3
+
+-   🐛 Performance improvement in `abbreviation_replacer` - \#50
diff --git a/examples/test_timing_script.py b/examples/test_timing_script.py
diff --git a/pysbd/abbreviation_replacer.py b/pysbd/abbreviation_replacer.py
@@ -1,48 +1,63 @@
 # -*- coding: utf-8 -*-
 import re
 from pysbd.utils import Text
+
 # TODO: SENTENCE_STARTERS should be lang specific
 from pysbd.lang.standard import Abbreviation, SENTENCE_STARTERS
-from pysbd.lang.common.numbers import (Common, SingleLetterAbbreviationRules,
-                                       AmPmRules)
+from pysbd.lang.common.numbers import Common, SingleLetterAbbreviationRules, AmPmRules
 
 
 def replace_pre_number_abbr(txt, abbr):
-    txt = re.sub(r'(?<=\s{abbr})\.(?=\s\d)|(?<=^{abbr})\.(?=\s\d)'.format(abbr=abbr.strip()), "∯", txt)
-    txt = re.sub(r'(?<=\s{abbr})\.(?=\s+\()|(?<=^{abbr})\.(?=\s+\()'.format(abbr=abbr.strip()), "∯", txt)
+    # prepend a space to avoid needing another regex for start of string
+    txt = " " + txt
+    txt = re.sub(r"(?<=\s{abbr})\.(?=(\s\d|\s+\())".format(abbr=abbr.strip()), "∯", txt)
+    # remove the prepended space
+    txt = txt[1:]
     return txt
 
 
 def replace_prepositive_abbr(txt, abbr):
-    txt = re.sub(r'(?<=\s{abbr})\.(?=\s)|(?<=^{abbr})\.(?=\s)'.format(abbr=abbr.strip()), "∯", txt)
-    txt = re.sub(r'(?<=\s{abbr})\.(?=:\d+)|(?<=^{abbr})\.(?=:\d+)'.format(abbr=abbr.strip()), "∯", txt)
+    # prepend a space to avoid needing another regex for start of string
+    txt = " " + txt
+    txt = re.sub(r"(?<=\s{abbr})\.(?=(\s|:\d+))".format(abbr=abbr.strip()), "∯", txt)
+    # remove the prepended space
+    txt = txt[1:]
     return txt
 
 
 def replace_period_of_abbr(txt, abbr):
-    txt = re.sub(r"(?<=\s{abbr})\.(?=((\.|\:|-|\?)|(\s([a-z]|I\s|I'm|I'll|\d|\())))|(?<=^{abbr})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))".format(abbr=abbr.strip()), '∯', txt)
-    txt = re.sub(r"(?<=\s{abbr})\.(?=,)|(?<=^{abbr})\.(?=,)".format(abbr=abbr.strip()), '∯', txt)
+    # prepend a space to avoid needing another regex for start of string
+    txt = " " + txt
+    txt = re.sub(
+        r"(?<=\s{abbr})\.(?=((\.|\:|-|\?|,)|(\s([a-z]|I\s|I'm|I'll|\d|\())))".format(
+            abbr=abbr.strip()
+        ),
+        "∯",
+        txt,
+    )
+    # remove the prepended space
+    txt = txt[1:]
     return txt
 
 
 def replace_abbreviation_as_sentence_boundary(txt):
-    for word in SENTENCE_STARTERS:
-        escaped = re.escape(word)
-        regex = r"(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯(?=\s{}\s)".format(escaped)
-        txt = re.sub(regex, '\\1.', txt)
+    sent_starters = "|".join((r"(?=\s{}\s)".format(word) for word in SENTENCE_STARTERS))
+    regex = r"(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯({})".format(sent_starters)
+    txt = re.sub(regex, '\\1.', txt)
     return txt
 
 
 class AbbreviationReplacer(object):
-
-    def __init__(self, text, language='en'):
+    def __init__(self, text, language="en"):
         self.text = text
         self.language = language
 
     def replace(self):
-        self.text = Text(self.text).apply(Common.PossessiveAbbreviationRule,
-                                          Common.KommanditgesellschaftRule,
-                                          *SingleLetterAbbreviationRules.All)
+        self.text = Text(self.text).apply(
+            Common.PossessiveAbbreviationRule,
+            Common.KommanditgesellschaftRule,
+            *SingleLetterAbbreviationRules.All
+        )
         self.text = self.search_for_abbreviations_in_string()
         self.replace_multi_period_abbreviations()
         self.text = Text(self.text).apply(*AmPmRules.All)
@@ -52,9 +67,15 @@ def replace(self):
     def replace_multi_period_abbreviations(self):
         def mpa_replace(match):
             match = match.group()
-            match = re.sub(re.escape(r'.'), '∯', match)
+            match = re.sub(re.escape(r"."), "∯", match)
             return match
-        self.text = re.sub(Common.MULTI_PERIOD_ABBREVIATION_REGEX, mpa_replace, self.text, flags=re.IGNORECASE)
+
+        self.text = re.sub(
+            Common.MULTI_PERIOD_ABBREVIATION_REGEX,
+            mpa_replace,
+            self.text,
+            flags=re.IGNORECASE,
+        )
 
     def search_for_abbreviations_in_string(self):
         original = self.text
@@ -64,25 +85,27 @@ def search_for_abbreviations_in_string(self):
             if stripped not in lowered:
                 continue
             abbrev_match = re.findall(
-                r'(?:^|\s|\r|\n){}'.format(stripped), original,
-                flags=re.IGNORECASE)
+                r"(?:^|\s|\r|\n){}".format(stripped), original, flags=re.IGNORECASE
+            )
             if not abbrev_match:
                 continue
             next_word_start = r"(?<={" + str(re.escape(stripped)) + "} ).{1}"
             char_array = re.findall(next_word_start, self.text)
             for ind, match in enumerate(abbrev_match):
-                self.text = self.scan_for_replacements(self.text, match, ind, char_array)
+                self.text = self.scan_for_replacements(
+                    self.text, match, ind, char_array
+                )
         return self.text
 
     def scan_for_replacements(self, txt, am, ind, char_array):
         try:
             char = char_array[ind]
         except IndexError:
-            char = ''
+            char = ""
         prepositive = Abbreviation.PREPOSITIVE_ABBREVIATIONS
         number_abbr = Abbreviation.NUMBER_ABBREVIATIONS
         upper = str(char).isupper()
-        if (not upper or am.strip().lower() in prepositive):
+        if not upper or am.strip().lower() in prepositive:
             if am.strip().lower() in prepositive:
                 txt = replace_prepositive_abbr(txt, am)
             elif am.strip().lower() in number_abbr:

diff --git a/pysbd/about.py b/pysbd/about.py
@@ -2,7 +2,7 @@
 # https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
 
 __title__ = "pysbd"
-__version__ = "0.2.2"
+__version__ = "0.2.3"
 __summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages."
 __uri__ = "http://nipunsadvilkar.github.io/"
 __author__ = "Nipun Sadvilkar"