Skip to content

Commit

Permalink
Create script to make character regexes
Browse files Browse the repository at this point in the history
  • Loading branch information
eliasdabbas committed Mar 30, 2024
1 parent 9c1e0e5 commit 57f6867
Showing 1 changed file with 59 additions and 0 deletions.
59 changes: 59 additions & 0 deletions advertools/_regex_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import re
from unicodedata import name

# Character lists used to build character-class regexes.
#
# The scan below fills some of these lists with every Unicode character whose
# official name (as reported by ``unicodedata.name``) contains a given keyword,
# minus a few look-alike names that are excluded explicitly. Lists that the
# scan does not touch (CURRENCY_RAW, HASHTAG, HASHTAG_RAW, MENTION,
# MENTION_RAW, SENTENCE_END, URL, URL_RAW, WORD_DELIM) are left empty here.
APOSTROPHE = []
BRACKET = []
COLON = []
COMMA = []
CURRENCY = []
CURRENCY_RAW = []
EXCLAMATION_MARK_RAW = []
FULL_STOP = []
HASHTAG = []
HASHTAG_RAW = []
MENTION = []
MENTION_RAW = []
PAREN = []
QUESTION_MARK_RAW = []
QUOTE = []
SENTENCE_END = []
URL = []
URL_RAW = []
WORD_DELIM = []

# ``chr`` only accepts 0..0x10FFFF; scanning further just raises ValueError.
_CODEPOINT_LIMIT = 0x110000

# Characters whose name mentions APOSTROPHE but that are not apostrophes:
# U+0149 LATIN SMALL LETTER N PRECEDED BY APOSTROPHE and U+E0027 TAG APOSTROPHE.
# Written as escapes so that Unicode normalization of this file cannot turn
# U+0149 into a two-character sequence that would never match a single char.
_NOT_APOSTROPHE = ("\u0149", "\U000e0027")

for i in range(_CODEPOINT_LIMIT):
    char = chr(i)
    # Unassigned code points, surrogates, controls, etc. have no name; the
    # empty default lets us skip them without relying on exception handling.
    char_name = name(char, "")
    if not char_name:
        continue

    if "APOSTROPHE" in char_name and char not in _NOT_APOSTROPHE:
        APOSTROPHE.append(char)
    if (
        "BRACKET" in char_name
        and "IDEOGRAPH" not in char_name
        and "TORTOISE SHELL BRACKETED LATIN CAPITAL LETTER S" not in char_name
    ):
        BRACKET.append(char)
    # 8353 is U+20A1 COLON SIGN (₡), a currency symbol rather than a colon.
    if "COLON" in char_name and i != 8353:
        COLON.append(char)
    # Skip letters that merely carry a comma diacritic and "DIGIT ... COMMA".
    if "COMMA" in char_name and not char_name.startswith(
        ("LATIN SMALL LETTER", "LATIN CAPITAL LETTER", "DIGIT")
    ):
        COMMA.append(char)
    if "EXCLAMATION" in char_name:
        EXCLAMATION_MARK_RAW.append(char)
    # Skip enumerators such as "DIGIT ONE FULL STOP" / "NUMBER TEN FULL STOP".
    if "FULL STOP" in char_name and not char_name.startswith(("DIGIT", "NUMBER")):
        FULL_STOP.append(char)
    # "QUOT" covers both QUOTE and QUOTATION; the Yi syllable is a letter.
    if "QUOT" in char_name and char_name != "YI SYLLABLE QUOT":
        QUOTE.append(char)
    if "CURRENC" in char_name:
        CURRENCY.append(char)
    # "PAREN" matches PARENTHESIS; names starting with PARENTHESIZED are
    # enclosed letters/digits, not parentheses themselves.
    if "PAREN" in char_name and not char_name.startswith("PARENTHESIZED"):
        PAREN.append(char)
    if "QUESTION" in char_name and "IDEOGRAPH" not in char_name:
        QUESTION_MARK_RAW.append(char)

0 comments on commit 57f6867

Please sign in to comment.