From d73fe9aea5285efd43fc5c3a204f1648719382ee Mon Sep 17 00:00:00 2001 From: doggo4242 Date: Thu, 1 Jul 2021 11:04:03 -0400 Subject: [PATCH 1/6] added ascii filter, minor code cleanup --- confusables/__init__.py | 15 +++++++-------- confusables/utils.py | 5 ----- 2 files changed, 7 insertions(+), 13 deletions(-) delete mode 100644 confusables/utils.py diff --git a/confusables/__init__.py b/confusables/__init__.py index d87d8bb..9649e09 100644 --- a/confusables/__init__.py +++ b/confusables/__init__.py @@ -4,12 +4,11 @@ from itertools import product from .config import CONFUSABLE_MAPPING_PATH, NON_NORMAL_ASCII_CHARS -from .utils import is_ascii # read confusable mappings from file, build 2-way map of the pairs with open(os.path.join(os.path.dirname(__file__), CONFUSABLE_MAPPING_PATH), "r") as mappings: - CONFUSABLE_MAP = json.loads(mappings.readline()) + CONFUSABLE_MAP = json.load(mappings) def is_confusable(str1, str2): @@ -51,21 +50,21 @@ def confusable_regex(string, include_character_padding=False): return regex -def normalize(string, prioritize_alpha=False): - normal_forms = set([""]) +def normalize(string, prioritize_alpha=False,filter_all_ascii=False): + normal_forms = {""} for char in string: normalized_chars = [] confusable_chars = confusable_characters(char) - if not is_ascii(char) or not char.isalpha(): + if not char.isascii() or not (char.isalnum() or filter_all_ascii): for confusable in confusable_chars: if prioritize_alpha: - if ((char.isalpha() and confusable.isalpha() and is_ascii(confusable)) or (not char.isalpha() and is_ascii(confusable))) and confusable not in NON_NORMAL_ASCII_CHARS: + if ((char.isalnum() and confusable.isalnum() and confusable.isascii()) or (not char.isalnum() and confusable.isascii())) and confusable not in NON_NORMAL_ASCII_CHARS: normal = confusable if len(confusable) > 1: normal = normalize(confusable)[0] normalized_chars.append(normal) else: - if is_ascii(confusable) and confusable not in NON_NORMAL_ASCII_CHARS: + if confusable.isascii() and confusable not in NON_NORMAL_ASCII_CHARS: normal = confusable if len(confusable) > 1: normal = normalize(confusable)[0] @@ -75,5 +74,5 @@ def normalize(string, prioritize_alpha=False): if len(normalized_chars) == 0: normalized_chars = [char] - normal_forms = set([x[0]+x[1].lower() for x in list(product(normal_forms, normalized_chars))]) + normal_forms = {x[0]+x[1].lower() for x in list(product(normal_forms, normalized_chars))} return sorted(list(normal_forms)) diff --git a/confusables/utils.py b/confusables/utils.py deleted file mode 100644 index 398a70a..0000000 --- a/confusables/utils.py +++ /dev/null @@ -1,5 +0,0 @@ -def is_ascii(string): - for char in string: - if ord(char) >= 128: - return False - return True From 07d999db8616e013bb0814e183aa75166ec47bda Mon Sep 17 00:00:00 2001 From: doggo4242 Date: Thu, 1 Jul 2021 16:20:13 -0400 Subject: [PATCH 2/6] minor changes --- confusables/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/confusables/__init__.py b/confusables/__init__.py index 9649e09..c1eff6f 100644 --- a/confusables/__init__.py +++ b/confusables/__init__.py @@ -50,12 +50,12 @@ def confusable_regex(string, include_character_padding=False): return regex -def normalize(string, prioritize_alpha=False,filter_all_ascii=False): +def normalize(string, prioritize_alpha=False,ignore_all_ascii=False): normal_forms = {""} for char in string: normalized_chars = [] confusable_chars = confusable_characters(char) - if not char.isascii() or not (char.isalnum() or filter_all_ascii): + if not char.isascii() or not (char.isalnum() or ignore_all_ascii): for confusable in confusable_chars: if prioritize_alpha: if ((char.isalnum() and confusable.isalnum() and confusable.isascii()) or (not char.isalnum() and confusable.isascii())) and confusable not in NON_NORMAL_ASCII_CHARS: From f998905bdcc171803418a6e9567cb3c3774e4b86 Mon Sep 17 00:00:00 2001 From: doggo4242 Date: Mon, 5 Jul 2021 17:50:20 -0400 Subject: [PATCH 3/6] simplified conditional --- confusables/__init__.py | 4 ++-- confusables/parse.py | 39 ++++++++++++++++++--------------------- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/confusables/__init__.py b/confusables/__init__.py index c1eff6f..a4b9eb8 100644 --- a/confusables/__init__.py +++ b/confusables/__init__.py @@ -50,12 +50,12 @@ def confusable_regex(string, include_character_padding=False): return regex -def normalize(string, prioritize_alpha=False,ignore_all_ascii=False): +def normalize(string, prioritize_alpha=False): normal_forms = {""} for char in string: normalized_chars = [] confusable_chars = confusable_characters(char) - if not char.isascii() or not (char.isalnum() or ignore_all_ascii): + if not char.isascii(): for confusable in confusable_chars: if prioritize_alpha: if ((char.isalnum() and confusable.isalnum() and confusable.isascii()) or (not char.isalnum() and confusable.isascii())) and confusable not in NON_NORMAL_ASCII_CHARS: diff --git a/confusables/parse.py b/confusables/parse.py index 64ec50d..3a7dda6 100644 --- a/confusables/parse.py +++ b/confusables/parse.py @@ -13,7 +13,7 @@ def _get_accented_characters(char): def _get_confusable_chars(character, unicode_confusable_map, depth): mapped_chars = unicode_confusable_map[character] - group = set([character]) + group = {character} if depth <= MAX_SIMILARITY_DEPTH: for mapped_char in mapped_chars: group.update(_get_confusable_chars(mapped_char, unicode_confusable_map, depth + 1)) @@ -40,30 +40,28 @@ def parse_new_mapping_file(): if unicode_confusable_map.get(str1): unicode_confusable_map[str1].add(str2) else: - unicode_confusable_map[str1] = set([str2]) + unicode_confusable_map[str1] = {str2} if unicode_confusable_map.get(str2): unicode_confusable_map[str2].add(str1) else: - unicode_confusable_map[str2] = set([str1]) + unicode_confusable_map[str2] = {str1} if len(str1) == 1: case_change = str1.lower() if str1.isupper() else str1.upper() - if case_change != str1: - unicode_confusable_map[str1].add(case_change) - if unicode_confusable_map.get(case_change) is not None: - unicode_confusable_map[case_change].add(str1) - else: - unicode_confusable_map[case_change] = set([str1]) + unicode_confusable_map[str1].add(case_change) + if unicode_confusable_map.get(case_change) is not None: + unicode_confusable_map[case_change].add(str1) + else: + unicode_confusable_map[case_change] = {str1} if len(str2) == 1: case_change = str2.lower() if str2.isupper() else str2.upper() - if case_change != str2: - unicode_confusable_map[str2].add(case_change) - if unicode_confusable_map.get(case_change) is not None: - unicode_confusable_map[case_change].add(str2) - else: - unicode_confusable_map[case_change] = set([str2]) + unicode_confusable_map[str2].add(case_change) + if unicode_confusable_map.get(case_change) is not None: + unicode_confusable_map[case_change].add(str2) + else: + unicode_confusable_map[case_change] = {str2} for char in string.ascii_lowercase: accented = _get_accented_characters(char) @@ -72,7 +70,7 @@ def parse_new_mapping_file(): if unicode_confusable_map.get(accent): unicode_confusable_map[accent].add(char) else: - unicode_confusable_map[accent] = set([char]) + unicode_confusable_map[accent] = {char} for char in string.ascii_uppercase: accented = _get_accented_characters(char) @@ -81,17 +79,16 @@ def parse_new_mapping_file(): if unicode_confusable_map.get(accent): unicode_confusable_map[accent].add(char) else: - unicode_confusable_map[accent] = set([char]) + unicode_confusable_map[accent] = {char} CONFUSABLE_MAP = {} - characters_to_map = list(unicode_confusable_map.keys()) - for character in list(unicode_confusable_map.keys()): + for character in unicode_confusable_map.keys(): char_group = _get_confusable_chars(character, unicode_confusable_map, 0) CONFUSABLE_MAP[character] = list(char_group) mapping_file = open(os.path.join(os.path.dirname(__file__), CONFUSABLE_MAPPING_PATH), "w") - mapping_file.write(json.dumps(CONFUSABLE_MAP)) + json.dump(CONFUSABLE_MAP,mapping_file) mapping_file.close() -parse_new_mapping_file() \ No newline at end of file +parse_new_mapping_file() From befb6cff96f91328e899402f190ea6687861fb8a Mon Sep 17 00:00:00 2001 From: doggo4242 Date: Mon, 5 Jul 2021 18:03:13 -0400 Subject: [PATCH 4/6] fix condition --- confusables/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/confusables/__init__.py b/confusables/__init__.py index a4b9eb8..8dda894 100644 --- a/confusables/__init__.py +++ b/confusables/__init__.py @@ -58,7 +58,7 @@ def normalize(string, prioritize_alpha=False): if not char.isascii(): for confusable in confusable_chars: if prioritize_alpha: - if ((char.isalnum() and confusable.isalnum() and confusable.isascii()) or (not char.isalnum() and confusable.isascii())) and confusable not in NON_NORMAL_ASCII_CHARS: + if ((char.isalpha() and confusable.isalpha() and confusable.isascii()) or (not char.isalpha() and confusable.isascii())) and confusable not in NON_NORMAL_ASCII_CHARS: normal = confusable if len(confusable) > 1: normal = normalize(confusable)[0] From c72587257bbdd0cb878b7087fa0a56eb899ce691 Mon Sep 17 00:00:00 2001 From: doggo4242 Date: Mon, 5 Jul 2021 18:08:16 -0400 Subject: [PATCH 5/6] fix condition --- confusables/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/confusables/__init__.py b/confusables/__init__.py index 8dda894..cd6628d 100644 --- a/confusables/__init__.py +++ b/confusables/__init__.py @@ -55,7 +55,7 @@ def normalize(string, prioritize_alpha=False): for char in string: normalized_chars = [] confusable_chars = confusable_characters(char) - if not char.isascii(): + if not (char.isascii() and char.isalnum()): for confusable in confusable_chars: if prioritize_alpha: if ((char.isalpha() and confusable.isalpha() and confusable.isascii()) or (not char.isalpha() and confusable.isascii())) and confusable not in NON_NORMAL_ASCII_CHARS: From c1e4ea7a5ad6a3ef0dfdc754f77bcb1276fbc65c Mon Sep 17 00:00:00 2001 From: doggo4242 Date: Mon, 5 Jul 2021 18:47:09 -0400 Subject: [PATCH 6/6] simplified expression --- confusables/parse.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/confusables/parse.py b/confusables/parse.py index 3a7dda6..e9ff4c3 100644 --- a/confusables/parse.py +++ b/confusables/parse.py @@ -48,7 +48,7 @@ def parse_new_mapping_file(): unicode_confusable_map[str2] = {str1} if len(str1) == 1: - case_change = str1.lower() if str1.isupper() else str1.upper() + case_change = str1.swapcase() unicode_confusable_map[str1].add(case_change) if unicode_confusable_map.get(case_change) is not None: unicode_confusable_map[case_change].add(str1) @@ -56,7 +56,7 @@ def parse_new_mapping_file(): unicode_confusable_map[case_change] = {str1} if len(str2) == 1: - case_change = str2.lower() if str2.isupper() else str2.upper() + case_change = str2.swapcase() unicode_confusable_map[str2].add(case_change) if unicode_confusable_map.get(case_change) is not None: unicode_confusable_map[case_change].add(str2)