Skip to content

Commit

Permalink
tweaked scoring algo
Browse files Browse the repository at this point in the history
  • Loading branch information
0x4f53 committed Nov 13, 2023
1 parent c78eca7 commit ecd46f8
Showing 1 changed file with 15 additions and 25 deletions.
40 changes: 15 additions & 25 deletions text_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,16 +36,7 @@ def string_tokenizer(text):

return final_word_list

def similarity(stringA, stringB):
    """Return the case-insensitive similarity of two strings as a whole percent.

    Both inputs are lower-cased, compared with ``difflib.SequenceMatcher``,
    and the resulting ratio (0.0 - 1.0) is scaled to 0 - 100 and rounded
    down.

    Args:
        stringA: First string to compare.
        stringB: Second string to compare.

    Returns:
        int: Similarity percentage in the range 0..100 (100 means the
        strings are identical ignoring case).
    """
    matcher = difflib.SequenceMatcher(a=stringA.lower(), b=stringB.lower())
    return math.floor(matcher.ratio() * 100)
def similarity(a, b):
    """Return how alike *a* and *b* are, as a percentage.

    The score is ``difflib.SequenceMatcher.ratio()`` multiplied by 100, so
    it is a float between 0 and 100. No case folding or rounding is done
    here; callers normalise their inputs if they need that.
    """
    matcher = difflib.SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio() * 100

def get_regexes():
with open('definitions.json') as json_file:
Expand All @@ -66,9 +57,7 @@ def phone_pii(text, rules):
return phone_numbers

def id_card_numbers_pii(text, rules):

results = []

# Clear all non-regional regexes
regional_regexes = {}
for key in rules.keys():
Expand Down Expand Up @@ -119,28 +108,29 @@ def regional_pii(text):
for entity in named_entities:
if isinstance(entity, nltk.tree.Tree):
if entity.label() in ['GPE', 'GSP', 'LOCATION', 'FACILITY']:
location_name = ' '.join([word for word, tag in entity.leaves() if word.lower() not in stop_words])
location_name = ' '.join([word for word, tag in entity.leaves() if word.lower() not in stop_words and len(word) > 2])
locations.append(location_name)

return list(set(locations))

def keywords_classify_pii(rules, intelligible_text_list):
    """Score every rule by how many text words fuzzily match its keywords.

    Args:
        rules: Mapping of rule name to a rule dict. A rule's optional
            ``"keywords"`` entry is a list of keyword strings; a missing,
            empty or ``None`` entry gives that rule a score of 0.
        intelligible_text_list: Iterable of word strings taken from the
            text being classified.

    Returns:
        dict: Rule name -> number of (text word, keyword) pairs whose
        ``similarity`` score is strictly greater than 80.
    """
    # Characters removed from text words before comparison.
    strip_punctuation = str.maketrans("", "", ".'-_,")

    # Normalise each text word once, not once per keyword of every rule.
    normalised_words = [
        word.lower().translate(strip_punctuation)
        for word in intelligible_text_list
    ]

    scores = {}
    for key, rule in rules.items():
        # `or ()` covers both an absent "keywords" entry and an explicit None.
        keywords = [keyword.lower() for keyword in rule.get("keywords") or ()]
        scores[key] = sum(
            1
            for word in normalised_words
            for keyword in keywords
            if similarity(word, keyword) > 80
        )

    return scores

0 comments on commit ecd46f8

Please sign in to comment.