Skip to content

Commit

Permalink
Showing anonymized data in DB apps (daxa-ai#533)
Browse files Browse the repository at this point in the history
* Showing anonymized data in DB apps

* Updating location changes for fixing local UI issue
  • Loading branch information
dristysrivastava authored Sep 11, 2024
1 parent 6ebdf6e commit a13617a
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 30 deletions.
1 change: 1 addition & 0 deletions pebblo/app/service/loader/loader_doc_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ def _update_doc_details(doc, doc_info):
doc["topics"] = doc_info.topics
doc["entity_details"] = doc_info.entityDetails
doc["topic_details"] = doc_info.topicDetails
doc["doc"] = doc_info.data
logger.debug("Input doc updated with classification result")

@timeit
Expand Down
91 changes: 76 additions & 15 deletions pebblo/entity_classifier/entity_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,9 @@ def custom_analyze(self):
# Add the private key recognizer to the Presidio Analyzer
self.analyzer.registry.add_recognizer(pk_recognizer)

def analyze_response(self, input_text, anonymize_all_entities=True):
def analyze_response(
self, input_text: str, anonymize_all_entities: bool = True
) -> list:
"""
Analyze the given input text to detect and classify entities based on predefined criteria.
Expand Down Expand Up @@ -90,7 +92,9 @@ def analyze_response(self, input_text, anonymize_all_entities=True):
# Return the list of classified entities that met the criteria
return final_results

def anonymize_response(self, analyzer_results, input_text):
def anonymize_response(
self, analyzer_results: list, input_text: str
) -> (list, str):
# Returns anonymized output
anonymized_text = self.anonymizer.anonymize(
text=input_text, analyzer_results=analyzer_results
Expand All @@ -99,33 +103,90 @@ def anonymize_response(self, analyzer_results, input_text):
return anonymized_text.items, anonymized_text.text

@staticmethod
def get_analyzed_entities_response(data, anonymized_response=None):
def _sort_analyzed_data(data: list) -> list:
    """
    Convert analyzer results into dictionaries ordered by start position.

    Each entry in ``data`` is read through its ``entity_type``, ``start``,
    ``end`` and ``score`` attributes. The returned list holds one dictionary
    per entry with those four keys, in ascending order of ``start``. Entries
    that share a start position keep their relative input order.
    """
    # Order the raw entries first, then project each one into a dictionary;
    # sorted() is stable, so ties on "start" stay in input order.
    ordered_entries = sorted(data, key=lambda entry: entry.start)
    return [
        {
            "entity_type": entry.entity_type,
            "start": entry.start,
            "end": entry.end,
            "score": entry.score,
        }
        for entry in ordered_entries
    ]

@staticmethod
def _sort_anonymized_data(data: list) -> list:
    """
    Convert anonymizer results into dictionaries ordered by start position.

    Each entry in ``data`` is read through its ``entity_type``, ``start`` and
    ``end`` attributes. The returned list holds one dictionary per entry with
    those three keys, in ascending order of ``start``. Entries that share a
    start position keep their relative input order.
    """
    # Order the raw entries first, then project each one into a dictionary;
    # sorted() is stable, so ties on "start" stay in input order.
    ordered_entries = sorted(data, key=lambda entry: entry.start)
    return [
        {
            "entity_type": entry.entity_type,
            "start": entry.start,
            "end": entry.end,
        }
        for entry in ordered_entries
    ]

@staticmethod
def update_anonymized_location(
    start: int, end: int, location_count: int
) -> (str, int):
    """
    Build the "<start>_<end>" location string for an anonymized entity.

    In the anonymized text, "<" is replaced with "&lt;" and ">" with "&gt;".
    Each replacement makes the text 3 characters longer, so every anonymized
    entity adds 3 + 3 = 6 characters. ``location_count`` is the shift
    accumulated so far: it is added to both ``start`` and ``end``, and ``end``
    is moved a further 6 characters to cover this entity's own growth.

    Returns the location string together with ``location_count`` increased
    by 6, ready to be passed in for the next entity.
    """
    # Growth caused by one "<...>" pair being rewritten as "&lt;...&gt;".
    growth_per_entity = 6

    shifted_start = start + location_count
    updated_count = location_count + growth_per_entity
    shifted_end = end + updated_count
    return f"{shifted_start}_{shifted_end}", updated_count

def get_analyzed_entities_response(
self, data: list, anonymized_response: list = None
) -> list:
# Returns each entity with its location (start to end) and confidence score

analyzed_data = self._sort_analyzed_data(data)
if anonymized_response:
anonymized_response = self._sort_anonymized_data(anonymized_response)

response = []
for index, value in enumerate(data):
location_count = 0
for index, value in enumerate(analyzed_data):
mapped_entity = None
if value.entity_type in Entities.__members__:
mapped_entity = Entities[value.entity_type].value
elif value.entity_type in SecretEntities.__members__:
mapped_entity = SecretEntities[value.entity_type].value
if value["entity_type"] in Entities.__members__:
mapped_entity = Entities[value["entity_type"]].value
elif value["entity_type"] in SecretEntities.__members__:
mapped_entity = SecretEntities[value["entity_type"]].value

location = f"{value.start}_{value.end}"
location = f"{value['start']}_{value['end']}"
if anonymized_response:
anonymized_data = anonymized_response[len(data) - index - 1]
location = f"{anonymized_data.start}_{anonymized_data.end}"
anonymized_data = anonymized_response[index]
if anonymized_data["entity_type"] == value["entity_type"]:
location, location_count = self.update_anonymized_location(
anonymized_data["start"], anonymized_data["end"], location_count
)
response.append(
{
"entity_type": value.entity_type,
"entity_type": value["entity_type"],
"location": location,
"confidence_score": value.score,
"confidence_score": value["score"],
"entity_group": entity_group_conf_mapping[mapped_entity][1],
}
)
return response

def presidio_entity_classifier_and_anonymizer(
self, input_text, anonymize_snippets=False
):
self, input_text: str, anonymize_snippets: bool = False
) -> (dict, int, str, dict):
"""
Perform classification on the input data and return a dictionary with the count of each entity group.
And also returns plain input text as anonymized text output
Expand Down
17 changes: 2 additions & 15 deletions pebblo/entity_classifier/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,12 @@
from pebblo.utils import get_confidence_score_label


def location_key(item):
"""
To convert the location string into a tuple of integers
"""
loc = item["location"]
return tuple(map(int, loc.split("_")))


def get_entities(entities_list, response):
def get_entities(entities_list: list, response: list) -> (dict, dict, int):
"""
Returns entity groups, its details such as confidence score, location and its group grouped by entity type
"""
entity_groups: dict = dict()
entity_details: dict = dict()
entity_details_response: dict = dict()

mapped_entity = None
total_count = 0
Expand All @@ -53,11 +44,7 @@ def get_entities(entities_list, response):
entity_details[mapped_entity] = [entity_data]
total_count += 1

for entity, entity_data in entity_details.items():
# Sorting entity details based on location in ascending order
entity_details_response[entity] = sorted(entity_data, key=location_key)

return entity_groups, entity_details_response, total_count
return entity_groups, entity_details, total_count


def add_custom_regex_analyzer_registry():
Expand Down

0 comments on commit a13617a

Please sign in to comment.