Showing anonymized data in DB apps (daxa-ai#533)

* Showing anonymized data in DB apps * Updating location changes for fixing local UI issue
Raj725 · Sep 11, 2024 · a13617a · a13617a
1 parent 6ebdf6e
commit a13617a
Show file tree

Hide file tree

Showing 3 changed files with 79 additions and 30 deletions.
diff --git a/pebblo/app/service/loader/loader_doc_service.py b/pebblo/app/service/loader/loader_doc_service.py
@@ -227,6 +227,7 @@ def _update_doc_details(doc, doc_info):
         doc["topics"] = doc_info.topics
         doc["entity_details"] = doc_info.entityDetails
         doc["topic_details"] = doc_info.topicDetails
+        doc["doc"] = doc_info.data
         logger.debug("Input doc updated with classification result")
 
     @timeit

diff --git a/pebblo/entity_classifier/entity_classifier.py b/pebblo/entity_classifier/entity_classifier.py
@@ -47,7 +47,9 @@ def custom_analyze(self):
         # Add the private key recognizer to the Presidio Analyzer
         self.analyzer.registry.add_recognizer(pk_recognizer)
 
-    def analyze_response(self, input_text, anonymize_all_entities=True):
+    def analyze_response(
+        self, input_text: str, anonymize_all_entities: bool = True
+    ) -> list:
         """
         Analyze the given input text to detect and classify entities based on predefined criteria.
 
@@ -90,7 +92,9 @@ def analyze_response(self, input_text, anonymize_all_entities=True):
         # Return the list of classified entities that met the criteria
         return final_results
 
-    def anonymize_response(self, analyzer_results, input_text):
+    def anonymize_response(
+        self, analyzer_results: list, input_text: str
+    ) -> (list, str):
         # Returns anonymized output
         anonymized_text = self.anonymizer.anonymize(
             text=input_text, analyzer_results=analyzer_results
@@ -99,33 +103,90 @@ def anonymize_response(self, analyzer_results, input_text):
         return anonymized_text.items, anonymized_text.text
 
     @staticmethod
-    def get_analyzed_entities_response(data, anonymized_response=None):
+    def _sort_analyzed_data(data: list) -> list:
+        """
+        Convert the analyzed results into dictionaries and sort them by start position
+        """
+        # Convert input data into dictionary structure
+        analyzed_data = [
+            {
+                "entity_type": entry.entity_type,
+                "start": entry.start,
+                "end": entry.end,
+                "score": entry.score,
+            }
+            for entry in data
+        ]
+        analyzed_data.sort(key=lambda x: x["start"])
+        return analyzed_data
+
+    @staticmethod
+    def _sort_anonymized_data(data: list) -> list:
+        """
+        Convert the anonymized results into dictionaries and sort them by start position
+        """
+        # Convert input data into dictionary structure
+        anonymized_data = [
+            {"entity_type": entry.entity_type, "start": entry.start, "end": entry.end}
+            for entry in data
+        ]
+
+        # Sort data based on start
+        anonymized_data.sort(key=lambda x: x["start"])
+        return anonymized_data
+
+    @staticmethod
+    def update_anonymized_location(
+        start: int, end: int, location_count: int
+    ) -> (str, int):
+        """
+        Because < is replaced with &lt; and > with &gt; in the anonymized text, the location must be adjusted
+        to match the updated text. Each replacement adds 3 characters (&lt; vs. <, and &gt; vs. >), so 6 (3 + 3)
+        is added to the end location of the current entity. The running offset (location_count) is applied to
+        both start and end, then increased by 6 so that later entities are shifted by the earlier replacements.
+        """
+        location = f"{start+location_count}_{end+location_count+6}"
+        location_count += 6
+        return location, location_count
+
+    def get_analyzed_entities_response(
+        self, data: list, anonymized_response: list = None
+    ) -> list:
         # Returns entities with its location i.e. start to end and confidence score
+
+        analyzed_data = self._sort_analyzed_data(data)
+        if anonymized_response:
+            anonymized_response = self._sort_anonymized_data(anonymized_response)
+
         response = []
-        for index, value in enumerate(data):
+        location_count = 0
+        for index, value in enumerate(analyzed_data):
             mapped_entity = None
-            if value.entity_type in Entities.__members__:
-                mapped_entity = Entities[value.entity_type].value
-            elif value.entity_type in SecretEntities.__members__:
-                mapped_entity = SecretEntities[value.entity_type].value
+            if value["entity_type"] in Entities.__members__:
+                mapped_entity = Entities[value["entity_type"]].value
+            elif value["entity_type"] in SecretEntities.__members__:
+                mapped_entity = SecretEntities[value["entity_type"]].value
 
-            location = f"{value.start}_{value.end}"
+            location = f"{value['start']}_{value['end']}"
             if anonymized_response:
-                anonymized_data = anonymized_response[len(data) - index - 1]
-                location = f"{anonymized_data.start}_{anonymized_data.end}"
+                anonymized_data = anonymized_response[index]
+                if anonymized_data["entity_type"] == value["entity_type"]:
+                    location, location_count = self.update_anonymized_location(
+                        anonymized_data["start"], anonymized_data["end"], location_count
+                    )
             response.append(
                 {
-                    "entity_type": value.entity_type,
+                    "entity_type": value["entity_type"],
                     "location": location,
-                    "confidence_score": value.score,
+                    "confidence_score": value["score"],
                     "entity_group": entity_group_conf_mapping[mapped_entity][1],
                 }
             )
         return response
 
     def presidio_entity_classifier_and_anonymizer(
-        self, input_text, anonymize_snippets=False
-    ):
+        self, input_text: str, anonymize_snippets: bool = False
+    ) -> (dict, int, str, dict):
         """
         Perform classification on the input data and return a dictionary with the count of each entity group.
         And also returns plain input text as anonymized text output

diff --git a/pebblo/entity_classifier/utils/utils.py b/pebblo/entity_classifier/utils/utils.py
@@ -15,21 +15,12 @@
 from pebblo.utils import get_confidence_score_label
 
 
-def location_key(item):
-    """
-    To convert the location string into a tuple of integers
-    """
-    loc = item["location"]
-    return tuple(map(int, loc.split("_")))
-
-
-def get_entities(entities_list, response):
+def get_entities(entities_list: list, response: list) -> (dict, dict, int):
     """
     Returns entity groups, its details such as confidence score, location and its group grouped by entity type
     """
     entity_groups: dict = dict()
     entity_details: dict = dict()
-    entity_details_response: dict = dict()
 
     mapped_entity = None
     total_count = 0
@@ -53,11 +44,7 @@ def get_entities(entities_list, response):
                 entity_details[mapped_entity] = [entity_data]
             total_count += 1
 
-    for entity, entity_data in entity_details.items():
-        # Sorting entity details based on location in ascending order
-        entity_details_response[entity] = sorted(entity_data, key=location_key)
-
-    return entity_groups, entity_details_response, total_count
+    return entity_groups, entity_details, total_count
 
 
 def add_custom_regex_analyzer_registry():