diff --git a/pebblo/app/models/db_models.py b/pebblo/app/models/db_models.py
index 8eaa6507..642fd57a 100644
--- a/pebblo/app/models/db_models.py
+++ b/pebblo/app/models/db_models.py
@@ -81,12 +81,10 @@ class AiDataModel(BaseModel):
     data: Optional[Union[list, str]] = None
     entityCount: int
     entities: dict
+    entityDetails: Optional[dict] = {}
     topicCount: Optional[int] = 0
     topics: Optional[dict] = {}
-
-    def dict(self, **kwargs):
-        kwargs["exclude_none"] = True
-        return super().dict(**kwargs)
+    topicDetails: Optional[dict] = {}


 class RetrievalContext(BaseModel):
@@ -183,5 +181,7 @@ class AiSnippet(BaseModel):
     lastModified: Optional[str] = None
     entities: dict
     topics: dict
+    entityDetails: Optional[dict] = {}
+    topicDetails: Optional[dict] = {}
     policyViolations: Optional[List[dict]] = []
     # label_feedback: Optional[List[LabelFeedback]] = []
diff --git a/pebblo/app/service/loader/loader_doc_service.py b/pebblo/app/service/loader/loader_doc_service.py
index e1fb6077..2e156a13 100644
--- a/pebblo/app/service/loader/loader_doc_service.py
+++ b/pebblo/app/service/loader/loader_doc_service.py
@@ -170,8 +170,10 @@ def _get_doc_classification(self, doc):
             data=doc.get("doc", None),
             entities={},
             entityCount=0,
+            entityDetails={},
             topics={},
             topicCount=0,
+            topicDetails={},
         )
         try:
             if doc_info.data:
@@ -189,8 +191,10 @@ def _get_doc_classification(self, doc):
             )
             doc_info.topics = topics
             doc_info.entities = entities
+            doc_info.entityDetails = entity_details
             doc_info.topicCount = topic_count
             doc_info.entityCount = entity_count
+            doc_info.topicDetails = topic_details
             doc_info.data = anonymized_doc
             logger.debug("Doc classification finished.")
             return doc_info
@@ -209,6 +213,8 @@ def _update_doc_details(doc, doc_info):
         logger.debug("Update doc details with classification result")
         doc["entities"] = doc_info.entities
         doc["topics"] = doc_info.topics
+        doc["entity_details"] = doc_info.entityDetails
+        doc["topic_details"] = doc_info.topicDetails
         logger.debug("Input doc updated with classification result")

     @timeit
diff --git a/pebblo/app/service/loader/snippet/snippet.py b/pebblo/app/service/loader/snippet/snippet.py
index 10362093..2dabb3df 100644
--- a/pebblo/app/service/loader/snippet/snippet.py
+++ b/pebblo/app/service/loader/snippet/snippet.py
@@ -71,6 +71,8 @@ def create_snippet(self, doc, data_source, document):
             "loaderSourcePath": data_source.get("sourcePath"),
             "entities": doc.get("entities", {}),
             "topics": doc.get("topics", {}),
+            "entityDetails": doc.get("entity_details", {}),
+            "topicDetails": doc.get("topic_details", {}),
         }
         ai_snippet_obj = AiSnippet(**snippet_details)
         ai_snippet = ai_snippet_obj.dict()
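Taken together, these three files thread the new `entityDetails`/`topicDetails` fields from classification through storage. Note also that dropping the `dict()` override on `AiDataModel` means `exclude_none=True` is no longer applied on serialization. Below is a minimal sketch of the extended payload; it assumes pydantic v1-style models as in `db_models.py`, and the per-label detail shape and sample values are illustrative guesses, not confirmed classifier output.

```python
# Minimal sketch of the extended classification payload (mirrors AiDataModel
# above). The {label: [occurrence, ...]} detail shape is an assumption made
# for illustration; the sample values are invented.
from typing import Optional, Union

from pydantic import BaseModel


class AiDataModel(BaseModel):
    data: Optional[Union[list, str]] = None
    entityCount: int
    entities: dict
    entityDetails: Optional[dict] = {}
    topicCount: Optional[int] = 0
    topics: Optional[dict] = {}
    topicDetails: Optional[dict] = {}


doc_info = AiDataModel(
    data="John's SSN is <US_SSN>",
    entityCount=1,
    entities={"us-ssn": 1},
    # Hypothetical per-label occurrence metadata.
    entityDetails={"us-ssn": [{"location": "14_25", "confidence_score": "HIGH"}]},
)
# With the dict() override removed, None fields are no longer excluded by default.
print(doc_info.dict())
```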
diff --git a/pebblo/app/service/local_ui/loader_apps.py b/pebblo/app/service/local_ui/loader_apps.py
index bd96608b..8084c94f 100644
--- a/pebblo/app/service/local_ui/loader_apps.py
+++ b/pebblo/app/service/local_ui/loader_apps.py
@@ -37,106 +37,139 @@ def __init__(self):
         self.loader_document_with_findings_list = []
         self.loader_findings_summary_list = []

-    def _get_snippet_details(self, snippet_ids, owner):
+    def _get_snippet_details(self, snippet_ids, owner, label_name):
+        """
+        Return snippet details for the given snippet ids, filtered by label
+        """
+
         response = []
         for snippet_id in snippet_ids:
             status, output = self.db.query(AiSnippetsTable, {"id": snippet_id})
             if not status or len(output) == 0:
                 continue
             snippet_details = output[0].data
+            entity_details = {}
+            topic_details = {}
+            if snippet_details.get("topicDetails") and snippet_details[
+                "topicDetails"
+            ].get(label_name):
+                topic_details = {
+                    label_name: snippet_details["topicDetails"].get(label_name)
+                }
+            if snippet_details.get("entityDetails") and snippet_details[
+                "entityDetails"
+            ].get(label_name):
+                entity_details = {
+                    label_name: snippet_details["entityDetails"].get(label_name)
+                }
             snippet_obj = {
                 "snippet": snippet_details["doc"],
                 "sourcePath": snippet_details["sourcePath"],
-                # "topicDetails": {},  # TODO: To be added post 0.1.18
-                # "entityDetails": {},  # TODO: to be added post 0.1.18
+                "topicDetails": topic_details,
+                "entityDetails": entity_details,
                 "fileOwner": owner,
                 "authorizedIdentities": [],
             }
             response.append(snippet_obj)
         return response

-    def get_findings_for_loader_app(self, app_data):
-        topic_count = 0
-        entity_count = 0
-        total_snippet_count = 0
-        snippets = []
-        if app_data.get("docEntities"):
-            for entity, entity_data in app_data.get("docEntities").items():
-                entity_count += entity_data.get("count")
-                self.loader_findings += entity_data.get("count")
-
-                findings_exists = False
-                for findings in self.loader_findings_list:
-                    if findings.get("labelName") == entity:
-                        findings_exists = True
-                        findings["findings"] += entity_data["count"]
-                        findings["snippetCount"] += len(entity_data["snippetIds"])
-                        findings["fileCount"] = len(app_data["documents"])
-                        total_snippet_count += findings["snippetCount"]
-                        snippets.extend(
-                            self._get_snippet_details(
-                                entity_data["snippetIds"], app_data["owner"]
-                            )
-                        )
-                        break
-                if not findings_exists:
-                    logger.debug("finding not exist")
-                    findings = {
-                        "appName": app_data["name"],
-                        "labelName": entity,
-                        "findings": entity_data["count"],
-                        "findingsType": "entities",
-                        "snippetCount": len(entity_data["snippetIds"]),
-                        "fileCount": len(app_data["documents"]),
-                        "snippets": self._get_snippet_details(
-                            entity_data["snippetIds"], app_data["owner"]
-                        ),
-                    }
-                    total_snippet_count += findings["snippetCount"]
-                    shallow_copy = findings.copy()
-                    self.loader_findings_list.append(shallow_copy)
-                    del findings["snippets"]
-                    self.loader_findings_summary_list.append(findings)
-
-        if app_data.get("docTopics"):
-            for topic, topic_data in app_data.get("docTopics").items():
-                topic_count += topic_data.get("count")
-                self.loader_findings += topic_data.get("count")
-
-                findings_exists = False
-                for findings in self.loader_findings_list:
-                    if findings.get("labelName") == topic:
-                        findings_exists = True
-                        findings["findings"] += topic_data["count"]
-                        findings["snippetCount"] += len(topic_data["snippetIds"])
-                        findings["fileCount"] = len(app_data["documents"])
-                        total_snippet_count += findings["snippetCount"]
-                        snippets.extend(
-                            self._get_snippet_details(
-                                topic_data["snippetIds"], app_data["owner"]
-                            )
-                        )
-                        break
-                if not findings_exists:
-                    findings = {
-                        "appName": app_data["name"],
-                        "labelName": topic,
-                        "findings": topic_data["count"],
-                        "findingsType": "topics",
-                        "snippetCount": len(topic_data["snippetIds"]),
-                        "fileCount": len(app_data["documents"]),
-                        "snippets": self._get_snippet_details(
-                            topic_data["snippetIds"], app_data["owner"]
-                        ),
-                    }
-                    total_snippet_count += findings["snippetCount"]
-                    shallow_copy = findings.copy()
-                    self.loader_findings_list.append(shallow_copy)
-                    del findings["snippets"]
-                    self.loader_findings_summary_list.append(findings)
-
-        # Data Source Details
-        status, data_sources = self.db.query(
+    def _findings_for_app_entities(
+        self, app_data, snippets, total_snippet_count, entity_count
+    ):
+        """
+        Aggregate entity findings for the given loader app
+        """
+
+        for entity, entity_data in app_data.get("docEntities").items():
+            entity_count += entity_data.get("count")
+            self.loader_findings += entity_data.get("count")
+
+            findings_exists = False
+            for findings in self.loader_findings_list:
+                if findings.get("labelName") == entity:
+                    findings_exists = True
+                    findings["findings"] += entity_data["count"]
+                    findings["snippetCount"] += len(entity_data["snippetIds"])
+                    findings["fileCount"] = len(app_data["documents"])
+                    total_snippet_count += findings["snippetCount"]
+                    snippets.extend(
+                        self._get_snippet_details(
+                            entity_data["snippetIds"], app_data["owner"], entity
+                        )
+                    )
+                    break
+            if not findings_exists:
+                logger.debug("Finding does not exist")
+                findings = {
+                    "appName": app_data["name"],
+                    "labelName": entity,
+                    "findings": entity_data["count"],
+                    "findingsType": "entities",
+                    "snippetCount": len(entity_data["snippetIds"]),
+                    "fileCount": len(app_data["documents"]),
+                    "snippets": self._get_snippet_details(
+                        entity_data["snippetIds"], app_data["owner"], entity
+                    ),
+                }
+                total_snippet_count += findings["snippetCount"]
+                shallow_copy = findings.copy()
+                self.loader_findings_list.append(shallow_copy)
+                del findings["snippets"]
+                self.loader_findings_summary_list.append(findings)
+        return entity_count, snippets, total_snippet_count
+
+    def _findings_for_app_topics(
+        self, app_data, snippets, total_snippet_count, topic_count
+    ):
+        """
+        Aggregate topic findings for the given loader app
+        """
+
+        for topic, topic_data in app_data.get("docTopics").items():
+            topic_count += topic_data.get("count")
+            self.loader_findings += topic_data.get("count")
+
+            findings_exists = False
+            for findings in self.loader_findings_list:
+                if findings.get("labelName") == topic:
+                    findings_exists = True
+                    findings["findings"] += topic_data["count"]
+                    findings["snippetCount"] += len(topic_data["snippetIds"])
+                    findings["fileCount"] = len(app_data["documents"])
+                    total_snippet_count += findings["snippetCount"]
+                    snippets.extend(
+                        self._get_snippet_details(
+                            topic_data["snippetIds"], app_data["owner"], topic
+                        )
+                    )
+                    break
+            if not findings_exists:
+                findings = {
+                    "appName": app_data["name"],
+                    "labelName": topic,
+                    "findings": topic_data["count"],
+                    "findingsType": "topics",
+                    "snippetCount": len(topic_data["snippetIds"]),
+                    "fileCount": len(app_data["documents"]),
+                    "snippets": self._get_snippet_details(
+                        topic_data["snippetIds"], app_data["owner"], topic
+                    ),
+                }
+                total_snippet_count += findings["snippetCount"]
+                shallow_copy = findings.copy()
+                self.loader_findings_list.append(shallow_copy)
+                del findings["snippets"]
+                self.loader_findings_summary_list.append(findings)
+        return topic_count, snippets, total_snippet_count
+
+    def _update_loader_datasource(
+        self, app_data, entity_count, topic_count, total_snippet_count
+    ):
+        """
+        Update loader data source details and counts
+        """
+
+        _, data_sources = self.db.query(
             AiDataSourceTable, {"loadId": app_data.get("id")}
         )
         for data_source in data_sources:
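`_get_snippet_details` is now invoked once per label, so only the detail entries for that label survive into the snippet object. A standalone sketch of that filtering; `filter_details_by_label` and the `record` dict are hypothetical stand-ins for the method and the DB row's `.data`:

```python
# Keep only the entityDetails/topicDetails entries that match label_name,
# mirroring the filtering _get_snippet_details performs per snippet record.
def filter_details_by_label(snippet_record: dict, label_name: str) -> dict:
    details = {"entityDetails": {}, "topicDetails": {}}
    for key in ("entityDetails", "topicDetails"):
        per_label = snippet_record.get(key) or {}
        if per_label.get(label_name):
            details[key] = {label_name: per_label[label_name]}
    return details


# Invented sample record for illustration.
record = {
    "doc": "Jane's card number is ****",
    "entityDetails": {"credit-card-number": [{"location": "21_25"}]},
    "topicDetails": {},
}
print(filter_details_by_label(record, "credit-card-number"))
# {'entityDetails': {'credit-card-number': [{'location': '21_25'}]}, 'topicDetails': {}}
```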
"findingsType": "topics", - "snippetCount": len(topic_data["snippetIds"]), - "fileCount": len(app_data["documents"]), - "snippets": self._get_snippet_details( - topic_data["snippetIds"], app_data["owner"] - ), - } + ) + break + if not findings_exists: + logger.debug("finding not exist") + findings = { + "appName": app_data["name"], + "labelName": entity, + "findings": entity_data["count"], + "findingsType": "entities", + "snippetCount": len(entity_data["snippetIds"]), + "fileCount": len(app_data["documents"]), + "snippets": self._get_snippet_details( + entity_data["snippetIds"], app_data["owner"], entity + ), + } + total_snippet_count += findings["snippetCount"] + shallow_copy = findings.copy() + self.loader_findings_list.append(shallow_copy) + del findings["snippets"] + self.loader_findings_summary_list.append(findings) + return entity_count, snippets, total_snippet_count + + def _findings_for_app_topics( + self, app_data, snippets, total_snippet_count, topic_count + ): + """ + This function finds findings for apps with topics + """ + + for topic, topic_data in app_data.get("docTopics").items(): + topic_count += topic_data.get("count") + self.loader_findings += topic_data.get("count") + + findings_exists = False + for findings in self.loader_findings_list: + if findings.get("labelName") == topic: + findings_exists = True + findings["findings"] += topic_data["count"] + findings["snippetCount"] += len(topic_data["snippetIds"]) + findings["fileCount"] = len(app_data["documents"]) total_snippet_count += findings["snippetCount"] - shallow_copy = findings.copy() - self.loader_findings_list.append(shallow_copy) - del findings["snippets"] - self.loader_findings_summary_list.append(findings) + snippets.extend( + self._get_snippet_details( + topic_data["snippetIds"], app_data["owner"], topic + ) + ) + break + if not findings_exists: + findings = { + "appName": app_data["name"], + "labelName": topic, + "findings": topic_data["count"], + "findingsType": "topics", + "snippetCount": len(topic_data["snippetIds"]), + "fileCount": len(app_data["documents"]), + "snippets": self._get_snippet_details( + topic_data["snippetIds"], app_data["owner"], topic + ), + } + total_snippet_count += findings["snippetCount"] + shallow_copy = findings.copy() + self.loader_findings_list.append(shallow_copy) + del findings["snippets"] + self.loader_findings_summary_list.append(findings) + return topic_count, snippets, total_snippet_count + + def _update_loader_datasource( + self, app_data, entity_count, topic_count, total_snippet_count + ): + """ + This function updates loader datasource details and count + """ - # Data Source Details - status, data_sources = self.db.query( + _, data_sources = self.db.query( AiDataSourceTable, {"loadId": app_data.get("id")} ) for data_source in data_sources: @@ -158,10 +191,12 @@ def get_findings_for_loader_app(self, app_data): # Data Source Count self.loader_data_source = len(self.loader_data_source_list) - # Fetch required data for DocumentWithFindings - status, documents = self.db.query( - AiDocumentTable, {"loadId": app_data.get("id")} - ) + def _get_documents_with_findings(self, app_data): + """ + Fetch required data for DocumentWithFindings + """ + + _, documents = self.db.query(AiDocumentTable, {"loadId": app_data.get("id")}) loader_document_with_findings = app_data.get("documentsWithFindings") documents_with_findings_data = [] for document in documents: @@ -184,6 +219,33 @@ def get_findings_for_loader_app(self, app_data): # Documents with findings Count self.loader_files_findings = 
@@ -216,9 +278,9 @@ def get_all_loader_apps(self):
                 continue
             self.loader_apps_at_risk += 1

-            loader_app = self.get_findings_for_loader_app(app_data)
-            all_loader_apps.append(loader_app)
-            app_processed.append(app_data["name"])
+        loader_app = self.get_findings_for_loader_app(app_data)
+        all_loader_apps.append(loader_app)
+        app_processed.append(app_data["name"])

         # TODO: Sort loader apps
         # sorted_loader_apps = self._sort_loader_apps(all_loader_apps)
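End to end, the local UI now receives snippet objects with populated detail maps instead of the earlier commented-out TODO stubs. A hypothetical example of the final payload shape as assembled by `_get_snippet_details`; all values are invented for illustration:

```python
# Hypothetical snippet payload for the label "credit-card-number".
snippet_obj = {
    "snippet": "Card number: 4111-****-****-1111",
    "sourcePath": "/data/invoices.csv",
    "topicDetails": {},
    "entityDetails": {"credit-card-number": [{"location": "13_32"}]},
    "fileOwner": "jane",
    "authorizedIdentities": [],
}
```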