Skip to content

Commit

Permalink
feat: remove unneeded encryption of secrets
Browse files Browse the repository at this point in the history
Instead, use a UUID generator, as we do for PII, and reuse
the same session store mechanism.

Closes: #929
  • Loading branch information
yrobla committed Feb 20, 2025
1 parent b23effd commit 69c5fef
Show file tree
Hide file tree
Showing 10 changed files with 118 additions and 327 deletions.
9 changes: 1 addition & 8 deletions src/codegate/pipeline/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,9 @@ def secure_cleanup(self):
"""Securely cleanup sensitive data for this session"""
if self.manager is None or self.session_id == "":
return

self.manager.cleanup_session(self.session_id)
self.session_id = ""

# Securely wipe the API key using the same method as secrets manager
if self.api_key is not None:
api_key_bytes = bytearray(self.api_key.encode())
self.manager.crypto.wipe_bytearray(api_key_bytes)
self.api_key = None

self.model = None


Expand Down
65 changes: 20 additions & 45 deletions src/codegate/pipeline/pii/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,41 +7,11 @@

from codegate.db.models import AlertSeverity
from codegate.pipeline.base import PipelineContext
from codegate.session.session_store import SessionStore

logger = structlog.get_logger("codegate.pii.analyzer")


class PiiSessionStore:
"""
A class to manage PII (Personally Identifiable Information) session storage.
Attributes:
session_id (str): The unique identifier for the session. If not provided, a new UUID
is generated. mappings (Dict[str, str]): A dictionary to store mappings between UUID
placeholders and PII.
Methods:
add_mapping(pii: str) -> str:
Adds a PII string to the session store and returns a UUID placeholder for it.
get_pii(uuid_placeholder: str) -> str:
Retrieves the PII string associated with the given UUID placeholder. If the placeholder
is not found, returns the placeholder itself.
"""

def __init__(self, session_id: str = None):
self.session_id = session_id or str(uuid.uuid4())
self.mappings: Dict[str, str] = {}

def add_mapping(self, pii: str) -> str:
uuid_placeholder = f"<{str(uuid.uuid4())}>"
self.mappings[uuid_placeholder] = pii
return uuid_placeholder

def get_pii(self, uuid_placeholder: str) -> str:
return self.mappings.get(uuid_placeholder, uuid_placeholder)


class PiiAnalyzer:
"""
PiiAnalyzer class for analyzing and anonymizing text containing PII.
Expand All @@ -52,12 +22,12 @@ class PiiAnalyzer:
Get or create the singleton instance of PiiAnalyzer.
analyze:
text (str): The text to analyze for PII.
Tuple[str, List[Dict[str, Any]], PiiSessionStore]: The anonymized text, a list of
Tuple[str, List[Dict[str, Any]], SessionStore]: The anonymized text, a list of
found PII details, and the session store.
entities (List[str]): The PII entities to analyze for.
restore_pii:
anonymized_text (str): The text with anonymized PII.
session_store (PiiSessionStore): The PiiSessionStore used for anonymization.
session_store (SessionStore): The SessionStore used for anonymization.
str: The text with original PII restored.
"""

Expand Down Expand Up @@ -95,13 +65,13 @@ def __init__(self):
# Create analyzer with custom NLP engine
self.analyzer = AnalyzerEngine(nlp_engine=nlp_engine)
self.anonymizer = AnonymizerEngine()
self.session_store = PiiSessionStore()
self.session_store = SessionStore()

PiiAnalyzer._instance = self

def analyze(
self, text: str, context: Optional[PipelineContext] = None
) -> Tuple[str, List[Dict[str, Any]], PiiSessionStore]:
self, text: str, session_id: str, context: Optional[PipelineContext] = None
) -> Tuple[str, List[Dict[str, Any]]]:
# Prioritize credit card detection first
entities = [
"PHONE_NUMBER",
Expand Down Expand Up @@ -135,7 +105,7 @@ def analyze(
anonymized_text = text
for result in analyzer_results:
pii_value = text[result.start : result.end]
uuid_placeholder = self.session_store.add_mapping(pii_value)
uuid_placeholder = self.session_store.add_mapping(session_id, pii_value)
pii_info = {
"type": result.entity_type,
"value": pii_value,
Expand All @@ -155,7 +125,7 @@ def analyze(
uuid=uuid_placeholder,
# Don't log the actual PII value for security
value_length=len(pii_value),
session_id=self.session_store.session_id,
session_id=session_id,
)

# Log summary of all PII found in this analysis
Expand All @@ -176,30 +146,35 @@ def analyze(
"PII analysis complete",
total_pii_found=len(found_pii),
pii_types=[p["type"] for p in found_pii],
session_id=self.session_store.session_id,
session_id=session_id
)

# Return the anonymized text and the PII details
return anonymized_text, found_pii, self.session_store
return anonymized_text, found_pii

# If no PII found, return the original text and an empty list
return text, [], self.session_store
return text, []

def restore_pii(self, anonymized_text: str, session_store: PiiSessionStore) -> str:
def restore_pii(self, anonymized_text: str, session_id: str) -> str:
"""
Restore the original PII (Personally Identifiable Information) in the given anonymized text.
This method replaces placeholders in the anonymized text with their corresponding original
PII values using the mappings stored in the provided PiiSessionStore.
PII values using the mappings stored in the provided SessionStore.
Args:
anonymized_text (str): The text containing placeholders for PII.
session_store (PiiSessionStore): The session store containing mappings of placeholders
session_id (str): The ID of the session whose stored mappings of placeholders
to original PII are used for the restoration.
Returns:
str: The text with the original PII restored.
"""
for uuid_placeholder, original_pii in session_store.mappings.items():
session_data = self.session_store.get_by_session_id(session_id)
if not session_data:
logger.warning("No active PII session found for given session ID. Unable to restore PII.")
return anonymized_text

for uuid_placeholder, original_pii in session_data.items():
anonymized_text = anonymized_text.replace(uuid_placeholder, original_pii)
return anonymized_text
23 changes: 10 additions & 13 deletions src/codegate/pipeline/pii/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
import structlog

from codegate.pipeline.base import PipelineContext
from codegate.pipeline.pii.analyzer import PiiAnalyzer, PiiSessionStore
from codegate.pipeline.pii.analyzer import PiiAnalyzer
from codegate.session.session_store import SessionStore

logger = structlog.get_logger("codegate")

Expand All @@ -16,22 +17,22 @@ class PiiManager:
Attributes:
analyzer (PiiAnalyzer): The singleton instance of PiiAnalyzer used for
PII detection and restoration.
session_store (PiiSessionStore): The session store for the current PII session.
session_store (SessionStore): The session store for the current PII session.
Methods:
__init__():
Initializes the PiiManager with the singleton PiiAnalyzer instance and sets the
session store.
analyze(text: str) -> Tuple[str, List[Dict[str, Any]]]:
analyze(text: str, session_id: str) -> Tuple[str, List[Dict[str, Any]]]:
Analyzes the given text for PII, anonymizes it, and logs the detected PII details.
Args:
text (str): The text to be analyzed for PII.
Returns:
Tuple[str, List[Dict[str, Any]]]: A tuple containing the anonymized text and
a list of found PII details.
restore_pii(anonymized_text: str) -> str:
restore_pii(anonymized_text: str, session_id: str ) -> str:
Restores the PII in the given anonymized text using the current session.
Args:
anonymized_text (str): The text with anonymized PII to be restored.
Expand All @@ -48,16 +49,16 @@ def __init__(self):
self._session_store = self.analyzer.session_store

@property
def session_store(self) -> PiiSessionStore:
def session_store(self) -> SessionStore:
"""Get the current session store."""
# Always return the analyzer's current session store
return self.analyzer.session_store

def analyze(
self, text: str, context: Optional[PipelineContext] = None
self, text: str, session_id: str, context: Optional[PipelineContext] = None
) -> Tuple[str, List[Dict[str, Any]]]:
# Call analyzer and get results
anonymized_text, found_pii, _ = self.analyzer.analyze(text, context=context)
anonymized_text, found_pii = self.analyzer.analyze(text, session_id, context=context)

# Log found PII details (without modifying the found_pii list)
if found_pii:
Expand All @@ -72,13 +73,9 @@ def analyze(
# Return the exact same objects we got from the analyzer
return anonymized_text, found_pii

def restore_pii(self, anonymized_text: str) -> str:
def restore_pii(self, anonymized_text: str, session_id: str) -> str:
"""
Restore PII in the given anonymized text using the current session.
"""
if self.session_store is None:
logger.warning("No active PII session found. Unable to restore PII.")
return anonymized_text

# Use the analyzer's restore_pii method with the current session store
return self.analyzer.restore_pii(anonymized_text, self.session_store)
return self.analyzer.restore_pii(anonymized_text, session_id)
19 changes: 13 additions & 6 deletions src/codegate/pipeline/pii/pii.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re
from typing import Any, Dict, List, Optional
import uuid

import structlog
from litellm import ChatCompletionRequest, ChatCompletionSystemMessage, ModelResponse
Expand Down Expand Up @@ -37,7 +38,7 @@ class CodegatePii(PipelineStep):
Processes the chat completion request to detect and redact PII. Updates the request with
anonymized text and stores PII details in the context metadata.
restore_pii(anonymized_text: str) -> str:
restore_pii(anonymized_text: str, session_id: str) -> str:
Restores the original PII from the anonymized text using the PiiManager.
"""

Expand Down Expand Up @@ -75,12 +76,13 @@ async def process(
total_pii_found = 0
all_pii_details: List[Dict[str, Any]] = []
last_redacted_text = ""
session_id = context.session_id if hasattr(context, "session_id") else str(uuid.uuid4())

for i, message in enumerate(new_request["messages"]):
if "content" in message and message["content"]:
# This is where analyze and anonymize the text
original_text = str(message["content"])
anonymized_text, pii_details = self.pii_manager.analyze(original_text, context)
anonymized_text, pii_details = self.pii_manager.analyze(original_text, session_id, context)

if pii_details:
total_pii_found += len(pii_details)
Expand All @@ -99,6 +101,7 @@ async def process(
context.metadata["redacted_pii_count"] = total_pii_found
context.metadata["redacted_pii_details"] = all_pii_details
context.metadata["redacted_text"] = last_redacted_text
context.metadata["session_id"] = session_id

if total_pii_found > 0:
context.metadata["pii_manager"] = self.pii_manager
Expand All @@ -113,8 +116,8 @@ async def process(

return PipelineResult(request=new_request, context=context)

def restore_pii(self, anonymized_text: str) -> str:
return self.pii_manager.restore_pii(anonymized_text)
def restore_pii(self, anonymized_text: str, session_id: str) -> str:
return self.pii_manager.restore_pii(anonymized_text, session_id)


class PiiUnRedactionStep(OutputPipelineStep):
Expand Down Expand Up @@ -151,7 +154,7 @@ def _is_complete_uuid(self, uuid_str: str) -> bool:
"""Check if the string is a complete UUID"""
return bool(self.complete_uuid_pattern.match(uuid_str))

async def process_chunk(
async def process_chunk( # noqa: C901
self,
chunk: ModelResponse,
context: OutputPipelineContext,
Expand All @@ -162,6 +165,10 @@ async def process_chunk(
return [chunk]

content = chunk.choices[0].delta.content
session_id = input_context.metadata.get("session_id", "")
if not session_id:
logger.error("Could not get any session id, cannot process pii")
return [chunk]

# Add current chunk to buffer
if context.prefix_buffer:
Expand Down Expand Up @@ -199,7 +206,7 @@ async def process_chunk(
if pii_manager and pii_manager.session_store:
# Restore original value from PII manager
logger.debug("Attempting to restore PII from UUID marker")
original = pii_manager.session_store.get_pii(uuid_marker)
original = pii_manager.session_store.get_mapping(session_id, uuid_marker)
logger.debug(f"Restored PII: {original}")
result.append(original)
else:
Expand Down
Loading

0 comments on commit 69c5fef

Please sign in to comment.