added evals file with Textgrad

PeoplePlusAI · Sep 28, 2024 · 6f1572e · 6f1572e
1 parent 6dd0648
commit 6f1572e
Show file tree

Hide file tree

Showing 8 changed files with 481 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -17,4 +17,5 @@ autogen/.cache/41/cache.db
 /__pycache__
 archive/sample_sukoon_code.ipynb
 whatsapp-bot
-/langgraph
+/langgraph
+prompts/prompts.txt
diff --git a/prompts/common_questions_asked.txt → data/common_questions_asked.txt b/prompts/common_questions_asked.txt → data/common_questions_asked.txt
diff --git a/prompts/sample_data.txt → data/sample_data.txt b/prompts/sample_data.txt → data/sample_data.txt
diff --git a/evals/analysis.py b/evals/analysis.py
@@ -0,0 +1,141 @@
+import openai
+import json
+import os  
+import requests
+import pandas as pd
+from dotenv import load_dotenv, find_dotenv
+from openai import OpenAI
+import re
+# Load environment variables from .env file
+load_dotenv(find_dotenv())
+
+api_key = os.getenv("OPENAI_API_KEY")
+SUPABASE_KEY = os.getenv("SUPABASE_CLIENT_ANON_KEY")
+
+client = OpenAI(
+    api_key=api_key,
+)
+
+def analyze_conversation(user_message, sukoon_response):
+    # Construct the prompt for the LLM
+    prompt = f"""
+            Given the following conversation between a user and a chatbot named 'Sukoon':
+
+            User: "{user_message}"
+
+            Sukoon: "{sukoon_response}"
+
+            Analyze the conversation and provide the analysis in the following JSON format:
+
+            {{
+            "user_analysis": {{
+                "primary_concern": "<main mental health issue or emotional state>",
+                "subject_category": "<choose from: General Greeting, Emotional Support, Mental Health Information, Coping Strategies, Crisis Management, Feedback, or Other>",
+                "emotional_tone": "<overall emotional tone of the user's message>"
+            }},
+            "sukoon_response_evaluation": {{
+                "empathy_rating": <rate from 1-5 how well Sukoon demonstrated empathy>,
+                "relevance_rating": <rate from 1-5 how relevant Sukoon's response was to the user's concern>,
+                "clarity_rating": <rate from 1-5 how clear and easy to understand Sukoon's response was>,
+                "helpfulness_rating": <rate from 1-5 how helpful Sukoon's suggestions or information were>,
+                "overall_rating": <calculate the average of the above ratings>,
+                "strengths": ["<list key strengths of Sukoon's response>"],
+                "areas_for_improvement": ["<list areas where Sukoon's response could be improved>"],
+                "suggested_follow_up": "<provide a suggestion for how Sukoon could follow up or what question it could ask next>"
+            }}
+            }}
+
+            Ensure that the output is valid JSON that can be parsed by Python's json.loads() function. Avoid including any markdown formatting like code blocks. Output only the JSON object.
+            """
+
+    # Call the OpenAI API to get the analysis
+    response = client.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {"role": "system", "content": "You are an assistant that analyzes conversations between users and the chatbot 'Sukoon'."},
+            {"role": "user", "content": prompt}
+        ],
+        temperature=0.7,
+        max_tokens=500,
+        response_format = { "type": "json_object" }
+    )
+
+    # Extract the assistant's reply
+    analysis_text = response.choices[0].message.content
+    # Attempt to parse the response as JSON
+    try:
+        # Remove code block formatting if present
+        analysis_text = analysis_text.strip()
+        if analysis_text.startswith("```"):
+            # Use regex to remove code block markers
+            analysis_text = re.sub(r'^```(?:json)?\n?', '', analysis_text)
+            analysis_text = re.sub(r'\n?```$', '', analysis_text)
+
+        analysis_json = json.loads(analysis_text)
+
+        # Add original messages to the analysis
+        analysis_json['user_message'] = user_message
+        analysis_json['sukoon_response'] = sukoon_response
+    # Attempt to parse the response as JSON
+        analysis_json = json.loads(analysis_text)
+        return analysis_json
+    except json.JSONDecodeError as e:
+        print("Failed to parse JSON:", e)
+        print("LLM Response:", analysis_text)
+        return None
+
+# Example usage
+if __name__ == "__main__":
+    # sample conversation
+    # user_message = "I've been feeling really down lately and I don't know why. Nothing seems to make me happy anymore."
+    # sukoon_response = "I'm sorry to hear that you've been feeling down. It's not uncommon to experience periods of low mood, and it can be frustrating when you can't pinpoint the reason. Have you noticed any changes in your daily routine or any recent stressful events? Sometimes, talking to a mental health professional can help you explore these feelings and find ways to cope. Would you like to discuss some simple self-care strategies that might help lift your mood?"
+
+    headers = {
+        'apikey': SUPABASE_KEY,
+        'Authorization': f'Bearer {SUPABASE_KEY}',
+    }
+    # to read all table
+    # response = requests.get('https://supabase.pplus.ai/rest/v1/Sukoon Student?select=*', headers=headers)
+    # to read specific columns - user_message and Output
+    response = requests.get('https://supabase.pplus.ai/rest/v1/Sukoon%20Student?select=user_message,Output', headers=headers)
+
+    if response.status_code ==200:
+        data = response.json()
+        df = pd.DataFrame(data)
+    elif response.status_code ==500:
+        print("Internal server error")
+    else:
+        print("Error fetching data:", response.status_code)
+        df = pd.DataFrame()
+
+    # Convert DataFrame to list of records
+    records = df.to_dict('records')
+
+    # List to store all analyses
+    analyses = []
+
+    # for index, row in df.iterrows():
+    for row in records:
+        user_message = row.get('user_message', '')
+        sukoon_response = row.get('Output', '')
+        if not user_message or not sukoon_response:
+            continue  # Skip if data is missing
+
+        analysis = analyze_conversation(user_message, sukoon_response)
+        analysis['user_message'] = user_message
+        analysis['sukoon_response'] = sukoon_response
+        if analysis:
+            analyses.append(analysis)
+            print(json.dumps(analysis, indent=2))
+        else:
+            # Handle cases where analysis failed
+            print(f"Analysis failed for conversation with user_message: {user_message}")
+
+        # append all files to one analysis text or somewhere suitable
+        with open("data/analysis_data/analysis.json", "w") as file:
+            # file.write(f"{json.dumps(analysis)}\n\n")
+            json.dump(analyses, file, indent=2)
+
+
+
+
diff --git a/evals/analysis.txt b/evals/analysis.txt
@@ -0,0 +1,2 @@
+{"user_analysis": {"primary_concern": "exam pressure and fear of failing", "subject_category": "Emotional Support", "emotional_tone": "anxious and overwhelmed"}, "sukoon_response_evaluation": {"empathy_rating": 4, "relevance_rating": 5, "clarity_rating": 5, "helpfulness_rating": 4, "overall_rating": 4.5, "strengths": ["Demonstrates understanding of the user's emotional state", "Offers practical suggestions to alleviate anxiety", "Encourages breaking down the problem into smaller steps"], "areas_for_improvement": ["Could provide more specific examples or resources", "Might ask about the user's study habits or current strategies"], "suggested_follow_up": "How about we explore one of those small steps together? Which topic do you think you could review first?"}}
+
diff --git a/evals/evals.md b/evals/evals.md
@@ -0,0 +1,83 @@
+## Explanation of the Sukoon Chatbot Evaluation Process
+
+### Purpose of Evaluation
+We evaluate the Sukoon chatbot's responses to ensure it's providing helpful, empathetic, and appropriate support for users' mental health concerns. This process helps us improve the chatbot's performance over time.
+
+### The Evaluation Process
+
+1. **User Input Analysis**  
+   We first look at what the user is saying to understand their needs.
+
+   | Aspect            | Description                                      | Example                             |
+   |-------------------|--------------------------------------------------|-------------------------------------|
+   | Primary Concern    | The main mental health issue or emotional state  | Depression, anxiety, stress         |
+   | Subject Category   | The type of support needed                       | Emotional support, coping strategies, crisis management |
+   | Emotional Tone     | The overall mood of the user's message          | Sad, frustrated, hopeful            |
+
+2. **Sukoon's Response Evaluation**  
+   We then analyze how well Sukoon responded to the user's needs.
+
+   | Criteria     | Description                                          | Scale |
+   |--------------|------------------------------------------------------|-------|
+   | Empathy      | How well Sukoon understood and acknowledged the user's feelings | 1-5   |
+   | Relevance    | How well Sukoon's response addressed the user's specific concern | 1-5   |
+   | Clarity      | How easy it was to understand Sukoon's response     | 1-5   |
+   | Helpfulness  | How useful Sukoon's suggestions or information were  | 1-5   |
+
+3. **Overall Assessment**  
+   We identify strengths and areas for improvement, and suggest follow-up questions.
+
+### Visual Representation of the Process
+
+```mermaid
+graph TD
+    A[User Message] --> B[Analyze User Input]
+    A --> C[Sukoon Response]
+    B --> D[Evaluate Sukoon's Response]
+    C --> D
+    D --> E[Overall Assessment]
+    E --> F[Identify Strengths]
+    E --> G[Identify Areas for Improvement]
+    E --> H[Suggest Follow-up]
+```
+
+### Why This Evaluation Matters
+
+1. **Ensures Quality:** By consistently evaluating responses, we can maintain a high standard of support.
+2. **Identifies Patterns:** We can spot recurring issues or exceptionally good responses.
+3. **Guides Improvements:** The feedback helps us refine Sukoon's responses and capabilities.
+4. **Enhances User Experience:** Better responses lead to more effective support for users.
+5. **Safety Check:** We can ensure Sukoon is appropriately handling sensitive or crisis situations.
+
+### Example Evaluation
+
+**User:** "I've been feeling really anxious about my upcoming job interview. I can't sleep at night."
+
+**Sukoon:** "I understand that job interviews can be stressful, and it's affecting your sleep. It's normal to feel anxious about important events. Have you tried any relaxation techniques like deep breathing or progressive muscle relaxation? These can help calm your mind and improve sleep. Also, preparing well for the interview might help boost your confidence. Would you like to explore some strategies to manage interview anxiety?"
+
+#### Evaluation:
+- **User Analysis:**
+  - Primary Concern: Anxiety
+  - Subject Category: Coping Strategies
+  - Emotional Tone: Worried
+
+- **Sukoon's Response Evaluation:**
+  - Empathy Rating: 4/5
+  - Relevance Rating: 5/5
+  - Clarity Rating: 4/5
+  - Helpfulness Rating: 4/5
+  - Overall Rating: 4.25/5
+
+- **Strengths:**
+  1. Acknowledged the user's feelings
+  2. Normalized the experience
+  3. Offered specific coping strategies
+  4. Provided a practical suggestion (interview preparation)
+
+- **Areas for Improvement:**
+  1. Could provide a specific deep breathing technique
+  2. Might ask about the user's current preparation status
+
+- **Suggested Follow-up:** "What steps have you already taken to prepare for your interview?"
+
+By regularly performing this type of evaluation, we can ensure that Sukoon is providing the best possible support to users dealing with mental health concerns.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		{"user_analysis": {"primary_concern": "exam pressure and fear of failing", "subject_category": "Emotional Support", "emotional_tone": "anxious and overwhelmed"}, "sukoon_response_evaluation": {"empathy_rating": 4, "relevance_rating": 5, "clarity_rating": 5, "helpfulness_rating": 4, "overall_rating": 4.5, "strengths": ["Demonstrates understanding of the user's emotional state", "Offers practical suggestions to alleviate anxiety", "Encourages breaking down the problem into smaller steps"], "areas_for_improvement": ["Could provide more specific examples or resources", "Might ask about the user's study habits or current strategies"], "suggested_follow_up": "How about we explore one of those small steps together? Which topic do you think you could review first?"}}