Skip to content

Commit

Permalink
added evals file with Textgrad
Browse files Browse the repository at this point in the history
  • Loading branch information
luv-singh-ai committed Sep 28, 2024
1 parent 6dd0648 commit 6f1572e
Show file tree
Hide file tree
Showing 8 changed files with 481 additions and 1 deletion.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,5 @@ autogen/.cache/41/cache.db
/__pycache__
archive/sample_sukoon_code.ipynb
whatsapp-bot
/langgraph
/langgraph
prompts/prompts.txt
File renamed without changes.
File renamed without changes.
141 changes: 141 additions & 0 deletions evals/analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
import openai
import json
import os
import requests
import pandas as pd
from dotenv import load_dotenv, find_dotenv
from openai import OpenAI
import re
# Pull settings from the nearest .env file into the process environment
# before any key is looked up.
load_dotenv(find_dotenv())

# Both values are None when the corresponding variable is not set.
api_key = os.environ.get("OPENAI_API_KEY")
SUPABASE_KEY = os.environ.get("SUPABASE_CLIENT_ANON_KEY")

# Shared OpenAI client used by the analysis function below.
client = OpenAI(api_key=api_key)

def analyze_conversation(user_message, sukoon_response):
    """Ask the LLM to evaluate one user/Sukoon exchange.

    Builds an evaluation prompt around the two messages, sends it to the
    module-level OpenAI ``client`` in JSON mode, and parses the reply.

    Args:
        user_message: Text the user sent to the chatbot.
        sukoon_response: Text the chatbot replied with.

    Returns:
        The parsed analysis as a dict, with ``user_message`` and
        ``sukoon_response`` added as top-level keys, or ``None`` when the
        model returned no content or the content is not a JSON object.
    """
    # Construct the prompt for the LLM. The prompt body is kept flush-left so
    # the text sent to the model is exactly what is written here.
    prompt = f"""
Given the following conversation between a user and a chatbot named 'Sukoon':
User: "{user_message}"
Sukoon: "{sukoon_response}"
Analyze the conversation and provide the analysis in the following JSON format:
{{
"user_analysis": {{
"primary_concern": "<main mental health issue or emotional state>",
"subject_category": "<choose from: General Greeting, Emotional Support, Mental Health Information, Coping Strategies, Crisis Management, Feedback, or Other>",
"emotional_tone": "<overall emotional tone of the user's message>"
}},
"sukoon_response_evaluation": {{
"empathy_rating": <rate from 1-5 how well Sukoon demonstrated empathy>,
"relevance_rating": <rate from 1-5 how relevant Sukoon's response was to the user's concern>,
"clarity_rating": <rate from 1-5 how clear and easy to understand Sukoon's response was>,
"helpfulness_rating": <rate from 1-5 how helpful Sukoon's suggestions or information were>,
"overall_rating": <calculate the average of the above ratings>,
"strengths": ["<list key strengths of Sukoon's response>"],
"areas_for_improvement": ["<list areas where Sukoon's response could be improved>"],
"suggested_follow_up": "<provide a suggestion for how Sukoon could follow up or what question it could ask next>"
}}
}}
Ensure that the output is valid JSON that can be parsed by Python's json.loads() function. Avoid including any markdown formatting like code blocks. Output only the JSON object.
"""

    # Call the OpenAI API to get the analysis
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are an assistant that analyzes conversations between users and the chatbot 'Sukoon'."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.7,
        max_tokens=500,
        response_format={"type": "json_object"},
    )

    # Extract the assistant's reply; it may be absent, so guard before
    # calling string methods on it.
    analysis_text = response.choices[0].message.content
    if analysis_text is None:
        print("Failed to parse JSON:", "LLM returned no content")
        return None

    # Remove code block formatting if present
    analysis_text = analysis_text.strip()
    if analysis_text.startswith("```"):
        analysis_text = re.sub(r'^```(?:json)?\n?', '', analysis_text)
        analysis_text = re.sub(r'\n?```$', '', analysis_text)

    # Parse exactly once: a second json.loads() after the keys below are
    # attached would rebuild the dict and silently drop them.
    try:
        analysis_json = json.loads(analysis_text)
    except json.JSONDecodeError as e:
        print("Failed to parse JSON:", e)
        print("LLM Response:", analysis_text)
        return None

    # The prompt asks for a JSON object; anything else cannot carry the
    # extra keys and is treated as a failed analysis.
    if not isinstance(analysis_json, dict):
        print("Failed to parse JSON:", "expected a JSON object")
        print("LLM Response:", analysis_text)
        return None

    # Add original messages to the analysis
    analysis_json['user_message'] = user_message
    analysis_json['sukoon_response'] = sukoon_response
    return analysis_json

# Example usage
if __name__ == "__main__":
    # Batch run: fetch stored conversations from Supabase, analyse each one
    # with analyze_conversation(), and write all analyses to a JSON file.
    headers = {
        'apikey': SUPABASE_KEY,
        'Authorization': f'Bearer {SUPABASE_KEY}',
    }
    # Only the user_message and Output columns are needed; use `select=*`
    # in the query string to read every column instead.
    response = requests.get(
        'https://supabase.pplus.ai/rest/v1/Sukoon%20Student?select=user_message,Output',
        headers=headers,
        timeout=30,  # fail instead of hanging forever on an unresponsive server
    )

    if response.status_code == 200:
        df = pd.DataFrame(response.json())
    else:
        if response.status_code == 500:
            print("Internal server error")
        else:
            print("Error fetching data:", response.status_code)
        # Every failure path must define df, otherwise the code below
        # raises NameError; an empty frame yields zero records.
        df = pd.DataFrame()

    # Convert DataFrame to list of records
    records = df.to_dict('records')

    # List to store all analyses
    analyses = []

    for row in records:
        user_message = row.get('user_message', '')
        sukoon_response = row.get('Output', '')
        if not user_message or not sukoon_response:
            continue  # Skip if data is missing

        analysis = analyze_conversation(user_message, sukoon_response)
        if not analysis:
            # analyze_conversation returns None on failure; check before
            # touching the result so one bad reply does not abort the run.
            print(f"Analysis failed for conversation with user_message: {user_message}")
            continue

        analysis['user_message'] = user_message
        analysis['sukoon_response'] = sukoon_response
        analyses.append(analysis)
        print(json.dumps(analysis, indent=2))

    # Write all analyses to a single JSON file, creating the target
    # directory first so a fresh checkout does not fail on open().
    output_path = "data/analysis_data/analysis.json"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w") as file:
        json.dump(analyses, file, indent=2)




2 changes: 2 additions & 0 deletions evals/analysis.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"user_analysis": {"primary_concern": "exam pressure and fear of failing", "subject_category": "Emotional Support", "emotional_tone": "anxious and overwhelmed"}, "sukoon_response_evaluation": {"empathy_rating": 4, "relevance_rating": 5, "clarity_rating": 5, "helpfulness_rating": 4, "overall_rating": 4.5, "strengths": ["Demonstrates understanding of the user's emotional state", "Offers practical suggestions to alleviate anxiety", "Encourages breaking down the problem into smaller steps"], "areas_for_improvement": ["Could provide more specific examples or resources", "Might ask about the user's study habits or current strategies"], "suggested_follow_up": "How about we explore one of those small steps together? Which topic do you think you could review first?"}}

83 changes: 83 additions & 0 deletions evals/evals.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
## Explanation of the Sukoon Chatbot Evaluation Process

### Purpose of Evaluation
We evaluate the Sukoon chatbot's responses to ensure it's providing helpful, empathetic, and appropriate support for users' mental health concerns. This process helps us improve the chatbot's performance over time.

### The Evaluation Process

1. **User Input Analysis**
We first look at what the user is saying to understand their needs.

| Aspect | Description | Example |
|-------------------|--------------------------------------------------|-------------------------------------|
| Primary Concern | The main mental health issue or emotional state | Depression, anxiety, stress |
| Subject Category | The type of support needed | Emotional support, coping strategies, crisis management |
| Emotional Tone | The overall mood of the user's message | Sad, frustrated, hopeful |

2. **Sukoon's Response Evaluation**
We then analyze how well Sukoon responded to the user's needs.

| Criteria | Description | Scale |
|--------------|------------------------------------------------------|-------|
| Empathy | How well Sukoon understood and acknowledged the user's feelings | 1-5 |
| Relevance | How well Sukoon's response addressed the user's specific concern | 1-5 |
| Clarity | How easy it was to understand Sukoon's response | 1-5 |
| Helpfulness | How useful Sukoon's suggestions or information were | 1-5 |

3. **Overall Assessment**
We identify strengths and areas for improvement, and suggest follow-up questions.

### Visual Representation of the Process

```mermaid
graph TD
A[User Message] --> B[Analyze User Input]
A --> C[Sukoon Response]
B --> D[Evaluate Sukoon's Response]
C --> D
D --> E[Overall Assessment]
E --> F[Identify Strengths]
E --> G[Identify Areas for Improvement]
E --> H[Suggest Follow-up]
```

### Why This Evaluation Matters

1. **Ensures Quality:** By consistently evaluating responses, we can maintain a high standard of support.
2. **Identifies Patterns:** We can spot recurring issues or exceptionally good responses.
3. **Guides Improvements:** The feedback helps us refine Sukoon's responses and capabilities.
4. **Enhances User Experience:** Better responses lead to more effective support for users.
5. **Safety Check:** We can ensure Sukoon is appropriately handling sensitive or crisis situations.

### Example Evaluation

**User:** "I've been feeling really anxious about my upcoming job interview. I can't sleep at night."

**Sukoon:** "I understand that job interviews can be stressful, and it's affecting your sleep. It's normal to feel anxious about important events. Have you tried any relaxation techniques like deep breathing or progressive muscle relaxation? These can help calm your mind and improve sleep. Also, preparing well for the interview might help boost your confidence. Would you like to explore some strategies to manage interview anxiety?"

#### Evaluation:
- **User Analysis:**
- Primary Concern: Anxiety
- Subject Category: Coping Strategies
- Emotional Tone: Worried

- **Sukoon's Response Evaluation:**
- Empathy Rating: 4/5
- Relevance Rating: 5/5
- Clarity Rating: 4/5
- Helpfulness Rating: 4/5
- Overall Rating: 4.25/5

- **Strengths:**
1. Acknowledged the user's feelings
2. Normalized the experience
3. Offered specific coping strategies
4. Provided a practical suggestion (interview preparation)

- **Areas for Improvement:**
1. Could provide a specific deep breathing technique
2. Might ask about the user's current preparation status

- **Suggested Follow-up:** "What steps have you already taken to prepare for your interview?"

By regularly performing this type of evaluation, we can ensure that Sukoon is providing the best possible support to users dealing with mental health concerns.
Loading

0 comments on commit 6f1572e

Please sign in to comment.