-
Notifications
You must be signed in to change notification settings - Fork 3.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add crew Testing/Evaluating feature (#998)
* feat: add crew Testing/evaluating feature * feat: add docs and add unit test * feat: improve testing output table * feat: add tests * feat: fix type checking issue * feat: add raise ValueError when testing if output is not the expected * docs: add docs for Testing * feat: improve tests and fix some issue * feat: back to sync * feat: change openai model * feat: fix test
- Loading branch information
1 parent
2d086ab
commit 2d2154e
Showing
7 changed files
with
350 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
--- | ||
title: crewAI Testing | ||
description: Learn how to test your crewAI Crew and evaluate its performance. | ||
--- | ||
|
||
## Introduction | ||
|
||
Testing is a crucial part of the development process, and it is essential to ensure that your crew is performing as expected. With crewAI, you can easily test your crew and evaluate its performance using the built-in testing capabilities. | ||
|
||
### Using the Testing Feature | ||
|
||
We added the CLI command `crewai test` to make it easy to test your crew. This command will run your crew for a specified number of iterations and provide detailed performance metrics. | ||
The parameters `n_iterations` and `model` are optional and default to 2 and `gpt-4o-mini`, respectively. For now, the only provider available is OpenAI. | ||
|
||
```bash | ||
crewai test | ||
``` | ||
|
||
If you want to run more iterations or use a different model, you can specify the parameters like this: | ||
|
||
```bash | ||
crewai test --n_iterations 5 --model gpt-4o | ||
``` | ||
|
||
When you run the `crewai test` command, the crew is executed for the specified number of iterations, and the performance metrics are displayed at the end of the run. | ||
|
||
A table of scores at the end will show the performance of the crew in terms of the following metrics: | ||
``` | ||
Tasks Scores | ||
(1-10 Higher is better) | ||
┏━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━┓ | ||
┃ Tasks/Crew ┃ Run 1 ┃ Run 2 ┃ Avg. Total ┃ | ||
┡━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━┩ | ||
│ Task 1 │ 10.0 │ 9.0 │ 9.5 │ | ||
│ Task 2 │ 9.0 │ 9.0 │ 9.0 │ | ||
│ Crew │ 9.5 │ 9.0 │ 9.2 │ | ||
└────────────┴───────┴───────┴────────────┘ | ||
``` | ||
|
||
The example above shows the test results for two runs of the crew with two tasks, with the average total score for each task and the crew as a whole. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
149 changes: 149 additions & 0 deletions
149
src/crewai/utilities/evaluators/crew_evaluator_handler.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
from collections import defaultdict | ||
|
||
from langchain_openai import ChatOpenAI | ||
from pydantic import BaseModel, Field | ||
from rich.console import Console | ||
from rich.table import Table | ||
|
||
from crewai.agent import Agent | ||
from crewai.task import Task | ||
from crewai.tasks.task_output import TaskOutput | ||
|
||
|
||
class TaskEvaluationPydanticOutput(BaseModel):
    # Structured result requested from the evaluation task (it is passed as
    # `output_pydantic` when that task is built): a single numeric score.
    quality: float = Field(
        description=(
            "A score from 1 to 10 evaluating on completion, quality, and overall "
            "performance from the task_description and task_expected_output to the "
            "actual Task Output."
        )
    )
|
||
|
||
class CrewEvaluator:
    """
    A class to evaluate the performance of the agents in the crew based on the tasks they have performed.

    On construction, `evaluate` is installed as the callback of every task in
    the crew. Each callback invocation scores one task output with an
    evaluator agent and stores the score under the current iteration.

    Attributes:
        crew (Crew): The crew of agents to evaluate.
        openai_model_name (str): The model to use for evaluating the performance of the agents (for now ONLY OpenAI accepted).
        tasks_scores (defaultdict): Maps an iteration number to the list of task scores recorded during that iteration.
        iteration (int): The current iteration of the evaluation.
    """

    # Declared here for type checkers only; the container itself is created
    # per instance in __init__ so evaluators never share recorded scores.
    tasks_scores: defaultdict
    iteration: int = 0

    def __init__(self, crew, openai_model_name: str):
        """
        Store the crew and model name, then hook the evaluator into the crew's tasks.

        Args:
            crew: The crew whose tasks will be evaluated.
            openai_model_name: Name of the OpenAI model used by the evaluator agent.
        """
        self.crew = crew
        self.openai_model_name = openai_model_name
        # A class-level defaultdict would be one object shared by every
        # CrewEvaluator, leaking scores from one evaluation into the next.
        self.tasks_scores = defaultdict(list)
        self._setup_for_evaluating()

    def _setup_for_evaluating(self) -> None:
        """Sets up the crew for evaluating."""
        for task in self.crew.tasks:
            task.callback = self.evaluate

    def set_iteration(self, iteration: int) -> None:
        """Set the iteration number under which subsequent scores are stored."""
        self.iteration = iteration

    def _evaluator_agent(self):
        """Build the agent that scores task outputs, backed by the configured OpenAI model."""
        return Agent(
            role="Task Execution Evaluator",
            goal=(
                "Your goal is to evaluate the performance of the agents in the crew based on the tasks they have performed using score from 1 to 10 evaluating on completion, quality, and overall performance."
            ),
            backstory="Evaluator agent for crew evaluation with precise capabilities to evaluate the performance of the agents in the crew based on the tasks they have performed",
            verbose=False,
            llm=ChatOpenAI(model=self.openai_model_name),
        )

    def _evaluation_task(
        self, evaluator_agent: Agent, task_to_evaluate: Task, task_output: str
    ) -> Task:
        """
        Build the task that asks the evaluator agent to score one task output.

        Args:
            evaluator_agent: The agent that will perform the evaluation.
            task_to_evaluate: The task whose output is being scored.
            task_output: The raw output produced for `task_to_evaluate`.

        Returns:
            A Task whose pydantic output type is TaskEvaluationPydanticOutput.
        """
        return Task(
            description=(
                "Based on the task description and the expected output, compare and evaluate the performance of the agents in the crew based on the Task Output they have performed using score from 1 to 10 evaluating on completion, quality, and overall performance."
                f"task_description: {task_to_evaluate.description} "
                f"task_expected_output: {task_to_evaluate.expected_output} "
                f"agent: {task_to_evaluate.agent.role if task_to_evaluate.agent else None} "
                f"agent_goal: {task_to_evaluate.agent.goal if task_to_evaluate.agent else None} "
                f"Task Output: {task_output}"
            ),
            expected_output="Evaluation Score from 1 to 10 based on the performance of the agents on the tasks",
            agent=evaluator_agent,
            output_pydantic=TaskEvaluationPydanticOutput,
        )

    def print_crew_evaluation_result(self) -> None:
        """
        Prints the evaluation result of the crew in a table.

        Columns are the iterations that actually have recorded scores, in
        ascending order and labelled with their iteration number. If nothing
        has been recorded yet, a short notice is printed instead of a table.

        A Crew with 2 tasks using the command crewai test -n 2
        will output the following table:

                        Tasks Scores
                   (1-10 Higher is better)
            ┏━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━┓
            ┃ Tasks/Crew ┃ Run 1 ┃ Run 2 ┃ Avg. Total ┃
            ┡━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━┩
            │ Task 1     │ 10.0  │ 9.0   │    9.5     │
            │ Task 2     │ 9.0   │ 9.0   │    9.0     │
            │ Crew       │ 9.5   │ 9.0   │    9.2     │
            └────────────┴───────┴───────┴────────────┘
        """
        # Iterate over the keys that were really recorded instead of assuming
        # they are 1..n: indexing a missing key on the defaultdict would
        # silently create an empty run and break the averages below.
        runs = sorted(run for run, scores in self.tasks_scores.items() if scores)
        scores_by_run = [self.tasks_scores[run] for run in runs]

        # zip(*...) transposes run-major scores into one tuple per task.
        task_averages = [
            sum(scores) / len(scores) for scores in zip(*scores_by_run)
        ]

        if not task_averages:
            # Nothing to average; avoid dividing by zero.
            Console().print("No evaluation scores to display.")
            return

        crew_average = sum(task_averages) / len(task_averages)

        # Create a table
        table = Table(title="Tasks Scores \n (1-10 Higher is better)")

        # Add columns for the table
        table.add_column("Tasks/Crew")
        for run in runs:
            table.add_column(f"Run {run}")
        table.add_column("Avg. Total")

        # Add rows for each task
        for task_index, avg_score in enumerate(task_averages):
            task_scores = [scores[task_index] for scores in scores_by_run]
            table.add_row(
                f"Task {task_index + 1}", *map(str, task_scores), f"{avg_score:.1f}"
            )

        # Add a row for the crew average
        crew_scores = [sum(scores) / len(scores) for scores in scores_by_run]
        table.add_row("Crew", *map(str, crew_scores), f"{crew_average:.1f}")

        # Display the table in the terminal
        console = Console()
        console.print(table)

    def evaluate(self, task_output: TaskOutput):
        """
        Evaluates the performance of the agents in the crew based on the tasks they have performed.

        The task being evaluated is the first crew task whose description
        equals `task_output.description`. Its score is appended to
        `tasks_scores` under the current iteration.

        Args:
            task_output: The output produced by one of the crew's tasks.

        Raises:
            ValueError: If `task_output` is missing, if no crew task matches it,
                or if the evaluation result is not a TaskEvaluationPydanticOutput.
        """
        missing_input_message = (
            "Task to evaluate and task output are required for evaluation"
        )

        # Validate the output before reading its attributes; otherwise a
        # missing output surfaces as AttributeError instead of ValueError.
        if not task_output:
            raise ValueError(missing_input_message)

        current_task = next(
            (
                task
                for task in self.crew.tasks
                if task.description == task_output.description
            ),
            None,
        )
        if not current_task:
            raise ValueError(missing_input_message)

        evaluator_agent = self._evaluator_agent()
        evaluation_task = self._evaluation_task(
            evaluator_agent, current_task, task_output.raw
        )

        evaluation_result = evaluation_task.execute_sync()

        if isinstance(evaluation_result.pydantic, TaskEvaluationPydanticOutput):
            self.tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
        else:
            raise ValueError("Evaluation result is not in the expected format")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
113 changes: 113 additions & 0 deletions
113
tests/utilities/evaluators/test_crew_evaluator_handler.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
from unittest import mock | ||
|
||
import pytest | ||
|
||
from crewai.agent import Agent | ||
from crewai.crew import Crew | ||
from crewai.task import Task | ||
from crewai.tasks.task_output import TaskOutput | ||
from crewai.utilities.evaluators.crew_evaluator_handler import ( | ||
CrewEvaluator, | ||
TaskEvaluationPydanticOutput, | ||
) | ||
|
||
|
||
class TestCrewEvaluator:
    """Unit tests for CrewEvaluator."""

    @pytest.fixture
    def crew_evaluator(self):
        """Return a CrewEvaluator wrapping a crew with one agent and one task."""
        worker = Agent(role="Agent 1", goal="Goal 1", backstory="Backstory 1")
        only_task = Task(
            description="Task 1",
            expected_output="Output 1",
            agent=worker,
        )
        single_task_crew = Crew(agents=[worker], tasks=[only_task])

        return CrewEvaluator(single_task_crew, openai_model_name="gpt-4o-mini")

    def test_setup_for_evaluating(self, crew_evaluator):
        """The evaluator's `evaluate` method is installed as the task callback."""
        crew_evaluator._setup_for_evaluating()
        first_task = crew_evaluator.crew.tasks[0]
        assert first_task.callback == crew_evaluator.evaluate

    def test_set_iteration(self, crew_evaluator):
        """`set_iteration` stores the given iteration number."""
        crew_evaluator.set_iteration(1)
        assert crew_evaluator.iteration == 1

    def test_evaluator_agent(self, crew_evaluator):
        """The evaluator agent has the expected role, goal, backstory and model."""
        expected_goal = "Your goal is to evaluate the performance of the agents in the crew based on the tasks they have performed using score from 1 to 10 evaluating on completion, quality, and overall performance."
        expected_backstory = "Evaluator agent for crew evaluation with precise capabilities to evaluate the performance of the agents in the crew based on the tasks they have performed"

        judge = crew_evaluator._evaluator_agent()

        assert judge.role == "Task Execution Evaluator"
        assert judge.goal == expected_goal
        assert judge.backstory == expected_backstory
        assert judge.verbose is False
        assert judge.llm.model_name == "gpt-4o-mini"

    def test_evaluation_task(self, crew_evaluator):
        """The evaluation task embeds the evaluated task's details in its description."""
        instructions = "Based on the task description and the expected output, compare and evaluate the performance of the agents in the crew based on the Task Output they have performed using score from 1 to 10 evaluating on completion, quality, and overall performance."
        # The production code concatenates the details directly after the
        # instructions, with no separating space.
        details = (
            "task_description: Task 1 task_expected_output: Output 1 "
            "agent: Agent 1 agent_goal: Goal 1 Task Output: Task Output 1"
        )

        judge = Agent(
            role="Evaluator Agent",
            goal="Evaluate the performance of the agents in the crew",
            backstory="Master in Evaluation",
        )
        evaluated_task = Task(
            description="Task 1",
            expected_output="Output 1",
            agent=Agent(role="Agent 1", goal="Goal 1", backstory="Backstory 1"),
        )

        evaluation = crew_evaluator._evaluation_task(
            judge, evaluated_task, "Task Output 1"
        )

        assert evaluation.description.startswith(instructions)
        assert evaluation.agent == judge
        assert evaluation.description == instructions + details

    @mock.patch("crewai.utilities.evaluators.crew_evaluator_handler.Console")
    @mock.patch("crewai.utilities.evaluators.crew_evaluator_handler.Table")
    def test_print_crew_evaluation_result(self, table, console, crew_evaluator):
        """Two runs of three tasks produce the expected columns, rows and averages."""
        crew_evaluator.tasks_scores = {1: [10, 9, 8], 2: [9, 8, 7]}

        crew_evaluator.print_crew_evaluation_result()

        rendered = mock.call()
        expected_table_calls = [
            mock.call(title="Tasks Scores \n (1-10 Higher is better)"),
            rendered.add_column("Tasks/Crew"),
            rendered.add_column("Run 1"),
            rendered.add_column("Run 2"),
            rendered.add_column("Avg. Total"),
            rendered.add_row("Task 1", "10", "9", "9.5"),
            rendered.add_row("Task 2", "9", "8", "8.5"),
            rendered.add_row("Task 3", "8", "7", "7.5"),
            rendered.add_row("Crew", "9.0", "8.0", "8.5"),
        ]
        table.assert_has_calls(expected_table_calls)
        console.assert_has_calls([mock.call(), mock.call().print(table())])

    def test_evaluate(self, crew_evaluator):
        """A valid evaluation result is stored under the current (default) iteration."""
        produced_output = TaskOutput(
            description="Task 1", agent=str(crew_evaluator.crew.agents[0])
        )

        with mock.patch.object(Task, "execute_sync") as execute_sync:
            execute_sync.return_value.pydantic = TaskEvaluationPydanticOutput(
                quality=9.5
            )
            crew_evaluator.evaluate(produced_output)

        assert crew_evaluator.tasks_scores[0] == [9.5]