forked from confident-ai/deepeval
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathb.py
96 lines (87 loc) · 2.81 KB
/
b.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from typing import List
from deepeval.metrics.base_metric import BaseMetric
from deepeval.metrics.utils import check_llm_test_case_params
from deepeval.test_case import LLMTestCase, ConversationalTestCase
from deepeval import confident_evaluate, evaluate
from deepeval.metrics import (
AnswerRelevancyMetric,
BiasMetric,
FaithfulnessMetric,
ConversationCompletenessMetric,
SummarizationMetric,
)
from deepeval.test_case.llm_test_case import LLMTestCaseParams
def _single_turn() -> LLMTestCase:
    """Build the one placeholder turn used by both conversations below.

    A new ``LLMTestCase`` is constructed on every call, so the two
    conversations never share a turn object.
    """
    return LLMTestCase(
        input="Message input",
        actual_output="Message actual output",
    )


# Single-turn conversation with an explicit chatbot role; this is the case
# passed to confident_evaluate() later in this script.
test_case = ConversationalTestCase(
    chatbot_role="A programmer",
    turns=[_single_turn()],
)

# Same single-turn conversation, but without a chatbot role. No active
# (non-commented) statement in the visible script refers to it.
test_case2 = ConversationalTestCase(turns=[_single_turn()])
# Test-case fields grouped under the name "required": the input, the actual
# output and the retrieval context. No active (non-commented) statement in the
# visible script reads this list.
required_params: List[LLMTestCaseParams] = [
    LLMTestCaseParams.INPUT,
    LLMTestCaseParams.ACTUAL_OUTPUT,
    LLMTestCaseParams.RETRIEVAL_CONTEXT,
]
from deepeval.metrics import GEval
# correctness_metric = GEval(
# name="Correctness",
# criteria="Determine whether the actual output is factually correct based on the expected output.",
# # NOTE: you can only provide either criteria or evaluation_steps, and not both
# evaluation_steps=[
# "Check whether the facts in 'actual output' contradict any facts in 'expected output'",
# "You should also heavily penalize omission of detail",
# "Vague language, or contradicting OPINIONS, are OK",
# ],
# evaluation_params=[
# LLMTestCaseParams.INPUT,
# LLMTestCaseParams.ACTUAL_OUTPUT,
# ],
# )
# evaluate(
# test_cases=[
# LLMTestCase(
# input="Message input number 1!",
# actual_output="Message actual output number 1...",
# retrieval_context=["I love dogs"],
# ),
# LLMTestCase(
# input="Message input 2, this is just a test",
# actual_output="Message actual output 2, this is just a test",
# retrieval_context=["I love dogs"],
# ),
# ],
# metrics=[
# # correctness_metric,
# # AnswerRelevancyMetric(),
# # BiasMetric(),
# SummarizationMetric(verbose_mode=True, truths_extraction_limit=3),
# FaithfulnessMetric(verbose_mode=True, truths_extraction_limit=3),
# ],
# # throttle_value=10,
# # max_concurrent=1,
# )
# The only active call in this script: hand the role-bearing conversation to
# confident_evaluate() under the experiment name "Convo". No metrics are
# passed here.
# NOTE(review): presumably the evaluation itself runs on Confident AI --
# confirm against the deepeval documentation.
confident_evaluate(
    experiment_name="Convo",
    test_cases=[test_case],
)
# evaluate(
# test_cases=[
# LLMTestCase(
# input="Message input", actual_output="Message actual output"
# )
# ],
# metrics=[
# AnswerRelevancyMetric(),
# BiasMetric(),
# FaithfulnessMetric(),
# ConversationCompletenessMetric(),
# ],
# run_async=True,
# ignore_errors=True,
# )