Merge pull request #30 from confident-ai/main
Merge from main
Anindyadeep authored Feb 4, 2024
2 parents 3457635 + 0728b19 commit b07020a
Showing 71 changed files with 1,793 additions and 426 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/deepeval-results.yml
@@ -43,6 +43,11 @@ jobs:
- name: Check if 'deepeval' script is available
run: ls -l $(poetry env info --path)/bin/deepeval || echo "deepeval script not found"

- name: Run deepeval login
env:
CONFIDENT_API_KEY: ${{ secrets.CONFIDENT_API_KEY }}
run: poetry run deepeval login --confident-api-key "$CONFIDENT_API_KEY"

- name: Run deepeval tests and capture output
run: poetry run deepeval test run tests/test_quickstart.py > output.txt 2>&1

2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -65,4 +65,4 @@ jobs:
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
run: |
poetry run pytest tests/ --ignore=tests/test_g_eval.py
poetry run pytest tests/
1 change: 0 additions & 1 deletion README.md
@@ -47,7 +47,6 @@ Whether your application is implemented via RAG or fine-tuning, LangChain or Lla
- Contextual Recall
- Contextual Precision
- RAGAS
- Toxicity
- Hallucination
- Toxicity
- Bias
2 changes: 1 addition & 1 deletion deepeval/_version.py
@@ -1 +1 @@
__version__: str = "0.20.49"
__version__: str = "0.20.56"
8 changes: 7 additions & 1 deletion deepeval/cli/test.py
@@ -1,10 +1,11 @@
import pytest
import typer
import os
import json
from typing_extensions import Annotated
from typing import Optional
from deepeval.test_run import test_run_manager, TEMP_FILE_NAME
from deepeval.utils import delete_file_if_exists
from deepeval.utils import delete_file_if_exists, get_deployment_configs
from deepeval.test_run import invoke_test_run_end_hook
from deepeval.telemetry import capture_evaluation_count

@@ -56,6 +57,11 @@ def run(
if exit_on_first_failure:
pytest_args.insert(0, "-x")

deployment_configs = get_deployment_configs()
if deployment_configs is not None:
deployment_configs_json = json.dumps(deployment_configs)
pytest_args.extend(["--deployment", deployment_configs_json])

pytest_args.extend(
[
"--verbose" if verbose else "--quiet",
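For context, a minimal sketch of the flow this hunk introduces, assuming a hypothetical shape for the deployment config dict (only get_deployment_configs and the --deployment flag appear in the diff):

import json

# Hypothetical return value for illustration; the real get_deployment_configs()
# lives in deepeval.utils and may return None outside a deployment/CI context.
deployment_configs = {"is_github_actions": True, "branch": "main"}

pytest_args = ["tests/test_quickstart.py", "-x"]
if deployment_configs is not None:
    # Serialize to JSON so the value survives as a single CLI argument and can
    # be parsed back by whatever consumes the custom --deployment pytest option.
    pytest_args.extend(["--deployment", json.dumps(deployment_configs)])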
1 change: 1 addition & 0 deletions deepeval/dataset/api.py
@@ -7,6 +7,7 @@ class Golden(BaseModel):
actual_output: Optional[str] = Field(None, alias="actualOutput")
expected_output: Optional[str] = Field(None, alias="expectedOutput")
context: Optional[list] = Field(None)
retrieval_context: Optional[list] = Field(None, alias="retrievalContext")
additional_metadata: Optional[Dict] = Field(
None, alias="additionalMetadata"
)
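A minimal sketch of populating the new field. The aliased keyword names follow the Field aliases visible in the diff; the input field is assumed to exist on Golden:

from deepeval.dataset.api import Golden

# Aliased keys are used because the fields are declared with Field(..., alias=...).
golden = Golden(
    input="What does the refund policy cover?",
    actualOutput="Refunds are available within 30 days.",
    expectedOutput="Purchases can be refunded within 30 days of delivery.",
    retrievalContext=["Our policy allows refunds within 30 days of delivery."],
)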
33 changes: 26 additions & 7 deletions deepeval/dataset/dataset.py
@@ -3,7 +3,6 @@
from rich.console import Console
import json
import webbrowser
import os

from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase
@@ -26,9 +25,20 @@ class EvaluationDataset:
test_cases: List[LLMTestCase]
goldens: List[Golden]

def __init__(self, test_cases: List[LLMTestCase] = []):
self.test_cases = test_cases
self.goldens = []
def __init__(
self,
alias: Optional[str] = None,
goldens: Optional[List[Golden]] = None,
test_cases: Optional[List[LLMTestCase]] = None,
):
if test_cases is not None:
for test_case in test_cases:
test_case.dataset_alias = alias
self.test_cases = test_cases
else:
self.test_cases = []
self.goldens = goldens or []
self.alias = alias

def add_test_case(self, test_case: LLMTestCase):
self.test_cases.append(test_case)
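A minimal usage sketch of the updated constructor (the alias value and test case contents are placeholders):

from deepeval.dataset import EvaluationDataset
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="What is DeepEval?",
    actual_output="DeepEval is an open-source framework for evaluating LLM applications.",
)

# The alias is stamped onto each test case via dataset_alias; with this change,
# calling dataset.evaluate(...) on an empty dataset raises a ValueError.
dataset = EvaluationDataset(alias="quickstart-dataset", test_cases=[test_case])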
@@ -39,6 +49,11 @@ def __iter__(self):
def evaluate(self, metrics: List[BaseMetric]):
from deepeval import evaluate

if len(self.test_cases) == 0:
raise ValueError(
"No test cases found in evaluation dataset. Unable to evaluate empty dataset."
)

return evaluate(self.test_cases, metrics)

def add_test_cases_from_csv_file(
@@ -109,6 +124,7 @@ def get_column_data(df: pd.DataFrame, col_name: str, default=None):
actual_output=actual_output,
expected_output=expected_output,
context=context,
dataset_alias=self.alias,
)
)

@@ -171,6 +187,7 @@ def add_test_cases_from_json_file(
actual_output=actual_output,
expected_output=expected_output,
context=context,
dataset_alias=self.alias,
)
)

@@ -238,6 +255,7 @@ def add_test_cases_from_hf_dataset(
actual_output=actual_output,
expected_output=expected_output,
context=context,
dataset_alias=self.alias,
)
)

@@ -274,6 +292,7 @@ def push(self, alias: str):

def pull(self, alias: str, auto_convert_goldens_to_test_cases: bool = True):
if is_confident():
self.alias = alias
api = Api()
result = api.get_request(
endpoint=Endpoints.DATASET_ENDPOINT.value,
@@ -284,10 +303,10 @@ def pull(self, alias: str, auto_convert_goldens_to_test_cases: bool = True):
goldens=result["goldens"],
)

self.goldens = response.goldens

if auto_convert_goldens_to_test_cases:
self.test_cases = convert_goldens_to_test_cases(self.goldens)
self.test_cases = convert_goldens_to_test_cases(
response.goldens, alias
)
else:
raise Exception(
"Run `deepeval login` to pull dataset from Confident AI"
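A minimal sketch of pulling a dataset after this change (requires a prior deepeval login; the alias is a placeholder):

from deepeval.dataset import EvaluationDataset

dataset = EvaluationDataset()
# Fetches goldens for the alias from Confident AI and, by default, converts
# them to test cases tagged with that alias via dataset_alias.
dataset.pull(alias="my-dataset")
print(len(dataset.test_cases))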
8 changes: 6 additions & 2 deletions deepeval/dataset/utils.py
@@ -1,4 +1,4 @@
from typing import List
from typing import List, Optional
from deepeval.dataset.api import Golden
from deepeval.test_case import LLMTestCase

@@ -18,14 +18,18 @@ def convert_test_cases_to_goldens(
return goldens


def convert_goldens_to_test_cases(goldens: List[Golden]) -> List[LLMTestCase]:
def convert_goldens_to_test_cases(
goldens: List[Golden], dataset_alias: Optional[str] = None
) -> List[LLMTestCase]:
test_cases = []
for golden in goldens:
test_case = LLMTestCase(
input=golden.input,
actual_output=golden.actual_output,
expected_output=golden.expected_output,
context=golden.context,
retrieval_context=golden.retrieval_context,
dataset_alias=dataset_alias,
)
test_cases.append(test_case)
return test_cases
Empty file.
1 change: 1 addition & 0 deletions deepeval/integrations/harness/__init__.py
@@ -0,0 +1 @@
from deepeval.integrations.harness.callback import DeepEvalHarnessCallback
26 changes: 26 additions & 0 deletions deepeval/integrations/harness/callback.py
@@ -0,0 +1,26 @@
from typing import List, Union


# from deepeval.experimental import BaseEvaluationExperiment

try:
from transformers.trainer_callback import TrainerCallback

class DeepEvalHarnessCallback(TrainerCallback):
"""
A [transformers.TrainerCallback] that logs various harness LLM evaluation metrics to DeepEval
"""

def __init__(self, experiments):
super().__init__()
self.experiments = experiments

raise NotImplementedError("DeepEvalHarnessCallback is WIP")

except ImportError:

class DeepEvalHarnessCallback:
def __init__(self, *args, **kwargs):
raise ImportError(
"The 'transformers' library is required to use the DeepEvalHarnessCallback."
)
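A small sketch of the behavior in this commit: the callback is marked WIP, so using it raises NotImplementedError when transformers is installed, and the fallback class raises ImportError when it is not.

try:
    from deepeval.integrations.harness import DeepEvalHarnessCallback

    # Instantiation is illustrative only; the callback is not functional yet.
    callback = DeepEvalHarnessCallback(experiments=[])
except (NotImplementedError, ImportError) as err:
    # Either the WIP guard or the missing-dependency fallback lands here.
    print(f"DeepEvalHarnessCallback is not usable yet: {err}")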
1 change: 1 addition & 0 deletions deepeval/integrations/hugging_face/__init__.py
@@ -0,0 +1 @@
from deepeval.integrations.hugging_face import DeepEvalHuggingFaceCallback
(Diffs for the remaining changed files are not shown.)
