
Adding experiment_result class for backup
Anindyadeep committed Mar 5, 2024
1 parent fc26b5d commit 9fef28a
Showing 1 changed file with 0 additions and 100 deletions: deepeval/experimental/harness/experiment.py
@@ -99,103 +99,3 @@ def run(self, tasks: Union[str, List[str]]):
        print(config_to_pass)
        results = self.evaluator(tasks=tasks, config=config_to_pass)
        return results


# class EvaluationResult(BaseModel):
# task_name: str = Field(..., description="The name of the task")
# saved_path: Union[str, PosixPath] = Field(
# ..., description="The evaluation csv path."
# )
# accuracy: float = Field(
# ...,
# description="The final value of the number of test cases passed out of total number of test cases.",
# )
# std_dev: float = Field(..., description="Standard deviation of the results")


# class EvaluationResult(BaseEvaluationExperiment):
# def __init__(self, task_name: str, task_limit: int) -> None:
# self.task_name, self.task_limit = task_name, task_limit

# def visualize_result(
# self, sub_task: Optional[Union[str, List[str]]] = None
# ) -> None:
# raise NotImplementedError

# def collect_results_and_push(self, push_to_hub: Optional[bool] = False):
# all_task_data = HarnessTasks.get_dataset_from_task(
# task_names=self.task_name, task_limit=self.task_limit
# )
# all_task_results = {}
# all_jsonl = os.listdir(self.experiment_folder)

# for task_name, task_data in all_task_data.items():
# test_cases = []
# # todo: this is not applicable, need to find the string which matches

# try:
# closest_match = difflib.get_close_matches(
# task_name, all_jsonl, n=1
# )
# task_jsonl = closest_match[0]

# task_evaluation_dict = json.load(
# open(self.experiment_folder / task_jsonl, "r")
# )
# all_responses, is_correct_overall = [], []
# # also collect the accuracy
# scores = []
# all_prompts, all_targets = [], []

# for eval_dict in task_evaluation_dict:
# responses = eval_dict["filtered_resps"]
# filtered_responses, is_response_correct = zip(*responses)
# all_responses.extend(list(filtered_responses))
# is_correct_overall.extend(list(is_response_correct))
# scores.append(eval_dict["acc"])

# for _, prompt, target, response in zip(
# task_data["doc_id"],
# task_data["prompt"],
# task_data["target"],
# all_responses,
# ):
# test_case = LLMTestCase(
# input=prompt,
# actual_output=str(response),
# expected_output=str(target),
# )
# test_cases.append(test_case)
# all_prompts.append(prompt)
# all_targets.append(target)

# dataset = EvaluationDataset(test_cases=test_cases)
# # Provide an alias when pushing a dataset
# dataset.evaluate(
# metrics=[ExactMatchAccuracyMetric(minimum_score=0.5)]
# )

# if push_to_hub:
# # this is very unstable, for each task it opens a new window in confident-ai.
# dataset.push(alias=task_name)

# # do not save in the memory
# pd.DataFrame(
# {
# "id": list(range(1, len(all_prompts) + 1)),
# "prompt": all_prompts,
# "target": all_targets,
# "response": all_responses,
# "is_correct": is_correct_overall,
# }
# ).to_csv(self.evaluation_csvs_folder / f"{task_name}.csv"),

# all_task_results[task_name] = EvaluationResult(
# task_name=task_name,
# saved_path=self.evaluation_csvs_folder / f"{task_name}.csv",
# accuracy=sum(scores) / len(scores),
# )
# except Exception as e:
# print(f"Task {task_name} not found or not run.\nError: {e}")
# continue
# return all_task_results
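
For reference, a minimal runnable sketch of the result model being backed up, reconstructed from the commented-out snippet deleted above. The class and field names mirror the removed code; the default for std_dev, the use of Path instead of PosixPath, and the usage values are assumptions for illustration only, not part of the commit.

# Sketch reconstructed from the removed snippet; not part of this commit.
from pathlib import Path
from typing import Union

from pydantic import BaseModel, Field


class EvaluationResult(BaseModel):
    task_name: str = Field(..., description="The name of the task")
    # Assumption: Path instead of PosixPath so the sketch runs on any OS.
    saved_path: Union[str, Path] = Field(
        ..., description="The evaluation csv path."
    )
    accuracy: float = Field(
        ...,
        description="Number of test cases passed out of the total number of test cases.",
    )
    # Assumption: default to 0.0 because the removed collection code never set std_dev.
    std_dev: float = Field(0.0, description="Standard deviation of the results")


if __name__ == "__main__":
    # Hypothetical values, for illustration only.
    result = EvaluationResult(
        task_name="hellaswag",
        saved_path="evaluation_csvs/hellaswag.csv",
        accuracy=0.73,
    )
    print(result)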
