From d5d067e78a2d022675bf6a2602b0a7ff571289a2 Mon Sep 17 00:00:00 2001 From: recursix Date: Wed, 4 Sep 2024 14:57:54 -0400 Subject: [PATCH 01/58] core functions --- src/agentlab/experiments/graph_execution.py | 92 +++++++++++++++++++++ tests/experiments/test_graph_execution.py | 82 ++++++++++++++++++ 2 files changed, 174 insertions(+) create mode 100644 src/agentlab/experiments/graph_execution.py create mode 100644 tests/experiments/test_graph_execution.py diff --git a/src/agentlab/experiments/graph_execution.py b/src/agentlab/experiments/graph_execution.py new file mode 100644 index 00000000..35bb7441 --- /dev/null +++ b/src/agentlab/experiments/graph_execution.py @@ -0,0 +1,92 @@ +from dask import compute, delayed +from dask.distributed import Client +from browsergym.experiments.loop import ExpArgs +import logging + + +def run_experiments(n_workers, exp_args_list: list[ExpArgs], exp_dir): + """Run a list of experiments in parallel while respecting dependencies.""" + + logging.info(f"Saving experiments to {exp_dir}") + for exp_args in exp_args_list: + exp_args.agent_args.prepare() + exp_args.prepare(exp_root=exp_dir) + + try: + execute_task_graph(Client(n_workers=n_workers), exp_args_list) + finally: + logging.info("All jobs are finished. Calling agent_args.close() on all agents...") + for exp_args in exp_args_list: + exp_args.agent_args.close() + logging.info("Experiment finished.") + + +def _run(exp_arg: ExpArgs, *dependencies): + """Capture dependencies to ensure they are run before the current task.""" + return exp_arg.run() + + +def execute_task_graph(dask_client, exp_args_list: list[ExpArgs]): + """Execute a task graph in parallel while respecting dependencies.""" + exp_args_map = {exp_args.exp_id: exp_args for exp_args in exp_args_list} + + with dask_client: + tasks = {} + + def get_task(exp_arg: ExpArgs): + if exp_arg.exp_id not in tasks: + dependencies = [get_task(exp_args_map[dep_key]) for dep_key in exp_arg.depends_on] + tasks[exp_arg.exp_id] = delayed(_run)(exp_arg, *dependencies) + return tasks[exp_arg.exp_id] + + for exp_arg in exp_args_list: + get_task(exp_arg) + + task_ids, task_list = zip(*tasks.items()) + results = compute(*task_list) + + return {task_id: result for task_id, result in zip(task_ids, results)} + + +def add_dependencies(exp_args_list: list[ExpArgs], task_dependencies: dict[list] = None): + """Add dependencies to a list of ExpArgs. + + Args: + exp_args_list: list[ExpArgs] + A list of experiments to run. + task_dependencies: dict + A dictionary mapping task names to a list of task names that they + depend on. If None or empty, no dependencies are added. + + Returns: + list[ExpArgs] + The modified exp_args_list with dependencies added. + """ + + if task_dependencies is None or all([len(dep) == 0 for dep in task_dependencies.values()]): + # nothing to be done + return exp_args_list + + exp_args_map = {exp_args.env_args.task_name: exp_args for exp_args in exp_args_list} + if len(exp_args_map) != len(exp_args_list): + raise ValueError( + ( + "Task names are not unique in exp_args_map, " + "you can't run multiple seeds with task dependencies." + ) + ) + + for task_name in exp_args_map.keys(): + if task_name not in task_dependencies: + raise ValueError(f"Task {task_name} is missing from task_dependencies") + + # turn dependencies from task names to exp_ids + for task_name, exp_args in exp_args_map.items(): + + exp_args.depends_on = [ + exp_args_map[dep_name].exp_id + for dep_name in task_dependencies[task_name] + if dep_name in exp_args_map # ignore dependencies that are not to be run + ] + + return exp_args_list diff --git a/tests/experiments/test_graph_execution.py b/tests/experiments/test_graph_execution.py new file mode 100644 index 00000000..9af01d49 --- /dev/null +++ b/tests/experiments/test_graph_execution.py @@ -0,0 +1,82 @@ +from dask.distributed import Client +import pytest +from agentlab.experiments.graph_execution import execute_task_graph, add_dependencies +from time import time, sleep +from browsergym.experiments.loop import ExpArgs, EnvArgs + + +# Mock implementation of the ExpArgs class with timestamp checks +class MockedExpArgs: + def __init__(self, task_id, depends_on=None): + self.task_id = task_id + self.depends_on = depends_on if depends_on else [] + self.start_time = None + self.end_time = None + + def run(self): + self.start_time = time() + sleep(0.5) # Simulate task execution time + self.end_time = time() + return self + + +def test_execute_task_graph(): + # Define a list of ExpArgs with dependencies + exp_args_list = [ + MockedExpArgs(task_id="task1", depends_on=[]), + MockedExpArgs(task_id="task2", depends_on=["task1"]), + MockedExpArgs(task_id="task3", depends_on=["task1"]), + MockedExpArgs(task_id="task4", depends_on=["task2", "task3"]), + ] + + # Execute the task graph + results = execute_task_graph(Client(n_workers=3), exp_args_list) + + exp_args_list = [results[task_id] for task_id in ["task1", "task2", "task3", "task4"]] + + # Verify that all tasks were executed in the proper order + assert exp_args_list[0].start_time < exp_args_list[1].start_time + assert exp_args_list[0].start_time < exp_args_list[2].start_time + assert exp_args_list[1].end_time < exp_args_list[3].start_time + assert exp_args_list[2].end_time < exp_args_list[3].start_time + + # Verify that parallel tasks (task2 and task3) started within a short time of each other + parallel_start_diff = abs(exp_args_list[1].start_time - exp_args_list[2].start_time) + assert parallel_start_diff < 0.1 # Allow for a small delay + + # Ensure that the entire task graph took the expected amount of time + total_time = exp_args_list[-1].end_time - exp_args_list[0].start_time + assert total_time >= 1.5 # Since the critical path involves at least 1.5 seconds of work + + +def test_add_dependencies(): + # Prepare a simple list of ExpArgs + + def make_exp_args(task_name, exp_id): + return ExpArgs(agent_args=None, env_args=EnvArgs(task_name=task_name), exp_id=exp_id) + + exp_args_list = [ + make_exp_args("task1", "1"), + make_exp_args("task2", "2"), + make_exp_args("task3", "3"), + ] + + # Define simple task_dependencies + task_dependencies = {"task1": ["task2"], "task2": [], "task3": ["task1"]} + + # Call the function + modified_list = add_dependencies(exp_args_list, task_dependencies) + + # Verify dependencies + assert modified_list[0].depends_on == ["2"] # task1 depends on task2 + assert modified_list[1].depends_on == [] # task2 has no dependencies + assert modified_list[2].depends_on == ["1"] # task3 depends on task1 + + # assert raise if task_dependencies is wrong + task_dependencies = {"task1": ["task2"], "task2": [], "task4": ["task3"]} + with pytest.raises(ValueError): + add_dependencies(exp_args_list, task_dependencies) + + +if __name__ == "__main__": + test_add_dependencies() From df2aaebe647509ecb08b8a91c7667b84ce9faced Mon Sep 17 00:00:00 2001 From: recursix Date: Wed, 4 Sep 2024 15:56:13 -0400 Subject: [PATCH 02/58] switch to dask --- src/agentlab/experiments/graph_execution.py | 4 ++-- src/agentlab/experiments/launch_exp.py | 8 +++----- tests/experiments/test_graph_execution.py | 20 ++++++++++---------- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/src/agentlab/experiments/graph_execution.py b/src/agentlab/experiments/graph_execution.py index 35bb7441..8e82b92b 100644 --- a/src/agentlab/experiments/graph_execution.py +++ b/src/agentlab/experiments/graph_execution.py @@ -83,10 +83,10 @@ def add_dependencies(exp_args_list: list[ExpArgs], task_dependencies: dict[list] # turn dependencies from task names to exp_ids for task_name, exp_args in exp_args_map.items(): - exp_args.depends_on = [ + exp_args.depends_on = tuple( exp_args_map[dep_name].exp_id for dep_name in task_dependencies[task_name] if dep_name in exp_args_map # ignore dependencies that are not to be run - ] + ) return exp_args_list diff --git a/src/agentlab/experiments/launch_exp.py b/src/agentlab/experiments/launch_exp.py index 14fcbf0e..5d0c400a 100644 --- a/src/agentlab/experiments/launch_exp.py +++ b/src/agentlab/experiments/launch_exp.py @@ -4,7 +4,8 @@ from pathlib import Path from browsergym.experiments.loop import ExpArgs, yield_all_exp_results -from joblib import Parallel, delayed +from agentlab.experiments.graph_execution import execute_task_graph +from dask.distributed import Client def import_object(path: str): @@ -24,10 +25,7 @@ def run_experiments(n_jobs, exp_args_list: list[ExpArgs], exp_dir): exp_args.prepare(exp_root=exp_dir) try: - prefer = "processes" - Parallel(n_jobs=n_jobs, prefer=prefer)( - delayed(exp_args.run)() for exp_args in exp_args_list - ) + execute_task_graph(Client(n_workers=n_jobs), exp_args_list) finally: # will close servers even if there is an exception or ctrl+c # servers won't be closed if the script is killed with kill -9 or segfaults. diff --git a/tests/experiments/test_graph_execution.py b/tests/experiments/test_graph_execution.py index 9af01d49..8b11c4a1 100644 --- a/tests/experiments/test_graph_execution.py +++ b/tests/experiments/test_graph_execution.py @@ -7,8 +7,8 @@ # Mock implementation of the ExpArgs class with timestamp checks class MockedExpArgs: - def __init__(self, task_id, depends_on=None): - self.task_id = task_id + def __init__(self, exp_id, depends_on=None): + self.exp_id = exp_id self.depends_on = depends_on if depends_on else [] self.start_time = None self.end_time = None @@ -23,10 +23,10 @@ def run(self): def test_execute_task_graph(): # Define a list of ExpArgs with dependencies exp_args_list = [ - MockedExpArgs(task_id="task1", depends_on=[]), - MockedExpArgs(task_id="task2", depends_on=["task1"]), - MockedExpArgs(task_id="task3", depends_on=["task1"]), - MockedExpArgs(task_id="task4", depends_on=["task2", "task3"]), + MockedExpArgs(exp_id="task1", depends_on=[]), + MockedExpArgs(exp_id="task2", depends_on=["task1"]), + MockedExpArgs(exp_id="task3", depends_on=["task1"]), + MockedExpArgs(exp_id="task4", depends_on=["task2", "task3"]), ] # Execute the task graph @@ -68,9 +68,9 @@ def make_exp_args(task_name, exp_id): modified_list = add_dependencies(exp_args_list, task_dependencies) # Verify dependencies - assert modified_list[0].depends_on == ["2"] # task1 depends on task2 - assert modified_list[1].depends_on == [] # task2 has no dependencies - assert modified_list[2].depends_on == ["1"] # task3 depends on task1 + assert modified_list[0].depends_on == ("2",) # task1 depends on task2 + assert modified_list[1].depends_on == () # task2 has no dependencies + assert modified_list[2].depends_on == ("1",) # task3 depends on task1 # assert raise if task_dependencies is wrong task_dependencies = {"task1": ["task2"], "task2": [], "task4": ["task3"]} @@ -79,4 +79,4 @@ def make_exp_args(task_name, exp_id): if __name__ == "__main__": - test_add_dependencies() + test_execute_task_graph() From edb162c79dafd3c80f9072d8227d9e39a998895e Mon Sep 17 00:00:00 2001 From: recursix Date: Wed, 4 Sep 2024 16:56:01 -0400 Subject: [PATCH 03/58] removing joblib dependency and adding dask --- requirements.txt | 2 +- src/agentlab/llm/llm_utils.py | 19 ------------------- 2 files changed, 1 insertion(+), 20 deletions(-) diff --git a/requirements.txt b/requirements.txt index 198ef904..5e4d7764 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ browsergym -joblib>=1.2.0 +dask openai>=1.7,<2 langchain>=0.1,<1 langchain_openai diff --git a/src/agentlab/llm/llm_utils.py b/src/agentlab/llm/llm_utils.py index 0e68e082..1a8d8b70 100644 --- a/src/agentlab/llm/llm_utils.py +++ b/src/agentlab/llm/llm_utils.py @@ -7,16 +7,13 @@ import re import time from functools import cache -from pathlib import Path from typing import TYPE_CHECKING from warnings import warn import numpy as np import tiktoken import yaml -from joblib import Memory from langchain.schema import BaseMessage, HumanMessage, SystemMessage -from langchain_openai import ChatOpenAI from openai import BadRequestError, RateLimitError from PIL import Image from transformers import AutoModel, AutoTokenizer @@ -451,22 +448,6 @@ def parse_html_tags(text, keys=(), optional_keys=(), merge_multiple=False): return content_dict, valid, retry_message -class ChatCached: - # I wish I could extend ChatOpenAI, but it is somehow locked, I don't know if it's pydantic soercey. - - def __init__(self, chat, memory=None): - self.chat = chat - self.memory = memory if memory else Memory(location=Path.home() / "llm-cache", verbose=10) - self._call = self.memory.cache(self.chat.__call__, ignore=["self"]) - self._generate = self.memory.cache(self.chat.generate, ignore=["self"]) - - def __call__(self, messages): - return self._call(messages) - - def generate(self, messages): - return self._generate(messages) - - def download_and_save_model(model_name: str, save_dir: str = "."): model = AutoModel.from_pretrained(model_name) model.save_pretrained(save_dir) From 82ff348134f0155d6a1700c41bb8a2a6c4b2637c Mon Sep 17 00:00:00 2001 From: Thibault Le Sellier de Chezelles Date: Wed, 4 Sep 2024 17:11:00 -0400 Subject: [PATCH 04/58] fixing imports --- src/agentlab/agents/generic_agent/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/agentlab/agents/generic_agent/__init__.py b/src/agentlab/agents/generic_agent/__init__.py index d348daba..d9839c4d 100644 --- a/src/agentlab/agents/generic_agent/__init__.py +++ b/src/agentlab/agents/generic_agent/__init__.py @@ -2,15 +2,16 @@ AGENT_3_5, AGENT_8B, AGENT_70B, + RANDOM_SEARCH_AGENT, AGENT_4o, + AGENT_4o_MINI, AGENT_4o_VISION, - RANDOM_SEARCH_AGENT, ) - __all__ = [ "AGENT_3_5", "AGENT_4o", + "AGENT_4o_MINI", "AGENT_4o_VISION", "AGENT_70B", "AGENT_8B", From 0dbdd9831746cb79072c5a13027ce28bf240cd51 Mon Sep 17 00:00:00 2001 From: recursix Date: Wed, 11 Sep 2024 13:01:51 -0400 Subject: [PATCH 05/58] handles multiple backends --- src/agentlab/experiments/launch_exp.py | 38 +++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/src/agentlab/experiments/launch_exp.py b/src/agentlab/experiments/launch_exp.py index 5d0c400a..e088a5bd 100644 --- a/src/agentlab/experiments/launch_exp.py +++ b/src/agentlab/experiments/launch_exp.py @@ -5,7 +5,6 @@ from browsergym.experiments.loop import ExpArgs, yield_all_exp_results from agentlab.experiments.graph_execution import execute_task_graph -from dask.distributed import Client def import_object(path: str): @@ -18,14 +17,45 @@ def import_object(path: str): return obj -def run_experiments(n_jobs, exp_args_list: list[ExpArgs], exp_dir): +def run_experiments(n_jobs, exp_args_list: list[ExpArgs], exp_dir, parallel_backend="joblib"): + """Run a list of ExpArgs in parallel. + + To ensure optimal parallelism, make sure ExpArgs.depend_on is set correctly + and the backend is set to dask. + + Args: + n_jobs: int + Number of parallel jobs. + exp_args_list: list[ExpArgs] + List of ExpArgs objects. + exp_dir: Path + Directory where the experiments will be saved. + parallel_backend: str + Parallel backend to use. Either "joblib", "dask" or "sequential". + + """ logging.info(f"Saving experiments to {exp_dir}") for exp_args in exp_args_list: exp_args.agent_args.prepare() exp_args.prepare(exp_root=exp_dir) - try: - execute_task_graph(Client(n_workers=n_jobs), exp_args_list) + if parallel_backend == "joblib": + from joblib import Parallel, delayed + + Parallel(n_jobs=n_jobs, prefer="processes")( + delayed(exp_args.run)() for exp_args in exp_args_list + ) + + elif parallel_backend == "dask": + from dask.distributed import Client, LocalCluster + + cluster = LocalCluster(n_workers=n_jobs, processes=True) + execute_task_graph(Client(cluster), exp_args_list) + elif parallel_backend == "sequential": + for exp_args in exp_args_list: + exp_args.run() + else: + raise ValueError(f"Unknown parallel_backend: {parallel_backend}") finally: # will close servers even if there is an exception or ctrl+c # servers won't be closed if the script is killed with kill -9 or segfaults. From 7da5cac1a263076a458fcdc4e45c89142759ba7e Mon Sep 17 00:00:00 2001 From: recursix Date: Wed, 11 Sep 2024 13:02:09 -0400 Subject: [PATCH 06/58] ensure asyncio loop creation --- src/agentlab/experiments/graph_execution.py | 31 ++++++++------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/src/agentlab/experiments/graph_execution.py b/src/agentlab/experiments/graph_execution.py index 8e82b92b..382bdc1d 100644 --- a/src/agentlab/experiments/graph_execution.py +++ b/src/agentlab/experiments/graph_execution.py @@ -1,29 +1,22 @@ +import asyncio from dask import compute, delayed -from dask.distributed import Client from browsergym.experiments.loop import ExpArgs -import logging -def run_experiments(n_workers, exp_args_list: list[ExpArgs], exp_dir): - """Run a list of experiments in parallel while respecting dependencies.""" - - logging.info(f"Saving experiments to {exp_dir}") - for exp_args in exp_args_list: - exp_args.agent_args.prepare() - exp_args.prepare(exp_root=exp_dir) - +def _run(exp_arg: ExpArgs, *dependencies): + """Capture dependencies to ensure they are run before the current task.""" try: - execute_task_graph(Client(n_workers=n_workers), exp_args_list) - finally: - logging.info("All jobs are finished. Calling agent_args.close() on all agents...") - for exp_args in exp_args_list: - exp_args.agent_args.close() - logging.info("Experiment finished.") + # Create a new event loop + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + # Run the experiment in the new loop + result = loop.run_until_complete(asyncio.to_thread(exp_arg.run)) -def _run(exp_arg: ExpArgs, *dependencies): - """Capture dependencies to ensure they are run before the current task.""" - return exp_arg.run() + return result + finally: + # Clean up the event loop + loop.close() def execute_task_graph(dask_client, exp_args_list: list[ExpArgs]): From 25e241a9d2a739b367c88343ccd4e39f0d9fc327 Mon Sep 17 00:00:00 2001 From: recursix Date: Wed, 11 Sep 2024 13:02:31 -0400 Subject: [PATCH 07/58] more tests --- tests/experiments/test_graph_execution.py | 10 +++++++- tests/experiments/test_launch_exp.py | 31 +++++++++++++++++------ 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/tests/experiments/test_graph_execution.py b/tests/experiments/test_graph_execution.py index 8b11c4a1..7283e93e 100644 --- a/tests/experiments/test_graph_execution.py +++ b/tests/experiments/test_graph_execution.py @@ -15,6 +15,13 @@ def __init__(self, exp_id, depends_on=None): def run(self): self.start_time = time() + + # simulate playright code, (this was causing issues due to python async loop) + import playwright.sync_api + + pw = playwright.sync_api.sync_playwright().start() + pw.selectors.set_test_id_attribute("mytestid") + sleep(0.5) # Simulate task execution time self.end_time = time() return self @@ -30,7 +37,7 @@ def test_execute_task_graph(): ] # Execute the task graph - results = execute_task_graph(Client(n_workers=3), exp_args_list) + results = execute_task_graph(Client(n_workers=3, processes=True), exp_args_list) exp_args_list = [results[task_id] for task_id in ["task1", "task2", "task3", "task4"]] @@ -80,3 +87,4 @@ def make_exp_args(task_name, exp_id): if __name__ == "__main__": test_execute_task_graph() + # test_add_dependencies() diff --git a/tests/experiments/test_launch_exp.py b/tests/experiments/test_launch_exp.py index 98380506..bf87fa21 100644 --- a/tests/experiments/test_launch_exp.py +++ b/tests/experiments/test_launch_exp.py @@ -25,11 +25,8 @@ def test_relaunch_study(): assert len(exp_args_list) == 2 -if __name__ == "__main__": - test_relaunch_study() - - -def test_launch_system(): +@pytest.mark.repeat(3) # there was stochastic bug caused by asyncio loop not started +def test_launch_system(backend="dask"): exp_args_list = [] for seed in range(3): exp_args_list.append( @@ -45,16 +42,32 @@ def test_launch_system(): with tempfile.TemporaryDirectory() as tmp_dir: study_dir = make_study_dir(tmp_dir, "generic_agent_test") - run_experiments(n_jobs=3, exp_args_list=exp_args_list, exp_dir=study_dir) + run_experiments( + n_jobs=3, exp_args_list=exp_args_list, exp_dir=study_dir, parallel_backend=backend + ) results_df = inspect_results.load_result_df(study_dir, progress_fn=None) assert len(results_df) == len(exp_args_list) + for _, row in results_df.iterrows(): + if row.stack_trace is not None: + print(row.stack_trace) + assert row.err_msg is None + assert row.cum_reward == 1.0 + global_report = inspect_results.global_report(results_df) assert len(global_report) == 2 - assert global_report.avg_reward.iloc[0] == 1.0 assert global_report.std_err.iloc[0] == 0 assert global_report.n_completed.iloc[0] == "3/3" + assert global_report.avg_reward.iloc[0] == 1.0 + + +def test_launch_system_joblib(): + test_launch_system(backend="joblib") + + +def test_launch_system_sequntial(): + test_launch_system(backend="sequential") @pytest.mark.pricy @@ -82,4 +95,6 @@ def test_4o_mini_on_miniwob_tiny_test(): if __name__ == "__main__": - test_4o_mini_on_miniwob_tiny_test() + # test_4o_mini_on_miniwob_tiny_test() + # test_launch_system() + test_launch_system_joblib() From 01c8652c91deb3da0639cbc618a4b2a45e1ca3ca Mon Sep 17 00:00:00 2001 From: recursix Date: Wed, 11 Sep 2024 14:49:11 -0400 Subject: [PATCH 08/58] setting dashboard address to None --- src/agentlab/experiments/launch_exp.py | 116 +------------------------ 1 file changed, 1 insertion(+), 115 deletions(-) diff --git a/src/agentlab/experiments/launch_exp.py b/src/agentlab/experiments/launch_exp.py index e088a5bd..530b663b 100644 --- a/src/agentlab/experiments/launch_exp.py +++ b/src/agentlab/experiments/launch_exp.py @@ -49,7 +49,7 @@ def run_experiments(n_jobs, exp_args_list: list[ExpArgs], exp_dir, parallel_back elif parallel_backend == "dask": from dask.distributed import Client, LocalCluster - cluster = LocalCluster(n_workers=n_jobs, processes=True) + cluster = LocalCluster(n_workers=n_jobs, processes=True, dashboard_address=None) execute_task_graph(Client(cluster), exp_args_list) elif parallel_backend == "sequential": for exp_args in exp_args_list: @@ -71,17 +71,6 @@ def make_study_dir(exp_root, study_name, add_date=True): return Path(exp_root) / study_name -# def study_agent_on_benchmark(exp_root, study_func, agent, benchmark, extra_kwargs={}): -# exp_args_list = study_func(agent, benchmark, **extra_kwargs) -# study_name = f"{study_func.__name__}_{agent.__class__.__name__}_on_{benchmark}" -# return exp_args_list, make_study_dir(exp_root, study_name) - - -# def make_study(exp_root, study_func, extra_kwargs={}): -# exp_args_list = study_func(**extra_kwargs) -# return exp_args_list, make_study_dir(exp_root, f"{study_func.__name__}") - - def relaunch_study(study_dir: Path, relaunch_mode="incomplete_only"): """Return exp_args_list and study_dir @@ -136,112 +125,9 @@ def _yield_incomplete_experiments(exp_root, relaunch_mode="incomplete_only"): raise ValueError(f"Unknown relaunch_mode: {relaunch_mode}") -# def str2dict(arg): -# try: -# return json.loads(arg) -# except json.JSONDecodeError as e: -# raise argparse.ArgumentTypeError(f"Invalid dictionary format: {e}") - - def split_path(path: str): """Split a path into a module name and an object name.""" if "/" in path: path = path.replace("/", ".") module_name, obj_name = path.rsplit(".", 1) return module_name, obj_name - - -# def main(): -# from agentlab.experiments.exp_utils import RESULTS_DIR - -# logging.getLogger().setLevel(logging.INFO) - -# parser = argparse.ArgumentParser() -# parser.add_argument( -# "--exp_root", -# default=RESULTS_DIR, -# help="folder where experiments will be saved", -# ) -# parser.add_argument( -# "--n_jobs", -# default=1, -# type=int, -# help="number of parallel jobs", -# ) -# parser.add_argument( -# "--exp_config", -# type=str, -# default="final_run", -# help="Python path to the experiment function to launch", -# ) -# parser.add_argument( -# "--benchmark", -# type=str, -# default="miniwob", -# choices=["miniwob", "workarena.l1", "workarena.l2", "workarena.l3"], -# help="Benchmark to launch", -# ) -# parser.add_argument( -# "--agent_config", -# type=str, -# default=None, -# help="Python path to the agent config", -# ) -# parser.add_argument( -# "--relaunch_mode", -# default=None, -# type=str, -# choices=[None, "incomplete_only", "all_errors", "server_errors"], -# help="Find all incomplete experiments and relaunch them.", -# ) -# parser.add_argument( -# "--extra_kwargs", -# default="{}", -# type=str2dict, -# help="Extra arguments to pass to the experiment group.", -# ) - -# parser.add_argument( -# "-y", "--auto_accept", action="store_true", help="Skip the prompt to accept the experiment" -# ) - -# parser.add_argument("--shuffle_jobs", action="store_true", help="Shuffle the jobs") - -# args, unknown = parser.parse_known_args() - -# # if relaunch_mode is not None, we will relaunch the experiments -# if args.relaunch_mode is not None: -# assert args.exp_root is not None, "You must specify an exp_root to relaunch experiments." -# exp_args_list, exp_dir = relaunch_study(args.exp_config, args.relaunch_mode) -# else: -# # we launch an experiment using the exp_config -# assert args.exp_config is not None, "You must specify an exp_config." -# study_func = import_object(args.exp_config) -# if args.agent_config is not None: -# agent = import_object(args.agent_config) -# exp_args_list, exp_dir = study_agent_on_benchmark( -# args.exp_root, study_func, agent, args.benchmark, args.extra_kwargs -# ) -# else: -# exp_args_list, exp_dir = make_study(args.exp_root, study_func, args.extra_kwargs) - -# message = f"\nYou are about to launch {len(exp_args_list)} experiments in {exp_dir}.\nPress Y to continue.\n" - -# if args.shuffle_jobs: -# logging.info("Shuffling jobs") -# random.shuffle(exp_args_list) - -# if args.auto_accept: -# logging.info(message) -# answer = "y" -# else: -# answer = input(message) - -# if answer.lower() != "y": -# logging.info("Aborting.") -# else: -# run_experiments(args.n_jobs, exp_args_list, exp_dir) - - -# if __name__ == "__main__": -# main() From c6370bd143443907a7a410b6eb00881e3c145d25 Mon Sep 17 00:00:00 2001 From: recursix Date: Wed, 11 Sep 2024 15:50:44 -0400 Subject: [PATCH 09/58] minor --- tests/experiments/test_graph_execution.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/experiments/test_graph_execution.py b/tests/experiments/test_graph_execution.py index 7283e93e..c1fc0e7c 100644 --- a/tests/experiments/test_graph_execution.py +++ b/tests/experiments/test_graph_execution.py @@ -1,9 +1,11 @@ -from dask.distributed import Client +from dask.distributed import Client, LocalCluster import pytest from agentlab.experiments.graph_execution import execute_task_graph, add_dependencies from time import time, sleep from browsergym.experiments.loop import ExpArgs, EnvArgs +TASK_TIME = 0.1 + # Mock implementation of the ExpArgs class with timestamp checks class MockedExpArgs: @@ -21,8 +23,7 @@ def run(self): pw = playwright.sync_api.sync_playwright().start() pw.selectors.set_test_id_attribute("mytestid") - - sleep(0.5) # Simulate task execution time + sleep(TASK_TIME) # Simulate task execution time self.end_time = time() return self @@ -37,7 +38,8 @@ def test_execute_task_graph(): ] # Execute the task graph - results = execute_task_graph(Client(n_workers=3, processes=True), exp_args_list) + cluster = LocalCluster(n_workers=3, processes=True) + results = execute_task_graph(Client(cluster), exp_args_list) exp_args_list = [results[task_id] for task_id in ["task1", "task2", "task3", "task4"]] @@ -53,7 +55,9 @@ def test_execute_task_graph(): # Ensure that the entire task graph took the expected amount of time total_time = exp_args_list[-1].end_time - exp_args_list[0].start_time - assert total_time >= 1.5 # Since the critical path involves at least 1.5 seconds of work + assert ( + total_time >= TASK_TIME * 3 + ) # Since the critical path involves at least 1.5 seconds of work def test_add_dependencies(): From 7ad0e674128708c3b3aded224fe262854e474fd5 Mon Sep 17 00:00:00 2001 From: recursix Date: Mon, 16 Sep 2024 10:24:58 -0400 Subject: [PATCH 10/58] Finally found a way to make it work --- src/agentlab/experiments/graph_execution.py | 59 ++++++++++++--------- src/agentlab/experiments/launch_exp.py | 7 ++- tests/experiments/test_graph_execution.py | 24 +++++---- tests/experiments/test_launch_exp.py | 2 +- 4 files changed, 52 insertions(+), 40 deletions(-) diff --git a/src/agentlab/experiments/graph_execution.py b/src/agentlab/experiments/graph_execution.py index 382bdc1d..fe74a1f6 100644 --- a/src/agentlab/experiments/graph_execution.py +++ b/src/agentlab/experiments/graph_execution.py @@ -1,42 +1,53 @@ -import asyncio from dask import compute, delayed from browsergym.experiments.loop import ExpArgs +from distributed import LocalCluster, Client def _run(exp_arg: ExpArgs, *dependencies): - """Capture dependencies to ensure they are run before the current task.""" - try: - # Create a new event loop - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) + return exp_arg.run() - # Run the experiment in the new loop - result = loop.run_until_complete(asyncio.to_thread(exp_arg.run)) - return result - finally: - # Clean up the event loop - loop.close() +def make_dask_client(n_worker): + """Create a Dask client with a LocalCluster backend. + I struggled to find an appropriate configuration. + I believe it has to do with the interplay of playwright async loop (even if + used in sync mode) and the fact that dask uses asyncio under the hood. + Making sure we use processes and 1 thread per worker seems to work. -def execute_task_graph(dask_client, exp_args_list: list[ExpArgs]): + Args: + n_worker: int + Number of workers to create. + + Returns: + A Dask client object. + """ + cluster = LocalCluster( + n_workers=n_worker, + processes=True, + threads_per_worker=1, + ) + + return Client(cluster, asynchronous=True) + + +def execute_task_graph(exp_args_list: list[ExpArgs]): """Execute a task graph in parallel while respecting dependencies.""" exp_args_map = {exp_args.exp_id: exp_args for exp_args in exp_args_list} - with dask_client: - tasks = {} + tasks = {} - def get_task(exp_arg: ExpArgs): - if exp_arg.exp_id not in tasks: - dependencies = [get_task(exp_args_map[dep_key]) for dep_key in exp_arg.depends_on] - tasks[exp_arg.exp_id] = delayed(_run)(exp_arg, *dependencies) - return tasks[exp_arg.exp_id] + def get_task(exp_arg: ExpArgs): + if exp_arg.exp_id not in tasks: + dependencies = [get_task(exp_args_map[dep_key]) for dep_key in exp_arg.depends_on] + tasks[exp_arg.exp_id] = delayed(_run)(exp_arg, *dependencies) + return tasks[exp_arg.exp_id] - for exp_arg in exp_args_list: - get_task(exp_arg) + for exp_arg in exp_args_list: + get_task(exp_arg) - task_ids, task_list = zip(*tasks.items()) - results = compute(*task_list) + task_ids, task_list = zip(*tasks.items()) + results = compute(*task_list) return {task_id: result for task_id, result in zip(task_ids, results)} diff --git a/src/agentlab/experiments/launch_exp.py b/src/agentlab/experiments/launch_exp.py index 6b84bacf..77593a59 100644 --- a/src/agentlab/experiments/launch_exp.py +++ b/src/agentlab/experiments/launch_exp.py @@ -4,7 +4,6 @@ from pathlib import Path from browsergym.experiments.loop import ExpArgs, yield_all_exp_results -from agentlab.experiments.graph_execution import execute_task_graph def import_object(path: str): @@ -47,10 +46,10 @@ def run_experiments(n_jobs, exp_args_list: list[ExpArgs], exp_dir, parallel_back ) elif parallel_backend == "dask": - from dask.distributed import Client, LocalCluster + from agentlab.experiments.graph_execution import execute_task_graph, make_dask_client - cluster = LocalCluster(n_workers=n_jobs, processes=True, dashboard_address=None) - execute_task_graph(Client(cluster), exp_args_list) + with make_dask_client(n_worker=n_jobs): + execute_task_graph(exp_args_list) elif parallel_backend == "sequential": for exp_args in exp_args_list: exp_args.run() diff --git a/tests/experiments/test_graph_execution.py b/tests/experiments/test_graph_execution.py index c1fc0e7c..0522de19 100644 --- a/tests/experiments/test_graph_execution.py +++ b/tests/experiments/test_graph_execution.py @@ -1,10 +1,13 @@ -from dask.distributed import Client, LocalCluster import pytest -from agentlab.experiments.graph_execution import execute_task_graph, add_dependencies +from agentlab.experiments.graph_execution import ( + execute_task_graph, + add_dependencies, + make_dask_client, +) from time import time, sleep from browsergym.experiments.loop import ExpArgs, EnvArgs -TASK_TIME = 0.1 +TASK_TIME = 3 # Mock implementation of the ExpArgs class with timestamp checks @@ -18,11 +21,11 @@ def __init__(self, exp_id, depends_on=None): def run(self): self.start_time = time() - # simulate playright code, (this was causing issues due to python async loop) - import playwright.sync_api + # # simulate playright code, (this was causing issues due to python async loop) + # import playwright.sync_api - pw = playwright.sync_api.sync_playwright().start() - pw.selectors.set_test_id_attribute("mytestid") + # pw = playwright.sync_api.sync_playwright().start() + # pw.selectors.set_test_id_attribute("mytestid") sleep(TASK_TIME) # Simulate task execution time self.end_time = time() return self @@ -37,9 +40,8 @@ def test_execute_task_graph(): MockedExpArgs(exp_id="task4", depends_on=["task2", "task3"]), ] - # Execute the task graph - cluster = LocalCluster(n_workers=3, processes=True) - results = execute_task_graph(Client(cluster), exp_args_list) + with make_dask_client(n_worker=5): + results = execute_task_graph(exp_args_list) exp_args_list = [results[task_id] for task_id in ["task1", "task2", "task3", "task4"]] @@ -51,7 +53,7 @@ def test_execute_task_graph(): # Verify that parallel tasks (task2 and task3) started within a short time of each other parallel_start_diff = abs(exp_args_list[1].start_time - exp_args_list[2].start_time) - assert parallel_start_diff < 0.1 # Allow for a small delay + assert parallel_start_diff < 1.5 # Allow for a small delay # Ensure that the entire task graph took the expected amount of time total_time = exp_args_list[-1].end_time - exp_args_list[0].start_time diff --git a/tests/experiments/test_launch_exp.py b/tests/experiments/test_launch_exp.py index bf87fa21..b3e1fcd8 100644 --- a/tests/experiments/test_launch_exp.py +++ b/tests/experiments/test_launch_exp.py @@ -43,7 +43,7 @@ def test_launch_system(backend="dask"): study_dir = make_study_dir(tmp_dir, "generic_agent_test") run_experiments( - n_jobs=3, exp_args_list=exp_args_list, exp_dir=study_dir, parallel_backend=backend + n_jobs=2, exp_args_list=exp_args_list, exp_dir=study_dir, parallel_backend=backend ) results_df = inspect_results.load_result_df(study_dir, progress_fn=None) From a396d9a1cc232fc7b40a94c96548c8c11c014f6f Mon Sep 17 00:00:00 2001 From: recursix Date: Mon, 16 Sep 2024 11:15:40 -0400 Subject: [PATCH 11/58] initial reproducibility files --- pyproject.toml | 5 +- requirements.txt | 1 + src/agentlab/__init__.py | 1 + .../experiments/reproducibility_script.py | 27 +-- .../experiments/reproducibility_util.py | 167 ++++++++++++++++++ .../experiments/test_reproducibility_util.py | 41 +++++ 6 files changed, 220 insertions(+), 22 deletions(-) create mode 100644 src/agentlab/experiments/reproducibility_util.py create mode 100644 tests/experiments/test_reproducibility_util.py diff --git a/pyproject.toml b/pyproject.toml index 875e92aa..f4570b6a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "agentlab" -version = "0.2.0" +dynamic = ["version", "dependencies"] description = "Main package for developing agents and experiments" authors = [ {name = "Rim Assouel", email = "rim.assouel@gmail.com"}, @@ -13,6 +13,7 @@ authors = [ {name = "Alex Lacoste", email = "alex.lacoste@servicenow.com"}, {name = "Tom Marty", email = "tom.marty@polymtl.ca"}, {name = "Massimo Caccia", email = "massimo.caccia1@servicenow.com"} + {name = "Thibault Le Sellier de Chezelles", email = "thibault.de.chezelles@gmail.com"} ] readme = "README.md" requires-python = ">3.7" @@ -24,12 +25,12 @@ classifiers = [ "Intended Audience :: Science/Research", "Topic :: Scientific/Engineering :: Artificial Intelligence", ] -dynamic = ["dependencies"] [project.urls] "Homepage" = "https://github.com/ServiceNow/AgentLab" [tool.setuptools.dynamic] +version = {attr = "agentlab.__version__"} dependencies = {file = ["requirements.txt"]} [tool.black] diff --git a/requirements.txt b/requirements.txt index fec19b65..4117dd92 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,4 @@ ipython pyyaml>=6 pandas gradio +gitpython # for the reproducibility script diff --git a/src/agentlab/__init__.py b/src/agentlab/__init__.py index e69de29b..3ced3581 100644 --- a/src/agentlab/__init__.py +++ b/src/agentlab/__init__.py @@ -0,0 +1 @@ +__version__ = "0.2.1" diff --git a/src/agentlab/experiments/reproducibility_script.py b/src/agentlab/experiments/reproducibility_script.py index ece0c082..dfb5880f 100644 --- a/src/agentlab/experiments/reproducibility_script.py +++ b/src/agentlab/experiments/reproducibility_script.py @@ -1,11 +1,3 @@ -""" -Note: This script is a convenience script to launch experiments instead of using -the command line. - -Don't push your changes to this file to git unless you are making structural changes. -""" - -from copy import deepcopy import logging from agentlab.agents.generic_agent import AGENT_4o, AGENT_4o_MINI @@ -13,30 +5,25 @@ from agentlab.experiments import study_generators from agentlab.experiments.exp_utils import RESULTS_DIR from agentlab.experiments.launch_exp import make_study_dir, run_experiments, relaunch_study -from agentlab.agents.generic_agent.generic_agent import GenericAgent +from agentlab.experiments.reproducibility_util import set_temp -logging.getLogger().setLevel(logging.INFO) - -def set_temp(agent: GenericAgent, temperature=0): - agent = deepcopy(agent) - agent.chat_model_args.temperature = temperature - return agent +logging.getLogger().setLevel(logging.INFO) if __name__ == "__main__": - agent = set_temp(AGENT_4o_MINI) + agent_args = set_temp(AGENT_4o_MINI) ## select the benchmark to run on - # benchmark = "miniwob" - benchmark = "miniwob_tiny_test" + benchmark = "miniwob" + # benchmark = "miniwob_tiny_test" # benchmark = "workarena.l1" # benchmark = "workarena.l2" # benchmark = "workarena.l3" # benchmark = "webarena" - study_name, exp_args_list = study_generators.run_agents_on_benchmark(agent, benchmark) + study_name, exp_args_list = study_generators.run_agents_on_benchmark(agent_args, benchmark) study_dir = make_study_dir(RESULTS_DIR, study_name) # ## alternatively, relaunch an existing study @@ -48,4 +35,4 @@ def set_temp(agent: GenericAgent, temperature=0): # n_jobs = -1 # to use all available cores # run the experiments - run_experiments(n_jobs, exp_args_list, study_dir) + run_experiments(n_jobs, exp_args_list, study_dir, parallel_backend="dask") diff --git a/src/agentlab/experiments/reproducibility_util.py b/src/agentlab/experiments/reproducibility_util.py new file mode 100644 index 00000000..32aca0e3 --- /dev/null +++ b/src/agentlab/experiments/reproducibility_util.py @@ -0,0 +1,167 @@ +from copy import deepcopy + +from agentlab.agents.generic_agent.generic_agent import GenericAgentArgs +from pathlib import Path +from git import Repo, InvalidGitRepositoryError +from importlib import metadata +from git.config import GitConfigParser +import os + + +def _get_repo(module): + return Repo(Path(module.__file__).resolve().parent, search_parent_directories=True) + + +def _get_benchmark_version(benchmark_name): + if benchmark_name.startswith("miniwob"): + return metadata.distribution("browsergym.miniwob").version + elif benchmark_name.startswith("workarena"): + return metadata.distribution("browsergym.workarena").version + elif benchmark_name.startswith("webarena"): + return metadata.distribution("browsergym.webarena").version + elif benchmark_name.startswith("visualwebarena"): + return metadata.distribution("browsergym.visualwebarena").version + else: + raise ValueError(f"Unknown benchmark {benchmark_name}") + + +def get_git_username(repo: Repo) -> str: + """ + Retrieves the first available Git username from various sources. + + This function checks multiple locations for the Git username in the following order: + 1. Repository-specific configuration + 2. GitHub API (if the remote is a GitHub repository) + 3. Global Git configuration + 4. System Git configuration + 5. Environment variables (GIT_AUTHOR_NAME and GIT_COMMITTER_NAME) + + Args: + repo (git.Repo): A GitPython Repo object representing the Git repository. + + Returns: + str: The first non-None username found, or None if no username is found. + """ + # Repository-specific configuration + username = repo.config_reader().get_value("user", "name", None) + if username: + return username + + # GitHub username + remote_url = repo.remotes.origin.url + if "github.com" in remote_url: + import re + import urllib.request + import json + + match = re.search(r"github\.com[:/](.+)/(.+)\.git", remote_url) + if match: + owner, repo_name = match.groups() + api_url = f"https://api.github.com/repos/{owner}/{repo_name}" + with urllib.request.urlopen(api_url) as response: + data = json.loads(response.read().decode()) + username = data["owner"]["login"] + if username: + return username + + # Global configuration + username = GitConfigParser(repo.git.config("--global", "--list"), read_only=True).get_value( + "user", "name", None + ) + if username: + return username + + # System configuration + username = GitConfigParser(repo.git.config("--system", "--list"), read_only=True).get_value( + "user", "name", None + ) + if username: + return username + + # Environment variables + return os.environ.get("GIT_AUTHOR_NAME") or os.environ.get("GIT_COMMITTER_NAME") + + +def get_git_info(module): + """ + Retrieve comprehensive git information for the given module. + + This function attempts to find the git repository containing the specified + module and returns the current commit hash and a comprehensive list of all + files that contribute to the repository's state. + + Args: + module: The Python module object to check for git information. + + Returns: + tuple: A tuple containing two elements: + - str or None: The current git commit hash, or None if not a git repo. + - list of tuple: A list of (status, Path) tuples for all modified files. + Empty list if not a git repo. Status can be 'M' (modified), 'A' (added), + 'D' (deleted), 'R' (renamed), 'C' (copied), 'U' (updated but unmerged), + or '??' (untracked). + """ + + try: + repo = _get_repo(module) + + git_hash = repo.head.object.hexsha + + modified_files = [] + + # Staged changes + staged_changes = repo.index.diff(repo.head.commit) + for change in staged_changes: + modified_files.append((change.change_type, Path(change.a_path))) + + # Unstaged changes + unstaged_changes = repo.index.diff(None) + for change in unstaged_changes: + modified_files.append((change.change_type, Path(change.a_path))) + + # Untracked files + untracked_files = repo.untracked_files + for file in untracked_files: + modified_files.append(("??", Path(file))) + + return git_hash, modified_files + except InvalidGitRepositoryError: + return None, [] + + +def get_reproducibility_info(benchmark_name, ignore_changes=False): + import agentlab + from browsergym import core + + info = { + "git_user": get_git_username(_get_repo(agentlab)), + "benchmark": benchmark_name, + "benchmark_version": _get_benchmark_version(benchmark_name), + } + + def add_info(module_name, module): + git_hash, modified_files = get_git_info(module) + + modified_files_str = "\n".join([f"{status} {file}" for status, file in modified_files]) + + if len(modified_files) > 0 and not ignore_changes: + raise ValueError( + f"Module {module_name} has uncommitted changes." + "Please commit or stash these changes before running the experiment or set ignore_changes=True." + f"Modified files: \n{modified_files_str}\n" + ) + + info[f"{module_name}_version"] = module.__version__ + info[f"{module_name}_git_hash"] = git_hash + info[f"{module_name}__local_modifications"] = modified_files_str + + add_info("agentlab", agentlab) + add_info("browsergym", core) + return info + + +def set_temp(agent_args: GenericAgentArgs, temperature=0): + """Set temperature to 0. Assumes a GenericAgent structure.""" + agent_args = deepcopy(agent_args) + agent_args.chat_model_args.temperature = temperature + return agent_args diff --git a/tests/experiments/test_reproducibility_util.py b/tests/experiments/test_reproducibility_util.py new file mode 100644 index 00000000..60dded59 --- /dev/null +++ b/tests/experiments/test_reproducibility_util.py @@ -0,0 +1,41 @@ +from agentlab.experiments import reproducibility_util +from agentlab.agents.generic_agent import AGENT_4o_MINI +import pytest +import json + + +def test_set_temp(): + agent_args = reproducibility_util.set_temp(AGENT_4o_MINI) + assert agent_args.chat_model_args.temperature == 0 + + +@pytest.mark.parametrize( + "benchmark_name", + ["miniwob", "workarena.l1", "webarena", "visualwebarena"], +) +def test_get_reproducibility_info(benchmark_name): + info = reproducibility_util.get_reproducibility_info(benchmark_name, ignore_changes=True) + + print("reproducibility info:") + print(json.dumps(info, indent=4)) + + # assert keys in info + assert "git_user" in info + assert "benchmark" in info + assert "benchmark_version" in info + assert "agentlab_version" in info + assert "agentlab_git_hash" in info + assert "agentlab__local_modifications" in info + assert "browsergym_version" in info + assert "browsergym_git_hash" in info + assert "browsergym__local_modifications" in info + + +if __name__ == "__main__": + # test_set_temp() + for benchmark_name in [ + "miniwob", + "workarena.l1", + "webarena", + ]: + test_get_reproducibility_info(benchmark_name) From 3db84f76a4969abf1a7318dc4485ad52b9e49d46 Mon Sep 17 00:00:00 2001 From: recursix Date: Thu, 19 Sep 2024 11:14:40 -0400 Subject: [PATCH 12/58] Seems to be superflus --- src/agentlab/experiments/graph_execution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentlab/experiments/graph_execution.py b/src/agentlab/experiments/graph_execution.py index fe74a1f6..c12a1048 100644 --- a/src/agentlab/experiments/graph_execution.py +++ b/src/agentlab/experiments/graph_execution.py @@ -28,7 +28,7 @@ def make_dask_client(n_worker): threads_per_worker=1, ) - return Client(cluster, asynchronous=True) + return Client(cluster) def execute_task_graph(exp_args_list: list[ExpArgs]): From ed9e568a46d4b352064c524462e6e025d5aacdd7 Mon Sep 17 00:00:00 2001 From: recursix Date: Thu, 19 Sep 2024 11:14:56 -0400 Subject: [PATCH 13/58] adding a reproducibility journal --- .gitignore | 1 - reproducibility_journal.csv | 0 2 files changed, 1 deletion(-) create mode 100644 reproducibility_journal.csv diff --git a/.gitignore b/.gitignore index 2c4eec38..d0037afc 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,6 @@ __pycache__/ *$py.class results/ .vscode -*.csv # C extensions *.so # Distribution / packaging diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv new file mode 100644 index 00000000..e69de29b From 85ac6fa20c2e0b34eac44859205fe99ce772ab8a Mon Sep 17 00:00:00 2001 From: recursix Date: Thu, 19 Sep 2024 11:15:11 -0400 Subject: [PATCH 14/58] minor update --- src/agentlab/experiments/launch_command.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/agentlab/experiments/launch_command.py b/src/agentlab/experiments/launch_command.py index 01b48a7f..e02d2ee1 100644 --- a/src/agentlab/experiments/launch_command.py +++ b/src/agentlab/experiments/launch_command.py @@ -17,7 +17,7 @@ # choose your agent or provide a new agent agent_args = AGENT_4o_MINI -# agent = AGENT_4o +# agent_args = AGENT_4o ## select the benchmark to run on @@ -37,8 +37,8 @@ ## alternatively, relaunch an existing study -study_dir = get_most_recent_folder(RESULTS_DIR, contains=None) -exp_args_list, study_dir = relaunch_study(study_dir, relaunch_mode="incomplete_or_error") +# study_dir = get_most_recent_folder(RESULTS_DIR, contains=None) +# exp_args_list, study_dir = relaunch_study(study_dir, relaunch_mode="incomplete_or_error") ## Number of parallel jobs From ad5110edc1baf4ff9084c2467196176b2bacc304 Mon Sep 17 00:00:00 2001 From: recursix Date: Thu, 19 Sep 2024 11:15:46 -0400 Subject: [PATCH 15/58] more robust --- src/agentlab/experiments/launch_exp.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/agentlab/experiments/launch_exp.py b/src/agentlab/experiments/launch_exp.py index 77593a59..b2ed28ec 100644 --- a/src/agentlab/experiments/launch_exp.py +++ b/src/agentlab/experiments/launch_exp.py @@ -33,6 +33,11 @@ def run_experiments(n_jobs, exp_args_list: list[ExpArgs], exp_dir, parallel_back Parallel backend to use. Either "joblib", "dask" or "sequential". """ + + if n_jobs == 1 and parallel_backend != "sequential": + logging.warning("Only 1 job, switching to sequential backend.") + parallel_backend = "sequential" + logging.info(f"Saving experiments to {exp_dir}") for exp_args in exp_args_list: exp_args.agent_args.prepare() @@ -67,7 +72,9 @@ def run_experiments(n_jobs, exp_args_list: list[ExpArgs], exp_dir, parallel_back def make_study_dir(exp_root, study_name, add_date=True): if add_date: study_name = f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}_{study_name}" - return Path(exp_root) / study_name + study_dir = Path(exp_root) / study_name + study_dir.mkdir(parents=True, exist_ok=True) + return study_dir def relaunch_study(study_dir: str | Path, relaunch_mode="incomplete_only"): @@ -91,7 +98,7 @@ def relaunch_study(study_dir: str | Path, relaunch_mode="incomplete_only"): if len(exp_args_list) == 0: logging.info(f"No incomplete experiments found in {study_dir}.") - return + return [], study_dir message = f"Make sure the processes that were running are all stopped. Otherwise, " f"there will be concurrent writing in the same directories.\n" From baf9afa357c9a5dca6079e9e209406536c32eb73 Mon Sep 17 00:00:00 2001 From: recursix Date: Thu, 19 Sep 2024 11:17:30 -0400 Subject: [PATCH 16/58] adding reproducibility tools --- .../experiments/reproducibility_script.py | 44 ++-- .../experiments/reproducibility_util.py | 188 ++++++++++++++++-- .../experiments/test_reproducibility_util.py | 64 +++++- 3 files changed, 263 insertions(+), 33 deletions(-) diff --git a/src/agentlab/experiments/reproducibility_script.py b/src/agentlab/experiments/reproducibility_script.py index dfb5880f..c8b34667 100644 --- a/src/agentlab/experiments/reproducibility_script.py +++ b/src/agentlab/experiments/reproducibility_script.py @@ -5,7 +5,11 @@ from agentlab.experiments import study_generators from agentlab.experiments.exp_utils import RESULTS_DIR from agentlab.experiments.launch_exp import make_study_dir, run_experiments, relaunch_study -from agentlab.experiments.reproducibility_util import set_temp +from agentlab.experiments.reproducibility_util import ( + set_temp, + write_reproducibility_info, + add_experiment_to_journal, +) logging.getLogger().setLevel(logging.INFO) @@ -16,23 +20,37 @@ agent_args = set_temp(AGENT_4o_MINI) ## select the benchmark to run on - benchmark = "miniwob" - # benchmark = "miniwob_tiny_test" - # benchmark = "workarena.l1" + # benchmark = "miniwob" + benchmark = "miniwob_tiny_test" + # benchmark = "workarena.l1 # benchmark = "workarena.l2" # benchmark = "workarena.l3" # benchmark = "webarena" - study_name, exp_args_list = study_generators.run_agents_on_benchmark(agent_args, benchmark) - study_dir = make_study_dir(RESULTS_DIR, study_name) - - # ## alternatively, relaunch an existing study - # study_dir = get_most_recent_folder(RESULTS_DIR, contains=None) - # exp_args_list, study_dir = relaunch_study(study_dir, relaunch_mode="incomplete_or_error") - ## Number of parallel jobs - n_jobs = 3 # Make sure to use 1 job when debugging in VSCode + n_jobs = 1 # Make sure to use 1 job when debugging in VSCode # n_jobs = -1 # to use all available cores + relaunch = True + + if relaunch: + # relaunch an existing study + study_dir = get_most_recent_folder(RESULTS_DIR, contains=None) + exp_args_list, study_dir = relaunch_study(study_dir, relaunch_mode="incomplete_or_error") + else: + study_name, exp_args_list = study_generators.run_agents_on_benchmark(agent_args, benchmark) + study_dir = make_study_dir(RESULTS_DIR, study_name) + + write_reproducibility_info( + study_dir=study_dir, + agent_name=agent_args.agent_name, + benchmark_name=benchmark, + ignore_changes=True, + ) + # run the experiments - run_experiments(n_jobs, exp_args_list, study_dir, parallel_backend="dask") + try: + run_experiments(n_jobs, exp_args_list, study_dir, parallel_backend="dask") + finally: + # will try to gather info at the end even if run_experiments failed + add_experiment_to_journal(study_dir) diff --git a/src/agentlab/experiments/reproducibility_util.py b/src/agentlab/experiments/reproducibility_util.py index 32aca0e3..cfcca6e6 100644 --- a/src/agentlab/experiments/reproducibility_util.py +++ b/src/agentlab/experiments/reproducibility_util.py @@ -1,4 +1,9 @@ from copy import deepcopy +import csv +from datetime import datetime +import json +import logging +import platform from agentlab.agents.generic_agent.generic_agent import GenericAgentArgs from pathlib import Path @@ -6,6 +11,7 @@ from importlib import metadata from git.config import GitConfigParser import os +import agentlab def _get_repo(module): @@ -25,10 +31,12 @@ def _get_benchmark_version(benchmark_name): raise ValueError(f"Unknown benchmark {benchmark_name}") -def get_git_username(repo: Repo) -> str: +def _get_git_username(repo: Repo) -> str: """ Retrieves the first available Git username from various sources. + Note: overlycomplex designed by Claude and not fully tested. + This function checks multiple locations for the Git username in the following order: 1. Repository-specific configuration 2. GitHub API (if the remote is a GitHub repository) @@ -82,7 +90,7 @@ def get_git_username(repo: Repo) -> str: return os.environ.get("GIT_AUTHOR_NAME") or os.environ.get("GIT_COMMITTER_NAME") -def get_git_info(module): +def _get_git_info(module): """ Retrieve comprehensive git information for the given module. @@ -129,37 +137,191 @@ def get_git_info(module): return None, [] -def get_reproducibility_info(benchmark_name, ignore_changes=False): +def get_reproducibility_info(agent_name, benchmark_name, ignore_changes=False): + """ + Retrieve a dict of information that could influence the reproducibility of an experiment. + """ import agentlab from browsergym import core info = { - "git_user": get_git_username(_get_repo(agentlab)), + "git_user": _get_git_username(_get_repo(agentlab)), + "agent_name": agent_name, "benchmark": benchmark_name, "benchmark_version": _get_benchmark_version(benchmark_name), + "date": datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), + "os": f"{platform.system()} ({platform.version()})", + "python_version": platform.python_version(), + "playwright_version": metadata.distribution("playwright").version, } - def add_info(module_name, module): - git_hash, modified_files = get_git_info(module) + def add_git_info(module_name, module): + git_hash, modified_files = _get_git_info(module) - modified_files_str = "\n".join([f"{status} {file}" for status, file in modified_files]) + modified_files_str = "\n".join([f" {status}: {file}" for status, file in modified_files]) - if len(modified_files) > 0 and not ignore_changes: - raise ValueError( - f"Module {module_name} has uncommitted changes." - "Please commit or stash these changes before running the experiment or set ignore_changes=True." + if len(modified_files) > 0: + msg = ( + f"Module {module_name} has uncommitted changes. " f"Modified files: \n{modified_files_str}\n" ) + if ignore_changes: + logging.warning( + msg + "Ignoring changes as requested and proceeding to experiments." + ) + else: + raise ValueError( + msg + "Please commit or stash your changes before running the experiment." + ) info[f"{module_name}_version"] = module.__version__ info[f"{module_name}_git_hash"] = git_hash info[f"{module_name}__local_modifications"] = modified_files_str - add_info("agentlab", agentlab) - add_info("browsergym", core) + add_git_info("agentlab", agentlab) + add_git_info("browsergym", core) + return info + + +def _assert_compatible(info: dict, old_info: dict): + """Make sure that the two info dicts are compatible.""" + # TODO may need to adapt if there are multiple agents, and the re-run on + # error only has a subset of agents. Hence old_info.agent_name != info.agent_name + for key in info.keys(): + if key in ("date", "avg_reward", "std_err", "n_completed", "n_err"): + continue + if info[key] != old_info[key]: + raise ValueError( + f"Reproducibility info already exist and is not compatible." + f"Key {key} has changed from {old_info[key]} to {info[key]}." + ) + + +def write_reproducibility_info(study_dir, agent_name, benchmark_name, ignore_changes=False): + info = get_reproducibility_info(agent_name, benchmark_name, ignore_changes=ignore_changes) + return save_reproducibility_info(study_dir, info) + + +def save_reproducibility_info(study_dir, info): + """ + Save a JSON file containing reproducibility information to the specified directory. + """ + + info_path = Path(study_dir) / "reproducibility_info.json" + + if info_path.exists(): + with open(info_path, "r") as f: + existing_info = json.load(f) + _assert_compatible(info, existing_info) + logging.info( + "Reproducibility info already exists and is compatible. Overwriting the old one." + ) + + with open(info_path, "w") as f: + json.dump(info, f, indent=4) + + info_str = json.dumps(info, indent=4) + logging.info(f"Reproducibility info saved to {info_path}. Info: {info_str}") + return info +def load_reproducibility_info(study_dir) -> dict[str]: + """Retrieve the reproducibility info from the study directory.""" + info_path = Path(study_dir) / "reproducibility_info.json" + with open(info_path, "r") as f: + return json.load(f) + + +# def save_reward(study_dir: str | Path, reward: float | list[float], std_err: float | list[float]): +# """Append success rate and std_err to the journal.""" + +# info = load_reproducibility_info(study_dir) +# info["reward"] = reward +# info["std_err"] = std_err +# save_reproducibility_info(study_dir, info) + + +from agentlab.analyze import inspect_results + + +def add_reward(info, study_dir, ignore_incomplete=False): + result_df = inspect_results.load_result_df(study_dir) + report = inspect_results.global_report(result_df) + + if "[ALL TASKS]" in report.index: + assert isinstance(info["agent_name"], str) + + n_err = report.loc["[ALL TASKS]", "n_err"].item() + n_completed, n_total = report.loc["[ALL TASKS]", "n_completed"].split("/") + if n_err > 0 and not ignore_incomplete: + raise ValueError( + f"Experiment has {n_err} errors. Please rerun the study and make sure all tasks are completed." + ) + if n_completed != n_total and not ignore_incomplete: + raise ValueError( + f"Experiment has {n_completed} completed tasks out of {n_total}. " + f"Please rerun the study and make sure all tasks are completed." + ) + + for key in ("avg_reward", "std_err", "n_err", "n_completed"): + value = report.loc["[ALL TASKS]", key] + if hasattr(value, "item"): + value = value.item() + info[key] = value + else: + raise ValueError("Multi agent not implemented yet") + + +def _get_csv_headers(file_path: str) -> list[str]: + with open(file_path, "r", newline="") as file: + reader = csv.reader(file) + try: + headers = next(reader) + except StopIteration: + headers = None + return headers + + +def append_to_journal(info, journal_path=None): + if journal_path is None: + journal_path = Path(agentlab.__file__).parent.parent.parent / "reproducibility_journal.csv" + + rows = [] + headers = None + if journal_path.exists(): + headers = _get_csv_headers(journal_path) + + if headers is None: + headers = list(info.keys()) + rows.append(headers) + + if isinstance(info["agent_name"], (list, tuple)): + # handle multiple agents + assert len(info["agent_name"]) == len(info["reward"]) + assert len(info["agent_name"]) == len(info["std_err"]) + + for i, agent_name in info["agent_name"]: + sub_info = info.copy() + sub_info["agent_name"] = agent_name + sub_info["reward"] = info["reward"][i] + sub_info["std_err"] = info["std_err"][i] + rows.append([str(sub_info[key]) for key in headers]) + else: + rows.append([str(info[key]) for key in headers]) + with open(journal_path, "a", newline="") as file: + writer = csv.writer(file) + for row in rows: + writer.writerow(row) + + +def add_experiment_to_journal(study_dir, ignore_incomplete=False): + info = load_reproducibility_info(study_dir) + add_reward(info, study_dir, ignore_incomplete) + save_reproducibility_info(study_dir, info) + append_to_journal(info) + + def set_temp(agent_args: GenericAgentArgs, temperature=0): """Set temperature to 0. Assumes a GenericAgent structure.""" agent_args = deepcopy(agent_args) diff --git a/tests/experiments/test_reproducibility_util.py b/tests/experiments/test_reproducibility_util.py index 60dded59..815aac08 100644 --- a/tests/experiments/test_reproducibility_util.py +++ b/tests/experiments/test_reproducibility_util.py @@ -1,3 +1,6 @@ +from pathlib import Path +import tempfile +import time from agentlab.experiments import reproducibility_util from agentlab.agents.generic_agent import AGENT_4o_MINI import pytest @@ -14,7 +17,7 @@ def test_set_temp(): ["miniwob", "workarena.l1", "webarena", "visualwebarena"], ) def test_get_reproducibility_info(benchmark_name): - info = reproducibility_util.get_reproducibility_info(benchmark_name, ignore_changes=True) + info = reproducibility_util.get_reproducibility_info("test_agent", benchmark_name, ignore_changes=True) print("reproducibility info:") print(json.dumps(info, indent=4)) @@ -31,11 +34,58 @@ def test_get_reproducibility_info(benchmark_name): assert "browsergym__local_modifications" in info +def test_save_reproducibility_info(): + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_dir = Path(tmp_dir) + + info1 = reproducibility_util.save_reproducibility_info( + study_dir=tmp_dir, + info=reproducibility_util.get_reproducibility_info( + agent_name="test_agent", + benchmark_name="miniwob", + ignore_changes=True, + ), + ) + time.sleep(1) # make sure the date changes by at least 1s + + # this should overwrite the previous info since they are the same beside + # the date + info2 = reproducibility_util.save_reproducibility_info( + study_dir=tmp_dir, + info=reproducibility_util.get_reproducibility_info( + agent_name="test_agent", + benchmark_name="miniwob", + ignore_changes=True, + ), + ) + + reproducibility_util._assert_compatible(info1, info2) + + # this should not overwrite info2 as the agent name is different, it + # should raise an error + with pytest.raises(ValueError): + reproducibility_util.save_reproducibility_info( + study_dir=tmp_dir, + info=reproducibility_util.get_reproducibility_info( + agent_name="test_agent_alt", + benchmark_name="miniwob", + ignore_changes=True, + ), + ) + + # load json + info3 = reproducibility_util.load_reproducibility_info(tmp_dir) + + assert info2 == info3 + assert info1 != info3 + + test_study_dir = Path(__file__).parent.parent / "data" / "test_study" + + reproducibility_util.add_reward(info3, test_study_dir, ignore_incomplete=True) + reproducibility_util.append_to_journal(info3, journal_path=tmp_dir / "journal.csv") + print((tmp_dir / "journal.csv").read_text()) + if __name__ == "__main__": # test_set_temp() - for benchmark_name in [ - "miniwob", - "workarena.l1", - "webarena", - ]: - test_get_reproducibility_info(benchmark_name) + # test_get_reproducibility_info() + test_save_reproducibility_info() From b0268b65a2510d7a6d7fd35c3a2031d74960c6ac Mon Sep 17 00:00:00 2001 From: recursix Date: Thu, 19 Sep 2024 20:55:40 -0400 Subject: [PATCH 17/58] fix white listing --- .../experiments/reproducibility_util.py | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/agentlab/experiments/reproducibility_util.py b/src/agentlab/experiments/reproducibility_util.py index cfcca6e6..b3fd96b1 100644 --- a/src/agentlab/experiments/reproducibility_util.py +++ b/src/agentlab/experiments/reproducibility_util.py @@ -90,7 +90,7 @@ def _get_git_username(repo: Repo) -> str: return os.environ.get("GIT_AUTHOR_NAME") or os.environ.get("GIT_COMMITTER_NAME") -def _get_git_info(module): +def _get_git_info(module, changes_white_list=()) -> tuple[str, list[tuple[str, Path]]]: """ Retrieve comprehensive git information for the given module. @@ -100,6 +100,7 @@ def _get_git_info(module): Args: module: The Python module object to check for git information. + changes_white_list: A list of file paths to ignore when checking for changes. Returns: tuple: A tuple containing two elements: @@ -132,12 +133,19 @@ def _get_git_info(module): for file in untracked_files: modified_files.append(("??", Path(file))) - return git_hash, modified_files + # wildcard matching from white list + modified_files_filtered = [] + for status, file in modified_files: + if any(file.match(pattern) for pattern in changes_white_list): + continue + modified_files_filtered.append((status, file)) + + return git_hash, modified_files_filtered except InvalidGitRepositoryError: return None, [] -def get_reproducibility_info(agent_name, benchmark_name, ignore_changes=False): +def get_reproducibility_info(agent_name, benchmark_name, changes_white_list=("*/reproducibility_script.py",), ignore_changes=False): """ Retrieve a dict of information that could influence the reproducibility of an experiment. """ @@ -156,7 +164,8 @@ def get_reproducibility_info(agent_name, benchmark_name, ignore_changes=False): } def add_git_info(module_name, module): - git_hash, modified_files = _get_git_info(module) + git_hash, modified_files = _get_git_info(module, changes_white_list) + modified_files_str = "\n".join([f" {status}: {file}" for status, file in modified_files]) @@ -233,13 +242,6 @@ def load_reproducibility_info(study_dir) -> dict[str]: return json.load(f) -# def save_reward(study_dir: str | Path, reward: float | list[float], std_err: float | list[float]): -# """Append success rate and std_err to the journal.""" - -# info = load_reproducibility_info(study_dir) -# info["reward"] = reward -# info["std_err"] = std_err -# save_reproducibility_info(study_dir, info) from agentlab.analyze import inspect_results From bb7ddb0ad7d8e3bf038481d9007fa40d7cdd94e7 Mon Sep 17 00:00:00 2001 From: recursix Date: Thu, 19 Sep 2024 20:59:51 -0400 Subject: [PATCH 18/58] minor --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f4570b6a..7b3e1140 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ authors = [ {name = "Maxime Gasse", email = "maxime.gasse@servicenow.com"}, {name = "Alex Lacoste", email = "alex.lacoste@servicenow.com"}, {name = "Tom Marty", email = "tom.marty@polymtl.ca"}, - {name = "Massimo Caccia", email = "massimo.caccia1@servicenow.com"} + {name = "Massimo Caccia", email = "massimo.caccia1@servicenow.com"}, {name = "Thibault Le Sellier de Chezelles", email = "thibault.de.chezelles@gmail.com"} ] readme = "README.md" From 8b4884fcdccf7bf3c3557c53c0d06e0b2902dabb Mon Sep 17 00:00:00 2001 From: recursix Date: Thu, 19 Sep 2024 21:00:22 -0400 Subject: [PATCH 19/58] minor --- .../experiments/reproducibility_script.py | 2 +- src/agentlab/experiments/reproducibility_util.py | 16 +++++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/agentlab/experiments/reproducibility_script.py b/src/agentlab/experiments/reproducibility_script.py index c8b34667..6c8a8497 100644 --- a/src/agentlab/experiments/reproducibility_script.py +++ b/src/agentlab/experiments/reproducibility_script.py @@ -31,7 +31,7 @@ n_jobs = 1 # Make sure to use 1 job when debugging in VSCode # n_jobs = -1 # to use all available cores - relaunch = True + relaunch = False if relaunch: # relaunch an existing study diff --git a/src/agentlab/experiments/reproducibility_util.py b/src/agentlab/experiments/reproducibility_util.py index b3fd96b1..99a11e6c 100644 --- a/src/agentlab/experiments/reproducibility_util.py +++ b/src/agentlab/experiments/reproducibility_util.py @@ -145,7 +145,16 @@ def _get_git_info(module, changes_white_list=()) -> tuple[str, list[tuple[str, P return None, [] -def get_reproducibility_info(agent_name, benchmark_name, changes_white_list=("*/reproducibility_script.py",), ignore_changes=False): +def get_reproducibility_info( + agent_name, + benchmark_name, + changes_white_list=( # Files that are often modified during experiments but do not affect reproducibility + "*/reproducibility_script.py", + "*/reproducibility_journal.csv", + "*/launch_command.py", + ), + ignore_changes=False, +): """ Retrieve a dict of information that could influence the reproducibility of an experiment. """ @@ -166,7 +175,6 @@ def get_reproducibility_info(agent_name, benchmark_name, changes_white_list=("*/ def add_git_info(module_name, module): git_hash, modified_files = _get_git_info(module, changes_white_list) - modified_files_str = "\n".join([f" {status}: {file}" for status, file in modified_files]) if len(modified_files) > 0: @@ -242,8 +250,6 @@ def load_reproducibility_info(study_dir) -> dict[str]: return json.load(f) - - from agentlab.analyze import inspect_results @@ -293,7 +299,7 @@ def append_to_journal(info, journal_path=None): headers = None if journal_path.exists(): headers = _get_csv_headers(journal_path) - + if headers is None: headers = list(info.keys()) rows.append(headers) From e685f10d029a87cb0da06f5ec4d14a7388474a46 Mon Sep 17 00:00:00 2001 From: recursix Date: Thu, 19 Sep 2024 21:01:40 -0400 Subject: [PATCH 20/58] minor --- reproducibility_journal.csv | 3 +++ src/agentlab/experiments/reproducibility_script.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv index e69de29b..7318805f 100644 --- a/reproducibility_journal.csv +++ b/reproducibility_journal.csv @@ -0,0 +1,3 @@ +git_user,agent_name,benchmark,benchmark_version,date,avg_reward,std_err,n_err,n_completed,os,python_version,playwright_version,agentlab_version,agentlab_git_hash,agentlab__local_modifications,browsergym_version,browsergym_git_hash,browsergym__local_modifications +recursix,GenericAgent-gpt-4o-mini,miniwob_tiny_test,0.6.3,2024-09-19_18-37-00,1.0,0.0,0,4/4,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,baf9afa357c9a5dca6079e9e209406536c32eb73," M: reproducibility_journal.csv + M: src/agentlab/experiments/reproducibility_util.py",0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, diff --git a/src/agentlab/experiments/reproducibility_script.py b/src/agentlab/experiments/reproducibility_script.py index 6c8a8497..a56449ab 100644 --- a/src/agentlab/experiments/reproducibility_script.py +++ b/src/agentlab/experiments/reproducibility_script.py @@ -45,7 +45,7 @@ study_dir=study_dir, agent_name=agent_args.agent_name, benchmark_name=benchmark, - ignore_changes=True, + ignore_changes=False, ) # run the experiments From ac8b7f82c1421ec9529b9c36a4df2c19872d69cb Mon Sep 17 00:00:00 2001 From: recursix Date: Thu, 19 Sep 2024 21:50:49 -0400 Subject: [PATCH 21/58] minor --- src/agentlab/experiments/reproducibility_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentlab/experiments/reproducibility_util.py b/src/agentlab/experiments/reproducibility_util.py index 99a11e6c..c7a746d0 100644 --- a/src/agentlab/experiments/reproducibility_util.py +++ b/src/agentlab/experiments/reproducibility_util.py @@ -150,7 +150,7 @@ def get_reproducibility_info( benchmark_name, changes_white_list=( # Files that are often modified during experiments but do not affect reproducibility "*/reproducibility_script.py", - "*/reproducibility_journal.csv", + "*reproducibility_journal.csv", "*/launch_command.py", ), ignore_changes=False, From 295f01005faf8f2c73a31be6a18cec19d563b54b Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 20 Sep 2024 07:16:08 -0400 Subject: [PATCH 22/58] minor fix --- src/agentlab/analyze/agent_xray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index caa462eb..fa6635e6 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -434,7 +434,7 @@ def run_gradio(results_dir: Path): step_id.change(fn=if_active("Logs")(update_logs), outputs=logs) step_id.change(fn=if_active("Stats")(update_stats), outputs=stats) step_id.change( - fn=if_active("Agent Info HTML")(update_agent_info_html), + fn=if_active("Agent Info HTML", 3)(update_agent_info_html), outputs=[agent_info_html, screenshot1, screenshot2], ) step_id.change(fn=if_active("Agent Info MD")(update_agent_info_md), outputs=agent_info_md) From 5ac4a7c9457cf9ea4fe0374eb86f95986f57c278 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 20 Sep 2024 14:36:41 -0400 Subject: [PATCH 23/58] more tests --- .../experiments/reproducibility_script.py | 8 +- tests/agents/test_agent.py | 108 +++++++++++++++++- 2 files changed, 106 insertions(+), 10 deletions(-) diff --git a/src/agentlab/experiments/reproducibility_script.py b/src/agentlab/experiments/reproducibility_script.py index a56449ab..36cd286c 100644 --- a/src/agentlab/experiments/reproducibility_script.py +++ b/src/agentlab/experiments/reproducibility_script.py @@ -20,15 +20,15 @@ agent_args = set_temp(AGENT_4o_MINI) ## select the benchmark to run on - # benchmark = "miniwob" - benchmark = "miniwob_tiny_test" + benchmark = "miniwob" + # benchmark = "miniwob_tiny_test" # benchmark = "workarena.l1 # benchmark = "workarena.l2" # benchmark = "workarena.l3" # benchmark = "webarena" ## Number of parallel jobs - n_jobs = 1 # Make sure to use 1 job when debugging in VSCode + n_jobs = 6 # Make sure to use 1 job when debugging in VSCode # n_jobs = -1 # to use all available cores relaunch = False @@ -50,7 +50,7 @@ # run the experiments try: - run_experiments(n_jobs, exp_args_list, study_dir, parallel_backend="dask") + run_experiments(n_jobs, exp_args_list, study_dir, parallel_backend="joblib") finally: # will try to gather info at the end even if run_experiments failed add_experiment_to_journal(study_dir) diff --git a/tests/agents/test_agent.py b/tests/agents/test_agent.py index 6174ff24..3b1be95a 100644 --- a/tests/agents/test_agent.py +++ b/tests/agents/test_agent.py @@ -1,11 +1,15 @@ +import re import tempfile -from agentlab.agents.generic_agent.generic_agent import GenericAgentArgs -from agentlab.agents.generic_agent.agent_configs import FLAGS_GPT_3_5 -from agentlab.llm.chat_api import CheatMiniWoBLLMArgs +from dataclasses import dataclass +from pathlib import Path + from browsergym.experiments.loop import EnvArgs, ExpArgs -from agentlab.experiments import launch_exp + +from agentlab.agents.generic_agent.agent_configs import FLAGS_GPT_3_5 +from agentlab.agents.generic_agent.generic_agent import GenericAgentArgs from agentlab.analyze import inspect_results -from pathlib import Path +from agentlab.experiments import launch_exp +from agentlab.llm.chat_api import AIMessage, BaseModelArgs, CheatMiniWoBLLMArgs def test_generic_agent(): @@ -38,5 +42,97 @@ def test_generic_agent(): assert result_record[key].iloc[0] == target_val +@dataclass +class CheatMiniWoBLLM_Retry: + """For unit-testing purposes only. It only work with miniwob.click-test task.""" + + n_retry: int + retry_count: int = 0 + + def invoke(self, messages) -> str: + if self.retry_count < self.n_retry: + self.retry_count += 1 + return AIMessage(content="I'm retrying") + + prompt = messages[1].content + match = re.search(r"^\s*\[(\d+)\].*button", prompt, re.MULTILINE | re.IGNORECASE) + + if match: + bid = match.group(1) + action = f'click("{bid}")' + else: + raise Exception("Can't find the button's bid") + + answer = f"""I'm clicking the button as requested. + +{action} + +""" + return AIMessage(content=answer) + + def __call__(self, messages) -> str: + return self.invoke(messages) + + +@dataclass +class CheatMiniWoBLLMArgs_Retry(BaseModelArgs): + n_retry: int = 2 + model_name: str = "test/cheat_miniwob_click_test_retry" + + def make_model(self): + return CheatMiniWoBLLM_Retry(n_retry=self.n_retry) + + +def test_generic_agent_retry(): + exp_args = ExpArgs( + agent_args=GenericAgentArgs( + chat_model_args=CheatMiniWoBLLMArgs_Retry(n_retry=2), + flags=FLAGS_GPT_3_5, + ), + env_args=EnvArgs(task_name="miniwob.click-test", task_seed=42), + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + launch_exp.run_experiments(1, [exp_args], Path(tmp_dir) / "generic_agent_test") + result_record = inspect_results.load_result_df(tmp_dir, progress_fn=None) + + target = { + "stats.cum_n_retry": 2, + "stats.cum_busted_retry": 0, + "n_steps": 1, + "cum_reward": 1.0, + } + + for key, target_val in target.items(): + assert key in result_record + assert result_record[key].iloc[0] == target_val + + +def test_bust_retry(): + exp_args = ExpArgs( + agent_args=GenericAgentArgs( + chat_model_args=CheatMiniWoBLLMArgs_Retry(n_retry=10), + flags=FLAGS_GPT_3_5, + ), + env_args=EnvArgs(task_name="miniwob.click-test", task_seed=42), + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + launch_exp.run_experiments(1, [exp_args], Path(tmp_dir) / "generic_agent_test") + result_record = inspect_results.load_result_df(tmp_dir, progress_fn=None) + + target = { + "stats.cum_n_retry": 5, + "stats.cum_busted_retry": 1, + "n_steps": 1, + "cum_reward": 0, + } + + for key, target_val in target.items(): + assert key in result_record + assert result_record[key].iloc[0] == target_val + + if __name__ == "__main__": - test_generic_agent() + # test_generic_agent() + test_bust_retry() From d4cf9698f6073b8dcc3897972139a83705b15641 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 20 Sep 2024 14:37:38 -0400 Subject: [PATCH 24/58] more results yay --- reproducibility_journal.csv | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv index 7318805f..e8a0893c 100644 --- a/reproducibility_journal.csv +++ b/reproducibility_journal.csv @@ -1,3 +1,5 @@ git_user,agent_name,benchmark,benchmark_version,date,avg_reward,std_err,n_err,n_completed,os,python_version,playwright_version,agentlab_version,agentlab_git_hash,agentlab__local_modifications,browsergym_version,browsergym_git_hash,browsergym__local_modifications -recursix,GenericAgent-gpt-4o-mini,miniwob_tiny_test,0.6.3,2024-09-19_18-37-00,1.0,0.0,0,4/4,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,baf9afa357c9a5dca6079e9e209406536c32eb73," M: reproducibility_journal.csv - M: src/agentlab/experiments/reproducibility_util.py",0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, +recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_tiny_test,0.6.3,2024-09-19_21-07-34,0.75,0.217,0,4/4,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,c99bdf74c98f323cc6a646467ba5f21154b6fd18,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, +recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_tiny_test,0.6.3,2024-09-19_21-28-58,1.0,0.0,0,4/4,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,c99bdf74c98f323cc6a646467ba5f21154b6fd18," M: reproducibility_journal.csv + M: src/agentlab/experiments/task_collections.py",0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, +recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-09-20_07-16-21,0.546,0.02,0,625/625,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,295f01005faf8f2c73a31be6a18cec19d563b54b,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, From 1dc720bbea32b4cb6289da19444ade48d234d61a Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 20 Sep 2024 14:44:11 -0400 Subject: [PATCH 25/58] disabling this test --- tests/experiments/test_graph_execution.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/experiments/test_graph_execution.py b/tests/experiments/test_graph_execution.py index 0522de19..9235358d 100644 --- a/tests/experiments/test_graph_execution.py +++ b/tests/experiments/test_graph_execution.py @@ -52,8 +52,8 @@ def test_execute_task_graph(): assert exp_args_list[2].end_time < exp_args_list[3].start_time # Verify that parallel tasks (task2 and task3) started within a short time of each other - parallel_start_diff = abs(exp_args_list[1].start_time - exp_args_list[2].start_time) - assert parallel_start_diff < 1.5 # Allow for a small delay + # parallel_start_diff = abs(exp_args_list[1].start_time - exp_args_list[2].start_time) + # assert parallel_start_diff < 1.5 # Allow for a small delay # Ensure that the entire task graph took the expected amount of time total_time = exp_args_list[-1].end_time - exp_args_list[0].start_time From 82f618152ed4045bdbe5d1b2886b1a428a3d6f5c Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 20 Sep 2024 14:53:41 -0400 Subject: [PATCH 26/58] update --- reproducibility_journal.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv index e8a0893c..907464ce 100644 --- a/reproducibility_journal.csv +++ b/reproducibility_journal.csv @@ -2,4 +2,4 @@ git_user,agent_name,benchmark,benchmark_version,date,avg_reward,std_err,n_err,n_ recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_tiny_test,0.6.3,2024-09-19_21-07-34,0.75,0.217,0,4/4,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,c99bdf74c98f323cc6a646467ba5f21154b6fd18,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_tiny_test,0.6.3,2024-09-19_21-28-58,1.0,0.0,0,4/4,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,c99bdf74c98f323cc6a646467ba5f21154b6fd18," M: reproducibility_journal.csv M: src/agentlab/experiments/task_collections.py",0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, -recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-09-20_07-16-21,0.546,0.02,0,625/625,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,295f01005faf8f2c73a31be6a18cec19d563b54b,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, +recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob, 0.6.3,2024-09-20_07-16-21,0.546,0.02,0,625/625,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,295f01005faf8f2c73a31be6a18cec19d563b54b,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, From eb871ac5ef9d3e0dd33c3554cea32fc15b4284db Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 20 Sep 2024 14:54:23 -0400 Subject: [PATCH 27/58] update --- reproducibility_journal.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv index 907464ce..e8a0893c 100644 --- a/reproducibility_journal.csv +++ b/reproducibility_journal.csv @@ -2,4 +2,4 @@ git_user,agent_name,benchmark,benchmark_version,date,avg_reward,std_err,n_err,n_ recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_tiny_test,0.6.3,2024-09-19_21-07-34,0.75,0.217,0,4/4,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,c99bdf74c98f323cc6a646467ba5f21154b6fd18,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_tiny_test,0.6.3,2024-09-19_21-28-58,1.0,0.0,0,4/4,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,c99bdf74c98f323cc6a646467ba5f21154b6fd18," M: reproducibility_journal.csv M: src/agentlab/experiments/task_collections.py",0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, -recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob, 0.6.3,2024-09-20_07-16-21,0.546,0.02,0,625/625,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,295f01005faf8f2c73a31be6a18cec19d563b54b,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, +recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-09-20_07-16-21,0.546,0.02,0,625/625,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,295f01005faf8f2c73a31be6a18cec19d563b54b,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, From fa0c4891ac3a994096b2ea33a2de26f90d5d25f8 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 20 Sep 2024 15:04:10 -0400 Subject: [PATCH 28/58] black --- tests/experiments/test_reproducibility_util.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/experiments/test_reproducibility_util.py b/tests/experiments/test_reproducibility_util.py index 815aac08..edda3682 100644 --- a/tests/experiments/test_reproducibility_util.py +++ b/tests/experiments/test_reproducibility_util.py @@ -17,7 +17,9 @@ def test_set_temp(): ["miniwob", "workarena.l1", "webarena", "visualwebarena"], ) def test_get_reproducibility_info(benchmark_name): - info = reproducibility_util.get_reproducibility_info("test_agent", benchmark_name, ignore_changes=True) + info = reproducibility_util.get_reproducibility_info( + "test_agent", benchmark_name, ignore_changes=True + ) print("reproducibility info:") print(json.dumps(info, indent=4)) @@ -85,6 +87,7 @@ def test_save_reproducibility_info(): reproducibility_util.append_to_journal(info3, journal_path=tmp_dir / "journal.csv") print((tmp_dir / "journal.csv").read_text()) + if __name__ == "__main__": # test_set_temp() # test_get_reproducibility_info() From abd3212498f2d5922f7fa28f8effe2cbe9b634ed Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Fri, 20 Sep 2024 15:17:34 -0400 Subject: [PATCH 29/58] maybe fixing github workflow ? --- .github/workflows/unit_tests.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index e012e3fc..38c2e5bb 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -20,6 +20,9 @@ jobs: - name: Checkout Repository uses: actions/checkout@v4 + - name: Set up Git user + run: git config --global user.email "not_a_real_email@address.com" && git config --global user.name "GitHub Actions" + - name: Set up Python uses: actions/setup-python@v5 with: From 4ebee28abd9ee30b5c95932916d30cd5a23cf518 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 20 Sep 2024 15:35:36 -0400 Subject: [PATCH 30/58] make get_git_username great again --- .../experiments/reproducibility_util.py | 80 +++++++++++-------- 1 file changed, 46 insertions(+), 34 deletions(-) diff --git a/src/agentlab/experiments/reproducibility_util.py b/src/agentlab/experiments/reproducibility_util.py index c7a746d0..50cc6df2 100644 --- a/src/agentlab/experiments/reproducibility_util.py +++ b/src/agentlab/experiments/reproducibility_util.py @@ -51,40 +51,52 @@ def _get_git_username(repo: Repo) -> str: str: The first non-None username found, or None if no username is found. """ # Repository-specific configuration - username = repo.config_reader().get_value("user", "name", None) - if username: - return username - - # GitHub username - remote_url = repo.remotes.origin.url - if "github.com" in remote_url: - import re - import urllib.request - import json - - match = re.search(r"github\.com[:/](.+)/(.+)\.git", remote_url) - if match: - owner, repo_name = match.groups() - api_url = f"https://api.github.com/repos/{owner}/{repo_name}" - with urllib.request.urlopen(api_url) as response: - data = json.loads(response.read().decode()) - username = data["owner"]["login"] - if username: - return username - - # Global configuration - username = GitConfigParser(repo.git.config("--global", "--list"), read_only=True).get_value( - "user", "name", None - ) - if username: - return username - - # System configuration - username = GitConfigParser(repo.git.config("--system", "--list"), read_only=True).get_value( - "user", "name", None - ) - if username: - return username + try: + username = repo.config_reader().get_value("user", "name", None) + if username: + return username + except Exception: + pass + + try: + # GitHub username + remote_url = repo.remotes.origin.url + if "github.com" in remote_url: + import re + import urllib.request + import json + + match = re.search(r"github\.com[:/](.+)/(.+)\.git", remote_url) + if match: + owner, repo_name = match.groups() + api_url = f"https://api.github.com/repos/{owner}/{repo_name}" + with urllib.request.urlopen(api_url) as response: + data = json.loads(response.read().decode()) + username = data["owner"]["login"] + if username: + return username + except Exception: + pass + + try: + # Global configuration + username = GitConfigParser(repo.git.config("--global", "--list"), read_only=True).get_value( + "user", "name", None + ) + if username: + return username + except Exception: + pass + + try: + # System configuration + username = GitConfigParser(repo.git.config("--system", "--list"), read_only=True).get_value( + "user", "name", None + ) + if username: + return username + except Exception: + pass # Environment variables return os.environ.get("GIT_AUTHOR_NAME") or os.environ.get("GIT_COMMITTER_NAME") From 58f5ec7f8132c14b607c0fe546ca42eecf340ec1 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 20 Sep 2024 15:38:08 -0400 Subject: [PATCH 31/58] trigger change --- tests/experiments/test_reproducibility_util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/experiments/test_reproducibility_util.py b/tests/experiments/test_reproducibility_util.py index edda3682..1836187c 100644 --- a/tests/experiments/test_reproducibility_util.py +++ b/tests/experiments/test_reproducibility_util.py @@ -92,3 +92,4 @@ def test_save_reproducibility_info(): # test_set_temp() # test_get_reproducibility_info() test_save_reproducibility_info() + pass From f6216486d5faac2c8b3fb0a63e114e5a4bafde47 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 20 Sep 2024 16:09:50 -0400 Subject: [PATCH 32/58] new browsergym --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 981f1ce5..9dce3fcc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ pytest-xdist pytest-playwright dask distributed -browsergym>=0.6.0 +browsergym>=0.7.0 joblib>=1.2.0 openai>=1.7,<2 langchain>=0.1,<1 From 60a1b227fb92931c9959bf25692ee4d491a6beb3 Mon Sep 17 00:00:00 2001 From: recursix Date: Sat, 21 Sep 2024 11:51:53 -0400 Subject: [PATCH 33/58] GPT-4o result (and new comment column) --- reproducibility_journal.csv | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv index e8a0893c..956bc9b1 100644 --- a/reproducibility_journal.csv +++ b/reproducibility_journal.csv @@ -1,5 +1,6 @@ -git_user,agent_name,benchmark,benchmark_version,date,avg_reward,std_err,n_err,n_completed,os,python_version,playwright_version,agentlab_version,agentlab_git_hash,agentlab__local_modifications,browsergym_version,browsergym_git_hash,browsergym__local_modifications -recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_tiny_test,0.6.3,2024-09-19_21-07-34,0.75,0.217,0,4/4,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,c99bdf74c98f323cc6a646467ba5f21154b6fd18,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, -recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_tiny_test,0.6.3,2024-09-19_21-28-58,1.0,0.0,0,4/4,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,c99bdf74c98f323cc6a646467ba5f21154b6fd18," M: reproducibility_journal.csv +git_user,agent_name,benchmark,benchmark_version,date,avg_reward,std_err,n_err,n_completed,comment,os,python_version,playwright_version,agentlab_version,agentlab_git_hash,agentlab__local_modifications,browsergym_version,browsergym_git_hash,browsergym__local_modifications +recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_tiny_test,0.6.3,2024-09-19_21-07-34,0.75,0.217,0,4/4,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,c99bdf74c98f323cc6a646467ba5f21154b6fd18,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, +recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_tiny_test,0.6.3,2024-09-19_21-28-58,1.0,0.0,0,4/4,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,c99bdf74c98f323cc6a646467ba5f21154b6fd18," M: reproducibility_journal.csv M: src/agentlab/experiments/task_collections.py",0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, -recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-09-20_07-16-21,0.546,0.02,0,625/625,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,295f01005faf8f2c73a31be6a18cec19d563b54b,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, +recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-09-20_07-16-21,0.546,0.02,0,625/625,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,295f01005faf8f2c73a31be6a18cec19d563b54b,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, +recursix,GenericAgent-gpt-4o-2024-05-13,miniwob,0.6.3,2024-09-20_22-09-43,0.656,0.019,0,625/625,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,f6216486d5faac2c8b3fb0a63e114e5a4bafde47,,0.6.4,8cef8fe34940ff490d0cc06b0c8f100180d09d43, From dd9aa0da1f9096ee55da1257f333999fe1c07562 Mon Sep 17 00:00:00 2001 From: recursix Date: Sat, 21 Sep 2024 11:52:40 -0400 Subject: [PATCH 34/58] Seems like there was a change to 4o flags, trying these --- src/agentlab/agents/generic_agent/agent_configs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/agentlab/agents/generic_agent/agent_configs.py b/src/agentlab/agents/generic_agent/agent_configs.py index a53046b2..e671ab19 100644 --- a/src/agentlab/agents/generic_agent/agent_configs.py +++ b/src/agentlab/agents/generic_agent/agent_configs.py @@ -208,8 +208,8 @@ action=dp.ActionFlags( multi_actions=False, action_set="bid", - long_description=True, - individual_examples=True, + long_description=False, + individual_examples=False, ), use_plan=False, use_criticise=False, From 54ea0af46fac66477b0a9d0abf9d86d307d237c9 Mon Sep 17 00:00:00 2001 From: recursix Date: Sat, 21 Sep 2024 11:53:06 -0400 Subject: [PATCH 35/58] minor comment --- src/agentlab/agents/generic_agent/generic_agent.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/agentlab/agents/generic_agent/generic_agent.py b/src/agentlab/agents/generic_agent/generic_agent.py index a53f1aeb..6ac8ad7d 100644 --- a/src/agentlab/agents/generic_agent/generic_agent.py +++ b/src/agentlab/agents/generic_agent/generic_agent.py @@ -27,6 +27,7 @@ def __post_init__(self): pass def set_benchmark(self, benchmark): + """Override Some flags based on the benchmark.""" if benchmark == "miniwob": self.flags.obs.use_html = True From 24214e5e9056c0cf36a61cdd8b58c69d95818e02 Mon Sep 17 00:00:00 2001 From: recursix Date: Sat, 21 Sep 2024 11:53:28 -0400 Subject: [PATCH 36/58] better xray --- src/agentlab/analyze/agent_xray.py | 39 +++++++++++++++--------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index fa6635e6..a152d5d1 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -31,7 +31,7 @@ def display_table(df: pd.DataFrame): return df -def remove_args_frcom_col(df: pd.DataFrame): +def remove_args_from_col(df: pd.DataFrame): df.columns = [col.replace("_args", "") for col in df.columns] df.index.names = [col.replace("_args", "") for col in df.index.names] return df @@ -220,15 +220,18 @@ def run_gradio(results_dir: Path): with gr.Tab("Select Task and Seed", id="Select Task"): with gr.Row(): with gr.Column(scale=4): - with gr.Accordion("Task Selector (click for help)", open=False): - gr.Markdown( - """\ - Click on a row to select a task. It will trigger the update of other fields. + with gr.Row(): # combining the title (help) and the refresh button + with gr.Accordion("Task Selector (click for help)", open=False): + gr.Markdown( + """\ + Click on a row to select a task. It will trigger the update of other fields. + + **GRADIO BUG**: If you sort the columns the click will not match the + content. You have to sort back with the Idx column to align the click with + the order.""" + ) + refresh_results_button = gr.Button("↺", scale=0, size="sm") - **GRADIO BUG**: If you sort the columns the click will not match the - content. You have to sort back with the Idx column to align the click with - the order.""" - ) task_table = gr.DataFrame(height=500, show_label=False, interactive=False) with gr.Column(scale=2): @@ -387,6 +390,10 @@ def run_gradio(results_dir: Path): fn=refresh_exp_dir_choices, inputs=exp_dir_choice, outputs=exp_dir_choice ) + refresh_results_button.click( + fn=refresh_exp_dir_choices, inputs=exp_dir_choice, outputs=exp_dir_choice + ) + exp_dir_choice.change( fn=new_exp_dir, inputs=exp_dir_choice, @@ -853,17 +860,11 @@ def get_agent_report(result_df: pd.DataFrame): levels = list(range(result_df.index.nlevels)) if len(levels) == 1: - df = pd.DataFrame([{AGENT_NAME_KEY: result_df[AGENT_NAME_KEY].iloc[0]}]) - df.set_index(AGENT_NAME_KEY, inplace=True) - return df + result_df = result_df.set_index(AGENT_NAME_KEY, append=True) + levels = list(range(result_df.index.nlevels)) report = result_df.groupby(level=levels[1:]).apply(inspect_results.summarize) - # def rename_index(name: str): - # return name.replace("agent_args.flags.", "") - - # index_names = [rename_index(name) for name in report.index.names] - # report = report.rename_axis(index=index_names) return report @@ -874,7 +875,7 @@ def update_global_stats(): return stats -def new_exp_dir(exp_dir, progress=gr.Progress()): +def new_exp_dir(exp_dir, progress=gr.Progress(), just_refresh=False): if exp_dir == select_dir_instructions: return None, None @@ -887,7 +888,7 @@ def new_exp_dir(exp_dir, progress=gr.Progress()): info.exp_list_dir = info.results_dir / exp_dir info.result_df = inspect_results.load_result_df(info.exp_list_dir, progress_fn=progress.tqdm) - info.result_df = remove_args_frcom_col(info.result_df) + info.result_df = remove_args_from_col(info.result_df) agent_report = display_table(get_agent_report(info.result_df)) info.agent_id_keys = agent_report.index.names From b8da07b426e96a81a0338d943c884b2211f5bbe5 Mon Sep 17 00:00:00 2001 From: recursix Date: Sat, 21 Sep 2024 11:55:19 -0400 Subject: [PATCH 37/58] minor fix --- tests/experiments/test_reproducibility_util.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/experiments/test_reproducibility_util.py b/tests/experiments/test_reproducibility_util.py index 1836187c..04c88b4a 100644 --- a/tests/experiments/test_reproducibility_util.py +++ b/tests/experiments/test_reproducibility_util.py @@ -90,6 +90,5 @@ def test_save_reproducibility_info(): if __name__ == "__main__": # test_set_temp() - # test_get_reproducibility_info() + test_get_reproducibility_info("miniwob") test_save_reproducibility_info() - pass From 1ecaf9b5e3f9f9dd7eba9e33ca41f9150306c2ab Mon Sep 17 00:00:00 2001 From: recursix Date: Sat, 21 Sep 2024 11:55:32 -0400 Subject: [PATCH 38/58] addming a comment field --- src/agentlab/experiments/reproducibility_util.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/agentlab/experiments/reproducibility_util.py b/src/agentlab/experiments/reproducibility_util.py index 50cc6df2..a06b1536 100644 --- a/src/agentlab/experiments/reproducibility_util.py +++ b/src/agentlab/experiments/reproducibility_util.py @@ -160,6 +160,7 @@ def _get_git_info(module, changes_white_list=()) -> tuple[str, list[tuple[str, P def get_reproducibility_info( agent_name, benchmark_name, + comment=None, changes_white_list=( # Files that are often modified during experiments but do not affect reproducibility "*/reproducibility_script.py", "*reproducibility_journal.csv", @@ -177,6 +178,7 @@ def get_reproducibility_info( "git_user": _get_git_username(_get_repo(agentlab)), "agent_name": agent_name, "benchmark": benchmark_name, + "comment": comment, "benchmark_version": _get_benchmark_version(benchmark_name), "date": datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), "os": f"{platform.system()} ({platform.version()})", @@ -226,8 +228,12 @@ def _assert_compatible(info: dict, old_info: dict): ) -def write_reproducibility_info(study_dir, agent_name, benchmark_name, ignore_changes=False): - info = get_reproducibility_info(agent_name, benchmark_name, ignore_changes=ignore_changes) +def write_reproducibility_info( + study_dir, agent_name, benchmark_name, comment=None, ignore_changes=False +): + info = get_reproducibility_info( + agent_name, benchmark_name, comment, ignore_changes=ignore_changes + ) return save_reproducibility_info(study_dir, info) From 5aba9bc180e8d6e08759f79401aee9422fb0ea6b Mon Sep 17 00:00:00 2001 From: recursix Date: Sat, 21 Sep 2024 11:55:50 -0400 Subject: [PATCH 39/58] new agent --- src/agentlab/experiments/reproducibility_script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentlab/experiments/reproducibility_script.py b/src/agentlab/experiments/reproducibility_script.py index 36cd286c..6b2b99b0 100644 --- a/src/agentlab/experiments/reproducibility_script.py +++ b/src/agentlab/experiments/reproducibility_script.py @@ -17,7 +17,7 @@ if __name__ == "__main__": - agent_args = set_temp(AGENT_4o_MINI) + agent_args = set_temp(AGENT_4o) ## select the benchmark to run on benchmark = "miniwob" From 7bf424eca1b42dd6b699db346b7f319e48959f9f Mon Sep 17 00:00:00 2001 From: recursix Date: Sat, 21 Sep 2024 15:12:57 -0400 Subject: [PATCH 40/58] another test with GPT-4o --- reproducibility_journal.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv index 956bc9b1..32386c6a 100644 --- a/reproducibility_journal.csv +++ b/reproducibility_journal.csv @@ -4,3 +4,4 @@ recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_tiny_test,0.6.3,2024-09-19_ M: src/agentlab/experiments/task_collections.py",0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-09-20_07-16-21,0.546,0.02,0,625/625,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,295f01005faf8f2c73a31be6a18cec19d563b54b,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, recursix,GenericAgent-gpt-4o-2024-05-13,miniwob,0.6.3,2024-09-20_22-09-43,0.656,0.019,0,625/625,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,f6216486d5faac2c8b3fb0a63e114e5a4bafde47,,0.6.4,8cef8fe34940ff490d0cc06b0c8f100180d09d43, +recursix,GenericAgent-gpt-4o-2024-05-13,miniwob,0.6.3,2024-09-21_12-04-39,0.656,0.019,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,fe561b93c5f053e9f9625358862f542523b5e14a,,0.7.0,ed6d6992ef64bfb91aca7002d33cb6ed5ec031ef, From 7e0ab030e6078d945f8531a68e3d410af083b9fd Mon Sep 17 00:00:00 2001 From: recursix Date: Sat, 21 Sep 2024 15:16:59 -0400 Subject: [PATCH 41/58] adding llama3 from openrouter --- src/agentlab/agents/generic_agent/agent_configs.py | 12 ++++++++---- src/agentlab/llm/llm_configs.py | 7 +++++++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/agentlab/agents/generic_agent/agent_configs.py b/src/agentlab/agents/generic_agent/agent_configs.py index 8dbd38d9..57b2c18b 100644 --- a/src/agentlab/agents/generic_agent/agent_configs.py +++ b/src/agentlab/agents/generic_agent/agent_configs.py @@ -96,7 +96,7 @@ ) # llama3-70b default config -FLAGS_70B = GenericPromptFlags( +FLAGS_LLAMA3_70B = GenericPromptFlags( obs=dp.ObsFlags( use_html=False, use_ax_tree=True, @@ -135,9 +135,13 @@ add_missparsed_messages=True, ) -AGENT_70B = GenericAgentArgs( - chat_model_args=CHAT_MODEL_ARGS_DICT["meta-llama/Meta-Llama-3-70B-Instruct"], - flags=FLAGS_70B, +AGENT_LLAMA3_70B = GenericAgentArgs( + chat_model_args=CHAT_MODEL_ARGS_DICT["openrouter/meta-llama/Meta-Llama-3-70B-Instruct"], + flags=FLAGS_LLAMA3_70B, +) +AGENT_LLAMA31_70B = GenericAgentArgs( + chat_model_args=CHAT_MODEL_ARGS_DICT["openrouter/meta-llama/llama-3.1-70b-instruct"], + flags=FLAGS_LLAMA3_70B, ) FLAGS_8B = GenericPromptFlags( diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py index 9fbb0ee7..076ee687 100644 --- a/src/agentlab/llm/llm_configs.py +++ b/src/agentlab/llm/llm_configs.py @@ -112,6 +112,13 @@ max_new_tokens=4000, temperature=1e-1, ), + "openrouter/meta-llama/llama-3-70b-instruct": OpenRouterModelArgs( + model_name="meta-llama/llama-3-70b-instruct", + max_total_tokens=128_000, + max_input_tokens=40_000, + max_new_tokens=4000, + temperature=1e-1, + ), "openrouter/meta-llama/llama-3.1-8b-instruct:free": OpenRouterModelArgs( model_name="meta-llama/llama-3.1-8b-instruct:free", max_total_tokens=128_000, From 03eae3276c03b32d41f13908055a68d8e5e3a254 Mon Sep 17 00:00:00 2001 From: recursix Date: Sat, 21 Sep 2024 15:32:53 -0400 Subject: [PATCH 42/58] fix naming --- src/agentlab/agents/generic_agent/__init__.py | 4 ++-- src/agentlab/agents/generic_agent/agent_configs.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/agentlab/agents/generic_agent/__init__.py b/src/agentlab/agents/generic_agent/__init__.py index fec74910..d34c15f7 100644 --- a/src/agentlab/agents/generic_agent/__init__.py +++ b/src/agentlab/agents/generic_agent/__init__.py @@ -1,7 +1,7 @@ from .agent_configs import ( AGENT_3_5, AGENT_8B, - AGENT_70B, + AGENT_LLAMA3_70B, AGENT_CUSTOM, RANDOM_SEARCH_AGENT, AGENT_4o, @@ -14,7 +14,7 @@ "AGENT_4o", "AGENT_4o_MINI", "AGENT_4o_VISION", - "AGENT_70B", + "AGENT_LLAMA3_70B", "AGENT_8B", "RANDOM_SEARCH_AGENT", "AGENT_CUSTOM", diff --git a/src/agentlab/agents/generic_agent/agent_configs.py b/src/agentlab/agents/generic_agent/agent_configs.py index 57b2c18b..4c0a39a7 100644 --- a/src/agentlab/agents/generic_agent/agent_configs.py +++ b/src/agentlab/agents/generic_agent/agent_configs.py @@ -136,7 +136,7 @@ ) AGENT_LLAMA3_70B = GenericAgentArgs( - chat_model_args=CHAT_MODEL_ARGS_DICT["openrouter/meta-llama/Meta-Llama-3-70B-Instruct"], + chat_model_args=CHAT_MODEL_ARGS_DICT["openrouter/meta-llama/llama-3-70b-instruct"], flags=FLAGS_LLAMA3_70B, ) AGENT_LLAMA31_70B = GenericAgentArgs( From 796c37eb54ac9c18778939b3b501feee92dd2154 Mon Sep 17 00:00:00 2001 From: recursix Date: Mon, 23 Sep 2024 16:01:02 -0400 Subject: [PATCH 43/58] unused import --- src/agentlab/agents/generic_agent/generic_agent.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/agentlab/agents/generic_agent/generic_agent.py b/src/agentlab/agents/generic_agent/generic_agent.py index c4502442..5a14f927 100644 --- a/src/agentlab/agents/generic_agent/generic_agent.py +++ b/src/agentlab/agents/generic_agent/generic_agent.py @@ -7,7 +7,6 @@ from agentlab.agents import dynamic_prompting as dp from agentlab.agents.agent_args import AgentArgs -from agentlab.agents.utils import openai_monitored_agent from agentlab.llm.chat_api import BaseModelArgs from agentlab.llm.llm_utils import RetryError, retry_raise from agentlab.llm.tracking import cost_tracker_decorator From 8fc49e9e3ea4606904f226eda4d2f12950c585a1 Mon Sep 17 00:00:00 2001 From: recursix Date: Mon, 23 Sep 2024 16:03:07 -0400 Subject: [PATCH 44/58] new summary tools and remove "_args" from columns in results --- src/agentlab/analyze/inspect_results.ipynb | 542 +++++++++++++++++- src/agentlab/analyze/inspect_results.py | 162 ++++-- .../experiments/reproducibility_util.py | 43 +- tests/agents/test_agent.py | 2 +- tests/analyze/test_inspect_results.py | 35 ++ tests/experiments/test_launch_exp.py | 12 +- 6 files changed, 727 insertions(+), 69 deletions(-) create mode 100644 tests/analyze/test_inspect_results.py diff --git a/src/agentlab/analyze/inspect_results.ipynb b/src/agentlab/analyze/inspect_results.ipynb index 673c0b10..b0a38e37 100644 --- a/src/agentlab/analyze/inspect_results.ipynb +++ b/src/agentlab/analyze/inspect_results.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -13,6 +13,544 @@ "%autoreload 2" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### load all summaries" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Searching experiments directories.: 0it [00:00, ?it/s]\n", + "Searching experiments directories.: 0it [00:00, ?it/s]\n", + "Searching experiments directories.: 0it [00:00, ?it/s]\n", + "Searching experiments directories.: 0it [00:00, ?it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agent.agent_nameenv.benchmarkavg_rewardstd_erravg_stepsn_completedn_err
study_dir
2024-09-21_15-38-29_GenericAgent-meta-llama_llama-3-70b-instruct_on_miniwobGenericAgent-meta-llama_llama-3-70b-instructminiwob0.5920.0204.323625/6252
2024-09-21_15-34-02_GenericAgent-meta-llama_llama-3-70b-instruct_on_miniwobGenericAgent-meta-llama_llama-3-70b-instructminiwob0.0000.0000.000625/625625
2024-09-21_12-04-39_GenericAgent-gpt-4o-2024-05-13_on_miniwobGenericAgent-gpt-4o-2024-05-13miniwob0.6560.0194.138625/6250
2024-09-20_22-09-43_GenericAgent-gpt-4o-2024-05-13_on_miniwobGenericAgent-gpt-4o-2024-05-13miniwob0.6560.0194.019625/6250
2024-09-20_07-16-21_GenericAgent-gpt-4o-mini-2024-07-18_on_miniwobGenericAgent-gpt-4o-mini-2024-07-18miniwob0.5460.0204.981625/6250
2024-09-19_21-53-57_GenericAgent-gpt-4o-mini-2024-07-18_on_miniwobGenericAgent-gpt-4o-mini-2024-07-18miniwob0.5140.0385.232177/6250
2024-09-19_21-51-12_GenericAgent-gpt-4o-mini-2024-07-18_on_miniwobGenericAgent-gpt-4o-mini-2024-07-18miniwob0.0000.00010.0003/6250
2024-09-19_21-30-36_GenericAgent-gpt-4o-mini-2024-07-18_on_miniwob_tiny_testGenericAgent-gpt-4o-mini-2024-07-18miniwob1.0000.0002.7504/40
2024-09-19_21-28-58_GenericAgent-gpt-4o-mini-2024-07-18_on_miniwob_tiny_testGenericAgent-gpt-4o-mini-2024-07-18miniwob1.0000.0002.7504/40
2024-09-19_21-07-34_GenericAgent-gpt-4o-mini-2024-07-18_on_miniwob_tiny_testGenericAgent-gpt-4o-mini-2024-07-18miniwob0.7500.2172.7504/40
2024-09-19_18-37-00_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob1.0000.0002.7504/40
2024-09-19_11-45-30_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob1.0000.0002.7504/40
2024-09-18_11-56-51_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob1.0000.0002.0003/30
2024-09-18_11-47-33_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob1.0000.0002.0003/30
2024-09-13_14-19-10_GenericAgent-gpt-4o-mini_on_miniwobGenericAgent-gpt-4o-miniminiwob0.5360.0205.083625/6250
2024-09-13_14-18-03_GenericAgent-gpt-4o-mini_on_miniwobGenericAgent-gpt-4o-miniminiwobNaNNaNNaN0/6250
2024-09-13_09-46-43_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob1.0000.0002.7504/40
2024-09-13_09-32-27_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob1.0000.0003.0002/40
2024-09-13_09-10-48_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob1.0000.0002.7504/40
2024-09-12_22-22-10_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob0.7500.2172.5004/41
2024-09-12_22-12-28_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob1.0000.0002.7504/40
2024-09-12_22-10-47_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob0.7500.2172.5004/41
2024-09-12_15-25-03_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwobNaNNaNNaN0/40
2024-09-12_15-14-36_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob0.7500.2171.7504/41
2024-09-12_15-08-05_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob0.7500.2172.5004/41
2024-09-12_14-44-03_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob0.7500.2172.5004/41
2024-09-12_14-37-09_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob0.7500.2171.5004/41
2024-09-12_13-55-50_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob0.7500.2171.5004/41
2024-09-12_13-12-54_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob1.0000.0002.7504/40
2024-09-12_12-08-38_GenericAgentArgs_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob1.0000.0002.7504/40
2024-09-12_12-01-32_GenericAgentArgs_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob0.7500.2172.7504/40
2024-09-12_08-39-16_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob0.7500.2172.5004/41
\n", + "
" + ], + "text/plain": [ + " agent.agent_name \\\n", + "study_dir \n", + "2024-09-21_15-38-29_GenericAgent-meta-llama_lla... GenericAgent-meta-llama_llama-3-70b-instruct \n", + "2024-09-21_15-34-02_GenericAgent-meta-llama_lla... GenericAgent-meta-llama_llama-3-70b-instruct \n", + "2024-09-21_12-04-39_GenericAgent-gpt-4o-2024-05... GenericAgent-gpt-4o-2024-05-13 \n", + "2024-09-20_22-09-43_GenericAgent-gpt-4o-2024-05... GenericAgent-gpt-4o-2024-05-13 \n", + "2024-09-20_07-16-21_GenericAgent-gpt-4o-mini-20... GenericAgent-gpt-4o-mini-2024-07-18 \n", + "2024-09-19_21-53-57_GenericAgent-gpt-4o-mini-20... GenericAgent-gpt-4o-mini-2024-07-18 \n", + "2024-09-19_21-51-12_GenericAgent-gpt-4o-mini-20... GenericAgent-gpt-4o-mini-2024-07-18 \n", + "2024-09-19_21-30-36_GenericAgent-gpt-4o-mini-20... GenericAgent-gpt-4o-mini-2024-07-18 \n", + "2024-09-19_21-28-58_GenericAgent-gpt-4o-mini-20... GenericAgent-gpt-4o-mini-2024-07-18 \n", + "2024-09-19_21-07-34_GenericAgent-gpt-4o-mini-20... GenericAgent-gpt-4o-mini-2024-07-18 \n", + "2024-09-19_18-37-00_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", + "2024-09-19_11-45-30_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", + "2024-09-18_11-56-51_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", + "2024-09-18_11-47-33_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", + "2024-09-13_14-19-10_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", + "2024-09-13_14-18-03_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", + "2024-09-13_09-46-43_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", + "2024-09-13_09-32-27_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", + "2024-09-13_09-10-48_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", + "2024-09-12_22-22-10_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", + "2024-09-12_22-12-28_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", + "2024-09-12_22-10-47_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", + "2024-09-12_15-25-03_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", + "2024-09-12_15-14-36_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", + "2024-09-12_15-08-05_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", + "2024-09-12_14-44-03_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", + "2024-09-12_14-37-09_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", + "2024-09-12_13-55-50_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", + "2024-09-12_13-12-54_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", + "2024-09-12_12-08-38_GenericAgentArgs_on_miniwob... GenericAgent-gpt-4o-mini \n", + "2024-09-12_12-01-32_GenericAgentArgs_on_miniwob... GenericAgent-gpt-4o-mini \n", + "2024-09-12_08-39-16_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", + "\n", + " env.benchmark avg_reward \\\n", + "study_dir \n", + "2024-09-21_15-38-29_GenericAgent-meta-llama_lla... miniwob 0.592 \n", + "2024-09-21_15-34-02_GenericAgent-meta-llama_lla... miniwob 0.000 \n", + "2024-09-21_12-04-39_GenericAgent-gpt-4o-2024-05... miniwob 0.656 \n", + "2024-09-20_22-09-43_GenericAgent-gpt-4o-2024-05... miniwob 0.656 \n", + "2024-09-20_07-16-21_GenericAgent-gpt-4o-mini-20... miniwob 0.546 \n", + "2024-09-19_21-53-57_GenericAgent-gpt-4o-mini-20... miniwob 0.514 \n", + "2024-09-19_21-51-12_GenericAgent-gpt-4o-mini-20... miniwob 0.000 \n", + "2024-09-19_21-30-36_GenericAgent-gpt-4o-mini-20... miniwob 1.000 \n", + "2024-09-19_21-28-58_GenericAgent-gpt-4o-mini-20... miniwob 1.000 \n", + "2024-09-19_21-07-34_GenericAgent-gpt-4o-mini-20... miniwob 0.750 \n", + "2024-09-19_18-37-00_GenericAgent-gpt-4o-mini_on... miniwob 1.000 \n", + "2024-09-19_11-45-30_GenericAgent-gpt-4o-mini_on... miniwob 1.000 \n", + "2024-09-18_11-56-51_GenericAgent-gpt-4o-mini_on... miniwob 1.000 \n", + "2024-09-18_11-47-33_GenericAgent-gpt-4o-mini_on... miniwob 1.000 \n", + "2024-09-13_14-19-10_GenericAgent-gpt-4o-mini_on... miniwob 0.536 \n", + "2024-09-13_14-18-03_GenericAgent-gpt-4o-mini_on... miniwob NaN \n", + "2024-09-13_09-46-43_GenericAgent-gpt-4o-mini_on... miniwob 1.000 \n", + "2024-09-13_09-32-27_GenericAgent-gpt-4o-mini_on... miniwob 1.000 \n", + "2024-09-13_09-10-48_GenericAgent-gpt-4o-mini_on... miniwob 1.000 \n", + "2024-09-12_22-22-10_GenericAgent-gpt-4o-mini_on... miniwob 0.750 \n", + "2024-09-12_22-12-28_GenericAgent-gpt-4o-mini_on... miniwob 1.000 \n", + "2024-09-12_22-10-47_GenericAgent-gpt-4o-mini_on... miniwob 0.750 \n", + "2024-09-12_15-25-03_GenericAgent-gpt-4o-mini_on... miniwob NaN \n", + "2024-09-12_15-14-36_GenericAgent-gpt-4o-mini_on... miniwob 0.750 \n", + "2024-09-12_15-08-05_GenericAgent-gpt-4o-mini_on... miniwob 0.750 \n", + "2024-09-12_14-44-03_GenericAgent-gpt-4o-mini_on... miniwob 0.750 \n", + "2024-09-12_14-37-09_GenericAgent-gpt-4o-mini_on... miniwob 0.750 \n", + "2024-09-12_13-55-50_GenericAgent-gpt-4o-mini_on... miniwob 0.750 \n", + "2024-09-12_13-12-54_GenericAgent-gpt-4o-mini_on... miniwob 1.000 \n", + "2024-09-12_12-08-38_GenericAgentArgs_on_miniwob... miniwob 1.000 \n", + "2024-09-12_12-01-32_GenericAgentArgs_on_miniwob... miniwob 0.750 \n", + "2024-09-12_08-39-16_GenericAgent-gpt-4o-mini_on... miniwob 0.750 \n", + "\n", + " std_err avg_steps \\\n", + "study_dir \n", + "2024-09-21_15-38-29_GenericAgent-meta-llama_lla... 0.020 4.323 \n", + "2024-09-21_15-34-02_GenericAgent-meta-llama_lla... 0.000 0.000 \n", + "2024-09-21_12-04-39_GenericAgent-gpt-4o-2024-05... 0.019 4.138 \n", + "2024-09-20_22-09-43_GenericAgent-gpt-4o-2024-05... 0.019 4.019 \n", + "2024-09-20_07-16-21_GenericAgent-gpt-4o-mini-20... 0.020 4.981 \n", + "2024-09-19_21-53-57_GenericAgent-gpt-4o-mini-20... 0.038 5.232 \n", + "2024-09-19_21-51-12_GenericAgent-gpt-4o-mini-20... 0.000 10.000 \n", + "2024-09-19_21-30-36_GenericAgent-gpt-4o-mini-20... 0.000 2.750 \n", + "2024-09-19_21-28-58_GenericAgent-gpt-4o-mini-20... 0.000 2.750 \n", + "2024-09-19_21-07-34_GenericAgent-gpt-4o-mini-20... 0.217 2.750 \n", + "2024-09-19_18-37-00_GenericAgent-gpt-4o-mini_on... 0.000 2.750 \n", + "2024-09-19_11-45-30_GenericAgent-gpt-4o-mini_on... 0.000 2.750 \n", + "2024-09-18_11-56-51_GenericAgent-gpt-4o-mini_on... 0.000 2.000 \n", + "2024-09-18_11-47-33_GenericAgent-gpt-4o-mini_on... 0.000 2.000 \n", + "2024-09-13_14-19-10_GenericAgent-gpt-4o-mini_on... 0.020 5.083 \n", + "2024-09-13_14-18-03_GenericAgent-gpt-4o-mini_on... NaN NaN \n", + "2024-09-13_09-46-43_GenericAgent-gpt-4o-mini_on... 0.000 2.750 \n", + "2024-09-13_09-32-27_GenericAgent-gpt-4o-mini_on... 0.000 3.000 \n", + "2024-09-13_09-10-48_GenericAgent-gpt-4o-mini_on... 0.000 2.750 \n", + "2024-09-12_22-22-10_GenericAgent-gpt-4o-mini_on... 0.217 2.500 \n", + "2024-09-12_22-12-28_GenericAgent-gpt-4o-mini_on... 0.000 2.750 \n", + "2024-09-12_22-10-47_GenericAgent-gpt-4o-mini_on... 0.217 2.500 \n", + "2024-09-12_15-25-03_GenericAgent-gpt-4o-mini_on... NaN NaN \n", + "2024-09-12_15-14-36_GenericAgent-gpt-4o-mini_on... 0.217 1.750 \n", + "2024-09-12_15-08-05_GenericAgent-gpt-4o-mini_on... 0.217 2.500 \n", + "2024-09-12_14-44-03_GenericAgent-gpt-4o-mini_on... 0.217 2.500 \n", + "2024-09-12_14-37-09_GenericAgent-gpt-4o-mini_on... 0.217 1.500 \n", + "2024-09-12_13-55-50_GenericAgent-gpt-4o-mini_on... 0.217 1.500 \n", + "2024-09-12_13-12-54_GenericAgent-gpt-4o-mini_on... 0.000 2.750 \n", + "2024-09-12_12-08-38_GenericAgentArgs_on_miniwob... 0.000 2.750 \n", + "2024-09-12_12-01-32_GenericAgentArgs_on_miniwob... 0.217 2.750 \n", + "2024-09-12_08-39-16_GenericAgent-gpt-4o-mini_on... 0.217 2.500 \n", + "\n", + " n_completed n_err \n", + "study_dir \n", + "2024-09-21_15-38-29_GenericAgent-meta-llama_lla... 625/625 2 \n", + "2024-09-21_15-34-02_GenericAgent-meta-llama_lla... 625/625 625 \n", + "2024-09-21_12-04-39_GenericAgent-gpt-4o-2024-05... 625/625 0 \n", + "2024-09-20_22-09-43_GenericAgent-gpt-4o-2024-05... 625/625 0 \n", + "2024-09-20_07-16-21_GenericAgent-gpt-4o-mini-20... 625/625 0 \n", + "2024-09-19_21-53-57_GenericAgent-gpt-4o-mini-20... 177/625 0 \n", + "2024-09-19_21-51-12_GenericAgent-gpt-4o-mini-20... 3/625 0 \n", + "2024-09-19_21-30-36_GenericAgent-gpt-4o-mini-20... 4/4 0 \n", + "2024-09-19_21-28-58_GenericAgent-gpt-4o-mini-20... 4/4 0 \n", + "2024-09-19_21-07-34_GenericAgent-gpt-4o-mini-20... 4/4 0 \n", + "2024-09-19_18-37-00_GenericAgent-gpt-4o-mini_on... 4/4 0 \n", + "2024-09-19_11-45-30_GenericAgent-gpt-4o-mini_on... 4/4 0 \n", + "2024-09-18_11-56-51_GenericAgent-gpt-4o-mini_on... 3/3 0 \n", + "2024-09-18_11-47-33_GenericAgent-gpt-4o-mini_on... 3/3 0 \n", + "2024-09-13_14-19-10_GenericAgent-gpt-4o-mini_on... 625/625 0 \n", + "2024-09-13_14-18-03_GenericAgent-gpt-4o-mini_on... 0/625 0 \n", + "2024-09-13_09-46-43_GenericAgent-gpt-4o-mini_on... 4/4 0 \n", + "2024-09-13_09-32-27_GenericAgent-gpt-4o-mini_on... 2/4 0 \n", + "2024-09-13_09-10-48_GenericAgent-gpt-4o-mini_on... 4/4 0 \n", + "2024-09-12_22-22-10_GenericAgent-gpt-4o-mini_on... 4/4 1 \n", + "2024-09-12_22-12-28_GenericAgent-gpt-4o-mini_on... 4/4 0 \n", + "2024-09-12_22-10-47_GenericAgent-gpt-4o-mini_on... 4/4 1 \n", + "2024-09-12_15-25-03_GenericAgent-gpt-4o-mini_on... 0/4 0 \n", + "2024-09-12_15-14-36_GenericAgent-gpt-4o-mini_on... 4/4 1 \n", + "2024-09-12_15-08-05_GenericAgent-gpt-4o-mini_on... 4/4 1 \n", + "2024-09-12_14-44-03_GenericAgent-gpt-4o-mini_on... 4/4 1 \n", + "2024-09-12_14-37-09_GenericAgent-gpt-4o-mini_on... 4/4 1 \n", + "2024-09-12_13-55-50_GenericAgent-gpt-4o-mini_on... 4/4 1 \n", + "2024-09-12_13-12-54_GenericAgent-gpt-4o-mini_on... 4/4 0 \n", + "2024-09-12_12-08-38_GenericAgentArgs_on_miniwob... 4/4 0 \n", + "2024-09-12_12-01-32_GenericAgentArgs_on_miniwob... 4/4 0 \n", + "2024-09-12_08-39-16_GenericAgent-gpt-4o-mini_on... 4/4 1 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inspect_results.get_all_summaries(RESULTS_DIR, ignore_cache=False)\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -159,7 +697,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.1.-1" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index d69b1656..07d4176e 100644 --- a/src/agentlab/analyze/inspect_results.py +++ b/src/agentlab/analyze/inspect_results.py @@ -1,7 +1,9 @@ import fnmatch import io +import json import random import re +import traceback import warnings from collections import defaultdict from datetime import datetime @@ -35,6 +37,9 @@ pd.set_option("display.multi_sparse", False) +AGENT_NAME_KEY = "agent.agent_name" +TASK_KEY = "env.task_name" + def get_constants_and_variables(df: pd.DataFrame, drop_constants: bool = False): """Filter out constants from the dataframe.""" @@ -58,12 +63,12 @@ def get_constants_and_variables(df: pd.DataFrame, drop_constants: bool = False): def set_index_from_variables( df: pd.DataFrame, - index_white_list=("agent_args.*",), + index_white_list=("agent.*",), index_black_list=("*model_url*", "*extra*"), - task_key="env_args.task_name", - force_at_leaste_one_variable=False, + task_key=TASK_KEY, + add_agent_and_benchmark=True, ): - """Set the index, inplace, to env_args.task_name and all variables. + """Set the index, inplace, to env.task_name and all variables. Introspects `df` to find all fields that are variable and set the index to those fields. This will allow to easily groupby and compare results. To @@ -79,17 +84,23 @@ def set_index_from_variables( task_key: The key to use as the first level of the index. force_at_leaste_one_variable: If True, force at least one variable in the index. If no variable is found, the index will be set to - task_key + "agent_args.agent_name". + task_key + "agent.agent_name". """ df.reset_index(inplace=True) constants, variables, _ = get_constants_and_variables(df) index_variables = [] + if add_agent_and_benchmark: + index_variables.append("agent.agent_name") + if "env.benchmark" not in df.columns: + df["env.benchmark"] = df[TASK_KEY].map(_benchmark_from_task_name) + index_variables.append("env.benchmark") + for var in variables: white = any([fnmatch.fnmatch(var, pattern) for pattern in index_white_list]) black = any([fnmatch.fnmatch(var, pattern) for pattern in index_black_list]) - if white and not black: + if white and (not black) and (not var in index_variables): index_variables.append(var) for var in index_variables: @@ -99,10 +110,7 @@ def set_index_from_variables( ) df[var] = df[var].fillna("None") - if len(index_variables) == 0 and force_at_leaste_one_variable: - if "agent_args.agent_name" in constants: - index_variables = ["agent_args.agent_name"] - # agent_variables = [var for var in variables if var.startswith("agent_args.")] + # agent_variables = [var for var in variables if var.startswith("agent.")] df.set_index([task_key] + index_variables, inplace=True) df.sort_index(inplace=True) @@ -112,19 +120,20 @@ def load_result_df( progress_fn=tqdm, set_index=True, result_df=None, - index_white_list=("agent_args.*",), + index_white_list=("agent.*",), index_black_list=("*model_url*", "*extra*"), + remove_args_suffix=True, ): """Load the result dataframe. - Will set the index to env_args.task_name and all columens that are not constant and - starts with agent_args. This will allow to easily groupby and compare + Will set the index to env.task_name and all columens that are not constant and + starts with agent. This will allow to easily groupby and compare results. This index can be changed later using df.set_index. Args: exp_dir: Path to the experiment directory progress_fn: Progress function to use when loading the results - set_index: If True, set the index to env_args.task_name and variable agent_args + set_index: If True, set the index to env.task_name and variable agent result_df: If not None, speed up the loading process by reusing alreading loaded objects. index_white_list: List of wildard patterns to match variables that @@ -148,6 +157,10 @@ def load_result_df( result_list = progress_fn(result_list, desc="Loading results") df = pd.DataFrame([exp_result.get_exp_record() for exp_result in result_list]) + + if remove_args_suffix: + df.columns = [col.replace("_args", "") for col in df.columns] + if set_index: set_index_from_variables(df, index_white_list, index_black_list) return df @@ -211,9 +224,7 @@ def report_constant_and_variables(df, show_stack_traces=True): print(f" ...\n") -def get_bootstrap( - df, metric, reduce_fn=np.nanmean, n_bootstrap=100, group_by="env_args.task_name", prior=0.5 -): +def get_bootstrap(df, metric, reduce_fn=np.nanmean, n_bootstrap=100, group_by=TASK_KEY, prior=0.5): """Get the stratified bootstrap mean and std for the given metric.""" grouped_df = df.reset_index(inplace=False).groupby(group_by) array = convert_df_to_array(grouped_df, metric=metric, threshold=0.7) @@ -390,7 +401,7 @@ def add_order(row): def global_report( result_df: pd.DataFrame, reduce_fn=summarize, - rename_index=lambda name: name.replace("agent_args.flags.", ""), + rename_index=lambda name: name.replace("agent.flags.", ""), ): """Produce a report that summarize all tasks and all episodes for each agent. @@ -400,7 +411,7 @@ def global_report( reduce_fn: The function to use to reduce the sub dataframe. By default this is summarize. rename_index: Function to rename the index. By default we remove the prefix - "agent_args.flags." + "agent.flags." Returns: pd.DataFrame: The report @@ -751,30 +762,103 @@ def _categorize_error(row): return error_report -def split_by_key(df: pd.DataFrame, key, force_at_leaste_one_variable=True): - """Return a dict of dataframes spearted by the given key.""" - # check if key in df - if not (key in df.columns): - df = df.reset_index(key, inplace=False) +# =============== + + +def _benchmark_from_task_name(task_name: str): + """Extract the benchmark from the task name. + TODO should be more robost, e.g. handle workarna.L1, workarena.L2, etc. + """ + return task_name.split(".")[0] + + +def summarize_study(result_df: pd.DataFrame) -> pd.DataFrame: + """Create a summary of the study. + + Similar to global report, but handles single agent differently. + """ + + levels = list(range(result_df.index.nlevels)) + return result_df.groupby(level=levels[1:]).apply(summarize) + + +def get_study_summary(study_dir: Path, ignore_cache=False, sentinel=None) -> pd.DataFrame: + """Get the cached study summary for the given study directory. + + The cashe is based on the modified times of all the files in the study. + + Args: + study_dir: The study directory to summarize + ignore_cache: If True, ignore the cache and recompute the summary + sentinel: Captures internal values for unit testing. + + Returns: + pd.DataFrame: The study summary + """ + study_dir = Path(study_dir) + is_stale, mtimes, summary_path, mtimes_path = _is_stale(study_dir) + + if not ignore_cache: + if summary_path.exists() and not is_stale: + if sentinel is not None: + sentinel["from_cache"] = True + return pd.read_csv(summary_path) + + result_df = load_result_df(study_dir) + if result_df is None: + return None + + summary = summarize_study(result_df) + + summary.to_csv(summary_path) + mtimes_path.write_text(json.dumps(mtimes)) + + if sentinel is not None: + sentinel["from_cache"] = False + return summary + + +def get_all_summaries(results_dir: Path, skip_hidden=True, ignore_cache=False): + summaries = [] + for study_dir in results_dir.iterdir(): + if skip_hidden and study_dir.name.startswith("_"): + continue + + try: + summary = get_study_summary(study_dir, ignore_cache=ignore_cache) + if summary is not None: + # set as index + summary["study_dir"] = study_dir.name + summary.set_index("study_dir", inplace=True) + summaries.append(summary) + + except Exception as e: + traceback.print_exc() + continue + + summaries = pd.concat(summaries) + # reverse sort according to index + summaries.sort_index(ascending=False, inplace=True) + return summaries + - df_dict = {} - for value in df[key].unique(): - sub_df = df[df[key] == value].copy() - set_index_from_variables(sub_df, force_at_leaste_one_variable=force_at_leaste_one_variable) - df_dict[value] = sub_df +def _get_mtimes(dir: Path, pattern="[!_.]*", whitelist=()): + """Recursevly get all file's modif date""" + # use glob to get all files + files = list(dir.rglob(pattern)) + return {str(f.relative_to(dir)): f.stat().st_mtime for f in files if f not in whitelist} - return df_dict +def _is_stale(study_dir: Path): + summary_path = study_dir / "study_summary.csv" + mtimes_path = study_dir / "_last_modification_times.json" + mtimes = _get_mtimes(study_dir, whitelist=(summary_path,)) + if not mtimes_path.exists() or not summary_path.exists(): + return True, mtimes, summary_path, mtimes_path -# def set_task_category_as_index(result_df, task_category_map=TASK_CATEGORY_MAP): -# """Create task_category index from task_name if needed and re-assign index -# from variables using task_category.""" -# # rested index task_name (level 0) -# new_df = result_df.reset_index(inplace=False) -# if not "task_category" in new_df.columns: -# new_df["task_category"] = new_df["env_args.task_name"].map(task_category_map) -# set_index_from_variables(new_df, task_key="task_category") -# return new_df + mtimes_saved = json.loads(mtimes_path.read_text()) + if mtimes_saved == mtimes: + return False, mtimes, summary_path, mtimes_path def get_all_task_messages(exp_dir, max_n_exp=None): diff --git a/src/agentlab/experiments/reproducibility_util.py b/src/agentlab/experiments/reproducibility_util.py index a06b1536..70a1575a 100644 --- a/src/agentlab/experiments/reproducibility_util.py +++ b/src/agentlab/experiments/reproducibility_util.py @@ -273,30 +273,31 @@ def load_reproducibility_info(study_dir) -> dict[str]: def add_reward(info, study_dir, ignore_incomplete=False): result_df = inspect_results.load_result_df(study_dir) - report = inspect_results.global_report(result_df) + report = inspect_results.summarize_study(result_df) - if "[ALL TASKS]" in report.index: - assert isinstance(info["agent_name"], str) + if len(report) > 1: + raise ValueError("Multi agent not implemented yet") - n_err = report.loc["[ALL TASKS]", "n_err"].item() - n_completed, n_total = report.loc["[ALL TASKS]", "n_completed"].split("/") - if n_err > 0 and not ignore_incomplete: - raise ValueError( - f"Experiment has {n_err} errors. Please rerun the study and make sure all tasks are completed." - ) - if n_completed != n_total and not ignore_incomplete: - raise ValueError( - f"Experiment has {n_completed} completed tasks out of {n_total}. " - f"Please rerun the study and make sure all tasks are completed." - ) + assert isinstance(info["agent_name"], str) - for key in ("avg_reward", "std_err", "n_err", "n_completed"): - value = report.loc["[ALL TASKS]", key] - if hasattr(value, "item"): - value = value.item() - info[key] = value - else: - raise ValueError("Multi agent not implemented yet") + idx = report.index[0] + n_err = report.loc[idx, "n_err"].item() + n_completed, n_total = report.loc[idx, "n_completed"].split("/") + if n_err > 0 and not ignore_incomplete: + raise ValueError( + f"Experiment has {n_err} errors. Please rerun the study and make sure all tasks are completed." + ) + if n_completed != n_total and not ignore_incomplete: + raise ValueError( + f"Experiment has {n_completed} completed tasks out of {n_total}. " + f"Please rerun the study and make sure all tasks are completed." + ) + + for key in ("avg_reward", "std_err", "n_err", "n_completed"): + value = report.loc[idx, key] + if hasattr(value, "item"): + value = value.item() + info[key] = value def _get_csv_headers(file_path: str) -> list[str]: diff --git a/tests/agents/test_agent.py b/tests/agents/test_agent.py index 3b1be95a..f35a388f 100644 --- a/tests/agents/test_agent.py +++ b/tests/agents/test_agent.py @@ -34,7 +34,7 @@ def test_generic_agent(): "truncated": False, "err_msg": None, "stack_trace": None, - "agent_args.flags.obs.use_ax_tree": True, + "agent.flags.obs.use_ax_tree": True, } for key, target_val in target.items(): diff --git a/tests/analyze/test_inspect_results.py b/tests/analyze/test_inspect_results.py new file mode 100644 index 00000000..0bbc2922 --- /dev/null +++ b/tests/analyze/test_inspect_results.py @@ -0,0 +1,35 @@ +from pathlib import Path +import shutil +import tempfile + +import pandas as pd +from agentlab.analyze.inspect_results import get_study_summary + + +def test_get_study_summary(): + + with tempfile.TemporaryDirectory() as tmp_dir: + study_dir = Path(tmp_dir) / "test_study" + + study_dir_original = Path(__file__).parent.parent / "data" / "test_study" + + # recursively copy the study to the temp dir using shutil + shutil.copytree(study_dir_original, study_dir) + + sentinel = {} + + summary = get_study_summary(study_dir, sentinel=sentinel) + assert isinstance(summary, pd.DataFrame) + assert sentinel["from_cache"] == False + + summary = get_study_summary(study_dir, sentinel=sentinel) + assert isinstance(summary, pd.DataFrame) + assert sentinel["from_cache"] == True + + summary = get_study_summary(study_dir, ignore_cache=True, sentinel=sentinel) + assert isinstance(summary, pd.DataFrame) + assert sentinel["from_cache"] == False + + +if __name__ == "__main__": + test_get_study_summary() diff --git a/tests/experiments/test_launch_exp.py b/tests/experiments/test_launch_exp.py index b3e1fcd8..51149657 100644 --- a/tests/experiments/test_launch_exp.py +++ b/tests/experiments/test_launch_exp.py @@ -55,11 +55,11 @@ def test_launch_system(backend="dask"): assert row.err_msg is None assert row.cum_reward == 1.0 - global_report = inspect_results.global_report(results_df) - assert len(global_report) == 2 - assert global_report.std_err.iloc[0] == 0 - assert global_report.n_completed.iloc[0] == "3/3" - assert global_report.avg_reward.iloc[0] == 1.0 + study_summary = inspect_results.summarize_study(results_df) + assert len(study_summary) == 1 + assert study_summary.std_err.iloc[0] == 0 + assert study_summary.n_completed.iloc[0] == "3/3" + assert study_summary.avg_reward.iloc[0] == 1.0 def test_launch_system_joblib(): @@ -97,4 +97,4 @@ def test_4o_mini_on_miniwob_tiny_test(): if __name__ == "__main__": # test_4o_mini_on_miniwob_tiny_test() # test_launch_system() - test_launch_system_joblib() + test_launch_system_sequntial() From 7e2afd3031a14b9baded3f3aa95d2884be7d0a27 Mon Sep 17 00:00:00 2001 From: recursix Date: Mon, 23 Sep 2024 16:03:34 -0400 Subject: [PATCH 45/58] add Llama --- src/agentlab/experiments/reproducibility_script.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/agentlab/experiments/reproducibility_script.py b/src/agentlab/experiments/reproducibility_script.py index 6b2b99b0..777df3ac 100644 --- a/src/agentlab/experiments/reproducibility_script.py +++ b/src/agentlab/experiments/reproducibility_script.py @@ -1,6 +1,6 @@ import logging -from agentlab.agents.generic_agent import AGENT_4o, AGENT_4o_MINI +from agentlab.agents.generic_agent import AGENT_4o, AGENT_4o_MINI, AGENT_LLAMA3_70B from agentlab.analyze.inspect_results import get_most_recent_folder from agentlab.experiments import study_generators from agentlab.experiments.exp_utils import RESULTS_DIR @@ -17,7 +17,7 @@ if __name__ == "__main__": - agent_args = set_temp(AGENT_4o) + agent_args = set_temp(AGENT_LLAMA3_70B) ## select the benchmark to run on benchmark = "miniwob" From f08e47b525b07f9e929f680762b8b558e3cd4ab5 Mon Sep 17 00:00:00 2001 From: recursix Date: Mon, 23 Sep 2024 16:03:49 -0400 Subject: [PATCH 46/58] initial code for reproducibility agent --- .../generic_agent/reproducibility_agent.py | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 src/agentlab/agents/generic_agent/reproducibility_agent.py diff --git a/src/agentlab/agents/generic_agent/reproducibility_agent.py b/src/agentlab/agents/generic_agent/reproducibility_agent.py new file mode 100644 index 00000000..279b186b --- /dev/null +++ b/src/agentlab/agents/generic_agent/reproducibility_agent.py @@ -0,0 +1,63 @@ +from dataclasses import dataclass +import time +from .generic_agent import GenericAgentArgs, GenericAgent +from browsergym.experiments.loop import ExpResult +from browsergym.experiments.agent import AgentInfo + + +class ReproChatModel: + """A chat model that reproduces a conversation. + + Args: + messages (list): A list of messages previously executed. + delay (int): A delay to simulate the time it takes to generate a response. + """ + + def __init__(self, messages, delay=1) -> None: + self.messages = messages + self.delay = delay + + def invoke(self, messages): + time.sleep(self.delay) + # return the next message in the list + return self.messages[len(messages)] + + +@dataclass +class ReproAgentArgs(GenericAgentArgs): + + repro_dir: str = None + + def make_agent(self): + return ReproAgent(self.chat_model_args, self.flags, self.max_retry, self.repro_dir) + + +class ReproAgent(GenericAgent): + + def __init__( + self, + chat_model_args, + flags, + max_retry=4, + repro_dir=None, + ): + self.exp_result = ExpResult(repro_dir) + super().__init__(chat_model_args, flags, max_retry) + + def get_action(self, obs): + + # replace the chat model with a reproducible chat that will mimic the + # same answers + step = len(self.actions) + step_info = self.exp_result.get_step_info(step) + chat_messages = step_info["agent_info"]["chat_messages"] + self.chat_llm = ReproChatModel(chat_messages) + + action, agent_info = super().get_action(obs) + + return _make_agent_stats(action, agent_info, step_info) + + +def _make_agent_stats(action, agent_info, step_info): + # TODO + return action, agent_info From f7494cb375c9685a30cb011cd2ce04d00953474e Mon Sep 17 00:00:00 2001 From: recursix Date: Wed, 25 Sep 2024 14:07:26 +0000 Subject: [PATCH 47/58] adjust inspect results --- src/agentlab/analyze/inspect_results.ipynb | 535 +-------------------- src/agentlab/analyze/inspect_results.py | 90 ++-- 2 files changed, 63 insertions(+), 562 deletions(-) diff --git a/src/agentlab/analyze/inspect_results.ipynb b/src/agentlab/analyze/inspect_results.ipynb index b0a38e37..6db09092 100644 --- a/src/agentlab/analyze/inspect_results.ipynb +++ b/src/agentlab/analyze/inspect_results.ipynb @@ -2,12 +2,14 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from agentlab.experiments.exp_utils import RESULTS_DIR\n", "from agentlab.analyze import inspect_results\n", + "import pandas as pd\n", + "pd.set_option('display.max_rows', 200)\n", "\n", "%load_ext autoreload\n", "%autoreload 2" @@ -22,533 +24,12 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Searching experiments directories.: 0it [00:00, ?it/s]\n", - "Searching experiments directories.: 0it [00:00, ?it/s]\n", - "Searching experiments directories.: 0it [00:00, ?it/s]\n", - "Searching experiments directories.: 0it [00:00, ?it/s]\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
agent.agent_nameenv.benchmarkavg_rewardstd_erravg_stepsn_completedn_err
study_dir
2024-09-21_15-38-29_GenericAgent-meta-llama_llama-3-70b-instruct_on_miniwobGenericAgent-meta-llama_llama-3-70b-instructminiwob0.5920.0204.323625/6252
2024-09-21_15-34-02_GenericAgent-meta-llama_llama-3-70b-instruct_on_miniwobGenericAgent-meta-llama_llama-3-70b-instructminiwob0.0000.0000.000625/625625
2024-09-21_12-04-39_GenericAgent-gpt-4o-2024-05-13_on_miniwobGenericAgent-gpt-4o-2024-05-13miniwob0.6560.0194.138625/6250
2024-09-20_22-09-43_GenericAgent-gpt-4o-2024-05-13_on_miniwobGenericAgent-gpt-4o-2024-05-13miniwob0.6560.0194.019625/6250
2024-09-20_07-16-21_GenericAgent-gpt-4o-mini-2024-07-18_on_miniwobGenericAgent-gpt-4o-mini-2024-07-18miniwob0.5460.0204.981625/6250
2024-09-19_21-53-57_GenericAgent-gpt-4o-mini-2024-07-18_on_miniwobGenericAgent-gpt-4o-mini-2024-07-18miniwob0.5140.0385.232177/6250
2024-09-19_21-51-12_GenericAgent-gpt-4o-mini-2024-07-18_on_miniwobGenericAgent-gpt-4o-mini-2024-07-18miniwob0.0000.00010.0003/6250
2024-09-19_21-30-36_GenericAgent-gpt-4o-mini-2024-07-18_on_miniwob_tiny_testGenericAgent-gpt-4o-mini-2024-07-18miniwob1.0000.0002.7504/40
2024-09-19_21-28-58_GenericAgent-gpt-4o-mini-2024-07-18_on_miniwob_tiny_testGenericAgent-gpt-4o-mini-2024-07-18miniwob1.0000.0002.7504/40
2024-09-19_21-07-34_GenericAgent-gpt-4o-mini-2024-07-18_on_miniwob_tiny_testGenericAgent-gpt-4o-mini-2024-07-18miniwob0.7500.2172.7504/40
2024-09-19_18-37-00_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob1.0000.0002.7504/40
2024-09-19_11-45-30_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob1.0000.0002.7504/40
2024-09-18_11-56-51_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob1.0000.0002.0003/30
2024-09-18_11-47-33_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob1.0000.0002.0003/30
2024-09-13_14-19-10_GenericAgent-gpt-4o-mini_on_miniwobGenericAgent-gpt-4o-miniminiwob0.5360.0205.083625/6250
2024-09-13_14-18-03_GenericAgent-gpt-4o-mini_on_miniwobGenericAgent-gpt-4o-miniminiwobNaNNaNNaN0/6250
2024-09-13_09-46-43_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob1.0000.0002.7504/40
2024-09-13_09-32-27_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob1.0000.0003.0002/40
2024-09-13_09-10-48_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob1.0000.0002.7504/40
2024-09-12_22-22-10_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob0.7500.2172.5004/41
2024-09-12_22-12-28_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob1.0000.0002.7504/40
2024-09-12_22-10-47_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob0.7500.2172.5004/41
2024-09-12_15-25-03_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwobNaNNaNNaN0/40
2024-09-12_15-14-36_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob0.7500.2171.7504/41
2024-09-12_15-08-05_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob0.7500.2172.5004/41
2024-09-12_14-44-03_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob0.7500.2172.5004/41
2024-09-12_14-37-09_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob0.7500.2171.5004/41
2024-09-12_13-55-50_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob0.7500.2171.5004/41
2024-09-12_13-12-54_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob1.0000.0002.7504/40
2024-09-12_12-08-38_GenericAgentArgs_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob1.0000.0002.7504/40
2024-09-12_12-01-32_GenericAgentArgs_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob0.7500.2172.7504/40
2024-09-12_08-39-16_GenericAgent-gpt-4o-mini_on_miniwob_tiny_testGenericAgent-gpt-4o-miniminiwob0.7500.2172.5004/41
\n", - "
" - ], - "text/plain": [ - " agent.agent_name \\\n", - "study_dir \n", - "2024-09-21_15-38-29_GenericAgent-meta-llama_lla... GenericAgent-meta-llama_llama-3-70b-instruct \n", - "2024-09-21_15-34-02_GenericAgent-meta-llama_lla... GenericAgent-meta-llama_llama-3-70b-instruct \n", - "2024-09-21_12-04-39_GenericAgent-gpt-4o-2024-05... GenericAgent-gpt-4o-2024-05-13 \n", - "2024-09-20_22-09-43_GenericAgent-gpt-4o-2024-05... GenericAgent-gpt-4o-2024-05-13 \n", - "2024-09-20_07-16-21_GenericAgent-gpt-4o-mini-20... GenericAgent-gpt-4o-mini-2024-07-18 \n", - "2024-09-19_21-53-57_GenericAgent-gpt-4o-mini-20... GenericAgent-gpt-4o-mini-2024-07-18 \n", - "2024-09-19_21-51-12_GenericAgent-gpt-4o-mini-20... GenericAgent-gpt-4o-mini-2024-07-18 \n", - "2024-09-19_21-30-36_GenericAgent-gpt-4o-mini-20... GenericAgent-gpt-4o-mini-2024-07-18 \n", - "2024-09-19_21-28-58_GenericAgent-gpt-4o-mini-20... GenericAgent-gpt-4o-mini-2024-07-18 \n", - "2024-09-19_21-07-34_GenericAgent-gpt-4o-mini-20... GenericAgent-gpt-4o-mini-2024-07-18 \n", - "2024-09-19_18-37-00_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", - "2024-09-19_11-45-30_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", - "2024-09-18_11-56-51_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", - "2024-09-18_11-47-33_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", - "2024-09-13_14-19-10_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", - "2024-09-13_14-18-03_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", - "2024-09-13_09-46-43_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", - "2024-09-13_09-32-27_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", - "2024-09-13_09-10-48_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", - "2024-09-12_22-22-10_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", - "2024-09-12_22-12-28_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", - "2024-09-12_22-10-47_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", - "2024-09-12_15-25-03_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", - "2024-09-12_15-14-36_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", - "2024-09-12_15-08-05_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", - "2024-09-12_14-44-03_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", - "2024-09-12_14-37-09_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", - "2024-09-12_13-55-50_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", - "2024-09-12_13-12-54_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", - "2024-09-12_12-08-38_GenericAgentArgs_on_miniwob... GenericAgent-gpt-4o-mini \n", - "2024-09-12_12-01-32_GenericAgentArgs_on_miniwob... GenericAgent-gpt-4o-mini \n", - "2024-09-12_08-39-16_GenericAgent-gpt-4o-mini_on... GenericAgent-gpt-4o-mini \n", - "\n", - " env.benchmark avg_reward \\\n", - "study_dir \n", - "2024-09-21_15-38-29_GenericAgent-meta-llama_lla... miniwob 0.592 \n", - "2024-09-21_15-34-02_GenericAgent-meta-llama_lla... miniwob 0.000 \n", - "2024-09-21_12-04-39_GenericAgent-gpt-4o-2024-05... miniwob 0.656 \n", - "2024-09-20_22-09-43_GenericAgent-gpt-4o-2024-05... miniwob 0.656 \n", - "2024-09-20_07-16-21_GenericAgent-gpt-4o-mini-20... miniwob 0.546 \n", - "2024-09-19_21-53-57_GenericAgent-gpt-4o-mini-20... miniwob 0.514 \n", - "2024-09-19_21-51-12_GenericAgent-gpt-4o-mini-20... miniwob 0.000 \n", - "2024-09-19_21-30-36_GenericAgent-gpt-4o-mini-20... miniwob 1.000 \n", - "2024-09-19_21-28-58_GenericAgent-gpt-4o-mini-20... miniwob 1.000 \n", - "2024-09-19_21-07-34_GenericAgent-gpt-4o-mini-20... miniwob 0.750 \n", - "2024-09-19_18-37-00_GenericAgent-gpt-4o-mini_on... miniwob 1.000 \n", - "2024-09-19_11-45-30_GenericAgent-gpt-4o-mini_on... miniwob 1.000 \n", - "2024-09-18_11-56-51_GenericAgent-gpt-4o-mini_on... miniwob 1.000 \n", - "2024-09-18_11-47-33_GenericAgent-gpt-4o-mini_on... miniwob 1.000 \n", - "2024-09-13_14-19-10_GenericAgent-gpt-4o-mini_on... miniwob 0.536 \n", - "2024-09-13_14-18-03_GenericAgent-gpt-4o-mini_on... miniwob NaN \n", - "2024-09-13_09-46-43_GenericAgent-gpt-4o-mini_on... miniwob 1.000 \n", - "2024-09-13_09-32-27_GenericAgent-gpt-4o-mini_on... miniwob 1.000 \n", - "2024-09-13_09-10-48_GenericAgent-gpt-4o-mini_on... miniwob 1.000 \n", - "2024-09-12_22-22-10_GenericAgent-gpt-4o-mini_on... miniwob 0.750 \n", - "2024-09-12_22-12-28_GenericAgent-gpt-4o-mini_on... miniwob 1.000 \n", - "2024-09-12_22-10-47_GenericAgent-gpt-4o-mini_on... miniwob 0.750 \n", - "2024-09-12_15-25-03_GenericAgent-gpt-4o-mini_on... miniwob NaN \n", - "2024-09-12_15-14-36_GenericAgent-gpt-4o-mini_on... miniwob 0.750 \n", - "2024-09-12_15-08-05_GenericAgent-gpt-4o-mini_on... miniwob 0.750 \n", - "2024-09-12_14-44-03_GenericAgent-gpt-4o-mini_on... miniwob 0.750 \n", - "2024-09-12_14-37-09_GenericAgent-gpt-4o-mini_on... miniwob 0.750 \n", - "2024-09-12_13-55-50_GenericAgent-gpt-4o-mini_on... miniwob 0.750 \n", - "2024-09-12_13-12-54_GenericAgent-gpt-4o-mini_on... miniwob 1.000 \n", - "2024-09-12_12-08-38_GenericAgentArgs_on_miniwob... miniwob 1.000 \n", - "2024-09-12_12-01-32_GenericAgentArgs_on_miniwob... miniwob 0.750 \n", - "2024-09-12_08-39-16_GenericAgent-gpt-4o-mini_on... miniwob 0.750 \n", - "\n", - " std_err avg_steps \\\n", - "study_dir \n", - "2024-09-21_15-38-29_GenericAgent-meta-llama_lla... 0.020 4.323 \n", - "2024-09-21_15-34-02_GenericAgent-meta-llama_lla... 0.000 0.000 \n", - "2024-09-21_12-04-39_GenericAgent-gpt-4o-2024-05... 0.019 4.138 \n", - "2024-09-20_22-09-43_GenericAgent-gpt-4o-2024-05... 0.019 4.019 \n", - "2024-09-20_07-16-21_GenericAgent-gpt-4o-mini-20... 0.020 4.981 \n", - "2024-09-19_21-53-57_GenericAgent-gpt-4o-mini-20... 0.038 5.232 \n", - "2024-09-19_21-51-12_GenericAgent-gpt-4o-mini-20... 0.000 10.000 \n", - "2024-09-19_21-30-36_GenericAgent-gpt-4o-mini-20... 0.000 2.750 \n", - "2024-09-19_21-28-58_GenericAgent-gpt-4o-mini-20... 0.000 2.750 \n", - "2024-09-19_21-07-34_GenericAgent-gpt-4o-mini-20... 0.217 2.750 \n", - "2024-09-19_18-37-00_GenericAgent-gpt-4o-mini_on... 0.000 2.750 \n", - "2024-09-19_11-45-30_GenericAgent-gpt-4o-mini_on... 0.000 2.750 \n", - "2024-09-18_11-56-51_GenericAgent-gpt-4o-mini_on... 0.000 2.000 \n", - "2024-09-18_11-47-33_GenericAgent-gpt-4o-mini_on... 0.000 2.000 \n", - "2024-09-13_14-19-10_GenericAgent-gpt-4o-mini_on... 0.020 5.083 \n", - "2024-09-13_14-18-03_GenericAgent-gpt-4o-mini_on... NaN NaN \n", - "2024-09-13_09-46-43_GenericAgent-gpt-4o-mini_on... 0.000 2.750 \n", - "2024-09-13_09-32-27_GenericAgent-gpt-4o-mini_on... 0.000 3.000 \n", - "2024-09-13_09-10-48_GenericAgent-gpt-4o-mini_on... 0.000 2.750 \n", - "2024-09-12_22-22-10_GenericAgent-gpt-4o-mini_on... 0.217 2.500 \n", - "2024-09-12_22-12-28_GenericAgent-gpt-4o-mini_on... 0.000 2.750 \n", - "2024-09-12_22-10-47_GenericAgent-gpt-4o-mini_on... 0.217 2.500 \n", - "2024-09-12_15-25-03_GenericAgent-gpt-4o-mini_on... NaN NaN \n", - "2024-09-12_15-14-36_GenericAgent-gpt-4o-mini_on... 0.217 1.750 \n", - "2024-09-12_15-08-05_GenericAgent-gpt-4o-mini_on... 0.217 2.500 \n", - "2024-09-12_14-44-03_GenericAgent-gpt-4o-mini_on... 0.217 2.500 \n", - "2024-09-12_14-37-09_GenericAgent-gpt-4o-mini_on... 0.217 1.500 \n", - "2024-09-12_13-55-50_GenericAgent-gpt-4o-mini_on... 0.217 1.500 \n", - "2024-09-12_13-12-54_GenericAgent-gpt-4o-mini_on... 0.000 2.750 \n", - "2024-09-12_12-08-38_GenericAgentArgs_on_miniwob... 0.000 2.750 \n", - "2024-09-12_12-01-32_GenericAgentArgs_on_miniwob... 0.217 2.750 \n", - "2024-09-12_08-39-16_GenericAgent-gpt-4o-mini_on... 0.217 2.500 \n", - "\n", - " n_completed n_err \n", - "study_dir \n", - "2024-09-21_15-38-29_GenericAgent-meta-llama_lla... 625/625 2 \n", - "2024-09-21_15-34-02_GenericAgent-meta-llama_lla... 625/625 625 \n", - "2024-09-21_12-04-39_GenericAgent-gpt-4o-2024-05... 625/625 0 \n", - "2024-09-20_22-09-43_GenericAgent-gpt-4o-2024-05... 625/625 0 \n", - "2024-09-20_07-16-21_GenericAgent-gpt-4o-mini-20... 625/625 0 \n", - "2024-09-19_21-53-57_GenericAgent-gpt-4o-mini-20... 177/625 0 \n", - "2024-09-19_21-51-12_GenericAgent-gpt-4o-mini-20... 3/625 0 \n", - "2024-09-19_21-30-36_GenericAgent-gpt-4o-mini-20... 4/4 0 \n", - "2024-09-19_21-28-58_GenericAgent-gpt-4o-mini-20... 4/4 0 \n", - "2024-09-19_21-07-34_GenericAgent-gpt-4o-mini-20... 4/4 0 \n", - "2024-09-19_18-37-00_GenericAgent-gpt-4o-mini_on... 4/4 0 \n", - "2024-09-19_11-45-30_GenericAgent-gpt-4o-mini_on... 4/4 0 \n", - "2024-09-18_11-56-51_GenericAgent-gpt-4o-mini_on... 3/3 0 \n", - "2024-09-18_11-47-33_GenericAgent-gpt-4o-mini_on... 3/3 0 \n", - "2024-09-13_14-19-10_GenericAgent-gpt-4o-mini_on... 625/625 0 \n", - "2024-09-13_14-18-03_GenericAgent-gpt-4o-mini_on... 0/625 0 \n", - "2024-09-13_09-46-43_GenericAgent-gpt-4o-mini_on... 4/4 0 \n", - "2024-09-13_09-32-27_GenericAgent-gpt-4o-mini_on... 2/4 0 \n", - "2024-09-13_09-10-48_GenericAgent-gpt-4o-mini_on... 4/4 0 \n", - "2024-09-12_22-22-10_GenericAgent-gpt-4o-mini_on... 4/4 1 \n", - "2024-09-12_22-12-28_GenericAgent-gpt-4o-mini_on... 4/4 0 \n", - "2024-09-12_22-10-47_GenericAgent-gpt-4o-mini_on... 4/4 1 \n", - "2024-09-12_15-25-03_GenericAgent-gpt-4o-mini_on... 0/4 0 \n", - "2024-09-12_15-14-36_GenericAgent-gpt-4o-mini_on... 4/4 1 \n", - "2024-09-12_15-08-05_GenericAgent-gpt-4o-mini_on... 4/4 1 \n", - "2024-09-12_14-44-03_GenericAgent-gpt-4o-mini_on... 4/4 1 \n", - "2024-09-12_14-37-09_GenericAgent-gpt-4o-mini_on... 4/4 1 \n", - "2024-09-12_13-55-50_GenericAgent-gpt-4o-mini_on... 4/4 1 \n", - "2024-09-12_13-12-54_GenericAgent-gpt-4o-mini_on... 4/4 0 \n", - "2024-09-12_12-08-38_GenericAgentArgs_on_miniwob... 4/4 0 \n", - "2024-09-12_12-01-32_GenericAgentArgs_on_miniwob... 4/4 0 \n", - "2024-09-12_08-39-16_GenericAgent-gpt-4o-mini_on... 4/4 1 " - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "inspect_results.get_all_summaries(RESULTS_DIR, ignore_cache=False)\n" + "all_summaries = inspect_results.get_all_summaries(RESULTS_DIR.resolve().parent / \"ICML-Neurips-final-run\", ignore_cache=False, ignore_stale=True)\n", + "all_summaries" ] }, { @@ -697,7 +178,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.2" + "version": "3.12.6" } }, "nbformat": 4, diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index 07d4176e..488c8c54 100644 --- a/src/agentlab/analyze/inspect_results.py +++ b/src/agentlab/analyze/inspect_results.py @@ -782,7 +782,47 @@ def summarize_study(result_df: pd.DataFrame) -> pd.DataFrame: return result_df.groupby(level=levels[1:]).apply(summarize) -def get_study_summary(study_dir: Path, ignore_cache=False, sentinel=None) -> pd.DataFrame: +def split_by_key(df: pd.DataFrame, key): + """Return a dict of dataframes spearted by the given key.""" + # check if key in df + if not (key in df.columns): + df = df.reset_index(key, inplace=False) + + df_dict = {} + for value in df[key].unique(): + sub_df = df[df[key] == value].copy() + set_index_from_variables(sub_df) + df_dict[value] = sub_df + + return df_dict + +def get_all_summaries(results_dir: Path, skip_hidden=True, ignore_cache=False, ignore_stale=False): + summaries = [] + for study_dir in results_dir.iterdir(): + print(study_dir.name) + if skip_hidden and study_dir.name.startswith("_"): + print(" skip (starts with '_')") + continue + + try: + summary = get_study_summary(study_dir, ignore_cache=ignore_cache, ignore_stale=ignore_stale) + if summary is not None: + # set as index + summary["study_dir"] = study_dir.name + summary.set_index("study_dir", inplace=True) + summaries.append(summary) + + except Exception as e: + traceback.print_exc() + continue + + summaries = pd.concat(summaries) + # reverse sort according to index + summaries.sort_index(ascending=False, inplace=True) + return summaries + + +def get_study_summary(study_dir: Path, ignore_cache=False, sentinel=None, ignore_stale=False) -> pd.DataFrame: """Get the cached study summary for the given study directory. The cashe is based on the modified times of all the files in the study. @@ -796,7 +836,12 @@ def get_study_summary(study_dir: Path, ignore_cache=False, sentinel=None) -> pd. pd.DataFrame: The study summary """ study_dir = Path(study_dir) - is_stale, mtimes, summary_path, mtimes_path = _is_stale(study_dir) + + summary_path = study_dir / "study_summary.csv" + if not ignore_stale: + is_stale = _is_stale(study_dir, summary_path) + else: + is_stale = False if not ignore_cache: if summary_path.exists() and not is_stale: @@ -811,37 +856,12 @@ def get_study_summary(study_dir: Path, ignore_cache=False, sentinel=None) -> pd. summary = summarize_study(result_df) summary.to_csv(summary_path) - mtimes_path.write_text(json.dumps(mtimes)) if sentinel is not None: sentinel["from_cache"] = False return summary -def get_all_summaries(results_dir: Path, skip_hidden=True, ignore_cache=False): - summaries = [] - for study_dir in results_dir.iterdir(): - if skip_hidden and study_dir.name.startswith("_"): - continue - - try: - summary = get_study_summary(study_dir, ignore_cache=ignore_cache) - if summary is not None: - # set as index - summary["study_dir"] = study_dir.name - summary.set_index("study_dir", inplace=True) - summaries.append(summary) - - except Exception as e: - traceback.print_exc() - continue - - summaries = pd.concat(summaries) - # reverse sort according to index - summaries.sort_index(ascending=False, inplace=True) - return summaries - - def _get_mtimes(dir: Path, pattern="[!_.]*", whitelist=()): """Recursevly get all file's modif date""" # use glob to get all files @@ -849,17 +869,17 @@ def _get_mtimes(dir: Path, pattern="[!_.]*", whitelist=()): return {str(f.relative_to(dir)): f.stat().st_mtime for f in files if f not in whitelist} -def _is_stale(study_dir: Path): - summary_path = study_dir / "study_summary.csv" +def _is_stale(study_dir: Path, summary_path: Path) -> bool: mtimes_path = study_dir / "_last_modification_times.json" mtimes = _get_mtimes(study_dir, whitelist=(summary_path,)) if not mtimes_path.exists() or not summary_path.exists(): - return True, mtimes, summary_path, mtimes_path - - mtimes_saved = json.loads(mtimes_path.read_text()) - if mtimes_saved == mtimes: - return False, mtimes, summary_path, mtimes_path - + stale = True + else: + mtimes_saved = json.loads(mtimes_path.read_text()) + stale = mtimes_saved != mtimes + mtimes_path.write_text(json.dumps(mtimes)) + return stale + def get_all_task_messages(exp_dir, max_n_exp=None): result_list = list(yield_all_exp_results(exp_dir, progress_fn=tqdm)) From 4066da30e12b466f6397eb0fb3a136df80327107 Mon Sep 17 00:00:00 2001 From: recursix Date: Wed, 25 Sep 2024 21:41:47 -0400 Subject: [PATCH 48/58] infer from benchmark --- .../experiments/reproducibility_util.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/agentlab/experiments/reproducibility_util.py b/src/agentlab/experiments/reproducibility_util.py index 70a1575a..97e16213 100644 --- a/src/agentlab/experiments/reproducibility_util.py +++ b/src/agentlab/experiments/reproducibility_util.py @@ -12,6 +12,7 @@ from git.config import GitConfigParser import os import agentlab +from browsergym.experiments.loop import ExpArgs def _get_repo(module): @@ -228,6 +229,25 @@ def _assert_compatible(info: dict, old_info: dict): ) +def _benchmark_from_task_name(task_name: str): + """Extract the benchmark from the task name. + TODO should be more robost, e.g. handle workarna.L1, workarena.L2, etc. + """ + return task_name.split(".")[0] + + +def infer_agent(exp_args_list: list[ExpArgs]): + agent_names = set(exp_args.agent_args.agent_name for exp_args in exp_args_list) + return ",".join(agent_names) + + +def infer_benchmark(exp_args_list: list[ExpArgs]): + benchmark_names = set( + _benchmark_from_task_name(exp_args.env_args.task_name) for exp_args in exp_args_list + ) + return ",".join(benchmark_names) + + def write_reproducibility_info( study_dir, agent_name, benchmark_name, comment=None, ignore_changes=False ): From ef204d39d493b5127ed82817f11999d1cb325eb3 Mon Sep 17 00:00:00 2001 From: recursix Date: Wed, 25 Sep 2024 21:42:20 -0400 Subject: [PATCH 49/58] fix reproducibility agent --- .../generic_agent/reproducibility_agent.py | 68 +++++++++++++++++-- src/agentlab/experiments/reproduce_study.py | 39 +++++++++++ 2 files changed, 102 insertions(+), 5 deletions(-) create mode 100644 src/agentlab/experiments/reproduce_study.py diff --git a/src/agentlab/agents/generic_agent/reproducibility_agent.py b/src/agentlab/agents/generic_agent/reproducibility_agent.py index 279b186b..3138be00 100644 --- a/src/agentlab/agents/generic_agent/reproducibility_agent.py +++ b/src/agentlab/agents/generic_agent/reproducibility_agent.py @@ -1,7 +1,11 @@ from dataclasses import dataclass +import logging +from pathlib import Path import time + +from agentlab.agents.agent_args import AgentArgs from .generic_agent import GenericAgentArgs, GenericAgent -from browsergym.experiments.loop import ExpResult +from browsergym.experiments.loop import ExpResult, ExpArgs, yield_all_exp_results from browsergym.experiments.agent import AgentInfo @@ -26,10 +30,11 @@ def invoke(self, messages): @dataclass class ReproAgentArgs(GenericAgentArgs): - repro_dir: str = None + # starting with "_" will prevent from being part of the index in the load_results function + _repro_dir: str = None def make_agent(self): - return ReproAgent(self.chat_model_args, self.flags, self.max_retry, self.repro_dir) + return ReproAgent(self.chat_model_args, self.flags, self.max_retry, self._repro_dir) class ReproAgent(GenericAgent): @@ -50,9 +55,16 @@ def get_action(self, obs): # same answers step = len(self.actions) step_info = self.exp_result.get_step_info(step) - chat_messages = step_info["agent_info"]["chat_messages"] - self.chat_llm = ReproChatModel(chat_messages) + chat_messages = step_info.agent_info.get("chat_messages", None) + if chat_messages is None: + err_msg = self.exp_result.summary_info["err_msg"] + agent_info = AgentInfo( + markup_page=f"Agent had no chat messages. Perhaps there was an error. err_msg:\n{err_msg}", + ) + return None, agent_info + + self.chat_llm = ReproChatModel(chat_messages) action, agent_info = super().get_action(obs) return _make_agent_stats(action, agent_info, step_info) @@ -61,3 +73,49 @@ def get_action(self, obs): def _make_agent_stats(action, agent_info, step_info): # TODO return action, agent_info + + +def reproduce_study(original_study_dir: Path | str): + """Reproduce a study by running the same experiments with the same agent.""" + + original_study_dir = Path(original_study_dir) + + study_name = f"reproducibility_of_{original_study_dir.name}" + + exp_args_list = [] + for exp_result in yield_all_exp_results(original_study_dir, progress_fn=None): + agent_args = make_repro_agent(exp_result.exp_args.agent_args, exp_dir=exp_result.exp_dir) + exp_args_list.append( + ExpArgs( + agent_args=agent_args, + env_args=exp_result.exp_args.env_args, + logging_level=logging.DEBUG, + ) + ) + return study_name, exp_args_list + + +def make_repro_agent(agent_args: AgentArgs, exp_dir: Path | str): + """Create a reproducibility agent from an existing agent. + + Note, if a new flag was added, it was not saved in the original pickle. When + loading the pickle it silently adds the missing flag and set it to its + default value. The new repro agent_args will thus have the new flag set to + its default value. + + Args: + agent_args (AgentArgs): The original agent args. + exp_dir (Path | str): The directory where the experiment was saved. + + """ + exp_dir = Path(exp_dir) + assert isinstance(agent_args, GenericAgentArgs) + assert exp_dir.exists() # sanity check + + return ReproAgentArgs( + agent_name=f"Repro_{agent_args.agent_name}", + chat_model_args=agent_args.chat_model_args, + flags=agent_args.flags, + max_retry=agent_args.max_retry, + _repro_dir=exp_dir, + ) diff --git a/src/agentlab/experiments/reproduce_study.py b/src/agentlab/experiments/reproduce_study.py new file mode 100644 index 00000000..ba2a72da --- /dev/null +++ b/src/agentlab/experiments/reproduce_study.py @@ -0,0 +1,39 @@ +import logging + +from agentlab.agents.generic_agent.reproducibility_agent import reproduce_study +from agentlab.experiments.exp_utils import RESULTS_DIR +from agentlab.experiments.launch_exp import make_study_dir, run_experiments +from agentlab.experiments.reproducibility_util import ( + write_reproducibility_info, + add_experiment_to_journal, + infer_agent, + infer_benchmark, +) + + +logging.getLogger().setLevel(logging.INFO) + + +if __name__ == "__main__": + + # study_dir = RESULTS_DIR / "2024-06-02_18-16-17_final_run" + old_study_dir = ( + RESULTS_DIR / "2024-09-12_08-39-16_GenericAgent-gpt-4o-mini_on_miniwob_tiny_test" + ) + study_name, exp_args_list = reproduce_study(old_study_dir) + study_dir = make_study_dir(RESULTS_DIR, study_name) + n_jobs = 1 + + write_reproducibility_info( + study_dir=study_dir, + agent_name=infer_agent(exp_args_list), + benchmark_name=infer_benchmark(exp_args_list), + ignore_changes=True, + ) + + # run the experiments + + run_experiments(n_jobs, exp_args_list, study_dir, parallel_backend="joblib") + # finally: + # # will try to gather info at the end even if run_experiments failed + # add_experiment_to_journal(study_dir) From 5112abe7ffdf13dbc4947021f892acc5e850b74f Mon Sep 17 00:00:00 2001 From: recursix Date: Wed, 25 Sep 2024 21:43:19 -0400 Subject: [PATCH 50/58] prevent the repro_dir to be an index variable --- src/agentlab/analyze/inspect_results.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index 488c8c54..dbf1e2a8 100644 --- a/src/agentlab/analyze/inspect_results.py +++ b/src/agentlab/analyze/inspect_results.py @@ -64,7 +64,7 @@ def get_constants_and_variables(df: pd.DataFrame, drop_constants: bool = False): def set_index_from_variables( df: pd.DataFrame, index_white_list=("agent.*",), - index_black_list=("*model_url*", "*extra*"), + index_black_list=("*model_url*", "*extra*", "*._*"), task_key=TASK_KEY, add_agent_and_benchmark=True, ): @@ -121,7 +121,7 @@ def load_result_df( set_index=True, result_df=None, index_white_list=("agent.*",), - index_black_list=("*model_url*", "*extra*"), + index_black_list=("*model_url*", "*extra*", "*._*"), remove_args_suffix=True, ): """Load the result dataframe. @@ -796,6 +796,7 @@ def split_by_key(df: pd.DataFrame, key): return df_dict + def get_all_summaries(results_dir: Path, skip_hidden=True, ignore_cache=False, ignore_stale=False): summaries = [] for study_dir in results_dir.iterdir(): @@ -805,7 +806,9 @@ def get_all_summaries(results_dir: Path, skip_hidden=True, ignore_cache=False, i continue try: - summary = get_study_summary(study_dir, ignore_cache=ignore_cache, ignore_stale=ignore_stale) + summary = get_study_summary( + study_dir, ignore_cache=ignore_cache, ignore_stale=ignore_stale + ) if summary is not None: # set as index summary["study_dir"] = study_dir.name @@ -822,7 +825,9 @@ def get_all_summaries(results_dir: Path, skip_hidden=True, ignore_cache=False, i return summaries -def get_study_summary(study_dir: Path, ignore_cache=False, sentinel=None, ignore_stale=False) -> pd.DataFrame: +def get_study_summary( + study_dir: Path, ignore_cache=False, sentinel=None, ignore_stale=False +) -> pd.DataFrame: """Get the cached study summary for the given study directory. The cashe is based on the modified times of all the files in the study. @@ -879,7 +884,7 @@ def _is_stale(study_dir: Path, summary_path: Path) -> bool: stale = mtimes_saved != mtimes mtimes_path.write_text(json.dumps(mtimes)) return stale - + def get_all_task_messages(exp_dir, max_n_exp=None): result_list = list(yield_all_exp_results(exp_dir, progress_fn=tqdm)) From 5325c6951d2c08e60b715626dc86035e050095ad Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 27 Sep 2024 16:07:50 -0400 Subject: [PATCH 51/58] updating repro agent stats --- .../generic_agent/reproducibility_agent.py | 65 ++++++++++++++++--- 1 file changed, 55 insertions(+), 10 deletions(-) diff --git a/src/agentlab/agents/generic_agent/reproducibility_agent.py b/src/agentlab/agents/generic_agent/reproducibility_agent.py index 3138be00..4c9c76e1 100644 --- a/src/agentlab/agents/generic_agent/reproducibility_agent.py +++ b/src/agentlab/agents/generic_agent/reproducibility_agent.py @@ -1,3 +1,4 @@ +import copy from dataclasses import dataclass import logging from pathlib import Path @@ -7,6 +8,7 @@ from .generic_agent import GenericAgentArgs, GenericAgent from browsergym.experiments.loop import ExpResult, ExpArgs, yield_all_exp_results from browsergym.experiments.agent import AgentInfo +import difflib class ReproChatModel: @@ -17,14 +19,17 @@ class ReproChatModel: delay (int): A delay to simulate the time it takes to generate a response. """ - def __init__(self, messages, delay=1) -> None: - self.messages = messages + def __init__(self, old_messages, delay=1) -> None: + self.old_messages = old_messages self.delay = delay - def invoke(self, messages): + def invoke(self, messages: list): + self.new_messages = copy(messages) + old_response = self.old_messages[len(messages)] + self.new_messages.append(old_response) time.sleep(self.delay) # return the next message in the list - return self.messages[len(messages)] + return old_response @dataclass @@ -55,8 +60,8 @@ def get_action(self, obs): # same answers step = len(self.actions) step_info = self.exp_result.get_step_info(step) - chat_messages = step_info.agent_info.get("chat_messages", None) - if chat_messages is None: + old_chat_messages = step_info.agent_info.get("chat_messages", None) + if old_chat_messages is None: err_msg = self.exp_result.summary_info["err_msg"] agent_info = AgentInfo( @@ -64,17 +69,57 @@ def get_action(self, obs): ) return None, agent_info - self.chat_llm = ReproChatModel(chat_messages) + self.chat_llm = ReproChatModel(old_chat_messages) action, agent_info = super().get_action(obs) - return _make_agent_stats(action, agent_info, step_info) + return _make_agent_stats( + action, agent_info, step_info, old_chat_messages, self.chat_llm.new_messages + ) + + +def _make_agent_stats(action, agent_info, step_info, old_chat_messages, new_chat_messages): + + # format all messages into a string + old_msg_str = _format_messages(old_chat_messages) + new_msg_str = _format_messages(new_chat_messages) + html_diff = _make_diff(old_str=old_msg_str, new_str=new_msg_str) + if isinstance(agent_info, dict): + agent_info = AgentInfo(**agent_info) + + agent_info.html_page = html_diff + agent_info.stats = _diff_stats(old_msg_str, new_msg_str) -def _make_agent_stats(action, agent_info, step_info): - # TODO return action, agent_info +def _format_messages(messages: list[dict]): + return "\n".join(f"{m['role']} message:\n{m['content']}\n" for m in messages) + + +def _make_diff(old_str, new_str): + diff = difflib.HtmlDiff().make_file( + old_str.splitlines(), new_str.splitlines(), fromdesc="Old Version", todesc="New Version" + ) + return diff + + +def _diff_stats(str1: str, str2: str): + lines1 = str1.splitlines() + lines2 = str2.splitlines() + + diff = list(difflib.Differ().compare(lines1, lines2)) + + # Count added and removed lines + added = sum(1 for line in diff if line.startswith("+ ")) + removed = sum(1 for line in diff if line.startswith("- ")) + + # Calculate difference ratio + difference_ratio = (added + removed) / (2 * max(len(lines1), len(lines2))) + + return dict(lines_added=added, lines_removed=removed, difference_ratio=difference_ratio) + + def reproduce_study(original_study_dir: Path | str): """Reproduce a study by running the same experiments with the same agent.""" From d8ad4bde814cf24e28ae73eb92a2fa602a9eb810 Mon Sep 17 00:00:00 2001 From: recursix Date: Tue, 1 Oct 2024 10:37:49 -0400 Subject: [PATCH 52/58] Reproducibility agent --- src/agentlab/agents/dynamic_prompting.py | 11 +- .../generic_agent/reproducibility_agent.py | 182 +++++++++++++++--- src/agentlab/analyze/agent_xray.py | 45 ++--- src/agentlab/experiments/launch_exp.py | 35 +++- src/agentlab/experiments/reproduce_study.py | 34 ++-- .../experiments/reproducibility_script.py | 13 +- .../experiments/reproducibility_util.py | 40 ++-- tests/experiments/test_launch_exp.py | 4 +- 8 files changed, 249 insertions(+), 115 deletions(-) diff --git a/src/agentlab/agents/dynamic_prompting.py b/src/agentlab/agents/dynamic_prompting.py index 91c3dd3f..101ffe50 100644 --- a/src/agentlab/agents/dynamic_prompting.py +++ b/src/agentlab/agents/dynamic_prompting.py @@ -577,9 +577,14 @@ def _parse_answer(self, text_answer): ans_dict = {"action": code, "parse_error": str(e)} try: - # just check if action can be mapped to python code but keep action as is - # the environment will be responsible for mapping it to python - self.action_set.to_python_code(ans_dict["action"]) + if ans_dict["action"] == "None": + # Used by reproducibility agent for backward compatibility of + # traces missing LLM's response in chat messages. + ans_dict["action"] = None + else: + # just check if action can be mapped to python code but keep action as is + # the environment will be responsible for mapping it to python + self.action_set.to_python_code(ans_dict["action"]) except Exception as e: raise ParseError( f"Error while parsing action\n: {e}\n" diff --git a/src/agentlab/agents/generic_agent/reproducibility_agent.py b/src/agentlab/agents/generic_agent/reproducibility_agent.py index 4c9c76e1..6c778b44 100644 --- a/src/agentlab/agents/generic_agent/reproducibility_agent.py +++ b/src/agentlab/agents/generic_agent/reproducibility_agent.py @@ -1,15 +1,32 @@ -import copy +"""Reproducibility Agent + + +This module contains the classes and functions to reproduce the results of a +study. It is used to create a new study that will run the same experiments as +the original study, but with a reproducibility agent that will mimic the same +answers as the original agent. + +Stats are collected to compare the original agent's answers with the new agent's +answers. Load the this reproducibility study in agent-xray to compare the results. +""" + +from copy import copy from dataclasses import dataclass import logging from pathlib import Path import time +from bs4 import BeautifulSoup + from agentlab.agents.agent_args import AgentArgs from .generic_agent import GenericAgentArgs, GenericAgent from browsergym.experiments.loop import ExpResult, ExpArgs, yield_all_exp_results from browsergym.experiments.agent import AgentInfo import difflib +from langchain.schema import BaseMessage, AIMessage +from langchain_community.adapters.openai import convert_message_to_dict + class ReproChatModel: """A chat model that reproduces a conversation. @@ -25,6 +42,12 @@ def __init__(self, old_messages, delay=1) -> None: def invoke(self, messages: list): self.new_messages = copy(messages) + + if len(messages) >= len(self.old_messages): + # if for some reason the llm response was not saved + # TODO(thibault): convert this to dict instead of AIMessage in the bye langchain PR. + return AIMessage(content="""None""") + old_response = self.old_messages[len(messages)] self.new_messages.append(old_response) time.sleep(self.delay) @@ -38,6 +61,13 @@ class ReproAgentArgs(GenericAgentArgs): # starting with "_" will prevent from being part of the index in the load_results function _repro_dir: str = None + def __post_init__(self): + try: # some attributes might be temporarily args.CrossProd for hyperparameter generation + super().__post_init__() + self.agent_name = f"Repro_{self.agent_name}" + except AttributeError: + pass + def make_agent(self): return ReproAgent(self.chat_model_args, self.flags, self.max_retry, self._repro_dir) @@ -61,11 +91,12 @@ def get_action(self, obs): step = len(self.actions) step_info = self.exp_result.get_step_info(step) old_chat_messages = step_info.agent_info.get("chat_messages", None) + if old_chat_messages is None: err_msg = self.exp_result.summary_info["err_msg"] agent_info = AgentInfo( - markup_page=f"Agent had no chat messages. Perhaps there was an error. err_msg:\n{err_msg}", + markdown_page=f"Agent had no chat messages. Perhaps there was an error. err_msg:\n{err_msg}", ) return None, agent_info @@ -77,49 +108,39 @@ def get_action(self, obs): ) -def _make_agent_stats(action, agent_info, step_info, old_chat_messages, new_chat_messages): +# TODO(thibault): move this to llm utils in bye langchain PR. +def messages_to_dict(messages: list[dict] | list[BaseMessage]) -> dict: + new_messages = [] + for m in messages: + if isinstance(m, dict): + new_messages.append(m) + elif isinstance(m, str): + new_messages.append({"role": "", "content": m}) + elif isinstance(m, BaseMessage): + new_messages.append(convert_message_to_dict(m)) + else: + raise ValueError(f"Unknown message type: {type(m)}") + return new_messages - # format all messages into a string - old_msg_str = _format_messages(old_chat_messages) - new_msg_str = _format_messages(new_chat_messages) - html_diff = _make_diff(old_str=old_msg_str, new_str=new_msg_str) +def _make_agent_stats(action, agent_info, step_info, old_chat_messages, new_chat_messages): if isinstance(agent_info, dict): agent_info = AgentInfo(**agent_info) - agent_info.html_page = html_diff - agent_info.stats = _diff_stats(old_msg_str, new_msg_str) + old_msg_str = _format_messages(old_chat_messages) + new_msg_str = _format_messages(new_chat_messages) + + agent_info.html_page = _make_diff(old_str=old_msg_str, new_str=new_msg_str) + agent_info.stats.update(_diff_stats(old_msg_str, new_msg_str)) return action, agent_info def _format_messages(messages: list[dict]): + messages = messages_to_dict(messages) return "\n".join(f"{m['role']} message:\n{m['content']}\n" for m in messages) -def _make_diff(old_str, new_str): - diff = difflib.HtmlDiff().make_file( - old_str.splitlines(), new_str.splitlines(), fromdesc="Old Version", todesc="New Version" - ) - return diff - - -def _diff_stats(str1: str, str2: str): - lines1 = str1.splitlines() - lines2 = str2.splitlines() - - diff = list(difflib.Differ().compare(lines1, lines2)) - - # Count added and removed lines - added = sum(1 for line in diff if line.startswith("+ ")) - removed = sum(1 for line in diff if line.startswith("- ")) - - # Calculate difference ratio - difference_ratio = (added + removed) / (2 * max(len(lines1), len(lines2))) - - return dict(lines_added=added, lines_removed=removed, difference_ratio=difference_ratio) - - def reproduce_study(original_study_dir: Path | str): """Reproduce a study by running the same experiments with the same agent.""" @@ -164,3 +185,100 @@ def make_repro_agent(agent_args: AgentArgs, exp_dir: Path | str): max_retry=agent_args.max_retry, _repro_dir=exp_dir, ) + + +def _make_diff(old_str, new_str): + page = difflib.HtmlDiff().make_file( + old_str.splitlines(), new_str.splitlines(), fromdesc="Old Version", todesc="New Version" + ) + page = page.replace('nowrap="nowrap"', "") # Remove nowrap attribute + page = _set_style(page, DIFF_STYLE) + return page + + +def _diff_stats(str1: str, str2: str): + """Try some kind of metrics to make stats about the amount of diffs between two strings.""" + lines1 = str1.splitlines() + lines2 = str2.splitlines() + + diff = list(difflib.Differ().compare(lines1, lines2)) + + # Count added and removed lines + added = sum(1 for line in diff if line.startswith("+ ")) + removed = sum(1 for line in diff if line.startswith("- ")) + + # Calculate difference ratio + difference_ratio = (added + removed) / (2 * max(len(lines1), len(lines2))) + + return dict(lines_added=added, lines_removed=removed, difference_ratio=difference_ratio) + + +def _set_style(html_str: str, style: str, prepend_previous_style: bool = False): + """Add a style tag to an HTML string.""" + + soup = BeautifulSoup(html_str, "html.parser") + style_tag = soup.find("style") + + if not style_tag: + style_tag = soup.new_tag("style") + soup.head.append(style_tag) + + current_style = style_tag.string or "" + + if prepend_previous_style: + style = f"{style}\n{current_style}" + else: + style = f"{current_style}\n{style}" + + style_tag.string = style + + return str(soup) + + +# this is the style to adjust the diff table inside gradio +DIFF_STYLE = """ + table.diff { + font-size: 10px; + font-family: Courier; + border: medium; + width: 100%; + max-width: 100%; /* Ensure table does not exceed its container */ + table-layout: auto; /* Adjust column sizes dynamically */ + word-wrap: break-word; + overflow-wrap: break-word; + } + /* Constrain the max-width of the 3rd and 6th columns */ + td:nth-child(3), td:nth-child(6) { + max-width: 200px; /* Adjust this value to suit your content */ + white-space: normal; /* Allow wrapping in content columns */ + overflow-wrap: break-word; /* Break long words/content */ + } + /* Ensure span elements wrap inside the table */ + .diff_add, .diff_chg, .diff_sub { + word-wrap: break-word; /* Wrap long text */ + overflow-wrap: break-word; + } + + /* Keep the rest of the table flexible */ + td { + white-space: normal; /* Allow wrapping for content */ + } + .diff_header { + background-color: #e0e0e0; + } + td.diff_header { + text-align: right; + } + .diff_next { + background-color: #c0c0c0; + } + .diff_add { + background-color: #aaffaa; + } + .diff_chg { + background-color: #ffff77; + } + .diff_sub { + background-color: #ffaaaa; + } +""" diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index b1352afc..df484e1a 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -1,3 +1,4 @@ +import base64 import traceback from copy import deepcopy from io import BytesIO @@ -346,10 +347,10 @@ def run_gradio(results_dir: Path): with gr.Tab("Agent Info HTML") as tab_agent_info_html: with gr.Row(): - screenshot1 = gr.Image( + screenshot1_agent = gr.Image( show_label=False, interactive=False, show_download_button=False ) - screenshot2 = gr.Image( + screenshot2_agent = gr.Image( show_label=False, interactive=False, show_download_button=False ) agent_info_html = gr.HTML() @@ -443,7 +444,7 @@ def run_gradio(results_dir: Path): step_id.change(fn=if_active("Stats")(update_stats), outputs=stats) step_id.change( fn=if_active("Agent Info HTML", 3)(update_agent_info_html), - outputs=[agent_info_html, screenshot1, screenshot2], + outputs=[agent_info_html, screenshot1_agent, screenshot2_agent], ) step_id.change(fn=if_active("Agent Info MD")(update_agent_info_md), outputs=agent_info_md) step_id.change( @@ -616,27 +617,6 @@ def update_agent_info_md(): return None -def update_agent_info_html(): - global info - # screenshots from current and next step - screenshot_pre_action = image_to_jpg_base64_url(get_screenshot(info, info.step, False)) - screenshot_post_action = image_to_jpg_base64_url(get_screenshot(info, info.step + 1, False)) - - try: - agent_info = info.exp_result.steps_info[info.step].agent_info - page = agent_info.get("html_page", ["No Agent Info"]) - - # Page contains placeholders for screenshots - page = page.replace("screenshot_pre_action_placeholder", screenshot_pre_action) - page = page.replace("screenshot_post_action_placeholder", screenshot_post_action) - page = page.replace("max-width: 48%;", "max-width: 100%;") - if page is None: - page = """Fill up html_page attribute in AgentInfo to display here.""" - return page - except (FileNotFoundError, IndexError): - return None - - def update_agent_info_html(): global info # screenshots from current and next step @@ -645,15 +625,30 @@ def update_agent_info_html(): s2 = get_screenshot(info, info.step + 1, False) agent_info = info.exp_result.steps_info[info.step].agent_info page = agent_info.get("html_page", ["No Agent Info"]) - # Page contains placeholders for screenshots if page is None: page = """Fill up html_page attribute in AgentInfo to display here.""" + else: + page = _page_to_iframe(page) return page, s1, s2 except (FileNotFoundError, IndexError): return None, None, None +def _page_to_iframe(page: str): + html_bytes = page.encode("utf-8") + encoded_html = base64.b64encode(html_bytes).decode("ascii") + data_url = f"data:text/html;base64,{encoded_html}" + + # Create iframe with the data URL + page = f""" + +""" + return page + + def submit_action(input_text): global info agent_info = info.exp_result.steps_info[info.step].agent_info diff --git a/src/agentlab/experiments/launch_exp.py b/src/agentlab/experiments/launch_exp.py index b2ed28ec..f2868c5d 100644 --- a/src/agentlab/experiments/launch_exp.py +++ b/src/agentlab/experiments/launch_exp.py @@ -5,6 +5,12 @@ from browsergym.experiments.loop import ExpArgs, yield_all_exp_results +from agentlab.experiments.reproducibility_util import ( + infer_agent, + infer_benchmark, + write_reproducibility_info, +) + def import_object(path: str): module_name, obj_name = split_path(path) @@ -16,7 +22,13 @@ def import_object(path: str): return obj -def run_experiments(n_jobs, exp_args_list: list[ExpArgs], exp_dir, parallel_backend="joblib"): +def run_experiments( + n_jobs, + exp_args_list: list[ExpArgs], + study_dir, + parallel_backend="joblib", + strict_reproducibility=False, +): """Run a list of ExpArgs in parallel. To ensure optimal parallelism, make sure ExpArgs.depend_on is set correctly @@ -31,17 +43,32 @@ def run_experiments(n_jobs, exp_args_list: list[ExpArgs], exp_dir, parallel_back Directory where the experiments will be saved. parallel_backend: str Parallel backend to use. Either "joblib", "dask" or "sequential". - + strict_reproducibility: bool + If True, will raise an error: + * if there are local modifications in the git repositories or + * if the reproduibility info is inccompatible with an already + existing one e.g. when relaunch the study to fix errors. + Otherwise, it will only warn. """ + study_dir = Path(study_dir) + study_dir.mkdir(parents=True, exist_ok=True) + + write_reproducibility_info( + study_dir=study_dir, + agent_name=infer_agent(exp_args_list), + benchmark_name=infer_benchmark(exp_args_list), + strict_reproducibility=strict_reproducibility, + ) + if n_jobs == 1 and parallel_backend != "sequential": logging.warning("Only 1 job, switching to sequential backend.") parallel_backend = "sequential" - logging.info(f"Saving experiments to {exp_dir}") + logging.info(f"Saving experiments to {study_dir}") for exp_args in exp_args_list: exp_args.agent_args.prepare() - exp_args.prepare(exp_root=exp_dir) + exp_args.prepare(exp_root=study_dir) try: if parallel_backend == "joblib": from joblib import Parallel, delayed diff --git a/src/agentlab/experiments/reproduce_study.py b/src/agentlab/experiments/reproduce_study.py index ba2a72da..5c5759b1 100644 --- a/src/agentlab/experiments/reproduce_study.py +++ b/src/agentlab/experiments/reproduce_study.py @@ -1,14 +1,15 @@ +""" +This script will leverage an old study to reproduce it on the same tasks and +same seeds. Instead of calling the LLM it will reuse the responses from the old +llm. Load the study in agent-xray and look at the Agent Info HTML to compare +the diff in HTML format. +""" + import logging from agentlab.agents.generic_agent.reproducibility_agent import reproduce_study from agentlab.experiments.exp_utils import RESULTS_DIR from agentlab.experiments.launch_exp import make_study_dir, run_experiments -from agentlab.experiments.reproducibility_util import ( - write_reproducibility_info, - add_experiment_to_journal, - infer_agent, - infer_benchmark, -) logging.getLogger().setLevel(logging.INFO) @@ -16,24 +17,11 @@ if __name__ == "__main__": - # study_dir = RESULTS_DIR / "2024-06-02_18-16-17_final_run" - old_study_dir = ( - RESULTS_DIR / "2024-09-12_08-39-16_GenericAgent-gpt-4o-mini_on_miniwob_tiny_test" - ) - study_name, exp_args_list = reproduce_study(old_study_dir) + old_study = "2024-06-02_18-16-17_final_run" + # old_study = "2024-09-12_08-39-16_GenericAgent-gpt-4o-mini_on_miniwob_tiny_test" + + study_name, exp_args_list = reproduce_study(RESULTS_DIR / old_study) study_dir = make_study_dir(RESULTS_DIR, study_name) n_jobs = 1 - write_reproducibility_info( - study_dir=study_dir, - agent_name=infer_agent(exp_args_list), - benchmark_name=infer_benchmark(exp_args_list), - ignore_changes=True, - ) - - # run the experiments - run_experiments(n_jobs, exp_args_list, study_dir, parallel_backend="joblib") - # finally: - # # will try to gather info at the end even if run_experiments failed - # add_experiment_to_journal(study_dir) diff --git a/src/agentlab/experiments/reproducibility_script.py b/src/agentlab/experiments/reproducibility_script.py index 777df3ac..72128e54 100644 --- a/src/agentlab/experiments/reproducibility_script.py +++ b/src/agentlab/experiments/reproducibility_script.py @@ -5,11 +5,7 @@ from agentlab.experiments import study_generators from agentlab.experiments.exp_utils import RESULTS_DIR from agentlab.experiments.launch_exp import make_study_dir, run_experiments, relaunch_study -from agentlab.experiments.reproducibility_util import ( - set_temp, - write_reproducibility_info, - add_experiment_to_journal, -) +from agentlab.experiments.reproducibility_util import set_temp, add_experiment_to_journal logging.getLogger().setLevel(logging.INFO) @@ -41,13 +37,6 @@ study_name, exp_args_list = study_generators.run_agents_on_benchmark(agent_args, benchmark) study_dir = make_study_dir(RESULTS_DIR, study_name) - write_reproducibility_info( - study_dir=study_dir, - agent_name=agent_args.agent_name, - benchmark_name=benchmark, - ignore_changes=False, - ) - # run the experiments try: run_experiments(n_jobs, exp_args_list, study_dir, parallel_backend="joblib") diff --git a/src/agentlab/experiments/reproducibility_util.py b/src/agentlab/experiments/reproducibility_util.py index 97e16213..99116ab9 100644 --- a/src/agentlab/experiments/reproducibility_util.py +++ b/src/agentlab/experiments/reproducibility_util.py @@ -215,7 +215,7 @@ def add_git_info(module_name, module): return info -def _assert_compatible(info: dict, old_info: dict): +def _assert_compatible(info: dict, old_info: dict, raise_if_incompatible=True): """Make sure that the two info dicts are compatible.""" # TODO may need to adapt if there are multiple agents, and the re-run on # error only has a subset of agents. Hence old_info.agent_name != info.agent_name @@ -223,10 +223,17 @@ def _assert_compatible(info: dict, old_info: dict): if key in ("date", "avg_reward", "std_err", "n_completed", "n_err"): continue if info[key] != old_info[key]: - raise ValueError( - f"Reproducibility info already exist and is not compatible." - f"Key {key} has changed from {old_info[key]} to {info[key]}." - ) + if not raise_if_incompatible: + logging.warning( + f"Reproducibility info already exist and is not compatible." + f"Key {key} has changed from {old_info[key]} to {info[key]}." + ) + else: + raise ValueError( + f"Reproducibility info already exist and is not compatible." + f"Key {key} has changed from {old_info[key]} to {info[key]}." + f"Set strict_reproducibility=False to bypass this error." + ) def _benchmark_from_task_name(task_name: str): @@ -237,27 +244,32 @@ def _benchmark_from_task_name(task_name: str): def infer_agent(exp_args_list: list[ExpArgs]): - agent_names = set(exp_args.agent_args.agent_name for exp_args in exp_args_list) - return ",".join(agent_names) + return list(set(exp_args.agent_args.agent_name for exp_args in exp_args_list)) def infer_benchmark(exp_args_list: list[ExpArgs]): - benchmark_names = set( + bench_name = set( _benchmark_from_task_name(exp_args.env_args.task_name) for exp_args in exp_args_list ) - return ",".join(benchmark_names) + if len(bench_name) > 1: + raise ValueError( + f"Multiple benchmarks in the same study are not well supported: {bench_name}." + "Comment out the reproducibility part of the code to proceed at your own risk." + ) + + return bench_name.pop() def write_reproducibility_info( - study_dir, agent_name, benchmark_name, comment=None, ignore_changes=False + study_dir, agent_name, benchmark_name, comment=None, strict_reproducibility=True ): info = get_reproducibility_info( - agent_name, benchmark_name, comment, ignore_changes=ignore_changes + agent_name, benchmark_name, comment, ignore_changes=not strict_reproducibility ) - return save_reproducibility_info(study_dir, info) + return save_reproducibility_info(study_dir, info, strict_reproducibility) -def save_reproducibility_info(study_dir, info): +def save_reproducibility_info(study_dir, info, strict_reproducibility=True): """ Save a JSON file containing reproducibility information to the specified directory. """ @@ -267,7 +279,7 @@ def save_reproducibility_info(study_dir, info): if info_path.exists(): with open(info_path, "r") as f: existing_info = json.load(f) - _assert_compatible(info, existing_info) + _assert_compatible(info, existing_info, raise_if_incompatible=strict_reproducibility) logging.info( "Reproducibility info already exists and is compatible. Overwriting the old one." ) diff --git a/tests/experiments/test_launch_exp.py b/tests/experiments/test_launch_exp.py index 51149657..45eacf61 100644 --- a/tests/experiments/test_launch_exp.py +++ b/tests/experiments/test_launch_exp.py @@ -43,7 +43,7 @@ def test_launch_system(backend="dask"): study_dir = make_study_dir(tmp_dir, "generic_agent_test") run_experiments( - n_jobs=2, exp_args_list=exp_args_list, exp_dir=study_dir, parallel_backend=backend + n_jobs=2, exp_args_list=exp_args_list, study_dir=study_dir, parallel_backend=backend ) results_df = inspect_results.load_result_df(study_dir, progress_fn=None) @@ -80,7 +80,7 @@ def test_4o_mini_on_miniwob_tiny_test(): ) study_dir = make_study_dir(tmp_dir, study_name) - run_experiments(n_jobs=4, exp_args_list=exp_args_list, exp_dir=study_dir) + run_experiments(n_jobs=4, exp_args_list=exp_args_list, study_dir=study_dir) results_df = inspect_results.load_result_df(study_dir, progress_fn=None) for row in results_df.iterrows(): From fe27819a99b163fd9240ba3e144e010413bff24d Mon Sep 17 00:00:00 2001 From: recursix Date: Tue, 1 Oct 2024 11:38:50 -0400 Subject: [PATCH 53/58] instructions to setup workarena --- README.md | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 02fda142..b7fc0035 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,33 @@ export MINIWOB_URL="file://$HOME/dev/miniwob-plusplus/miniwob/html/miniwob/" ``` +
+ +WorkArena + +See [detailed instructions on workarena github](https://github.com/ServiceNow/WorkArena?tab=readme-ov-file#getting-started) + +At a glance: +* [Sign in](https://developer.servicenow.com/) and reqeuest a `washington` instance. +* Once the instance is ready, you should see `` and `` +* Add these to your `.bashrc` (or `.zshrc`) and `source` it (note: make sure that + all variables are in single quotes unless you happen to have a password with a + single quote in it) +```bash +export SNOW_INSTANCE_URL='https://.service-now.com/' +export SNOW_INSTANCE_UNAME='admin' +export SNOW_INSTANCE_PWD='' +``` + +```bash +pip install browsergym-workarena +playwright install +workarena-install +``` + + +
+
WebArena on AWS TODO @@ -65,17 +92,7 @@ TODO
-
- -WorkArena - -```bash -export SNOW_INSTANCE_URL="https://.service-now.com/" -export SNOW_INSTANCE_UNAME="admin" -export SNOW_INSTANCE_PWD= -``` -
## Launch experiments From 4a8f078817400649db2a611c3ec0464c3a9f3946 Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Tue, 1 Oct 2024 14:56:44 -0400 Subject: [PATCH 54/58] fixing tests --- tests/experiments/test_launch_exp.py | 27 +++++++++++++++------------ tests/llm/test_chat_api.py | 23 ----------------------- 2 files changed, 15 insertions(+), 35 deletions(-) diff --git a/tests/experiments/test_launch_exp.py b/tests/experiments/test_launch_exp.py index 45eacf61..d7eb981b 100644 --- a/tests/experiments/test_launch_exp.py +++ b/tests/experiments/test_launch_exp.py @@ -1,14 +1,15 @@ +import tempfile +from pathlib import Path + import pytest -from agentlab.experiments.launch_exp import relaunch_study, run_experiments, make_study_dir -from agentlab.experiments.study_generators import run_agents_on_benchmark from browsergym.experiments.loop import EnvArgs, ExpArgs -from agentlab.agents.generic_agent.generic_agent import GenericAgentArgs + from agentlab.agents.generic_agent.agent_configs import FLAGS_GPT_3_5, AGENT_4o_MINI -from agentlab.llm.chat_api import CheatMiniWoBLLMArgs +from agentlab.agents.generic_agent.generic_agent import GenericAgentArgs from agentlab.analyze import inspect_results -import tempfile - -from pathlib import Path +from agentlab.experiments.launch_exp import make_study_dir, relaunch_study, run_experiments +from agentlab.experiments.study_generators import run_agents_on_benchmark +from agentlab.llm.chat_api import CheatMiniWoBLLMArgs def test_relaunch_study(): @@ -89,12 +90,14 @@ def test_4o_mini_on_miniwob_tiny_test(): print(row[1].stack_trace) assert len(results_df) == len(exp_args_list) - global_report = inspect_results.global_report(results_df) - print(global_report) - assert global_report.avg_reward["[ALL TASKS]"] == 1.0 + summary = inspect_results.summarize_study(results_df) + print(summary) + assert len(summary) == 1 + reward = summary.avg_reward.iloc[0] + assert reward == 1.0 if __name__ == "__main__": - # test_4o_mini_on_miniwob_tiny_test() + test_4o_mini_on_miniwob_tiny_test() # test_launch_system() - test_launch_system_sequntial() + # test_launch_system_sequntial() diff --git a/tests/llm/test_chat_api.py b/tests/llm/test_chat_api.py index cf62fe32..ef54169f 100644 --- a/tests/llm/test_chat_api.py +++ b/tests/llm/test_chat_api.py @@ -16,29 +16,6 @@ skip_tests = False -@pytest.mark.pricy -@pytest.mark.skipif(skip_tests, reason="Skipping on remote as HF token have limited usage") -def test_api_model_args_hf(): - model_name = "HuggingFaceH4/starchat-beta" - - model_args = HuggingFaceModelArgs( - model_name=model_name, - max_total_tokens=8192, - max_input_tokens=8192 - 512, - max_new_tokens=512, - temperature=1e-1, - ) - model = model_args.make_model() - - messages = [ - SystemMessage(content="You are an helpful virtual assistant"), - HumanMessage(content="Give the third prime number"), - ] - answer = model.invoke(messages) - - assert "5" in answer.content - - @pytest.mark.pricy @pytest.mark.skipif(skip_tests, reason="Skipping on remote as Azure is pricy") def test_api_model_args_azure(): From 6474558724232a437e935f01c4907bb42c039bd2 Mon Sep 17 00:00:00 2001 From: recursix Date: Tue, 1 Oct 2024 16:06:51 -0400 Subject: [PATCH 55/58] handles better a few edge cases --- reproducibility_journal.csv | 1 + src/agentlab/experiments/launch_exp.py | 4 +++ .../experiments/reproducibility_util.py | 26 +++++++++---------- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv index 32386c6a..62c7b70d 100644 --- a/reproducibility_journal.csv +++ b/reproducibility_journal.csv @@ -5,3 +5,4 @@ recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_tiny_test,0.6.3,2024-09-19_ recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-09-20_07-16-21,0.546,0.02,0,625/625,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,295f01005faf8f2c73a31be6a18cec19d563b54b,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36, recursix,GenericAgent-gpt-4o-2024-05-13,miniwob,0.6.3,2024-09-20_22-09-43,0.656,0.019,0,625/625,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,f6216486d5faac2c8b3fb0a63e114e5a4bafde47,,0.6.4,8cef8fe34940ff490d0cc06b0c8f100180d09d43, recursix,GenericAgent-gpt-4o-2024-05-13,miniwob,0.6.3,2024-09-21_12-04-39,0.656,0.019,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,fe561b93c5f053e9f9625358862f542523b5e14a,,0.7.0,ed6d6992ef64bfb91aca7002d33cb6ed5ec031ef, +recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-10-01_11-45-23,0.539,0.02,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,fe27819a99b163fd9240ba3e144e010413bff24d,,0.7.1,b0ad675572e01cac0d7255100112de0828877148, diff --git a/src/agentlab/experiments/launch_exp.py b/src/agentlab/experiments/launch_exp.py index f2868c5d..b82d2782 100644 --- a/src/agentlab/experiments/launch_exp.py +++ b/src/agentlab/experiments/launch_exp.py @@ -51,6 +51,10 @@ def run_experiments( Otherwise, it will only warn. """ + if len(exp_args_list) == 0: + logging.warning("No experiments to run.") + return + study_dir = Path(study_dir) study_dir.mkdir(parents=True, exist_ok=True) diff --git a/src/agentlab/experiments/reproducibility_util.py b/src/agentlab/experiments/reproducibility_util.py index 99116ab9..f702d2d0 100644 --- a/src/agentlab/experiments/reproducibility_util.py +++ b/src/agentlab/experiments/reproducibility_util.py @@ -304,13 +304,19 @@ def load_reproducibility_info(study_dir) -> dict[str]: def add_reward(info, study_dir, ignore_incomplete=False): + """Add the average reward and standard error to the info dict. + + Verifies that all tasks are completed and that there are no errors. + """ result_df = inspect_results.load_result_df(study_dir) report = inspect_results.summarize_study(result_df) if len(report) > 1: raise ValueError("Multi agent not implemented yet") - assert isinstance(info["agent_name"], str) + if isinstance(info["agent_name"], (list, tuple)): + if len(info["agent_name"]) > 1: + raise ValueError("Multi agent not implemented yet") idx = report.index[0] n_err = report.loc[idx, "n_err"].item() @@ -343,6 +349,7 @@ def _get_csv_headers(file_path: str) -> list[str]: def append_to_journal(info, journal_path=None): + """Append the info and results to the reproducibility journal.""" if journal_path is None: journal_path = Path(agentlab.__file__).parent.parent.parent / "reproducibility_journal.csv" @@ -356,18 +363,11 @@ def append_to_journal(info, journal_path=None): rows.append(headers) if isinstance(info["agent_name"], (list, tuple)): - # handle multiple agents - assert len(info["agent_name"]) == len(info["reward"]) - assert len(info["agent_name"]) == len(info["std_err"]) - - for i, agent_name in info["agent_name"]: - sub_info = info.copy() - sub_info["agent_name"] = agent_name - sub_info["reward"] = info["reward"][i] - sub_info["std_err"] = info["std_err"][i] - rows.append([str(sub_info[key]) for key in headers]) - else: - rows.append([str(info[key]) for key in headers]) + if len(info["agent_name"]) > 1: + raise ValueError("Multi agent not implemented yet") + info["agent_name"] = info["agent_name"][0] + + rows.append([str(info[key]) for key in headers]) with open(journal_path, "a", newline="") as file: writer = csv.writer(file) for row in rows: From 628d1c87138eaebc4946d5949dfcd9f38bc5f0f9 Mon Sep 17 00:00:00 2001 From: recursix Date: Tue, 1 Oct 2024 21:44:57 -0400 Subject: [PATCH 56/58] default progress function to None --- src/agentlab/analyze/inspect_results.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index dbf1e2a8..403aefdf 100644 --- a/src/agentlab/analyze/inspect_results.py +++ b/src/agentlab/analyze/inspect_results.py @@ -826,7 +826,7 @@ def get_all_summaries(results_dir: Path, skip_hidden=True, ignore_cache=False, i def get_study_summary( - study_dir: Path, ignore_cache=False, sentinel=None, ignore_stale=False + study_dir: Path, ignore_cache=False, sentinel=None, ignore_stale=False, progress_fn=None ) -> pd.DataFrame: """Get the cached study summary for the given study directory. @@ -854,7 +854,7 @@ def get_study_summary( sentinel["from_cache"] = True return pd.read_csv(summary_path) - result_df = load_result_df(study_dir) + result_df = load_result_df(study_dir, progress_fn=progress_fn) if result_df is None: return None From 69f147a89274ce1f232096acd5602be5b9140ae9 Mon Sep 17 00:00:00 2001 From: recursix Date: Tue, 1 Oct 2024 21:45:10 -0400 Subject: [PATCH 57/58] minor formatting --- README.md | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index b7fc0035..f6fce265 100644 --- a/README.md +++ b/README.md @@ -61,22 +61,23 @@ export MINIWOB_URL="file://$HOME/dev/miniwob-plusplus/miniwob/html/miniwob/" See [detailed instructions on workarena github](https://github.com/ServiceNow/WorkArena?tab=readme-ov-file#getting-started) At a glance: -* [Sign in](https://developer.servicenow.com/) and reqeuest a `washington` instance. -* Once the instance is ready, you should see `` and `` -* Add these to your `.bashrc` (or `.zshrc`) and `source` it (note: make sure that +1) [Sign in](https://developer.servicenow.com/) and reqeuest a `washington` instance. +2) Once the instance is ready, you should see `` and `` +3) Add these to your `.bashrc` (or `.zshrc`) and `source` it (note: make sure that all variables are in single quotes unless you happen to have a password with a single quote in it) -```bash -export SNOW_INSTANCE_URL='https://.service-now.com/' -export SNOW_INSTANCE_UNAME='admin' -export SNOW_INSTANCE_PWD='' -``` - -```bash -pip install browsergym-workarena -playwright install -workarena-install -``` + ```bash + export SNOW_INSTANCE_URL='https://.service-now.com/' + export SNOW_INSTANCE_UNAME='admin' + export SNOW_INSTANCE_PWD='' + ``` +4) finally run these commands: + + ```bash + pip install browsergym-workarena + playwright install + workarena-install + ``` From 146ad629efa21b8b0241911faadd144850818617 Mon Sep 17 00:00:00 2001 From: recursix Date: Wed, 2 Oct 2024 11:08:56 +0000 Subject: [PATCH 58/58] minor --- src/agentlab/experiments/reproducibility_script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentlab/experiments/reproducibility_script.py b/src/agentlab/experiments/reproducibility_script.py index 72128e54..e8599b5c 100644 --- a/src/agentlab/experiments/reproducibility_script.py +++ b/src/agentlab/experiments/reproducibility_script.py @@ -18,7 +18,7 @@ ## select the benchmark to run on benchmark = "miniwob" # benchmark = "miniwob_tiny_test" - # benchmark = "workarena.l1 + # benchmark = "workarena.l1" # benchmark = "workarena.l2" # benchmark = "workarena.l3" # benchmark = "webarena"