Skip to content

Commit

Permalink
Study refactor (#73)
Browse files Browse the repository at this point in the history
* adapting to new Benchmark class

* fixing tests

* fix tests

* typo

* not ready for gradio 5

* study id and a few fixes

* fixing pricy tests

---------

Co-authored-by: ThibaultLSDC <[email protected]>
  • Loading branch information
recursix and ThibaultLSDC authored Oct 20, 2024
1 parent f95df4a commit 98acd0c
Show file tree
Hide file tree
Showing 22 changed files with 567 additions and 877 deletions.
22 changes: 11 additions & 11 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,30 +6,30 @@
repository.
"""

import bgym
import logging

from agentlab.agents.generic_agent import (
RANDOM_SEARCH_AGENT,
AGENT_4o,
AGENT_4o_MINI,
AGENT_LLAMA3_70B,
AGENT_LLAMA31_70B,
)
from agentlab.analyze.inspect_results import get_most_recent_folder
from agentlab.experiments import study_generators
from agentlab.experiments.study import Study

logging.getLogger().setLevel(logging.INFO)

# choose your agent or provide a new agent
agent_args = [AGENT_4o_MINI]
# agent_args = [AGENT_4o]

## select the benchmark to run on

# select the benchmark to run on
benchmark = "miniwob_tiny_test"
# benchmark = "miniwob"
# benchmark = "workarena.l1"
# benchmark = "workarena.l2"
# benchmark = "workarena.l3"
# benchmark = "miniwob_all"
# benchmark = "workarena_l1"
# benchmark = "workarena_l2"
# benchmark = "workarena_l3"
# benchmark = "webarena"

# Set reproducibility_mode = True for reproducibility
Expand All @@ -53,11 +53,11 @@

if relaunch:
# relaunch an existing study
study_dir = get_most_recent_folder()
study = study_generators.make_relaunch_study(study_dir, relaunch_mode="incomplete_or_error")
study = Study.load_most_recent()
study.find_incomplete(relaunch_mode="incomplete_or_error")

else:
study = study_generators.run_agents_on_benchmark(agent_args, benchmark)
study = Study(agent_args, benchmark)

study.run(n_jobs=n_jobs, parallel_backend="joblib", strict_reproducibility=reproducibility_mode)

Expand Down
22 changes: 11 additions & 11 deletions reproducibility_journal.csv
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
git_user,agent_name,benchmark,benchmark_version,date,avg_reward,std_err,n_err,n_completed,comment,os,python_version,playwright_version,agentlab_version,agentlab_git_hash,agentlab__local_modifications,browsergym_version,browsergym_git_hash,browsergym__local_modifications
recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_tiny_test,0.6.3,2024-09-19_21-07-34,0.75,0.217,0,4/4,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,c99bdf74c98f323cc6a646467ba5f21154b6fd18,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36,
recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_tiny_test,0.6.3,2024-09-19_21-28-58,1.0,0.0,0,4/4,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,c99bdf74c98f323cc6a646467ba5f21154b6fd18," M: reproducibility_journal.csv
git_user,agent_name,benchmark,benchmark_version,date,study_id,avg_reward,std_err,n_err,n_completed,comment,os,python_version,playwright_version,agentlab_version,agentlab_git_hash,agentlab__local_modifications,browsergym_version,browsergym_git_hash,browsergym__local_modifications
recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_tiny_test,0.6.3,2024-09-19_21-07-34,,0.75,0.217,0,4/4,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,c99bdf74c98f323cc6a646467ba5f21154b6fd18,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36,
recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_tiny_test,0.6.3,2024-09-19_21-28-58,,1.0,0.0,0,4/4,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,c99bdf74c98f323cc6a646467ba5f21154b6fd18," M: reproducibility_journal.csv
M: src/agentlab/experiments/task_collections.py",0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36,
recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-09-20_07-16-21,0.546,0.02,0,625/625,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,295f01005faf8f2c73a31be6a18cec19d563b54b,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36,
recursix,GenericAgent-gpt-4o-2024-05-13,miniwob,0.6.3,2024-09-20_22-09-43,0.656,0.019,0,625/625,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,f6216486d5faac2c8b3fb0a63e114e5a4bafde47,,0.6.4,8cef8fe34940ff490d0cc06b0c8f100180d09d43,
recursix,GenericAgent-gpt-4o-2024-05-13,miniwob,0.6.3,2024-09-21_12-04-39,0.656,0.019,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,fe561b93c5f053e9f9625358862f542523b5e14a,,0.7.0,ed6d6992ef64bfb91aca7002d33cb6ed5ec031ef,
recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-10-01_11-45-23,0.539,0.02,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,fe27819a99b163fd9240ba3e144e010413bff24d,,0.7.1,b0ad675572e01cac0d7255100112de0828877148,
recursix,GenericAgent-gpt-4o-mini-2024-07-18,workarena.l1,0.3.2,2024-10-05_13-21-27,0.23,0.023,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,aadf86b397cd36c581e1a61e491aec649ac5a140, M: main.py,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
recursix,GenericAgent-gpt-4o-2024-05-13,workarena.l1,0.3.2,2024-10-05_15-45-42,0.382,0.027,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,ab447e997af589bbd022de7a5189a7685ddfa6ef,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
recursix,GenericAgent-meta-llama_llama-3.1-70b-instruct,miniwob_tiny_test,0.7.0,2024-10-05_17-49-15,1.0,0.0,0,4/4,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,a98fa24426a6ddde8443e8be44ed94cd9522e5ca,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
recursix,GenericAgent-meta-llama_llama-3-70b-instruct,workarena.l1,0.3.2,2024-10-09_21-16-37,0.176,0.021,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,c847dbd334184271b32b252409a1b6c1042d7442,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-09-20_07-16-21,,0.546,0.02,0,625/625,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,295f01005faf8f2c73a31be6a18cec19d563b54b,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36,
recursix,GenericAgent-gpt-4o-2024-05-13,miniwob,0.6.3,2024-09-20_22-09-43,,0.656,0.019,0,625/625,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,f6216486d5faac2c8b3fb0a63e114e5a4bafde47,,0.6.4,8cef8fe34940ff490d0cc06b0c8f100180d09d43,
recursix,GenericAgent-gpt-4o-2024-05-13,miniwob,0.6.3,2024-09-21_12-04-39,,0.656,0.019,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,fe561b93c5f053e9f9625358862f542523b5e14a,,0.7.0,ed6d6992ef64bfb91aca7002d33cb6ed5ec031ef,
recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-10-01_11-45-23,,0.539,0.02,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,fe27819a99b163fd9240ba3e144e010413bff24d,,0.7.1,b0ad675572e01cac0d7255100112de0828877148,
recursix,GenericAgent-gpt-4o-mini-2024-07-18,workarena.l1,0.3.2,2024-10-05_13-21-27,,0.23,0.023,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,aadf86b397cd36c581e1a61e491aec649ac5a140," M: main.py",0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
recursix,GenericAgent-gpt-4o-2024-05-13,workarena.l1,0.3.2,2024-10-05_15-45-42,,0.382,0.027,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,ab447e997af589bbd022de7a5189a7685ddfa6ef,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
recursix,GenericAgent-meta-llama_llama-3-70b-instruct,workarena.l1,0.3.2,2024-10-09_21-16-37,,0.176,0.021,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,c847dbd334184271b32b252409a1b6c1042d7442,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
recursix,GenericAgent-meta-llama_llama-3.1-70b-instruct,miniwob_tiny_test,0.7.0,2024-10-05_17-49-15,,1.0,0.0,0,4/4,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,a98fa24426a6ddde8443e8be44ed94cd9522e5ca,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ contexttimer
ipython
pyyaml>=6
pandas
gradio==4.*
gradio<5
gitpython # for the reproducibility script
requests
matplotlib
3 changes: 2 additions & 1 deletion src/agentlab/agents/agent_args.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from bgym import AbstractAgentArgs
import bgym


class AgentArgs(AbstractAgentArgs):

def set_benchmark(self, benchmark: str, demo_mode: bool):
def set_benchmark(self, benchmark: bgym.Benchmark, demo_mode: bool):
"""Optional method to set benchmark specific flags.
This allows the agent to have minor adjustments based on the benchmark.
Expand Down
41 changes: 21 additions & 20 deletions src/agentlab/agents/dynamic_prompting.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import abc
import difflib
import logging
import platform
import time
Expand All @@ -9,6 +8,7 @@
from typing import Literal
from warnings import warn

import bgym
from browsergym.core.action.base import AbstractActionSet
from browsergym.core.action.highlevel import HighLevelActionSet
from browsergym.core.action.python import PythonActionSet
Expand Down Expand Up @@ -94,13 +94,14 @@ class ObsFlags(Flags):

@dataclass
class ActionFlags(Flags):
multi_actions: bool = False
action_set: str = "bid"
is_strict: bool = False
demo_mode: Literal["off", "default", "all_blue", "only_visible_elements"] = "off"
action_set: bgym.HighLevelActionSetArgs = None # should be set by the set_benchmark method
long_description: bool = True
individual_examples: bool = False

# for backward compatibility
multi_actions: bool = None
is_strict: bool = None


class PromptElement:
"""Base class for all prompt elements. Prompt elements can be hidden."""
Expand Down Expand Up @@ -592,24 +593,24 @@ def _parse_answer(self, text_answer):
return ans_dict


def make_action_set(action_flags: ActionFlags) -> AbstractActionSet:
# def make_action_set(action_flags: ActionFlags) -> AbstractActionSet:

if action_flags.action_set == "python":
action_set = PythonActionSet(strict=action_flags.is_strict)
if action_flags.demo_mode != "off":
warn(
f'Action_set "python" is incompatible with demo_mode={repr(action_flags.demo_mode)}.'
)
return action_set
# if action_flags.action_set == "python":
# action_set = PythonActionSet(strict=action_flags.is_strict)
# if action_flags.demo_mode != "off":
# warn(
# f'Action_set "python" is incompatible with demo_mode={repr(action_flags.demo_mode)}.'
# )
# return action_set

action_set = HighLevelActionSet(
subsets=list(set(["chat"] + ["infeas"] + action_flags.action_set.split("+"))),
multiaction=action_flags.multi_actions,
strict=action_flags.is_strict,
demo_mode=action_flags.demo_mode,
)
# action_set = HighLevelActionSet(
# subsets=list(set(["chat"] + ["infeas"] + action_flags.action_set.split("+"))),
# multiaction=action_flags.multi_actions,
# strict=action_flags.is_strict,
# demo_mode=action_flags.demo_mode,
# )

return action_set
# return action_set


class Think(PromptElement):
Expand Down
41 changes: 27 additions & 14 deletions src/agentlab/agents/generic_agent/agent_configs.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import bgym
from agentlab.agents import dynamic_prompting as dp
from agentlab.experiments import args
from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
Expand Down Expand Up @@ -25,8 +26,10 @@
filter_visible_elements_only=False,
),
action=dp.ActionFlags(
multi_actions=False,
action_set="bid",
action_set=bgym.HighLevelActionSetArgs(
subsets=["bid"],
multiaction=False,
),
long_description=False,
individual_examples=True,
),
Expand Down Expand Up @@ -71,8 +74,10 @@
filter_visible_elements_only=False,
),
action=dp.ActionFlags(
multi_actions=False, # often detrimental
action_set="bid",
action_set=bgym.HighLevelActionSetArgs(
subsets=["bid"],
multiaction=False,
),
long_description=False,
individual_examples=True,
),
Expand Down Expand Up @@ -116,8 +121,10 @@
filter_visible_elements_only=False,
),
action=dp.ActionFlags(
multi_actions=False,
action_set="bid",
action_set=bgym.HighLevelActionSetArgs(
subsets=["bid"],
multiaction=False,
),
long_description=False,
individual_examples=True,
),
Expand Down Expand Up @@ -164,8 +171,10 @@
filter_visible_elements_only=False,
),
action=dp.ActionFlags(
multi_actions=True,
action_set="bid",
action_set=bgym.HighLevelActionSetArgs(
subsets=["bid"],
multiaction=True,
),
long_description=False,
individual_examples=True,
),
Expand Down Expand Up @@ -210,8 +219,10 @@
filter_visible_elements_only=False,
),
action=dp.ActionFlags(
multi_actions=False,
action_set="bid",
action_set=bgym.HighLevelActionSetArgs(
subsets=["bid"],
multiaction=False,
),
long_description=False,
individual_examples=False,
),
Expand Down Expand Up @@ -270,10 +281,12 @@
filter_visible_elements_only=args.Choice([True, False], p=[0.3, 0.7]),
),
action=dp.ActionFlags(
multi_actions=args.Choice([True, False], p=[0.7, 0.3]),
action_set=args.Choice(["bid", "bid+coord"]),
# action_set=args.Choice(["python", "bid", "coord",
# "bid+coord"]),
action_set=bgym.HighLevelActionSetArgs(
subsets=args.Choice([["bid"], ["bid", "coord"]]),
multiaction=args.Choice([True, False], p=[0.7, 0.3]),
),
long_description=False,
individual_examples=False,
),
# drop_ax_tree_first=True, # this flag is no longer active, according to browsergym doc
use_plan=args.Choice([True, False]),
Expand Down
19 changes: 15 additions & 4 deletions src/agentlab/agents/generic_agent/generic_agent.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from copy import deepcopy
from dataclasses import asdict, dataclass
from functools import partial
from warnings import warn

import bgym
from browsergym.experiments.agent import Agent, AgentInfo

from agentlab.agents import dynamic_prompting as dp
Expand All @@ -25,13 +27,22 @@ def __post_init__(self):
except AttributeError:
pass

def set_benchmark(self, benchmark: "bgym.Benchmark", demo_mode):
    """Override some flags based on the benchmark.

    Args:
        benchmark: Benchmark being run. Its ``name`` and
            ``high_level_action_set_args`` attributes are read here.
        demo_mode: When truthy, the action set is switched to the
            ``"all_blue"`` demo mode.
    """
    if benchmark.name.startswith("miniwob"):
        self.flags.obs.use_html = True

    # Work on a private copy so the tweaks below never leak back into the
    # benchmark's own action-set arguments.
    action_set_args = deepcopy(benchmark.high_level_action_set_args)
    self.flags.action.action_set = action_set_args

    # for backward compatibility with old traces
    if self.flags.action.multi_actions is not None:
        action_set_args.multiaction = self.flags.action.multi_actions
    if self.flags.action.is_strict is not None:
        action_set_args.strict = self.flags.action.is_strict

    # verify if we can remove this
    if demo_mode:
        # Set the demo flag on the action-set args held by the flags, which is
        # what the agent later builds its action set from. Nothing visible in
        # this method defines a `self.action_set` on the args object, so
        # writing to it would fail or be ignored.
        action_set_args.demo_mode = "all_blue"

def set_reproducibility_mode(self):
self.chat_model_args.temperature = 0
Expand Down Expand Up @@ -62,7 +73,7 @@ def __init__(
self.max_retry = max_retry

self.flags = flags
self.action_set = dp.make_action_set(self.flags.action)
self.action_set = self.flags.action.action_set.make_action_set()
self._obs_preprocessor = dp.make_obs_preprocessor(flags.obs)

self._check_flag_constancy()
Expand Down
76 changes: 1 addition & 75 deletions src/agentlab/agents/generic_agent/generic_agent_prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def __init__(
def time_for_caution():
# no need for caution if we're in single action mode
return flags.be_cautious and (
flags.action.multi_actions or flags.action.action_set == "python"
flags.action.action_set.multiaction or flags.action.action_set == "python"
)

self.be_cautious = dp.BeCautious(visible=time_for_caution)
Expand Down Expand Up @@ -242,77 +242,3 @@ class Criticise(dp.PromptElement):

def _parse_answer(self, text_answer):
return parse_html_tags_raise(text_answer, optional_keys=["action_draft", "criticise"])


if __name__ == "__main__":
html_template = """
<html>
<body>
<div>
Hello World.
Step {}.
</div>
</body>
</html>
"""

OBS_HISTORY = [
{
"goal": "do this and that",
"pruned_html": html_template.format(1),
"axtree_txt": "[1] Click me",
"last_action_error": "",
"focused_element_bid": "32",
},
{
"goal": "do this and that",
"pruned_html": html_template.format(2),
"axtree_txt": "[1] Click me",
"last_action_error": "",
"focused_element_bid": "32",
},
{
"goal": "do this and that",
"pruned_html": html_template.format(3),
"axtree_txt": "[1] Click me",
"last_action_error": "Hey, there is an error now",
"focused_element_bid": "32",
},
]
ACTIONS = ["click('41')", "click('42')"]
MEMORIES = ["memory A", "memory B"]
THOUGHTS = ["thought A", "thought B"]

flags = dp.ObsFlags(
use_html=True,
use_ax_tree=True,
use_plan=True,
use_criticise=True,
use_thinking=True,
use_error_logs=True,
use_past_error_logs=True,
use_history=True,
use_action_history=True,
use_memory=True,
use_diff=True,
html_type="pruned_html",
use_concrete_example=True,
use_abstract_example=True,
multi_actions=True,
use_screenshot=False,
)

print(
MainPrompt(
action_set=dp.make_action_set(
"bid", is_strict=False, multiaction=True, demo_mode="off"
),
obs_history=OBS_HISTORY,
actions=ACTIONS,
memories=MEMORIES,
thoughts=THOUGHTS,
previous_plan="No plan yet",
step=0,
flags=flags,
).prompt
)
Loading

0 comments on commit 98acd0c

Please sign in to comment.