Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Study refactor #73

Merged
merged 8 commits into from
Oct 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,30 +6,30 @@
repository.
"""

import bgym
import logging

from agentlab.agents.generic_agent import (
RANDOM_SEARCH_AGENT,
AGENT_4o,
AGENT_4o_MINI,
AGENT_LLAMA3_70B,
AGENT_LLAMA31_70B,
)
from agentlab.analyze.inspect_results import get_most_recent_folder
from agentlab.experiments import study_generators
from agentlab.experiments.study import Study

logging.getLogger().setLevel(logging.INFO)

# choose your agent or provide a new agent
agent_args = [AGENT_4o_MINI]
# agent_args = [AGENT_4o]

## select the benchmark to run on

# ## select the benchmark to run on
benchmark = "miniwob_tiny_test"
# benchmark = "miniwob"
# benchmark = "workarena.l1"
# benchmark = "workarena.l2"
# benchmark = "workarena.l3"
# benchmark = "miniwob_all"
# benchmark = "workarena_l1"
# benchmark = "workarena_l2"
# benchmark = "workarena_l3"
# benchmark = "webarena"

# Set reproducibility_mode = True for reproducibility
Expand All @@ -53,11 +53,11 @@

if relaunch:
# relaunch an existing study
study_dir = get_most_recent_folder()
study = study_generators.make_relaunch_study(study_dir, relaunch_mode="incomplete_or_error")
study = Study.load_most_recent()
study.find_incomplete(relaunch_mode="incomplete_or_error")

else:
study = study_generators.run_agents_on_benchmark(agent_args, benchmark)
study = Study(agent_args, benchmark)

study.run(n_jobs=n_jobs, parallel_backend="joblib", strict_reproducibility=reproducibility_mode)

Expand Down
22 changes: 11 additions & 11 deletions reproducibility_journal.csv
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
git_user,agent_name,benchmark,benchmark_version,date,avg_reward,std_err,n_err,n_completed,comment,os,python_version,playwright_version,agentlab_version,agentlab_git_hash,agentlab__local_modifications,browsergym_version,browsergym_git_hash,browsergym__local_modifications
recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_tiny_test,0.6.3,2024-09-19_21-07-34,0.75,0.217,0,4/4,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,c99bdf74c98f323cc6a646467ba5f21154b6fd18,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36,
recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_tiny_test,0.6.3,2024-09-19_21-28-58,1.0,0.0,0,4/4,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,c99bdf74c98f323cc6a646467ba5f21154b6fd18," M: reproducibility_journal.csv
git_user,agent_name,benchmark,benchmark_version,date,study_id,avg_reward,std_err,n_err,n_completed,comment,os,python_version,playwright_version,agentlab_version,agentlab_git_hash,agentlab__local_modifications,browsergym_version,browsergym_git_hash,browsergym__local_modifications
recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_tiny_test,0.6.3,2024-09-19_21-07-34,,0.75,0.217,0,4/4,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,c99bdf74c98f323cc6a646467ba5f21154b6fd18,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36,
recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob_tiny_test,0.6.3,2024-09-19_21-28-58,,1.0,0.0,0,4/4,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,c99bdf74c98f323cc6a646467ba5f21154b6fd18," M: reproducibility_journal.csv
M: src/agentlab/experiments/task_collections.py",0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36,
recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-09-20_07-16-21,0.546,0.02,0,625/625,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,295f01005faf8f2c73a31be6a18cec19d563b54b,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36,
recursix,GenericAgent-gpt-4o-2024-05-13,miniwob,0.6.3,2024-09-20_22-09-43,0.656,0.019,0,625/625,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,f6216486d5faac2c8b3fb0a63e114e5a4bafde47,,0.6.4,8cef8fe34940ff490d0cc06b0c8f100180d09d43,
recursix,GenericAgent-gpt-4o-2024-05-13,miniwob,0.6.3,2024-09-21_12-04-39,0.656,0.019,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,fe561b93c5f053e9f9625358862f542523b5e14a,,0.7.0,ed6d6992ef64bfb91aca7002d33cb6ed5ec031ef,
recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-10-01_11-45-23,0.539,0.02,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,fe27819a99b163fd9240ba3e144e010413bff24d,,0.7.1,b0ad675572e01cac0d7255100112de0828877148,
recursix,GenericAgent-gpt-4o-mini-2024-07-18,workarena.l1,0.3.2,2024-10-05_13-21-27,0.23,0.023,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,aadf86b397cd36c581e1a61e491aec649ac5a140, M: main.py,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
recursix,GenericAgent-gpt-4o-2024-05-13,workarena.l1,0.3.2,2024-10-05_15-45-42,0.382,0.027,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,ab447e997af589bbd022de7a5189a7685ddfa6ef,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
recursix,GenericAgent-meta-llama_llama-3.1-70b-instruct,miniwob_tiny_test,0.7.0,2024-10-05_17-49-15,1.0,0.0,0,4/4,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,a98fa24426a6ddde8443e8be44ed94cd9522e5ca,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
recursix,GenericAgent-meta-llama_llama-3-70b-instruct,workarena.l1,0.3.2,2024-10-09_21-16-37,0.176,0.021,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,c847dbd334184271b32b252409a1b6c1042d7442,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-09-20_07-16-21,,0.546,0.02,0,625/625,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,295f01005faf8f2c73a31be6a18cec19d563b54b,,0.6.4,b73531271d2ce688c104eb4dfba2819583f1ba36,
recursix,GenericAgent-gpt-4o-2024-05-13,miniwob,0.6.3,2024-09-20_22-09-43,,0.656,0.019,0,625/625,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,f6216486d5faac2c8b3fb0a63e114e5a4bafde47,,0.6.4,8cef8fe34940ff490d0cc06b0c8f100180d09d43,
recursix,GenericAgent-gpt-4o-2024-05-13,miniwob,0.6.3,2024-09-21_12-04-39,,0.656,0.019,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,fe561b93c5f053e9f9625358862f542523b5e14a,,0.7.0,ed6d6992ef64bfb91aca7002d33cb6ed5ec031ef,
recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-10-01_11-45-23,,0.539,0.02,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,fe27819a99b163fd9240ba3e144e010413bff24d,,0.7.1,b0ad675572e01cac0d7255100112de0828877148,
recursix,GenericAgent-gpt-4o-mini-2024-07-18,workarena.l1,0.3.2,2024-10-05_13-21-27,,0.23,0.023,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,aadf86b397cd36c581e1a61e491aec649ac5a140," M: main.py",0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
recursix,GenericAgent-gpt-4o-2024-05-13,workarena.l1,0.3.2,2024-10-05_15-45-42,,0.382,0.027,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,ab447e997af589bbd022de7a5189a7685ddfa6ef,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
recursix,GenericAgent-meta-llama_llama-3-70b-instruct,workarena.l1,0.3.2,2024-10-09_21-16-37,,0.176,0.021,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,c847dbd334184271b32b252409a1b6c1042d7442,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
recursix,GenericAgent-meta-llama_llama-3.1-70b-instruct,miniwob_tiny_test,0.7.0,2024-10-05_17-49-15,,1.0,0.0,0,4/4,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,a98fa24426a6ddde8443e8be44ed94cd9522e5ca,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ contexttimer
ipython
pyyaml>=6
pandas
gradio==4.*
gradio<5
gitpython # for the reproducibility script
requests
matplotlib
3 changes: 2 additions & 1 deletion src/agentlab/agents/agent_args.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from bgym import AbstractAgentArgs
import bgym


class AgentArgs(AbstractAgentArgs):

def set_benchmark(self, benchmark: str, demo_mode: bool):
def set_benchmark(self, benchmark: bgym.Benchmark, demo_mode: bool):
"""Optional method to set benchmark specific flags.

This allows the agent to have minor adjustments based on the benchmark.
Expand Down
41 changes: 21 additions & 20 deletions src/agentlab/agents/dynamic_prompting.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import abc
import difflib
import logging
import platform
import time
Expand All @@ -9,6 +8,7 @@
from typing import Literal
from warnings import warn

import bgym
from browsergym.core.action.base import AbstractActionSet
from browsergym.core.action.highlevel import HighLevelActionSet
from browsergym.core.action.python import PythonActionSet
Expand Down Expand Up @@ -94,13 +94,14 @@ class ObsFlags(Flags):

@dataclass
class ActionFlags(Flags):
multi_actions: bool = False
action_set: str = "bid"
is_strict: bool = False
demo_mode: Literal["off", "default", "all_blue", "only_visible_elements"] = "off"
action_set: bgym.HighLevelActionSetArgs = None # should be set by the set_benchmark method
long_description: bool = True
individual_examples: bool = False

# for backward compatibility
multi_actions: bool = None
is_strict: bool = None


class PromptElement:
"""Base class for all prompt elements. Prompt elements can be hidden."""
Expand Down Expand Up @@ -592,24 +593,24 @@ def _parse_answer(self, text_answer):
return ans_dict


def make_action_set(action_flags: ActionFlags) -> AbstractActionSet:
# def make_action_set(action_flags: ActionFlags) -> AbstractActionSet:

if action_flags.action_set == "python":
action_set = PythonActionSet(strict=action_flags.is_strict)
if action_flags.demo_mode != "off":
warn(
f'Action_set "python" is incompatible with demo_mode={repr(action_flags.demo_mode)}.'
)
return action_set
# if action_flags.action_set == "python":
# action_set = PythonActionSet(strict=action_flags.is_strict)
# if action_flags.demo_mode != "off":
# warn(
# f'Action_set "python" is incompatible with demo_mode={repr(action_flags.demo_mode)}.'
# )
# return action_set

action_set = HighLevelActionSet(
subsets=list(set(["chat"] + ["infeas"] + action_flags.action_set.split("+"))),
multiaction=action_flags.multi_actions,
strict=action_flags.is_strict,
demo_mode=action_flags.demo_mode,
)
# action_set = HighLevelActionSet(
# subsets=list(set(["chat"] + ["infeas"] + action_flags.action_set.split("+"))),
# multiaction=action_flags.multi_actions,
# strict=action_flags.is_strict,
# demo_mode=action_flags.demo_mode,
# )

return action_set
# return action_set


class Think(PromptElement):
Expand Down
41 changes: 27 additions & 14 deletions src/agentlab/agents/generic_agent/agent_configs.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import bgym
from agentlab.agents import dynamic_prompting as dp
from agentlab.experiments import args
from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
Expand Down Expand Up @@ -25,8 +26,10 @@
filter_visible_elements_only=False,
),
action=dp.ActionFlags(
multi_actions=False,
action_set="bid",
action_set=bgym.HighLevelActionSetArgs(
subsets=["bid"],
Copy link
Collaborator

@ThibaultLSDC ThibaultLSDC Oct 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC, that's a default subset that would get overwritten by the set_benchmark method?

multiaction=False,
),
long_description=False,
individual_examples=True,
),
Expand Down Expand Up @@ -71,8 +74,10 @@
filter_visible_elements_only=False,
),
action=dp.ActionFlags(
multi_actions=False, # often detrimental
action_set="bid",
action_set=bgym.HighLevelActionSetArgs(
subsets=["bid"],
multiaction=False,
),
long_description=False,
individual_examples=True,
),
Expand Down Expand Up @@ -116,8 +121,10 @@
filter_visible_elements_only=False,
),
action=dp.ActionFlags(
multi_actions=False,
action_set="bid",
action_set=bgym.HighLevelActionSetArgs(
subsets=["bid"],
multiaction=False,
),
long_description=False,
individual_examples=True,
),
Expand Down Expand Up @@ -164,8 +171,10 @@
filter_visible_elements_only=False,
),
action=dp.ActionFlags(
multi_actions=True,
action_set="bid",
action_set=bgym.HighLevelActionSetArgs(
subsets=["bid"],
multiaction=True,
),
long_description=False,
individual_examples=True,
),
Expand Down Expand Up @@ -210,8 +219,10 @@
filter_visible_elements_only=False,
),
action=dp.ActionFlags(
multi_actions=False,
action_set="bid",
action_set=bgym.HighLevelActionSetArgs(
subsets=["bid"],
multiaction=False,
),
long_description=False,
individual_examples=False,
),
Expand Down Expand Up @@ -270,10 +281,12 @@
filter_visible_elements_only=args.Choice([True, False], p=[0.3, 0.7]),
),
action=dp.ActionFlags(
multi_actions=args.Choice([True, False], p=[0.7, 0.3]),
action_set=args.Choice(["bid", "bid+coord"]),
# action_set=args.Choice(["python", "bid", "coord",
# "bid+coord"]),
action_set=bgym.HighLevelActionSetArgs(
subsets=args.Choice([["bid"], ["bid", "coord"]]),
multiaction=args.Choice([True, False], p=[0.7, 0.3]),
),
long_description=False,
individual_examples=False,
),
# drop_ax_tree_first=True, # this flag is no longer active, according to browsergym doc
use_plan=args.Choice([True, False]),
Expand Down
19 changes: 15 additions & 4 deletions src/agentlab/agents/generic_agent/generic_agent.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from copy import deepcopy
from dataclasses import asdict, dataclass
from functools import partial
from warnings import warn

import bgym
from browsergym.experiments.agent import Agent, AgentInfo

from agentlab.agents import dynamic_prompting as dp
Expand All @@ -25,13 +27,22 @@ def __post_init__(self):
except AttributeError:
pass

def set_benchmark(self, benchmark: "bgym.Benchmark", demo_mode):
    """Override some flags based on the benchmark.

    Args:
        benchmark: Benchmark the agent is about to run on. Its ``name`` and
            ``high_level_action_set_args`` attributes are read.
        demo_mode: When truthy, the action set is switched to the
            ``"all_blue"`` demo mode.
    """
    if benchmark.name.startswith("miniwob"):
        self.flags.obs.use_html = True

    # Work on a private copy so the overrides below never mutate the
    # action-set args owned by the benchmark object.
    action_set_args = deepcopy(benchmark.high_level_action_set_args)
    self.flags.action.action_set = action_set_args

    # for backward compatibility with old traces
    if self.flags.action.multi_actions is not None:
        action_set_args.multiaction = self.flags.action.multi_actions
    if self.flags.action.is_strict is not None:
        action_set_args.strict = self.flags.action.is_strict

    # verify if we can remove this
    if demo_mode:
        # The demo mode belongs to the action-set args installed above; this
        # args object exposes them through `self.flags.action.action_set`.
        action_set_args.demo_mode = "all_blue"

def set_reproducibility_mode(self):
self.chat_model_args.temperature = 0
Expand Down Expand Up @@ -62,7 +73,7 @@ def __init__(
self.max_retry = max_retry

self.flags = flags
self.action_set = dp.make_action_set(self.flags.action)
self.action_set = self.flags.action.action_set.make_action_set()
self._obs_preprocessor = dp.make_obs_preprocessor(flags.obs)

self._check_flag_constancy()
Expand Down
76 changes: 1 addition & 75 deletions src/agentlab/agents/generic_agent/generic_agent_prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def __init__(
def time_for_caution():
    """Tell whether the "be cautious" prompt element should be visible.

    Returns a truthy value only when ``flags.be_cautious`` is set and the
    action set either allows several actions per step or is the legacy
    ``"python"`` action set.
    """
    # no need for caution if we're in single action mode
    action_set = flags.action.action_set
    # `action_set` may still be None (its dataclass default) or a plain string
    # coming from an old configuration; neither has a `multiaction` attribute,
    # so fall back to False instead of raising AttributeError.
    multi_action = getattr(action_set, "multiaction", False)
    return flags.be_cautious and (multi_action or action_set == "python")

self.be_cautious = dp.BeCautious(visible=time_for_caution)
Expand Down Expand Up @@ -242,77 +242,3 @@ class Criticise(dp.PromptElement):

def _parse_answer(self, text_answer):
    """Parse the raw text answer with ``parse_html_tags_raise``.

    ``action_draft`` and ``criticise`` are passed as the optional keys; the
    helper's result is returned unchanged.
    """
    optional = ["action_draft", "criticise"]
    parsed = parse_html_tags_raise(text_answer, optional_keys=optional)
    return parsed


# Manual smoke test: builds a MainPrompt from hand-written observations,
# actions, memories and thoughts, then prints the resulting prompt text.
if __name__ == "__main__":
# Minimal HTML page; the `{}` placeholder receives the step number.
html_template = """
<html>
<body>
<div>
Hello World.
Step {}.
</div>
</body>
</html>
"""

# Three fake observations (steps 1-3); only the last one carries an error.
OBS_HISTORY = [
{
"goal": "do this and that",
"pruned_html": html_template.format(1),
"axtree_txt": "[1] Click me",
"last_action_error": "",
"focused_element_bid": "32",
},
{
"goal": "do this and that",
"pruned_html": html_template.format(2),
"axtree_txt": "[1] Click me",
"last_action_error": "",
"focused_element_bid": "32",
},
{
"goal": "do this and that",
"pruned_html": html_template.format(3),
"axtree_txt": "[1] Click me",
"last_action_error": "Hey, there is an error now",
"focused_element_bid": "32",
},
]
# Two entries each, i.e. one fewer than the number of observations.
ACTIONS = ["click('41')", "click('42')"]
MEMORIES = ["memory A", "memory B"]
THOUGHTS = ["thought A", "thought B"]

# NOTE(review): `multi_actions` is declared on `ActionFlags` in
# dynamic_prompting; whether `ObsFlags` accepts it (and the other keywords
# below) cannot be confirmed from here — verify before running.
flags = dp.ObsFlags(
use_html=True,
use_ax_tree=True,
use_plan=True,
use_criticise=True,
use_thinking=True,
use_error_logs=True,
use_past_error_logs=True,
use_history=True,
use_action_history=True,
use_memory=True,
use_diff=True,
html_type="pruned_html",
use_concrete_example=True,
use_abstract_example=True,
multi_actions=True,
use_screenshot=False,
)

# NOTE(review): `dp.make_action_set` is defined with a single `action_flags`
# parameter, so this positional-plus-keyword call looks stale — confirm.
print(
MainPrompt(
action_set=dp.make_action_set(
"bid", is_strict=False, multiaction=True, demo_mode="off"
),
obs_history=OBS_HISTORY,
actions=ACTIONS,
memories=MEMORIES,
thoughts=THOUGHTS,
previous_plan="No plan yet",
step=0,
flags=flags,
).prompt
)
Loading