Skip to content

Commit

Permalink
Merge branch 'main' into kevin
Browse files Browse the repository at this point in the history
  • Loading branch information
SmartManoj committed Mar 3, 2025
2 parents 73536bd + 395c1ea commit 890d73a
Show file tree
Hide file tree
Showing 30 changed files with 184 additions and 34 deletions.
13 changes: 10 additions & 3 deletions ISSUE_TRIAGE.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@
These are the procedures and guidelines on how issues are triaged in this repo by the maintainers.

## General
* Most issues must be tagged with **enhancement** or **bug**.
* Issues may be tagged with what it relates to (**backend**, **frontend**, **agent quality**, etc.).
* All issues must be tagged with **enhancement**, **bug** or **troubleshooting/help**.
* Issues may be tagged with what they relate to (**agent quality**, **frontend**, **resolver**, etc.).

## Severity
* **Low**: Minor issues or issues affecting a single user.
* **Medium**: Affecting multiple users.
* **High**: High-visibility issues or issues affecting many users.
* **Critical**: Affecting all users or potential security issues.

## Effort
Expand All @@ -18,8 +19,14 @@ These are the procedures and guidelines on how issues are triaged in this repo b

## Not Enough Information
* User is asked to provide more information (logs, how to reproduce, etc.) when the issue is not clear.
* If an issue is unclear and the author does not provide more information or respond to a request, the issue may be closed as **not planned** (Usually after a week).
* If an issue is unclear and the author does not provide more information or respond to a request,
the issue may be closed as **not planned** (usually after a week).

## Multiple Requests/Fixes in One Issue
* These issues will be narrowed down to one request/fix so the issue is more easily tracked and fixed.
* Issues may be broken down into multiple issues if required.

## Stale and Auto Closures
* In order to keep a maintainable backlog, issues that have no activity within 30 days are automatically marked as **Stale**.
* If issues marked as **Stale** continue to have no activity for 7 more days, they will automatically be closed as **not planned**.
* Issues may be reopened by maintainers if deemed important.
2 changes: 2 additions & 0 deletions evaluation/benchmarks/EDA/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.events.action import MessageAction
from openhands.utils.async_utils import call_async_from_sync

game = None

Expand Down Expand Up @@ -121,6 +122,7 @@ def process_instance(

# Here's how you can run the agent (similar to the `main` function) and get the final task state
runtime = create_runtime(config)
call_async_from_sync(runtime.connect)

state: State | None = asyncio.run(
run_controller(
Expand Down
2 changes: 2 additions & 0 deletions evaluation/benchmarks/agent_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync


def get_config(
Expand Down Expand Up @@ -210,6 +211,7 @@ def process_instance(
# =============================================

runtime: Runtime = create_runtime(config)
call_async_from_sync(runtime.connect)

initialize_runtime(runtime, instance=instance)

Expand Down
3 changes: 2 additions & 1 deletion evaluation/benchmarks/aider_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

# Configure visibility of unit tests to the Agent.
USE_UNIT_TESTS = os.environ.get('USE_UNIT_TESTS', 'false').lower() == 'true'
Expand Down Expand Up @@ -203,7 +204,7 @@ def process_instance(
# =============================================

runtime: Runtime = create_runtime(config)

call_async_from_sync(runtime.connect)
initialize_runtime(runtime, instance=instance)

# Here's how you can run the agent (similar to the `main` function) and get the final task state
Expand Down
2 changes: 2 additions & 0 deletions evaluation/benchmarks/biocoder/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': functools.partial(
Expand Down Expand Up @@ -274,6 +275,7 @@ def process_instance(
instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime, instance)

# Here's how you can run the agent (similar to the `main` function) and get the final task state
Expand Down
2 changes: 2 additions & 0 deletions evaluation/benchmarks/bird/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync


def codeact_user_response(state: State) -> str:
Expand Down Expand Up @@ -399,6 +400,7 @@ def execute_sql(db_path, sql):
instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime, instance)

# Here's how you can run the agent (similar to the `main` function) and get the final task state
Expand Down
2 changes: 2 additions & 0 deletions evaluation/benchmarks/browsing_delegation/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.events.action import MessageAction
from openhands.utils.async_utils import call_async_from_sync

# Only CodeActAgent can delegate to BrowsingAgent
SUPPORTED_AGENT_CLS = {'CodeActAgent'}
Expand Down Expand Up @@ -74,6 +75,7 @@ def process_instance(
)

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)

state: State | None = asyncio.run(
run_controller(
Expand Down
2 changes: 2 additions & 0 deletions evaluation/benchmarks/commit0_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
from openhands.events.observation import CmdOutputObservation, ErrorObservation
from openhands.events.serialization.event import event_to_dict
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync
from openhands.utils.shutdown_listener import sleep_if_should_continue

USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
Expand Down Expand Up @@ -394,6 +395,7 @@ def process_instance(
logger.info(f'Starting evaluation for instance {instance.instance_id}.')

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
try:
initialize_runtime(runtime, instance)

Expand Down
2 changes: 2 additions & 0 deletions evaluation/benchmarks/discoverybench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

EVALUATION_LLM = 'gpt-4-1106-preview'

Expand Down Expand Up @@ -281,6 +282,7 @@ def process_instance(

# Here's how you can run the agent (similar to the `main` function) and get the final task state
runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime, instance.data_files)

state: State | None = asyncio.run(
Expand Down
2 changes: 2 additions & 0 deletions evaluation/benchmarks/gaia/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

DATASET_CACHE_DIR = os.path.join(os.path.dirname(__file__), 'data')

Expand Down Expand Up @@ -148,6 +149,7 @@ def process_instance(
logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime, instance)

# Here's how you can run the agent (similar to the `main` function) and get the final task state
Expand Down
2 changes: 2 additions & 0 deletions evaluation/benchmarks/gorilla/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.events.action import MessageAction
from openhands.utils.async_utils import call_async_from_sync

AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response,
Expand Down Expand Up @@ -82,6 +83,7 @@ def process_instance(

# Here's how you can run the agent (similar to the `main` function) and get the final task state
runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
state: State | None = asyncio.run(
run_controller(
config=config,
Expand Down
2 changes: 2 additions & 0 deletions evaluation/benchmarks/gpqa/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
MessageAction,
)
from openhands.events.observation import Observation
from openhands.utils.async_utils import call_async_from_sync

ACTION_FORMAT = """
<<FINAL_ANSWER||
Expand Down Expand Up @@ -214,6 +215,7 @@ def process_instance(
"""

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
state: State | None = asyncio.run(
run_controller(
config=config,
Expand Down
2 changes: 2 additions & 0 deletions evaluation/benchmarks/humanevalfix/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

IMPORT_HELPER = {
'python': [
Expand Down Expand Up @@ -232,6 +233,7 @@ def process_instance(

# Here's how you can run the agent (similar to the `main` function) and get the final task state
runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime, instance)
state: State | None = asyncio.run(
run_controller(
Expand Down
2 changes: 2 additions & 0 deletions evaluation/benchmarks/logic_reasoning/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
)
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response,
Expand Down Expand Up @@ -206,6 +207,7 @@ def process_instance(
instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime, instance)

# Here's how you can run the agent (similar to the `main` function) and get the final task state
Expand Down
2 changes: 2 additions & 0 deletions evaluation/benchmarks/miniwob/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
BROWSER_EVAL_GET_GOAL_ACTION,
BROWSER_EVAL_GET_REWARDS_ACTION,
)
from openhands.utils.async_utils import call_async_from_sync

SUPPORTED_AGENT_CLS = {'BrowsingAgent', 'CodeActAgent'}

Expand Down Expand Up @@ -145,6 +146,7 @@ def process_instance(
logger.info(f'Starting evaluation for instance {env_id}.')

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
task_str, obs = initialize_runtime(runtime)

task_str += (
Expand Down
2 changes: 2 additions & 0 deletions evaluation/benchmarks/mint/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
)
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync


def codeact_user_response_mint(state: State, task: Task, task_config: dict[str, int]):
Expand Down Expand Up @@ -184,6 +185,7 @@ def process_instance(
)

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime)

state: State | None = asyncio.run(
Expand Down
2 changes: 2 additions & 0 deletions evaluation/benchmarks/ml_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

config = load_app_config()

Expand Down Expand Up @@ -234,6 +235,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime, instance)

# Run the agent
Expand Down
2 changes: 2 additions & 0 deletions evaluation/benchmarks/scienceagentbench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response,
Expand Down Expand Up @@ -195,6 +196,7 @@ def process_instance(
"""

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime, instance)

# Here's how you can run the agent (similar to the `main` function) and get the final task state
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/swe_bench/eval_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ def process_instance(
'model_patch': model_patch,
'instance_id': instance_id,
},
log_path=test_output_path,
test_log_path=test_output_path,
include_tests_status=True,
)
report = _report[instance_id]
Expand Down
2 changes: 2 additions & 0 deletions evaluation/benchmarks/swe_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
from openhands.events.observation import CmdOutputObservation, ErrorObservation
from openhands.events.serialization.event import event_to_dict
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync
from openhands.utils.shutdown_listener import sleep_if_should_continue

check_if_resolved()
Expand Down Expand Up @@ -630,6 +631,7 @@ def process_instance(
runtime = create_runtime(config, sid=instance.instance_id)
else:
runtime = create_runtime(config)
call_async_from_sync(runtime.connect)

try:
initialize_runtime(runtime, instance)
Expand Down
3 changes: 2 additions & 1 deletion evaluation/benchmarks/the_agent_company/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import BrowserOutputObservation, CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync


def get_config(
Expand Down Expand Up @@ -275,7 +276,7 @@ def run_evaluator(
args.task_image_name, task_short_name, temp_dir, agent_llm_config, agent_config
)
runtime: Runtime = create_runtime(config)

call_async_from_sync(runtime.connect)
init_task_env(runtime, args.server_hostname, env_llm_config)

dependencies = load_dependencies(runtime)
Expand Down
2 changes: 2 additions & 0 deletions evaluation/benchmarks/toolqa/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response,
Expand Down Expand Up @@ -104,6 +105,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime)

# Here's how you can run the agent (similar to the `main` function) and get the final task state
Expand Down
3 changes: 3 additions & 0 deletions evaluation/benchmarks/visualwebarena/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
BROWSER_EVAL_GET_GOAL_ACTION,
BROWSER_EVAL_GET_REWARDS_ACTION,
)
from openhands.utils.async_utils import call_async_from_sync

SUPPORTED_AGENT_CLS = {'VisualBrowsingAgent'}
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
Expand Down Expand Up @@ -159,6 +160,8 @@ def process_instance(
logger.info(f'Starting evaluation for instance {env_id}.')

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)

task_str, goal_image_urls = initialize_runtime(runtime)
initial_user_action = MessageAction(content=task_str, image_urls=goal_image_urls)
state: State | None = asyncio.run(
Expand Down
2 changes: 2 additions & 0 deletions evaluation/benchmarks/webarena/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
BROWSER_EVAL_GET_GOAL_ACTION,
BROWSER_EVAL_GET_REWARDS_ACTION,
)
from openhands.utils.async_utils import call_async_from_sync

SUPPORTED_AGENT_CLS = {'BrowsingAgent'}

Expand Down Expand Up @@ -144,6 +145,7 @@ def process_instance(
logger.info(f'Starting evaluation for instance {env_id}.')

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
task_str = initialize_runtime(runtime)

state: State | None = asyncio.run(
Expand Down
Loading

0 comments on commit 890d73a

Please sign in to comment.