Merge branch 'All-Hands-AI:main' into main

RainRat · Jan 16, 2025 · f8cc2e4
2 parents 90fbbcc + 0c961bf
commit f8cc2e4
Show file tree

Hide file tree

Showing 60 changed files with 677 additions and 533 deletions.
diff --git a/.github/workflows/openhands-resolver.yml b/.github/workflows/openhands-resolver.yml
@@ -184,6 +184,7 @@ jobs:
             });
 
       - name: Install OpenHands
+        id: install_openhands
         uses: actions/github-script@v7
         env:
           COMMENT_BODY: ${{ github.event.comment.body || '' }}
@@ -196,7 +197,6 @@ jobs:
             const reviewBody = process.env.REVIEW_BODY.trim();
             const labelName = process.env.LABEL_NAME.trim();
             const eventName = process.env.EVENT_NAME.trim();
-
             // Check conditions
             const isExperimentalLabel = labelName === "fix-me-experimental";
             const isIssueCommentExperimental =
@@ -205,6 +205,9 @@ jobs:
             const isReviewCommentExperimental =
               eventName === "pull_request_review" && reviewBody.includes("@openhands-agent-exp");
 
+            // Set output variable
+            core.setOutput('isExperimental', isExperimentalLabel || isIssueCommentExperimental || isReviewCommentExperimental);
+
             // Perform package installation
             if (isExperimentalLabel || isIssueCommentExperimental || isReviewCommentExperimental) {
               console.log("Installing experimental OpenHands...");
@@ -230,7 +233,8 @@ jobs:
             --issue-number ${{ env.ISSUE_NUMBER }} \
             --issue-type ${{ env.ISSUE_TYPE }} \
             --max-iterations ${{ env.MAX_ITERATIONS }} \
-            --comment-id ${{ env.COMMENT_ID }}
+            --comment-id ${{ env.COMMENT_ID }} \
+            --is-experimental ${{ steps.install_openhands.outputs.isExperimental }}
 
       - name: Check resolution result
         id: check_result

diff --git a/config.template.toml b/config.template.toml
@@ -23,6 +23,9 @@ workspace_base = "./workspace"
 # Cache directory path
 #cache_dir = "/tmp/cache"
 
+# Reasoning effort for o1 models (low, medium, high, or not set)
+#reasoning_effort = "medium"
+
 # Debugging enabled
 #debug = false
 
@@ -220,8 +223,8 @@ codeact_enable_jupyter = true
 # LLM config group to use
 #llm_config = 'your-llm-config-group'
 
-# Whether to use microagents at all
-#use_microagents = true
+# Whether to use prompt extensions (e.g., microagents, repo/runtime info) at all
+#enable_prompt_extensions = true
 
 # List of microagents to disable
 #disabled_microagents = []

diff --git a/...n/zh-Hans/docusaurus-plugin-content-docs/current/usage/configuration-options.md b/...n/zh-Hans/docusaurus-plugin-content-docs/current/usage/configuration-options.md
@@ -373,7 +373,7 @@ Agent 配置选项在 `config.toml` 文件的 `[agent]` 和 `[agent.<agent_name>
   - 描述: 是否在 action space 中启用 Jupyter
 
 **Microagent 使用**
-- `use_microagents`
+- `enable_prompt_extensions`
   - 类型: `bool`
   - 默认值: `true`
   - 描述: 是否启用提示词扩展（例如 microagents、仓库/运行时信息）

diff --git a/docs/modules/usage/configuration-options.md b/docs/modules/usage/configuration-options.md
@@ -336,7 +336,7 @@ The agent configuration options are defined in the `[agent]` and `[agent.<agent_
   - Description: Whether Jupyter is enabled in the action space
 
 ### Microagent Usage
-- `use_microagents`
+- `enable_prompt_extensions`
   - Type: `bool`
   - Default: `true`
   - Description: Whether to use prompt extensions (e.g., microagents, repo/runtime info) at all

diff --git a/evaluation/benchmarks/EDA/run_infer.py b/evaluation/benchmarks/EDA/run_infer.py
@@ -76,7 +76,7 @@ def get_config(
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)
-    agent_config.use_microagents = False
+    agent_config.enable_prompt_extensions = False
     return config
 
 

diff --git a/evaluation/benchmarks/agent_bench/run_infer.py b/evaluation/benchmarks/agent_bench/run_infer.py
@@ -60,7 +60,7 @@ def get_config(
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)
-    agent_config.use_microagents = False
+    agent_config.enable_prompt_extensions = False
     return config
 
 

diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py
@@ -68,7 +68,7 @@ def get_config(
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)
-    agent_config.use_microagents = False
+    agent_config.enable_prompt_extensions = False
 
     # copy 'draft_editor' config if exists
     config_copy = copy.deepcopy(config)

diff --git a/evaluation/benchmarks/biocoder/run_infer.py b/evaluation/benchmarks/biocoder/run_infer.py
@@ -74,7 +74,7 @@ def get_config(
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)
-    agent_config.use_microagents = False
+    agent_config.enable_prompt_extensions = False
     return config
 
 

diff --git a/evaluation/benchmarks/bird/run_infer.py b/evaluation/benchmarks/bird/run_infer.py
@@ -87,7 +87,7 @@ def get_config(
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)
-    agent_config.use_microagents = False
+    agent_config.enable_prompt_extensions = False
     return config
 
 

diff --git a/evaluation/benchmarks/browsing_delegation/run_infer.py b/evaluation/benchmarks/browsing_delegation/run_infer.py
@@ -51,7 +51,7 @@ def get_config(
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)
-    agent_config.use_microagents = False
+    agent_config.enable_prompt_extensions = False
     return config
 
 

diff --git a/evaluation/benchmarks/commit0_bench/run_infer.py b/evaluation/benchmarks/commit0_bench/run_infer.py
@@ -171,7 +171,7 @@ def initialize_runtime(
     action = CmdRunAction(
         command=f'git clone -b commit0_combined https://github.com/{instance["repo"]}.git'
     )
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -181,7 +181,7 @@ def initialize_runtime(
     )
 
     action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -191,7 +191,7 @@ def initialize_runtime(
     )
 
     action = CmdRunAction(command='git checkout -b openhands')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -201,7 +201,7 @@ def initialize_runtime(
 
     # Install commit0
     action = CmdRunAction(command='/root/.cargo/bin/uv pip install commit0')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -231,7 +231,7 @@ def complete_runtime(
     workspace_dir_name = _get_commit0_workspace_dir_name(instance)
 
     action = CmdRunAction(command='git add .')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -241,7 +241,7 @@ def complete_runtime(
     )
 
     action = CmdRunAction(command='git commit -m "openhands edits"')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -258,7 +258,7 @@ def complete_runtime(
         action = CmdRunAction(
             command=f"git diff {instance['base_commit']} HEAD -- . ':(exclude)spec.pdf.bz2'"
         )
-        action.timeout = 600 + 100 * n_retries
+        action.set_hard_timeout(600 + 100 * n_retries)
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
         # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -282,7 +282,7 @@ def complete_runtime(
     action = CmdRunAction(
         command=f"{instance['test']['test_cmd']} --json-report --json-report-file=report.json --continue-on-collection-errors {test_dir} > test_output.txt 2>&1"
     )
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -292,7 +292,7 @@ def complete_runtime(
     )
     # Read test output
     action = CmdRunAction(command='cat test_output.txt')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -305,7 +305,7 @@ def complete_runtime(
 
     # Save pytest exit code
     action = CmdRunAction(command='echo $?')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -318,7 +318,7 @@ def complete_runtime(
 
     # Read the test report
     action = CmdRunAction(command='cat report.json')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -330,7 +330,7 @@ def complete_runtime(
     repo_name = instance['repo'].split('/')[1]
     repo_name = repo_name.replace('.', '-')
     action = CmdRunAction(command=f'commit0 get-tests {repo_name}')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     # logger.info(obs, extra={'msg_type': 'OBSERVATION'})

diff --git a/evaluation/benchmarks/discoverybench/run_infer.py b/evaluation/benchmarks/discoverybench/run_infer.py
@@ -78,7 +78,7 @@ def get_config(
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)
-    agent_config.use_microagents = False
+    agent_config.enable_prompt_extensions = False
     agent_config = AgentConfig(
         function_calling=False,
         codeact_enable_jupyter=True,

diff --git a/evaluation/benchmarks/gaia/run_infer.py b/evaluation/benchmarks/gaia/run_infer.py
@@ -63,7 +63,7 @@ def get_config(
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)
-    agent_config.use_microagents = False
+    agent_config.enable_prompt_extensions = False
     return config
 
 

diff --git a/evaluation/benchmarks/gorilla/run_infer.py b/evaluation/benchmarks/gorilla/run_infer.py
@@ -56,7 +56,7 @@ def get_config(
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)
-    agent_config.use_microagents = False
+    agent_config.enable_prompt_extensions = False
     return config
 
 

diff --git a/evaluation/benchmarks/gpqa/run_infer.py b/evaluation/benchmarks/gpqa/run_infer.py
@@ -77,7 +77,7 @@ def get_config(
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)
-    agent_config.use_microagents = False
+    agent_config.enable_prompt_extensions = False
     return config
 
 

diff --git a/evaluation/benchmarks/humanevalfix/run_infer.py b/evaluation/benchmarks/humanevalfix/run_infer.py
@@ -98,7 +98,7 @@ def get_config(
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)
-    agent_config.use_microagents = False
+    agent_config.enable_prompt_extensions = False
     return config
 
 

diff --git a/evaluation/benchmarks/logic_reasoning/run_infer.py b/evaluation/benchmarks/logic_reasoning/run_infer.py
@@ -62,7 +62,7 @@ def get_config(
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)
-    agent_config.use_microagents = False
+    agent_config.enable_prompt_extensions = False
     return config
 
 

diff --git a/evaluation/benchmarks/mint/run_infer.py b/evaluation/benchmarks/mint/run_infer.py
@@ -120,7 +120,7 @@ def get_config(
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)
-    agent_config.use_microagents = False
+    agent_config.enable_prompt_extensions = False
     return config
 
 

diff --git a/evaluation/benchmarks/ml_bench/run_infer.py b/evaluation/benchmarks/ml_bench/run_infer.py
@@ -93,7 +93,7 @@ def get_config(
     )
     config.set_llm_config(metadata.llm_config)
     agent_config = config.get_agent_config(metadata.agent_class)
-    agent_config.use_microagents = False
+    agent_config.enable_prompt_extensions = False
     return config
 
 

diff --git a/evaluation/benchmarks/swe_bench/eval_infer.py b/evaluation/benchmarks/swe_bench/eval_infer.py
@@ -174,7 +174,7 @@ def process_instance(
 
     # Set +x
     action = CmdRunAction(command='chmod +x /tmp/eval.sh')
-    action.timeout = 600
+    action.set_hard_timeout(600)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -189,7 +189,7 @@ def process_instance(
         "echo 'APPLY_PATCH_FAIL')))"
     )
     action = CmdRunAction(command=exec_command)
-    action.timeout = 600
+    action.set_hard_timeout(600)
     obs = runtime.run_action(action)
     assert isinstance(obs, CmdOutputObservation)
     apply_patch_output = obs.content
@@ -212,7 +212,7 @@ def process_instance(
             # Run eval script in background and save output to log file
             log_file = '/tmp/eval_output.log'
             action = CmdRunAction(command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!')
-            action.timeout = 60  # Short timeout just to get the process ID
+            action.set_hard_timeout(60)  # Short timeout just to get the process ID
             obs = runtime.run_action(action)
 
             if isinstance(obs, CmdOutputObservation) and obs.exit_code == 0:
@@ -235,7 +235,7 @@ def process_instance(
                     check_action = CmdRunAction(
                         command=f'ps -p {pid} > /dev/null; echo $?'
                     )
-                    check_action.timeout = 60
+                    check_action.set_hard_timeout(60)
                     check_obs = runtime.run_action(check_action)
                     if (
                         isinstance(check_obs, CmdOutputObservation)
@@ -252,7 +252,7 @@ def process_instance(
 
                 # Read the log file
                 cat_action = CmdRunAction(command=f'cat {log_file}')
-                cat_action.timeout = 300
+                cat_action.set_hard_timeout(300)
                 cat_obs = runtime.run_action(cat_action)
 
                 # Grade answer