From 851d88593cf2d6a55d657bc414824f3f8d4c55d8 Mon Sep 17 00:00:00 2001 From: mamoodi Date: Wed, 4 Dec 2024 10:08:22 -0500 Subject: [PATCH 1/3] Release 0.15.0 (#5402) --- Development.md | 2 +- README.md | 6 +++--- compose.yml | 2 +- containers/dev/compose.yml | 2 +- docs/modules/usage/how-to/cli-mode.md | 4 ++-- docs/modules/usage/how-to/headless-mode.md | 4 ++-- docs/modules/usage/installation.mdx | 6 +++--- docs/modules/usage/runtimes.md | 2 +- frontend/package-lock.json | 4 ++-- frontend/package.json | 2 +- pyproject.toml | 4 +--- 11 files changed, 18 insertions(+), 20 deletions(-) diff --git a/Development.md b/Development.md index afc0bcd1479d..77638ff08ccd 100644 --- a/Development.md +++ b/Development.md @@ -100,7 +100,7 @@ poetry run pytest ./tests/unit/test_*.py To reduce build time (e.g., if no changes were made to the client-runtime component), you can use an existing Docker container image by setting the SANDBOX_RUNTIME_CONTAINER_IMAGE environment variable to the desired Docker image. -Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.14-nikolaik` +Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.15-nikolaik` ## Develop inside Docker container diff --git a/README.md b/README.md index bf9d16157aed..7396099de859 100644 --- a/README.md +++ b/README.md @@ -38,16 +38,16 @@ See the [Installation](https://docs.all-hands.dev/modules/usage/installation) gu system requirements and more information. ```bash -docker pull docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik +docker pull docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik docker run -it --pull=always \ - -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik \ + -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \ -e LOG_ALL_EVENTS=true \ -v /var/run/docker.sock:/var/run/docker.sock \ -p 3000:3000 \ --add-host host.docker.internal:host-gateway \ --name openhands-app \ - docker.all-hands.dev/all-hands-ai/openhands:0.14 + docker.all-hands.dev/all-hands-ai/openhands:0.15 ``` You'll find OpenHands running at [http://localhost:3000](http://localhost:3000)! 
diff --git a/compose.yml b/compose.yml index b54e270ab28e..acda26e61368 100644 --- a/compose.yml +++ b/compose.yml @@ -7,7 +7,7 @@ services: image: openhands:latest container_name: openhands-app-${DATE:-} environment: - - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.14-nikolaik} + - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.15-nikolaik} - SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234} - WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace} ports: diff --git a/containers/dev/compose.yml b/containers/dev/compose.yml index b6a9b37a064f..dc7f7af5dc0e 100644 --- a/containers/dev/compose.yml +++ b/containers/dev/compose.yml @@ -11,7 +11,7 @@ services: - BACKEND_HOST=${BACKEND_HOST:-"0.0.0.0"} - SANDBOX_API_HOSTNAME=host.docker.internal # - - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.14-nikolaik} + - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.15-nikolaik} - SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234} - WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace} ports: diff --git a/docs/modules/usage/how-to/cli-mode.md b/docs/modules/usage/how-to/cli-mode.md index 852b8f7010ef..d5426c0f3d37 100644 --- a/docs/modules/usage/how-to/cli-mode.md +++ b/docs/modules/usage/how-to/cli-mode.md @@ -50,7 +50,7 @@ LLM_API_KEY="sk_test_12345" ```bash docker run -it \ --pull=always \ - -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik \ + -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \ -e SANDBOX_USER_ID=$(id -u) \ -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \ -e LLM_API_KEY=$LLM_API_KEY \ @@ -59,7 +59,7 @@ docker run -it \ -v /var/run/docker.sock:/var/run/docker.sock \ --add-host host.docker.internal:host-gateway \ --name openhands-app-$(date +%Y%m%d%H%M%S) \ - docker.all-hands.dev/all-hands-ai/openhands:0.14 \ + docker.all-hands.dev/all-hands-ai/openhands:0.15 \ python -m openhands.core.cli ``` diff --git a/docs/modules/usage/how-to/headless-mode.md b/docs/modules/usage/how-to/headless-mode.md index 89ba9a5794b7..dfd4dd5e3e14 100644 --- a/docs/modules/usage/how-to/headless-mode.md +++ b/docs/modules/usage/how-to/headless-mode.md @@ -44,7 +44,7 @@ LLM_API_KEY="sk_test_12345" ```bash docker run -it \ --pull=always \ - -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik \ + -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \ -e SANDBOX_USER_ID=$(id -u) \ -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \ -e LLM_API_KEY=$LLM_API_KEY \ @@ -54,6 +54,6 @@ docker run -it \ -v /var/run/docker.sock:/var/run/docker.sock \ --add-host host.docker.internal:host-gateway \ --name openhands-app-$(date +%Y%m%d%H%M%S) \ - docker.all-hands.dev/all-hands-ai/openhands:0.14 \ + docker.all-hands.dev/all-hands-ai/openhands:0.15 \ python -m openhands.core.main -t "write a bash script that prints hi" ``` diff --git a/docs/modules/usage/installation.mdx b/docs/modules/usage/installation.mdx index e33e4bcef907..a60963167786 100644 --- a/docs/modules/usage/installation.mdx +++ b/docs/modules/usage/installation.mdx @@ -11,16 +11,16 @@ The easiest way to run OpenHands is in Docker. 
```bash -docker pull docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik +docker pull docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik docker run -it --rm --pull=always \ - -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik \ + -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \ -e LOG_ALL_EVENTS=true \ -v /var/run/docker.sock:/var/run/docker.sock \ -p 3000:3000 \ --add-host host.docker.internal:host-gateway \ --name openhands-app \ - docker.all-hands.dev/all-hands-ai/openhands:0.14 + docker.all-hands.dev/all-hands-ai/openhands:0.15 ``` You can also run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode), as an [interactive CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode), or using the [OpenHands GitHub Action](https://docs.all-hands.dev/modules/usage/how-to/github-action). diff --git a/docs/modules/usage/runtimes.md b/docs/modules/usage/runtimes.md index e6212d43055a..be165131c516 100644 --- a/docs/modules/usage/runtimes.md +++ b/docs/modules/usage/runtimes.md @@ -16,7 +16,7 @@ some flags being passed to `docker run` that make this possible: ``` docker run # ... - -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.11-nikolaik \ + -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \ -v /var/run/docker.sock:/var/run/docker.sock \ # ... ``` diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 277961f9b45c..ea59e9e41a73 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -1,12 +1,12 @@ { "name": "openhands-frontend", - "version": "0.14.3", + "version": "0.15.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "openhands-frontend", - "version": "0.14.3", + "version": "0.15.0", "dependencies": { "@monaco-editor/react": "^4.6.0", "@nextui-org/react": "^2.4.8", diff --git a/frontend/package.json b/frontend/package.json index b1cdd1cfaf7c..7980387f71ba 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "openhands-frontend", - "version": "0.14.3", + "version": "0.15.0", "private": true, "type": "module", "engines": { diff --git a/pyproject.toml b/pyproject.toml index 6f15945e223d..a098aa81a648 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "openhands-ai" -version = "0.14.3" +version = "0.15.0" description = "OpenHands: Code Less, Make More" authors = ["OpenHands"] license = "MIT" @@ -98,7 +98,6 @@ reportlab = "*" [tool.coverage.run] concurrency = ["gevent"] - [tool.poetry.group.runtime.dependencies] jupyterlab = "*" notebook = "*" @@ -129,7 +128,6 @@ ignore = ["D1"] [tool.ruff.lint.pydocstyle] convention = "google" - [tool.poetry.group.evaluation.dependencies] streamlit = "*" whatthepatch = "*" From c5117bc48d8590967f804fae7b761186f8361e82 Mon Sep 17 00:00:00 2001 From: "Ryan H. 
Tran" Date: Thu, 5 Dec 2024 00:25:24 +0700 Subject: [PATCH 2/3] Upgrade `openhands-aci` to v0.1.2 (#5397) --- openhands/runtime/action_execution_server.py | 22 +++++ poetry.lock | 10 +-- pyproject.toml | 2 +- tests/unit/test_agent_skill.py | 84 -------------------- 4 files changed, 28 insertions(+), 90 deletions(-) diff --git a/openhands/runtime/action_execution_server.py b/openhands/runtime/action_execution_server.py index b48a9b55a693..26b728284e33 100644 --- a/openhands/runtime/action_execution_server.py +++ b/openhands/runtime/action_execution_server.py @@ -9,8 +9,10 @@ import asyncio import base64 import io +import json import mimetypes import os +import re import shutil import tempfile import time @@ -199,6 +201,26 @@ async def run_ipython(self, action: IPythonRunCellAction) -> Observation: obs: IPythonRunCellObservation = await _jupyter_plugin.run(action) obs.content = obs.content.rstrip() + matches = re.findall( + r'(.*?)', obs.content, re.DOTALL + ) + if matches: + results = [] + for match in matches: + try: + result_dict = json.loads(match) + results.append( + result_dict.get('formatted_output_and_error', '') + ) + except json.JSONDecodeError: + # Handle JSON decoding errors if necessary + results.append( + f"Invalid JSON in 'openhands-aci' output: {match}" + ) + + # Combine the results (e.g., join them) or handle them as required + obs.content = '\n'.join(results) + if action.include_extra: obs.content += ( f'\n[Jupyter current working directory: {self.bash_session.pwd}]' diff --git a/poetry.lock b/poetry.lock index 5eb49683b956..03b64cee12e1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. [[package]] name = "aenum" @@ -5483,13 +5483,13 @@ numpy = {version = ">=1.26.0", markers = "python_version >= \"3.12\""} [[package]] name = "openhands-aci" -version = "0.1.1" +version = "0.1.2" description = "An Agent-Computer Interface (ACI) designed for software development agents OpenHands." 
optional = false python-versions = "<4.0,>=3.12" files = [ - {file = "openhands_aci-0.1.1-py3-none-any.whl", hash = "sha256:8831f97b887571005dca0d70a9f6f0a4f9feb35d3d41f499e70d72b5fb68a599"}, - {file = "openhands_aci-0.1.1.tar.gz", hash = "sha256:705b74a12a8f428e64295b5de125f553500f62ef5ab3a5a6284d8fcf638025e6"}, + {file = "openhands_aci-0.1.2-py3-none-any.whl", hash = "sha256:a2fcae7a2f1047d516d6862742c7a2f8ea988c6a58295599bc305c99b8d53067"}, + {file = "openhands_aci-0.1.2.tar.gz", hash = "sha256:c3c91aa3f13554159168b44a7f86bf333da30067fa6370a46ed785bf4240631b"}, ] [package.dependencies] @@ -10087,4 +10087,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "56a80082afb76e518239060855598921d94a0373123b2d9222cf8c7b6238b7ad" +content-hash = "1b42dcc42b1dae014b1951246781a850c95ce2d6fdaab45f8b62f5a04ebd5e53" diff --git a/pyproject.toml b/pyproject.toml index a098aa81a648..576c27d6dcd3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,7 +64,7 @@ modal = "^0.66.26" runloop-api-client = "0.10.0" pygithub = "^2.5.0" joblib = "*" -openhands-aci = "^0.1.1" +openhands-aci = "^0.1.2" python-socketio = "^5.11.4" redis = "^5.2.0" diff --git a/tests/unit/test_agent_skill.py b/tests/unit/test_agent_skill.py index 6079eb659aea..63745f4dd2b4 100644 --- a/tests/unit/test_agent_skill.py +++ b/tests/unit/test_agent_skill.py @@ -715,87 +715,3 @@ def test_parse_pptx(tmp_path): 'Hello, this is the second test PPTX slide.\n\n' ) assert output == expected_output, f'Expected output does not match. Got: {output}' - - -# ============================================================================= - - -def test_file_editor_view(tmp_path): - # generate a random directory - random_dir = tmp_path / 'dir_1' - random_dir.mkdir() - # create a file in the directory - random_file = random_dir / 'a.txt' - random_file.write_text('Line 1\nLine 2\nLine 3\nLine 4\nLine 5') - random_dir_2 = tmp_path / 'dir_2' - random_dir_2.mkdir() - random_file_2 = random_dir_2 / 'b.txt' - random_file_2.write_text('Line 1\nLine 2\nLine 3\nLine 4\nLine 5') - - from openhands.runtime.plugins.agent_skills.agentskills import file_editor - - # view the file - result = file_editor(command='view', path=str(random_file)) - print('\n', result) - assert result is not None - assert ( - result.split('\n') - == f"""Here's the result of running `cat -n` on {random_file}: - 1\tLine 1 - 2\tLine 2 - 3\tLine 3 - 4\tLine 4 - 5\tLine 5 -""".split('\n') - ) - - # view the directory - result = file_editor(command='view', path=str(tmp_path)) - print('\n', result) - assert result is not None - assert ( - result.strip().split('\n') - == f"""Here's the files and directories up to 2 levels deep in {tmp_path}, excluding hidden items: -{tmp_path} -{tmp_path}/dir_2 -{tmp_path}/dir_2/b.txt -{tmp_path}/dir_1 -{tmp_path}/dir_1/a.txt -""".strip().split('\n') - ) - - -def test_file_editor_create(tmp_path): - # generate a random directory - random_dir = tmp_path / 'dir_1' - random_dir.mkdir() - # create a file in the directory - random_file = random_dir / 'a.txt' - - from openhands.runtime.plugins.agent_skills.agentskills import file_editor - - # view an unexist file - result = file_editor(command='view', path=str(random_file)) - print(result) - assert result is not None - assert ( - result - == f'ERROR:\nInvalid `path` parameter: {random_file}. The path {random_file} does not exist. Please provide a valid path.' 
- ) - - # create a file - result = file_editor(command='create', path=str(random_file), file_text='Line 6') - print(result) - assert result is not None - assert result == f'File created successfully at: {random_file}' - - # view again - result = file_editor(command='view', path=str(random_file)) - print(result) - assert result is not None - assert ( - result.strip().split('\n') - == f"""Here's the result of running `cat -n` on {random_file}: - 1\tLine 6 -""".strip().split('\n') - ) From 8f47547b08f24256077aca062e0dd5599740d082 Mon Sep 17 00:00:00 2001 From: Cheng Yang <93481273+young010101@users.noreply.github.com> Date: Thu, 5 Dec 2024 01:28:04 +0800 Subject: [PATCH 3/3] docs: fix markdown linting and broken links (#5401) --- evaluation/benchmarks/EDA/README.md | 7 +++---- evaluation/benchmarks/agent_bench/README.md | 2 +- evaluation/benchmarks/aider_bench/README.md | 2 +- evaluation/benchmarks/biocoder/README.md | 7 ++++--- evaluation/benchmarks/bird/README.md | 5 ++--- evaluation/benchmarks/browsing_delegation/README.md | 2 +- evaluation/benchmarks/commit0_bench/README.md | 5 ++--- evaluation/benchmarks/gaia/README.md | 4 +++- evaluation/benchmarks/gorilla/README.md | 2 +- evaluation/benchmarks/gpqa/README.md | 13 +++++++++---- evaluation/benchmarks/humanevalfix/README.md | 6 ++---- evaluation/benchmarks/logic_reasoning/README.md | 3 ++- evaluation/benchmarks/miniwob/README.md | 3 +-- evaluation/benchmarks/ml_bench/README.md | 2 +- evaluation/benchmarks/scienceagentbench/README.md | 5 +++-- evaluation/benchmarks/swe_bench/README.md | 12 +++++++----- evaluation/benchmarks/toolqa/README.md | 2 +- evaluation/benchmarks/webarena/README.md | 2 +- 18 files changed, 45 insertions(+), 39 deletions(-) diff --git a/evaluation/benchmarks/EDA/README.md b/evaluation/benchmarks/EDA/README.md index fee875c5dd51..11de7ca36e13 100644 --- a/evaluation/benchmarks/EDA/README.md +++ b/evaluation/benchmarks/EDA/README.md @@ -4,12 +4,10 @@ This folder contains evaluation harness for evaluating agents on the Entity-dedu ## Setup Environment and LLM Configuration -Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM. - +Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. ## Start the evaluation - ```bash export OPENAI_API_KEY="sk-XXX"; # This is required for evaluation (to simulate another party of conversation) ./evaluation/benchmarks/EDA/scripts/run_infer.sh [model_config] [git-version] [agent] [dataset] [eval_limit] @@ -37,7 +35,8 @@ For example, ``` ## Reference -``` + +```bibtex @inproceedings{zhang2023entity, title={Probing the Multi-turn Planning Capabilities of LLMs via 20 Question Games}, author={Zhang, Yizhe and Lu, Jiarui and Jaitly, Navdeep}, diff --git a/evaluation/benchmarks/agent_bench/README.md b/evaluation/benchmarks/agent_bench/README.md index ea7da04e9f29..9ee8482eb39b 100644 --- a/evaluation/benchmarks/agent_bench/README.md +++ b/evaluation/benchmarks/agent_bench/README.md @@ -4,7 +4,7 @@ This folder contains evaluation harness for evaluating agents on the [AgentBench ## Setup Environment and LLM Configuration -Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM. +Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. 
## Start the evaluation diff --git a/evaluation/benchmarks/aider_bench/README.md b/evaluation/benchmarks/aider_bench/README.md index 965fc06d7ecc..086cfe58160a 100644 --- a/evaluation/benchmarks/aider_bench/README.md +++ b/evaluation/benchmarks/aider_bench/README.md @@ -10,7 +10,7 @@ Hugging Face dataset based on the ## Setup Environment and LLM Configuration -Please follow instruction [here](../README.md#setup) to setup your local +Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. ## Start the evaluation diff --git a/evaluation/benchmarks/biocoder/README.md b/evaluation/benchmarks/biocoder/README.md index 035f2d20bf12..4cd1643fa98f 100644 --- a/evaluation/benchmarks/biocoder/README.md +++ b/evaluation/benchmarks/biocoder/README.md @@ -4,13 +4,14 @@ Implements evaluation of agents on BioCoder from the BioCoder benchmark introduc ## Setup Environment and LLM Configuration -Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM. +Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. ## BioCoder Docker Image In the openhands branch of the Biocoder repository, we have slightly modified our original Docker image to work with the OpenHands environment. In the Docker image are testing scripts (`/testing/start_test_openhands.py` and aux files in `/testing_files/`) to assist with evaluation. Additionally, we have installed all dependencies, including OpenJDK, mamba (with Python 3.6), and many system libraries. Notably, we have **not** packaged all repositories into the image, so they are downloaded at runtime. **Before first execution, pull our Docker image with the following command** + ```bash docker pull public.ecr.aws/i5g0m1f6/eval_biocoder:v1.0 ``` @@ -19,7 +20,6 @@ To reproduce this image, please see the Dockerfile_Openopenhands in the `biocode ## Start the evaluation - ```bash ./evaluation/benchmarks/biocoder/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] ``` @@ -47,7 +47,8 @@ with current OpenHands version, then your command would be: ``` ## Reference -``` + +```bibtex @misc{tang2024biocoder, title={BioCoder: A Benchmark for Bioinformatics Code Generation with Large Language Models}, author={Xiangru Tang and Bill Qian and Rick Gao and Jiakang Chen and Xinyun Chen and Mark Gerstein}, diff --git a/evaluation/benchmarks/bird/README.md b/evaluation/benchmarks/bird/README.md index 90e3fa300cbd..41874fe99f59 100644 --- a/evaluation/benchmarks/bird/README.md +++ b/evaluation/benchmarks/bird/README.md @@ -4,7 +4,7 @@ Implements evaluation of agents on BIRD introduced in [Can LLM Already Serve as ## Setup Environment and LLM Configuration -Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM. +Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. ## Run Inference on Bird @@ -22,8 +22,7 @@ like to evaluate. It could also be a release tag like `0.6.2`. For each problem, OpenHands is given a set number of iterations to fix the failing code. The history field shows each iteration's response to correct its code that fails any test case. 
- -``` +```json { "task_id": "0", "instruction": "You are a SQL expert and need to complete the following text-to-SQL tasks.\n\nCREATE TABLE frpm\n(\n CDSCode TEXT not null\n primary key,\n `Academic Year` TEXT null,\n `County Code` TEXT null,\n `District Code` INTEGER null,\n `School Code` TEXT null,\n `County Name` TEXT null,\n `District Name` TEXT null,\n `School Name` TEXT null,\n `District Type` TEXT null,\n `School Type` TEXT null,\n `Educational Option Type` TEXT null,\n `NSLP Provision Status` TEXT null,\n `Charter School (Y/N)` INTEGER null,\n `Charter School Number` TEXT null,\n `Charter Funding Type` TEXT null,\n IRC INTEGER null,\n `Low Grade` TEXT null,\n `High Grade` TEXT null,\n `Enrollment (K-12)` REAL null,\n `Free Meal Count (K-12)` REAL null,\n `Percent (%) Eligible Free (K-12)` REAL null,\n `FRPM Count (K-12)` REAL null,\n `Percent (%) Eligible FRPM (K-12)` REAL null,\n `Enrollment (Ages 5-17)` REAL null,\n `Free Meal Count (Ages 5-17)` REAL null,\n `Percent (%) Eligible Free (Ages 5-17)` REAL null,\n `FRPM Count (Ages 5-17)` REAL null,\n `Percent (%) Eligible FRPM (Ages 5-17)` REAL null,\n `2013-14 CALPADS Fall 1 Certification Status` INTEGER null,\n foreign key (CDSCode) references schools (CDSCode)\n);\n\nCREATE TABLE satscores\n(\n cds TEXT not null\n primary key,\n rtype TEXT not null,\n sname TEXT null,\n dname TEXT null,\n cname TEXT null,\n enroll12 INTEGER not null,\n NumTstTakr INTEGER not null,\n AvgScrRead INTEGER null,\n AvgScrMath INTEGER null,\n AvgScrWrite INTEGER null,\n NumGE1500 INTEGER null,\n-- PctGE1500 double null,\n foreign key (cds) references schools (CDSCode)\n);\n\nCREATE TABLE schools\n(\n CDSCode TEXT not null\n primary key,\n NCESDist TEXT null,\n NCESSchool TEXT null,\n StatusType TEXT not null,\n County TEXT not null,\n District TEXT not null,\n School TEXT null,\n Street TEXT null,\n StreetAbr TEXT null,\n City TEXT null,\n Zip TEXT null,\n State TEXT null,\n MailStreet TEXT null,\n MailStrAbr TEXT null,\n MailCity TEXT null,\n MailZip TEXT null,\n MailState TEXT null,\n Phone TEXT null,\n Ext TEXT null,\n Website TEXT null,\n OpenDate DATE null,\n ClosedDate DATE null,\n Charter INTEGER null,\n CharterNum TEXT null,\n FundingType TEXT null,\n DOC TEXT not null,\n DOCType TEXT not null,\n SOC TEXT null,\n SOCType TEXT null,\n EdOpsCode TEXT null,\n EdOpsName TEXT null,\n EILCode TEXT null,\n EILName TEXT null,\n GSoffered TEXT null,\n GSserved TEXT null,\n Virtual TEXT null,\n Magnet INTEGER null,\n Latitude REAL null,\n Longitude REAL null,\n AdmFName1 TEXT null,\n AdmLName1 TEXT null,\n AdmEmail1 TEXT null,\n AdmFName2 TEXT null,\n AdmLName2 TEXT null,\n AdmEmail2 TEXT null,\n AdmFName3 TEXT null,\n AdmLName3 TEXT null,\n AdmEmail3 TEXT null,\n LastUpdate DATE not null\n);\n\n-- External Knowledge: Eligible free rate for K-12 = `Free Meal Count (K-12)` / `Enrollment (K-12)`\n\n-- Using valid SQLite and understanding External Knowledge, answer the following questions for the tables provided above.\n\n-- Using valid SQLite, answer the following questions for the tables provided above.\nQuestion: What is the highest eligible free rate for K-12 students in the schools in Alameda County?\n\n\nPlease write the SQL in one line without line breaks.And write a new python file named 0.py to call the SQL you wrote.You need to follow the code template below:\n\n\n import sqlite3\n def execute_sql(db_path, sql):\n with sqlite3.connect(db_path) as conn:\n cursor = conn.cursor()\n cursor.execute(sql)\n result = cursor.fetchall()\n return result\n\n 
if __name__ == '__main__':\n sql = \"\" # filling your SQL here\n db_path = \"california_schools/california_schools.sqlite\"\n print(db_path)\n result = execute_sql(db_path, sql)\n print(result)\n \n\nEnvironment has been set up for you to start working.You may assume all necessary tools are installed.\n\nIMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\nYou SHOULD INCLUDE PROPER INDENTATION in your edit commands.\nWhen you think you have fixed the issue through code changes, please finish the interaction using the "finish" tool.\n", diff --git a/evaluation/benchmarks/browsing_delegation/README.md b/evaluation/benchmarks/browsing_delegation/README.md index a06170f8b9e0..9ae349b81900 100644 --- a/evaluation/benchmarks/browsing_delegation/README.md +++ b/evaluation/benchmarks/browsing_delegation/README.md @@ -7,7 +7,7 @@ If so, the browsing performance upper-bound of CodeActAgent will be the performa ## Setup Environment and LLM Configuration -Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM. +Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. ## Run Inference diff --git a/evaluation/benchmarks/commit0_bench/README.md b/evaluation/benchmarks/commit0_bench/README.md index 78b58b02137f..9ac3a0e05dd3 100644 --- a/evaluation/benchmarks/commit0_bench/README.md +++ b/evaluation/benchmarks/commit0_bench/README.md @@ -4,19 +4,18 @@ This folder contains the evaluation harness that we built on top of the original The evaluation consists of three steps: -1. Environment setup: [install python environment](../README.md#development-environment), [configure LLM config](../README.md#configure-openhands-and-your-llm). +1. Environment setup: [install python environment](../../README.md#development-environment), [configure LLM config](../../README.md#configure-openhands-and-your-llm). 2. [Run Evaluation](#run-inference-on-commit0-instances): Generate a edit patch for each Commit0 Repo, and get the evaluation results ## Setup Environment and LLM Configuration -Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM. +Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. ## OpenHands Commit0 Instance-level Docker Support OpenHands supports using the Commit0 Docker for **[inference](#run-inference-on-commit0-instances). This is now the default behavior. - ## Run Inference on Commit0 Instances Make sure your Docker daemon is running, and you have ample disk space (at least 200-500GB, depends on the Commit0 set you are running on) for the [instance-level docker image](#openhands-commit0-instance-level-docker-support). diff --git a/evaluation/benchmarks/gaia/README.md b/evaluation/benchmarks/gaia/README.md index f592e5f7118d..9a7bbd7fa346 100644 --- a/evaluation/benchmarks/gaia/README.md +++ b/evaluation/benchmarks/gaia/README.md @@ -4,9 +4,10 @@ This folder contains evaluation harness for evaluating agents on the [GAIA bench ## Setup Environment and LLM Configuration -Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM. +Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. ## Run the evaluation + We are using the GAIA dataset hosted on [Hugging Face](https://huggingface.co/datasets/gaia-benchmark/GAIA). 
Please accept the terms and make sure to have logged in on your computer by `huggingface-cli login` before running the evaluation. @@ -41,6 +42,7 @@ For example, ## Get score Then you can get stats by running the following command: + ```bash python ./evaluation/benchmarks/gaia/get_score.py \ --file diff --git a/evaluation/benchmarks/gorilla/README.md b/evaluation/benchmarks/gorilla/README.md index c6f1cde55b40..d5a076234a5a 100644 --- a/evaluation/benchmarks/gorilla/README.md +++ b/evaluation/benchmarks/gorilla/README.md @@ -4,7 +4,7 @@ This folder contains evaluation harness we built on top of the original [Gorilla ## Setup Environment and LLM Configuration -Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM. +Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. ## Run Inference on APIBench Instances diff --git a/evaluation/benchmarks/gpqa/README.md b/evaluation/benchmarks/gpqa/README.md index 235b9ab9b281..735584d4556e 100644 --- a/evaluation/benchmarks/gpqa/README.md +++ b/evaluation/benchmarks/gpqa/README.md @@ -3,6 +3,7 @@ Implements the evaluation of agents on the GPQA benchmark introduced in [GPQA: A Graduate-Level Google-Proof Q&A Benchmark](https://arxiv.org/abs/2308.07124). This code implements the evaluation of agents on the GPQA Benchmark with Open Book setting. + - The benchmark consists of 448 high-quality and extremely difficult multiple-choice questions in the domains of biology, physics, and chemistry. The questions are intentionally designed to be "Google-proof," meaning that even highly skilled non-expert validators achieve only 34% accuracy despite unrestricted access to the web. - Even experts in the corresponding domains achieve only 65% accuracy. - State-of-the-art AI systems achieve only 39% accuracy on this challenging dataset. @@ -11,20 +12,24 @@ This code implements the evaluation of agents on the GPQA Benchmark with Open Bo Accurate solving of above graduate level questions would require both tool use (e.g., python for calculations) and web-search for finding related facts as information required for the questions might not be part of the LLM knowledge / training data. Further references: -- https://arxiv.org/pdf/2311.12022 -- https://paperswithcode.com/dataset/gpqa -- https://github.com/idavidrein/gpqa + +- +- +- ## Setup Environment and LLM Configuration -Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM. +Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. ## Run Inference on GPQA Benchmark + 'gpqa_main', 'gqpa_diamond', 'gpqa_experts', 'gpqa_extended' -- data split options From the root of the OpenHands repo, run the following command: + ```bash ./evaluation/benchmarks/gpqa/scripts/run_infer.sh [model_config_name] [git-version] [num_samples_eval] [data_split] [AgentClass] ``` + You can replace `model_config_name` with any model you set up in `config.toml`. - `model_config_name`: The model configuration name from `config.toml` that you want to evaluate. 
diff --git a/evaluation/benchmarks/humanevalfix/README.md b/evaluation/benchmarks/humanevalfix/README.md index 5f3ae58ee29d..60dabef1f609 100644 --- a/evaluation/benchmarks/humanevalfix/README.md +++ b/evaluation/benchmarks/humanevalfix/README.md @@ -4,7 +4,7 @@ Implements evaluation of agents on HumanEvalFix from the HumanEvalPack benchmark ## Setup Environment and LLM Configuration -Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM. +Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. ## Run Inference on HumanEvalFix @@ -14,13 +14,11 @@ Please follow instruction [here](../README.md#setup) to setup your local develop You can replace `eval_gpt4_1106_preview` with any model you set up in `config.toml`. - ## Examples For each problem, OpenHands is given a set number of iterations to fix the failing code. The history field shows each iteration's response to correct its code that fails any test case. - -``` +```json { "task_id": "Python/2", "instruction": "Please fix the function in Python__2.py such that all test cases pass.\nEnvironment has been set up for you to start working. You may assume all necessary tools are installed.\n\n# Problem Statement\ndef truncate_number(number: float) -> float:\n return number % 1.0 + 1.0\n\n\n\n\n\n\ndef check(truncate_number):\n assert truncate_number(3.5) == 0.5\n assert abs(truncate_number(1.33) - 0.33) < 1e-6\n assert abs(truncate_number(123.456) - 0.456) < 1e-6\n\ncheck(truncate_number)\n\nIMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\nYou should NOT modify any existing test case files. If needed, you can add new test cases in a NEW file to reproduce the issue.\nYou SHOULD INCLUDE PROPER INDENTATION in your edit commands.\nWhen you think you have fixed the issue through code changes, please finish the interaction using the "finish" tool.\n", diff --git a/evaluation/benchmarks/logic_reasoning/README.md b/evaluation/benchmarks/logic_reasoning/README.md index d4e4d3e9a554..bba0076f25fa 100644 --- a/evaluation/benchmarks/logic_reasoning/README.md +++ b/evaluation/benchmarks/logic_reasoning/README.md @@ -4,9 +4,10 @@ This folder contains evaluation harness for evaluating agents on the logic reaso ## Setup Environment and LLM Configuration -Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM. +Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. ## Run Inference on logic_reasoning + The following code will run inference on the first example of the ProofWriter dataset, ```bash diff --git a/evaluation/benchmarks/miniwob/README.md b/evaluation/benchmarks/miniwob/README.md index 5535e45a7dc0..3809925b3fd6 100644 --- a/evaluation/benchmarks/miniwob/README.md +++ b/evaluation/benchmarks/miniwob/README.md @@ -4,7 +4,7 @@ This folder contains evaluation for [MiniWoB++](https://miniwob.farama.org/) ben ## Setup Environment and LLM Configuration -Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM. +Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. 
## Test if your environment works @@ -42,7 +42,6 @@ poetry run python evaluation/benchmarks/miniwob/get_success_rate.py evaluation/e You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions). - ## BrowsingAgent V1.0 result Tested on BrowsingAgent V1.0 diff --git a/evaluation/benchmarks/ml_bench/README.md b/evaluation/benchmarks/ml_bench/README.md index 528edddc148a..e8b386205230 100644 --- a/evaluation/benchmarks/ml_bench/README.md +++ b/evaluation/benchmarks/ml_bench/README.md @@ -12,7 +12,7 @@ For more details on the ML-Bench task and dataset, please refer to the paper: [M ## Setup Environment and LLM Configuration -Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM. +Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. ## Run Inference on ML-Bench diff --git a/evaluation/benchmarks/scienceagentbench/README.md b/evaluation/benchmarks/scienceagentbench/README.md index 4d979177215b..5cb39da591af 100644 --- a/evaluation/benchmarks/scienceagentbench/README.md +++ b/evaluation/benchmarks/scienceagentbench/README.md @@ -1,10 +1,10 @@ # ScienceAgentBench Evaluation with OpenHands -This folder contains the evaluation harness for [ScienceAgentBench](https://osu-nlp-group.github.io/ScienceAgentBench/) (paper: https://arxiv.org/abs/2410.05080). +This folder contains the evaluation harness for [ScienceAgentBench](https://osu-nlp-group.github.io/ScienceAgentBench/) (paper: ). ## Setup Environment and LLM Configuration -Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM. +Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. ## Setup ScienceAgentBench @@ -45,6 +45,7 @@ After the inference is completed, you may use the following command to extract n ```bash python post_proc.py [log_fname] ``` + - `log_fname`, e.g. `evaluation/.../output.jsonl`, is the automatically saved trajectory log of an OpenHands agent. Output will be write to e.g. `evaluation/.../output.converted.jsonl` diff --git a/evaluation/benchmarks/swe_bench/README.md b/evaluation/benchmarks/swe_bench/README.md index b69a7389555c..7ed1e2688198 100644 --- a/evaluation/benchmarks/swe_bench/README.md +++ b/evaluation/benchmarks/swe_bench/README.md @@ -6,20 +6,19 @@ This folder contains the evaluation harness that we built on top of the original The evaluation consists of three steps: -1. Environment setup: [install python environment](../README.md#development-environment), [configure LLM config](../README.md#configure-openhands-and-your-llm), and [pull docker](#openhands-swe-bench-instance-level-docker-support). +1. Environment setup: [install python environment](../../README.md#development-environment), [configure LLM config](../../README.md#configure-openhands-and-your-llm), and [pull docker](#openhands-swe-bench-instance-level-docker-support). 2. [Run inference](#run-inference-on-swe-bench-instances): Generate a edit patch for each Github issue 3. [Evaluate patches using SWE-Bench docker](#evaluate-generated-patches) ## Setup Environment and LLM Configuration -Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM. 
+Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. ## OpenHands SWE-Bench Instance-level Docker Support OpenHands now support using the [official evaluation docker](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md) for both **[inference](#run-inference-on-swe-bench-instances) and [evaluation](#evaluate-generated-patches)**. This is now the default behavior. - ## Run Inference on SWE-Bench Instances Make sure your Docker daemon is running, and you have ample disk space (at least 200-500GB, depends on the SWE-Bench set you are running on) for the [instance-level docker image](#openhands-swe-bench-instance-level-docker-support). @@ -52,7 +51,8 @@ default, it is set to 1. - `dataset_split`, split for the huggingface dataset. e.g., `test`, `dev`. Default to `test`. There are also two optional environment variables you can set. -``` + +```bash export USE_HINT_TEXT=true # if you want to use hint text in the evaluation. Default to false. Ignore this if you are not sure. export USE_INSTANCE_IMAGE=true # if you want to use instance-level docker images. Default to true ``` @@ -127,6 +127,7 @@ With `output.jsonl` file, you can run `eval_infer.sh` to evaluate generated patc **This evaluation is performed using the official dockerized evaluation announced [here](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md).** > If you want to evaluate existing results, you should first run this to clone existing outputs +> >```bash >git clone https://huggingface.co/spaces/OpenHands/evaluation evaluation/evaluation_outputs >``` @@ -143,6 +144,7 @@ Then you can run the following: ``` The script now accepts optional arguments: + - `instance_id`: Specify a single instance to evaluate (optional) - `dataset_name`: The name of the dataset to use (default: `"princeton-nlp/SWE-bench_Lite"`) - `split`: The split of the dataset to use (default: `"test"`) @@ -179,7 +181,6 @@ To clean-up all existing runtimes that you've already started, run: ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh ``` - ## Visualize Results First you need to clone `https://huggingface.co/spaces/OpenHands/evaluation` and add your own running results from openhands into the `outputs` of the cloned repo. @@ -189,6 +190,7 @@ git clone https://huggingface.co/spaces/OpenHands/evaluation ``` **(optional) setup streamlit environment with conda**: + ```bash cd evaluation conda create -n streamlit python=3.10 diff --git a/evaluation/benchmarks/toolqa/README.md b/evaluation/benchmarks/toolqa/README.md index eda478f4489f..b6b25da43b0e 100644 --- a/evaluation/benchmarks/toolqa/README.md +++ b/evaluation/benchmarks/toolqa/README.md @@ -4,7 +4,7 @@ This folder contains an evaluation harness we built on top of the original [Tool ## Setup Environment and LLM Configuration -Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM. +Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. 
## Run Inference on ToolQA Instances diff --git a/evaluation/benchmarks/webarena/README.md b/evaluation/benchmarks/webarena/README.md index 3e403d5a7f46..68f37c1a7b8f 100644 --- a/evaluation/benchmarks/webarena/README.md +++ b/evaluation/benchmarks/webarena/README.md @@ -4,7 +4,7 @@ This folder contains evaluation for [WebArena](https://github.com/web-arena-x/we ## Setup Environment and LLM Configuration -Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM. +Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. ## Setup WebArena Environment