Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Es vector store basic implementation #21

Open
wants to merge 24 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
24a59bc
update required packages
noamoss Jan 31, 2025
f7c2433
add es vectorstore class and method
noamoss Jan 31, 2025
f0fd72e
register the new es vector store
noamoss Jan 31, 2025
40f86ab
update sync flow, suppoer es vector store
noamoss Jan 31, 2025
6ed0a48
add cli interface for es vetorstore sync
noamoss Jan 31, 2025
ae39290
update readme.md
noamoss Jan 31, 2025
a8d2b03
remove query method from es vectorstore
noamoss Jan 31, 2025
b6379d9
update .env.example defintions for es vector store
noamoss Jan 31, 2025
ca13b4f
add tests
noamoss Jan 31, 2025
2b7752e
update requirements.txt
noamoss Jan 31, 2025
838c9cb
fix tools initialize condition logic
noamoss Feb 9, 2025
725c228
replace hard-coded embedding with config variable
noamoss Feb 9, 2025
e19d797
refactor: simplify file upload batch processing
noamoss Feb 9, 2025
1604d25
include context name in the index name
noamoss Feb 9, 2025
8c41833
replace file_search with semantic search function tool and add tests
noamoss Feb 9, 2025
503e735
add embedding constants to the right config file...
noamoss Feb 9, 2025
cdf0a9f
remove OpenAI-specific vector store IDs from ES implementation
noamoss Feb 9, 2025
f5eee44
update tests with constant embedding model
noamoss Feb 9, 2025
f66d424
create a local cli tool for communication with the assistants
noamoss Feb 14, 2025
9d14e0d
ui improvements
noamoss Feb 14, 2025
fd84e36
add 'requires_actions' detilas to output
noamoss Feb 14, 2025
40cffc9
Revert "add 'requires_actions' detilas to output"
noamoss Feb 14, 2025
374b8d9
Revert "ui improvements"
noamoss Feb 14, 2025
8c7dfae
Revert "create a local cli tool for communication with the assistants"
noamoss Feb 14, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .env.sample
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
AIRTABLE_API_KEY=
OPENAI_API_KEY=
OPENAI_API_KEY=
ES_USERNAME=
ES_PASSWORD=
ES_HOST=
10 changes: 9 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@ $ pip install -U -e .
$ botnim --help
```

For development:
```bash
$ pip install -U -e .[dev]
```

## Directory Structure

- `.env.sample`: Sample environment file for the benchmarking scripts.
Expand All @@ -27,6 +32,9 @@ $ botnim --help
- `__init__.py`: Package initialization.
- `vector_store_base.py`: Abstract base class for vector store implementations.
- `vector_store_openai.py`: OpenAI Vector Store implementation.
- `vector_store_es.py`: Elasticsearch Vector Store implementation
- see the `backend/es` directory for examples
- run `pytest` to test the Elasticsearch Vector Store.
- `benchmark/`: Benchmarking scripts for the bots.
Copy this file to `.env` and fill in the necessary values.
- `run-benchmark.py`: Main benchmarking script.
Expand All @@ -52,7 +60,7 @@ $ botnim --help
- Configure the source URL in the bot's `config.yaml`
- The content will be automatically downloaded during sync
Either:
3. `botnim sync {staging/production} {budgetkey/takanon}` to sync the specifications with the OpenAI account.
3. `botnim sync {staging/production} {budgetkey/takanon} --backend {openai/es}` to sync the specifications with the OpenAI account.
- Use `--replace-context` flag to force a complete rebuild of the vector store (useful when context files have been modified)
Or
3. Commit the changes to the repository
Expand Down
18 changes: 11 additions & 7 deletions botnim/cli.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,41 @@
import click
from .sync import sync_agents
from .benchmark.runner import run_benchmarks
from .config import SPECS


@click.group()
def cli():
    """A simple CLI tool."""
    # Intentionally empty: the group only serves as the attachment point for
    # the subcommands registered with @cli.command(...). The docstring above
    # is the group's help text, so it is kept verbatim.

# Sync command, receives two arguments: production/staging and a list of bots to sync ('budgetkey'/'takanon' or 'all')
@cli.command()
@cli.command(name='sync')
@click.argument('environment', type=click.Choice(['production', 'staging']))
@click.argument('bots', type=click.Choice(['budgetkey', 'takanon', 'all']))
@click.option('--replace-context', is_flag=True, help='Replace existing context')
def sync(environment, bots, replace_context):
@click.option('--backend', type=click.Choice(['es', 'openai']), default='openai', help='Vector store backend')
def sync(environment, bots, replace_context, backend):
"""Sync bots to Airtable."""
click.echo(f"Syncing {bots} to {environment}")
sync_agents(environment, bots, replace_context=replace_context)
sync_agents(environment, bots, backend=backend,replace_context=replace_context)

# Run benchmarks command, receives two arguments: production/staging and the bots to run benchmarks on ('budgetkey'/'takanon' or 'all'); saving results locally is controlled by the --local flag
@cli.command()
@cli.command(name='benchmarks')
@click.argument('environment', type=click.Choice(['production', 'staging']))
@click.argument('bots', type=click.Choice(['budgetkey', 'takanon', 'all']))
@click.argument('local', type=click.BOOL)
@click.option('--reuse-answers', type=click.BOOL, default=False)
@click.option('--local', is_flag=True, default=False, help='Run benchmarks locally')
@click.option('--reuse-answers', is_flag=True, default=False)
@click.option('--select', type=click.STRING, default='failed', help='failed/all/AirTable record ID')
@click.option('--concurrency', type=click.INT, default=None)
def benchmarks(environment, bots, local, reuse_answers, select, concurrency):
"""Run benchmarks on bots."""
click.echo(f"Running benchmarks on {bots} in {environment} (save results locally: {local}, reuse answers: {reuse_answers}, select: {select})")
run_benchmarks(environment, bots, local, reuse_answers, select, concurrency)


def main():
    """Console entry point: hand control to the click command group."""
    cli()

if __name__ == '__main__':
main()
main()
9 changes: 9 additions & 0 deletions botnim/config.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,16 @@
from pathlib import Path
import dotenv
import logging

ROOT = Path(__file__).parent.parent
SPECS = ROOT / 'specs'

dotenv.load_dotenv(ROOT / '.env')

# Logging configuration
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def get_logger(name: str) -> logging.Logger:
    """Return the standard-library logger registered under *name*.

    Thin wrapper around ``logging.getLogger`` so callers can obtain loggers
    through this config module.
    """
    named_logger = logging.getLogger(name)
    return named_logger
31 changes: 21 additions & 10 deletions botnim/sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,17 @@
import json
import io
from pathlib import Path

import yaml

from openai import OpenAI

from .config import SPECS
from .vector_store import VectorStoreOpenAI
from .vector_store import VectorStoreOpenAI, VectorStoreES


api_key = os.environ['OPENAI_API_KEY']
# Elasticsearch connection settings, read from the environment.
# They are looked up with .get() so that a missing variable does not raise
# KeyError at import time: in the visible code these values are only consumed
# by the 'es' backend branch, and the default 'openai' backend must keep
# working without them. An unset variable resolves to None.
es_username = os.environ.get('ES_USERNAME')
es_password = os.environ.get('ES_PASSWORD')
es_host = os.environ.get('ES_HOST')

# Create openai client and get completion for prompt with the 'gpt4-o' model:
client = OpenAI(api_key=api_key)

Expand Down Expand Up @@ -52,25 +53,35 @@ def openapi_to_tools(openapi_spec):
ret.append(func)
return ret

def update_assistant(config, config_dir, production, replace_context=False):
def update_assistant(config, config_dir, production, backend, replace_context=False):
tool_resources = None
tools = None
print(f'Updating assistant: {config["name"]}')
# Load context, if necessary
if config.get('context'):
vs = VectorStoreOpenAI(config, config_dir, production, client)
if config.get('context') and replace_context: # Only runs if both conditions are true
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This logic is wrong - we need the tools and tools_resources even if we don't replace the context.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed.

## create vector store based on backend parameter
if backend == 'openai':
vs = VectorStoreOpenAI(config, config_dir, production, client)
## Elasticsearch
elif backend == 'es':
vs = VectorStoreES(config, config_dir, production, es_host, es_username, es_password)
# Update the vector store with the context
tools, tool_resources = vs.vector_store_update(config['context'], replace_context)

# List all the assistants in the organization:
assistants = client.beta.assistants.list()
assistant_id = None
assistant_name = config['name']
if not production:
assistant_name += ' - פיתוח'

print(f'Looking for assistant named: {assistant_name}')
for assistant in assistants:
print(f'Found assistant: {assistant.name} (ID: {assistant.id})')
if assistant.name == assistant_name:
assistant_id = assistant.id
break

print(f'Assistant ID: {assistant_id}')
asst_params = dict(
name=assistant_name,
Expand Down Expand Up @@ -109,7 +120,7 @@ def update_assistant(config, config_dir, production, replace_context=False):
# ...


def sync_agents(environment, bots, replace_context=False):
def sync_agents(environment, bots, backend='openai', replace_context=False):
production = environment == 'production'
for config_fn in SPECS.glob('*/config.yaml'):
config_dir = config_fn.parent
Expand All @@ -118,4 +129,4 @@ def sync_agents(environment, bots, replace_context=False):
with config_fn.open() as config_f:
config = yaml.safe_load(config_f)
config['instructions'] = (config_dir / config['instructions']).read_text()
update_assistant(config, config_dir, production, replace_context=replace_context)
update_assistant(config, config_dir, production, backend, replace_context=replace_context)
4 changes: 2 additions & 2 deletions botnim/vector_store/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .vector_store_openai import VectorStoreOpenAI
from .vector_store_es import VectorStoreES

__all__ = [VectorStoreOpenAI]

# __all__ must list the exported names as strings; putting the class objects
# themselves here makes `from botnim.vector_store import *` fail with TypeError.
__all__ = ["VectorStoreOpenAI", "VectorStoreES"]
159 changes: 159 additions & 0 deletions botnim/vector_store/test_es_vector_store.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
import pytest
import os
from pathlib import Path
from dotenv import load_dotenv
from io import BytesIO

from botnim.vector_store.vector_store_es import VectorStoreES
from botnim.config import get_logger

logger = get_logger(__name__)
load_dotenv()

@pytest.fixture
def es_client_config():
    """Provide the Elasticsearch connection settings shared by the tests.

    The host is a fixed local HTTPS endpoint; the credentials come from the
    ES_USERNAME / ES_PASSWORD environment variables (None when unset).
    """
    username = os.getenv('ES_USERNAME')
    password = os.getenv('ES_PASSWORD')
    return dict(
        es_host='https://localhost:9200',
        es_username=username,
        es_password=password,
        verify_certs=False,
    )

@pytest.fixture
def vector_store(es_client_config):
    """Build a non-production VectorStoreES named "test_assistant" for a test.

    Connection details are taken from the ``es_client_config`` fixture.
    """
    return VectorStoreES(
        config={"name": "test_assistant"},
        config_dir=Path("."),
        production=False,
        es_host=es_client_config['es_host'],
        es_username=es_client_config['es_username'],
        es_password=es_client_config['es_password'],
    )

@pytest.fixture(autouse=True)
def cleanup(vector_store):
    """Remove the test index once each test has finished.

    Runs automatically for every test. Cleanup is best-effort: any failure
    is logged as a warning and never fails the test itself.
    """
    yield  # the test body runs here; everything below is teardown
    try:
        # NOTE(review): presumably env_name() applies the environment-specific
        # naming used for the index; confirm against VectorStoreES.
        env_scoped_name = vector_store.env_name("test_assistant")
        test_index = env_scoped_name.lower().replace(' ', '_')
        indices = vector_store.es_client.indices
        if indices.exists(index=test_index):
            indices.delete(index=test_index)
            logger.info(f"Cleaned up test index: {test_index}")
    except Exception as e:
        logger.warning(f"Cleanup failed: {e}")

def test_initialization(es_client_config):
    """A freshly constructed VectorStoreES exposes both clients and is not initialised."""
    connection = {
        key: es_client_config[key]
        for key in ('es_host', 'es_username', 'es_password')
    }
    store = VectorStoreES(
        config={"name": "test_assistant"},
        config_dir=Path("."),
        production=False,
        **connection,
    )

    assert store.es_client is not None
    assert store.openai_client is not None
    assert store.init is False

def test_get_or_create_vector_store(vector_store):
    """Creating a store yields an existing index; asking again returns the same id."""
    context = {}

    # First call passes True as the third argument (creation path under test).
    created = vector_store.get_or_create_vector_store(context, "test_context", True)
    assert created is not None
    for expected_key in ('id', 'name'):
        assert expected_key in created
    assert vector_store.es_client.indices.exists(index=created['id'])

    # Second call passes False and must resolve to the store created above.
    fetched = vector_store.get_or_create_vector_store(context, "test_context", False)
    assert fetched['id'] == created['id']

def test_upload_files(vector_store):
    """Uploaded documents are stored under their filename with content and vector."""
    vs_info = vector_store.get_or_create_vector_store({}, "test_context", True)

    # filename -> text body; insertion order is the upload order
    expected_content = {
        "doc1.txt": "This is test document 1",
        "doc2.txt": "This is test document 2",
    }
    docs_to_upload = [
        (filename, BytesIO(text.encode('utf-8')), "text/plain")
        for filename, text in expected_content.items()
    ]

    vector_store.upload_files({}, "test_context", vs_info, docs_to_upload, None)

    # Refresh so the freshly indexed documents are visible to the reads below.
    index_name = vs_info['id']
    vector_store.es_client.indices.refresh(index=index_name)

    for filename, text in expected_content.items():
        source = vector_store.es_client.get(index=index_name, id=filename)['_source']
        assert source['content'] == text
        assert len(source['vector']) == 1536  # OpenAI embedding size

def test_delete_existing_files(vector_store):
    """Deleting one uploaded document reports a count of 1 and removes it from the index."""
    vs_info = vector_store.get_or_create_vector_store({}, "test_context", True)

    # Populate the store with two small text documents.
    uploads = []
    for filename, content, content_type in (
        ("doc1.txt", "Test document 1", "text/plain"),
        ("doc2.txt", "Test document 2", "text/plain"),
    ):
        uploads.append((filename, BytesIO(content.encode('utf-8')), content_type))

    vector_store.upload_files({}, "test_context", vs_info, uploads, None)
    vector_store.es_client.indices.refresh(index=vs_info['id'])

    # Remove only the first document.
    deleted_count = vector_store.delete_existing_files({}, vs_info, ["doc1.txt"])
    vector_store.es_client.indices.refresh(index=vs_info['id'])

    assert deleted_count == 1
    # Fetching the deleted document is expected to raise.
    with pytest.raises(Exception):
        vector_store.es_client.get(index=vs_info['id'], id="doc1.txt")

def test_update_tools(vector_store):
    """update_tools registers a single file_search tool honouring max_num_results."""
    context = {"max_num_results": 10}
    vs_info = vector_store.get_or_create_vector_store(context, "test_context", True)

    vector_store.update_tools(context, vs_info)

    registered = vector_store.tools
    assert len(registered) == 1
    only_tool = registered[0]
    assert only_tool['type'] == 'file_search'
    assert only_tool['file_search']['max_num_results'] == 10

def test_update_tool_resources(vector_store):
    """update_tool_resources records the store id under file_search.vector_store_ids."""
    vs_info = vector_store.get_or_create_vector_store({}, "test_context", True)

    vector_store.update_tool_resources({}, vs_info)

    resources = vector_store.tool_resources
    assert resources is not None
    assert 'file_search' in resources
    assert resources['file_search']['vector_store_ids'] == [vs_info['id']]

Loading