whiletrue-industries · noamoss · Jan 31, 2025 · Jan 31, 2025 · Jan 31, 2025 · Jan 31, 2025
diff --git a/.env.sample b/.env.sample
@@ -1,2 +1,5 @@
 AIRTABLE_API_KEY=
-OPENAI_API_KEY=
+OPENAI_API_KEY=
+ES_USERNAME=
+ES_PASSWORD=
+ES_HOST=
diff --git a/README.md b/README.md
@@ -15,6 +15,11 @@ $ pip install -U -e .
 $ botnim --help
 ```
 
+For development (also installs the `dev` extras):
+```bash
+$ pip install -U -e .[dev]
+```
+
 ## Directory Structure
 
 - `.env.sample`: Sample environment file for the benchmarking scripts.
@@ -27,6 +32,9 @@ $ botnim --help
     - `__init__.py`: Package initialization.
     - `vector_store_base.py`: Abstract base class for vector store implementations.
     - `vector_store_openai.py`: OpenAI Vector Store implementation.
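+    - `vector_store_es.py`: Elasticsearch Vector Store implementation.
+        - See the `backend/es` directory for examples.
+        - Run `pytest` to test the Elasticsearch Vector Store (the tests connect to `https://localhost:9200` and read `ES_USERNAME`/`ES_PASSWORD` from the environment).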
   - `benchmark/`: Benchmarking scripts for the bots.
       Copy this file to `.env` and fill in the necessary values.
     - `run-benchmark.py`: Main benchmarking script.
@@ -52,7 +60,7 @@ $ botnim --help
    - Configure the source URL in the bot's `config.yaml`
    - The content will be automatically downloaded during sync
 Either:
-3. `botnim sync {staging/production} {budgetkey/takanon}` to sync the specifications with the OpenAI account.
+3. `botnim sync {staging/production} {budgetkey/takanon} --backend {openai/es}` to sync the specifications with the OpenAI account, using the selected vector store backend for the bot's context (`--backend` defaults to `openai`).
    - Use `--replace-context` flag to force a complete rebuild of the vector store (useful when context files have been modified)
 Or
 3. Commit the changes to the repository

diff --git a/botnim/cli.py b/botnim/cli.py
@@ -1,37 +1,41 @@
 import click
 from .sync import sync_agents
 from .benchmark.runner import run_benchmarks
+from .config import SPECS
+
 
 @click.group()
 def cli():
     """A simple CLI tool."""
     pass
 
 # Sync command, receives two arguments: production/staging and a list of bots to sync ('budgetkey'/'takanon' or 'all')
-@cli.command()
+@cli.command(name='sync')
 @click.argument('environment', type=click.Choice(['production', 'staging']))
 @click.argument('bots', type=click.Choice(['budgetkey', 'takanon', 'all']))
 @click.option('--replace-context', is_flag=True, help='Replace existing context')
-def sync(environment, bots, replace_context):
+@click.option('--backend', type=click.Choice(['es', 'openai']), default='openai', help='Vector store backend')
+def sync(environment, bots, replace_context, backend):
     """Sync bots to Airtable."""
     click.echo(f"Syncing {bots} to {environment}")
-    sync_agents(environment, bots, replace_context=replace_context)
+    sync_agents(environment, bots, backend=backend,replace_context=replace_context)
 
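-# Run benchmarks command, receives three arguments: production/staging, a list of bots to run benchmarks on ('budgetkey'/'takanon' or 'all') and whether to run benchmarks on the production environment to work locally (true/false)
+# Run benchmarks command, receives two arguments: production/staging and a list of bots to run benchmarks on ('budgetkey'/'takanon' or 'all'); the former third argument is now the optional --local flag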
-@cli.command()
+@cli.command(name='benchmarks')
 @click.argument('environment', type=click.Choice(['production', 'staging']))
 @click.argument('bots', type=click.Choice(['budgetkey', 'takanon', 'all']))
-@click.argument('local', type=click.BOOL)
-@click.option('--reuse-answers', type=click.BOOL, default=False)
+@click.option('--local', is_flag=True, default=False, help='Run benchmarks locally')
+@click.option('--reuse-answers', is_flag=True, default=False)
 @click.option('--select', type=click.STRING, default='failed', help='failed/all/AirTable record ID')
 @click.option('--concurrency', type=click.INT, default=None)
 def benchmarks(environment, bots, local, reuse_answers, select, concurrency):
     """Run benchmarks on bots."""
     click.echo(f"Running benchmarks on {bots} in {environment} (save results locally: {local}, reuse answers: {reuse_answers}, select: {select})")
     run_benchmarks(environment, bots, local, reuse_answers, select, concurrency)
 
+
 def main():
     cli()
 
 if __name__ == '__main__':
-    main()
+    main()
diff --git a/botnim/config.py b/botnim/config.py
@@ -1,7 +1,16 @@
 from pathlib import Path
 import dotenv
+import logging
 
 ROOT = Path(__file__).parent.parent
 SPECS = ROOT / 'specs'
 
 dotenv.load_dotenv(ROOT / '.env')
+
+# Logging configuration
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def get_logger(name: str) -> logging.Logger:
+    """Return the standard-library logger registered under ``name`` (thin wrapper around ``logging.getLogger``)."""
+    return logging.getLogger(name)
diff --git a/botnim/sync.py b/botnim/sync.py
@@ -2,16 +2,17 @@
 import json
 import io
 from pathlib import Path
-
 import yaml
-
 from openai import OpenAI
-
 from .config import SPECS
-from .vector_store import VectorStoreOpenAI
+from .vector_store import VectorStoreOpenAI, VectorStoreES
 
 
 api_key = os.environ['OPENAI_API_KEY']
+es_username = os.environ['ES_USERNAME']
+es_password = os.environ['ES_PASSWORD']
+es_host = os.environ['ES_HOST']
+
 # Create openai client and get completion for prompt with the 'gpt4-o' model:
 client = OpenAI(api_key=api_key)
 
@@ -52,25 +53,35 @@ def openapi_to_tools(openapi_spec):
             ret.append(func)
     return ret
 
-def update_assistant(config, config_dir, production, replace_context=False):
+def update_assistant(config, config_dir, production, backend, replace_context=False):
     tool_resources = None
     tools = None
     print(f'Updating assistant: {config["name"]}')
     # Load context, if necessary
-    if config.get('context'):
-        vs = VectorStoreOpenAI(config, config_dir, production, client)
+    if config.get('context') and replace_context:  # Only runs if both conditions are true
+        ## create vector store based on backend parameter
+        if backend == 'openai':
+            vs = VectorStoreOpenAI(config, config_dir, production, client)
+        ## Elasticsearch
+        elif backend == 'es':
+            vs = VectorStoreES(config, config_dir, production, es_host, es_username, es_password)
+        # Update the vector store with the context
         tools, tool_resources = vs.vector_store_update(config['context'], replace_context)
-
+    
     # List all the assistants in the organization:
     assistants = client.beta.assistants.list()
     assistant_id = None
     assistant_name = config['name']
     if not production:
         assistant_name += ' - פיתוח'
+
+    print(f'Looking for assistant named: {assistant_name}')
     for assistant in assistants:
+        print(f'Found assistant: {assistant.name} (ID: {assistant.id})')
         if assistant.name == assistant_name:
             assistant_id = assistant.id
             break
+
     print(f'Assistant ID: {assistant_id}')
     asst_params = dict(
         name=assistant_name,
@@ -109,7 +120,7 @@ def update_assistant(config, config_dir, production, replace_context=False):
         # ...
 
 
-def sync_agents(environment, bots, replace_context=False):
+def sync_agents(environment, bots, backend='openai', replace_context=False):
     production = environment == 'production'
     for config_fn in SPECS.glob('*/config.yaml'):
         config_dir = config_fn.parent
@@ -118,4 +129,4 @@ def sync_agents(environment, bots, replace_context=False):
             with config_fn.open() as config_f:
                 config = yaml.safe_load(config_f)
                 config['instructions'] = (config_dir / config['instructions']).read_text()
-                update_assistant(config, config_dir, production, replace_context=replace_context)
+                update_assistant(config, config_dir, production, backend, replace_context=replace_context)
diff --git a/botnim/vector_store/__init__.py b/botnim/vector_store/__init__.py
@@ -1,4 +1,4 @@
 from .vector_store_openai import VectorStoreOpenAI
+from .vector_store_es import VectorStoreES
 
-__all__ = [VectorStoreOpenAI]
-
+__all__ = [VectorStoreOpenAI, VectorStoreES]
diff --git a/botnim/vector_store/test_es_vector_store.py b/botnim/vector_store/test_es_vector_store.py
@@ -0,0 +1,159 @@
+import pytest
+import os
+from pathlib import Path
+from dotenv import load_dotenv
+from io import BytesIO
+
+from botnim.vector_store.vector_store_es import VectorStoreES
+from botnim.config import get_logger
+
+logger = get_logger(__name__)
+load_dotenv()
+
+@pytest.fixture
+def es_client_config():
+    """Connection settings shared by the tests: a fixed local HTTPS host plus credentials read from the environment."""
+    return {
+        'es_host': 'https://localhost:9200',
+        'es_username': os.getenv('ES_USERNAME'),
+        'es_password': os.getenv('ES_PASSWORD'),
+        'verify_certs': False  # NOTE(review): nothing in this file reads this key — confirm it is still needed
+    }
+
+@pytest.fixture
+def vector_store(es_client_config):
+    """Build a non-production VectorStoreES for the "test_assistant" config using the shared connection settings."""
+    config = {"name": "test_assistant"}
+    config_dir = Path(".")
+    production = False
+
+    vs = VectorStoreES(
+        config=config,
+        config_dir=config_dir,
+        production=production,
+        es_host=es_client_config['es_host'],
+        es_username=es_client_config['es_username'],
+        es_password=es_client_config['es_password']
+    )
+    return vs
+
+@pytest.fixture(autouse=True)
+def cleanup(vector_store):
+    """After every test, delete the test assistant's index if it exists; cleanup errors are logged, never raised."""
+    yield
+    try:
+        test_index = vector_store.env_name("test_assistant").lower().replace(' ', '_')  # NOTE(review): assumes this equals the index id the store creates — confirm
+        if vector_store.es_client.indices.exists(index=test_index):
+            vector_store.es_client.indices.delete(index=test_index)
+            logger.info(f"Cleaned up test index: {test_index}")
+    except Exception as e:
+        logger.warning(f"Cleanup failed: {e}")
+
+def test_initialization(es_client_config):
+    """A freshly constructed VectorStoreES exposes both clients and starts with ``init`` set to False."""
+    vs = VectorStoreES(
+        config={"name": "test_assistant"},
+        config_dir=Path("."),
+        production=False,
+        es_host=es_client_config['es_host'],
+        es_username=es_client_config['es_username'],
+        es_password=es_client_config['es_password']
+    )
+
+    assert vs.es_client is not None
+    assert vs.openai_client is not None
+    assert vs.init is False
+
+def test_get_or_create_vector_store(vector_store):
+    """Creating a store returns an id and name backed by an existing index; a second lookup returns the same id."""
+    # Test creation
+    context = {}
+    result = vector_store.get_or_create_vector_store(context, "test_context", True)
+
+    assert result is not None
+    assert 'id' in result
+    assert 'name' in result
+    assert vector_store.es_client.indices.exists(index=result['id'])
+
+    # Test getting existing
+    result2 = vector_store.get_or_create_vector_store(context, "test_context", False)
+    assert result2['id'] == result['id']
+
+def test_upload_files(vector_store):
+    """Uploaded files can be fetched by filename with their original content and a 1536-element vector."""
+    # Create vector store
+    vs_info = vector_store.get_or_create_vector_store({}, "test_context", True)
+
+    # Prepare test documents
+    test_docs = [
+        ("doc1.txt", "This is test document 1", "text/plain"),
+        ("doc2.txt", "This is test document 2", "text/plain")
+    ]
+
+    docs_to_upload = [
+        (filename, BytesIO(content.encode('utf-8')), content_type)
+        for filename, content, content_type in test_docs
+    ]
+
+    # Upload files
+    vector_store.upload_files({}, "test_context", vs_info, docs_to_upload, None)
+
+    # Force refresh index
+    vector_store.es_client.indices.refresh(index=vs_info['id'])
+
+    # Verify documents were uploaded
+    for filename, content, _ in test_docs:
+        doc = vector_store.es_client.get(index=vs_info['id'], id=filename)
+        assert doc['_source']['content'] == content
+        assert len(doc['_source']['vector']) == 1536  # OpenAI embedding size
+
+def test_delete_existing_files(vector_store):
+    """Test deleting files from vector store"""
+    # Create and populate vector store
+    vs_info = vector_store.get_or_create_vector_store({}, "test_context", True)
+
+    test_docs = [
+        ("doc1.txt", "Test document 1", "text/plain"),
+        ("doc2.txt", "Test document 2", "text/plain")
+    ]
+
+    docs_to_upload = [
+        (filename, BytesIO(content.encode('utf-8')), content_type)
+        for filename, content, content_type in test_docs
+    ]
+
+    vector_store.upload_files({}, "test_context", vs_info, docs_to_upload, None)
+    vector_store.es_client.indices.refresh(index=vs_info['id'])
+
+    # Delete one document
+    deleted_count = vector_store.delete_existing_files(
+        {}, vs_info, ["doc1.txt"]
+    )
+
+    vector_store.es_client.indices.refresh(index=vs_info['id'])
+
+    assert deleted_count == 1
+    with pytest.raises(Exception):
+        vector_store.es_client.get(index=vs_info['id'], id="doc1.txt")
+
+def test_update_tools(vector_store):
+    """update_tools leaves exactly one ``file_search`` tool carrying the context's ``max_num_results``."""
+    context = {"max_num_results": 10}
+    vs_info = vector_store.get_or_create_vector_store(context, "test_context", True)
+
+    vector_store.update_tools(context, vs_info)
+
+    assert len(vector_store.tools) == 1
+    assert vector_store.tools[0]['type'] == 'file_search'
+    assert vector_store.tools[0]['file_search']['max_num_results'] == 10
+
+def test_update_tool_resources(vector_store):
+    """update_tool_resources lists the store's id under ``file_search.vector_store_ids``."""
+    vs_info = vector_store.get_or_create_vector_store({}, "test_context", True)
+
+    vector_store.update_tool_resources({}, vs_info)
+
+    assert vector_store.tool_resources is not None
+    assert 'file_search' in vector_store.tool_resources
+    assert vector_store.tool_resources['file_search']['vector_store_ids'] == [vs_info['id']]
+