Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Support ArangoDB simple loader #900

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions llama_hub/arango_db/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# ArangoDB Loader

This loader loads documents from ArangoDB. The user specifies a ArangoDB instance to
initialize the reader. They then specify the collection name and query params to
fetch the relevant docs.

## Usage

Here's an example usage of the SimpleArangoDBReader.

```python
from llama_index import download_loader
import os

SimpleArangoDBReader = download_loader('SimpleArangoDBReader')

host = "<host>"
db_name = "<db_name>"
collection_name = "<collection_name>"
# query_dict is passed into db.collection.find()
query_dict = {}
# Attribute of interests to load, by default ["text"]
field_names = ["title", "description"]
reader = SimpleArangoDBReader(host) # or pass ArangoClient
documents = reader.load_data(
username,
password,
db_name,
collection_name,
query_dict=query_dict,
field_names=field_names,
)
```

This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/run-llama/llama-hub/tree/main/llama_hub) for examples.
5 changes: 5 additions & 0 deletions llama_hub/arango_db/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from llama_hub.arango_db.base import (
SimpleArangoDBReader,
)

__all__ = ["SimpleArangoDBReader"]
138 changes: 138 additions & 0 deletions llama_hub/arango_db/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
"""ArangoDB client."""

from typing import Any, Dict, Iterator, List, Optional, Union, cast

from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document


class SimpleArangoDBReader(BaseReader):
"""Simple arangodb reader.
Concatenates each ArangoDB doc into Document used by LlamaIndex.
Args:
host: (Union[str, List[str]]) list of urls or url for connecting to the db
client: (Any) ArangoDB client
"""

def __init__(
self, host: Optional[Union[str, List[str]]] = None, client: Optional[Any] = None
) -> None:
"""Initialize with parameters."""
try:
from arango import ArangoClient
except ImportError as err:
raise ImportError(
"`arango` package not found, please run `pip install python-arango`"
) from err

host = host or "http://127.0.0.1:8529"
self.client = client or ArangoClient(hosts=host)
self.client = cast(ArangoClient, self.client)

def _flatten(self, texts: List[Union[str, List[str]]]) -> List[str]:
result = []
for text in texts:
result += text if isinstance(text, list) else [text]
return result

def lazy_load(
self,
username: str,
password: str,
db_name: str,
collection_name: str,
field_names: List[str] = ["text"],
separator: str = " ",
query_dict: Optional[Dict] = {},
max_docs: int = None,
metadata_names: Optional[List[str]] = None,
) -> Iterator[Document]:
"""Lazy load data from ArangoDB.
Args:
username (str): for credentials.
password (str): for credentials.
db_name (str): name of the database.
collection_name (str): name of the collection.
field_names(List[str]): names of the fields to be concatenated.
Defaults to ["text"]
separator (str): separator to be used between fields.
Defaults to " "
query_dict (Optional[Dict]): query to filter documents. Read more
at [docs](https://docs.python-arango.com/en/main/specs.html#arango.collection.StandardCollection.find)
Defaults to empty dict
max_docs (int): maximum number of documents to load.
Defaults to None (no limit)
metadata_names (Optional[List[str]]): names of the fields to be added
to the metadata attribute of the Document. Defaults to None
Returns:
List[Document]: A list of documents.
"""
db = self.client.db(name=db_name, username=username, password=password)
collection = db.collection(collection_name)
cursor = collection.find(filters=query_dict, limit=max_docs)
for item in cursor:
try:
texts = [str(item[name]) for name in field_names]
except KeyError as err:
raise ValueError(
f"{err.args[0]} field not found in arangodb document."
) from err
texts = self._flatten(texts)
text = separator.join(texts)

if metadata_names is None:
yield Document(text=text)
else:
try:
metadata = {name: item[name] for name in metadata_names}
except KeyError as err:
raise ValueError(
f"{err.args[0]} field not found in arangodb document."
) from err
yield Document(text=text, metadata=metadata)

def load_data(
self,
username: str,
password: str,
db_name: str,
collection_name: str,
field_names: List[str] = ["text"],
separator: str = " ",
query_dict: Optional[Dict] = {},
max_docs: int = None,
metadata_names: Optional[List[str]] = None,
) -> List[Document]:
"""Load data from the ArangoDB.
Args:
username (str): for credentials.
password (str): for credentials.
db_name (str): name of the database.
collection_name (str): name of the collection.
field_names(List[str]): names of the fields to be concatenated.
Defaults to ["text"]
separator (str): separator to be used between fields.
Defaults to ""
query_dict (Optional[Dict]): query to filter documents. Read more
at [docs](https://docs.python-arango.com/en/main/specs.html#arango.collection.StandardCollection.find)
Defaults to empty dict
max_docs (int): maximum number of documents to load.
Defaults to 0 (no limit)
metadata_names (Optional[List[str]]): names of the fields to be added
to the metadata attribute of the Document. Defaults to None
Returns:
List[Document]: A list of documents.
"""
return list(
self.lazy_load(
username,
password,
db_name,
collection_name,
field_names,
separator,
query_dict,
max_docs,
metadata_names,
)
)
1 change: 1 addition & 0 deletions llama_hub/arango_db/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python-arango
4 changes: 4 additions & 0 deletions llama_hub/library.json
Original file line number Diff line number Diff line change
Expand Up @@ -1224,6 +1224,10 @@
"id": "file/xml",
"author": "mmaatouk"
},
"SimpleArangoDBReader": {
"id": "arango_db",
"author": "mmaatouk"
},
"CouchbaseReader": {
"id": "couchbase",
"author": "nithishr",
Expand Down
1 change: 1 addition & 0 deletions test_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ olefile
chromadb
snowflake-sqlalchemy
selenium
python-arango

# hotfix
psutil
Expand Down
40 changes: 40 additions & 0 deletions tests/test_arangodb/test_arangodb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from unittest.mock import MagicMock, patch
import pytest

from llama_hub.arango_db import SimpleArangoDBReader


@pytest.fixture
def mock_arangodb_client():
with patch("arango.ArangoClient") as mock_client:
# Mock the behavior of the db and collection
mock_db = MagicMock()
mock_students_collection = MagicMock()

mock_students = [
{"_key": "1", "name": "Alice", "age": 20},
{"_key": "2", "name": "Bob", "age": 21},
{"_key": "3", "name": "Mark", "age": 20},
]

mock_students_collection.find.return_value = mock_students
mock_db.collection.return_value = mock_students_collection
mock_client.db.return_value = mock_db

yield mock_client


def test_load_students(mock_arangodb_client):
reader = SimpleArangoDBReader(client=mock_arangodb_client)
documents = reader.load_data(
username="usr",
password="pass",
db_name="school",
collection_name="students",
field_names=["name", "age"],
)

assert len(documents) == 3
assert documents[0].text == "Alice 20"
assert documents[1].text == "Bob 21"
assert documents[2].text == "Mark 20"
Loading