Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Commit

Permalink
Merge branch 'main' into loader-894/ArangoDB-integration
Browse files Browse the repository at this point in the history
  • Loading branch information
MarouaneMaatouk authored Jan 27, 2024
2 parents 724bb40 + fb4d2fb commit c6b755f
Show file tree
Hide file tree
Showing 16 changed files with 999 additions and 601 deletions.
56 changes: 56 additions & 0 deletions llama_hub/couchbase/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Couchbase Loader

This loader loads documents from a Couchbase cluster.
The user provides either a Couchbase client or connection credentials to initialize the reader,
and specifies the SQL++ query used to fetch the relevant documents.

## Usage

Here's an example usage of the CouchbaseReader.

```python
from llama_index import download_loader
import os

CouchbaseLoader = download_loader('CouchbaseReader')

connection_string = "couchbase://localhost" # valid Couchbase connection string
db_username = "<valid_database_user_with_read_access_to_bucket_with_data>"
db_password = "<password_for_database_user>"

# query is a valid SQL++ query that is passed to client.query()
query = """
SELECT h.* FROM `travel-sample`.inventory.hotel h
WHERE h.country = 'United States'
LIMIT 5
"""

reader = CouchbaseLoader(
connection_string=connection_string,
db_username=db_username,
db_password=db_password
)

# It is also possible to pass an initialized Couchbase client to the document loader
# from couchbase.auth import PasswordAuthenticator # noqa: E402
# from couchbase.cluster import Cluster # noqa: E402
# from couchbase.options import ClusterOptions # noqa: E402

# auth = PasswordAuthenticator(
# db_username,
# db_password,
# )

# couchbase_client = Cluster(connection_string, ClusterOptions(auth))
# reader = CouchbaseLoader(client=couchbase_client)

# fields to be written to the document
text_fields=["name", "title", "address", "reviews"]

# metadata fields to be written to the document's metadata
metadata_fields=["country", "city"]

documents = reader.load_data(query=query, text_fields=text_fields, metadata_fields=metadata_fields)
```

This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/run-llama/llama-hub/tree/main) for examples.
6 changes: 6 additions & 0 deletions llama_hub/couchbase/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""Init file."""
from llama_hub.couchbase.base import (
CouchbaseReader,
)

__all__ = ["CouchbaseReader"]
107 changes: 107 additions & 0 deletions llama_hub/couchbase/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
"""Couchbase document loader"""

from typing import Any, Iterable, List, Optional
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document


class CouchbaseReader(BaseReader):
    """Couchbase document loader.

    Loads data from a Couchbase cluster into Documents used by LlamaIndex.

    Args:
        client (Optional[Any]): A Couchbase client to use.
            If not provided, the client will be created based on the
            connection_string and database credentials.
        connection_string (Optional[str]): The connection string to the Couchbase cluster.
        db_username (Optional[str]): The username to connect to the Couchbase cluster.
        db_password (Optional[str]): The password to connect to the Couchbase cluster.
    """

    def __init__(
        self,
        client: Optional[Any] = None,
        connection_string: Optional[str] = None,
        db_username: Optional[str] = None,
        db_password: Optional[str] = None,
    ) -> None:
        """Initialize Couchbase document loader.

        Raises:
            ImportError: If the `couchbase` SDK is not installed.
            ValueError: If neither a client nor a complete set of
                connection details is provided.
        """
        import_err_msg = "`couchbase` package not found, please run `pip install --upgrade couchbase`"
        try:
            from couchbase.auth import PasswordAuthenticator
            from couchbase.cluster import Cluster
            from couchbase.options import ClusterOptions
        except ImportError:
            raise ImportError(import_err_msg)

        if client:
            self._client = client
        else:
            # No pre-built client: all three connection details are required.
            if not (connection_string and db_username and db_password):
                raise ValueError(
                    "You need to pass either a Couchbase client, or a "
                    "connection_string together with database credentials."
                )
            auth = PasswordAuthenticator(
                db_username,
                db_password,
            )
            self._client: Cluster = Cluster(connection_string, ClusterOptions(auth))

    def lazy_load_data(
        self,
        query: str,
        text_fields: Optional[List[str]] = None,
        metadata_fields: Optional[List[str]] = None,
    ) -> Iterable[Document]:
        """Load data from the Couchbase cluster lazily.

        Args:
            query (str): The SQL++ query to execute.
            text_fields (Optional[List[str]]): The columns to write into the
                `text` field of the document. By default, all columns are
                written.
            metadata_fields (Optional[List[str]]): The columns to write into the
                `metadata` field of the document. By default, no columns are written.

        Raises:
            ValueError: If no query is provided.
        """
        from datetime import timedelta

        if not query:
            raise ValueError("Query must be provided.")

        # Normalize here instead of using a mutable default argument; this also
        # guards against load_data() passing None through.
        if metadata_fields is None:
            metadata_fields = []

        # Ensure connection to Couchbase cluster before querying.
        self._client.wait_until_ready(timedelta(seconds=5))

        # Run SQL++ Query
        result = self._client.query(query)
        for row in result:
            # Default to all columns of the first row when no text fields given.
            if not text_fields:
                text_fields = list(row.keys())

            metadata = {field: row[field] for field in metadata_fields}

            document = "\n".join(
                f"{k}: {v}" for k, v in row.items() if k in text_fields
            )

            yield Document(text=document, metadata=metadata)

    def load_data(
        self,
        query: str,
        text_fields: Optional[List[str]] = None,
        metadata_fields: Optional[List[str]] = None,
    ) -> List[Document]:
        """Load data from the Couchbase cluster.

        Args:
            query (str): The SQL++ query to execute.
            text_fields (Optional[List[str]]): The columns to write into the
                `text` field of the document. By default, all columns are
                written.
            metadata_fields (Optional[List[str]]): The columns to write into the
                `metadata` field of the document. By default, no columns are written.
        """
        return list(self.lazy_load_data(query, text_fields, metadata_fields))
1 change: 1 addition & 0 deletions llama_hub/couchbase/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
couchbase
11 changes: 10 additions & 1 deletion llama_hub/library.json
Original file line number Diff line number Diff line change
Expand Up @@ -1227,5 +1227,14 @@
"SimpleArangoDBReader": {
"id": "arango_db",
"author": "mmaatouk"
},
"CouchbaseReader": {
"id": "couchbase",
"author": "nithishr",
"keywords": [
"Couchbase",
"Capella",
"NoSQL"
]
}
}
}
29 changes: 28 additions & 1 deletion llama_hub/llama_packs/ragatouille_retriever/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,18 @@ def __init__(
)

doc_txts = [doc.get_content() for doc in documents]
doc_ids = [doc.doc_id for doc in documents]
doc_metadatas = [doc.metadata for doc in documents]

# index the documents
if index_path is None:
RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
index_path = RAG.index(index_name=index_name, collection=doc_txts)
index_path = RAG.index(
index_name=index_name,
collection=doc_txts,
document_ids=doc_ids,
document_metadatas=doc_metadatas,
)
else:
RAG = RAGPretrainedModel.from_index(index_path)

Expand All @@ -89,6 +96,26 @@ def __init__(
self.custom_retriever, service_context=ServiceContext.from_defaults(llm=llm)
)

def add_documents(self, documents: List[Document]) -> None:
    """Add documents to the existing RAGatouille index."""
    texts = []
    ids = []
    metadatas = []
    for doc in documents:
        texts.append(doc.get_content())
        ids.append(doc.doc_id)
        metadatas.append(doc.metadata)

    self.RAG.add_to_index(
        new_collection=texts,
        new_document_ids=ids,
        new_document_metadatas=metadatas,
    )

def delete_documents(self, documents: List[Document]) -> None:
    """Remove the given documents from the index by their ids."""
    self.RAG.delete_from_index(
        document_ids=[doc.doc_id for doc in documents],
    )

def get_modules(self) -> Dict[str, Any]:
"""Get modules."""
return {
Expand Down
32 changes: 32 additions & 0 deletions llama_hub/tools/exa/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Exa (formerly Metaphor) Tool

This tool connects to [Exa](https://exa.ai/) to easily enable
your agent to search and get HTML content from the Internet.

To begin, you need to obtain an API key on the [Exa developer dashboard](https://dashboard.exa.ai).

## Usage

This tool has a more extensive example usage documented in a Jupyter notebook [here](https://github.com/emptycrown/llama-hub/tree/main/llama_hub/tools/notebooks/exa.ipynb)

Here's an example usage of the ExaToolSpec.

```python
from llama_hub.tools.exa import ExaToolSpec
from llama_index.agent import OpenAIAgent

exa_tool = ExaToolSpec(
api_key='your-key',
)
agent = OpenAIAgent.from_tools(exa_tool.to_tool_list())

agent.chat('Can you summarize the news published in the last month on superconductors')
```

`search`: Search for a list of articles relating to a natural language query
`retrieve_documents`: Retrieve a list of documents returned from `exa_search`.
`search_and_retrieve_documents`: Combines search and retrieve_documents to directly return a list of documents related to a search
`find_similar`: Find similar documents to a given URL.
`current_date`: Utility for the Agent to get today's date

This loader is designed to be used as a way to load data as a Tool in an Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
6 changes: 6 additions & 0 deletions llama_hub/tools/exa/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""Init file."""
# Re-export the Exa tool spec as this package's public API.
from llama_hub.tools.exa.base import (
    ExaToolSpec,
)

__all__ = ["ExaToolSpec"]
Loading

0 comments on commit c6b755f

Please sign in to comment.