Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Commit

Permalink
Merge branch 'main' into loader-894/ArangoDB-integration
Browse files Browse the repository at this point in the history
  • Loading branch information
MarouaneMaatouk authored Jan 27, 2024
2 parents 724bb40 + fb4d2fb commit c6b755f
Show file tree
Hide file tree
Showing 16 changed files with 999 additions and 601 deletions.
56 changes: 56 additions & 0 deletions llama_hub/couchbase/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Couchbase Loader

This loader loads documents from a Couchbase cluster.
The user provides either a Couchbase client or connection credentials to initialize the reader,
and specifies the SQL++ query used to fetch the relevant documents.

## Usage

Here's an example usage of the CouchbaseReader.

```python
from llama_index import download_loader
import os

CouchbaseLoader = download_loader('CouchbaseReader')

connection_string = "couchbase://localhost" # valid Couchbase connection string
db_username = "<valid_database_user_with_read_access_to_bucket_with_data>"
db_password = "<password_for_database_user>"

# query is a valid SQL++ query that is passed to client.query()
query = """
SELECT h.* FROM `travel-sample`.inventory.hotel h
WHERE h.country = 'United States'
LIMIT 5
"""

reader = CouchbaseLoader(
connection_string=connection_string,
db_username=db_username,
db_password=db_password
)

# It is also possible to pass an initialized Couchbase client to the document loader
# from couchbase.auth import PasswordAuthenticator # noqa: E402
# from couchbase.cluster import Cluster # noqa: E402
# from couchbase.options import ClusterOptions # noqa: E402

# auth = PasswordAuthenticator(
# db_username,
# db_password,
# )

# couchbase_client = Cluster(connection_string, ClusterOptions(auth))
# reader = CouchbaseLoader(client=couchbase_client)

# fields to be written to the document
text_fields=["name", "title", "address", "reviews"]

# metadata fields to be written to the document's metadata
metadata_fields=["country", "city"]

documents = reader.load_data(query=query, text_fields=text_fields, metadata_fields=metadata_fields)
```

This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/run-llama/llama-hub/tree/main) for examples.
6 changes: 6 additions & 0 deletions llama_hub/couchbase/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""Init file."""
from llama_hub.couchbase.base import (
CouchbaseReader,
)

__all__ = ["CouchbaseReader"]
107 changes: 107 additions & 0 deletions llama_hub/couchbase/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
"""Couchbase document loader"""

from typing import Any, Iterable, List, Optional
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document


class CouchbaseReader(BaseReader):
    """Couchbase document loader.

    Loads data from a Couchbase cluster into Documents used by LlamaIndex.

    Args:
        client (Optional[Any]): A Couchbase client to use.
            If not provided, the client will be created based on the
            connection_string and database credentials.
        connection_string (Optional[str]): The connection string to the Couchbase cluster.
        db_username (Optional[str]): The username to connect to the Couchbase cluster.
        db_password (Optional[str]): The password to connect to the Couchbase cluster.
    """

    def __init__(
        self,
        client: Optional[Any] = None,
        connection_string: Optional[str] = None,
        db_username: Optional[str] = None,
        db_password: Optional[str] = None,
    ) -> None:
        """Initialize Couchbase document loader.

        Raises:
            ImportError: If the `couchbase` SDK is not installed.
            ValueError: If neither a client nor a complete set of
                connection details is provided.
        """
        import_err_msg = "`couchbase` package not found, please run `pip install --upgrade couchbase`"
        try:
            from couchbase.auth import PasswordAuthenticator
            from couchbase.cluster import Cluster
            from couchbase.options import ClusterOptions
        except ImportError:
            raise ImportError(import_err_msg)

        if client:
            self._client = client
        else:
            # No pre-built client: all three connection details are required.
            if not (connection_string and db_username and db_password):
                raise ValueError(
                    "You need to pass either a Couchbase client, or a "
                    "connection_string together with database credentials."
                )
            auth = PasswordAuthenticator(
                db_username,
                db_password,
            )
            self._client: Cluster = Cluster(connection_string, ClusterOptions(auth))

    def lazy_load_data(
        self,
        query: str,
        text_fields: Optional[List[str]] = None,
        metadata_fields: Optional[List[str]] = None,
    ) -> Iterable[Document]:
        """Load data from the Couchbase cluster lazily.

        Args:
            query (str): The SQL++ query to execute.
            text_fields (Optional[List[str]]): The columns to write into the
                `text` field of the document. By default, all columns are
                written.
            metadata_fields (Optional[List[str]]): The columns to write into the
                `metadata` field of the document. By default, no columns are written.

        Raises:
            ValueError: If no query is provided.
        """
        from datetime import timedelta

        if not query:
            raise ValueError("Query must be provided.")

        # Normalize here instead of using a mutable default argument; this also
        # guards against load_data() passing None through.
        if metadata_fields is None:
            metadata_fields = []

        # Ensure connection to Couchbase cluster before querying.
        self._client.wait_until_ready(timedelta(seconds=5))

        # Run SQL++ Query
        result = self._client.query(query)
        for row in result:
            # Default to all columns of the first row when no text fields given.
            if not text_fields:
                text_fields = list(row.keys())

            metadata = {field: row[field] for field in metadata_fields}

            document = "\n".join(
                f"{k}: {v}" for k, v in row.items() if k in text_fields
            )

            yield Document(text=document, metadata=metadata)

    def load_data(
        self,
        query: str,
        text_fields: Optional[List[str]] = None,
        metadata_fields: Optional[List[str]] = None,
    ) -> List[Document]:
        """Load data from the Couchbase cluster.

        Args:
            query (str): The SQL++ query to execute.
            text_fields (Optional[List[str]]): The columns to write into the
                `text` field of the document. By default, all columns are
                written.
            metadata_fields (Optional[List[str]]): The columns to write into the
                `metadata` field of the document. By default, no columns are written.
        """
        return list(self.lazy_load_data(query, text_fields, metadata_fields))
1 change: 1 addition & 0 deletions llama_hub/couchbase/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
couchbase
11 changes: 10 additions & 1 deletion llama_hub/library.json
Original file line number Diff line number Diff line change
Expand Up @@ -1227,5 +1227,14 @@
"SimpleArangoDBReader": {
"id": "arango_db",
"author": "mmaatouk"
},
"CouchbaseReader": {
"id": "couchbase",
"author": "nithishr",
"keywords": [
"Couchbase",
"Capella",
"NoSQL"
]
}
}
}
29 changes: 28 additions & 1 deletion llama_hub/llama_packs/ragatouille_retriever/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,18 @@ def __init__(
)

doc_txts = [doc.get_content() for doc in documents]
doc_ids = [doc.doc_id for doc in documents]
doc_metadatas = [doc.metadata for doc in documents]

# index the documents
if index_path is None:
RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
index_path = RAG.index(index_name=index_name, collection=doc_txts)
index_path = RAG.index(
index_name=index_name,
collection=doc_txts,
document_ids=doc_ids,
document_metadatas=doc_metadatas,
)
else:
RAG = RAGPretrainedModel.from_index(index_path)

Expand All @@ -89,6 +96,26 @@ def __init__(
self.custom_retriever, service_context=ServiceContext.from_defaults(llm=llm)
)

def add_documents(self, documents: List[Document]) -> None:
    """Add documents to the existing RAGatouille index."""
    texts = []
    ids = []
    metadatas = []
    for doc in documents:
        texts.append(doc.get_content())
        ids.append(doc.doc_id)
        metadatas.append(doc.metadata)

    self.RAG.add_to_index(
        new_collection=texts,
        new_document_ids=ids,
        new_document_metadatas=metadatas,
    )

def delete_documents(self, documents: List[Document]) -> None:
    """Remove the given documents from the index by their ids."""
    self.RAG.delete_from_index(
        document_ids=[doc.doc_id for doc in documents],
    )

def get_modules(self) -> Dict[str, Any]:
"""Get modules."""
return {
Expand Down
32 changes: 32 additions & 0 deletions llama_hub/tools/exa/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Exa (formerly Metaphor) Tool

This tool connects to [Exa](https://exa.ai/) to easily enable
your agent to search and get HTML content from the Internet.

To begin, you need to obtain an API key on the [Exa developer dashboard](https://dashboard.exa.ai).

## Usage

This tool has a more extensive example usage documented in a Jupyter notebook [here](https://github.com/emptycrown/llama-hub/tree/main/llama_hub/tools/notebooks/exa.ipynb)

Here's an example usage of the ExaToolSpec.

```python
from llama_hub.tools.exa import ExaToolSpec
from llama_index.agent import OpenAIAgent

exa_tool = ExaToolSpec(
api_key='your-key',
)
agent = OpenAIAgent.from_tools(exa_tool.to_tool_list())

agent.chat('Can you summarize the news published in the last month on superconductors')
```

`search`: Search for a list of articles relating to a natural language query
`retrieve_documents`: Retrieve a list of documents returned from `exa_search`.
`search_and_retrieve_documents`: Combines search and retrieve_documents to directly return a list of documents related to a search
`find_similar`: Find similar documents to a given URL.
`current_date`: Utility for the Agent to get today's date

This loader is designed to be used as a way to load data as a Tool in an Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
6 changes: 6 additions & 0 deletions llama_hub/tools/exa/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""Init file."""
# Re-export the Exa tool spec as this package's public API.
from llama_hub.tools.exa.base import (
    ExaToolSpec,
)

__all__ = ["ExaToolSpec"]
Loading

0 comments on commit c6b755f

Please sign in to comment.