This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Commit

Merge branch 'main' into 894/ArangoDB-integration
MarouaneMaatouk authored Jan 25, 2024
2 parents d988b14 + 57af091 commit fa2eb53
Showing 19 changed files with 883 additions and 5 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
# ChangeLog

## v[0.0.75] - 2024-01-20

### New Features

- add vanna pack (#889)

## v[0.0.74] - 2024-01-19

### New Features
19 changes: 19 additions & 0 deletions llama_hub/file/xml/README.md
@@ -0,0 +1,19 @@
# XML Loader

This loader extracts the text from a local XML file. A single local file is passed in each time you call `load_data`.

## Usage

To use this loader, you need to pass in a `Path` to a local file.

```python
from pathlib import Path
from llama_index import download_loader

XMLReader = download_loader("XMLReader")

loader = XMLReader()
documents = loader.load_data(file=Path('../example.xml'))
```
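
You can also control how documents are split by passing `tree_level_split` to the reader's constructor, which sets the level of the XML tree at which documents are created (the default is 0, the root). A sketch based on the reader's constructor, splitting one level below the root:

```python
from pathlib import Path
from llama_index import download_loader

XMLReader = download_loader("XMLReader")

# split at level 1: each level-1 element (and any shallower leaf)
# becomes its own Document
loader = XMLReader(tree_level_split=1)
documents = loader.load_data(file=Path('../example.xml'))
```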

This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/run-llama/llama-hub/tree/main/llama_hub) for examples.
6 changes: 6 additions & 0 deletions llama_hub/file/xml/__init__.py
@@ -0,0 +1,6 @@
"""Init file."""
from llama_hub.file.xml.base import (
    XMLReader,
)

__all__ = ["XMLReader"]
95 changes: 95 additions & 0 deletions llama_hub/file/xml/base.py
@@ -0,0 +1,95 @@
"""JSON Reader."""

import re
from pathlib import Path
from typing import Dict, List, Optional

from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document
import xml.etree.ElementTree as ET


def _get_leaf_nodes_up_to_level(root: ET.Element, level: int) -> List[ET.Element]:
    """Get the collection of nodes up to a certain level, including leaf nodes.

    Args:
        root (ET.Element): XML root element.
        level (int): Number of levels to traverse in the tree.

    Returns:
        List[ET.Element]: List of target nodes.
    """

    def traverse(current_node, current_level):
        if len(current_node) == 0 or level == current_level:
            # keep leaf nodes and target-level nodes
            nodes.append(current_node)
        elif current_level < level:
            # move to the next level
            for child in current_node:
                traverse(child, current_level + 1)

    nodes = []
    traverse(root, 0)
    return nodes


class XMLReader(BaseReader):
    """XML reader.

    Reads XML documents with options to help suss out relationships
    between nodes.

    Args:
        tree_level_split (int): Level of the XML tree at which documents
            are split. The default is 0, the root level.
    """

    def __init__(self, tree_level_split: int = 0) -> None:
        """Initialize with arguments."""
        super().__init__()
        self.tree_level_split = tree_level_split

    def _parse_xmlelt_to_document(
        self, root: ET.Element, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse the XML element into a list of Documents.

        Args:
            root: The XML element to be converted.
            extra_info (Optional[Dict]): Additional information. Default is None.

        Returns:
            List[Document]: The documents.
        """
        nodes = _get_leaf_nodes_up_to_level(root, self.tree_level_split)
        documents = []
        for node in nodes:
            content = ET.tostring(node, encoding="utf8").decode("utf-8")
            # strip the XML declaration that ET.tostring prepends
            content = re.sub(r"^<\?xml.*", "", content)
            content = content.strip()
            documents.append(Document(text=content, extra_info=extra_info or {}))

        return documents

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
    ) -> List[Document]:
        """Load data from the input file.

        Args:
            file (Path): Path to the input file.
            extra_info (Optional[Dict]): Additional information. Default is None.

        Returns:
            List[Document]: List of documents.
        """
        if not isinstance(file, Path):
            file = Path(file)

        tree = ET.parse(file)
        documents = self._parse_xmlelt_to_document(tree.getroot(), extra_info)

        return documents
Empty file.
4 changes: 4 additions & 0 deletions llama_hub/library.json
@@ -1224,4 +1224,8 @@
"id": "arangodb",
"author": "mmaatouk"
}
"XMLReader": {
"id": "file/xml",
"author": "mmaatouk"
}
}
14 changes: 11 additions & 3 deletions llama_hub/llama_packs/library.json
@@ -88,7 +88,8 @@
"VoyageQueryEnginePack": {
"id": "llama_packs/voyage_query_engine",
"author": "Liuhong99",
"keywords": ["voyage", "query", "retrieval", "embeddings"]
"keywords": ["voyage", "query", "retrieval", "embeddings"],
"example": true
},
"VectaraRagPack": {
"id": "llama_packs/vectara_rag",
@@ -153,7 +154,8 @@
"RagEvaluatorPack": {
"id": "llama_packs/rag_evaluator",
"author": "nerdai",
"keywords": ["rag", "evaluation", "benchmarks"]
"keywords": ["rag", "evaluation", "benchmarks"],
"example": true
},
"LlamaDatasetMetadataPack": {
"id": "llama_packs/llama_dataset_metadata",
@@ -236,7 +238,8 @@
"RAGFusionPipelinePack": {
"id": "llama_packs/query/rag_fusion_pipeline",
"author": "jerryjliu",
"keywords": ["rag", "fusion", "pipeline", "query"]
"keywords": ["rag", "fusion", "pipeline", "query"],
"example": true
},
"AgentSearchRetrieverPack": {
"id": "llama_packs/agent_search_retriever",
@@ -262,5 +265,10 @@
"id": "llama_packs/stock_market_data_query_engine",
"author": "anoopshrma",
"keywords": ["stock", "market", "data", "query", "engine"]
},
"VannaPack": {
"id": "llama_packs/vanna",
"author": "jerryjliu",
"keywords": ["vanna", "sql", "ai", "text-to-sql"]
}
}
24 changes: 24 additions & 0 deletions llama_hub/llama_packs/query/rag_fusion_pipeline/example.py
@@ -0,0 +1,24 @@
# Required Environment Variables: OPENAI_API_KEY

from pathlib import Path
from llama_index import download_loader
from llama_index.llama_pack import download_llama_pack
from llama_index.llms.openai import OpenAI

# download and install dependencies
RAGFusionPipelinePack = download_llama_pack(
    "RAGFusionPipelinePack", "./rag_fusion_pipeline_pack"
)
PDFReader = download_loader("PDFReader")

# load documents
loader = PDFReader()
document_path = Path("./data/101.pdf") # replace with your own document
documents = loader.load_data(file=document_path)

# create the pack
pack = RAGFusionPipelinePack(documents, llm=OpenAI(model="gpt-3.5-turbo"))

# run the pack
response = pack.run(input="How to rewrite history?")
print(response)
27 changes: 27 additions & 0 deletions llama_hub/llama_packs/rag_evaluator/example.py
@@ -0,0 +1,27 @@
# Required Environment Variables: OPENAI_API_KEY

from llama_index.llama_dataset import download_llama_dataset
from llama_index.llama_pack import download_llama_pack
from llama_index import VectorStoreIndex

# download a LabelledRagDataset from llama-hub
rag_dataset, documents = download_llama_dataset(
    "PaulGrahamEssayDataset", "./paul_graham"
)

# build a basic RAG pipeline off of the source documents
index = VectorStoreIndex.from_documents(documents=documents)
query_engine = index.as_query_engine()

# Time to benchmark/evaluate this RAG pipeline
# Download and install dependencies
RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./rag_evaluator_pack")

# construction requires a query_engine, a rag_dataset, and optionally a judge_llm
rag_evaluator_pack = RagEvaluatorPack(
    query_engine=query_engine, rag_dataset=rag_dataset
)

# PERFORM EVALUATION
benchmark_df = rag_evaluator_pack.run() # async arun() also supported
print(benchmark_df)
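
# alternatively, run the evaluation asynchronously; a sketch, assuming
# arun() takes the same (no) arguments as run(), per the note above:
#
#   import asyncio
#   benchmark_df = asyncio.run(rag_evaluator_pack.arun())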
56 changes: 56 additions & 0 deletions llama_hub/llama_packs/vanna/README.md
@@ -0,0 +1,56 @@
# Vanna AI LlamaPack

Vanna AI is an open-source RAG framework for SQL generation. It works in two steps:
1. Train a RAG model on your data
2. Ask questions (the reference corpus is used to generate SQL queries that can run on your database).

Check out the [Github project](https://github.com/vanna-ai/vanna) and the [docs](https://vanna.ai/docs/) for more details.

This LlamaPack creates a simple `VannaQueryEngine` with Vanna, ChromaDB, and OpenAI, allowing you to train on and ask questions over a SQL database.

## CLI Usage

You can download llamapacks directly using `llamaindex-cli`, which comes installed with the `llama-index` Python package:

```bash
llamaindex-cli download-llamapack VannaPack --download-dir ./vanna_pack
```

You can then inspect the files at `./vanna_pack` and use them as a template for your own project!

## Code Usage

You can download the pack to a `./vanna_pack` directory:

```python
from llama_index.llama_pack import download_llama_pack

# download and install dependencies
VannaPack = download_llama_pack(
    "VannaPack", "./vanna_pack"
)
```

From here, you can use the pack, or inspect and modify the pack in `./vanna_pack`.

Then, you can set up the pack like so:

```python
pack = VannaPack(
    openai_api_key="<openai_api_key>",
    sql_db_url="chinook.db",
    openai_model="gpt-3.5-turbo"
)
```

The `run()` function is a light wrapper around `llm.complete()`.

```python
response = pack.run("List some sample albums")
```

You can also use modules individually.

```python
query_engine = pack.get_modules()["vanna_query_engine"]
```
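
Assuming `vanna_query_engine` follows the standard LlamaIndex query engine interface, you can then query it directly:

```python
response = query_engine.query("Which artist has the most albums?")
print(response)
```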
1 change: 1 addition & 0 deletions llama_hub/llama_packs/vanna/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Init params."""