jonfairbanks · JoepdeJong · May 27, 2024 · May 27, 2024 · May 27, 2024
diff --git a/.gitignore b/.gitignore
@@ -4,3 +4,5 @@ data/*
 .cache/*
 .nv/*
 .DS_Store
+.venv/*
+indices/
diff --git a/Pipfile b/Pipfile
@@ -21,6 +21,8 @@ torch = { version = "*", sys_platform = "!= 'darwin'" }
 colorama = { version = "*", sys_platform = "== 'win32'" }
 ebooklib = "*"
 llama-index-core = "*"
+typing_extensions = "*"
+fsspec = "*"
 
 [dev-packages]
 

diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/components/page_state.py b/components/page_state.py
@@ -83,6 +83,9 @@ def set_initial_state():
     if "chat_mode" not in st.session_state:
         st.session_state["chat_mode"] = "compact"
 
+    if "persisted_index_id" not in st.session_state:
+        st.session_state["persisted_index_id"] = None
+
     #####################
     # Advanced Settings #
     #####################

diff --git a/components/tabs/local_files.py b/components/tabs/local_files.py
@@ -1,12 +1,4 @@
-import os
-import shutil
-
 import streamlit as st
-
-import utils.helpers as func
-import utils.ollama as ollama
-import utils.llama_index as llama_index
-import utils.logs as logs
 import utils.rag_pipeline as rag
 
 supported_files = (
@@ -42,9 +34,22 @@ def local_files():
                 disabled=True,
             )
 
-    if len(uploaded_files) > 0:
+    st.text_input(
+        "Persisted Index ID",
+        key="persisted_index_id",
+        placeholder="local-rag-index",
+        help="The unique identifier for the persisted index.",
+    )
+    st.caption(
+        "Persisted indices are stored on disk and can be reloaded for future use. This is useful for retaining the index state across sessions."
+    )
+
+    if len(uploaded_files) > 0 or st.session_state["persisted_index_id"]:
         st.session_state["file_list"] = uploaded_files
 
+        if len(uploaded_files) == 0:
+            uploaded_files = None
+
         with st.spinner("Processing..."):
             # Initiate the RAG pipeline, providing documents to be saved on disk if necessary
             error = rag.rag_pipeline(uploaded_files)

diff --git a/utils/llama_index.py b/utils/llama_index.py
@@ -5,6 +5,10 @@
 import utils.logs as logs
 
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.core import StorageContext, load_index_from_storage
+
+from typing import Sequence
+from llama_index.core.schema import Document
 
 # This is not used but required by llama-index and must be set FIRST
 os.environ["OPENAI_API_KEY"] = "sk-abc123"
@@ -142,6 +146,96 @@ def create_index(_documents):
         raise Exception(f"Index creation failed: {err}")
 
 
+def get_persisted_index(
+    index_id: str, persist_dir: str, _documents: Sequence[Document]
+):
+    """
+    Load the index stored under `index_id` in `persist_dir`, or build and persist it.
+
+    Args:
+        index_id (str): Identifier of the index within the persisted storage.
+        persist_dir (str): Directory the index is loaded from and persisted to.
+        _documents (Sequence[Document]): Documents the index must contain. May be
+            None, in which case an existing index is loaded without changes.
+
+    Returns:
+        The loaded index, or a newly created one when loading was not possible.
+
+    Raises:
+        Exception: If the index could not be loaded and no documents were given,
+            or if creating the index from the documents failed.
+    """
+    storage_context = None
+    index = None
+
+    # On first use nothing has been persisted yet; reading the storage context
+    # from a missing directory would fail before an index could ever be created.
+    if os.path.isdir(persist_dir):
+        try:
+            storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
+        except FileNotFoundError as e:
+            logs.log.info(f"No persisted storage found in {persist_dir}: {e}")
+
+    if storage_context is None:
+        # Start from empty stores; they are written to persist_dir further below
+        storage_context = StorageContext.from_defaults()
+    else:
+        try:
+            index = load_index_from_storage(storage_context, index_id=index_id)
+        except Exception as e:
+            # Keep the loaded storage context so that a new index is persisted
+            # next to whatever is already stored in persist_dir
+            logs.log.error(f"Error loading persisted index: {e}")
+
+    if index is not None:
+        logs.log.info("Index loaded from persisted storage successfully")
+        st.caption("✔️ Loaded Persisted Index")
+
+        # Check if all uploaded documents are present in the index
+        if _documents is not None:
+            # NOTE(review): presumably these hashes are comparable to `doc.hash`;
+            # verify against how the docstore records hashes for chunked documents
+            document_hashes = index.docstore.get_all_document_hashes()
+            index_changed = False
+
+            for doc in _documents:
+                if doc.hash not in document_hashes:
+                    logs.log.info(
+                        "Document {} not found in index. Adding it...".format(
+                            doc.get_doc_id()
+                        )
+                    )
+                    index.insert(doc)
+                    index_changed = True
+
+            if index_changed:
+                # Without this the inserted documents would be gone on the next load
+                index.storage_context.persist(persist_dir=persist_dir)
+
+        return index
+
+    if _documents is None:
+        raise Exception("Cannot create persisted index without documents.")
+
+    try:
+        index = VectorStoreIndex.from_documents(
+            documents=_documents,
+            show_progress=True,
+            storage_context=storage_context,
+        )
+
+        logs.log.info("Index created from loaded documents successfully")
+
+        index.set_index_id(index_id)
+        index.storage_context.persist(persist_dir=persist_dir)
+        st.caption("✔️ Created Persisted Index")
+    except Exception as err:
+        logs.log.error(f"Index creation failed: {err}")
+        raise Exception(f"Index creation failed: {err}") from err
+
+    return index
+
+
 ###################################
 #
 # Create Query Engine
@@ -169,7 +263,15 @@ def create_query_engine(_documents):
         This function uses the `create_index` function to create an index from the provided documents and service context, and then creates a query engine from the resulting index. The `query_engine` parameter is used to specify the parameters of the query engine, including the number of top-ranked items to return (`similarity_top_k`) and the response mode (`response_mode`).
     """
     try:
-        index = create_index(_documents)
+        if st.session_state["persisted_index_id"]:
+            index = get_persisted_index(
+                index_id=st.session_state["persisted_index_id"],
+                persist_dir=os.getcwd() + "/indices",
+                _documents=_documents,
+            )
+        else:
+            index = create_index(_documents)
+            st.caption("✔️ Created File Index")
 
         query_engine = index.as_query_engine(
             similarity_top_k=st.session_state["top_k"],

diff --git a/utils/rag_pipeline.py b/utils/rag_pipeline.py
@@ -110,9 +110,12 @@ def rag_pipeline(uploaded_files: list = None):
     if (
         st.session_state["documents"] is not None
         and len(st.session_state["documents"]) > 0
+        and not st.session_state["persisted_index_id"]
     ):
         logs.log.info("Documents are already available; skipping document loading")
         st.caption("✔️ Processed File Data")
+    elif st.session_state["persisted_index_id"] and uploaded_files is None:
+        logs.log.info("No new documents to load; using persisted index")
     else:
         try:
             save_dir = os.getcwd() + "/data"
@@ -133,7 +136,6 @@ def rag_pipeline(uploaded_files: list = None):
         llama_index.create_query_engine(
             st.session_state["documents"],
         )
-        st.caption("✔️ Created File Index")
     except Exception as err:
         logs.log.error(f"Index Creation Error: {str(err)}")
         error = err