From a067ed54b9d2fa46fd57d05b83ca5a81c00228d2 Mon Sep 17 00:00:00 2001
From: Brandon Vargus
Date: Thu, 21 Nov 2024 10:08:22 -0500
Subject: [PATCH 1/6] added rerank.py; code will also be available in notebook

---
 PoC/rerank.py | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 PoC/rerank.py

diff --git a/PoC/rerank.py b/PoC/rerank.py
new file mode 100644
index 0000000..a0aabbb
--- /dev/null
+++ b/PoC/rerank.py
@@ -0,0 +1 @@
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pandas as pd
from langchain_openai import ChatOpenAI

# replace with file of your choosing
file = open("sample_full_text.json")
full_text = json.load(file)

# metadata csv file; should be included in repo
df_attributes = pd.read_csv("metadata_attributes.csv")

model = ChatOpenAI()  # requires OPENAI_API_KEY; rebound to a sentence encoder below, since nothing in this script calls the chat model

import re

def get_title(text):
    # Strip the pandas Series index and dtype line, keeping only the field value
    match = re.search(r'\d+\s+(.+?)\n', text)

    # Extracting and returning the title if there's a match
    if match:
        title = match.group(1)
        return title

# Turn the BPL data into a Document
from langchain.schema import Document

documents = []

for doc in full_text:
    # Extract metadata fields and apply get_title()
    title = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "title_info_primary_tsi"]))
    title_subtitle = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "title_info_primary_subtitle_tsi"]))
    title_alt = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "title_info_alternative_tsim"]))
    abstract = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "abstract_tsi"]))
    subject_facet = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "subject_facet_ssim"]))
    subject_geographic = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "subject_geographic_sim"]))
    genre = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "genre_basic_ssim"]))
    genre_specific = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "genre_specific_ssim"]))
    name_facet = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "name_facet_ssim"]))
    name_role = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "name_role_tsim"]))
    date_human = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "date_tsim"]))
    date_start = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "date_start_dtsi"]))
    date_end = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "date_end_dtsi"]))
    publisher = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "publisher_tsi"]))
    collection_name = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "collection_name_ssim"]))
    physical_location = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "physical_location_ssim"]))
    related_item_host = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "related_item_host_ssim"]))
    type_of_resource = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "type_of_resource_ssim"]))
    URL = "https://www.digitalcommonwealth.org/search/" + get_title(str(df_attributes.loc[df_attributes["id"] == doc, "id"]))

    # Create Document with metadata
    documents.append(Document(
        page_content=full_text[doc]['text'],
        metadata={
            "title": title,
            "subtitle": title_subtitle,
            "title_alt": title_alt,
            "abstract": abstract,
            "subject_facet": subject_facet,
            "subject_geographic": subject_geographic,
            "genre": genre,
            "genre_specific": genre_specific,
            "name_facet": name_facet,
            "name_role": name_role,
            "date_human": date_human,
            "date_start": date_start,
            "date_end": date_end,
            "publisher": publisher,
            "collection_name": collection_name,
            "physical_location": physical_location,
            "related_item_host": related_item_host,
            "type_of_resource": type_of_resource,
            "URL": URL
        }
    ))

# Now for all of the vector store and reranking stuff
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# embeddings model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# creating the vector store
index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

# index the documents so similarity_search_with_score has something to search
# (mirrors the add_documents step in POC.ipynb)
from uuid import uuid4
uuids = [str(uuid4()) for _ in range(len(documents))]
vector_store.add_documents(documents=documents, ids=uuids)

# now for the reranking step
weights = {
    "title": 1.0,
    "subtitle": 0.95,
    "title_alt": 0.9,
    "abstract": 0.85,
    "subject_facet": 0.8,
    "subject_geographic": 0.75,
    "genre": 0.7,
    "genre_specific": 0.65,
    "name_facet": 0.6,
    "name_role": 0.55,
    "date_human": 0.5,
    "date_start": 0.45,
    "date_end": 0.4,
    "publisher": 0.35,
    "collection_name": 0.3,
    "physical_location": 0.25,
    "related_item_host": 0.2,
    "type_of_resource": 0.15,
    "URL": 0.1
}

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from langchain.embeddings import HuggingFaceEmbeddings

# our vector store:

# embedding model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# compute_relevance_score() below calls model.encode(), which ChatOpenAI does not
# provide, so bind model to the same sentence-transformers encoder for reranking
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

def compute_relevance_score(metadata_value, query):
    """
    Compute cosine similarity between the query and a metadata value using sentence-transformers.

    Args:
        metadata_value (str): The metadata value to compare.
        query (str): The query string.

    Returns:
        float: Cosine similarity score (between 0 and 1).
    """
    if not metadata_value or not query:
        return 0  # Return 0 if either the metadata or query is empty

    # Encode the metadata value and query into embeddings
    embeddings = model.encode([metadata_value, query], convert_to_tensor=False)  # Convert to NumPy
    metadata_embedding, query_embedding = embeddings

    # Compute cosine similarity
    similarity = cosine_similarity([metadata_embedding], [query_embedding])
    return similarity[0][0]  # Extract the scalar similarity value


def rerank_documents(documents, query, weights, vector_store, k=10):
    """
    Rerank documents based on metadata relevance scores and FAISS vector similarity scores.

    Args:
        documents (list): List of Document objects.
        query (str): The query string used for retrieval.
        weights (dict): Weights for each metadata field.
        vector_store (FAISS): The vector store itself, used to get the similarity score.
        k (int): Number of candidates to retrieve before reranking.

    Returns:
        list: Reranked documents in descending order of relevance.
""" final_score = 0 reranked_results = [] returned_docs = vector_store.similarity_search_with_score(query, k) for doc in returned_docs: final_score = doc[1] # Add weighted relevance scores for each metadata field for field, weight in weights.items(): metadata_value = doc[0].metadata.get(field, "") # Safely get metadata field value relevance_score = compute_relevance_score(metadata_value, query) final_score += weight * relevance_score reranked_results.append((doc, final_score)) # Sort documents by the final score in descending order reranked_results.sort(key=lambda x: x[1], reverse=True) return [doc for doc, score in reranked_results] docs = rerank_documents(documents, "Newspaper", weights, vector_store) # now we should get an output like this for some k value: # ('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:gf06jp23d', 'Reranked score: 1.1741459369659424') docs_list = [(docs[i][0].metadata['title'], docs[i][0].metadata['URL'], f"Reranked score: {docs[i][1]}") for i in range(len(docs))] docs_list.sort(key=lambda x: x[2], reverse=True) for doc in docs_list: print(doc) \ No newline at end of file From 055907359ba2a67d672044b1f94fe2ded35f29e1 Mon Sep 17 00:00:00 2001 From: Brandon Vargus Date: Thu, 21 Nov 2024 10:15:40 -0500 Subject: [PATCH 2/6] included reranking --- PoC/POC.ipynb | 1733 ++++++++++++++++++++++++++++--------------------- 1 file changed, 979 insertions(+), 754 deletions(-) diff --git a/PoC/POC.ipynb b/PoC/POC.ipynb index 8f8f0f0..4c2f270 100644 --- a/PoC/POC.ipynb +++ b/PoC/POC.ipynb @@ -5,12 +5,12 @@ "id": "2e7eb6e2-0a1b-42ea-8127-a51f13b4b4b0", "metadata": {}, "source": [ - "# LibRAG Proof of Concept" + "# LibRAG Deployment Phase" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "3934dc57-c4c3-4892-bcf4-bce4aa5c48af", "metadata": {}, "outputs": [], @@ -23,27 +23,29 @@ ] }, { - "cell_type": "code", - "execution_count": 3, - "id": "8848a3a2-6be4-40f3-87ee-0c9dc099117e", + "cell_type": "markdown", + "id": "5a24d843-3692-49d5-8167-af46cf4a1f5a", "metadata": {}, - "outputs": [], "source": [ - "#!pip install sentence_transformers" + "### We are going to ensure that we have our data downloaded from the SCC.\n", + "### We are going to download one interval of the full text, as well as the entire metadata file" ] }, { - "cell_type": "markdown", - "id": "5a24d843-3692-49d5-8167-af46cf4a1f5a", + "cell_type": "code", + "execution_count": null, + "id": "30d31b31-2c2a-40ae-99b6-5502395f8bc7", "metadata": {}, + "outputs": [], "source": [ - "### We are going to ensure that we have our data downloaded from the SCC.\n", - "### We are going to download one interval of the full text, as well as the entire metadata file" + "# file path for metadata file on SCC: /projectnb/sparkgrp/ml-bpl-rag-data/full_data/bpl_data.json\n", + "meta = open(\"../EDA Phase/bpl-digital-commonwealth/bpl_data.json\")\n", + "bpl_metadata = json.load(meta)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "id": "b50d8c5f-dfea-454f-860e-13315a9c2fea", "metadata": {}, "outputs": [], @@ -56,32 +58,20 @@ }, { "cell_type": "code", - "execution_count": 15, - "id": "6567c6f4-1f82-4cfe-aa51-7f4a3bacf6af", + "execution_count": null, + "id": "1e4c5601-648f-4ff1-a5ce-d8823c46f884", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "133\n" - ] - } - ], + "outputs": [], "source": [ "print(len(full_text))" ] }, { - "cell_type": "code", - "execution_count": 5, - "id": 
"30d31b31-2c2a-40ae-99b6-5502395f8bc7", + "cell_type": "markdown", + "id": "1148a15c-0965-4613-bdb4-15e74467fd16", "metadata": {}, - "outputs": [], "source": [ - "# file path for metadata file on SCC: /projectnb/sparkgrp/ml-bpl-rag-data/full_data/bpl_data.json\n", - "meta = open(\"../EDA Phase/bpl-digital-commonwealth/bpl_data.json\")\n", - "bpl_metadata = json.load(meta)" + "Here's how to access the text documents:" ] }, { @@ -137,98 +127,10 @@ }, { "cell_type": "markdown", - "id": "d6f92651-0ff8-4ec7-b454-446e83e9f1d9", - "metadata": {}, - "source": [ - "### Embedding a paragraph using Word2Vec" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "255817ec-711d-4888-81f5-6374c59e8f95", - "metadata": {}, - "outputs": [], - "source": [ - "# from sentence_transformers import SentenceTransformer\n", - "\n", - "# # Load a pre-trained Sentence-BERT model\n", - "# model = SentenceTransformer('paraphrase-MiniLM-L6-v2')\n", - "\n", - "# # Example paragraph\n", - "# paragraph = full_text['commonwealth:w3764603d']['text']\n", - "# paragraph_embedding = model.encode(paragraph)\n", - "\n", - "# # Output: a vector representation of the paragraph\n", - "# print(paragraph_embedding)\n" - ] - }, - { - "cell_type": "markdown", - "id": "5cfe50d1-9efb-405f-aec4-5091dace7222", - "metadata": {}, - "source": [ - "### Setting up a Retriever" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "fdc1e05d-35ca-45e7-af73-b5ea8d26199f", - "metadata": {}, - "outputs": [], - "source": [ - "#!pip install langchain openai faiss-cpu" - ] - }, - { - "cell_type": "markdown", - "id": "d3033fe3-995e-46d5-86f4-5eda6a7e266a", - "metadata": {}, - "source": [ - "#### After ensuring we have the necessary dependencies, we are going to make our retriever" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "9e509e10-d10b-465c-83ba-e7ba9828f8fd", + "id": "bb3f0d7e-a982-4e18-a9eb-783df449ff09", "metadata": {}, - "outputs": [], "source": [ - "from typing import List\n", - "\n", - "from langchain_core.callbacks import CallbackManagerForRetrieverRun\n", - "from langchain_core.documents import Document\n", - "from langchain_core.retrievers import BaseRetriever\n", - "\n", - "\n", - "class ToyRetriever(BaseRetriever):\n", - " \"\"\"A toy retriever that contains the top k documents that contain the user query.\n", - "\n", - " This retriever only implements the sync method _get_relevant_documents.\n", - "\n", - " If the retriever were to involve file access or network access, it could benefit\n", - " from a native async implementation of `_aget_relevant_documents`.\n", - "\n", - " As usual, with Runnables, there's a default async implementation that's provided\n", - " that delegates to the sync implementation running on another thread.\n", - " \"\"\"\n", - "\n", - " documents: List[Document]\n", - " \"\"\"List of documents to retrieve from.\"\"\"\n", - " k: int\n", - " \"\"\"Number of top results to return\"\"\"\n", - " def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun\n", - " ) -> List[str]:\n", - " matching_documents = []\n", - " for document in documents:\n", - " if len(matching_documents) >= self.k:\n", - " return matching_documents\n", - "\n", - " if query.lower() in document.page_content.lower():\n", - " matching_documents.append(document.metadata['title'])\n", - " return matching_documents\n" + "### Create Metadata Dataframe" ] }, { @@ -251,98 +153,6 @@ "df.drop(columns=df.columns[0], axis=1, inplace=True)" ] }, - { - "cell_type": "code", - 
"execution_count": 12, - "id": "fc415a0f-6e2e-4b2d-8355-62a569806380", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
typeattributeslinks
0DigitalObject{'id': 'commonwealth-oai:xp68md23x', 'system_c...{'self': 'https://www.digitalcommonwealth.org/...
1DigitalObject{'id': 'commonwealth-oai:xp68m844v', 'system_c...{'self': 'https://www.digitalcommonwealth.org/...
2DigitalObject{'id': 'commonwealth-oai:xp68mb49n', 'system_c...{'self': 'https://www.digitalcommonwealth.org/...
3DigitalObject{'id': 'commonwealth-oai:xp68mc60v', 'system_c...{'self': 'https://www.digitalcommonwealth.org/...
4DigitalObject{'id': 'commonwealth-oai:xp68mc72n', 'system_c...{'self': 'https://www.digitalcommonwealth.org/...
\n", - "
" - ], - "text/plain": [ - " type attributes \\\n", - "0 DigitalObject {'id': 'commonwealth-oai:xp68md23x', 'system_c... \n", - "1 DigitalObject {'id': 'commonwealth-oai:xp68m844v', 'system_c... \n", - "2 DigitalObject {'id': 'commonwealth-oai:xp68mb49n', 'system_c... \n", - "3 DigitalObject {'id': 'commonwealth-oai:xp68mc60v', 'system_c... \n", - "4 DigitalObject {'id': 'commonwealth-oai:xp68mc72n', 'system_c... \n", - "\n", - " links \n", - "0 {'self': 'https://www.digitalcommonwealth.org/... \n", - "1 {'self': 'https://www.digitalcommonwealth.org/... \n", - "2 {'self': 'https://www.digitalcommonwealth.org/... \n", - "3 {'self': 'https://www.digitalcommonwealth.org/... \n", - "4 {'self': 'https://www.digitalcommonwealth.org/... " - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, { "cell_type": "code", "execution_count": 14, @@ -746,8 +556,7 @@ ], "source": [ "df_attributes = pd.json_normalize(df['attributes'])\n", - "df_attributes = pd.concat([df.drop(columns=['attributes']), df_attributes], axis=1)\n", - "df_attributes.head(10)" + "df_attributes = pd.concat([df.drop(columns=['attributes']), df_attributes], axis=1)" ] }, { @@ -760,585 +569,1003 @@ "df_attributes.to_csv(\"metadata_attributes.csv\")" ] }, + { + "cell_type": "markdown", + "id": "ce755e72-348e-45b3-8cb4-7e0202818e16", + "metadata": {}, + "source": [ + "### Optionally, read the csv if it is already downloaded" + ] + }, { "cell_type": "code", - "execution_count": 13, - "id": "92a7759e-36d6-455a-a1e4-bafcb2041f7d", + "execution_count": 4, + "id": "49e1b410-a7b0-4ef3-bbb8-2c240e568a73", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/ipykernel_12916/3787498575.py:1: DtypeWarning: Columns (10,16,17,18,20,21,22,23,24,25,27,29,33,34,36,41,42,43,44,54,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,128,129,130,131,132,133,134,135,136,137,138,139,140) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " df_attributes = pd.read_csv(\"metadata_attributes.csv\")\n" + ] + } + ], + "source": [ + "df_attributes = pd.read_csv(\"metadata_attributes.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "972493d9-63a1-477d-811e-c1d951a2d63c", + "metadata": {}, + "source": [ + "Writing the get_title function to clear away any whitespace and newline characters from the title of each document." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d819646f-51cc-4817-9542-ecfc9ea4af33", "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
typelinksidsystem_create_dtsisystem_modified_dtsicurator_model_ssicurator_model_suffix_ssititle_info_primary_tsigenre_basic_ssimgenre_specific_ssim...storage_key_base_ssidentifier_issn_ssimfrequency_tsicontained_by_ssinote_credits_tsimidentifier_isbn_ssimidentifier_music_publisher_ssimnote_arrangement_tsimtranscription_ark_id_ssitranscription_key_base_ss
0DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68md23x2021-03-04T00:13:09Z2021-09-02T20:40:00ZCurator::DigitalObjectDigitalObjectمن فضلكم توقفوا الأشخاص الذين ارتكبوا أسوأ الج...[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68m844v2021-03-03T23:58:44Z2021-09-02T20:21:32ZCurator::DigitalObjectDigitalObject海员们 : 要警惕航运事故[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68mb49n2021-03-04T00:06:25Z2021-09-02T20:30:29ZCurator::DigitalObjectDigitalObject人間としての尊厳を保てる : 生活賃金を[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68mc60v2021-03-04T00:10:40Z2021-09-02T20:35:20ZCurator::DigitalObjectDigitalObject野火[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68mc72n2021-03-04T00:11:07Z2021-09-02T20:35:52ZCurator::DigitalObjectDigitalObject野火[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", - "

5 rows × 140 columns

\n", - "
" - ], "text/plain": [ - " type links \\\n", - "0 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", - "1 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", - "2 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", - "3 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", - "4 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", - "\n", - " id system_create_dtsi system_modified_dtsi \\\n", - "0 commonwealth-oai:xp68md23x 2021-03-04T00:13:09Z 2021-09-02T20:40:00Z \n", - "1 commonwealth-oai:xp68m844v 2021-03-03T23:58:44Z 2021-09-02T20:21:32Z \n", - "2 commonwealth-oai:xp68mb49n 2021-03-04T00:06:25Z 2021-09-02T20:30:29Z \n", - "3 commonwealth-oai:xp68mc60v 2021-03-04T00:10:40Z 2021-09-02T20:35:20Z \n", - "4 commonwealth-oai:xp68mc72n 2021-03-04T00:11:07Z 2021-09-02T20:35:52Z \n", - "\n", - " curator_model_ssi curator_model_suffix_ssi \\\n", - "0 Curator::DigitalObject DigitalObject \n", - "1 Curator::DigitalObject DigitalObject \n", - "2 Curator::DigitalObject DigitalObject \n", - "3 Curator::DigitalObject DigitalObject \n", - "4 Curator::DigitalObject DigitalObject \n", - "\n", - " title_info_primary_tsi genre_basic_ssim \\\n", - "0 من فضلكم توقفوا الأشخاص الذين ارتكبوا أسوأ الج... [Posters] \n", - "1 海员们 : 要警惕航运事故 [Posters] \n", - "2 人間としての尊厳を保てる : 生活賃金を [Posters] \n", - "3 野火 [Posters] \n", - "4 野火 [Posters] \n", - "\n", - " genre_specific_ssim ... storage_key_base_ss identifier_issn_ssim \\\n", - "0 [Political posters] ... NaN NaN \n", - "1 [Political posters] ... NaN NaN \n", - "2 [Political posters] ... NaN NaN \n", - "3 [Political posters] ... NaN NaN \n", - "4 [Political posters] ... NaN NaN \n", - "\n", - " frequency_tsi contained_by_ssi note_credits_tsim identifier_isbn_ssim \\\n", - "0 NaN NaN NaN NaN \n", - "1 NaN NaN NaN NaN \n", - "2 NaN NaN NaN NaN \n", - "3 NaN NaN NaN NaN \n", - "4 NaN NaN NaN NaN \n", - "\n", - " identifier_music_publisher_ssim note_arrangement_tsim \\\n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 NaN NaN \n", - "4 NaN NaN \n", - "\n", - " transcription_ark_id_ssi transcription_key_base_ss \n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 NaN NaN \n", - "4 NaN NaN \n", - "\n", - "[5 rows x 140 columns]" + "['海员们', ':', '要警惕航运事故\\nName:', 'title_info_primary_tsi,', 'dtype:', 'object']" ] }, - "execution_count": 13, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_attributes.loc[df_attributes[]]" + "str(df_attributes.loc[df_attributes[\"id\"] == \"commonwealth-oai:xp68m844v\"][\"title_info_primary_tsi\"]).split(\" \")[4:]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "754c22c6-5d35-436a-922a-0f5f6cafa6c9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'1199893 NaN\\nName: title_info_alternative_tsim, dtype: object'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "str(df_attributes.loc[df_attributes[\"id\"] == \"commonwealth:1j92ng13k\"][\"title_info_alternative_tsim\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "09331a9b-135a-46ec-8b0a-70c70ba1c261", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "585812" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_attributes[\"genre_specific_ssim\"].isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": 
"914df792-9343-43fd-83cd-7678e5a56f8c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1303800" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_attributes.shape[0]" + ] + }, + { + "cell_type": "markdown", + "id": "1c6d1c2d-16ad-43a0-844a-72238901c07d", + "metadata": {}, + "source": [ + "### Turn full text into Documents type" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "51f7f3b8-ed7a-43a1-949f-aa43e594af99", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 Poster produced by the International Transport...\n", + "Name: abstract_tsi, dtype: object\n" + ] + } + ], + "source": [ + "print(str(df_attributes.loc[df_attributes[\"id\"] == \"commonwealth-oai:xp68m844v\", \"abstract_tsi\"]))" + ] + }, + { + "cell_type": "markdown", + "id": "b072fe0a-b538-4704-85a0-b6862b0653b6", + "metadata": {}, + "source": [ + "Important Metadata to Embed:\n", + "- title_info_primary_tsi\n", + "- title_info_primary_subtitle_tsi\n", + "- title_info_alternative_tsim\n", + "- abstract_tsi\n", + "- subject_facet_ssim\n", + "- subject_geographic_sim\n", + "- genre_basic_ssim\n", + "- genre_specific_ssim" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "15d83b93-6840-4618-ac40-1c7de9e5cc1e", + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "def get_title(text):\n", + " match = re.search(r'\\d+\\s+(.+?)\\n', text)\n", + "\n", + " # Extracting and printing the title if there's a match\n", + " if match:\n", + " title = match.group(1)\n", + " return title" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6c3f5614-cc45-4fb4-9b13-1fd0361da1b6", + "metadata": {}, + "outputs": [], + "source": [ + "# Turn the BPL data into a Document\n", + "from langchain.schema import Document\n", + "\n", + "documents = []\n", + "\n", + "for doc in full_text:\n", + " # Extract metadata fields and apply get_title()\n", + " title = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"title_info_primary_tsi\"]))\n", + " title_subtitle = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"title_info_primary_subtitle_tsi\"]))\n", + " title_alt = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"title_info_alternative_tsim\"]))\n", + " abstract = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"abstract_tsi\"]))\n", + " subject_facet = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"subject_facet_ssim\"]))\n", + " subject_geographic = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"subject_geographic_sim\"]))\n", + " genre = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"genre_basic_ssim\"]))\n", + " genre_specific = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"genre_specific_ssim\"]))\n", + " name_facet = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"name_facet_ssim\"]))\n", + " name_role = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"name_role_tsim\"]))\n", + " date_human = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"date_tsim\"]))\n", + " date_start = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"date_start_dtsi\"]))\n", + " date_end = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"date_end_dtsi\"]))\n", + " publisher = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"publisher_tsi\"]))\n", + " 
collection_name = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"collection_name_ssim\"]))\n", + " physical_location = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"physical_location_ssim\"]))\n", + " related_item_host = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"related_item_host_ssim\"]))\n", + " type_of_resource = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"type_of_resource_ssim\"]))\n", + " URL = \"https://www.digitalcommonwealth.org/search/\" + get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"id\"]))\n", + " \n", + " # Create Document with metadata\n", + " documents.append(Document(\n", + " page_content=full_text[doc]['text'],\n", + " metadata={\n", + " \"title\": title,\n", + " \"subtitle\": title_subtitle,\n", + " \"title_alt\": title_alt,\n", + " \"abstract\": abstract,\n", + " \"subject_facet\": subject_facet,\n", + " \"subject_geographic\": subject_geographic,\n", + " \"genre\": genre,\n", + " \"genre_specific\": genre_specific,\n", + " \"name_facet\": name_facet,\n", + " \"name_role\": name_role,\n", + " \"date_human\": date_human,\n", + " \"date_start\": date_start,\n", + " \"date_end\": date_end,\n", + " \"publisher\": publisher,\n", + " \"collection_name\": collection_name,\n", + " \"physical_location\": physical_location,\n", + " \"related_item_host\": related_item_host,\n", + " \"type_of_resource\": type_of_resource,\n", + " \"URL\": URL\n", + " }\n", + " ))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "b58b4530-27e4-4ed4-80be-4ee240892480", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['title', 'subtitle', 'title_alt', 'abstract', 'subject_facet', 'subject_geographic', 'genre', 'genre_specific', 'name_facet', 'name_role', 'date_human', 'date_start', 'date_end', 'publisher', 'collection_name', 'physical_location', 'related_item_host', 'type_of_resource', 'URL'])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents[-1].metadata.keys()" + ] + }, + { + "cell_type": "markdown", + "id": "6580d4c1-9ab0-44c8-9ea7-cbeb80934f4b", + "metadata": {}, + "source": [ + "# RAG Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "648466aa-3142-4ece-aa02-4454c7f6ee41", + "metadata": {}, + "outputs": [], + "source": [ + "# set openai api key\n", + "import os\n", + "os.environ[\"OPENAI_API_KEY\"] = \"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\"" + ] + }, + { + "cell_type": "markdown", + "id": "895258be-7bf4-4d8d-8f5a-79d80dfe8c36", + "metadata": {}, + "source": [ + "### Using FAISS Vector Store" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "1b537ce7-2eb2-4392-bc71-5a33ede503df", + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install langchain-ollama" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "c1b3815e-dec5-4f22-b863-d91a57b5ccad", + "metadata": {}, + "outputs": [], + "source": [ + "#from langchain_ollama import OllamaEmbeddings\n", + "from langchain_openai import OpenAIEmbeddings\n", + "import faiss\n", + "from langchain_community.docstore.in_memory import InMemoryDocstore\n", + "from langchain_community.vectorstores import FAISS" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "03554a37-d142-45eb-be33-a333929b927d", + "metadata": {}, + "outputs": 
[], + "source": [ + "#embeddings = OpenAIEmbeddings(model=\"text-embedding-3-large\", dimensions=3072)\n", + "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "0bea78d8-4b72-4da0-a50d-89e0fe509454", + "metadata": {}, + "outputs": [], + "source": [ + "index = faiss.IndexFlatL2(len(embeddings.embed_query(\"hello world\")))\n", + "\n", + "vector_store = FAISS(\n", + " embedding_function=embeddings,\n", + " index=index,\n", + " docstore=InMemoryDocstore(),\n", + " index_to_docstore_id={},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "b1bee292-778a-480e-8eb3-5cc37587ce85", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['68762a4a-dbcb-411e-9047-ed04e44a794a',\n", + " 'f622dbf6-aee1-43e4-aba1-59fde531a7ad',\n", + " '927c3752-7746-4c88-a58b-e01a6547d857',\n", + " '70bf52c8-b79e-459b-9aa0-c1f12e12d842',\n", + " 'ade0d10f-c301-4ac1-99b8-6e26562ac259',\n", + " '01721bef-99a0-4034-8bc0-655ebec123d1',\n", + " '5c0969cd-a297-4b3f-b73e-9b3e5ed60c1e',\n", + " 'f707ab88-6b09-48fc-94ed-93ac650931a2',\n", + " 'bd269b63-b143-4535-83f8-27515883647f',\n", + " '5b0f71e5-520e-4090-8aa7-fc4f152f42f3',\n", + " 'a97737a2-ba3c-4a6f-b169-967ea2856f57',\n", + " '5e9a4add-7aae-45c1-a071-bd8510e81829',\n", + " '53b49886-6e08-4c6e-b69d-d12fa9accc12',\n", + " 'cdb37b1a-8ff1-462c-8052-90b8681d1700',\n", + " '44eca254-78b8-4ce5-aa31-ee02b8895734',\n", + " '15cb7183-2758-468a-88d0-35348ccf357a',\n", + " 'fd3e2244-e666-4d73-9cac-d980ce9b49ee',\n", + " 'b48501c1-c52d-4cbf-bcc0-2b669017736c',\n", + " '3eda50a4-0269-44ea-8594-e1ed21662dc1',\n", + " 'd6d96535-021c-44c8-89c8-796ebd3ebc0d',\n", + " 'e20dfb8b-7e99-4299-82ae-b69c4b2aa5e8',\n", + " '89cbeb0c-253a-4c01-aba0-b3ef82622381',\n", + " 'fa325062-a886-499d-8377-2feedc5a8262',\n", + " '379080c0-03f9-4ff7-89b4-be8edcd7be96',\n", + " '06ff8bdc-975a-4ec2-8273-e2ca4f489df4',\n", + " '2ab5e13c-9051-4022-a476-92a97b05c5c1',\n", + " '7e144845-3350-4871-9404-fa7e3f734b78',\n", + " '2de7c819-546d-4470-8f0a-b91a7903b0ed',\n", + " 'a05149dd-2b01-4f03-9cd0-01fcea8780d2',\n", + " 'd9d462c0-d928-49cc-9172-7152328e4d51',\n", + " '8edb060d-8959-418d-9470-5da965ad2b9d',\n", + " '19d25b97-34b0-4f53-9ec3-4060239242dc',\n", + " '49a6e2ef-7f85-4a3a-98d6-3b3dd8b4c3b6',\n", + " '54913f79-d38b-40b3-b251-86c59721fa3d',\n", + " 'e87af048-11d2-4977-b4aa-6465d5017fba',\n", + " 'de44abe6-739b-44c5-806b-06d52aa1e56b',\n", + " '82898991-0951-40f6-8926-309db4a807b7',\n", + " '02c3a940-74db-4d60-a173-ca9bbd702cd0',\n", + " '20f881bb-7d3b-4f49-82a3-5ac8e7578322',\n", + " '141d3d1e-6d5a-4dd9-95bb-4ae07c81e39c',\n", + " 'b8b415f5-670b-4b56-b1ce-fe424769a182',\n", + " '55de7042-5452-4ea2-820a-6b35a594cbf4',\n", + " '404565f4-ee23-423f-a4d8-550b6f8b6e41',\n", + " '300e8b0b-2b5a-4618-aec7-8d5324de106f',\n", + " 'cab508a6-25f1-43ca-9f46-835af4e922af',\n", + " '2d370174-8c8f-418b-a083-84f52cfe9a36',\n", + " '65ff5f75-cc47-4610-999d-9a11f9894b32',\n", + " '3995b720-99e0-4f68-a740-46e7f2e5c59f',\n", + " '40af7846-ba78-48eb-83e0-6b6b844f8c79',\n", + " 'c1f51576-1e99-4f77-be03-ce202f238dbe',\n", + " 'ec7d4da2-e7a5-40d9-9e67-a37c12e03bf9',\n", + " '270222bf-57d7-4cfe-a268-1f6167ff6d7e',\n", + " '9a8373ee-c2db-4a1d-b5ac-6a04c0d2becd',\n", + " '4504f7c3-6820-49c7-be23-43d9b8ba58aa',\n", + " 'b181eab2-587c-4de7-92a4-6f8b0f3f2b00',\n", + " '7e889a1d-c1b8-4c53-af90-de3eaa6db021',\n", + " '6007601e-7b9d-4064-b16c-655169e9d72a',\n", + " 
'11e280e2-dfce-4a52-8867-296155f78eb8',\n", + " '821de396-47f8-4606-934c-13c1bf884473',\n", + " '9b7e6753-61d1-446d-8a2b-344b0e41f84c',\n", + " '5c2d20de-9344-441a-a1d8-0ca0f4b1fca4',\n", + " 'aee572eb-ec36-4f61-b35e-9dd2c6ab36f2',\n", + " '94ac76ef-42f8-4a03-bffe-f86cbab72408',\n", + " '1be0d43f-ee71-419c-ba50-3aaf024d27c5',\n", + " 'bfd48d92-f28e-48f1-815b-756642cf2b8e',\n", + " '8023464e-5507-4192-ac25-f06fb585e68d',\n", + " '75bde2f0-1b5f-4e05-aa8a-ff2d315a06b4',\n", + " '0b34222f-1dda-4d19-9810-11e75d1272e5',\n", + " '97500c01-1c8f-42ef-8efd-19223c68c46a',\n", + " 'cf8c5ea5-d750-4641-bb5b-e971494b7666',\n", + " '3fd5a660-ab78-4897-a700-1a73b533b0cc',\n", + " '17368981-b502-45e1-abaa-4a9b09c096a8',\n", + " 'de201d3e-8d95-41a6-b8ac-4afba7909331',\n", + " '192f929f-1bba-4e75-b05e-aa8ae937a092',\n", + " '86b39447-2fb0-4899-b1c1-6a89e6bfee0f',\n", + " 'ff5d3a8e-4da6-47f0-b573-036670c5a936',\n", + " 'ef0a7589-2792-43b9-9895-b40d175e5ff9',\n", + " '97338842-0c45-4e94-ac34-81dc74d259fd',\n", + " 'f24aca1a-8ea3-40fc-a868-c300bd6c7c5a',\n", + " '41af6f16-89f8-46c5-89ed-a0853385c24c',\n", + " 'a2f0344b-6ca1-4c2c-ac08-de839b7a61cd',\n", + " '8d23a995-124f-40c5-820a-b86eacb7589f',\n", + " '8846fe55-3eda-4af1-aee1-4c8faa9a5c73',\n", + " 'ae4e13a7-db9f-43d6-b27b-913db7f23b48',\n", + " '05846bed-e2ba-4b45-84eb-aef0fbdb1052',\n", + " 'ac778fa3-e3b2-42e3-8152-c63701895dfe',\n", + " 'c697142b-3c95-440d-b61a-7183c08c171d',\n", + " 'caa1e83f-ae87-4704-9f2e-d1dad34a9cd2',\n", + " '43847c84-8fe5-4457-9f82-526695e2e97a',\n", + " '5521bbc6-cdff-4703-bf3e-63e01596d21c',\n", + " '3c275f68-8f64-452b-bece-3949e2c25b22',\n", + " 'bf581ede-0913-424d-8a2d-216ae9cadcb0',\n", + " '117e9f7d-7c8b-4366-9f84-fbd5dd1171dd',\n", + " 'c116254f-1415-4188-bc93-0487bcbe47ee',\n", + " '40fac159-6763-4a30-addb-975a6e4e69a3',\n", + " '7a181599-2847-4d9b-957f-f4fc7468aea4',\n", + " '4df69e1f-2e1b-4be0-a161-a374f493c4cb',\n", + " '37813f0f-33a6-4adc-ac55-cbca0b74471f',\n", + " 'b0dbb8ef-e049-4393-a159-2e0f9081b0a3',\n", + " '57da8d7d-e0fc-4868-938d-a85f6d6663ff',\n", + " '618dad07-a666-4827-b188-55d5bf31aed0',\n", + " '3e2724b7-4f24-42fd-9def-ca0f9edcd615',\n", + " '41bf1d9b-da94-4e8f-9805-18bf93ff1100',\n", + " 'c5c85271-1e66-4dd2-a338-a2b48a9b07e4',\n", + " '80646915-6af3-423c-a2de-71786272f086',\n", + " 'bf824464-3eae-4343-b0ab-5b565445daec',\n", + " 'd3127b71-3763-4532-8863-0ce42170dfdb',\n", + " '700f2cb1-85bd-4d28-b95f-68f4536185c1',\n", + " 'f79d14f8-9eaa-4f3b-af7c-62095dae4497',\n", + " '196c8830-7cea-485e-a61c-af5863922733',\n", + " '989f5923-c37f-4a18-b61f-3e0b912f01d2',\n", + " '36ea71ca-36e3-4fcd-bdb9-0f41a83378a3',\n", + " '3567b36a-f249-4ab2-9392-a734303ce5c1',\n", + " '569ed9cf-6b14-4822-892c-4d19205490f2',\n", + " '7db05e1e-0406-4d9f-bdde-e7351932c25e',\n", + " '818a69ab-de3d-42f4-92d0-2accb701ef8a',\n", + " '96fc14b8-4930-4177-b20c-dba396297577',\n", + " 'ecca8db2-b629-4f05-8018-4aa99dc26842',\n", + " '2cc41514-6e02-4cba-bbb4-c2a69454559b',\n", + " 'b41fa3b0-ea8e-40e5-a0be-1abd27cbc23e',\n", + " '1254e098-a9dc-4eba-a0b9-929ed848ed91',\n", + " '1a9ea687-3318-4ad7-a55a-2e0044583667',\n", + " 'ff5a57dd-69a9-4d9a-9d26-56b9271f1b8e',\n", + " '593a9427-03ae-4b3b-9215-2d14f10341f3',\n", + " '5903b7e0-489f-4011-bbd7-cac82b19423a',\n", + " '48072b10-faae-4609-a298-5c6193488ed9',\n", + " 'ec7cd14c-1248-4285-a090-fc8d0ce96fd4',\n", + " 'a9c6016b-b100-4e46-8b42-a232b9be6459',\n", + " '7e851d53-1a48-4289-8b0a-27834e3e044d',\n", + " '77dde80d-7b70-4ab6-b042-c1e67932f36a',\n", + " '4642025a-d842-4fb3-83ef-76ce90a6a2bb',\n", + " 
'256c37b7-38b7-4642-a501-7a611e0763ac',\n", + " '65962070-4375-4479-872b-fc3300c3f1af']" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from uuid import uuid4\n", + "\n", + "uuids = [str(uuid4()) for _ in range(len(documents))]\n", + "\n", + "vector_store.add_documents(documents=documents, ids=uuids)" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "id": "0c3a0a6b-4640-4036-863e-77a1a715a0e3", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['https://www.digitalcommonwealth.org/search/commonwealth:dv144791c', 'https://www.digitalcommonwealth.org/search/commonwealth:rv048f292', 'https://www.digitalcommonwealth.org/search/commonwealth:9019vm69m', 'https://www.digitalcommonwealth.org/search/commonwealth:dv1441451', 'https://www.digitalcommonwealth.org/search/commonwealth:w3764472d', 'https://www.digitalcommonwealth.org/search/commonwealth:6w924s12f', 'https://www.digitalcommonwealth.org/search/commonwealth:6w924r91w', 'https://www.digitalcommonwealth.org/search/commonwealth:wm118g867']\n" + ] + } + ], + "source": [ + "retriever = vector_store.similarity_search_with_score(\n", + " \"Manuscripts\",\n", + " k=3\n", + ")\n", + "r = retriever.invoke(\"John Bishop Estlin\")\n", + "r_list = [x.metadata[\"URL\"] for x in r]\n", + "print(r_list)" + ] + }, + { + "cell_type": "markdown", + "id": "d3a257cc-c687-4fc5-91a7-0c3048a49e70", + "metadata": {}, + "source": [ + "### Now for the Reranking Step:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "013d5fca-c621-4a7e-ac0e-ad5c9aba577f", + "metadata": {}, + "outputs": [], + "source": [ + "weights2 = {\n", + " \"title\": 1.0,\n", + " \"subtitle\": 0.95,\n", + " \"title_alt\": 0.9,\n", + " \"abstract\": 0.85,\n", + " \"subject_facet\": 0.8,\n", + " \"subject_geographic\": 0.75,\n", + " \"genre\": 0.7,\n", + " \"genre_specific\": 0.65,\n", + " \"name_facet\": 0.6,\n", + " \"name_role\": 0.55,\n", + " \"date_human\": 0.5,\n", + " \"date_start\": 0.45,\n", + " \"date_end\": 0.4,\n", + " \"publisher\": 0.35,\n", + " \"collection_name\": 0.3,\n", + " \"physical_location\": 0.25,\n", + " \"related_item_host\": 0.2,\n", + " \"type_of_resource\": 0.15,\n", + " \"URL\": 0.1\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "cc1f04d1-f553-46ae-bfc6-248125f62423", + "metadata": {}, + "outputs": [], + "source": [ + "from sentence_transformers import SentenceTransformer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "import numpy as np\n", + "from langchain.embeddings import HuggingFaceEmbeddings\n", + "\n", + "# our vector store:\n", + "\n", + "# embedding model\n", + "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", + "\n", + "def compute_relevance_score(metadata_value, query):\n", + " \"\"\"\n", + " Compute cosine similarity between the query and a metadata value using sentence-transformers.\n", + "\n", + " Args:\n", + " metadata_value (str): The metadata value to compare.\n", + " query (str): The query string.\n", + "\n", + " Returns:\n", + " float: Cosine similarity score (between 0 and 1).\n", + " \"\"\"\n", + " if not metadata_value or not query:\n", + " return 0 # Return 0 if either the metadata or query is empty\n", + " \n", + " # Encode the metadata value and query into embeddings\n", + " embeddings = model.encode([metadata_value, query], convert_to_tensor=False) # Convert to NumPy\n", + " 
metadata_embedding, query_embedding = embeddings\n", + "\n", + " # Compute cosine similarity\n", + " similarity = cosine_similarity([metadata_embedding], [query_embedding])\n", + " return similarity[0][0] # Extract the scalar similarity value\n", + "\n", + "\n", + "\n", + "def rerank_documents(documents, query, weights, vector_store, k=10):\n", + " \"\"\"\n", + " Rerank documents based on metadata relevance scores and FAISS vector similarity scores.\n", + "\n", + " Args:\n", + " documents (list): List of Document objects.\n", + " query (str): The query string used for retrieval.\n", + " weights (dict): Weights for each metadata field.\n", + " vector_store (str): The vector store itself to get the similarity score\n", + "\n", + " Returns:\n", + " list: Reranked documents in descending order of relevance.\n", + " \"\"\"\n", + "\n", + " final_score = 0\n", + "\n", + " reranked_results = []\n", + " returned_docs = vector_store.similarity_search_with_score(query, k)\n", + " for doc in returned_docs:\n", + " final_score = doc[1]\n", + " # Add weighted relevance scores for each metadata field\n", + " for field, weight in weights.items():\n", + " metadata_value = doc[0].metadata.get(field, \"\") # Safely get metadata field value\n", + " relevance_score = compute_relevance_score(metadata_value, query)\n", + " final_score += weight * relevance_score\n", + "\n", + " reranked_results.append((doc, final_score))\n", + "\n", + " # Sort documents by the final score in descending order\n", + " reranked_results.sort(key=lambda x: x[1], reverse=True)\n", + " return [doc for doc, score in reranked_results]\n", + "\n", + "\n", + "docs = rerank_documents(documents, \"Newspaper\", weights2, vector_store)" ] }, { "cell_type": "code", - "execution_count": 3, - "id": "49e1b410-a7b0-4ef3-bbb8-2c240e568a73", + "execution_count": 78, + "id": "2d9172aa-6c15-4c90-856e-d0ee53100721", "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/ipykernel_26237/3787498575.py:1: DtypeWarning: Columns (10,16,17,18,20,21,22,23,24,25,27,29,33,34,36,41,42,43,44,54,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,128,129,130,131,132,133,134,135,136,137,138,139,140) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", - " df_attributes = pd.read_csv(\"metadata_attributes.csv\")\n" + "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:gf06jp23d', 'Reranked score: 1.1741459369659424')\n", + "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:k356cp803', 'Reranked score: 1.161521077156067')\n", + "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:t435k083s', 'Reranked score: 1.1445826292037964')\n", + "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:8s45sw212', 'Reranked score: 1.1416451930999756')\n", + "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:9p292v62n', 'Reranked score: 1.1416230201721191')\n", + "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:z890vf594', 'Reranked score: 1.1343271732330322')\n", + "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:pv63jm38v', 'Reranked score: 1.0997507572174072')\n", + "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:05744b168', 'Reranked score: 1.0604684352874756')\n", + "(\"Thomas's Massachusetts Spy, or, Worcester Gazette\", 'https://www.digitalcommonwealth.org/search/commonwealth:v405x072q', 'Reranked score: 1.0452649593353271')\n", + "(\"Thomas's Massachusetts Spy, or, Worcester Gazette\", 'https://www.digitalcommonwealth.org/search/commonwealth:1831h918x', 'Reranked score: 1.024101972579956')\n" ] } ], "source": [ - "df_attributes = pd.read_csv(\"metadata_attributes.csv\")" + "#print([docs[i].metadata['title'] for i in range(len(docs))])\n", + "docs_list = [(docs[i][0].metadata['title'], docs[i][0].metadata['URL'], f\"Reranked score: {docs[i][1]}\") for i in range(len(docs))]\n", + "docs_list.sort(key=lambda x: x[2], reverse=True)\n", + "for doc in docs_list:\n", + " print(doc)" ] }, { - "cell_type": "code", - "execution_count": 16, - "id": "7b269a56-1e1a-4cc7-8dbf-a37ecd6222ac", + "cell_type": "markdown", + "id": "07cdf844-72c6-41ef-bade-9afb52bceed8", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "type 1303800\n", - "attributes 1303800\n", - "links 1303800\n", - "dtype: int64" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "df.count()" + "Immediately we get much better performance because now only Newspapers are returned." 
] }, { - "cell_type": "code", - "execution_count": 46, - "id": "d819646f-51cc-4817-9542-ecfc9ea4af33", + "cell_type": "markdown", + "id": "18719878-92c6-458c-ae81-21d9fe5f0bd8", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['海员们', ':', '要警惕航运事故\\nName:', 'title_info_primary_tsi,', 'dtype:', 'object']" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "str(df_attributes.loc[df_attributes[\"id\"] == \"commonwealth-oai:xp68m844v\"][\"title_info_primary_tsi\"]).split(\" \")[4:]" + "# Implementing Different Vector Store and Embedding Combos" ] }, { "cell_type": "markdown", - "id": "1c6d1c2d-16ad-43a0-844a-72238901c07d", + "id": "beaf9e61-3bea-4c31-9710-532a306d1023", "metadata": {}, "source": [ - "### Turn full text into Documents type" + "### Pinecone Vector Store w/OLlama Embeddings" ] }, { "cell_type": "code", - "execution_count": 18, - "id": "51f7f3b8-ed7a-43a1-949f-aa43e594af99", + "execution_count": 10, + "id": "28f8e253-cf14-4b37-8e54-bf114514ac60", + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install langchain-pinecone pinecone-notebooks\n", + "#!pip install pinecone-client" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ab3fb46d-267e-4815-b823-e978d4bf3edf", "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stdin", "output_type": "stream", "text": [ - "1 Poster produced by the International Transport...\n", - "Name: abstract_tsi, dtype: object\n" + "Enter your Pinecone API key: ········\n" ] } ], "source": [ - "print(str(df_attributes.loc[df_attributes[\"id\"] == \"commonwealth-oai:xp68m844v\", \"abstract_tsi\"]))" + "import getpass\n", + "import os\n", + "import time\n", + "\n", + "from pinecone import Pinecone, ServerlessSpec\n", + "\n", + "if not os.getenv(\"PINECONE_API_KEY\"):\n", + " os.environ[\"PINECONE_API_KEY\"] = getpass.getpass(\"Enter your Pinecone API key: \")\n", + "\n", + "pinecone_api_key = os.environ.get(\"PINECONE_API_KEY\")\n", + "\n", + "pc = Pinecone(api_key=pinecone_api_key)" ] }, { "cell_type": "code", - "execution_count": 19, - "id": "65381d50-bcba-4a6e-8347-5d8b7e4de002", + "execution_count": 20, + "id": "0a6c480d-f2cd-4732-943a-8cd9d66417e2", "metadata": {}, "outputs": [], "source": [ - "#df_attributes.columns.tolist()" + "# setting up the index name\n", + "import time\n", + "\n", + "index_name = \"librag1\" # change if desired\n", + "\n", + "existing_indexes = [index_info[\"name\"] for index_info in pc.list_indexes()]\n", + "\n", + "if index_name not in existing_indexes:\n", + " pc.create_index(\n", + " name=index_name,\n", + " dimension=768,\n", + " metric=\"cosine\",\n", + " spec=ServerlessSpec(cloud=\"aws\", region=\"us-east-1\"),\n", + " )\n", + " while not pc.describe_index(index_name).status[\"ready\"]:\n", + " time.sleep(1)\n", + "\n", + "index = pc.Index(index_name)" ] }, { "cell_type": "code", - "execution_count": 65, - "id": "cd65068e-ac92-4e97-8363-4ee6e9bfbc0a", + "execution_count": 12, + "id": "5c791f22-4909-447f-a1fe-ebc09a9bbe11", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
typelinksidsystem_create_dtsisystem_modified_dtsicurator_model_ssicurator_model_suffix_ssititle_info_primary_tsigenre_basic_ssimgenre_specific_ssim...storage_key_base_ssidentifier_issn_ssimfrequency_tsicontained_by_ssinote_credits_tsimidentifier_isbn_ssimidentifier_music_publisher_ssimnote_arrangement_tsimtranscription_ark_id_ssitranscription_key_base_ss
\n", - "

0 rows × 140 columns

\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [type, links, id, system_create_dtsi, system_modified_dtsi, curator_model_ssi, curator_model_suffix_ssi, title_info_primary_tsi, genre_basic_ssim, genre_specific_ssim, date_tsim, date_type_ssm, date_edtf_ssm, date_start_dtsi, date_end_dtsi, name_tsim, name_role_tsim, name_facet_ssim, related_item_host_ssim, subject_topic_tsim, subject_facet_ssim, subject_coordinates_geospatial, subject_point_geospatial, subject_geojson_facet_ssim, subject_hiergeo_geojson_ssm, physical_location_ssim, identifier_local_other_tsim, identifier_uri_ss, identifier_uri_preview_ss, rights_ss, license_ss, reuse_allowed_ssi, extent_tsi, abstract_tsi, type_of_resource_ssim, lang_term_ssim, publishing_state_ssi, processing_state_ssi, destination_site_ssim, hosting_status_ssi, harvesting_status_bsi, oai_header_id_ssi, exemplary_image_ssi, exemplary_image_key_base_ss, admin_set_name_ssi, admin_set_ark_id_ssi, institution_name_ssi, institution_ark_id_ssi, collection_name_ssim, collection_ark_id_ssim, filenames_ssim, _version_, timestamp, subject_geographic_sim, date_facet_yearly_itim, score, sub_location_tsi, identifier_local_accession_tsim, note_date_tsim, note_tsim, license_uri_ss, digital_origin_ssi, publisher_tsi, pubplace_tsi, title_info_translated_tsim, note_language_tsim, title_info_primary_subtitle_tsi, subject_name_tsim, note_resp_tsim, rightsstatement_ss, rightsstatement_uri_ss, title_info_primary_trans_tsim, subject_geo_other_ssm, note_physical_tsim, identifier_iiif_manifest_ss, note_acquisition_tsim, title_info_alternative_tsim, identifier_local_barcode_tsim, note_reference_tsim, related_item_series_ssi, identifier_local_call_tsim, related_item_subseries_ssi, related_item_subsubseries_ssi, shelf_locator_tsi, identifier_local_call_invalid_tsim, note_biographical_tsim, note_citation_tsim, edition_name_tsi, note_ownership_tsim, note_publications_tsim, identifier_local_other_invalid_tsim, note_funding_tsim, subject_title_tsim, title_info_partnum_tsi, resource_type_manuscript_bsi, issuance_tsi, scale_tsim, flagged_content_ssi, note_bibliography_tsim, title_info_uniform_tsim, ...]\n", - "Index: []\n", - "\n", - "[0 rows x 140 columns]" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "df_attributes.loc[df_attributes[\"genre_basic_ssim\"] == \"[Posters]\"]" + "from langchain_ollama import OllamaEmbeddings\n", + "from langchain.embeddings import HuggingFaceEmbeddings" ] }, { "cell_type": "code", - "execution_count": 7, - "id": "15d83b93-6840-4618-ac40-1c7de9e5cc1e", + "execution_count": 13, + "id": "4aec6567-7950-4de2-8600-fe987f47a24a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/ipykernel_2752/1630880338.py:5: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-huggingface package and should be used instead. 
To use it run `pip install -U :class:`~langchain-huggingface` and import as `from :class:`~langchain_huggingface import HuggingFaceEmbeddings``.\n", + " embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", + "2024-11-14 14:26:19.674480: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + ] + } + ], "source": [ - "import re\n", - "def get_title(text):\n", - " match = re.search(r'\\d+\\s+(.+?)\\n', text)\n", + "# embeddings = OllamaEmbeddings(\n", + "# model=\"llama3\",\n", + "# )\n", "\n", - " # Extracting and printing the title if there's a match\n", - " if match:\n", - " title = match.group(1)\n", - " return title" + "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "6c3f5614-cc45-4fb4-9b13-1fd0361da1b6", + "execution_count": 21, + "id": "f1ad1f25-69d2-4eb1-9f85-c7b0ccf13a53", "metadata": {}, "outputs": [], "source": [ - "# Turn the BPL data into a Document\n", - "from langchain.schema import Document\n", - "documents = []\n", - "for doc in full_text:\n", - " title = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"title_info_primary_tsi\"]))\n", - " ID = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"id\"]))\n", - " abstract = str(df_attributes.loc[df_attributes[\"id\"] == doc, \"abstract_tsi\"])\n", - " title_subtitle = str(df_attributes.loc[df_attributes[\"id\"] == doc, \"title_info_primary_subtitle_tsi\"])\n", - " documents += [Document(page_content=full_text[doc]['text'], metadata={\"title\": title, \"abstract\": abstract, \"subtitle\": title_subtitle, \"ID\":ID})]\n", - " #documents += [Document(page_content=full_text[doc]['text'])]\n" + "from langchain_pinecone import PineconeVectorStore\n", + "vector_store = PineconeVectorStore(index=index, embedding=embeddings)" ] }, { "cell_type": "code", - "execution_count": 9, - "id": "b58b4530-27e4-4ed4-80be-4ee240892480", + "execution_count": 43, + "id": "2a4b12ac-87bd-4ae3-90cc-4f2c6644256a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'title': 'T.G. 
Hudson, Oglethorpe, Ga., autograph letter...',\n", - " 'abstract': '1161475 Asks market price at Charleston.\\nName: abstract_tsi, dtype: object',\n", - " 'subtitle': '1161475 NaN\\nName: title_info_primary_subtitle_tsi, dtype: object',\n", - " 'ID': 'commonwealth:9k41zk460'}" + "'1165601 Terms for disposal of woman.\\nName: abstract_tsi, dtype: object'" ] }, - "execution_count": 9, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "documents[0].metadata" - ] - }, - { - "cell_type": "code", - "execution_count": 144, - "id": "83fdead1-b760-4766-a880-64c6df5d0cd2", - "metadata": {}, - "outputs": [], - "source": [ - "# retriever = ToyRetriever(documents=documents, k=1)\n", - "# retriever.invoke(\"Richmond\")" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "35c69e08-181c-42aa-a1e7-5e0846053503", - "metadata": {}, - "outputs": [], - "source": [ - "# import openai" + "documents[18].metadata['abstract']" ] }, { "cell_type": "code", - "execution_count": 22, - "id": "20d5d467-7d3f-4c34-b0d6-ab2d19471d9b", + "execution_count": 16, + "id": "e80decf8-6ee5-48a2-bb39-5ea97ceaf7e2", "metadata": {}, "outputs": [], "source": [ - "from langchain_community.document_loaders import TextLoader\n", - "from langchain_community.vectorstores import FAISS\n", - "from langchain_openai import OpenAIEmbeddings\n", - "from langchain_text_splitters import CharacterTextSplitter" + "from uuid import uuid4\n", + "uuids = [str(uuid4()) for _ in range(len(documents))]" ] }, { "cell_type": "code", - "execution_count": 23, - "id": "9a395f8b-82eb-4e01-bbbc-3852524444c4", + "execution_count": 44, + "id": "18bcd3e7-07f6-4a66-a629-126ffe340966", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "PineconeApiException", + "evalue": "(400)\nReason: Bad Request\nHTTP response headers: HTTPHeaderDict({'Date': 'Thu, 14 Nov 2024 19:37:34 GMT', 'Content-Type': 'application/json', 'Content-Length': '116', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '155', 'x-pinecone-request-id': '2161036493328920558', 'x-envoy-upstream-service-time': '37', 'server': 'envoy'})\nHTTP response body: {\"code\":3,\"message\":\"Metadata size is 869788 bytes, which exceeds the limit of 40960 bytes per vector\",\"details\":[]}\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mPineconeApiException\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[44], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m vector_store\u001b[38;5;241m.\u001b[39madd_documents(documents\u001b[38;5;241m=\u001b[39mdocuments[\u001b[38;5;241m0\u001b[39m:\u001b[38;5;241m15\u001b[39m], ids\u001b[38;5;241m=\u001b[39muuids)\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/langchain_core/vectorstores/base.py:287\u001b[0m, in \u001b[0;36mVectorStore.add_documents\u001b[0;34m(self, documents, **kwargs)\u001b[0m\n\u001b[1;32m 285\u001b[0m texts \u001b[38;5;241m=\u001b[39m [doc\u001b[38;5;241m.\u001b[39mpage_content \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m documents]\n\u001b[1;32m 286\u001b[0m metadatas \u001b[38;5;241m=\u001b[39m [doc\u001b[38;5;241m.\u001b[39mmetadata \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m documents]\n\u001b[0;32m--> 287\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39madd_texts(texts, metadatas, 
\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 288\u001b[0m msg \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 289\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m`add_documents` and `add_texts` has not been implemented \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 290\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfor \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 291\u001b[0m )\n\u001b[1;32m 292\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(msg)\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/langchain_pinecone/vectorstores.py:292\u001b[0m, in \u001b[0;36mPineconeVectorStore.add_texts\u001b[0;34m(self, texts, metadatas, ids, namespace, batch_size, embedding_chunk_size, async_req, id_prefix, **kwargs)\u001b[0m\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m async_req:\n\u001b[1;32m 282\u001b[0m \u001b[38;5;66;03m# Runs the pinecone upsert asynchronously.\u001b[39;00m\n\u001b[1;32m 283\u001b[0m async_res \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 284\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_index\u001b[38;5;241m.\u001b[39mupsert(\n\u001b[1;32m 285\u001b[0m vectors\u001b[38;5;241m=\u001b[39mbatch_vector_tuples,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 290\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m batch_vector_tuples \u001b[38;5;129;01min\u001b[39;00m batch_iterate(batch_size, vector_tuples)\n\u001b[1;32m 291\u001b[0m ]\n\u001b[0;32m--> 292\u001b[0m [res\u001b[38;5;241m.\u001b[39mget() \u001b[38;5;28;01mfor\u001b[39;00m res \u001b[38;5;129;01min\u001b[39;00m async_res]\n\u001b[1;32m 293\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 294\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_index\u001b[38;5;241m.\u001b[39mupsert(\n\u001b[1;32m 295\u001b[0m vectors\u001b[38;5;241m=\u001b[39mvector_tuples,\n\u001b[1;32m 296\u001b[0m namespace\u001b[38;5;241m=\u001b[39mnamespace,\n\u001b[1;32m 297\u001b[0m async_req\u001b[38;5;241m=\u001b[39masync_req,\n\u001b[1;32m 298\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 299\u001b[0m )\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/langchain_pinecone/vectorstores.py:292\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m async_req:\n\u001b[1;32m 282\u001b[0m \u001b[38;5;66;03m# Runs the pinecone upsert asynchronously.\u001b[39;00m\n\u001b[1;32m 283\u001b[0m async_res \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 284\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_index\u001b[38;5;241m.\u001b[39mupsert(\n\u001b[1;32m 285\u001b[0m vectors\u001b[38;5;241m=\u001b[39mbatch_vector_tuples,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 290\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m batch_vector_tuples \u001b[38;5;129;01min\u001b[39;00m batch_iterate(batch_size, vector_tuples)\n\u001b[1;32m 291\u001b[0m ]\n\u001b[0;32m--> 292\u001b[0m [res\u001b[38;5;241m.\u001b[39mget() \u001b[38;5;28;01mfor\u001b[39;00m res \u001b[38;5;129;01min\u001b[39;00m async_res]\n\u001b[1;32m 293\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 294\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_index\u001b[38;5;241m.\u001b[39mupsert(\n\u001b[1;32m 295\u001b[0m vectors\u001b[38;5;241m=\u001b[39mvector_tuples,\n\u001b[1;32m 296\u001b[0m namespace\u001b[38;5;241m=\u001b[39mnamespace,\n\u001b[1;32m 297\u001b[0m async_req\u001b[38;5;241m=\u001b[39masync_req,\n\u001b[1;32m 298\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 299\u001b[0m )\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/multiprocessing/pool.py:774\u001b[0m, in \u001b[0;36mApplyResult.get\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 772\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_value\n\u001b[1;32m 773\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 774\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_value\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/multiprocessing/pool.py:125\u001b[0m, in \u001b[0;36mworker\u001b[0;34m(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)\u001b[0m\n\u001b[1;32m 123\u001b[0m job, i, func, args, kwds \u001b[38;5;241m=\u001b[39m task\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 125\u001b[0m result \u001b[38;5;241m=\u001b[39m (\u001b[38;5;28;01mTrue\u001b[39;00m, func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds))\n\u001b[1;32m 126\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 127\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m wrap_exception \u001b[38;5;129;01mand\u001b[39;00m func \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m _helper_reraises_exception:\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pinecone/core/openapi/shared/api_client.py:187\u001b[0m, in \u001b[0;36mApiClient.__call_api\u001b[0;34m(self, resource_path, method, path_params, query_params, header_params, body, post_params, files, response_type, auth_settings, _return_http_data_only, collection_formats, _preload_content, _request_timeout, _host, _check_type)\u001b[0m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m PineconeApiException \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 186\u001b[0m e\u001b[38;5;241m.\u001b[39mbody \u001b[38;5;241m=\u001b[39m e\u001b[38;5;241m.\u001b[39mbody\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 187\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[1;32m 189\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlast_response \u001b[38;5;241m=\u001b[39m response_data\n\u001b[1;32m 191\u001b[0m return_data \u001b[38;5;241m=\u001b[39m response_data\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pinecone/core/openapi/shared/api_client.py:175\u001b[0m, in \u001b[0;36mApiClient.__call_api\u001b[0;34m(self, resource_path, method, path_params, query_params, header_params, body, post_params, files, response_type, auth_settings, _return_http_data_only, collection_formats, _preload_content, _request_timeout, _host, _check_type)\u001b[0m\n\u001b[1;32m 171\u001b[0m url \u001b[38;5;241m=\u001b[39m _host \u001b[38;5;241m+\u001b[39m resource_path\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 174\u001b[0m \u001b[38;5;66;03m# perform request and return 
response\u001b[39;00m\n\u001b[0;32m--> 175\u001b[0m response_data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrequest(\n\u001b[1;32m 176\u001b[0m method,\n\u001b[1;32m 177\u001b[0m url,\n\u001b[1;32m 178\u001b[0m query_params\u001b[38;5;241m=\u001b[39mquery_params,\n\u001b[1;32m 179\u001b[0m headers\u001b[38;5;241m=\u001b[39mheader_params,\n\u001b[1;32m 180\u001b[0m post_params\u001b[38;5;241m=\u001b[39mpost_params,\n\u001b[1;32m 181\u001b[0m body\u001b[38;5;241m=\u001b[39mbody,\n\u001b[1;32m 182\u001b[0m _preload_content\u001b[38;5;241m=\u001b[39m_preload_content,\n\u001b[1;32m 183\u001b[0m _request_timeout\u001b[38;5;241m=\u001b[39m_request_timeout,\n\u001b[1;32m 184\u001b[0m )\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m PineconeApiException \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 186\u001b[0m e\u001b[38;5;241m.\u001b[39mbody \u001b[38;5;241m=\u001b[39m e\u001b[38;5;241m.\u001b[39mbody\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pinecone/core/openapi/shared/api_client.py:460\u001b[0m, in \u001b[0;36mApiClient.request\u001b[0;34m(self, method, url, query_params, headers, post_params, body, _preload_content, _request_timeout)\u001b[0m\n\u001b[1;32m 450\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrest_client\u001b[38;5;241m.\u001b[39mOPTIONS(\n\u001b[1;32m 451\u001b[0m url,\n\u001b[1;32m 452\u001b[0m query_params\u001b[38;5;241m=\u001b[39mquery_params,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 457\u001b[0m body\u001b[38;5;241m=\u001b[39mbody,\n\u001b[1;32m 458\u001b[0m )\n\u001b[1;32m 459\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m method \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPOST\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m--> 460\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrest_client\u001b[38;5;241m.\u001b[39mPOST(\n\u001b[1;32m 461\u001b[0m url,\n\u001b[1;32m 462\u001b[0m query_params\u001b[38;5;241m=\u001b[39mquery_params,\n\u001b[1;32m 463\u001b[0m headers\u001b[38;5;241m=\u001b[39mheaders,\n\u001b[1;32m 464\u001b[0m post_params\u001b[38;5;241m=\u001b[39mpost_params,\n\u001b[1;32m 465\u001b[0m _preload_content\u001b[38;5;241m=\u001b[39m_preload_content,\n\u001b[1;32m 466\u001b[0m _request_timeout\u001b[38;5;241m=\u001b[39m_request_timeout,\n\u001b[1;32m 467\u001b[0m body\u001b[38;5;241m=\u001b[39mbody,\n\u001b[1;32m 468\u001b[0m )\n\u001b[1;32m 469\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m method \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPUT\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 470\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrest_client\u001b[38;5;241m.\u001b[39mPUT(\n\u001b[1;32m 471\u001b[0m url,\n\u001b[1;32m 472\u001b[0m query_params\u001b[38;5;241m=\u001b[39mquery_params,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 477\u001b[0m body\u001b[38;5;241m=\u001b[39mbody,\n\u001b[1;32m 478\u001b[0m )\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pinecone/core/openapi/shared/rest.py:345\u001b[0m, in \u001b[0;36mRESTClientObject.POST\u001b[0;34m(self, url, headers, query_params, post_params, body, _preload_content, _request_timeout)\u001b[0m\n\u001b[1;32m 335\u001b[0m 
\u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mPOST\u001b[39m(\n\u001b[1;32m 336\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 337\u001b[0m url,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 343\u001b[0m _request_timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 344\u001b[0m ):\n\u001b[0;32m--> 345\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrequest(\n\u001b[1;32m 346\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPOST\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 347\u001b[0m url,\n\u001b[1;32m 348\u001b[0m headers\u001b[38;5;241m=\u001b[39mheaders,\n\u001b[1;32m 349\u001b[0m query_params\u001b[38;5;241m=\u001b[39mquery_params,\n\u001b[1;32m 350\u001b[0m post_params\u001b[38;5;241m=\u001b[39mpost_params,\n\u001b[1;32m 351\u001b[0m _preload_content\u001b[38;5;241m=\u001b[39m_preload_content,\n\u001b[1;32m 352\u001b[0m _request_timeout\u001b[38;5;241m=\u001b[39m_request_timeout,\n\u001b[1;32m 353\u001b[0m body\u001b[38;5;241m=\u001b[39mbody,\n\u001b[1;32m 354\u001b[0m )\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pinecone/core/openapi/shared/rest.py:279\u001b[0m, in \u001b[0;36mRESTClientObject.request\u001b[0;34m(self, method, url, query_params, headers, body, post_params, _preload_content, _request_timeout)\u001b[0m\n\u001b[1;32m 276\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;241m500\u001b[39m \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m r\u001b[38;5;241m.\u001b[39mstatus \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m599\u001b[39m:\n\u001b[1;32m 277\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ServiceException(http_resp\u001b[38;5;241m=\u001b[39mr)\n\u001b[0;32m--> 279\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m PineconeApiException(http_resp\u001b[38;5;241m=\u001b[39mr)\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m r\n", + "\u001b[0;31mPineconeApiException\u001b[0m: (400)\nReason: Bad Request\nHTTP response headers: HTTPHeaderDict({'Date': 'Thu, 14 Nov 2024 19:37:34 GMT', 'Content-Type': 'application/json', 'Content-Length': '116', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '155', 'x-pinecone-request-id': '2161036493328920558', 'x-envoy-upstream-service-time': '37', 'server': 'envoy'})\nHTTP response body: {\"code\":3,\"message\":\"Metadata size is 869788 bytes, which exceeds the limit of 40960 bytes per vector\",\"details\":[]}\n" + ] + } + ], "source": [ - "#!pip install langchain_community" + "vector_store.add_documents(documents=documents[0:15], ids=uuids)" ] }, { "cell_type": "code", - "execution_count": 25, - "id": "4cc9ce57-c079-4605-9430-0a6d98a63242", + "execution_count": 27, + "id": "bc900bbd-128c-40b0-b151-f77800fcb50b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'title': '1161521 T.H. Jones, Charlotte, N.C., autograph letter ...\\nName: title_info_primary_tsi, dtype: object', 'abstract': '1161521 Expects to make purchases in North Carolina an...\\nName: abstract_tsi, dtype: object'}\n" + "* [SIM=0.292190]\n", + " [deletion]Cha[/deletion] Graniteville Jany 15 1854\n", + "Mr Z. B. Oakes\n", + "Dr Sir\n", + "On my Return\n", + "from Charleston last week I stopd\n", + "and attended the sale of B. J. Godfrerys\n", + "at Black Creek and bought the Family\n", + "of Cash whom I purchased of you two\n", + "years ago. 
The woman is now in a\n", + "Family way and we think she will not\n", + "suit us for a Cook Woman, she is\n", + "a small young woman about 30 strong and\n", + "Healthy and prefers Field Work Cash\n", + "has proved himself an Excellent servant\n", + "a great Ax Man and not to be exceld\n", + "with the Hoe I can recommend him to be\n", + "a most Excellent general servant, and\n", + "now what do you think you can\n", + "get for the Family Consisting of himself\n", + "Wife and three Children say a girl of\n", + "10 or 11 year named Bella, Ceasar a boy about\n", + "4 or 5, and Rose 2 or three and Mother\n", + "in a fair way for another. Please write\n", + "and let me know what you think\n", + "you could get for them yours Truly\n", + "[underline]T. H. Marshall[/underline]\n", + "\n", + " [{'URL': 'https://www.digitalcommonwealth.org/search/commonwealth:ws859m407', 'abstract': '1161522 Asks value of family: man, woman, 3 children.\\nName: abstract_tsi, dtype: object', 'genre': \"1161522 ['Manuscripts', 'Correspondence']\\nName: genre_basic_ssim, dtype: object\", 'genre_specific': '1161522 NaN\\nName: genre_specific_ssim, dtype: object', 'subtitle': '1161522 NaN\\nName: title_info_primary_subtitle_tsi, dtype: object', 'title': 'T.H. Marshall, Graniteville, S.C., autograph l...', 'title_alt': '1161522 NaN\\nName: title_info_alternative_tsim, dtype: object'}]\n", + "----------------------------------\n", + "\n", + "* [SIM=0.243309]\n", + " Grumesville Sept 11 [insertion]th[/insertion] 1856\n", + "Dear Sir\n", + "The Deputy has been here since\n", + "day before yesterday, and it is impossible\n", + "to get hold of the negroes, I have advised\n", + "that the parties should go down & see you\n", + "they seem willing to consent that the fellow\n", + "Tom should go at $ 1000, and a smaller boy\n", + "at $ 300 - but a new party entered the\n", + "field and recommended a seperance\n", + "to Major Rhame - who wants the boys\n", + "himself, he is to be requested to go\n", + "down and confer with the judgement\n", + "creditors and buy them, or propose to\n", + "sell out the entire Estate and pay pro-ratio,\n", + "the creditors, I presume that\n", + "all this could be done better by yourself\n", + "and make no doubt a ballance\n", + "of some 2,000 dollars retained, I advise\n", + "that you should when applied\n", + "to take the whole Estate in hand by &\n", + "with the Consent of Judgement Creditors\n", + "and save the Commission, I could\n", + "then speculate on the sale if forced\n", + "under the [deletion]e[/deletion]sheriff -\n", + "in Lords - yours truly\n", + "Theo. C Tharin\n", + "\n", + " [{'URL': 'https://www.digitalcommonwealth.org/search/commonwealth:w3764957b', 'abstract': '1163238 On disposal of Negroes in contested estate.\\nName: abstract_tsi, dtype: object', 'genre': \"1163238 ['Manuscripts', 'Correspondence']\\nName: genre_basic_ssim, dtype: object\", 'genre_specific': '1163238 NaN\\nName: genre_specific_ssim, dtype: object', 'subtitle': '1163238 NaN\\nName: title_info_primary_subtitle_tsi, dtype: object', 'title': 'Theodore C. 
Tharin, Grumesville, S.C., autogra...', 'title_alt': '1163238 NaN\\nName: title_info_alternative_tsim, dtype: object'}]\n", + "----------------------------------\n", + "\n", + "* [SIM=0.230255]\n", + " Wednesday morn July 12 1854\n", + "Dear Sir\n", + "I drop you a line in a great\n", + "hurry by Mr McCulley to say my affairs\n", + "are going very well, and I have been\n", + "offered $ 500 or $ 1.00 per acre for the tract of\n", + "land you purchased at sherriff sale.\n", + "I think you can do better, the offer is made by\n", + "Mr Williams, who owns the adjoining tract\n", + "which he purchased of Mc Culley he\n", + "gets Serpentine and work about 30 Hands\n", + "yours truly in haste\n", + "Theod C Tharin\n", + "Z. B. Oakes Esq.\n", + "\n", + " [{'URL': 'https://www.digitalcommonwealth.org/search/commonwealth:w3764306m', 'abstract': '1163237 Offer to purchase land.\\nName: abstract_tsi, dtype: object', 'genre': \"1163237 ['Manuscripts', 'Correspondence']\\nName: genre_basic_ssim, dtype: object\", 'genre_specific': '1163237 NaN\\nName: genre_specific_ssim, dtype: object', 'subtitle': '1163237 NaN\\nName: title_info_primary_subtitle_tsi, dtype: object', 'title': 'Theodore C. Tharin, Grumesville, S.C. [?], aut...', 'title_alt': '1163237 NaN\\nName: title_info_alternative_tsim, dtype: object'}]\n", + "----------------------------------\n", + "\n" ] } ], "source": [ - "print(documents[1].metadata)" + "results = vector_store.similarity_search_with_score(\n", + "    \"What is the metadata of the Z.B. Oakes articles\",\n", + "    k=3\n", + ")\n", + "for res, score in results:\n", + "    #print(f\"* {res.page_content} [{res.metadata}]\")\n", + "    print(f\"* [SIM={score:3f}]\\n {res.page_content} [{res.metadata}]\")\n", + "    print(\"----------------------------------\\n\")" ] }, { "cell_type": "markdown", - "id": "4a7f8878-1967-4630-817a-9eb1d321701e", - "metadata": {}, - "source": [ - "### Using Chroma Vector Store" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "id": "e72f20d4-4870-44ae-be93-66b3e16d53bd", + "id": "f5545e52-8142-4c0d-a127-a1a3711e5ef3", "metadata": {}, - "outputs": [], "source": [ - "# import os\n", - "# os.chmod('mydatabase.db', 0o666)" + "### Conclusions on Pinecone and HuggingFace Embedding Model\n", + "Unfortunately, it seems like Pinecone enforces a hard limit of 40KB (40,960 bytes) of metadata per vector. The metadata for one of our Document objects is 869,788 bytes (over 800KB), which is why the upsert above fails.\n", + "\n", + "The HuggingFace embedding model that I used, though, works like a charm. We just have to ensure that the index dimensions line up with the embedding model's output.\n", + "\n", + "FAISS seems like our best option.\n",
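 + "\n", + "As a rough illustration (a sketch added for clarity, not from the original run; the `PINECONE_METADATA_LIMIT` constant and `metadata_bytes` helper are our own names, and Pinecone's internal size accounting may differ slightly from a plain JSON encoding), we could check which Documents would fit under that 40,960-byte cap before attempting the upsert:\n", + "```python\n", + "import json\n", + "\n", + "PINECONE_METADATA_LIMIT = 40960  # bytes per vector, taken from the error above\n", + "\n", + "def metadata_bytes(doc):\n", + "    # Approximate the metadata payload as its JSON-encoded byte length\n", + "    return len(json.dumps(doc.metadata).encode(\"utf-8\"))\n", + "\n", + "small_enough = [d for d in documents if metadata_bytes(d) <= PINECONE_METADATA_LIMIT]\n", + "print(f\"{len(small_enough)} of {len(documents)} documents fit under the limit\")\n", + "```"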
] }, { - "cell_type": "code", - "execution_count": 67, - "id": "4c53d93b-165e-4f9b-8a6f-cd9329551a69", + "cell_type": "markdown", + "id": "4a7f8878-1967-4630-817a-9eb1d321701e", "metadata": {}, - "outputs": [], "source": [ - "#!pip install chromadb==0.5.0\n", - "#!pip install --upgrade openai langchain\n", - "# !pip install --upgrade langchain langchain_community langchain_openai openai python-dotenv chromadb\n", - "# !pip install --upgrade transformers\n", - "#!pip install --upgrade transformers torchvision\n", - "\n", - "# !pip install openai==1.37.1\n", - "# !pip install langchain==0.2.11\n", - "# !pip install langchain-openai==0.1.19\n", - "# !pip install langchain-community==0.2.10\n", - "# !pip install langchain-experimental==0.0.63\n", - "# !pip install transformers" + "### Using Chroma Vector Store" ] }, { @@ -1351,7 +1578,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 93, "id": "c281018a-089b-4ad0-8f4c-efb4667c8780", "metadata": {}, "outputs": [ @@ -1360,42 +1587,7 @@ "output_type": "stream", "text": [ "Split 133 documents into 13931 chunks.\n", - "Then again, perhaps as we become accustomed\n", - "to the inflated value of the dollar, in time\n", - "that price may not seem as exhorbitant as it\n", - "does to me now.\n", - "\n", - "When I was a child, soup was never served as\n", - "an appetizer, but always as a meal. It didn't\n", - "come from a can, either. It was made from\n", - "bones left over from Sunday's roast and had\n", - "some strength to it. When cooled, it jelled,\n", - "and suspended in it were bits and pieces of\n", - "meat and vegetables.\n", - "\n", - "With soup, corn or clam chowder, or oyster\n", - "stew came crackers. Not those skimpy skinny\n", - "saltines, but thick rich common or pilot cra-\n", - "ckers. Three or four of those, along with your\n", - "soup, and you had a meal.\n", - "\n", - "Father liked pie, and Mother baked them fre-\n", - "quently. All kinds: mince, apple, pumpkin,\n", - "squash, and lemon meringue. When I think of\n", - "the calories we consumed, I'm surprised we\n", - "weren't chubby. Instead, we were all thin as\n", - "rails.\n", - "{'title': 'Thanksgiving', 'abstract': '1161754 NaN\\nName: abstract_tsi, dtype: object', 'subtitle': '1161754 NaN\\nName: title_info_primary_subtitle_tsi, dtype: object', 'ID': 'commonwealth:jd478671b', 'start_index': 3435}\n", - "Removed existing database at /var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/tmpcp1qkd0k.\n", - "Saved 13931 chunks to /var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/tmpcp1qkd0k.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/ipykernel_26237/3571056397.py:60: LangChainDeprecationWarning: Since Chroma 0.4.x the manual persistence method is no longer supported as docs are automatically persisted.\n", - " db.persist()\n" + "Saved 13931 chunks to ./chroma_try.\n" ] } ], @@ -1413,11 +1605,8 @@ "import time\n", "from langchain.embeddings import HuggingFaceEmbeddings\n", "\n", - "# Load environment variables. 
Assumes that project contains .env file with API keys\n", - "load_dotenv()\n", - "\n", "import tempfile\n", - "CHROMA_PATH = tempfile.mkdtemp() # Use a temporary directory\n", + "CHROMA_PATH = \"./chroma_try\"\n", "\n", "def main(documents):\n", " generate_data_store(documents)\n", @@ -1439,13 +1628,13 @@ " print(f\"Split {len(documents)} documents into {len(chunks)} chunks.\")\n", "\n", " document = chunks[10]\n", - " print(document.page_content)\n", - " print(document.metadata)\n", + " #print(document.page_content)\n", + " #print(document.metadata)\n", "\n", " return chunks\n", "\n", "def save_to_chroma(chunks):\n", - " # Clear out the database first.\n", + " #Clear out the database first.\n", " if os.path.exists(CHROMA_PATH):\n", " shutil.rmtree(CHROMA_PATH)\n", " print(f\"Removed existing database at {CHROMA_PATH}.\")\n", @@ -1455,6 +1644,7 @@ "\n", " #embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n", " embeddings = OpenAIEmbeddings(model=\"text-embedding-3-large\", dimensions=3072)\n", + " #embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", " try:\n", " db = Chroma.from_documents(\n", " chunks, embeddings, persist_directory=CHROMA_PATH\n", @@ -1478,27 +1668,20 @@ "### Making the Query" ] }, - { - "cell_type": "markdown", - "id": "7bd15c30-251e-4460-a598-9e1fcf7aa3f5", - "metadata": {}, - "source": [ - "We'll download langserve to make a sample UI for our app:" - ] - }, { "cell_type": "code", - "execution_count": null, + "execution_count": 82, "id": "5b01faae-fdbf-420d-9b79-e1f31e84baf8", "metadata": {}, "outputs": [], "source": [ - "#!pip install \"langserve[all]\"" + "import os \n", + "os.environ[\"OPENAI_API_KEY\"] = \"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\"" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 94, "id": "103cf9f0-e116-4e3a-a33c-accdf4246332", "metadata": { "scrolled": true @@ -1559,11 +1742,51 @@ "\n", "---\n", "\n", + "Charleston 1-6 Dec 1853\n", + "Mr Z. B. Oakes,\n", + "Dear Sir\n", + "I would be glad\n", + "to know what you have\n", + "determined on in the case\n", + "of M Alpine, I bot the\n", + "negro from you, it appears\n", + "and of course you are\n", + "liable to me, I am called\n", + "on for the amount of the\n", + "verdict, It appears that the\n", + "negro was a stolen one. I\n", + "of course lay no charge to\n", + "you on this score, being accessory\n", + "to the affair, but I\n", + "do call on on you to\n", + "hold me Harmless. - Please\n", + "send me your written answer\n", + "[underline]this day.[/underline]\n", + "Respectfully\n", + "[underline]Tho: N. Gadsden[/underline]\n", + "\n", + "---\n", + "\n", + "Summerville So Ca\n", + "April 26th 1854\n", + "Mr Z B Oakes\n", + "Dear Sir\n", + "Your letter\n", + "in reply to mine was duly received.\n", + "I am willing to value the woman at\n", + "$ 800. & be refunded $ 150. or return her\n", + "at once to you. Do inform me as soon\n", + "as possible as to the decision of\n", + "her owner. Yours respectfully\n", + "Thos. L Gelzia\n", + "\n", + "---\n", + "\n", "Answer the question based on the above context: Who did Z.B Oakes receive a letter from?\n", "\n", - "Response: Z.B. Oakes received a letter from Theo. C Tharin.\n", + "Response: Z.B Oakes received a letter from Tho: N. 
Gadsden.\n", "\n", - "Sources: ['Thomas Taylor, Columbia, Tenn., autograph lett...: commonwealth:5q47sj75b', 'Theodore C. Tharin, Grumesville, S.C., autogra...: commonwealth:w3764352q', 'Theodore C. Tharin, Grumesville, S.C., autogra...: commonwealth:w3764286b']\n" + "Sources: ['Thomas Taylor, Columbia, Tenn., autograph lett...: https://www.digitalcommonwealth.org/search/commonwealth:5q47sj75b', 'Theodore C. Tharin, Grumesville, S.C., autogra...: https://www.digitalcommonwealth.org/search/commonwealth:w3764352q', 'Theodore C. Tharin, Grumesville, S.C., autogra...: https://www.digitalcommonwealth.org/search/commonwealth:w3764286b', 'Theodore N. Gadsden, Charleston, S.C., autogra...: https://www.digitalcommonwealth.org/search/commonwealth:9k41zk125', 'Thomas L. Gelzer, Summerville, autograph lette...: https://www.digitalcommonwealth.org/search/commonwealth:ws859j61k']\n" ] } ], @@ -1578,18 +1801,10 @@ "from langchain_core.messages import HumanMessage, SystemMessage\n", "from langchain_core.output_parsers import StrOutputParser\n", "\n", - "# For LangServe\n", - "from fastapi import FastAPI\n", - "from langchain_core.prompts import ChatPromptTemplate\n", - "from langchain_core.output_parsers import StrOutputParser\n", - "from langchain_openai import ChatOpenAI\n", - "from langserve import add_routes\n", - "import nest_asyncio\n", - "import uvicorn\n", - "\n", "\n", "# copy from above\n", - "CHROMA_PATH = \"/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/tmpcp1qkd0k\"\n", + "#CHROMA_PATH = \"/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/tmpaxd8t1dv\"\n", + "CHROMA_PATH = \"./chroma_try\"\n", "\n", "PROMPT_TEMPLATE = \"\"\"\n", "Answer the question based only on the following context:\n", @@ -1601,10 +1816,6 @@ "Answer the question based on the above context: {question}\n", "\"\"\"\n", "\n", - "# Initialize LangSmith App\n", - "# app = App()\n", - "\n", - "# @langsmith_route(\"/answer-question\")\n", "def main(query: str):\n", " # Create CLI with a default value for Jupyter testing\n", " parser = argparse.ArgumentParser()\n", @@ -1614,17 +1825,15 @@ "\n", " # Prepare the database\n", " embedding_function = OpenAIEmbeddings(model=\"text-embedding-3-large\", dimensions=3072)\n", - " #embedding_function = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n", + " #embedding_function = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", " db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)\n", "\n", - " results = db.similarity_search_with_relevance_scores(query_text, k=3)\n", + " results = db.similarity_search_with_relevance_scores(query_text, k=5)\n", " for i in range(len(results)):\n", - " if len(results) == 0 or results[0][1] < 0.1:\n", + " if len(results) == 0 or results[0][1] < 0.3:\n", " print(f\"Unable to find matching results for \\\"{query_text}\\\"\")\n", " print(results[0][1])\n", " return\n", - "\n", - " #print(results)\n", " \n", " context_text = \"\\n\\n---\\n\\n\".join([doc.page_content for doc, _score in results])\n", " prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)\n", @@ -1634,7 +1843,7 @@ " model = ChatOpenAI()\n", " response_text = model.predict(prompt)\n", "\n", - " sources = [doc.metadata.get(\"title\") + \": \" + str(doc.metadata.get(\"ID\")) for doc, _score in results]\n", + " sources = [doc.metadata.get(\"title\") + \": \" + str(doc.metadata.get(\"URL\")) for doc, _score in results]\n", " formatted_response = f\"Response: {response_text}\\n\\nSources: 
{sources}\"\n", " # response with context, sources, and answer to my query\n", " print(formatted_response)\n", @@ -1642,34 +1851,50 @@ "if __name__ == \"__main__\":\n", " query1 = \"Who did Z.B Oakes receive a letter from?\"\n", " query2 = \"What did Henry M. Sikes say about India Goods?\"\n", - " query3 = \"What are some of the most controversial topics in this database?\"\n", - " query4 = \"What happened in World War II?\"\n", - " query5 = \"Who critiqued India Goods?\"\n", - " query6 = \"Tell me about Barnstable Public Schools\"\n", - " #query7 = \"What did Thos. L Gelzia talk about in their letter to Mr Z. B. Oakes, but not in the Tocsin of Liberty?\"\n", - " queries = [query1, query2, query3, query4, query5, query6]\n", - " # print(\"-------------------New Query-------------------\")\n", - " # for query in queries:\n", - " # main(query)\n", - " # print(\"-------------------New Query-------------------\")\n", - " main(query1)\n", - " \n" + " query3 = \"What happened in World War II?\"\n", + " main(query1)" ] }, { "cell_type": "markdown", - "id": "9bba8e8f-14e3-48c0-9fc8-c07328271da3", + "id": "805b793e-16e3-4eca-9b2a-baaa91fde961", "metadata": {}, "source": [ - "# Notes from Gardos\n", + "### Conclusions about Chroma and OpenAI Embedding Model\n", + "Chroma seems to be a great option as a vector store, however it is immensely lightweight and requires us to have the vector store as an embedding in the machine that we use. Unfortunatenly we have not found many projects that is dealing with embedding a vast amount of data like we are.\n", "\n", - "These are the list of fields, if you need any clarification about these fields ask about them.\n", - "\n", - "Vectorize all of the fields\n", - "\n", - "Give this to the LLM as a preface prompt.\n", - "\n", - "Maybe two vector stores?" + "The OpenAIEmbeddings is great, but it also costs money, so that is a no-go." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "4eded946-6f7f-440b-a892-80f8c37db3ab", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_content='Oglethorpe December 8 53\n", + "Z B Oaks esqr\n", + "Dr Sir you will please\n", + "inform me how negros are selling\n", + "& how your market is supplied\n", + "I think I will be in Charleston\n", + "the latter part of this month, &\n", + "will want to buy a Cook &\n", + "good active boy & perhaps two negro fellows\n", + "Very Respectfully\n", + "Yours T G Hudson\n", + "\n", + "\n", + "' metadata={'title': 'T.G. 
Hudson, Oglethorpe, Ga., autograph letter...', 'abstract': '1161475 Asks market price at Charleston.\\nName: abstract_tsi, dtype: object', 'subtitle': '1161475 NaN\\nName: title_info_primary_subtitle_tsi, dtype: object', 'URL': 'https://www.digitalcommonwealth.org/search/commonwealth:9k41zk460', 'title_alt': '1161475 NaN\\nName: title_info_alternative_tsim, dtype: object', 'genre': \"1161475 ['Manuscripts', 'Correspondence']\\nName: genre_basic_ssim, dtype: object\", 'genre_specific': '1161475 NaN\\nName: genre_specific_ssim, dtype: object'}\n" + ] + } + ], + "source": [ + "print(documents[0])" ] } ], From a67116a1629bb1afc9af0bf1483d4a8b1b1296b7 Mon Sep 17 00:00:00 2001 From: Brandon Vargus <45298256+b3v@users.noreply.github.com> Date: Thu, 21 Nov 2024 13:50:02 -0500 Subject: [PATCH 3/6] Update rerank.py --- PoC/rerank.py | 196 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 195 insertions(+), 1 deletion(-) diff --git a/PoC/rerank.py b/PoC/rerank.py index a0aabbb..6948c59 100644 --- a/PoC/rerank.py +++ b/PoC/rerank.py @@ -1 +1,195 @@ -import numpy as np import matplotlib.pyplot as plt import seaborn as sns import json import pandas as pd from langchain_openai import ChatOpenAI # replace with file of your choosing file = open("sample_full_text.json") full_text = json.load(file) # metadata csv file; should be included in repo df_attributes = pd.read_csv("metadata_attributes.csv") model = ChatOpenAI() import re def get_title(text): match = re.search(r'\d+\s+(.+?)\n', text) # Extracting and printing the title if there's a match if match: title = match.group(1) return title # Turn the BPL data into a Document from langchain.schema import Document documents = [] for doc in full_text: # Extract metadata fields and apply get_title() title = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "title_info_primary_tsi"])) title_subtitle = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "title_info_primary_subtitle_tsi"])) title_alt = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "title_info_alternative_tsim"])) abstract = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "abstract_tsi"])) subject_facet = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "subject_facet_ssim"])) subject_geographic = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "subject_geographic_sim"])) genre = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "genre_basic_ssim"])) genre_specific = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "genre_specific_ssim"])) name_facet = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "name_facet_ssim"])) name_role = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "name_role_tsim"])) date_human = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "date_tsim"])) date_start = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "date_start_dtsi"])) date_end = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "date_end_dtsi"])) publisher = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "publisher_tsi"])) collection_name = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "collection_name_ssim"])) physical_location = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "physical_location_ssim"])) related_item_host = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "related_item_host_ssim"])) type_of_resource = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "type_of_resource_ssim"])) 
URL = "https://www.digitalcommonwealth.org/search/" + get_title(str(df_attributes.loc[df_attributes["id"] == doc, "id"])) # Create Document with metadata documents.append(Document( page_content=full_text[doc]['text'], metadata={ "title": title, "subtitle": title_subtitle, "title_alt": title_alt, "abstract": abstract, "subject_facet": subject_facet, "subject_geographic": subject_geographic, "genre": genre, "genre_specific": genre_specific, "name_facet": name_facet, "name_role": name_role, "date_human": date_human, "date_start": date_start, "date_end": date_end, "publisher": publisher, "collection_name": collection_name, "physical_location": physical_location, "related_item_host": related_item_host, "type_of_resource": type_of_resource, "URL": URL } )) # Now for all of the vector store and reranking stuff import faiss from langchain_community.docstore.in_memory import InMemoryDocstore from langchain_community.vectorstores import FAISS from langchain.embeddings import HuggingFaceEmbeddings # embeddings model embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2") # creating the vector store index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world"))) vector_store = FAISS( embedding_function=embeddings, index=index, docstore=InMemoryDocstore(), index_to_docstore_id={}, ) # now for the reranking step weights = { "title": 1.0, "subtitle": 0.95, "title_alt": 0.9, "abstract": 0.85, "subject_facet": 0.8, "subject_geographic": 0.75, "genre": 0.7, "genre_specific": 0.65, "name_facet": 0.6, "name_role": 0.55, "date_human": 0.5, "date_start": 0.45, "date_end": 0.4, "publisher": 0.35, "collection_name": 0.3, "physical_location": 0.25, "related_item_host": 0.2, "type_of_resource": 0.15, "URL": 0.1 } from sentence_transformers import SentenceTransformer from sklearn.metrics.pairwise import cosine_similarity import numpy as np from langchain.embeddings import HuggingFaceEmbeddings # our vector store: # embedding model embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2") def compute_relevance_score(metadata_value, query): """ Compute cosine similarity between the query and a metadata value using sentence-transformers. Args: metadata_value (str): The metadata value to compare. query (str): The query string. Returns: float: Cosine similarity score (between 0 and 1). """ if not metadata_value or not query: return 0 # Return 0 if either the metadata or query is empty # Encode the metadata value and query into embeddings embeddings = model.encode([metadata_value, query], convert_to_tensor=False) # Convert to NumPy metadata_embedding, query_embedding = embeddings # Compute cosine similarity similarity = cosine_similarity([metadata_embedding], [query_embedding]) return similarity[0][0] # Extract the scalar similarity value def rerank_documents(documents, query, weights, vector_store, k=10): """ Rerank documents based on metadata relevance scores and FAISS vector similarity scores. Args: documents (list): List of Document objects. query (str): The query string used for retrieval. weights (dict): Weights for each metadata field. vector_store (str): The vector store itself to get the similarity score Returns: list: Reranked documents in descending order of relevance. 
""" final_score = 0 reranked_results = [] returned_docs = vector_store.similarity_search_with_score(query, k) for doc in returned_docs: final_score = doc[1] # Add weighted relevance scores for each metadata field for field, weight in weights.items(): metadata_value = doc[0].metadata.get(field, "") # Safely get metadata field value relevance_score = compute_relevance_score(metadata_value, query) final_score += weight * relevance_score reranked_results.append((doc, final_score)) # Sort documents by the final score in descending order reranked_results.sort(key=lambda x: x[1], reverse=True) return [doc for doc, score in reranked_results] docs = rerank_documents(documents, "Newspaper", weights, vector_store) # now we should get an output like this for some k value: # ('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:gf06jp23d', 'Reranked score: 1.1741459369659424') docs_list = [(docs[i][0].metadata['title'], docs[i][0].metadata['URL'], f"Reranked score: {docs[i][1]}") for i in range(len(docs))] docs_list.sort(key=lambda x: x[2], reverse=True) for doc in docs_list: print(doc) \ No newline at end of file +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns +import json +import pandas as pd +from langchain_openai import ChatOpenAI + +# replace with file of your choosing +file = open("sample_full_text.json") +full_text = json.load(file) + +# metadata csv file; should be included in repo +df_attributes = pd.read_csv("metadata_attributes.csv") + +model = ChatOpenAI() + +import re +def get_title(text): + match = re.search(r'\d+\s+(.+?)\n', text) + + # Extracting and printing the title if there's a match + if match: + title = match.group(1) + return title + +# Turn the BPL data into a Document +from langchain.schema import Document + +documents = [] + +for doc in full_text: + # Extract metadata fields and apply get_title() + title = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "title_info_primary_tsi"])) + title_subtitle = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "title_info_primary_subtitle_tsi"])) + title_alt = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "title_info_alternative_tsim"])) + abstract = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "abstract_tsi"])) + subject_facet = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "subject_facet_ssim"])) + subject_geographic = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "subject_geographic_sim"])) + genre = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "genre_basic_ssim"])) + genre_specific = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "genre_specific_ssim"])) + name_facet = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "name_facet_ssim"])) + name_role = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "name_role_tsim"])) + date_human = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "date_tsim"])) + date_start = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "date_start_dtsi"])) + date_end = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "date_end_dtsi"])) + publisher = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "publisher_tsi"])) + collection_name = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "collection_name_ssim"])) + physical_location = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "physical_location_ssim"])) + related_item_host = 
get_title(str(df_attributes.loc[df_attributes["id"] == doc, "related_item_host_ssim"])) + type_of_resource = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "type_of_resource_ssim"])) + URL = "https://www.digitalcommonwealth.org/search/" + get_title(str(df_attributes.loc[df_attributes["id"] == doc, "id"])) + + # Create Document with metadata + documents.append(Document( + page_content=full_text[doc]['text'], + metadata={ + "title": title, + "subtitle": title_subtitle, + "title_alt": title_alt, + "abstract": abstract, + "subject_facet": subject_facet, + "subject_geographic": subject_geographic, + "genre": genre, + "genre_specific": genre_specific, + "name_facet": name_facet, + "name_role": name_role, + "date_human": date_human, + "date_start": date_start, + "date_end": date_end, + "publisher": publisher, + "collection_name": collection_name, + "physical_location": physical_location, + "related_item_host": related_item_host, + "type_of_resource": type_of_resource, + "URL": URL + } + )) + +# Now for all of the vector store and reranking stuff +import faiss +from langchain_community.docstore.in_memory import InMemoryDocstore +from langchain_community.vectorstores import FAISS +from langchain.embeddings import HuggingFaceEmbeddings + +# embeddings model +embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2") + +# creating the vector store +index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world"))) + +vector_store = FAISS( + embedding_function=embeddings, + index=index, + docstore=InMemoryDocstore(), + index_to_docstore_id={}, +) + +# now for the reranking step +weights = { + "title": 1.0, + "subtitle": 0.95, + "title_alt": 0.9, + "abstract": 0.85, + "subject_facet": 0.8, + "subject_geographic": 0.75, + "genre": 0.7, + "genre_specific": 0.65, + "name_facet": 0.6, + "name_role": 0.55, + "date_human": 0.5, + "date_start": 0.45, + "date_end": 0.4, + "publisher": 0.35, + "collection_name": 0.3, + "physical_location": 0.25, + "related_item_host": 0.2, + "type_of_resource": 0.15, + "URL": 0.1 +} + +from sentence_transformers import SentenceTransformer +from sklearn.metrics.pairwise import cosine_similarity +import numpy as np +from langchain.embeddings import HuggingFaceEmbeddings + +# our vector store: + +# embedding model +embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2") + +def compute_relevance_score(metadata_value, query): + """ + Compute cosine similarity between the query and a metadata value using sentence-transformers. + + Args: + metadata_value (str): The metadata value to compare. + query (str): The query string. + + Returns: + float: Cosine similarity score (between 0 and 1). + """ + if not metadata_value or not query: + return 0 # Return 0 if either the metadata or query is empty + + # Encode the metadata value and query into embeddings + embeddings = model.encode([metadata_value, query], convert_to_tensor=False) # Convert to NumPy + metadata_embedding, query_embedding = embeddings + + # Compute cosine similarity + similarity = cosine_similarity([metadata_embedding], [query_embedding]) + return similarity[0][0] # Extract the scalar similarity value + + + +def rerank_documents(documents, query, weights, vector_store, k=10): + """ + Rerank documents based on metadata relevance scores and FAISS vector similarity scores. + + Args: + documents (list): List of Document objects. + query (str): The query string used for retrieval. + weights (dict): Weights for each metadata field. 
+ vector_store (str): The vector store itself to get the similarity score + + Returns: + list: Reranked documents in descending order of relevance. + """ + + final_score = 0 + + reranked_results = [] + returned_docs = vector_store.similarity_search_with_score(query, k) + for doc in returned_docs: + final_score = doc[1] + # Add weighted relevance scores for each metadata field + for field, weight in weights.items(): + metadata_value = doc[0].metadata.get(field, "") # Safely get metadata field value + relevance_score = compute_relevance_score(metadata_value, query) + final_score += weight * relevance_score + + reranked_results.append((doc, final_score)) + + # Sort documents by the final score in descending order + reranked_results.sort(key=lambda x: x[1], reverse=True) + return [doc for doc, score in reranked_results] + + +docs = rerank_documents(documents, "Newspaper", weights, vector_store) + +# now we should get an output like this for some k value: +# ('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:gf06jp23d', 'Reranked score: 1.1741459369659424') +docs_list = [(docs[i][0].metadata['title'], docs[i][0].metadata['URL'], f"Reranked score: {docs[i][1]}") for i in range(len(docs))] +docs_list.sort(key=lambda x: x[2], reverse=True) +for doc in docs_list: + print(doc) From 0acc75005b77eaa88312b576102ac7b85104229b Mon Sep 17 00:00:00 2001 From: Brandon Vargus Date: Tue, 26 Nov 2024 18:59:26 -0500 Subject: [PATCH 4/6] adjusted reranking algo and included normalization --- PoC/POC.ipynb | 734 ++++++++++++++++++++++---------------------------- 1 file changed, 322 insertions(+), 412 deletions(-) diff --git a/PoC/POC.ipynb b/PoC/POC.ipynb index 4c2f270..a027a64 100644 --- a/PoC/POC.ipynb +++ b/PoC/POC.ipynb @@ -10,10 +10,21 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 12, "id": "3934dc57-c4c3-4892-bcf4-bce4aa5c48af", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + } + ], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", @@ -33,19 +44,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "30d31b31-2c2a-40ae-99b6-5502395f8bc7", - "metadata": {}, - "outputs": [], - "source": [ - "# file path for metadata file on SCC: /projectnb/sparkgrp/ml-bpl-rag-data/full_data/bpl_data.json\n", - "meta = open(\"../EDA Phase/bpl-digital-commonwealth/bpl_data.json\")\n", - "bpl_metadata = json.load(meta)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, + "execution_count": 16, "id": "b50d8c5f-dfea-454f-860e-13315a9c2fea", "metadata": {}, "outputs": [], @@ -58,10 +57,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "1e4c5601-648f-4ff1-a5ce-d8823c46f884", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "133\n" + ] + } + ], "source": [ "print(len(full_text))" ] @@ -76,51 +83,10 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "id": "da05e8ae-71f6-4ab3-b2fa-0193b77d6262", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Charlotte N.C.\n", - "Feb 21st/57\n", - "Z.B. Oaks Esq\n", - "Charleston S.C.\n", - "Dr Sr\n", - "I take the\n", - "Liberty to Address you as Regards\n", - "your Negro Market & your\n", - "Opinion as to how it will continue\n", - "through the Spring & Summer\n", - "I have an Idea of Trying To puchase\n", - "in the Mountians of N.C. & Va\n", - "& Selling in your Market or in\n", - "Richmond Va. I expect to Trade\n", - "on the Small Scale for Some\n", - "Market & if I can Sell in\n", - "your Market for a fair profit\n", - "I Shall do my Buisiness with\n", - "you & C\n", - "\n", - "I wish To no what Boys from 18 to\n", - "20 yrs old [deletion]and[/deletion] both no 1 & no 2 Boys\n", - "also Boys 12 ys old Say weigs 80 to 90\n", - "lbs & girls 12 ys old weighs Say 60 to\n", - "80 lb & from 14 To ys old To 20 ys old\n", - "Please write me by Return Mail\n", - "& give me the Market prices of\n", - "above Negros & [deletion][/deletion] are they [deletion][/deletion]\n", - "Brisk Sale or dull Address\n", - "me Charlotte N.C.\n", - "Yours Respectfully\n", - "T.H. Jones\n", - "\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "print(full_text['commonwealth:w3764603d']['text'])" ] @@ -579,7 +545,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 13, "id": "49e1b410-a7b0-4ef3-bbb8-2c240e568a73", "metadata": {}, "outputs": [ @@ -587,7 +553,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/ipykernel_12916/3787498575.py:1: DtypeWarning: Columns (10,16,17,18,20,21,22,23,24,25,27,29,33,34,36,41,42,43,44,54,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,128,129,130,131,132,133,134,135,136,137,138,139,140) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", + "/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/ipykernel_8401/3787498575.py:1: DtypeWarning: Columns (10,16,17,18,20,21,22,23,24,25,27,29,33,34,36,41,42,43,44,54,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,128,129,130,131,132,133,134,135,136,137,138,139,140) have mixed types. Specify dtype option on import or set low_memory=False.\n", " df_attributes = pd.read_csv(\"metadata_attributes.csv\")\n" ] } @@ -596,98 +562,6 @@ "df_attributes = pd.read_csv(\"metadata_attributes.csv\")" ] }, - { - "cell_type": "markdown", - "id": "972493d9-63a1-477d-811e-c1d951a2d63c", - "metadata": {}, - "source": [ - "Writing the get_title function to clear away any whitespace and newline characters from the title of each document." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "d819646f-51cc-4817-9542-ecfc9ea4af33", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['海员们', ':', '要警惕航运事故\\nName:', 'title_info_primary_tsi,', 'dtype:', 'object']" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "str(df_attributes.loc[df_attributes[\"id\"] == \"commonwealth-oai:xp68m844v\"][\"title_info_primary_tsi\"]).split(\" \")[4:]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "754c22c6-5d35-436a-922a-0f5f6cafa6c9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'1199893 NaN\\nName: title_info_alternative_tsim, dtype: object'" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "str(df_attributes.loc[df_attributes[\"id\"] == \"commonwealth:1j92ng13k\"][\"title_info_alternative_tsim\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "id": "09331a9b-135a-46ec-8b0a-70c70ba1c261", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "585812" - ] - }, - "execution_count": 81, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_attributes[\"genre_specific_ssim\"].isna().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "id": "914df792-9343-43fd-83cd-7678e5a56f8c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1303800" - ] - }, - "execution_count": 76, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_attributes.shape[0]" - ] - }, { "cell_type": "markdown", "id": "1c6d1c2d-16ad-43a0-844a-72238901c07d", @@ -696,44 +570,17 @@ "### Turn full text into Documents type" ] }, - { - "cell_type": "code", - "execution_count": 5, - "id": "51f7f3b8-ed7a-43a1-949f-aa43e594af99", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1 Poster produced by the International Transport...\n", - "Name: abstract_tsi, dtype: object\n" - ] - } - ], - "source": [ - "print(str(df_attributes.loc[df_attributes[\"id\"] == \"commonwealth-oai:xp68m844v\", \"abstract_tsi\"]))" - ] - }, { "cell_type": "markdown", - "id": "b072fe0a-b538-4704-85a0-b6862b0653b6", + "id": "c74fe245-886d-4e9b-a554-77308c75e44b", "metadata": {}, "source": [ - "Important Metadata to Embed:\n", - "- title_info_primary_tsi\n", - "- title_info_primary_subtitle_tsi\n", - "- title_info_alternative_tsim\n", - "- abstract_tsi\n", - "- subject_facet_ssim\n", - "- 
subject_geographic_sim\n", - "- genre_basic_ssim\n", - "- genre_specific_ssim" + "Now we will make Document objects with the important metadata attributes" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 14, "id": "15d83b93-6840-4618-ac40-1c7de9e5cc1e", "metadata": {}, "outputs": [], @@ -750,7 +597,25 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, + "id": "c5947005-6e02-4130-9d4e-5487cd389dd5", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.schema import Document\n", + "documents = []\n", + "for doc in full_text:\n", + " title = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"title_info_primary_tsi\"]))\n", + " ID = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"id\"]))\n", + " abstract = str(df_attributes.loc[df_attributes[\"id\"] == doc, \"abstract_tsi\"])\n", + " title_subtitle = str(df_attributes.loc[df_attributes[\"id\"] == doc, \"title_info_primary_subtitle_tsi\"])\n", + "\n", + " documents.append(Document(page_content=\"\", metadata={'title':title, 'ID':ID, 'abstract':abstract, 'title_subtitle':title_subtitle}))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, "id": "6c3f5614-cc45-4fb4-9b13-1fd0361da1b6", "metadata": {}, "outputs": [], @@ -784,7 +649,7 @@ " \n", " # Create Document with metadata\n", " documents.append(Document(\n", - " page_content=full_text[doc]['text'],\n", + " page_content=\"\",\n", " metadata={\n", " \"title\": title,\n", " \"subtitle\": title_subtitle,\n", @@ -811,7 +676,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 18, "id": "b58b4530-27e4-4ed4-80be-4ee240892480", "metadata": {}, "outputs": [ @@ -821,7 +686,7 @@ "dict_keys(['title', 'subtitle', 'title_alt', 'abstract', 'subject_facet', 'subject_geographic', 'genre', 'genre_specific', 'name_facet', 'name_role', 'date_human', 'date_start', 'date_end', 'publisher', 'collection_name', 'physical_location', 'related_item_host', 'type_of_resource', 'URL'])" ] }, - "execution_count": 11, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -838,18 +703,6 @@ "# RAG Pipeline" ] }, - { - "cell_type": "code", - "execution_count": 7, - "id": "648466aa-3142-4ece-aa02-4454c7f6ee41", - "metadata": {}, - "outputs": [], - "source": [ - "# set openai api key\n", - "import os\n", - "os.environ[\"OPENAI_API_KEY\"] = \"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\"" - ] - }, { "cell_type": "markdown", "id": "895258be-7bf4-4d8d-8f5a-79d80dfe8c36", @@ -860,17 +713,7 @@ }, { "cell_type": "code", - "execution_count": 20, - "id": "1b537ce7-2eb2-4392-bc71-5a33ede503df", - "metadata": {}, - "outputs": [], - "source": [ - "#!pip install langchain-ollama" - ] - }, - { - "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "c1b3815e-dec5-4f22-b863-d91a57b5ccad", "metadata": {}, "outputs": [], @@ -879,15 +722,27 @@ "from langchain_openai import OpenAIEmbeddings\n", "import faiss\n", "from langchain_community.docstore.in_memory import InMemoryDocstore\n", - "from langchain_community.vectorstores import FAISS" + "from langchain_community.vectorstores import FAISS\n", + "from langchain.embeddings import HuggingFaceEmbeddings" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 5, "id": "03554a37-d142-45eb-be33-a333929b927d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/ipykernel_7740/4140823313.py:2: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-huggingface package and should be used instead. To use it run `pip install -U :class:`~langchain-huggingface` and import as `from :class:`~langchain_huggingface import HuggingFaceEmbeddings``.\n", + " embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", + "2024-11-26 15:04:19.450227: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + ] + } + ], "source": [ "#embeddings = OpenAIEmbeddings(model=\"text-embedding-3-large\", dimensions=3072)\n", "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")" @@ -895,7 +750,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 19, "id": "0bea78d8-4b72-4da0-a50d-89e0fe509454", "metadata": {}, "outputs": [], @@ -904,157 +759,157 @@ "\n", "vector_store = FAISS(\n", " embedding_function=embeddings,\n", - " index=index,\n", " docstore=InMemoryDocstore(),\n", + " index=index,\n", " index_to_docstore_id={},\n", ")" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 20, "id": "b1bee292-778a-480e-8eb3-5cc37587ce85", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['68762a4a-dbcb-411e-9047-ed04e44a794a',\n", - " 'f622dbf6-aee1-43e4-aba1-59fde531a7ad',\n", - " '927c3752-7746-4c88-a58b-e01a6547d857',\n", - " '70bf52c8-b79e-459b-9aa0-c1f12e12d842',\n", - " 'ade0d10f-c301-4ac1-99b8-6e26562ac259',\n", - " '01721bef-99a0-4034-8bc0-655ebec123d1',\n", - " '5c0969cd-a297-4b3f-b73e-9b3e5ed60c1e',\n", - " 'f707ab88-6b09-48fc-94ed-93ac650931a2',\n", - " 'bd269b63-b143-4535-83f8-27515883647f',\n", - " '5b0f71e5-520e-4090-8aa7-fc4f152f42f3',\n", - " 'a97737a2-ba3c-4a6f-b169-967ea2856f57',\n", - " '5e9a4add-7aae-45c1-a071-bd8510e81829',\n", - " '53b49886-6e08-4c6e-b69d-d12fa9accc12',\n", - " 'cdb37b1a-8ff1-462c-8052-90b8681d1700',\n", - " '44eca254-78b8-4ce5-aa31-ee02b8895734',\n", - " '15cb7183-2758-468a-88d0-35348ccf357a',\n", - " 'fd3e2244-e666-4d73-9cac-d980ce9b49ee',\n", - " 'b48501c1-c52d-4cbf-bcc0-2b669017736c',\n", - " '3eda50a4-0269-44ea-8594-e1ed21662dc1',\n", - " 'd6d96535-021c-44c8-89c8-796ebd3ebc0d',\n", - " 'e20dfb8b-7e99-4299-82ae-b69c4b2aa5e8',\n", - " '89cbeb0c-253a-4c01-aba0-b3ef82622381',\n", - " 'fa325062-a886-499d-8377-2feedc5a8262',\n", - " '379080c0-03f9-4ff7-89b4-be8edcd7be96',\n", - " '06ff8bdc-975a-4ec2-8273-e2ca4f489df4',\n", - " '2ab5e13c-9051-4022-a476-92a97b05c5c1',\n", - " '7e144845-3350-4871-9404-fa7e3f734b78',\n", - " '2de7c819-546d-4470-8f0a-b91a7903b0ed',\n", - " 'a05149dd-2b01-4f03-9cd0-01fcea8780d2',\n", - " 'd9d462c0-d928-49cc-9172-7152328e4d51',\n", - " '8edb060d-8959-418d-9470-5da965ad2b9d',\n", - " '19d25b97-34b0-4f53-9ec3-4060239242dc',\n", - " '49a6e2ef-7f85-4a3a-98d6-3b3dd8b4c3b6',\n", - " '54913f79-d38b-40b3-b251-86c59721fa3d',\n", - " 'e87af048-11d2-4977-b4aa-6465d5017fba',\n", - " 'de44abe6-739b-44c5-806b-06d52aa1e56b',\n", - " '82898991-0951-40f6-8926-309db4a807b7',\n", - " '02c3a940-74db-4d60-a173-ca9bbd702cd0',\n", - " 
'20f881bb-7d3b-4f49-82a3-5ac8e7578322',\n", - " '141d3d1e-6d5a-4dd9-95bb-4ae07c81e39c',\n", - " 'b8b415f5-670b-4b56-b1ce-fe424769a182',\n", - " '55de7042-5452-4ea2-820a-6b35a594cbf4',\n", - " '404565f4-ee23-423f-a4d8-550b6f8b6e41',\n", - " '300e8b0b-2b5a-4618-aec7-8d5324de106f',\n", - " 'cab508a6-25f1-43ca-9f46-835af4e922af',\n", - " '2d370174-8c8f-418b-a083-84f52cfe9a36',\n", - " '65ff5f75-cc47-4610-999d-9a11f9894b32',\n", - " '3995b720-99e0-4f68-a740-46e7f2e5c59f',\n", - " '40af7846-ba78-48eb-83e0-6b6b844f8c79',\n", - " 'c1f51576-1e99-4f77-be03-ce202f238dbe',\n", - " 'ec7d4da2-e7a5-40d9-9e67-a37c12e03bf9',\n", - " '270222bf-57d7-4cfe-a268-1f6167ff6d7e',\n", - " '9a8373ee-c2db-4a1d-b5ac-6a04c0d2becd',\n", - " '4504f7c3-6820-49c7-be23-43d9b8ba58aa',\n", - " 'b181eab2-587c-4de7-92a4-6f8b0f3f2b00',\n", - " '7e889a1d-c1b8-4c53-af90-de3eaa6db021',\n", - " '6007601e-7b9d-4064-b16c-655169e9d72a',\n", - " '11e280e2-dfce-4a52-8867-296155f78eb8',\n", - " '821de396-47f8-4606-934c-13c1bf884473',\n", - " '9b7e6753-61d1-446d-8a2b-344b0e41f84c',\n", - " '5c2d20de-9344-441a-a1d8-0ca0f4b1fca4',\n", - " 'aee572eb-ec36-4f61-b35e-9dd2c6ab36f2',\n", - " '94ac76ef-42f8-4a03-bffe-f86cbab72408',\n", - " '1be0d43f-ee71-419c-ba50-3aaf024d27c5',\n", - " 'bfd48d92-f28e-48f1-815b-756642cf2b8e',\n", - " '8023464e-5507-4192-ac25-f06fb585e68d',\n", - " '75bde2f0-1b5f-4e05-aa8a-ff2d315a06b4',\n", - " '0b34222f-1dda-4d19-9810-11e75d1272e5',\n", - " '97500c01-1c8f-42ef-8efd-19223c68c46a',\n", - " 'cf8c5ea5-d750-4641-bb5b-e971494b7666',\n", - " '3fd5a660-ab78-4897-a700-1a73b533b0cc',\n", - " '17368981-b502-45e1-abaa-4a9b09c096a8',\n", - " 'de201d3e-8d95-41a6-b8ac-4afba7909331',\n", - " '192f929f-1bba-4e75-b05e-aa8ae937a092',\n", - " '86b39447-2fb0-4899-b1c1-6a89e6bfee0f',\n", - " 'ff5d3a8e-4da6-47f0-b573-036670c5a936',\n", - " 'ef0a7589-2792-43b9-9895-b40d175e5ff9',\n", - " '97338842-0c45-4e94-ac34-81dc74d259fd',\n", - " 'f24aca1a-8ea3-40fc-a868-c300bd6c7c5a',\n", - " '41af6f16-89f8-46c5-89ed-a0853385c24c',\n", - " 'a2f0344b-6ca1-4c2c-ac08-de839b7a61cd',\n", - " '8d23a995-124f-40c5-820a-b86eacb7589f',\n", - " '8846fe55-3eda-4af1-aee1-4c8faa9a5c73',\n", - " 'ae4e13a7-db9f-43d6-b27b-913db7f23b48',\n", - " '05846bed-e2ba-4b45-84eb-aef0fbdb1052',\n", - " 'ac778fa3-e3b2-42e3-8152-c63701895dfe',\n", - " 'c697142b-3c95-440d-b61a-7183c08c171d',\n", - " 'caa1e83f-ae87-4704-9f2e-d1dad34a9cd2',\n", - " '43847c84-8fe5-4457-9f82-526695e2e97a',\n", - " '5521bbc6-cdff-4703-bf3e-63e01596d21c',\n", - " '3c275f68-8f64-452b-bece-3949e2c25b22',\n", - " 'bf581ede-0913-424d-8a2d-216ae9cadcb0',\n", - " '117e9f7d-7c8b-4366-9f84-fbd5dd1171dd',\n", - " 'c116254f-1415-4188-bc93-0487bcbe47ee',\n", - " '40fac159-6763-4a30-addb-975a6e4e69a3',\n", - " '7a181599-2847-4d9b-957f-f4fc7468aea4',\n", - " '4df69e1f-2e1b-4be0-a161-a374f493c4cb',\n", - " '37813f0f-33a6-4adc-ac55-cbca0b74471f',\n", - " 'b0dbb8ef-e049-4393-a159-2e0f9081b0a3',\n", - " '57da8d7d-e0fc-4868-938d-a85f6d6663ff',\n", - " '618dad07-a666-4827-b188-55d5bf31aed0',\n", - " '3e2724b7-4f24-42fd-9def-ca0f9edcd615',\n", - " '41bf1d9b-da94-4e8f-9805-18bf93ff1100',\n", - " 'c5c85271-1e66-4dd2-a338-a2b48a9b07e4',\n", - " '80646915-6af3-423c-a2de-71786272f086',\n", - " 'bf824464-3eae-4343-b0ab-5b565445daec',\n", - " 'd3127b71-3763-4532-8863-0ce42170dfdb',\n", - " '700f2cb1-85bd-4d28-b95f-68f4536185c1',\n", - " 'f79d14f8-9eaa-4f3b-af7c-62095dae4497',\n", - " '196c8830-7cea-485e-a61c-af5863922733',\n", - " '989f5923-c37f-4a18-b61f-3e0b912f01d2',\n", - " '36ea71ca-36e3-4fcd-bdb9-0f41a83378a3',\n", - " 
'3567b36a-f249-4ab2-9392-a734303ce5c1',\n", - " '569ed9cf-6b14-4822-892c-4d19205490f2',\n", - " '7db05e1e-0406-4d9f-bdde-e7351932c25e',\n", - " '818a69ab-de3d-42f4-92d0-2accb701ef8a',\n", - " '96fc14b8-4930-4177-b20c-dba396297577',\n", - " 'ecca8db2-b629-4f05-8018-4aa99dc26842',\n", - " '2cc41514-6e02-4cba-bbb4-c2a69454559b',\n", - " 'b41fa3b0-ea8e-40e5-a0be-1abd27cbc23e',\n", - " '1254e098-a9dc-4eba-a0b9-929ed848ed91',\n", - " '1a9ea687-3318-4ad7-a55a-2e0044583667',\n", - " 'ff5a57dd-69a9-4d9a-9d26-56b9271f1b8e',\n", - " '593a9427-03ae-4b3b-9215-2d14f10341f3',\n", - " '5903b7e0-489f-4011-bbd7-cac82b19423a',\n", - " '48072b10-faae-4609-a298-5c6193488ed9',\n", - " 'ec7cd14c-1248-4285-a090-fc8d0ce96fd4',\n", - " 'a9c6016b-b100-4e46-8b42-a232b9be6459',\n", - " '7e851d53-1a48-4289-8b0a-27834e3e044d',\n", - " '77dde80d-7b70-4ab6-b042-c1e67932f36a',\n", - " '4642025a-d842-4fb3-83ef-76ce90a6a2bb',\n", - " '256c37b7-38b7-4642-a501-7a611e0763ac',\n", - " '65962070-4375-4479-872b-fc3300c3f1af']" + "['809ac718-6a36-4ddc-adc7-adb0f7816a47',\n", + " '2c391b69-896b-4a2f-bf9e-00872a5f31ff',\n", + " 'f548658e-65eb-4653-95c1-485c8176cbf5',\n", + " 'e1fc2f95-c575-46f1-b323-13bdd0da1da4',\n", + " '5049ac41-e24c-43d4-90c4-950ba120fa9c',\n", + " 'd664e558-6dc3-4607-86f1-32868c9c9f47',\n", + " '7c3ad58c-a311-422c-afc6-8217f301a6cd',\n", + " 'cf0542b5-a070-4567-8159-ee948a77fe8e',\n", + " '26c38bfc-243d-499d-8b3c-e5d71aebf588',\n", + " 'a6658e4b-0c19-400c-a682-52b6de933d8e',\n", + " 'dbd05713-bbde-43b9-a89a-dd7a815317a4',\n", + " '430bd79f-7a37-4f72-8e32-7db7d2c039dc',\n", + " '5f2c9a44-855c-4f25-813b-76bf204c4d8b',\n", + " '84ae00a7-9673-4adb-8cc2-6d53774be25a',\n", + " 'dd3f5c10-6ff2-4a7a-b16a-5c2ae4a46cba',\n", + " 'bf6e1bf8-4373-4dfb-9205-c5bf53eb76f8',\n", + " '6aec476a-2550-45bd-9aee-012e913a074e',\n", + " '82c725c6-7cdd-430e-b900-f783034adfb2',\n", + " '45ff73db-3396-4c9a-8e87-448fe4f8d5ac',\n", + " '43a9a36d-ba1d-4a5f-993c-0328c25996e8',\n", + " '82970ba9-e8c7-490d-9013-c5cf62c7f60f',\n", + " '9a1633ba-90d5-4374-a578-c99351086246',\n", + " 'dd67de24-51b7-4ea5-b453-f8e52f209a38',\n", + " 'dfda00dc-e531-44d0-8f65-69cbb9db459b',\n", + " 'd3a06fb7-1ed1-44ee-8722-04e50fe8c05d',\n", + " '72af4cf3-287f-49cc-bba9-72b5cc4a5a0a',\n", + " '95e46dae-ee37-4f0c-818f-83ad28bc9a8e',\n", + " 'a19ce574-47af-4dbf-a136-77d308ff00d6',\n", + " '75762289-e5d0-420e-9754-f800153f82a5',\n", + " '384c7132-6755-4abf-ac1f-05e7b80f329a',\n", + " '63f2ed10-4299-442b-b528-5a575d72e7d6',\n", + " 'f2b3305b-5847-4039-affa-351130bce422',\n", + " '511db69b-a7bb-49b3-903f-2474db1e8bed',\n", + " 'c565e6f2-d42b-4fa4-873b-3214ccc8359e',\n", + " '45aa23c9-e1b7-4b7e-9e84-8e29a52ce6ff',\n", + " 'fa017f87-1b7b-46ed-b111-2dcb296ec68e',\n", + " 'bcb30bbc-a09d-47dc-a3d7-8162f231d006',\n", + " 'bbbc81f1-b06b-4636-ac44-cb4486af15f0',\n", + " '384b77cf-a6c4-4e30-9358-4d77b9592fb4',\n", + " '88151c65-9d1c-4b48-bc85-93d817936aba',\n", + " '2b8afaaa-9094-4d0e-b671-53f8018a03af',\n", + " '59b0d7f3-2e77-469c-9cf7-82be345d7472',\n", + " '0a91fa62-95ea-4cb7-ba1c-b95453b7df31',\n", + " 'aac0cf31-ad27-4148-9149-11a0fb65c274',\n", + " '504e49be-1436-4034-8af3-261a454a5363',\n", + " 'd1ebbf37-979c-4343-a69b-b292f01e2cc4',\n", + " 'd188aac2-bbad-4359-b53c-74282a4017c4',\n", + " '3f21253c-f928-40be-a0a4-a2b182a9f578',\n", + " 'aa7f8bac-6385-4078-8c10-30b7a99ff032',\n", + " '774dffec-5096-4b4d-aa79-d0be8f16f3d9',\n", + " '289f7e36-9700-4661-ad80-7926052ed1f3',\n", + " 'dccf192d-f623-446e-b9be-6d6df98f2fc7',\n", + " 'cbb848d4-196e-4108-9963-fcccb4a8b513',\n", + " 
'b796c0f9-c883-47c3-afd5-0b16d56ca2f5',\n", + " '718c6742-e9b7-45e4-9c89-95727baa2cb3',\n", + " '1b2c307a-b713-474e-93e7-c40f9bc57eab',\n", + " '9ac86b88-fdeb-4c5c-b88c-ba143167b584',\n", + " '30ce78f2-0a09-4f35-abb2-ae48bf2da3f0',\n", + " 'ee6bafe4-9557-401e-9e8e-cad451b8cba0',\n", + " '4df4c508-e67a-4e8b-9446-22996c051538',\n", + " '608d862f-243a-4686-83a0-7726850a7703',\n", + " '28622e79-6257-46ad-8d0b-227078b0192a',\n", + " '797a8ec5-dd36-4892-9e64-8bdc912f1f08',\n", + " '17e43ec9-b702-4f33-a5fc-014ac988d05f',\n", + " 'daccbc57-bcfa-479a-abe7-367d07c93b26',\n", + " '0a3ecebf-afdc-4124-9bac-8132c2b4958a',\n", + " 'a5bdca3f-cc24-470f-b49a-54c435de582a',\n", + " 'f0ccf63a-03f4-4c44-8a90-cc31b68fbf37',\n", + " '0a9427d3-7e33-4068-91c0-95d7bf2484cf',\n", + " 'e4778209-1a5a-45ff-8c1e-f94f839255f1',\n", + " '461c81b5-9fa3-4622-9a10-af8664fee897',\n", + " '850b68b2-eab6-412a-9bc5-c9e971533891',\n", + " '9fb4e8f9-3e0c-4b0e-8365-62bf44fab2bf',\n", + " '9fc852eb-c189-43b7-a61c-358d6c8e6c93',\n", + " 'a5702378-e662-4bd3-be67-4e587495f41f',\n", + " '9ba0b5af-9c8c-4d58-ba9a-bff26ca2571f',\n", + " '53e83397-0b6c-4f6f-8ebf-c7dee136db51',\n", + " 'bc09b5e1-76a2-4904-9959-680a23bf98e4',\n", + " '236cbaf3-c36b-44dd-afbe-6fe396a12f0f',\n", + " 'a9ad4fbe-6366-451a-ae3e-b4fee2bf320f',\n", + " 'ccfcfd3e-ef70-42cd-95f9-c628bfae7747',\n", + " '2f81758a-d8c8-4a1b-852c-67d9ad8f6a54',\n", + " 'f04d8ace-8666-4ec7-a095-a7cc42fca152',\n", + " '18906e6f-a1ea-4bc9-8ce9-11884e320c1c',\n", + " '9077b482-c838-498f-a290-7c38f0df520e',\n", + " 'dd0fe10e-5954-4a87-b0e7-47eb5ed17635',\n", + " '689cc11b-9a6e-4c9f-a4ef-59dd473cae7c',\n", + " '9e58c8ee-ef08-4aba-9cca-92e505503f89',\n", + " '683848ad-8c37-403c-9992-38a219d2239c',\n", + " '9e705e7a-99e4-4914-803c-aefa0f40edbd',\n", + " '717d8e41-250d-47e0-b723-7224fed80f22',\n", + " '8a9efc67-579f-4dcf-ac0e-caeb80245052',\n", + " '2c08568f-ce5d-4ecd-87a6-9baec410cb39',\n", + " 'ef73a66a-32ab-4945-b644-6db8c29562be',\n", + " 'e9da66a8-13d7-40d2-ae36-1689770173c3',\n", + " '99b918bb-4e37-4a46-b8f8-9167fe95e618',\n", + " '8b589cf8-286b-4836-bfe7-909947401920',\n", + " 'd7b6853c-772e-4621-ac41-543ea43ace9d',\n", + " 'c9e79c0d-966c-4e01-b962-ac893773ad75',\n", + " '2e8a083e-6307-4428-83cd-65b436cebbf9',\n", + " '932c94ed-36b5-40e7-a270-7bea414ad537',\n", + " '65c85bd7-0338-4e1a-954f-794f8bf85902',\n", + " '7c438b8a-f7e7-4d0b-b376-9a2194a1246c',\n", + " '6a5616f9-075c-47d0-8531-2a7dab39bd57',\n", + " '59d6b5e3-b2a6-4aa5-bad3-60fa38236e86',\n", + " '93d6af54-7141-4616-8894-e5440446f3df',\n", + " 'd1c00188-e315-41a4-bb97-e3c37b7d2beb',\n", + " '9d6fb7c5-b366-4516-9d36-be3854360df3',\n", + " '6a5eac17-180c-474e-927a-66b2c0f880e3',\n", + " '3a68ad92-8ab8-44f3-8f1c-1c852575620b',\n", + " '0346130f-92a6-4d52-b746-741adfe8e604',\n", + " 'dcf5b04b-5990-40ac-9685-d3f67c896ba4',\n", + " '93a12904-61bd-4f2a-af50-691be41d14b5',\n", + " '8dc1d6b6-af25-42f7-b2bb-77dcfc1127ee',\n", + " 'e301f0dc-5399-4c8c-bf10-a87f344015cc',\n", + " 'b6a26a83-e757-4b57-bcd3-6e7bd9e5b1bf',\n", + " '6013f6a2-c398-46c6-954b-53882646f78b',\n", + " '96e4c7f7-5a57-4815-810a-5e3646704397',\n", + " '9f48b86c-becd-44de-989f-540764d8299b',\n", + " '59b136fa-4b48-45ed-b951-fc514085e95d',\n", + " 'ce095f0f-f663-423e-a289-32657bc386a9',\n", + " '584ccb6f-e675-4879-a439-a68c17afd92d',\n", + " '455466cd-ce88-4faa-966d-955b3144611e',\n", + " '3527b28d-f65e-4a90-ac86-aac1ec29aea2',\n", + " 'fc59f9d9-e2c6-4a6c-becd-64cb9fdc120f',\n", + " '32df0391-7b16-4f60-8a22-08ab3aaa6777',\n", + " '426c06c3-5550-46dd-bf4c-dc91c30de60a',\n", + " 
'146a4059-7fb6-4a7b-8665-2bf2b2805d3e',\n", + " '321bb4f3-eb98-4742-beb0-70b4496a8ab7',\n", + " 'b39d2d7c-4eea-4df1-85fd-eb286dee41f3',\n", + " 'c858d614-98f3-4c96-9be1-5ba3088643b8',\n", + " '355a6b60-f7b1-4d68-96da-45d03b1f1bb5',\n", + " '6634ee11-e63f-46f1-82f1-5d596ba4f832']" ] }, - "execution_count": 23, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1069,17 +924,21 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 33, "id": "0c3a0a6b-4640-4036-863e-77a1a715a0e3", "metadata": { "scrolled": true }, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "['https://www.digitalcommonwealth.org/search/commonwealth:dv144791c', 'https://www.digitalcommonwealth.org/search/commonwealth:rv048f292', 'https://www.digitalcommonwealth.org/search/commonwealth:9019vm69m', 'https://www.digitalcommonwealth.org/search/commonwealth:dv1441451', 'https://www.digitalcommonwealth.org/search/commonwealth:w3764472d', 'https://www.digitalcommonwealth.org/search/commonwealth:6w924s12f', 'https://www.digitalcommonwealth.org/search/commonwealth:6w924r91w', 'https://www.digitalcommonwealth.org/search/commonwealth:wm118g867']\n" + "ename": "AttributeError", + "evalue": "'list' object has no attribute 'invoke'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[33], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m retriever \u001b[38;5;241m=\u001b[39m vector_store\u001b[38;5;241m.\u001b[39msimilarity_search_with_score(\n\u001b[1;32m 2\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mManuscripts\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 3\u001b[0m k\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m3\u001b[39m\n\u001b[1;32m 4\u001b[0m )\n\u001b[0;32m----> 5\u001b[0m r \u001b[38;5;241m=\u001b[39m retriever\u001b[38;5;241m.\u001b[39minvoke(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mJohn Bishop Estlin\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 6\u001b[0m r_list \u001b[38;5;241m=\u001b[39m [x\u001b[38;5;241m.\u001b[39mmetadata[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mURL\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m r]\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28mprint\u001b[39m(r_list)\n", + "\u001b[0;31mAttributeError\u001b[0m: 'list' object has no attribute 'invoke'" ] } ], @@ -1093,6 +952,51 @@ "print(r_list)" ] }, + { + "cell_type": "markdown", + "id": "71698bba-0e8d-4bbc-8412-81c225c8cb6e", + "metadata": {}, + "source": [ + "Save and load the vector store" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "273778f9-cb7d-4bb9-acd0-16841adf8344", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'metadata' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[13], line 12\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;66;03m# Save metadata\u001b[39;00m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(store_dir, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmetadata.pkl\u001b[39m\u001b[38;5;124m\"\u001b[39m), 
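The AttributeError shown above comes from mixing two retrieval styles: similarity_search_with_score runs the search immediately and returns a plain list of (Document, score) tuples, so there is nothing left to .invoke. A small sketch of both working patterns, assuming the vector_store built above and the URL metadata field attached to each Document:

# Pattern 1: direct search. Returns [(Document, L2 distance), ...]; lower distance = closer.
hits = vector_store.similarity_search_with_score("John Bishop Estlin", k=3)
print([doc.metadata["URL"] for doc, _distance in hits])

# Pattern 2: wrap the store in a retriever object, then invoke it with the query string.
retriever = vector_store.as_retriever(search_kwargs={"k": 3})
docs = retriever.invoke("John Bishop Estlin")
print([doc.metadata["URL"] for doc in docs])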
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwb\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[0;32m---> 12\u001b[0m pickle\u001b[38;5;241m.\u001b[39mdump(metadata, f)\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFAISS index and metadata stored in \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstore_dir\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mNameError\u001b[0m: name 'metadata' is not defined" + ] + } + ], + "source": [ + "vector_store.save_local(\"faiss_index\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "98740c69-5687-4b0c-a4ae-4233208f6e22", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings import HuggingFaceEmbeddings\n", + "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", + "\n", + "vector_store = FAISS.load_local(\n", + " \"faiss_index\", embeddings, allow_dangerous_deserialization=True\n", + ")" + ] + }, { "cell_type": "markdown", "id": "d3a257cc-c687-4fc5-91a7-0c3048a49e70", @@ -1103,37 +1007,37 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 55, "id": "013d5fca-c621-4a7e-ac0e-ad5c9aba577f", "metadata": {}, "outputs": [], "source": [ - "weights2 = {\n", - " \"title\": 1.0,\n", - " \"subtitle\": 0.95,\n", - " \"title_alt\": 0.9,\n", - " \"abstract\": 0.85,\n", - " \"subject_facet\": 0.8,\n", - " \"subject_geographic\": 0.75,\n", - " \"genre\": 0.7,\n", - " \"genre_specific\": 0.65,\n", - " \"name_facet\": 0.6,\n", - " \"name_role\": 0.55,\n", - " \"date_human\": 0.5,\n", - " \"date_start\": 0.45,\n", - " \"date_end\": 0.4,\n", - " \"publisher\": 0.35,\n", - " \"collection_name\": 0.3,\n", - " \"physical_location\": 0.25,\n", - " \"related_item_host\": 0.2,\n", - " \"type_of_resource\": 0.15,\n", - " \"URL\": 0.1\n", + "weights = {\n", + " \"title\": 1000,\n", + " \"subtitle\": 500,\n", + " \"title_alt\": 500,\n", + " \"abstract\": 30,\n", + " \"subject_facet\": 1,\n", + " \"subject_geographic\": 1,\n", + " \"genre\": 1,\n", + " \"genre_specific\": 1,\n", + " \"name_facet\": 1,\n", + " \"name_role\": 1,\n", + " \"date_human\": 1,\n", + " \"date_start\": 1,\n", + " \"date_end\": 1,\n", + " \"publisher\": 1,\n", + " \"collection_name\": 1,\n", + " \"physical_location\": 1,\n", + " \"related_item_host\": 1,\n", + " \"type_of_resource\": 1,\n", + " \"URL\": 0.0\n", "}\n" ] }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 78, "id": "cc1f04d1-f553-46ae-bfc6-248125f62423", "metadata": {}, "outputs": [], @@ -1143,9 +1047,7 @@ "import numpy as np\n", "from langchain.embeddings import HuggingFaceEmbeddings\n", "\n", - "# our vector store:\n", - "\n", - "# embedding model\n", + "model = SentenceTransformer('all-MiniLM-L6-v2')\n", "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", "\n", "def compute_relevance_score(metadata_value, query):\n", @@ -1172,7 +1074,7 @@ "\n", "\n", "\n", - "def rerank_documents(documents, query, weights, vector_store, k=10):\n", + "def rerank_documents(query, weights, vector_store, k=10):\n", " \"\"\"\n", " Rerank documents based on metadata relevance scores and FAISS vector similarity scores.\n", "\n", @@ -1186,31 +1088,32 @@ " list: Reranked documents in descending order of relevance.\n", " \"\"\"\n", "\n", - " final_score = 0\n", - "\n", " reranked_results = []\n", + " total = sum(weights.values())\n", + " # returns the 
relevant documents from the query\n", " returned_docs = vector_store.similarity_search_with_score(query, k)\n", " for doc in returned_docs:\n", - " final_score = doc[1]\n", + " final_score = 0\n", " # Add weighted relevance scores for each metadata field\n", " for field, weight in weights.items():\n", " metadata_value = doc[0].metadata.get(field, \"\") # Safely get metadata field value\n", " relevance_score = compute_relevance_score(metadata_value, query)\n", - " final_score += weight * relevance_score\n", + " #print(f\"relevance_score: {relevance_score}\")\n", + " final_score += (weight * relevance_score) \n", "\n", - " reranked_results.append((doc, final_score))\n", + " reranked_results.append((doc, final_score / total))\n", "\n", " # Sort documents by the final score in descending order\n", " reranked_results.sort(key=lambda x: x[1], reverse=True)\n", - " return [doc for doc, score in reranked_results]\n", + " return [(doc, score) for doc, score in reranked_results]\n", "\n", "\n", - "docs = rerank_documents(documents, \"Newspaper\", weights2, vector_store)" + "docs = rerank_documents(\"Newspaper\", weights, vector_store)" ] }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 79, "id": "2d9172aa-6c15-4c90-856e-d0ee53100721", "metadata": {}, "outputs": [ @@ -1218,27 +1121,45 @@ "name": "stdout", "output_type": "stream", "text": [ - "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:gf06jp23d', 'Reranked score: 1.1741459369659424')\n", - "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:k356cp803', 'Reranked score: 1.161521077156067')\n", - "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:t435k083s', 'Reranked score: 1.1445826292037964')\n", - "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:8s45sw212', 'Reranked score: 1.1416451930999756')\n", - "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:9p292v62n', 'Reranked score: 1.1416230201721191')\n", - "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:z890vf594', 'Reranked score: 1.1343271732330322')\n", - "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:pv63jm38v', 'Reranked score: 1.0997507572174072')\n", - "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:05744b168', 'Reranked score: 1.0604684352874756')\n", - "(\"Thomas's Massachusetts Spy, or, Worcester Gazette\", 'https://www.digitalcommonwealth.org/search/commonwealth:v405x072q', 'Reranked score: 1.0452649593353271')\n", - "(\"Thomas's Massachusetts Spy, or, Worcester Gazette\", 'https://www.digitalcommonwealth.org/search/commonwealth:1831h918x', 'Reranked score: 1.024101972579956')\n" + "('Thanksgiving', 'https://www.digitalcommonwealth.org/search/commonwealth:02876465m', 'Reranked score: 0.21417763510649684')\n", + "('Thanksgiving', 'https://www.digitalcommonwealth.org/search/commonwealth:jd478671b', 'Reranked score: 0.21414245699337317')\n", + "('Thanks for high school articles', 'https://www.digitalcommonwealth.org/search/commonwealth:8910r4424', 'Reranked score: 0.20574750553692622')\n", + "('T.H. Jones, Charlotte, N.C., autograph letter ...', 'https://www.digitalcommonwealth.org/search/commonwealth:w3764603d', 'Reranked score: 0.16508627951932514')\n", + "('Theodore C. 
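Because rerank_documents keeps each raw hit exactly as similarity_search_with_score returned it, every element of its result is nested: ((Document, FAISS distance), weighted_score). That is why the next cell indexes docs[i][0][0]. A short usage sketch that unpacks the structure and sorts by the numeric score, which is more robust than sorting the formatted "Reranked score: ..." strings:

docs = rerank_documents("Newspaper", weights, vector_store, k=10)

rows = []
for (document, faiss_distance), weighted_score in docs:
    rows.append((document.metadata["title"], document.metadata["URL"], weighted_score))

rows.sort(key=lambda row: row[2], reverse=True)   # sort on the float, not a string
for title, url, score in rows:
    print(f"{title} | {url} | reranked score: {score:.4f}")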
Tharin, Charleston, S.C., autograp...', 'https://www.digitalcommonwealth.org/search/commonwealth:9k41zk061', 'Reranked score: 0.14404614119555387')\n", + "('T.H. Marshall, Graniteville, S.C., autograph l...', 'https://www.digitalcommonwealth.org/search/commonwealth:ws859m407', 'Reranked score: 0.14002441968440776')\n", + "('Theodore C. Tharin, Grumesville, S.C., autogra...', 'https://www.digitalcommonwealth.org/search/commonwealth:w3764352q', 'Reranked score: 0.138896407891218')\n", + "('Theodore C. Tharin, Grumesville, S.C. [?], aut...', 'https://www.digitalcommonwealth.org/search/commonwealth:w3764306m', 'Reranked score: 0.13877071803271654')\n", + "('Theodore C. Tharin, Grumesville, S.C., autogra...', 'https://www.digitalcommonwealth.org/search/commonwealth:w3764957b', 'Reranked score: 0.1382296937873219')\n", + "('\"The Refuge of Oppression,\" from David S. Gran...', 'https://www.digitalcommonwealth.org/search/commonwealth:dv1441451', 'Reranked score: 0.12401763978186418')\n" ] } ], "source": [ "#print([docs[i].metadata['title'] for i in range(len(docs))])\n", - "docs_list = [(docs[i][0].metadata['title'], docs[i][0].metadata['URL'], f\"Reranked score: {docs[i][1]}\") for i in range(len(docs))]\n", + "docs_list = [(docs[i][0][0].metadata['title'], docs[i][0][0].metadata['URL'], f\"Reranked score: {docs[i][1]}\") for i in range(len(docs))]\n", "docs_list.sort(key=lambda x: x[2], reverse=True)\n", "for doc in docs_list:\n", " print(doc)" ] }, + { + "cell_type": "code", + "execution_count": 41, + "id": "83c1c3f2-91b4-4647-b93c-09a0af5b43a8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100\n" + ] + } + ], + "source": [ + "print(len(docs_list))" + ] + }, { "cell_type": "markdown", "id": "07cdf844-72c6-41ef-bade-9afb52bceed8", @@ -1668,17 +1589,6 @@ "### Making the Query" ] }, - { - "cell_type": "code", - "execution_count": 82, - "id": "5b01faae-fdbf-420d-9b79-e1f31e84baf8", - "metadata": {}, - "outputs": [], - "source": [ - "import os \n", - "os.environ[\"OPENAI_API_KEY\"] = \"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\"" - ] - }, { "cell_type": "code", "execution_count": 94, From d33e63675ac3434b551c168bfa3fadbd9a7f4b35 Mon Sep 17 00:00:00 2001 From: Brandon Vargus <45298256+b3v@users.noreply.github.com> Date: Thu, 5 Dec 2024 12:42:51 -0500 Subject: [PATCH 5/6] Create streamlit-rag-app.py --- streamlit-rag-app.py | 115 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 streamlit-rag-app.py diff --git a/streamlit-rag-app.py b/streamlit-rag-app.py new file mode 100644 index 0000000..3c7f55a --- /dev/null +++ b/streamlit-rag-app.py @@ -0,0 +1,115 @@ +import streamlit as st +import os +import json +from dotenv import load_dotenv + +from langchain.chains import RetrievalQA +from langchain_community.vectorstores import FAISS +from langchain.text_splitter import CharacterTextSplitter +from langchain_openai import ChatOpenAI +from langchain.schema import Document +from langchain_huggingface import HuggingFaceEmbeddings + +# Load environment variables +load_dotenv() + +# Get the OpenAI API key from the environment +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") +if not OPENAI_API_KEY: + st.error("OPENAI_API_KEY is not set. 
Please add it to your .env file.") + +# takes a few minutes to load +file = open("bpl_data.json") + +bpl = json.load(file) + +#file = open("bpl_data.json") + +#bpl = json.load(file) + +# Initialize session state variables +if 'vector_store' not in st.session_state: + st.session_state.vector_store = None +if 'qa_chain' not in st.session_state: + st.session_state.qa_chain = None + + + +def setup_qa_chain(vector_store): + """Set up the QA chain with a retriever.""" + retriever = vector_store.as_retriever(search_kwargs={"k": 3}) + llm = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY) + qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True) + + return qa_chain + +def setup_custom_chain(vector_store): + retriever = vector_store.as_retriever(search_kwargs={"k": 3}) + llm = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY) + docs = retriever.invoke() + +def main(): + + # Set page title and header + st.set_page_config(page_title="LibRAG", page_icon="📚") + st.title("Boston Public Library Database 📚") + + embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") + + # Sidebar for initialization + # st.sidebar.header("Initialize Knowledge Base") + # if st.sidebar.button("Load Data"): + # try: + # st.session_state.vector_store = FAISS.load_local( + # "vector-store", embeddings, allow_dangerous_deserialization=True + # ) + # st.session_state.qa_chain = setup_qa_chain(st.session_state.vector_store) + # st.sidebar.success("Knowledge base loaded successfully!") + # except Exception as e: + # st.sidebar.error(f"Error loading data: {e}") + + st.session_state.vector_store = FAISS.load_local("vector-store", embeddings, allow_dangerous_deserialization=True) + + st.session_state.qa_chain = setup_qa_chain(st.session_state.vector_store) + + # Query input and processing + st.header("Ask a Question") + query = st.text_input("Enter your question about BPL's database") + + if query: + # Check if vector store and QA chain are initialized + if st.session_state.qa_chain is None: + st.warning("Please load the knowledge base first using the sidebar.") + else: + # Run the query + try: + response = st.session_state.qa_chain({"query": query}) + + # Display answer + st.subheader("Answer") + st.write(response["result"]) + + # Display sources + st.subheader("Sources") + sources = response["source_documents"] + for i, doc in enumerate(sources, 1): + source = doc.metadata["source"] + + abstract = None + + # find the specific source: + for j in range(len(bpl["Data"])): + ID = bpl['Data'][j]["id"] + if doc.metadata['source'] == ID: + abstract = bpl["Data"][j]['attributes']['abstract_tsi'] + break + + with st.expander(f"Source {i}"): + st.write(f"**Content:** {abstract}") + st.write(f"**URL:** https://www.digitalcommonwealth.org/search/{doc.metadata['source']}") + + except Exception as e: + st.error(f"An error occurred: {e}") + +if __name__ == "__main__": + main() From 4808a1dc9c81880aabf924f6cab89a47f58b7e06 Mon Sep 17 00:00:00 2001 From: Brandon Vargus Date: Mon, 16 Dec 2024 19:09:38 -0500 Subject: [PATCH 6/6] added deployment notebook --- Deployment/Deployment.ipynb | 1832 +++++++++++++++++++++++++++++++++++ 1 file changed, 1832 insertions(+) create mode 100644 Deployment/Deployment.ipynb diff --git a/Deployment/Deployment.ipynb b/Deployment/Deployment.ipynb new file mode 100644 index 0000000..a027a64 --- /dev/null +++ b/Deployment/Deployment.ipynb @@ -0,0 +1,1832 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": 
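setup_custom_chain in streamlit-rag-app.py above is an unfinished stub: retriever.invoke() is called without a query and the function returns nothing, so the app only ever uses setup_qa_chain. The sketch below is a guess at the intent rather than the repo's method; it reuses the k=3 retriever and gpt-3.5-turbo model from the file, and the function name and prompt wording are ours.

from langchain_openai import ChatOpenAI

def answer_with_custom_chain(vector_store, query: str, openai_api_key: str):
    """Retrieve the top records, stuff them into a prompt, and ask the LLM directly."""
    retriever = vector_store.as_retriever(search_kwargs={"k": 3})
    docs = retriever.invoke(query)          # the stub omitted the query argument

    context = "\n\n".join(
        f"[{d.metadata.get('source', 'unknown')}]\n{d.page_content}" for d in docs
    )
    llm = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=openai_api_key)
    prompt = (
        "Answer the question using only the catalog records below.\n\n"
        f"{context}\n\nQuestion: {query}"
    )
    return llm.invoke(prompt).content, docs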
"2e7eb6e2-0a1b-42ea-8127-a51f13b4b4b0", + "metadata": {}, + "source": [ + "# LibRAG Deployment Phase" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "3934dc57-c4c3-4892-bcf4-bce4aa5c48af", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import json\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "5a24d843-3692-49d5-8167-af46cf4a1f5a", + "metadata": {}, + "source": [ + "### We are going to ensure that we have our data downloaded from the SCC.\n", + "### We are going to download one interval of the full text, as well as the entire metadata file" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "b50d8c5f-dfea-454f-860e-13315a9c2fea", + "metadata": {}, + "outputs": [], + "source": [ + "# replace with sample_full_text.json\n", + "file = open(\"../EDA Phase/bpl-digital-commonwealth/ft_13_checkpoint_10_133.json\")\n", + "\n", + "full_text = json.load(file)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1e4c5601-648f-4ff1-a5ce-d8823c46f884", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "133\n" + ] + } + ], + "source": [ + "print(len(full_text))" + ] + }, + { + "cell_type": "markdown", + "id": "1148a15c-0965-4613-bdb4-15e74467fd16", + "metadata": {}, + "source": [ + "Here's how to access the text documents:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "da05e8ae-71f6-4ab3-b2fa-0193b77d6262", + "metadata": {}, + "outputs": [], + "source": [ + "print(full_text['commonwealth:w3764603d']['text'])" + ] + }, + { + "cell_type": "markdown", + "id": "bb3f0d7e-a982-4e18-a9eb-783df449ff09", + "metadata": {}, + "source": [ + "### Create Metadata Dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "be5c55ae-c7a5-4aec-9fe5-d79ff6cc0592", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(bpl_metadata['Data'])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "91e29caa-9e83-4ddf-a206-53b66012c48a", + "metadata": {}, + "outputs": [], + "source": [ + "df.drop(columns=df.columns[0], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "7e84b8ed-2f7b-4a1c-bb95-693d9ca8a846", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
typelinksidsystem_create_dtsisystem_modified_dtsicurator_model_ssicurator_model_suffix_ssititle_info_primary_tsigenre_basic_ssimgenre_specific_ssim...storage_key_base_ssidentifier_issn_ssimfrequency_tsicontained_by_ssinote_credits_tsimidentifier_isbn_ssimidentifier_music_publisher_ssimnote_arrangement_tsimtranscription_ark_id_ssitranscription_key_base_ss
0DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68md23x2021-03-04T00:13:09Z2021-09-02T20:40:00ZCurator::DigitalObjectDigitalObjectمن فضلكم توقفوا الأشخاص الذين ارتكبوا أسوأ الج...[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68m844v2021-03-03T23:58:44Z2021-09-02T20:21:32ZCurator::DigitalObjectDigitalObject海员们 : 要警惕航运事故[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68mb49n2021-03-04T00:06:25Z2021-09-02T20:30:29ZCurator::DigitalObjectDigitalObject人間としての尊厳を保てる : 生活賃金を[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68mc60v2021-03-04T00:10:40Z2021-09-02T20:35:20ZCurator::DigitalObjectDigitalObject野火[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68mc72n2021-03-04T00:11:07Z2021-09-02T20:35:52ZCurator::DigitalObjectDigitalObject野火[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
5DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68mc9922021-03-04T00:12:14Z2021-09-02T20:36:59ZCurator::DigitalObjectDigitalObject團結 抗強權[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
6DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68m804w2021-03-03T23:57:00Z2021-09-02T20:19:35ZCurator::DigitalObjectDigitalObject大队的夜晩 (年画) 史惠芳作[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
7DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68m83652021-03-03T23:58:27Z2021-09-02T20:21:12ZCurator::DigitalObjectDigitalObjectморякам[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
8DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth:8k71nz9662015-09-14T22:06:01Z2022-07-08T19:59:21ZCurator::DigitalObjectDigitalObjectA[Prints]NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
9DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth:8k71p000r2015-09-14T22:06:33Z2022-07-08T19:59:21ZCurator::DigitalObjectDigitalObjectE[Prints]NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

10 rows × 140 columns

\n", + "
" + ], + "text/plain": [ + " type links \\\n", + "0 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", + "1 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", + "2 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", + "3 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", + "4 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", + "5 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", + "6 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", + "7 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", + "8 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", + "9 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", + "\n", + " id system_create_dtsi system_modified_dtsi \\\n", + "0 commonwealth-oai:xp68md23x 2021-03-04T00:13:09Z 2021-09-02T20:40:00Z \n", + "1 commonwealth-oai:xp68m844v 2021-03-03T23:58:44Z 2021-09-02T20:21:32Z \n", + "2 commonwealth-oai:xp68mb49n 2021-03-04T00:06:25Z 2021-09-02T20:30:29Z \n", + "3 commonwealth-oai:xp68mc60v 2021-03-04T00:10:40Z 2021-09-02T20:35:20Z \n", + "4 commonwealth-oai:xp68mc72n 2021-03-04T00:11:07Z 2021-09-02T20:35:52Z \n", + "5 commonwealth-oai:xp68mc992 2021-03-04T00:12:14Z 2021-09-02T20:36:59Z \n", + "6 commonwealth-oai:xp68m804w 2021-03-03T23:57:00Z 2021-09-02T20:19:35Z \n", + "7 commonwealth-oai:xp68m8365 2021-03-03T23:58:27Z 2021-09-02T20:21:12Z \n", + "8 commonwealth:8k71nz966 2015-09-14T22:06:01Z 2022-07-08T19:59:21Z \n", + "9 commonwealth:8k71p000r 2015-09-14T22:06:33Z 2022-07-08T19:59:21Z \n", + "\n", + " curator_model_ssi curator_model_suffix_ssi \\\n", + "0 Curator::DigitalObject DigitalObject \n", + "1 Curator::DigitalObject DigitalObject \n", + "2 Curator::DigitalObject DigitalObject \n", + "3 Curator::DigitalObject DigitalObject \n", + "4 Curator::DigitalObject DigitalObject \n", + "5 Curator::DigitalObject DigitalObject \n", + "6 Curator::DigitalObject DigitalObject \n", + "7 Curator::DigitalObject DigitalObject \n", + "8 Curator::DigitalObject DigitalObject \n", + "9 Curator::DigitalObject DigitalObject \n", + "\n", + " title_info_primary_tsi genre_basic_ssim \\\n", + "0 من فضلكم توقفوا الأشخاص الذين ارتكبوا أسوأ الج... [Posters] \n", + "1 海员们 : 要警惕航运事故 [Posters] \n", + "2 人間としての尊厳を保てる : 生活賃金を [Posters] \n", + "3 野火 [Posters] \n", + "4 野火 [Posters] \n", + "5 團結 抗強權 [Posters] \n", + "6 大队的夜晩 (年画) 史惠芳作 [Posters] \n", + "7 морякам [Posters] \n", + "8 A [Prints] \n", + "9 E [Prints] \n", + "\n", + " genre_specific_ssim ... storage_key_base_ss identifier_issn_ssim \\\n", + "0 [Political posters] ... NaN NaN \n", + "1 [Political posters] ... NaN NaN \n", + "2 [Political posters] ... NaN NaN \n", + "3 [Political posters] ... NaN NaN \n", + "4 [Political posters] ... NaN NaN \n", + "5 [Political posters] ... NaN NaN \n", + "6 [Political posters] ... NaN NaN \n", + "7 [Political posters] ... NaN NaN \n", + "8 NaN ... NaN NaN \n", + "9 NaN ... 
NaN NaN \n", + "\n", + " frequency_tsi contained_by_ssi note_credits_tsim identifier_isbn_ssim \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "5 NaN NaN NaN NaN \n", + "6 NaN NaN NaN NaN \n", + "7 NaN NaN NaN NaN \n", + "8 NaN NaN NaN NaN \n", + "9 NaN NaN NaN NaN \n", + "\n", + " identifier_music_publisher_ssim note_arrangement_tsim \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "5 NaN NaN \n", + "6 NaN NaN \n", + "7 NaN NaN \n", + "8 NaN NaN \n", + "9 NaN NaN \n", + "\n", + " transcription_ark_id_ssi transcription_key_base_ss \n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "5 NaN NaN \n", + "6 NaN NaN \n", + "7 NaN NaN \n", + "8 NaN NaN \n", + "9 NaN NaN \n", + "\n", + "[10 rows x 140 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_attributes = pd.json_normalize(df['attributes'])\n", + "df_attributes = pd.concat([df.drop(columns=['attributes']), df_attributes], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "eed31425-0d0d-46e7-ae69-45637110e2c7", + "metadata": {}, + "outputs": [], + "source": [ + "df_attributes.to_csv(\"metadata_attributes.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "ce755e72-348e-45b3-8cb4-7e0202818e16", + "metadata": {}, + "source": [ + "### Optionally, read the csv if it is already downloaded" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "49e1b410-a7b0-4ef3-bbb8-2c240e568a73", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/ipykernel_8401/3787498575.py:1: DtypeWarning: Columns (10,16,17,18,20,21,22,23,24,25,27,29,33,34,36,41,42,43,44,54,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,128,129,130,131,132,133,134,135,136,137,138,139,140) have mixed types. 
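The "Create Metadata Dataframe" cells above use bpl_metadata, which is never loaded anywhere in this notebook, so pd.DataFrame(bpl_metadata['Data']) raises a NameError on a fresh kernel. A sketch of the missing load step is below; the bpl_data.json filename is an assumption borrowed from streamlit-rag-app.py (which reads a file with the same {'Data': [...]} layout), and low_memory=False is the fix suggested by the DtypeWarning above when re-reading the saved CSV.

import json
import pandas as pd

# Assumed filename; substitute the metadata dump actually pulled from the SCC.
with open("bpl_data.json") as f:
    bpl_metadata = json.load(f)

df = pd.DataFrame(bpl_metadata["Data"])
df_attributes = pd.concat(
    [df.drop(columns=["attributes"]), pd.json_normalize(df["attributes"])], axis=1
)

# If metadata_attributes.csv already exists, reload it without the mixed-type warning:
# df_attributes = pd.read_csv("metadata_attributes.csv", low_memory=False)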
Specify dtype option on import or set low_memory=False.\n", + " df_attributes = pd.read_csv(\"metadata_attributes.csv\")\n" + ] + } + ], + "source": [ + "df_attributes = pd.read_csv(\"metadata_attributes.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "1c6d1c2d-16ad-43a0-844a-72238901c07d", + "metadata": {}, + "source": [ + "### Turn full text into Documents type" + ] + }, + { + "cell_type": "markdown", + "id": "c74fe245-886d-4e9b-a554-77308c75e44b", + "metadata": {}, + "source": [ + "Now we will make Document objects with the important metadata attributes" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "15d83b93-6840-4618-ac40-1c7de9e5cc1e", + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "def get_title(text):\n", + " match = re.search(r'\\d+\\s+(.+?)\\n', text)\n", + "\n", + " # Extracting and printing the title if there's a match\n", + " if match:\n", + " title = match.group(1)\n", + " return title" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c5947005-6e02-4130-9d4e-5487cd389dd5", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.schema import Document\n", + "documents = []\n", + "for doc in full_text:\n", + " title = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"title_info_primary_tsi\"]))\n", + " ID = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"id\"]))\n", + " abstract = str(df_attributes.loc[df_attributes[\"id\"] == doc, \"abstract_tsi\"])\n", + " title_subtitle = str(df_attributes.loc[df_attributes[\"id\"] == doc, \"title_info_primary_subtitle_tsi\"])\n", + "\n", + " documents.append(Document(page_content=\"\", metadata={'title':title, 'ID':ID, 'abstract':abstract, 'title_subtitle':title_subtitle}))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "6c3f5614-cc45-4fb4-9b13-1fd0361da1b6", + "metadata": {}, + "outputs": [], + "source": [ + "# Turn the BPL data into a Document\n", + "from langchain.schema import Document\n", + "\n", + "documents = []\n", + "\n", + "for doc in full_text:\n", + " # Extract metadata fields and apply get_title()\n", + " title = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"title_info_primary_tsi\"]))\n", + " title_subtitle = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"title_info_primary_subtitle_tsi\"]))\n", + " title_alt = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"title_info_alternative_tsim\"]))\n", + " abstract = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"abstract_tsi\"]))\n", + " subject_facet = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"subject_facet_ssim\"]))\n", + " subject_geographic = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"subject_geographic_sim\"]))\n", + " genre = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"genre_basic_ssim\"]))\n", + " genre_specific = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"genre_specific_ssim\"]))\n", + " name_facet = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"name_facet_ssim\"]))\n", + " name_role = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"name_role_tsim\"]))\n", + " date_human = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"date_tsim\"]))\n", + " date_start = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"date_start_dtsi\"]))\n", + " date_end = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"date_end_dtsi\"]))\n", + 
" publisher = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"publisher_tsi\"]))\n", + " collection_name = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"collection_name_ssim\"]))\n", + " physical_location = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"physical_location_ssim\"]))\n", + " related_item_host = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"related_item_host_ssim\"]))\n", + " type_of_resource = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"type_of_resource_ssim\"]))\n", + " URL = \"https://www.digitalcommonwealth.org/search/\" + get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"id\"]))\n", + " \n", + " # Create Document with metadata\n", + " documents.append(Document(\n", + " page_content=\"\",\n", + " metadata={\n", + " \"title\": title,\n", + " \"subtitle\": title_subtitle,\n", + " \"title_alt\": title_alt,\n", + " \"abstract\": abstract,\n", + " \"subject_facet\": subject_facet,\n", + " \"subject_geographic\": subject_geographic,\n", + " \"genre\": genre,\n", + " \"genre_specific\": genre_specific,\n", + " \"name_facet\": name_facet,\n", + " \"name_role\": name_role,\n", + " \"date_human\": date_human,\n", + " \"date_start\": date_start,\n", + " \"date_end\": date_end,\n", + " \"publisher\": publisher,\n", + " \"collection_name\": collection_name,\n", + " \"physical_location\": physical_location,\n", + " \"related_item_host\": related_item_host,\n", + " \"type_of_resource\": type_of_resource,\n", + " \"URL\": URL\n", + " }\n", + " ))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "b58b4530-27e4-4ed4-80be-4ee240892480", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['title', 'subtitle', 'title_alt', 'abstract', 'subject_facet', 'subject_geographic', 'genre', 'genre_specific', 'name_facet', 'name_role', 'date_human', 'date_start', 'date_end', 'publisher', 'collection_name', 'physical_location', 'related_item_host', 'type_of_resource', 'URL'])" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents[-1].metadata.keys()" + ] + }, + { + "cell_type": "markdown", + "id": "6580d4c1-9ab0-44c8-9ea7-cbeb80934f4b", + "metadata": {}, + "source": [ + "# RAG Pipeline" + ] + }, + { + "cell_type": "markdown", + "id": "895258be-7bf4-4d8d-8f5a-79d80dfe8c36", + "metadata": {}, + "source": [ + "### Using FAISS Vector Store" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1b3815e-dec5-4f22-b863-d91a57b5ccad", + "metadata": {}, + "outputs": [], + "source": [ + "#from langchain_ollama import OllamaEmbeddings\n", + "from langchain_openai import OpenAIEmbeddings\n", + "import faiss\n", + "from langchain_community.docstore.in_memory import InMemoryDocstore\n", + "from langchain_community.vectorstores import FAISS\n", + "from langchain.embeddings import HuggingFaceEmbeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "03554a37-d142-45eb-be33-a333929b927d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/ipykernel_7740/4140823313.py:2: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-huggingface package and should be used instead. 
To use it run `pip install -U :class:`~langchain-huggingface` and import as `from :class:`~langchain_huggingface import HuggingFaceEmbeddings``.\n", + " embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", + "2024-11-26 15:04:19.450227: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + ] + } + ], + "source": [ + "#embeddings = OpenAIEmbeddings(model=\"text-embedding-3-large\", dimensions=3072)\n", + "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "0bea78d8-4b72-4da0-a50d-89e0fe509454", + "metadata": {}, + "outputs": [], + "source": [ + "index = faiss.IndexFlatL2(len(embeddings.embed_query(\"hello world\")))\n", + "\n", + "vector_store = FAISS(\n", + " embedding_function=embeddings,\n", + " docstore=InMemoryDocstore(),\n", + " index=index,\n", + " index_to_docstore_id={},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "b1bee292-778a-480e-8eb3-5cc37587ce85", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['809ac718-6a36-4ddc-adc7-adb0f7816a47',\n", + " '2c391b69-896b-4a2f-bf9e-00872a5f31ff',\n", + " 'f548658e-65eb-4653-95c1-485c8176cbf5',\n", + " 'e1fc2f95-c575-46f1-b323-13bdd0da1da4',\n", + " '5049ac41-e24c-43d4-90c4-950ba120fa9c',\n", + " 'd664e558-6dc3-4607-86f1-32868c9c9f47',\n", + " '7c3ad58c-a311-422c-afc6-8217f301a6cd',\n", + " 'cf0542b5-a070-4567-8159-ee948a77fe8e',\n", + " '26c38bfc-243d-499d-8b3c-e5d71aebf588',\n", + " 'a6658e4b-0c19-400c-a682-52b6de933d8e',\n", + " 'dbd05713-bbde-43b9-a89a-dd7a815317a4',\n", + " '430bd79f-7a37-4f72-8e32-7db7d2c039dc',\n", + " '5f2c9a44-855c-4f25-813b-76bf204c4d8b',\n", + " '84ae00a7-9673-4adb-8cc2-6d53774be25a',\n", + " 'dd3f5c10-6ff2-4a7a-b16a-5c2ae4a46cba',\n", + " 'bf6e1bf8-4373-4dfb-9205-c5bf53eb76f8',\n", + " '6aec476a-2550-45bd-9aee-012e913a074e',\n", + " '82c725c6-7cdd-430e-b900-f783034adfb2',\n", + " '45ff73db-3396-4c9a-8e87-448fe4f8d5ac',\n", + " '43a9a36d-ba1d-4a5f-993c-0328c25996e8',\n", + " '82970ba9-e8c7-490d-9013-c5cf62c7f60f',\n", + " '9a1633ba-90d5-4374-a578-c99351086246',\n", + " 'dd67de24-51b7-4ea5-b453-f8e52f209a38',\n", + " 'dfda00dc-e531-44d0-8f65-69cbb9db459b',\n", + " 'd3a06fb7-1ed1-44ee-8722-04e50fe8c05d',\n", + " '72af4cf3-287f-49cc-bba9-72b5cc4a5a0a',\n", + " '95e46dae-ee37-4f0c-818f-83ad28bc9a8e',\n", + " 'a19ce574-47af-4dbf-a136-77d308ff00d6',\n", + " '75762289-e5d0-420e-9754-f800153f82a5',\n", + " '384c7132-6755-4abf-ac1f-05e7b80f329a',\n", + " '63f2ed10-4299-442b-b528-5a575d72e7d6',\n", + " 'f2b3305b-5847-4039-affa-351130bce422',\n", + " '511db69b-a7bb-49b3-903f-2474db1e8bed',\n", + " 'c565e6f2-d42b-4fa4-873b-3214ccc8359e',\n", + " '45aa23c9-e1b7-4b7e-9e84-8e29a52ce6ff',\n", + " 'fa017f87-1b7b-46ed-b111-2dcb296ec68e',\n", + " 'bcb30bbc-a09d-47dc-a3d7-8162f231d006',\n", + " 'bbbc81f1-b06b-4636-ac44-cb4486af15f0',\n", + " '384b77cf-a6c4-4e30-9358-4d77b9592fb4',\n", + " '88151c65-9d1c-4b48-bc85-93d817936aba',\n", + " '2b8afaaa-9094-4d0e-b671-53f8018a03af',\n", + " '59b0d7f3-2e77-469c-9cf7-82be345d7472',\n", + " '0a91fa62-95ea-4cb7-ba1c-b95453b7df31',\n", + " 'aac0cf31-ad27-4148-9149-11a0fb65c274',\n", + " '504e49be-1436-4034-8af3-261a454a5363',\n", + " 
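The LangChainDeprecationWarning printed above already names the replacement: install langchain-huggingface and import HuggingFaceEmbeddings from there, which is what streamlit-rag-app.py in this repo does. A two-line sketch, assuming the package is installed with pip install -U langchain-huggingface:

# Replaces: from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")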
'd1ebbf37-979c-4343-a69b-b292f01e2cc4',\n", + " 'd188aac2-bbad-4359-b53c-74282a4017c4',\n", + " '3f21253c-f928-40be-a0a4-a2b182a9f578',\n", + " 'aa7f8bac-6385-4078-8c10-30b7a99ff032',\n", + " '774dffec-5096-4b4d-aa79-d0be8f16f3d9',\n", + " '289f7e36-9700-4661-ad80-7926052ed1f3',\n", + " 'dccf192d-f623-446e-b9be-6d6df98f2fc7',\n", + " 'cbb848d4-196e-4108-9963-fcccb4a8b513',\n", + " 'b796c0f9-c883-47c3-afd5-0b16d56ca2f5',\n", + " '718c6742-e9b7-45e4-9c89-95727baa2cb3',\n", + " '1b2c307a-b713-474e-93e7-c40f9bc57eab',\n", + " '9ac86b88-fdeb-4c5c-b88c-ba143167b584',\n", + " '30ce78f2-0a09-4f35-abb2-ae48bf2da3f0',\n", + " 'ee6bafe4-9557-401e-9e8e-cad451b8cba0',\n", + " '4df4c508-e67a-4e8b-9446-22996c051538',\n", + " '608d862f-243a-4686-83a0-7726850a7703',\n", + " '28622e79-6257-46ad-8d0b-227078b0192a',\n", + " '797a8ec5-dd36-4892-9e64-8bdc912f1f08',\n", + " '17e43ec9-b702-4f33-a5fc-014ac988d05f',\n", + " 'daccbc57-bcfa-479a-abe7-367d07c93b26',\n", + " '0a3ecebf-afdc-4124-9bac-8132c2b4958a',\n", + " 'a5bdca3f-cc24-470f-b49a-54c435de582a',\n", + " 'f0ccf63a-03f4-4c44-8a90-cc31b68fbf37',\n", + " '0a9427d3-7e33-4068-91c0-95d7bf2484cf',\n", + " 'e4778209-1a5a-45ff-8c1e-f94f839255f1',\n", + " '461c81b5-9fa3-4622-9a10-af8664fee897',\n", + " '850b68b2-eab6-412a-9bc5-c9e971533891',\n", + " '9fb4e8f9-3e0c-4b0e-8365-62bf44fab2bf',\n", + " '9fc852eb-c189-43b7-a61c-358d6c8e6c93',\n", + " 'a5702378-e662-4bd3-be67-4e587495f41f',\n", + " '9ba0b5af-9c8c-4d58-ba9a-bff26ca2571f',\n", + " '53e83397-0b6c-4f6f-8ebf-c7dee136db51',\n", + " 'bc09b5e1-76a2-4904-9959-680a23bf98e4',\n", + " '236cbaf3-c36b-44dd-afbe-6fe396a12f0f',\n", + " 'a9ad4fbe-6366-451a-ae3e-b4fee2bf320f',\n", + " 'ccfcfd3e-ef70-42cd-95f9-c628bfae7747',\n", + " '2f81758a-d8c8-4a1b-852c-67d9ad8f6a54',\n", + " 'f04d8ace-8666-4ec7-a095-a7cc42fca152',\n", + " '18906e6f-a1ea-4bc9-8ce9-11884e320c1c',\n", + " '9077b482-c838-498f-a290-7c38f0df520e',\n", + " 'dd0fe10e-5954-4a87-b0e7-47eb5ed17635',\n", + " '689cc11b-9a6e-4c9f-a4ef-59dd473cae7c',\n", + " '9e58c8ee-ef08-4aba-9cca-92e505503f89',\n", + " '683848ad-8c37-403c-9992-38a219d2239c',\n", + " '9e705e7a-99e4-4914-803c-aefa0f40edbd',\n", + " '717d8e41-250d-47e0-b723-7224fed80f22',\n", + " '8a9efc67-579f-4dcf-ac0e-caeb80245052',\n", + " '2c08568f-ce5d-4ecd-87a6-9baec410cb39',\n", + " 'ef73a66a-32ab-4945-b644-6db8c29562be',\n", + " 'e9da66a8-13d7-40d2-ae36-1689770173c3',\n", + " '99b918bb-4e37-4a46-b8f8-9167fe95e618',\n", + " '8b589cf8-286b-4836-bfe7-909947401920',\n", + " 'd7b6853c-772e-4621-ac41-543ea43ace9d',\n", + " 'c9e79c0d-966c-4e01-b962-ac893773ad75',\n", + " '2e8a083e-6307-4428-83cd-65b436cebbf9',\n", + " '932c94ed-36b5-40e7-a270-7bea414ad537',\n", + " '65c85bd7-0338-4e1a-954f-794f8bf85902',\n", + " '7c438b8a-f7e7-4d0b-b376-9a2194a1246c',\n", + " '6a5616f9-075c-47d0-8531-2a7dab39bd57',\n", + " '59d6b5e3-b2a6-4aa5-bad3-60fa38236e86',\n", + " '93d6af54-7141-4616-8894-e5440446f3df',\n", + " 'd1c00188-e315-41a4-bb97-e3c37b7d2beb',\n", + " '9d6fb7c5-b366-4516-9d36-be3854360df3',\n", + " '6a5eac17-180c-474e-927a-66b2c0f880e3',\n", + " '3a68ad92-8ab8-44f3-8f1c-1c852575620b',\n", + " '0346130f-92a6-4d52-b746-741adfe8e604',\n", + " 'dcf5b04b-5990-40ac-9685-d3f67c896ba4',\n", + " '93a12904-61bd-4f2a-af50-691be41d14b5',\n", + " '8dc1d6b6-af25-42f7-b2bb-77dcfc1127ee',\n", + " 'e301f0dc-5399-4c8c-bf10-a87f344015cc',\n", + " 'b6a26a83-e757-4b57-bcd3-6e7bd9e5b1bf',\n", + " '6013f6a2-c398-46c6-954b-53882646f78b',\n", + " '96e4c7f7-5a57-4815-810a-5e3646704397',\n", + " '9f48b86c-becd-44de-989f-540764d8299b',\n", + " 
'59b136fa-4b48-45ed-b951-fc514085e95d',\n", + " 'ce095f0f-f663-423e-a289-32657bc386a9',\n", + " '584ccb6f-e675-4879-a439-a68c17afd92d',\n", + " '455466cd-ce88-4faa-966d-955b3144611e',\n", + " '3527b28d-f65e-4a90-ac86-aac1ec29aea2',\n", + " 'fc59f9d9-e2c6-4a6c-becd-64cb9fdc120f',\n", + " '32df0391-7b16-4f60-8a22-08ab3aaa6777',\n", + " '426c06c3-5550-46dd-bf4c-dc91c30de60a',\n", + " '146a4059-7fb6-4a7b-8665-2bf2b2805d3e',\n", + " '321bb4f3-eb98-4742-beb0-70b4496a8ab7',\n", + " 'b39d2d7c-4eea-4df1-85fd-eb286dee41f3',\n", + " 'c858d614-98f3-4c96-9be1-5ba3088643b8',\n", + " '355a6b60-f7b1-4d68-96da-45d03b1f1bb5',\n", + " '6634ee11-e63f-46f1-82f1-5d596ba4f832']" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from uuid import uuid4\n", + "\n", + "uuids = [str(uuid4()) for _ in range(len(documents))]\n", + "\n", + "vector_store.add_documents(documents=documents, ids=uuids)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "0c3a0a6b-4640-4036-863e-77a1a715a0e3", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'list' object has no attribute 'invoke'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[33], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m retriever \u001b[38;5;241m=\u001b[39m vector_store\u001b[38;5;241m.\u001b[39msimilarity_search_with_score(\n\u001b[1;32m 2\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mManuscripts\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 3\u001b[0m k\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m3\u001b[39m\n\u001b[1;32m 4\u001b[0m )\n\u001b[0;32m----> 5\u001b[0m r \u001b[38;5;241m=\u001b[39m retriever\u001b[38;5;241m.\u001b[39minvoke(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mJohn Bishop Estlin\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 6\u001b[0m r_list \u001b[38;5;241m=\u001b[39m [x\u001b[38;5;241m.\u001b[39mmetadata[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mURL\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m r]\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28mprint\u001b[39m(r_list)\n", + "\u001b[0;31mAttributeError\u001b[0m: 'list' object has no attribute 'invoke'" + ] + } + ], + "source": [ + "retriever = vector_store.similarity_search_with_score(\n", + " \"Manuscripts\",\n", + " k=3\n", + ")\n", + "r = retriever.invoke(\"John Bishop Estlin\")\n", + "r_list = [x.metadata[\"URL\"] for x in r]\n", + "print(r_list)" + ] + }, + { + "cell_type": "markdown", + "id": "71698bba-0e8d-4bbc-8412-81c225c8cb6e", + "metadata": {}, + "source": [ + "Save and load the vector store" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "273778f9-cb7d-4bb9-acd0-16841adf8344", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'metadata' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[13], line 12\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;66;03m# Save metadata\u001b[39;00m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m 
\u001b[38;5;28mopen\u001b[39m(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(store_dir, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmetadata.pkl\u001b[39m\u001b[38;5;124m\"\u001b[39m), \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwb\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[0;32m---> 12\u001b[0m pickle\u001b[38;5;241m.\u001b[39mdump(metadata, f)\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFAISS index and metadata stored in \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstore_dir\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mNameError\u001b[0m: name 'metadata' is not defined" + ] + } + ], + "source": [ + "vector_store.save_local(\"faiss_index\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "98740c69-5687-4b0c-a4ae-4233208f6e22", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings import HuggingFaceEmbeddings\n", + "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", + "\n", + "vector_store = FAISS.load_local(\n", + " \"faiss_index\", embeddings, allow_dangerous_deserialization=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "d3a257cc-c687-4fc5-91a7-0c3048a49e70", + "metadata": {}, + "source": [ + "### Now for the Reranking Step:" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "013d5fca-c621-4a7e-ac0e-ad5c9aba577f", + "metadata": {}, + "outputs": [], + "source": [ + "weights = {\n", + " \"title\": 1000,\n", + " \"subtitle\": 500,\n", + " \"title_alt\": 500,\n", + " \"abstract\": 30,\n", + " \"subject_facet\": 1,\n", + " \"subject_geographic\": 1,\n", + " \"genre\": 1,\n", + " \"genre_specific\": 1,\n", + " \"name_facet\": 1,\n", + " \"name_role\": 1,\n", + " \"date_human\": 1,\n", + " \"date_start\": 1,\n", + " \"date_end\": 1,\n", + " \"publisher\": 1,\n", + " \"collection_name\": 1,\n", + " \"physical_location\": 1,\n", + " \"related_item_host\": 1,\n", + " \"type_of_resource\": 1,\n", + " \"URL\": 0.0\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "cc1f04d1-f553-46ae-bfc6-248125f62423", + "metadata": {}, + "outputs": [], + "source": [ + "from sentence_transformers import SentenceTransformer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "import numpy as np\n", + "from langchain.embeddings import HuggingFaceEmbeddings\n", + "\n", + "model = SentenceTransformer('all-MiniLM-L6-v2')\n", + "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", + "\n", + "def compute_relevance_score(metadata_value, query):\n", + " \"\"\"\n", + " Compute cosine similarity between the query and a metadata value using sentence-transformers.\n", + "\n", + " Args:\n", + " metadata_value (str): The metadata value to compare.\n", + " query (str): The query string.\n", + "\n", + " Returns:\n", + " float: Cosine similarity score (between 0 and 1).\n", + " \"\"\"\n", + " if not metadata_value or not query:\n", + " return 0 # Return 0 if either the metadata or query is empty\n", + " \n", + " # Encode the metadata value and query into embeddings\n", + " embeddings = model.encode([metadata_value, query], convert_to_tensor=False) # Convert to NumPy\n", + " metadata_embedding, query_embedding = embeddings\n", + "\n", + " # Compute cosine similarity\n", + " similarity = cosine_similarity([metadata_embedding], [query_embedding])\n", + 
" return similarity[0][0] # Extract the scalar similarity value\n", + "\n", + "\n", + "\n", + "def rerank_documents(query, weights, vector_store, k=10):\n", + " \"\"\"\n", + " Rerank documents based on metadata relevance scores and FAISS vector similarity scores.\n", + "\n", + " Args:\n", + " documents (list): List of Document objects.\n", + " query (str): The query string used for retrieval.\n", + " weights (dict): Weights for each metadata field.\n", + " vector_store (str): The vector store itself to get the similarity score\n", + "\n", + " Returns:\n", + " list: Reranked documents in descending order of relevance.\n", + " \"\"\"\n", + "\n", + " reranked_results = []\n", + " total = sum(weights.values())\n", + " # returns the relevant documents from the query\n", + " returned_docs = vector_store.similarity_search_with_score(query, k)\n", + " for doc in returned_docs:\n", + " final_score = 0\n", + " # Add weighted relevance scores for each metadata field\n", + " for field, weight in weights.items():\n", + " metadata_value = doc[0].metadata.get(field, \"\") # Safely get metadata field value\n", + " relevance_score = compute_relevance_score(metadata_value, query)\n", + " #print(f\"relevance_score: {relevance_score}\")\n", + " final_score += (weight * relevance_score) \n", + "\n", + " reranked_results.append((doc, final_score / total))\n", + "\n", + " # Sort documents by the final score in descending order\n", + " reranked_results.sort(key=lambda x: x[1], reverse=True)\n", + " return [(doc, score) for doc, score in reranked_results]\n", + "\n", + "\n", + "docs = rerank_documents(\"Newspaper\", weights, vector_store)" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "2d9172aa-6c15-4c90-856e-d0ee53100721", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('Thanksgiving', 'https://www.digitalcommonwealth.org/search/commonwealth:02876465m', 'Reranked score: 0.21417763510649684')\n", + "('Thanksgiving', 'https://www.digitalcommonwealth.org/search/commonwealth:jd478671b', 'Reranked score: 0.21414245699337317')\n", + "('Thanks for high school articles', 'https://www.digitalcommonwealth.org/search/commonwealth:8910r4424', 'Reranked score: 0.20574750553692622')\n", + "('T.H. Jones, Charlotte, N.C., autograph letter ...', 'https://www.digitalcommonwealth.org/search/commonwealth:w3764603d', 'Reranked score: 0.16508627951932514')\n", + "('Theodore C. Tharin, Charleston, S.C., autograp...', 'https://www.digitalcommonwealth.org/search/commonwealth:9k41zk061', 'Reranked score: 0.14404614119555387')\n", + "('T.H. Marshall, Graniteville, S.C., autograph l...', 'https://www.digitalcommonwealth.org/search/commonwealth:ws859m407', 'Reranked score: 0.14002441968440776')\n", + "('Theodore C. Tharin, Grumesville, S.C., autogra...', 'https://www.digitalcommonwealth.org/search/commonwealth:w3764352q', 'Reranked score: 0.138896407891218')\n", + "('Theodore C. Tharin, Grumesville, S.C. [?], aut...', 'https://www.digitalcommonwealth.org/search/commonwealth:w3764306m', 'Reranked score: 0.13877071803271654')\n", + "('Theodore C. Tharin, Grumesville, S.C., autogra...', 'https://www.digitalcommonwealth.org/search/commonwealth:w3764957b', 'Reranked score: 0.1382296937873219')\n", + "('\"The Refuge of Oppression,\" from David S. 
Gran...', 'https://www.digitalcommonwealth.org/search/commonwealth:dv1441451', 'Reranked score: 0.12401763978186418')\n" + ] + } + ], + "source": [ + "#print([docs[i].metadata['title'] for i in range(len(docs))])\n", + "docs_list = [(docs[i][0][0].metadata['title'], docs[i][0][0].metadata['URL'], f\"Reranked score: {docs[i][1]}\") for i in range(len(docs))]\n", + "docs_list.sort(key=lambda x: x[2], reverse=True)\n", + "for doc in docs_list:\n", + " print(doc)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "83c1c3f2-91b4-4647-b93c-09a0af5b43a8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100\n" + ] + } + ], + "source": [ + "print(len(docs_list))" + ] + }, + { + "cell_type": "markdown", + "id": "07cdf844-72c6-41ef-bade-9afb52bceed8", + "metadata": {}, + "source": [ + "Immediately we get much better performance because now only Newspapers are returned." + ] + }, + { + "cell_type": "markdown", + "id": "18719878-92c6-458c-ae81-21d9fe5f0bd8", + "metadata": {}, + "source": [ + "# Implementing Different Vector Store and Embedding Combos" + ] + }, + { + "cell_type": "markdown", + "id": "beaf9e61-3bea-4c31-9710-532a306d1023", + "metadata": {}, + "source": [ + "### Pinecone Vector Store w/OLlama Embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "28f8e253-cf14-4b37-8e54-bf114514ac60", + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install langchain-pinecone pinecone-notebooks\n", + "#!pip install pinecone-client" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ab3fb46d-267e-4815-b823-e978d4bf3edf", + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Enter your Pinecone API key: ········\n" + ] + } + ], + "source": [ + "import getpass\n", + "import os\n", + "import time\n", + "\n", + "from pinecone import Pinecone, ServerlessSpec\n", + "\n", + "if not os.getenv(\"PINECONE_API_KEY\"):\n", + " os.environ[\"PINECONE_API_KEY\"] = getpass.getpass(\"Enter your Pinecone API key: \")\n", + "\n", + "pinecone_api_key = os.environ.get(\"PINECONE_API_KEY\")\n", + "\n", + "pc = Pinecone(api_key=pinecone_api_key)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "0a6c480d-f2cd-4732-943a-8cd9d66417e2", + "metadata": {}, + "outputs": [], + "source": [ + "# setting up the index name\n", + "import time\n", + "\n", + "index_name = \"librag1\" # change if desired\n", + "\n", + "existing_indexes = [index_info[\"name\"] for index_info in pc.list_indexes()]\n", + "\n", + "if index_name not in existing_indexes:\n", + " pc.create_index(\n", + " name=index_name,\n", + " dimension=768,\n", + " metric=\"cosine\",\n", + " spec=ServerlessSpec(cloud=\"aws\", region=\"us-east-1\"),\n", + " )\n", + " while not pc.describe_index(index_name).status[\"ready\"]:\n", + " time.sleep(1)\n", + "\n", + "index = pc.Index(index_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "5c791f22-4909-447f-a1fe-ebc09a9bbe11", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_ollama import OllamaEmbeddings\n", + "from langchain.embeddings import HuggingFaceEmbeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "4aec6567-7950-4de2-8600-fe987f47a24a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/ipykernel_2752/1630880338.py:5: LangChainDeprecationWarning: The class 
`HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-huggingface package and should be used instead. To use it run `pip install -U :class:`~langchain-huggingface` and import as `from :class:`~langchain_huggingface import HuggingFaceEmbeddings``.\n", + " embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", + "2024-11-14 14:26:19.674480: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + ] + } + ], + "source": [ + "# embeddings = OllamaEmbeddings(\n", + "# model=\"llama3\",\n", + "# )\n", + "\n", + "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "f1ad1f25-69d2-4eb1-9f85-c7b0ccf13a53", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_pinecone import PineconeVectorStore\n", + "vector_store = PineconeVectorStore(index=index, embedding=embeddings)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "2a4b12ac-87bd-4ae3-90cc-4f2c6644256a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'1165601 Terms for disposal of woman.\\nName: abstract_tsi, dtype: object'" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents[18].metadata['abstract']" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "e80decf8-6ee5-48a2-bb39-5ea97ceaf7e2", + "metadata": {}, + "outputs": [], + "source": [ + "from uuid import uuid4\n", + "uuids = [str(uuid4()) for _ in range(len(documents))]" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "18bcd3e7-07f6-4a66-a629-126ffe340966", + "metadata": {}, + "outputs": [ + { + "ename": "PineconeApiException", + "evalue": "(400)\nReason: Bad Request\nHTTP response headers: HTTPHeaderDict({'Date': 'Thu, 14 Nov 2024 19:37:34 GMT', 'Content-Type': 'application/json', 'Content-Length': '116', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '155', 'x-pinecone-request-id': '2161036493328920558', 'x-envoy-upstream-service-time': '37', 'server': 'envoy'})\nHTTP response body: {\"code\":3,\"message\":\"Metadata size is 869788 bytes, which exceeds the limit of 40960 bytes per vector\",\"details\":[]}\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mPineconeApiException\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[44], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m vector_store\u001b[38;5;241m.\u001b[39madd_documents(documents\u001b[38;5;241m=\u001b[39mdocuments[\u001b[38;5;241m0\u001b[39m:\u001b[38;5;241m15\u001b[39m], ids\u001b[38;5;241m=\u001b[39muuids)\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/langchain_core/vectorstores/base.py:287\u001b[0m, in \u001b[0;36mVectorStore.add_documents\u001b[0;34m(self, documents, **kwargs)\u001b[0m\n\u001b[1;32m 285\u001b[0m texts \u001b[38;5;241m=\u001b[39m [doc\u001b[38;5;241m.\u001b[39mpage_content \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m documents]\n\u001b[1;32m 286\u001b[0m metadatas 
\u001b[38;5;241m=\u001b[39m [doc\u001b[38;5;241m.\u001b[39mmetadata \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m documents]\n\u001b[0;32m--> 287\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39madd_texts(texts, metadatas, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 288\u001b[0m msg \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 289\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m`add_documents` and `add_texts` has not been implemented \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 290\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfor \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 291\u001b[0m )\n\u001b[1;32m 292\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(msg)\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/langchain_pinecone/vectorstores.py:292\u001b[0m, in \u001b[0;36mPineconeVectorStore.add_texts\u001b[0;34m(self, texts, metadatas, ids, namespace, batch_size, embedding_chunk_size, async_req, id_prefix, **kwargs)\u001b[0m\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m async_req:\n\u001b[1;32m 282\u001b[0m \u001b[38;5;66;03m# Runs the pinecone upsert asynchronously.\u001b[39;00m\n\u001b[1;32m 283\u001b[0m async_res \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 284\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_index\u001b[38;5;241m.\u001b[39mupsert(\n\u001b[1;32m 285\u001b[0m vectors\u001b[38;5;241m=\u001b[39mbatch_vector_tuples,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 290\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m batch_vector_tuples \u001b[38;5;129;01min\u001b[39;00m batch_iterate(batch_size, vector_tuples)\n\u001b[1;32m 291\u001b[0m ]\n\u001b[0;32m--> 292\u001b[0m [res\u001b[38;5;241m.\u001b[39mget() \u001b[38;5;28;01mfor\u001b[39;00m res \u001b[38;5;129;01min\u001b[39;00m async_res]\n\u001b[1;32m 293\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 294\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_index\u001b[38;5;241m.\u001b[39mupsert(\n\u001b[1;32m 295\u001b[0m vectors\u001b[38;5;241m=\u001b[39mvector_tuples,\n\u001b[1;32m 296\u001b[0m namespace\u001b[38;5;241m=\u001b[39mnamespace,\n\u001b[1;32m 297\u001b[0m async_req\u001b[38;5;241m=\u001b[39masync_req,\n\u001b[1;32m 298\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 299\u001b[0m )\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/langchain_pinecone/vectorstores.py:292\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m async_req:\n\u001b[1;32m 282\u001b[0m \u001b[38;5;66;03m# Runs the pinecone upsert asynchronously.\u001b[39;00m\n\u001b[1;32m 283\u001b[0m async_res \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 284\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_index\u001b[38;5;241m.\u001b[39mupsert(\n\u001b[1;32m 285\u001b[0m vectors\u001b[38;5;241m=\u001b[39mbatch_vector_tuples,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 290\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m batch_vector_tuples \u001b[38;5;129;01min\u001b[39;00m 
batch_iterate(batch_size, vector_tuples)\n\u001b[1;32m 291\u001b[0m ]\n\u001b[0;32m--> 292\u001b[0m [res\u001b[38;5;241m.\u001b[39mget() \u001b[38;5;28;01mfor\u001b[39;00m res \u001b[38;5;129;01min\u001b[39;00m async_res]\n\u001b[1;32m 293\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 294\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_index\u001b[38;5;241m.\u001b[39mupsert(\n\u001b[1;32m 295\u001b[0m vectors\u001b[38;5;241m=\u001b[39mvector_tuples,\n\u001b[1;32m 296\u001b[0m namespace\u001b[38;5;241m=\u001b[39mnamespace,\n\u001b[1;32m 297\u001b[0m async_req\u001b[38;5;241m=\u001b[39masync_req,\n\u001b[1;32m 298\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 299\u001b[0m )\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/multiprocessing/pool.py:774\u001b[0m, in \u001b[0;36mApplyResult.get\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 772\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_value\n\u001b[1;32m 773\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 774\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_value\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/multiprocessing/pool.py:125\u001b[0m, in \u001b[0;36mworker\u001b[0;34m(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)\u001b[0m\n\u001b[1;32m 123\u001b[0m job, i, func, args, kwds \u001b[38;5;241m=\u001b[39m task\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 125\u001b[0m result \u001b[38;5;241m=\u001b[39m (\u001b[38;5;28;01mTrue\u001b[39;00m, func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds))\n\u001b[1;32m 126\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 127\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m wrap_exception \u001b[38;5;129;01mand\u001b[39;00m func \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m _helper_reraises_exception:\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pinecone/core/openapi/shared/api_client.py:187\u001b[0m, in \u001b[0;36mApiClient.__call_api\u001b[0;34m(self, resource_path, method, path_params, query_params, header_params, body, post_params, files, response_type, auth_settings, _return_http_data_only, collection_formats, _preload_content, _request_timeout, _host, _check_type)\u001b[0m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m PineconeApiException \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 186\u001b[0m e\u001b[38;5;241m.\u001b[39mbody \u001b[38;5;241m=\u001b[39m e\u001b[38;5;241m.\u001b[39mbody\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 187\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[1;32m 189\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlast_response \u001b[38;5;241m=\u001b[39m response_data\n\u001b[1;32m 191\u001b[0m return_data \u001b[38;5;241m=\u001b[39m response_data\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pinecone/core/openapi/shared/api_client.py:175\u001b[0m, in \u001b[0;36mApiClient.__call_api\u001b[0;34m(self, resource_path, method, path_params, query_params, header_params, body, post_params, files, response_type, auth_settings, _return_http_data_only, collection_formats, 
_preload_content, _request_timeout, _host, _check_type)\u001b[0m\n\u001b[1;32m 171\u001b[0m url \u001b[38;5;241m=\u001b[39m _host \u001b[38;5;241m+\u001b[39m resource_path\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 174\u001b[0m \u001b[38;5;66;03m# perform request and return response\u001b[39;00m\n\u001b[0;32m--> 175\u001b[0m response_data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrequest(\n\u001b[1;32m 176\u001b[0m method,\n\u001b[1;32m 177\u001b[0m url,\n\u001b[1;32m 178\u001b[0m query_params\u001b[38;5;241m=\u001b[39mquery_params,\n\u001b[1;32m 179\u001b[0m headers\u001b[38;5;241m=\u001b[39mheader_params,\n\u001b[1;32m 180\u001b[0m post_params\u001b[38;5;241m=\u001b[39mpost_params,\n\u001b[1;32m 181\u001b[0m body\u001b[38;5;241m=\u001b[39mbody,\n\u001b[1;32m 182\u001b[0m _preload_content\u001b[38;5;241m=\u001b[39m_preload_content,\n\u001b[1;32m 183\u001b[0m _request_timeout\u001b[38;5;241m=\u001b[39m_request_timeout,\n\u001b[1;32m 184\u001b[0m )\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m PineconeApiException \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 186\u001b[0m e\u001b[38;5;241m.\u001b[39mbody \u001b[38;5;241m=\u001b[39m e\u001b[38;5;241m.\u001b[39mbody\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pinecone/core/openapi/shared/api_client.py:460\u001b[0m, in \u001b[0;36mApiClient.request\u001b[0;34m(self, method, url, query_params, headers, post_params, body, _preload_content, _request_timeout)\u001b[0m\n\u001b[1;32m 450\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrest_client\u001b[38;5;241m.\u001b[39mOPTIONS(\n\u001b[1;32m 451\u001b[0m url,\n\u001b[1;32m 452\u001b[0m query_params\u001b[38;5;241m=\u001b[39mquery_params,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 457\u001b[0m body\u001b[38;5;241m=\u001b[39mbody,\n\u001b[1;32m 458\u001b[0m )\n\u001b[1;32m 459\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m method \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPOST\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m--> 460\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrest_client\u001b[38;5;241m.\u001b[39mPOST(\n\u001b[1;32m 461\u001b[0m url,\n\u001b[1;32m 462\u001b[0m query_params\u001b[38;5;241m=\u001b[39mquery_params,\n\u001b[1;32m 463\u001b[0m headers\u001b[38;5;241m=\u001b[39mheaders,\n\u001b[1;32m 464\u001b[0m post_params\u001b[38;5;241m=\u001b[39mpost_params,\n\u001b[1;32m 465\u001b[0m _preload_content\u001b[38;5;241m=\u001b[39m_preload_content,\n\u001b[1;32m 466\u001b[0m _request_timeout\u001b[38;5;241m=\u001b[39m_request_timeout,\n\u001b[1;32m 467\u001b[0m body\u001b[38;5;241m=\u001b[39mbody,\n\u001b[1;32m 468\u001b[0m )\n\u001b[1;32m 469\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m method \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPUT\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 470\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrest_client\u001b[38;5;241m.\u001b[39mPUT(\n\u001b[1;32m 471\u001b[0m url,\n\u001b[1;32m 472\u001b[0m query_params\u001b[38;5;241m=\u001b[39mquery_params,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 477\u001b[0m body\u001b[38;5;241m=\u001b[39mbody,\n\u001b[1;32m 478\u001b[0m )\n", + 
"File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pinecone/core/openapi/shared/rest.py:345\u001b[0m, in \u001b[0;36mRESTClientObject.POST\u001b[0;34m(self, url, headers, query_params, post_params, body, _preload_content, _request_timeout)\u001b[0m\n\u001b[1;32m 335\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mPOST\u001b[39m(\n\u001b[1;32m 336\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 337\u001b[0m url,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 343\u001b[0m _request_timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 344\u001b[0m ):\n\u001b[0;32m--> 345\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrequest(\n\u001b[1;32m 346\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPOST\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 347\u001b[0m url,\n\u001b[1;32m 348\u001b[0m headers\u001b[38;5;241m=\u001b[39mheaders,\n\u001b[1;32m 349\u001b[0m query_params\u001b[38;5;241m=\u001b[39mquery_params,\n\u001b[1;32m 350\u001b[0m post_params\u001b[38;5;241m=\u001b[39mpost_params,\n\u001b[1;32m 351\u001b[0m _preload_content\u001b[38;5;241m=\u001b[39m_preload_content,\n\u001b[1;32m 352\u001b[0m _request_timeout\u001b[38;5;241m=\u001b[39m_request_timeout,\n\u001b[1;32m 353\u001b[0m body\u001b[38;5;241m=\u001b[39mbody,\n\u001b[1;32m 354\u001b[0m )\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pinecone/core/openapi/shared/rest.py:279\u001b[0m, in \u001b[0;36mRESTClientObject.request\u001b[0;34m(self, method, url, query_params, headers, body, post_params, _preload_content, _request_timeout)\u001b[0m\n\u001b[1;32m 276\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;241m500\u001b[39m \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m r\u001b[38;5;241m.\u001b[39mstatus \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m599\u001b[39m:\n\u001b[1;32m 277\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ServiceException(http_resp\u001b[38;5;241m=\u001b[39mr)\n\u001b[0;32m--> 279\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m PineconeApiException(http_resp\u001b[38;5;241m=\u001b[39mr)\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m r\n", + "\u001b[0;31mPineconeApiException\u001b[0m: (400)\nReason: Bad Request\nHTTP response headers: HTTPHeaderDict({'Date': 'Thu, 14 Nov 2024 19:37:34 GMT', 'Content-Type': 'application/json', 'Content-Length': '116', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '155', 'x-pinecone-request-id': '2161036493328920558', 'x-envoy-upstream-service-time': '37', 'server': 'envoy'})\nHTTP response body: {\"code\":3,\"message\":\"Metadata size is 869788 bytes, which exceeds the limit of 40960 bytes per vector\",\"details\":[]}\n" + ] + } + ], + "source": [ + "vector_store.add_documents(documents=documents[0:15], ids=uuids)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "bc900bbd-128c-40b0-b151-f77800fcb50b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "* [SIM=0.292190]\n", + " [deletion]Cha[/deletion] Graniteville Jany 15 1854\n", + "Mr Z. B. Oakes\n", + "Dr Sir\n", + "On my Return\n", + "from Charleston last week I stopd\n", + "and attended the sale of B. J. Godfrerys\n", + "at Black Creek and bought the Family\n", + "of Cash whom I purchased of you two\n", + "years ago. 
The woman is now in a\n", + "Family way and we think she will not\n", + "suit us for a Cook Woman, she is\n", + "a small young woman about 30 strong and\n", + "Healthy and prefers Field Work Cash\n", + "has proved himself an Excellent servant\n", + "a great Ax Man and not to be exceld\n", + "with the Hoe I can recommend him to be\n", + "a most Excellent general servant, and\n", + "now what do you think you can\n", + "get for the Family Consisting of himself\n", + "Wife and three Children say a girl of\n", + "10 or 11 year named Bella, Ceasar a boy about\n", + "4 or 5, and Rose 2 or three and Mother\n", + "in a fair way for another. Please write\n", + "and let me know what you think\n", + "you could get for them yours Truly\n", + "[underline]T. H. Marshall[/underline]\n", + "\n", + " [{'URL': 'https://www.digitalcommonwealth.org/search/commonwealth:ws859m407', 'abstract': '1161522 Asks value of family: man, woman, 3 children.\\nName: abstract_tsi, dtype: object', 'genre': \"1161522 ['Manuscripts', 'Correspondence']\\nName: genre_basic_ssim, dtype: object\", 'genre_specific': '1161522 NaN\\nName: genre_specific_ssim, dtype: object', 'subtitle': '1161522 NaN\\nName: title_info_primary_subtitle_tsi, dtype: object', 'title': 'T.H. Marshall, Graniteville, S.C., autograph l...', 'title_alt': '1161522 NaN\\nName: title_info_alternative_tsim, dtype: object'}]\n", + "----------------------------------\n", + "\n", + "* [SIM=0.243309]\n", + " Grumesville Sept 11 [insertion]th[/insertion] 1856\n", + "Dear Sir\n", + "The Deputy has been here since\n", + "day before yesterday, and it is impossible\n", + "to get hold of the negroes, I have advised\n", + "that the parties should go down & see you\n", + "they seem willing to consent that the fellow\n", + "Tom should go at $ 1000, and a smaller boy\n", + "at $ 300 - but a new party entered the\n", + "field and recommended a seperance\n", + "to Major Rhame - who wants the boys\n", + "himself, he is to be requested to go\n", + "down and confer with the judgement\n", + "creditors and buy them, or propose to\n", + "sell out the entire Estate and pay pro-ratio,\n", + "the creditors, I presume that\n", + "all this could be done better by yourself\n", + "and make no doubt a ballance\n", + "of some 2,000 dollars retained, I advise\n", + "that you should when applied\n", + "to take the whole Estate in hand by &\n", + "with the Consent of Judgement Creditors\n", + "and save the Commission, I could\n", + "then speculate on the sale if forced\n", + "under the [deletion]e[/deletion]sheriff -\n", + "in Lords - yours truly\n", + "Theo. C Tharin\n", + "\n", + " [{'URL': 'https://www.digitalcommonwealth.org/search/commonwealth:w3764957b', 'abstract': '1163238 On disposal of Negroes in contested estate.\\nName: abstract_tsi, dtype: object', 'genre': \"1163238 ['Manuscripts', 'Correspondence']\\nName: genre_basic_ssim, dtype: object\", 'genre_specific': '1163238 NaN\\nName: genre_specific_ssim, dtype: object', 'subtitle': '1163238 NaN\\nName: title_info_primary_subtitle_tsi, dtype: object', 'title': 'Theodore C. 
Tharin, Grumesville, S.C., autogra...', 'title_alt': '1163238 NaN\\nName: title_info_alternative_tsim, dtype: object'}]\n", + "----------------------------------\n", + "\n", + "* [SIM=0.230255]\n", + " Wednesday morn July 12 1854\n", + "Dear Sir\n", + "I drop you a line in a great\n", + "hurry by Mr McCulley to say my affairs\n", + "are going very well, and I have been\n", + "offered $ 500 or $ 1.00 per acre for the tract of\n", + "land you purchased at sherriff sale.\n", + "I think you can do better, the offer is made by\n", + "Mr Williams, who owns the adjoining tract\n", + "which he purchased of Mc Culley he\n", + "gets Serpentine and work about 30 Hands\n", + "yours truly in haste\n", + "Theod C Tharin\n", + "Z. B. Oakes Esq.\n", + "\n", + " [{'URL': 'https://www.digitalcommonwealth.org/search/commonwealth:w3764306m', 'abstract': '1163237 Offer to purchase land.\\nName: abstract_tsi, dtype: object', 'genre': \"1163237 ['Manuscripts', 'Correspondence']\\nName: genre_basic_ssim, dtype: object\", 'genre_specific': '1163237 NaN\\nName: genre_specific_ssim, dtype: object', 'subtitle': '1163237 NaN\\nName: title_info_primary_subtitle_tsi, dtype: object', 'title': 'Theodore C. Tharin, Grumesville, S.C. [?], aut...', 'title_alt': '1163237 NaN\\nName: title_info_alternative_tsim, dtype: object'}]\n", + "----------------------------------\n", + "\n" + ] + } + ], + "source": [ + "results = vector_store.similarity_search_with_score(\n", + " \"What is the metadata of the Z.B. Oakes articles\",\n", + " k=3\n", + ")\n", + "for res, score in results:\n", + " #print(f\"* {res.page_content} [{res.metadata}]\")\n", + " print(f\"* [SIM={score:3f}]\\n {res.page_content} [{res.metadata}]\")\n", + " print(\"----------------------------------\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "f5545e52-8142-4c0d-a127-a1a3711e5ef3", + "metadata": {}, + "source": [ + "### Conclusions on Pinecone and HuggingFace Embedding Model\n", + "Unfortunately, Pinecone imposes a hard limit of 40 KB of metadata per vector, and the metadata attached to one of our Document objects comes to roughly 870 KB (869,788 bytes, per the error above), so the upsert fails.\n", + "\n", + "The HuggingFace embedding model itself works well; we just have to make sure the index dimension matches the embedding model's output dimension (768 for all-mpnet-base-v2).\n", + "\n", + "FAISS seems like our best option."
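If Pinecone were still worth pursuing, the per-vector metadata would need to be slimmed down before upserting. The sketch below shows one hedged way to do that with the `documents` list built earlier in the notebook: keep a small whitelist of fields and truncate each value. The `KEEP_FIELDS` list and the 1,000-character cap are illustrative assumptions, not values taken from this notebook.

```python
import json

from langchain.schema import Document

# Assumptions: `documents` is the list of LangChain Documents built earlier in this
# notebook; the whitelist and the 1,000-character cap are illustrative, not tuned.
KEEP_FIELDS = ["title", "abstract", "genre", "date_human", "collection_name", "URL"]
MAX_FIELD_CHARS = 1_000  # keep each field comfortably under Pinecone's 40,960-byte cap


def slim_metadata(doc: Document) -> Document:
    """Return a copy of `doc` with only the whitelisted, truncated metadata fields."""
    slim = {
        field: str(doc.metadata.get(field, ""))[:MAX_FIELD_CHARS]
        for field in KEEP_FIELDS
    }
    return Document(page_content=doc.page_content, metadata=slim)


slim_documents = [slim_metadata(d) for d in documents]

# Rough sanity check against the 40,960-byte limit reported in the error above.
largest = max(len(json.dumps(d.metadata).encode("utf-8")) for d in slim_documents)
print(f"Largest metadata payload after trimming: {largest} bytes")
```

Note that langchain-pinecone also copies the page text into each vector's metadata when upserting, so records with very long OCR text would likely still need to be chunked first; that is another reason FAISS is the easier fit here.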
+ ] + }, + { + "cell_type": "markdown", + "id": "4a7f8878-1967-4630-817a-9eb1d321701e", + "metadata": {}, + "source": [ + "### Using Chroma Vector Store" + ] + }, + { + "cell_type": "markdown", + "id": "01f0c6e6-fb80-4b01-9673-a790017ce71b", + "metadata": {}, + "source": [ + "Now we can embed our data into a Chroma vector store:" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "c281018a-089b-4ad0-8f4c-efb4667c8780", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Split 133 documents into 13931 chunks.\n", + "Saved 13931 chunks to ./chroma_try.\n" + ] + } + ], + "source": [ + "# from langchain.document_loaders import DirectoryLoader\n", + "from langchain_community.document_loaders import DirectoryLoader\n", + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "from langchain.schema import Document\n", + "from langchain_openai import OpenAIEmbeddings\n", + "from langchain_community.vectorstores import Chroma\n", + "from dotenv import load_dotenv\n", + "import openai\n", + "import os\n", + "import shutil\n", + "import time\n", + "from langchain.embeddings import HuggingFaceEmbeddings\n", + "\n", + "import tempfile\n", + "CHROMA_PATH = \"./chroma_try\"\n", + "\n", + "def main(documents):\n", + " generate_data_store(documents)\n", + "\n", + "\n", + "def generate_data_store(documents):\n", + " chunks = split_text(documents)\n", + " save_to_chroma(chunks)\n", + "\n", + "\n", + "def split_text(documents: list[Document]):\n", + " text_splitter = RecursiveCharacterTextSplitter(\n", + " chunk_size=1000,\n", + " chunk_overlap=100,\n", + " length_function=len,\n", + " add_start_index=True,\n", + " )\n", + " chunks = text_splitter.split_documents(documents)\n", + " print(f\"Split {len(documents)} documents into {len(chunks)} chunks.\")\n", + "\n", + " document = chunks[10]\n", + " #print(document.page_content)\n", + " #print(document.metadata)\n", + "\n", + " return chunks\n", + "\n", + "def save_to_chroma(chunks):\n", + " #Clear out the database first.\n", + " if os.path.exists(CHROMA_PATH):\n", + " shutil.rmtree(CHROMA_PATH)\n", + " print(f\"Removed existing database at {CHROMA_PATH}.\")\n", + "\n", + " # Create a new DB from the documents.\n", + " os.makedirs(CHROMA_PATH, exist_ok=True) # Ensure the directory exists\n", + "\n", + " #embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n", + " embeddings = OpenAIEmbeddings(model=\"text-embedding-3-large\", dimensions=3072)\n", + " #embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", + " try:\n", + " db = Chroma.from_documents(\n", + " chunks, embeddings, persist_directory=CHROMA_PATH\n", + " )\n", + " db.persist()\n", + " print(f\"Saved {len(chunks)} chunks to {CHROMA_PATH}.\")\n", + " except Exception as e:\n", + " print(f\"An error occurred: {e}\")\n", + "\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " main(documents)" + ] + }, + { + "cell_type": "markdown", + "id": "b08316ec-6545-4af5-a92e-1f026f121e4f", + "metadata": {}, + "source": [ + "### Making the Query" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "id": "103cf9f0-e116-4e3a-a33c-accdf4246332", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Human: \n", + "Answer the question based only on the following context:\n", + "\n", + "Columbia Jany 8th / 55\n", + "Mr Z B. 
Oakes\n", + "Dear Sir\n", + "I read\n", + "your letter, desiring me to\n", + "pay the amount of the [unclear]\n", + "over to you, but I cannot do\n", + "so until Col. Bauskett gives\n", + "me notice to do so. I have\n", + "seen him since I read your\n", + "letter, I regret very much that\n", + "I cannot comply with your\n", + "request. The arrangement\n", + "which Mr Mazyck made with\n", + "the Bank, was to meet it\n", + "when we received notice to\n", + "do so,\n", + "Yours respectfully\n", + "Thos. Taylor\n", + "\n", + "---\n", + "\n", + "and the Ballance with all other obligatory and Kind\n", + "favors I will have to make straight when I see you\n", + "you will no doubt feel hurt at this step of\n", + "mine but when you consider all things, and that\n", + "upon this very step our mutual safety & welfare\n", + "depended, why like General Jackson at orleans\n", + "I take the Responsibility, and time I hope\n", + "will alike prove I was Right\n", + "with much Esteem & Regard\n", + "I Remain faithfully yours\n", + "Theo. C Tharin\n", + "\n", + "Mount Holly } {Three Mount Holly PCV\n", + "June 22 { Theo C Tharin PCV\n", + "Z.B. Oakes Esq\n", + "Charleston\n", + "\n", + "---\n", + "\n", + "till which time believe me yours\n", + "Most truly\n", + "Theo. C Tharin\n", + "Z. B. Oakes Esq\n", + "\n", + "---\n", + "\n", + "Charleston 1-6 Dec 1853\n", + "Mr Z. B. Oakes,\n", + "Dear Sir\n", + "I would be glad\n", + "to know what you have\n", + "determined on in the case\n", + "of M Alpine, I bot the\n", + "negro from you, it appears\n", + "and of course you are\n", + "liable to me, I am called\n", + "on for the amount of the\n", + "verdict, It appears that the\n", + "negro was a stolen one. I\n", + "of course lay no charge to\n", + "you on this score, being accessory\n", + "to the affair, but I\n", + "do call on on you to\n", + "hold me Harmless. - Please\n", + "send me your written answer\n", + "[underline]this day.[/underline]\n", + "Respectfully\n", + "[underline]Tho: N. Gadsden[/underline]\n", + "\n", + "---\n", + "\n", + "Summerville So Ca\n", + "April 26th 1854\n", + "Mr Z B Oakes\n", + "Dear Sir\n", + "Your letter\n", + "in reply to mine was duly received.\n", + "I am willing to value the woman at\n", + "$ 800. & be refunded $ 150. or return her\n", + "at once to you. Do inform me as soon\n", + "as possible as to the decision of\n", + "her owner. Yours respectfully\n", + "Thos. L Gelzia\n", + "\n", + "---\n", + "\n", + "Answer the question based on the above context: Who did Z.B Oakes receive a letter from?\n", + "\n", + "Response: Z.B Oakes received a letter from Tho: N. Gadsden.\n", + "\n", + "Sources: ['Thomas Taylor, Columbia, Tenn., autograph lett...: https://www.digitalcommonwealth.org/search/commonwealth:5q47sj75b', 'Theodore C. Tharin, Grumesville, S.C., autogra...: https://www.digitalcommonwealth.org/search/commonwealth:w3764352q', 'Theodore C. Tharin, Grumesville, S.C., autogra...: https://www.digitalcommonwealth.org/search/commonwealth:w3764286b', 'Theodore N. Gadsden, Charleston, S.C., autogra...: https://www.digitalcommonwealth.org/search/commonwealth:9k41zk125', 'Thomas L. 
Gelzer, Summerville, autograph lette...: https://www.digitalcommonwealth.org/search/commonwealth:ws859j61k']\n" + ] + } + ], + "source": [ + "import argparse\n", + "from langchain_community.vectorstores import Chroma\n", + "from langchain_openai import OpenAIEmbeddings\n", + "from langchain_openai import ChatOpenAI\n", + "from langchain.prompts import ChatPromptTemplate\n", + "from langchain.embeddings import HuggingFaceEmbeddings\n", + "from transformers import AutoTokenizer, AutoModel\n", + "from langchain_core.messages import HumanMessage, SystemMessage\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "\n", + "\n", + "# copy from above\n", + "#CHROMA_PATH = \"/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/tmpaxd8t1dv\"\n", + "CHROMA_PATH = \"./chroma_try\"\n", + "\n", + "PROMPT_TEMPLATE = \"\"\"\n", + "Answer the question based only on the following context:\n", + "\n", + "{context}\n", + "\n", + "---\n", + "\n", + "Answer the question based on the above context: {question}\n", + "\"\"\"\n", + "\n", + "def main(query: str):\n", + " # Create CLI with a default value for Jupyter testing\n", + " parser = argparse.ArgumentParser()\n", + " parser.add_argument(\"query_text\", type=str, help=\"The query text.\")\n", + " args = parser.parse_args(args=[query]) # Add a default value here for testing\n", + " query_text = args.query_text\n", + "\n", + " # Prepare the database\n", + " embedding_function = OpenAIEmbeddings(model=\"text-embedding-3-large\", dimensions=3072)\n", + " #embedding_function = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", + " db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)\n", + "\n", + " results = db.similarity_search_with_relevance_scores(query_text, k=5)\n", + " for i in range(len(results)):\n", + " if len(results) == 0 or results[0][1] < 0.3:\n", + " print(f\"Unable to find matching results for \\\"{query_text}\\\"\")\n", + " print(results[0][1])\n", + " return\n", + " \n", + " context_text = \"\\n\\n---\\n\\n\".join([doc.page_content for doc, _score in results])\n", + " prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)\n", + " prompt = prompt_template.format(context=context_text, question=query_text)\n", + " print(prompt)\n", + "\n", + " model = ChatOpenAI()\n", + " response_text = model.predict(prompt)\n", + "\n", + " sources = [doc.metadata.get(\"title\") + \": \" + str(doc.metadata.get(\"URL\")) for doc, _score in results]\n", + " formatted_response = f\"Response: {response_text}\\n\\nSources: {sources}\"\n", + " # response with context, sources, and answer to my query\n", + " print(formatted_response)\n", + "\n", + "if __name__ == \"__main__\":\n", + " query1 = \"Who did Z.B Oakes receive a letter from?\"\n", + " query2 = \"What did Henry M. Sikes say about India Goods?\"\n", + " query3 = \"What happened in World War II?\"\n", + " main(query1)" + ] + }, + { + "cell_type": "markdown", + "id": "805b793e-16e3-4eca-9b2a-baaa91fde961", + "metadata": {}, + "source": [ + "### Conclusions about Chroma and OpenAI Embedding Model\n", + "Chroma seems like a great option as a vector store, but it is an embedded, lightweight database, so the persisted index has to live on the same machine that runs the queries. Unfortunately, we have not found many examples of projects embedding data at the scale we are working with.\n", + "\n", + "OpenAIEmbeddings performs well, but every embedding call costs money, so it is a no-go."
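One way to keep Chroma in the running without the embedding cost is to persist the store with the same free, local `sentence-transformers/all-mpnet-base-v2` model already used for FAISS above. The sketch below is a minimal example under that assumption; the `./chroma_hf` directory name is arbitrary, and it reuses the `documents` list and the chunking settings from the cell that built `./chroma_try`.

```python
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

CHROMA_HF_PATH = "./chroma_hf"  # arbitrary directory, kept separate from ./chroma_try

# Same local embedding model used with FAISS earlier in the notebook (768 dims, no API cost).
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Reuse the chunking settings from the Chroma cell above.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100, add_start_index=True)
chunks = splitter.split_documents(documents)  # `documents` built earlier in the notebook

db = Chroma.from_documents(chunks, hf_embeddings, persist_directory=CHROMA_HF_PATH)

# Querying is unchanged; only the embedding function differs.
for doc, score in db.similarity_search_with_relevance_scores("Who did Z.B Oakes receive a letter from?", k=5):
    print(round(score, 3), doc.metadata.get("title"))
```

If the 0.3 relevance threshold from the query cell above is kept, it would probably need re-tuning, since different embedding models produce different relevance-score distributions.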
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "4eded946-6f7f-440b-a892-80f8c37db3ab", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_content='Oglethorpe December 8 53\n", + "Z B Oaks esqr\n", + "Dr Sir you will please\n", + "inform me how negros are selling\n", + "& how your market is supplied\n", + "I think I will be in Charleston\n", + "the latter part of this month, &\n", + "will want to buy a Cook &\n", + "good active boy & perhaps two negro fellows\n", + "Very Respectfully\n", + "Yours T G Hudson\n", + "\n", + "\n", + "' metadata={'title': 'T.G. Hudson, Oglethorpe, Ga., autograph letter...', 'abstract': '1161475 Asks market price at Charleston.\\nName: abstract_tsi, dtype: object', 'subtitle': '1161475 NaN\\nName: title_info_primary_subtitle_tsi, dtype: object', 'URL': 'https://www.digitalcommonwealth.org/search/commonwealth:9k41zk460', 'title_alt': '1161475 NaN\\nName: title_info_alternative_tsim, dtype: object', 'genre': \"1161475 ['Manuscripts', 'Correspondence']\\nName: genre_basic_ssim, dtype: object\", 'genre_specific': '1161475 NaN\\nName: genre_specific_ssim, dtype: object'}\n" + ] + } + ], + "source": [ + "print(documents[0])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}