From a067ed54b9d2fa46fd57d05b83ca5a81c00228d2 Mon Sep 17 00:00:00 2001
From: Brandon Vargus
Date: Thu, 21 Nov 2024 10:08:22 -0500
Subject: [PATCH 1/6] added rerank.py; code will also be available in notebook

---
 PoC/rerank.py | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 PoC/rerank.py

diff --git a/PoC/rerank.py b/PoC/rerank.py
new file mode 100644
index 0000000..a0aabbb
--- /dev/null
+++ b/PoC/rerank.py
@@ -0,0 +1 @@
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pandas as pd
from langchain_openai import ChatOpenAI

# replace with file of your choosing
file = open("sample_full_text.json")
full_text = json.load(file)

# metadata csv file; should be included in repo
df_attributes = pd.read_csv("metadata_attributes.csv")

model = ChatOpenAI()  # requires OPENAI_API_KEY; rebound to a sentence encoder below, since nothing in this script calls the chat model

import re

def get_title(text):
    # Strip the pandas Series index and dtype line, keeping only the field value
    match = re.search(r'\d+\s+(.+?)\n', text)

    # Extracting and returning the title if there's a match
    if match:
        title = match.group(1)
        return title

# Turn the BPL data into a Document
from langchain.schema import Document

documents = []

for doc in full_text:
    # Extract metadata fields and apply get_title()
    title = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "title_info_primary_tsi"]))
    title_subtitle = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "title_info_primary_subtitle_tsi"]))
    title_alt = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "title_info_alternative_tsim"]))
    abstract = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "abstract_tsi"]))
    subject_facet = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "subject_facet_ssim"]))
    subject_geographic = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "subject_geographic_sim"]))
    genre = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "genre_basic_ssim"]))
    genre_specific = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "genre_specific_ssim"]))
    name_facet = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "name_facet_ssim"]))
    name_role = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "name_role_tsim"]))
    date_human = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "date_tsim"]))
    date_start = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "date_start_dtsi"]))
    date_end = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "date_end_dtsi"]))
    publisher = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "publisher_tsi"]))
    collection_name = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "collection_name_ssim"]))
    physical_location = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "physical_location_ssim"]))
    related_item_host = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "related_item_host_ssim"]))
    type_of_resource = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "type_of_resource_ssim"]))
    URL = "https://www.digitalcommonwealth.org/search/" + get_title(str(df_attributes.loc[df_attributes["id"] == doc, "id"]))

    # Create Document with metadata
    documents.append(Document(
        page_content=full_text[doc]['text'],
        metadata={
            "title": title,
            "subtitle": title_subtitle,
            "title_alt": title_alt,
            "abstract": abstract,
            "subject_facet": subject_facet,
            "subject_geographic": subject_geographic,
            "genre": genre,
            "genre_specific": genre_specific,
            "name_facet": name_facet,
            "name_role": name_role,
            "date_human": date_human,
            "date_start": date_start,
            "date_end": date_end,
            "publisher": publisher,
            "collection_name": collection_name,
            "physical_location": physical_location,
            "related_item_host": related_item_host,
            "type_of_resource": type_of_resource,
            "URL": URL
        }
    ))

# Now for all of the vector store and reranking stuff
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# embeddings model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# creating the vector store
index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

# index the documents so similarity_search_with_score has something to search
# (mirrors the add_documents step in POC.ipynb)
from uuid import uuid4
uuids = [str(uuid4()) for _ in range(len(documents))]
vector_store.add_documents(documents=documents, ids=uuids)

# now for the reranking step
weights = {
    "title": 1.0,
    "subtitle": 0.95,
    "title_alt": 0.9,
    "abstract": 0.85,
    "subject_facet": 0.8,
    "subject_geographic": 0.75,
    "genre": 0.7,
    "genre_specific": 0.65,
    "name_facet": 0.6,
    "name_role": 0.55,
    "date_human": 0.5,
    "date_start": 0.45,
    "date_end": 0.4,
    "publisher": 0.35,
    "collection_name": 0.3,
    "physical_location": 0.25,
    "related_item_host": 0.2,
    "type_of_resource": 0.15,
    "URL": 0.1
}

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from langchain.embeddings import HuggingFaceEmbeddings

# our vector store:

# embedding model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# compute_relevance_score() below calls model.encode(), which ChatOpenAI does not
# provide, so bind model to the same sentence-transformers encoder for reranking
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

def compute_relevance_score(metadata_value, query):
    """
    Compute cosine similarity between the query and a metadata value using sentence-transformers.

    Args:
        metadata_value (str): The metadata value to compare.
        query (str): The query string.

    Returns:
        float: Cosine similarity score (between 0 and 1).
    """
    if not metadata_value or not query:
        return 0  # Return 0 if either the metadata or query is empty

    # Encode the metadata value and query into embeddings
    embeddings = model.encode([metadata_value, query], convert_to_tensor=False)  # Convert to NumPy
    metadata_embedding, query_embedding = embeddings

    # Compute cosine similarity
    similarity = cosine_similarity([metadata_embedding], [query_embedding])
    return similarity[0][0]  # Extract the scalar similarity value


def rerank_documents(documents, query, weights, vector_store, k=10):
    """
    Rerank documents based on metadata relevance scores and FAISS vector similarity scores.

    Args:
        documents (list): List of Document objects.
        query (str): The query string used for retrieval.
        weights (dict): Weights for each metadata field.
        vector_store (FAISS): The vector store itself, used to get the similarity score.
        k (int): Number of candidates to retrieve before reranking.

    Returns:
        list: Reranked documents in descending order of relevance.
""" final_score = 0 reranked_results = [] returned_docs = vector_store.similarity_search_with_score(query, k) for doc in returned_docs: final_score = doc[1] # Add weighted relevance scores for each metadata field for field, weight in weights.items(): metadata_value = doc[0].metadata.get(field, "") # Safely get metadata field value relevance_score = compute_relevance_score(metadata_value, query) final_score += weight * relevance_score reranked_results.append((doc, final_score)) # Sort documents by the final score in descending order reranked_results.sort(key=lambda x: x[1], reverse=True) return [doc for doc, score in reranked_results] docs = rerank_documents(documents, "Newspaper", weights, vector_store) # now we should get an output like this for some k value: # ('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:gf06jp23d', 'Reranked score: 1.1741459369659424') docs_list = [(docs[i][0].metadata['title'], docs[i][0].metadata['URL'], f"Reranked score: {docs[i][1]}") for i in range(len(docs))] docs_list.sort(key=lambda x: x[2], reverse=True) for doc in docs_list: print(doc) \ No newline at end of file From 055907359ba2a67d672044b1f94fe2ded35f29e1 Mon Sep 17 00:00:00 2001 From: Brandon Vargus Date: Thu, 21 Nov 2024 10:15:40 -0500 Subject: [PATCH 2/6] included reranking --- PoC/POC.ipynb | 1733 ++++++++++++++++++++++++++++--------------------- 1 file changed, 979 insertions(+), 754 deletions(-) diff --git a/PoC/POC.ipynb b/PoC/POC.ipynb index 8f8f0f0..4c2f270 100644 --- a/PoC/POC.ipynb +++ b/PoC/POC.ipynb @@ -5,12 +5,12 @@ "id": "2e7eb6e2-0a1b-42ea-8127-a51f13b4b4b0", "metadata": {}, "source": [ - "# LibRAG Proof of Concept" + "# LibRAG Deployment Phase" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "3934dc57-c4c3-4892-bcf4-bce4aa5c48af", "metadata": {}, "outputs": [], @@ -23,27 +23,29 @@ ] }, { - "cell_type": "code", - "execution_count": 3, - "id": "8848a3a2-6be4-40f3-87ee-0c9dc099117e", + "cell_type": "markdown", + "id": "5a24d843-3692-49d5-8167-af46cf4a1f5a", "metadata": {}, - "outputs": [], "source": [ - "#!pip install sentence_transformers" + "### We are going to ensure that we have our data downloaded from the SCC.\n", + "### We are going to download one interval of the full text, as well as the entire metadata file" ] }, { - "cell_type": "markdown", - "id": "5a24d843-3692-49d5-8167-af46cf4a1f5a", + "cell_type": "code", + "execution_count": null, + "id": "30d31b31-2c2a-40ae-99b6-5502395f8bc7", "metadata": {}, + "outputs": [], "source": [ - "### We are going to ensure that we have our data downloaded from the SCC.\n", - "### We are going to download one interval of the full text, as well as the entire metadata file" + "# file path for metadata file on SCC: /projectnb/sparkgrp/ml-bpl-rag-data/full_data/bpl_data.json\n", + "meta = open(\"../EDA Phase/bpl-digital-commonwealth/bpl_data.json\")\n", + "bpl_metadata = json.load(meta)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "id": "b50d8c5f-dfea-454f-860e-13315a9c2fea", "metadata": {}, "outputs": [], @@ -56,32 +58,20 @@ }, { "cell_type": "code", - "execution_count": 15, - "id": "6567c6f4-1f82-4cfe-aa51-7f4a3bacf6af", + "execution_count": null, + "id": "1e4c5601-648f-4ff1-a5ce-d8823c46f884", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "133\n" - ] - } - ], + "outputs": [], "source": [ "print(len(full_text))" ] }, { - "cell_type": "code", - "execution_count": 5, - "id": 
"30d31b31-2c2a-40ae-99b6-5502395f8bc7", + "cell_type": "markdown", + "id": "1148a15c-0965-4613-bdb4-15e74467fd16", "metadata": {}, - "outputs": [], "source": [ - "# file path for metadata file on SCC: /projectnb/sparkgrp/ml-bpl-rag-data/full_data/bpl_data.json\n", - "meta = open(\"../EDA Phase/bpl-digital-commonwealth/bpl_data.json\")\n", - "bpl_metadata = json.load(meta)" + "Here's how to access the text documents:" ] }, { @@ -137,98 +127,10 @@ }, { "cell_type": "markdown", - "id": "d6f92651-0ff8-4ec7-b454-446e83e9f1d9", - "metadata": {}, - "source": [ - "### Embedding a paragraph using Word2Vec" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "255817ec-711d-4888-81f5-6374c59e8f95", - "metadata": {}, - "outputs": [], - "source": [ - "# from sentence_transformers import SentenceTransformer\n", - "\n", - "# # Load a pre-trained Sentence-BERT model\n", - "# model = SentenceTransformer('paraphrase-MiniLM-L6-v2')\n", - "\n", - "# # Example paragraph\n", - "# paragraph = full_text['commonwealth:w3764603d']['text']\n", - "# paragraph_embedding = model.encode(paragraph)\n", - "\n", - "# # Output: a vector representation of the paragraph\n", - "# print(paragraph_embedding)\n" - ] - }, - { - "cell_type": "markdown", - "id": "5cfe50d1-9efb-405f-aec4-5091dace7222", - "metadata": {}, - "source": [ - "### Setting up a Retriever" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "fdc1e05d-35ca-45e7-af73-b5ea8d26199f", - "metadata": {}, - "outputs": [], - "source": [ - "#!pip install langchain openai faiss-cpu" - ] - }, - { - "cell_type": "markdown", - "id": "d3033fe3-995e-46d5-86f4-5eda6a7e266a", - "metadata": {}, - "source": [ - "#### After ensuring we have the necessary dependencies, we are going to make our retriever" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "9e509e10-d10b-465c-83ba-e7ba9828f8fd", + "id": "bb3f0d7e-a982-4e18-a9eb-783df449ff09", "metadata": {}, - "outputs": [], "source": [ - "from typing import List\n", - "\n", - "from langchain_core.callbacks import CallbackManagerForRetrieverRun\n", - "from langchain_core.documents import Document\n", - "from langchain_core.retrievers import BaseRetriever\n", - "\n", - "\n", - "class ToyRetriever(BaseRetriever):\n", - " \"\"\"A toy retriever that contains the top k documents that contain the user query.\n", - "\n", - " This retriever only implements the sync method _get_relevant_documents.\n", - "\n", - " If the retriever were to involve file access or network access, it could benefit\n", - " from a native async implementation of `_aget_relevant_documents`.\n", - "\n", - " As usual, with Runnables, there's a default async implementation that's provided\n", - " that delegates to the sync implementation running on another thread.\n", - " \"\"\"\n", - "\n", - " documents: List[Document]\n", - " \"\"\"List of documents to retrieve from.\"\"\"\n", - " k: int\n", - " \"\"\"Number of top results to return\"\"\"\n", - " def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun\n", - " ) -> List[str]:\n", - " matching_documents = []\n", - " for document in documents:\n", - " if len(matching_documents) >= self.k:\n", - " return matching_documents\n", - "\n", - " if query.lower() in document.page_content.lower():\n", - " matching_documents.append(document.metadata['title'])\n", - " return matching_documents\n" + "### Create Metadata Dataframe" ] }, { @@ -251,98 +153,6 @@ "df.drop(columns=df.columns[0], axis=1, inplace=True)" ] }, - { - "cell_type": "code", - 
"execution_count": 12, - "id": "fc415a0f-6e2e-4b2d-8355-62a569806380", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
typeattributeslinks
0DigitalObject{'id': 'commonwealth-oai:xp68md23x', 'system_c...{'self': 'https://www.digitalcommonwealth.org/...
1DigitalObject{'id': 'commonwealth-oai:xp68m844v', 'system_c...{'self': 'https://www.digitalcommonwealth.org/...
2DigitalObject{'id': 'commonwealth-oai:xp68mb49n', 'system_c...{'self': 'https://www.digitalcommonwealth.org/...
3DigitalObject{'id': 'commonwealth-oai:xp68mc60v', 'system_c...{'self': 'https://www.digitalcommonwealth.org/...
4DigitalObject{'id': 'commonwealth-oai:xp68mc72n', 'system_c...{'self': 'https://www.digitalcommonwealth.org/...
\n", - "
" - ], - "text/plain": [ - " type attributes \\\n", - "0 DigitalObject {'id': 'commonwealth-oai:xp68md23x', 'system_c... \n", - "1 DigitalObject {'id': 'commonwealth-oai:xp68m844v', 'system_c... \n", - "2 DigitalObject {'id': 'commonwealth-oai:xp68mb49n', 'system_c... \n", - "3 DigitalObject {'id': 'commonwealth-oai:xp68mc60v', 'system_c... \n", - "4 DigitalObject {'id': 'commonwealth-oai:xp68mc72n', 'system_c... \n", - "\n", - " links \n", - "0 {'self': 'https://www.digitalcommonwealth.org/... \n", - "1 {'self': 'https://www.digitalcommonwealth.org/... \n", - "2 {'self': 'https://www.digitalcommonwealth.org/... \n", - "3 {'self': 'https://www.digitalcommonwealth.org/... \n", - "4 {'self': 'https://www.digitalcommonwealth.org/... " - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, { "cell_type": "code", "execution_count": 14, @@ -746,8 +556,7 @@ ], "source": [ "df_attributes = pd.json_normalize(df['attributes'])\n", - "df_attributes = pd.concat([df.drop(columns=['attributes']), df_attributes], axis=1)\n", - "df_attributes.head(10)" + "df_attributes = pd.concat([df.drop(columns=['attributes']), df_attributes], axis=1)" ] }, { @@ -760,585 +569,1003 @@ "df_attributes.to_csv(\"metadata_attributes.csv\")" ] }, + { + "cell_type": "markdown", + "id": "ce755e72-348e-45b3-8cb4-7e0202818e16", + "metadata": {}, + "source": [ + "### Optionally, read the csv if it is already downloaded" + ] + }, { "cell_type": "code", - "execution_count": 13, - "id": "92a7759e-36d6-455a-a1e4-bafcb2041f7d", + "execution_count": 4, + "id": "49e1b410-a7b0-4ef3-bbb8-2c240e568a73", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/ipykernel_12916/3787498575.py:1: DtypeWarning: Columns (10,16,17,18,20,21,22,23,24,25,27,29,33,34,36,41,42,43,44,54,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,128,129,130,131,132,133,134,135,136,137,138,139,140) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " df_attributes = pd.read_csv(\"metadata_attributes.csv\")\n" + ] + } + ], + "source": [ + "df_attributes = pd.read_csv(\"metadata_attributes.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "972493d9-63a1-477d-811e-c1d951a2d63c", + "metadata": {}, + "source": [ + "Writing the get_title function to clear away any whitespace and newline characters from the title of each document." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d819646f-51cc-4817-9542-ecfc9ea4af33", "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
typelinksidsystem_create_dtsisystem_modified_dtsicurator_model_ssicurator_model_suffix_ssititle_info_primary_tsigenre_basic_ssimgenre_specific_ssim...storage_key_base_ssidentifier_issn_ssimfrequency_tsicontained_by_ssinote_credits_tsimidentifier_isbn_ssimidentifier_music_publisher_ssimnote_arrangement_tsimtranscription_ark_id_ssitranscription_key_base_ss
0DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68md23x2021-03-04T00:13:09Z2021-09-02T20:40:00ZCurator::DigitalObjectDigitalObjectمن فضلكم توقفوا الأشخاص الذين ارتكبوا أسوأ الج...[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68m844v2021-03-03T23:58:44Z2021-09-02T20:21:32ZCurator::DigitalObjectDigitalObject海员们 : 要警惕航运事故[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68mb49n2021-03-04T00:06:25Z2021-09-02T20:30:29ZCurator::DigitalObjectDigitalObject人間としての尊厳を保てる : 生活賃金を[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68mc60v2021-03-04T00:10:40Z2021-09-02T20:35:20ZCurator::DigitalObjectDigitalObject野火[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68mc72n2021-03-04T00:11:07Z2021-09-02T20:35:52ZCurator::DigitalObjectDigitalObject野火[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", - "

5 rows × 140 columns

\n", - "
" - ], "text/plain": [ - " type links \\\n", - "0 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", - "1 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", - "2 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", - "3 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", - "4 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", - "\n", - " id system_create_dtsi system_modified_dtsi \\\n", - "0 commonwealth-oai:xp68md23x 2021-03-04T00:13:09Z 2021-09-02T20:40:00Z \n", - "1 commonwealth-oai:xp68m844v 2021-03-03T23:58:44Z 2021-09-02T20:21:32Z \n", - "2 commonwealth-oai:xp68mb49n 2021-03-04T00:06:25Z 2021-09-02T20:30:29Z \n", - "3 commonwealth-oai:xp68mc60v 2021-03-04T00:10:40Z 2021-09-02T20:35:20Z \n", - "4 commonwealth-oai:xp68mc72n 2021-03-04T00:11:07Z 2021-09-02T20:35:52Z \n", - "\n", - " curator_model_ssi curator_model_suffix_ssi \\\n", - "0 Curator::DigitalObject DigitalObject \n", - "1 Curator::DigitalObject DigitalObject \n", - "2 Curator::DigitalObject DigitalObject \n", - "3 Curator::DigitalObject DigitalObject \n", - "4 Curator::DigitalObject DigitalObject \n", - "\n", - " title_info_primary_tsi genre_basic_ssim \\\n", - "0 من فضلكم توقفوا الأشخاص الذين ارتكبوا أسوأ الج... [Posters] \n", - "1 海员们 : 要警惕航运事故 [Posters] \n", - "2 人間としての尊厳を保てる : 生活賃金を [Posters] \n", - "3 野火 [Posters] \n", - "4 野火 [Posters] \n", - "\n", - " genre_specific_ssim ... storage_key_base_ss identifier_issn_ssim \\\n", - "0 [Political posters] ... NaN NaN \n", - "1 [Political posters] ... NaN NaN \n", - "2 [Political posters] ... NaN NaN \n", - "3 [Political posters] ... NaN NaN \n", - "4 [Political posters] ... NaN NaN \n", - "\n", - " frequency_tsi contained_by_ssi note_credits_tsim identifier_isbn_ssim \\\n", - "0 NaN NaN NaN NaN \n", - "1 NaN NaN NaN NaN \n", - "2 NaN NaN NaN NaN \n", - "3 NaN NaN NaN NaN \n", - "4 NaN NaN NaN NaN \n", - "\n", - " identifier_music_publisher_ssim note_arrangement_tsim \\\n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 NaN NaN \n", - "4 NaN NaN \n", - "\n", - " transcription_ark_id_ssi transcription_key_base_ss \n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 NaN NaN \n", - "4 NaN NaN \n", - "\n", - "[5 rows x 140 columns]" + "['海员们', ':', '要警惕航运事故\\nName:', 'title_info_primary_tsi,', 'dtype:', 'object']" ] }, - "execution_count": 13, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_attributes.loc[df_attributes[]]" + "str(df_attributes.loc[df_attributes[\"id\"] == \"commonwealth-oai:xp68m844v\"][\"title_info_primary_tsi\"]).split(\" \")[4:]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "754c22c6-5d35-436a-922a-0f5f6cafa6c9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'1199893 NaN\\nName: title_info_alternative_tsim, dtype: object'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "str(df_attributes.loc[df_attributes[\"id\"] == \"commonwealth:1j92ng13k\"][\"title_info_alternative_tsim\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "09331a9b-135a-46ec-8b0a-70c70ba1c261", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "585812" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_attributes[\"genre_specific_ssim\"].isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": 
"914df792-9343-43fd-83cd-7678e5a56f8c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1303800" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_attributes.shape[0]" + ] + }, + { + "cell_type": "markdown", + "id": "1c6d1c2d-16ad-43a0-844a-72238901c07d", + "metadata": {}, + "source": [ + "### Turn full text into Documents type" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "51f7f3b8-ed7a-43a1-949f-aa43e594af99", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 Poster produced by the International Transport...\n", + "Name: abstract_tsi, dtype: object\n" + ] + } + ], + "source": [ + "print(str(df_attributes.loc[df_attributes[\"id\"] == \"commonwealth-oai:xp68m844v\", \"abstract_tsi\"]))" + ] + }, + { + "cell_type": "markdown", + "id": "b072fe0a-b538-4704-85a0-b6862b0653b6", + "metadata": {}, + "source": [ + "Important Metadata to Embed:\n", + "- title_info_primary_tsi\n", + "- title_info_primary_subtitle_tsi\n", + "- title_info_alternative_tsim\n", + "- abstract_tsi\n", + "- subject_facet_ssim\n", + "- subject_geographic_sim\n", + "- genre_basic_ssim\n", + "- genre_specific_ssim" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "15d83b93-6840-4618-ac40-1c7de9e5cc1e", + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "def get_title(text):\n", + " match = re.search(r'\\d+\\s+(.+?)\\n', text)\n", + "\n", + " # Extracting and printing the title if there's a match\n", + " if match:\n", + " title = match.group(1)\n", + " return title" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6c3f5614-cc45-4fb4-9b13-1fd0361da1b6", + "metadata": {}, + "outputs": [], + "source": [ + "# Turn the BPL data into a Document\n", + "from langchain.schema import Document\n", + "\n", + "documents = []\n", + "\n", + "for doc in full_text:\n", + " # Extract metadata fields and apply get_title()\n", + " title = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"title_info_primary_tsi\"]))\n", + " title_subtitle = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"title_info_primary_subtitle_tsi\"]))\n", + " title_alt = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"title_info_alternative_tsim\"]))\n", + " abstract = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"abstract_tsi\"]))\n", + " subject_facet = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"subject_facet_ssim\"]))\n", + " subject_geographic = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"subject_geographic_sim\"]))\n", + " genre = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"genre_basic_ssim\"]))\n", + " genre_specific = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"genre_specific_ssim\"]))\n", + " name_facet = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"name_facet_ssim\"]))\n", + " name_role = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"name_role_tsim\"]))\n", + " date_human = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"date_tsim\"]))\n", + " date_start = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"date_start_dtsi\"]))\n", + " date_end = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"date_end_dtsi\"]))\n", + " publisher = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"publisher_tsi\"]))\n", + " 
collection_name = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"collection_name_ssim\"]))\n", + " physical_location = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"physical_location_ssim\"]))\n", + " related_item_host = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"related_item_host_ssim\"]))\n", + " type_of_resource = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"type_of_resource_ssim\"]))\n", + " URL = \"https://www.digitalcommonwealth.org/search/\" + get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"id\"]))\n", + " \n", + " # Create Document with metadata\n", + " documents.append(Document(\n", + " page_content=full_text[doc]['text'],\n", + " metadata={\n", + " \"title\": title,\n", + " \"subtitle\": title_subtitle,\n", + " \"title_alt\": title_alt,\n", + " \"abstract\": abstract,\n", + " \"subject_facet\": subject_facet,\n", + " \"subject_geographic\": subject_geographic,\n", + " \"genre\": genre,\n", + " \"genre_specific\": genre_specific,\n", + " \"name_facet\": name_facet,\n", + " \"name_role\": name_role,\n", + " \"date_human\": date_human,\n", + " \"date_start\": date_start,\n", + " \"date_end\": date_end,\n", + " \"publisher\": publisher,\n", + " \"collection_name\": collection_name,\n", + " \"physical_location\": physical_location,\n", + " \"related_item_host\": related_item_host,\n", + " \"type_of_resource\": type_of_resource,\n", + " \"URL\": URL\n", + " }\n", + " ))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "b58b4530-27e4-4ed4-80be-4ee240892480", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['title', 'subtitle', 'title_alt', 'abstract', 'subject_facet', 'subject_geographic', 'genre', 'genre_specific', 'name_facet', 'name_role', 'date_human', 'date_start', 'date_end', 'publisher', 'collection_name', 'physical_location', 'related_item_host', 'type_of_resource', 'URL'])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents[-1].metadata.keys()" + ] + }, + { + "cell_type": "markdown", + "id": "6580d4c1-9ab0-44c8-9ea7-cbeb80934f4b", + "metadata": {}, + "source": [ + "# RAG Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "648466aa-3142-4ece-aa02-4454c7f6ee41", + "metadata": {}, + "outputs": [], + "source": [ + "# set openai api key\n", + "import os\n", + "os.environ[\"OPENAI_API_KEY\"] = \"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\"" + ] + }, + { + "cell_type": "markdown", + "id": "895258be-7bf4-4d8d-8f5a-79d80dfe8c36", + "metadata": {}, + "source": [ + "### Using FAISS Vector Store" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "1b537ce7-2eb2-4392-bc71-5a33ede503df", + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install langchain-ollama" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "c1b3815e-dec5-4f22-b863-d91a57b5ccad", + "metadata": {}, + "outputs": [], + "source": [ + "#from langchain_ollama import OllamaEmbeddings\n", + "from langchain_openai import OpenAIEmbeddings\n", + "import faiss\n", + "from langchain_community.docstore.in_memory import InMemoryDocstore\n", + "from langchain_community.vectorstores import FAISS" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "03554a37-d142-45eb-be33-a333929b927d", + "metadata": {}, + "outputs": 
[], + "source": [ + "#embeddings = OpenAIEmbeddings(model=\"text-embedding-3-large\", dimensions=3072)\n", + "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "0bea78d8-4b72-4da0-a50d-89e0fe509454", + "metadata": {}, + "outputs": [], + "source": [ + "index = faiss.IndexFlatL2(len(embeddings.embed_query(\"hello world\")))\n", + "\n", + "vector_store = FAISS(\n", + " embedding_function=embeddings,\n", + " index=index,\n", + " docstore=InMemoryDocstore(),\n", + " index_to_docstore_id={},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "b1bee292-778a-480e-8eb3-5cc37587ce85", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['68762a4a-dbcb-411e-9047-ed04e44a794a',\n", + " 'f622dbf6-aee1-43e4-aba1-59fde531a7ad',\n", + " '927c3752-7746-4c88-a58b-e01a6547d857',\n", + " '70bf52c8-b79e-459b-9aa0-c1f12e12d842',\n", + " 'ade0d10f-c301-4ac1-99b8-6e26562ac259',\n", + " '01721bef-99a0-4034-8bc0-655ebec123d1',\n", + " '5c0969cd-a297-4b3f-b73e-9b3e5ed60c1e',\n", + " 'f707ab88-6b09-48fc-94ed-93ac650931a2',\n", + " 'bd269b63-b143-4535-83f8-27515883647f',\n", + " '5b0f71e5-520e-4090-8aa7-fc4f152f42f3',\n", + " 'a97737a2-ba3c-4a6f-b169-967ea2856f57',\n", + " '5e9a4add-7aae-45c1-a071-bd8510e81829',\n", + " '53b49886-6e08-4c6e-b69d-d12fa9accc12',\n", + " 'cdb37b1a-8ff1-462c-8052-90b8681d1700',\n", + " '44eca254-78b8-4ce5-aa31-ee02b8895734',\n", + " '15cb7183-2758-468a-88d0-35348ccf357a',\n", + " 'fd3e2244-e666-4d73-9cac-d980ce9b49ee',\n", + " 'b48501c1-c52d-4cbf-bcc0-2b669017736c',\n", + " '3eda50a4-0269-44ea-8594-e1ed21662dc1',\n", + " 'd6d96535-021c-44c8-89c8-796ebd3ebc0d',\n", + " 'e20dfb8b-7e99-4299-82ae-b69c4b2aa5e8',\n", + " '89cbeb0c-253a-4c01-aba0-b3ef82622381',\n", + " 'fa325062-a886-499d-8377-2feedc5a8262',\n", + " '379080c0-03f9-4ff7-89b4-be8edcd7be96',\n", + " '06ff8bdc-975a-4ec2-8273-e2ca4f489df4',\n", + " '2ab5e13c-9051-4022-a476-92a97b05c5c1',\n", + " '7e144845-3350-4871-9404-fa7e3f734b78',\n", + " '2de7c819-546d-4470-8f0a-b91a7903b0ed',\n", + " 'a05149dd-2b01-4f03-9cd0-01fcea8780d2',\n", + " 'd9d462c0-d928-49cc-9172-7152328e4d51',\n", + " '8edb060d-8959-418d-9470-5da965ad2b9d',\n", + " '19d25b97-34b0-4f53-9ec3-4060239242dc',\n", + " '49a6e2ef-7f85-4a3a-98d6-3b3dd8b4c3b6',\n", + " '54913f79-d38b-40b3-b251-86c59721fa3d',\n", + " 'e87af048-11d2-4977-b4aa-6465d5017fba',\n", + " 'de44abe6-739b-44c5-806b-06d52aa1e56b',\n", + " '82898991-0951-40f6-8926-309db4a807b7',\n", + " '02c3a940-74db-4d60-a173-ca9bbd702cd0',\n", + " '20f881bb-7d3b-4f49-82a3-5ac8e7578322',\n", + " '141d3d1e-6d5a-4dd9-95bb-4ae07c81e39c',\n", + " 'b8b415f5-670b-4b56-b1ce-fe424769a182',\n", + " '55de7042-5452-4ea2-820a-6b35a594cbf4',\n", + " '404565f4-ee23-423f-a4d8-550b6f8b6e41',\n", + " '300e8b0b-2b5a-4618-aec7-8d5324de106f',\n", + " 'cab508a6-25f1-43ca-9f46-835af4e922af',\n", + " '2d370174-8c8f-418b-a083-84f52cfe9a36',\n", + " '65ff5f75-cc47-4610-999d-9a11f9894b32',\n", + " '3995b720-99e0-4f68-a740-46e7f2e5c59f',\n", + " '40af7846-ba78-48eb-83e0-6b6b844f8c79',\n", + " 'c1f51576-1e99-4f77-be03-ce202f238dbe',\n", + " 'ec7d4da2-e7a5-40d9-9e67-a37c12e03bf9',\n", + " '270222bf-57d7-4cfe-a268-1f6167ff6d7e',\n", + " '9a8373ee-c2db-4a1d-b5ac-6a04c0d2becd',\n", + " '4504f7c3-6820-49c7-be23-43d9b8ba58aa',\n", + " 'b181eab2-587c-4de7-92a4-6f8b0f3f2b00',\n", + " '7e889a1d-c1b8-4c53-af90-de3eaa6db021',\n", + " '6007601e-7b9d-4064-b16c-655169e9d72a',\n", + " 
'11e280e2-dfce-4a52-8867-296155f78eb8',\n", + " '821de396-47f8-4606-934c-13c1bf884473',\n", + " '9b7e6753-61d1-446d-8a2b-344b0e41f84c',\n", + " '5c2d20de-9344-441a-a1d8-0ca0f4b1fca4',\n", + " 'aee572eb-ec36-4f61-b35e-9dd2c6ab36f2',\n", + " '94ac76ef-42f8-4a03-bffe-f86cbab72408',\n", + " '1be0d43f-ee71-419c-ba50-3aaf024d27c5',\n", + " 'bfd48d92-f28e-48f1-815b-756642cf2b8e',\n", + " '8023464e-5507-4192-ac25-f06fb585e68d',\n", + " '75bde2f0-1b5f-4e05-aa8a-ff2d315a06b4',\n", + " '0b34222f-1dda-4d19-9810-11e75d1272e5',\n", + " '97500c01-1c8f-42ef-8efd-19223c68c46a',\n", + " 'cf8c5ea5-d750-4641-bb5b-e971494b7666',\n", + " '3fd5a660-ab78-4897-a700-1a73b533b0cc',\n", + " '17368981-b502-45e1-abaa-4a9b09c096a8',\n", + " 'de201d3e-8d95-41a6-b8ac-4afba7909331',\n", + " '192f929f-1bba-4e75-b05e-aa8ae937a092',\n", + " '86b39447-2fb0-4899-b1c1-6a89e6bfee0f',\n", + " 'ff5d3a8e-4da6-47f0-b573-036670c5a936',\n", + " 'ef0a7589-2792-43b9-9895-b40d175e5ff9',\n", + " '97338842-0c45-4e94-ac34-81dc74d259fd',\n", + " 'f24aca1a-8ea3-40fc-a868-c300bd6c7c5a',\n", + " '41af6f16-89f8-46c5-89ed-a0853385c24c',\n", + " 'a2f0344b-6ca1-4c2c-ac08-de839b7a61cd',\n", + " '8d23a995-124f-40c5-820a-b86eacb7589f',\n", + " '8846fe55-3eda-4af1-aee1-4c8faa9a5c73',\n", + " 'ae4e13a7-db9f-43d6-b27b-913db7f23b48',\n", + " '05846bed-e2ba-4b45-84eb-aef0fbdb1052',\n", + " 'ac778fa3-e3b2-42e3-8152-c63701895dfe',\n", + " 'c697142b-3c95-440d-b61a-7183c08c171d',\n", + " 'caa1e83f-ae87-4704-9f2e-d1dad34a9cd2',\n", + " '43847c84-8fe5-4457-9f82-526695e2e97a',\n", + " '5521bbc6-cdff-4703-bf3e-63e01596d21c',\n", + " '3c275f68-8f64-452b-bece-3949e2c25b22',\n", + " 'bf581ede-0913-424d-8a2d-216ae9cadcb0',\n", + " '117e9f7d-7c8b-4366-9f84-fbd5dd1171dd',\n", + " 'c116254f-1415-4188-bc93-0487bcbe47ee',\n", + " '40fac159-6763-4a30-addb-975a6e4e69a3',\n", + " '7a181599-2847-4d9b-957f-f4fc7468aea4',\n", + " '4df69e1f-2e1b-4be0-a161-a374f493c4cb',\n", + " '37813f0f-33a6-4adc-ac55-cbca0b74471f',\n", + " 'b0dbb8ef-e049-4393-a159-2e0f9081b0a3',\n", + " '57da8d7d-e0fc-4868-938d-a85f6d6663ff',\n", + " '618dad07-a666-4827-b188-55d5bf31aed0',\n", + " '3e2724b7-4f24-42fd-9def-ca0f9edcd615',\n", + " '41bf1d9b-da94-4e8f-9805-18bf93ff1100',\n", + " 'c5c85271-1e66-4dd2-a338-a2b48a9b07e4',\n", + " '80646915-6af3-423c-a2de-71786272f086',\n", + " 'bf824464-3eae-4343-b0ab-5b565445daec',\n", + " 'd3127b71-3763-4532-8863-0ce42170dfdb',\n", + " '700f2cb1-85bd-4d28-b95f-68f4536185c1',\n", + " 'f79d14f8-9eaa-4f3b-af7c-62095dae4497',\n", + " '196c8830-7cea-485e-a61c-af5863922733',\n", + " '989f5923-c37f-4a18-b61f-3e0b912f01d2',\n", + " '36ea71ca-36e3-4fcd-bdb9-0f41a83378a3',\n", + " '3567b36a-f249-4ab2-9392-a734303ce5c1',\n", + " '569ed9cf-6b14-4822-892c-4d19205490f2',\n", + " '7db05e1e-0406-4d9f-bdde-e7351932c25e',\n", + " '818a69ab-de3d-42f4-92d0-2accb701ef8a',\n", + " '96fc14b8-4930-4177-b20c-dba396297577',\n", + " 'ecca8db2-b629-4f05-8018-4aa99dc26842',\n", + " '2cc41514-6e02-4cba-bbb4-c2a69454559b',\n", + " 'b41fa3b0-ea8e-40e5-a0be-1abd27cbc23e',\n", + " '1254e098-a9dc-4eba-a0b9-929ed848ed91',\n", + " '1a9ea687-3318-4ad7-a55a-2e0044583667',\n", + " 'ff5a57dd-69a9-4d9a-9d26-56b9271f1b8e',\n", + " '593a9427-03ae-4b3b-9215-2d14f10341f3',\n", + " '5903b7e0-489f-4011-bbd7-cac82b19423a',\n", + " '48072b10-faae-4609-a298-5c6193488ed9',\n", + " 'ec7cd14c-1248-4285-a090-fc8d0ce96fd4',\n", + " 'a9c6016b-b100-4e46-8b42-a232b9be6459',\n", + " '7e851d53-1a48-4289-8b0a-27834e3e044d',\n", + " '77dde80d-7b70-4ab6-b042-c1e67932f36a',\n", + " '4642025a-d842-4fb3-83ef-76ce90a6a2bb',\n", + " 
'256c37b7-38b7-4642-a501-7a611e0763ac',\n", + " '65962070-4375-4479-872b-fc3300c3f1af']" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from uuid import uuid4\n", + "\n", + "uuids = [str(uuid4()) for _ in range(len(documents))]\n", + "\n", + "vector_store.add_documents(documents=documents, ids=uuids)" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "id": "0c3a0a6b-4640-4036-863e-77a1a715a0e3", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['https://www.digitalcommonwealth.org/search/commonwealth:dv144791c', 'https://www.digitalcommonwealth.org/search/commonwealth:rv048f292', 'https://www.digitalcommonwealth.org/search/commonwealth:9019vm69m', 'https://www.digitalcommonwealth.org/search/commonwealth:dv1441451', 'https://www.digitalcommonwealth.org/search/commonwealth:w3764472d', 'https://www.digitalcommonwealth.org/search/commonwealth:6w924s12f', 'https://www.digitalcommonwealth.org/search/commonwealth:6w924r91w', 'https://www.digitalcommonwealth.org/search/commonwealth:wm118g867']\n" + ] + } + ], + "source": [ + "retriever = vector_store.similarity_search_with_score(\n", + " \"Manuscripts\",\n", + " k=3\n", + ")\n", + "r = retriever.invoke(\"John Bishop Estlin\")\n", + "r_list = [x.metadata[\"URL\"] for x in r]\n", + "print(r_list)" + ] + }, + { + "cell_type": "markdown", + "id": "d3a257cc-c687-4fc5-91a7-0c3048a49e70", + "metadata": {}, + "source": [ + "### Now for the Reranking Step:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "013d5fca-c621-4a7e-ac0e-ad5c9aba577f", + "metadata": {}, + "outputs": [], + "source": [ + "weights2 = {\n", + " \"title\": 1.0,\n", + " \"subtitle\": 0.95,\n", + " \"title_alt\": 0.9,\n", + " \"abstract\": 0.85,\n", + " \"subject_facet\": 0.8,\n", + " \"subject_geographic\": 0.75,\n", + " \"genre\": 0.7,\n", + " \"genre_specific\": 0.65,\n", + " \"name_facet\": 0.6,\n", + " \"name_role\": 0.55,\n", + " \"date_human\": 0.5,\n", + " \"date_start\": 0.45,\n", + " \"date_end\": 0.4,\n", + " \"publisher\": 0.35,\n", + " \"collection_name\": 0.3,\n", + " \"physical_location\": 0.25,\n", + " \"related_item_host\": 0.2,\n", + " \"type_of_resource\": 0.15,\n", + " \"URL\": 0.1\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "cc1f04d1-f553-46ae-bfc6-248125f62423", + "metadata": {}, + "outputs": [], + "source": [ + "from sentence_transformers import SentenceTransformer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "import numpy as np\n", + "from langchain.embeddings import HuggingFaceEmbeddings\n", + "\n", + "# our vector store:\n", + "\n", + "# embedding model\n", + "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", + "\n", + "def compute_relevance_score(metadata_value, query):\n", + " \"\"\"\n", + " Compute cosine similarity between the query and a metadata value using sentence-transformers.\n", + "\n", + " Args:\n", + " metadata_value (str): The metadata value to compare.\n", + " query (str): The query string.\n", + "\n", + " Returns:\n", + " float: Cosine similarity score (between 0 and 1).\n", + " \"\"\"\n", + " if not metadata_value or not query:\n", + " return 0 # Return 0 if either the metadata or query is empty\n", + " \n", + " # Encode the metadata value and query into embeddings\n", + " embeddings = model.encode([metadata_value, query], convert_to_tensor=False) # Convert to NumPy\n", + " 
metadata_embedding, query_embedding = embeddings\n", + "\n", + " # Compute cosine similarity\n", + " similarity = cosine_similarity([metadata_embedding], [query_embedding])\n", + " return similarity[0][0] # Extract the scalar similarity value\n", + "\n", + "\n", + "\n", + "def rerank_documents(documents, query, weights, vector_store, k=10):\n", + " \"\"\"\n", + " Rerank documents based on metadata relevance scores and FAISS vector similarity scores.\n", + "\n", + " Args:\n", + " documents (list): List of Document objects.\n", + " query (str): The query string used for retrieval.\n", + " weights (dict): Weights for each metadata field.\n", + " vector_store (str): The vector store itself to get the similarity score\n", + "\n", + " Returns:\n", + " list: Reranked documents in descending order of relevance.\n", + " \"\"\"\n", + "\n", + " final_score = 0\n", + "\n", + " reranked_results = []\n", + " returned_docs = vector_store.similarity_search_with_score(query, k)\n", + " for doc in returned_docs:\n", + " final_score = doc[1]\n", + " # Add weighted relevance scores for each metadata field\n", + " for field, weight in weights.items():\n", + " metadata_value = doc[0].metadata.get(field, \"\") # Safely get metadata field value\n", + " relevance_score = compute_relevance_score(metadata_value, query)\n", + " final_score += weight * relevance_score\n", + "\n", + " reranked_results.append((doc, final_score))\n", + "\n", + " # Sort documents by the final score in descending order\n", + " reranked_results.sort(key=lambda x: x[1], reverse=True)\n", + " return [doc for doc, score in reranked_results]\n", + "\n", + "\n", + "docs = rerank_documents(documents, \"Newspaper\", weights2, vector_store)" ] }, { "cell_type": "code", - "execution_count": 3, - "id": "49e1b410-a7b0-4ef3-bbb8-2c240e568a73", + "execution_count": 78, + "id": "2d9172aa-6c15-4c90-856e-d0ee53100721", "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/ipykernel_26237/3787498575.py:1: DtypeWarning: Columns (10,16,17,18,20,21,22,23,24,25,27,29,33,34,36,41,42,43,44,54,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,128,129,130,131,132,133,134,135,136,137,138,139,140) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", - " df_attributes = pd.read_csv(\"metadata_attributes.csv\")\n" + "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:gf06jp23d', 'Reranked score: 1.1741459369659424')\n", + "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:k356cp803', 'Reranked score: 1.161521077156067')\n", + "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:t435k083s', 'Reranked score: 1.1445826292037964')\n", + "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:8s45sw212', 'Reranked score: 1.1416451930999756')\n", + "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:9p292v62n', 'Reranked score: 1.1416230201721191')\n", + "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:z890vf594', 'Reranked score: 1.1343271732330322')\n", + "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:pv63jm38v', 'Reranked score: 1.0997507572174072')\n", + "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:05744b168', 'Reranked score: 1.0604684352874756')\n", + "(\"Thomas's Massachusetts Spy, or, Worcester Gazette\", 'https://www.digitalcommonwealth.org/search/commonwealth:v405x072q', 'Reranked score: 1.0452649593353271')\n", + "(\"Thomas's Massachusetts Spy, or, Worcester Gazette\", 'https://www.digitalcommonwealth.org/search/commonwealth:1831h918x', 'Reranked score: 1.024101972579956')\n" ] } ], "source": [ - "df_attributes = pd.read_csv(\"metadata_attributes.csv\")" + "#print([docs[i].metadata['title'] for i in range(len(docs))])\n", + "docs_list = [(docs[i][0].metadata['title'], docs[i][0].metadata['URL'], f\"Reranked score: {docs[i][1]}\") for i in range(len(docs))]\n", + "docs_list.sort(key=lambda x: x[2], reverse=True)\n", + "for doc in docs_list:\n", + " print(doc)" ] }, { - "cell_type": "code", - "execution_count": 16, - "id": "7b269a56-1e1a-4cc7-8dbf-a37ecd6222ac", + "cell_type": "markdown", + "id": "07cdf844-72c6-41ef-bade-9afb52bceed8", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "type 1303800\n", - "attributes 1303800\n", - "links 1303800\n", - "dtype: int64" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "df.count()" + "Immediately we get much better performance because now only Newspapers are returned." 
] }, { - "cell_type": "code", - "execution_count": 46, - "id": "d819646f-51cc-4817-9542-ecfc9ea4af33", + "cell_type": "markdown", + "id": "18719878-92c6-458c-ae81-21d9fe5f0bd8", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['海员们', ':', '要警惕航运事故\\nName:', 'title_info_primary_tsi,', 'dtype:', 'object']" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "str(df_attributes.loc[df_attributes[\"id\"] == \"commonwealth-oai:xp68m844v\"][\"title_info_primary_tsi\"]).split(\" \")[4:]" + "# Implementing Different Vector Store and Embedding Combos" ] }, { "cell_type": "markdown", - "id": "1c6d1c2d-16ad-43a0-844a-72238901c07d", + "id": "beaf9e61-3bea-4c31-9710-532a306d1023", "metadata": {}, "source": [ - "### Turn full text into Documents type" + "### Pinecone Vector Store w/OLlama Embeddings" ] }, { "cell_type": "code", - "execution_count": 18, - "id": "51f7f3b8-ed7a-43a1-949f-aa43e594af99", + "execution_count": 10, + "id": "28f8e253-cf14-4b37-8e54-bf114514ac60", + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install langchain-pinecone pinecone-notebooks\n", + "#!pip install pinecone-client" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ab3fb46d-267e-4815-b823-e978d4bf3edf", "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stdin", "output_type": "stream", "text": [ - "1 Poster produced by the International Transport...\n", - "Name: abstract_tsi, dtype: object\n" + "Enter your Pinecone API key: ········\n" ] } ], "source": [ - "print(str(df_attributes.loc[df_attributes[\"id\"] == \"commonwealth-oai:xp68m844v\", \"abstract_tsi\"]))" + "import getpass\n", + "import os\n", + "import time\n", + "\n", + "from pinecone import Pinecone, ServerlessSpec\n", + "\n", + "if not os.getenv(\"PINECONE_API_KEY\"):\n", + " os.environ[\"PINECONE_API_KEY\"] = getpass.getpass(\"Enter your Pinecone API key: \")\n", + "\n", + "pinecone_api_key = os.environ.get(\"PINECONE_API_KEY\")\n", + "\n", + "pc = Pinecone(api_key=pinecone_api_key)" ] }, { "cell_type": "code", - "execution_count": 19, - "id": "65381d50-bcba-4a6e-8347-5d8b7e4de002", + "execution_count": 20, + "id": "0a6c480d-f2cd-4732-943a-8cd9d66417e2", "metadata": {}, "outputs": [], "source": [ - "#df_attributes.columns.tolist()" + "# setting up the index name\n", + "import time\n", + "\n", + "index_name = \"librag1\" # change if desired\n", + "\n", + "existing_indexes = [index_info[\"name\"] for index_info in pc.list_indexes()]\n", + "\n", + "if index_name not in existing_indexes:\n", + " pc.create_index(\n", + " name=index_name,\n", + " dimension=768,\n", + " metric=\"cosine\",\n", + " spec=ServerlessSpec(cloud=\"aws\", region=\"us-east-1\"),\n", + " )\n", + " while not pc.describe_index(index_name).status[\"ready\"]:\n", + " time.sleep(1)\n", + "\n", + "index = pc.Index(index_name)" ] }, { "cell_type": "code", - "execution_count": 65, - "id": "cd65068e-ac92-4e97-8363-4ee6e9bfbc0a", + "execution_count": 12, + "id": "5c791f22-4909-447f-a1fe-ebc09a9bbe11", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
typelinksidsystem_create_dtsisystem_modified_dtsicurator_model_ssicurator_model_suffix_ssititle_info_primary_tsigenre_basic_ssimgenre_specific_ssim...storage_key_base_ssidentifier_issn_ssimfrequency_tsicontained_by_ssinote_credits_tsimidentifier_isbn_ssimidentifier_music_publisher_ssimnote_arrangement_tsimtranscription_ark_id_ssitranscription_key_base_ss
\n", - "

0 rows × 140 columns

\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [type, links, id, system_create_dtsi, system_modified_dtsi, curator_model_ssi, curator_model_suffix_ssi, title_info_primary_tsi, genre_basic_ssim, genre_specific_ssim, date_tsim, date_type_ssm, date_edtf_ssm, date_start_dtsi, date_end_dtsi, name_tsim, name_role_tsim, name_facet_ssim, related_item_host_ssim, subject_topic_tsim, subject_facet_ssim, subject_coordinates_geospatial, subject_point_geospatial, subject_geojson_facet_ssim, subject_hiergeo_geojson_ssm, physical_location_ssim, identifier_local_other_tsim, identifier_uri_ss, identifier_uri_preview_ss, rights_ss, license_ss, reuse_allowed_ssi, extent_tsi, abstract_tsi, type_of_resource_ssim, lang_term_ssim, publishing_state_ssi, processing_state_ssi, destination_site_ssim, hosting_status_ssi, harvesting_status_bsi, oai_header_id_ssi, exemplary_image_ssi, exemplary_image_key_base_ss, admin_set_name_ssi, admin_set_ark_id_ssi, institution_name_ssi, institution_ark_id_ssi, collection_name_ssim, collection_ark_id_ssim, filenames_ssim, _version_, timestamp, subject_geographic_sim, date_facet_yearly_itim, score, sub_location_tsi, identifier_local_accession_tsim, note_date_tsim, note_tsim, license_uri_ss, digital_origin_ssi, publisher_tsi, pubplace_tsi, title_info_translated_tsim, note_language_tsim, title_info_primary_subtitle_tsi, subject_name_tsim, note_resp_tsim, rightsstatement_ss, rightsstatement_uri_ss, title_info_primary_trans_tsim, subject_geo_other_ssm, note_physical_tsim, identifier_iiif_manifest_ss, note_acquisition_tsim, title_info_alternative_tsim, identifier_local_barcode_tsim, note_reference_tsim, related_item_series_ssi, identifier_local_call_tsim, related_item_subseries_ssi, related_item_subsubseries_ssi, shelf_locator_tsi, identifier_local_call_invalid_tsim, note_biographical_tsim, note_citation_tsim, edition_name_tsi, note_ownership_tsim, note_publications_tsim, identifier_local_other_invalid_tsim, note_funding_tsim, subject_title_tsim, title_info_partnum_tsi, resource_type_manuscript_bsi, issuance_tsi, scale_tsim, flagged_content_ssi, note_bibliography_tsim, title_info_uniform_tsim, ...]\n", - "Index: []\n", - "\n", - "[0 rows x 140 columns]" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "df_attributes.loc[df_attributes[\"genre_basic_ssim\"] == \"[Posters]\"]" + "from langchain_ollama import OllamaEmbeddings\n", + "from langchain.embeddings import HuggingFaceEmbeddings" ] }, { "cell_type": "code", - "execution_count": 7, - "id": "15d83b93-6840-4618-ac40-1c7de9e5cc1e", + "execution_count": 13, + "id": "4aec6567-7950-4de2-8600-fe987f47a24a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/ipykernel_2752/1630880338.py:5: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-huggingface package and should be used instead. 
To use it run `pip install -U :class:`~langchain-huggingface` and import as `from :class:`~langchain_huggingface import HuggingFaceEmbeddings``.\n", + " embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", + "2024-11-14 14:26:19.674480: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + ] + } + ], "source": [ - "import re\n", - "def get_title(text):\n", - " match = re.search(r'\\d+\\s+(.+?)\\n', text)\n", + "# embeddings = OllamaEmbeddings(\n", + "# model=\"llama3\",\n", + "# )\n", "\n", - " # Extracting and printing the title if there's a match\n", - " if match:\n", - " title = match.group(1)\n", - " return title" + "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "6c3f5614-cc45-4fb4-9b13-1fd0361da1b6", + "execution_count": 21, + "id": "f1ad1f25-69d2-4eb1-9f85-c7b0ccf13a53", "metadata": {}, "outputs": [], "source": [ - "# Turn the BPL data into a Document\n", - "from langchain.schema import Document\n", - "documents = []\n", - "for doc in full_text:\n", - " title = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"title_info_primary_tsi\"]))\n", - " ID = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"id\"]))\n", - " abstract = str(df_attributes.loc[df_attributes[\"id\"] == doc, \"abstract_tsi\"])\n", - " title_subtitle = str(df_attributes.loc[df_attributes[\"id\"] == doc, \"title_info_primary_subtitle_tsi\"])\n", - " documents += [Document(page_content=full_text[doc]['text'], metadata={\"title\": title, \"abstract\": abstract, \"subtitle\": title_subtitle, \"ID\":ID})]\n", - " #documents += [Document(page_content=full_text[doc]['text'])]\n" + "from langchain_pinecone import PineconeVectorStore\n", + "vector_store = PineconeVectorStore(index=index, embedding=embeddings)" ] }, { "cell_type": "code", - "execution_count": 9, - "id": "b58b4530-27e4-4ed4-80be-4ee240892480", + "execution_count": 43, + "id": "2a4b12ac-87bd-4ae3-90cc-4f2c6644256a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'title': 'T.G. 
Hudson, Oglethorpe, Ga., autograph letter...',\n", - " 'abstract': '1161475 Asks market price at Charleston.\\nName: abstract_tsi, dtype: object',\n", - " 'subtitle': '1161475 NaN\\nName: title_info_primary_subtitle_tsi, dtype: object',\n", - " 'ID': 'commonwealth:9k41zk460'}" + "'1165601 Terms for disposal of woman.\\nName: abstract_tsi, dtype: object'" ] }, - "execution_count": 9, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "documents[0].metadata" - ] - }, - { - "cell_type": "code", - "execution_count": 144, - "id": "83fdead1-b760-4766-a880-64c6df5d0cd2", - "metadata": {}, - "outputs": [], - "source": [ - "# retriever = ToyRetriever(documents=documents, k=1)\n", - "# retriever.invoke(\"Richmond\")" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "35c69e08-181c-42aa-a1e7-5e0846053503", - "metadata": {}, - "outputs": [], - "source": [ - "# import openai" + "documents[18].metadata['abstract']" ] }, { "cell_type": "code", - "execution_count": 22, - "id": "20d5d467-7d3f-4c34-b0d6-ab2d19471d9b", + "execution_count": 16, + "id": "e80decf8-6ee5-48a2-bb39-5ea97ceaf7e2", "metadata": {}, "outputs": [], "source": [ - "from langchain_community.document_loaders import TextLoader\n", - "from langchain_community.vectorstores import FAISS\n", - "from langchain_openai import OpenAIEmbeddings\n", - "from langchain_text_splitters import CharacterTextSplitter" + "from uuid import uuid4\n", + "uuids = [str(uuid4()) for _ in range(len(documents))]" ] }, { "cell_type": "code", - "execution_count": 23, - "id": "9a395f8b-82eb-4e01-bbbc-3852524444c4", + "execution_count": 44, + "id": "18bcd3e7-07f6-4a66-a629-126ffe340966", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "PineconeApiException", + "evalue": "(400)\nReason: Bad Request\nHTTP response headers: HTTPHeaderDict({'Date': 'Thu, 14 Nov 2024 19:37:34 GMT', 'Content-Type': 'application/json', 'Content-Length': '116', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '155', 'x-pinecone-request-id': '2161036493328920558', 'x-envoy-upstream-service-time': '37', 'server': 'envoy'})\nHTTP response body: {\"code\":3,\"message\":\"Metadata size is 869788 bytes, which exceeds the limit of 40960 bytes per vector\",\"details\":[]}\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mPineconeApiException\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[44], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m vector_store\u001b[38;5;241m.\u001b[39madd_documents(documents\u001b[38;5;241m=\u001b[39mdocuments[\u001b[38;5;241m0\u001b[39m:\u001b[38;5;241m15\u001b[39m], ids\u001b[38;5;241m=\u001b[39muuids)\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/langchain_core/vectorstores/base.py:287\u001b[0m, in \u001b[0;36mVectorStore.add_documents\u001b[0;34m(self, documents, **kwargs)\u001b[0m\n\u001b[1;32m 285\u001b[0m texts \u001b[38;5;241m=\u001b[39m [doc\u001b[38;5;241m.\u001b[39mpage_content \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m documents]\n\u001b[1;32m 286\u001b[0m metadatas \u001b[38;5;241m=\u001b[39m [doc\u001b[38;5;241m.\u001b[39mmetadata \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m documents]\n\u001b[0;32m--> 287\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39madd_texts(texts, metadatas, 
\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 288\u001b[0m msg \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 289\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m`add_documents` and `add_texts` has not been implemented \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 290\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfor \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 291\u001b[0m )\n\u001b[1;32m 292\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(msg)\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/langchain_pinecone/vectorstores.py:292\u001b[0m, in \u001b[0;36mPineconeVectorStore.add_texts\u001b[0;34m(self, texts, metadatas, ids, namespace, batch_size, embedding_chunk_size, async_req, id_prefix, **kwargs)\u001b[0m\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m async_req:\n\u001b[1;32m 282\u001b[0m \u001b[38;5;66;03m# Runs the pinecone upsert asynchronously.\u001b[39;00m\n\u001b[1;32m 283\u001b[0m async_res \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 284\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_index\u001b[38;5;241m.\u001b[39mupsert(\n\u001b[1;32m 285\u001b[0m vectors\u001b[38;5;241m=\u001b[39mbatch_vector_tuples,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 290\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m batch_vector_tuples \u001b[38;5;129;01min\u001b[39;00m batch_iterate(batch_size, vector_tuples)\n\u001b[1;32m 291\u001b[0m ]\n\u001b[0;32m--> 292\u001b[0m [res\u001b[38;5;241m.\u001b[39mget() \u001b[38;5;28;01mfor\u001b[39;00m res \u001b[38;5;129;01min\u001b[39;00m async_res]\n\u001b[1;32m 293\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 294\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_index\u001b[38;5;241m.\u001b[39mupsert(\n\u001b[1;32m 295\u001b[0m vectors\u001b[38;5;241m=\u001b[39mvector_tuples,\n\u001b[1;32m 296\u001b[0m namespace\u001b[38;5;241m=\u001b[39mnamespace,\n\u001b[1;32m 297\u001b[0m async_req\u001b[38;5;241m=\u001b[39masync_req,\n\u001b[1;32m 298\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 299\u001b[0m )\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/langchain_pinecone/vectorstores.py:292\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m async_req:\n\u001b[1;32m 282\u001b[0m \u001b[38;5;66;03m# Runs the pinecone upsert asynchronously.\u001b[39;00m\n\u001b[1;32m 283\u001b[0m async_res \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 284\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_index\u001b[38;5;241m.\u001b[39mupsert(\n\u001b[1;32m 285\u001b[0m vectors\u001b[38;5;241m=\u001b[39mbatch_vector_tuples,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 290\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m batch_vector_tuples \u001b[38;5;129;01min\u001b[39;00m batch_iterate(batch_size, vector_tuples)\n\u001b[1;32m 291\u001b[0m ]\n\u001b[0;32m--> 292\u001b[0m [res\u001b[38;5;241m.\u001b[39mget() \u001b[38;5;28;01mfor\u001b[39;00m res \u001b[38;5;129;01min\u001b[39;00m async_res]\n\u001b[1;32m 293\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 294\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_index\u001b[38;5;241m.\u001b[39mupsert(\n\u001b[1;32m 295\u001b[0m vectors\u001b[38;5;241m=\u001b[39mvector_tuples,\n\u001b[1;32m 296\u001b[0m namespace\u001b[38;5;241m=\u001b[39mnamespace,\n\u001b[1;32m 297\u001b[0m async_req\u001b[38;5;241m=\u001b[39masync_req,\n\u001b[1;32m 298\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 299\u001b[0m )\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/multiprocessing/pool.py:774\u001b[0m, in \u001b[0;36mApplyResult.get\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 772\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_value\n\u001b[1;32m 773\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 774\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_value\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/multiprocessing/pool.py:125\u001b[0m, in \u001b[0;36mworker\u001b[0;34m(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)\u001b[0m\n\u001b[1;32m 123\u001b[0m job, i, func, args, kwds \u001b[38;5;241m=\u001b[39m task\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 125\u001b[0m result \u001b[38;5;241m=\u001b[39m (\u001b[38;5;28;01mTrue\u001b[39;00m, func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds))\n\u001b[1;32m 126\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 127\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m wrap_exception \u001b[38;5;129;01mand\u001b[39;00m func \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m _helper_reraises_exception:\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pinecone/core/openapi/shared/api_client.py:187\u001b[0m, in \u001b[0;36mApiClient.__call_api\u001b[0;34m(self, resource_path, method, path_params, query_params, header_params, body, post_params, files, response_type, auth_settings, _return_http_data_only, collection_formats, _preload_content, _request_timeout, _host, _check_type)\u001b[0m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m PineconeApiException \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 186\u001b[0m e\u001b[38;5;241m.\u001b[39mbody \u001b[38;5;241m=\u001b[39m e\u001b[38;5;241m.\u001b[39mbody\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 187\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[1;32m 189\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlast_response \u001b[38;5;241m=\u001b[39m response_data\n\u001b[1;32m 191\u001b[0m return_data \u001b[38;5;241m=\u001b[39m response_data\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pinecone/core/openapi/shared/api_client.py:175\u001b[0m, in \u001b[0;36mApiClient.__call_api\u001b[0;34m(self, resource_path, method, path_params, query_params, header_params, body, post_params, files, response_type, auth_settings, _return_http_data_only, collection_formats, _preload_content, _request_timeout, _host, _check_type)\u001b[0m\n\u001b[1;32m 171\u001b[0m url \u001b[38;5;241m=\u001b[39m _host \u001b[38;5;241m+\u001b[39m resource_path\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 174\u001b[0m \u001b[38;5;66;03m# perform request and return 
response\u001b[39;00m\n\u001b[0;32m--> 175\u001b[0m response_data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrequest(\n\u001b[1;32m 176\u001b[0m method,\n\u001b[1;32m 177\u001b[0m url,\n\u001b[1;32m 178\u001b[0m query_params\u001b[38;5;241m=\u001b[39mquery_params,\n\u001b[1;32m 179\u001b[0m headers\u001b[38;5;241m=\u001b[39mheader_params,\n\u001b[1;32m 180\u001b[0m post_params\u001b[38;5;241m=\u001b[39mpost_params,\n\u001b[1;32m 181\u001b[0m body\u001b[38;5;241m=\u001b[39mbody,\n\u001b[1;32m 182\u001b[0m _preload_content\u001b[38;5;241m=\u001b[39m_preload_content,\n\u001b[1;32m 183\u001b[0m _request_timeout\u001b[38;5;241m=\u001b[39m_request_timeout,\n\u001b[1;32m 184\u001b[0m )\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m PineconeApiException \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 186\u001b[0m e\u001b[38;5;241m.\u001b[39mbody \u001b[38;5;241m=\u001b[39m e\u001b[38;5;241m.\u001b[39mbody\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pinecone/core/openapi/shared/api_client.py:460\u001b[0m, in \u001b[0;36mApiClient.request\u001b[0;34m(self, method, url, query_params, headers, post_params, body, _preload_content, _request_timeout)\u001b[0m\n\u001b[1;32m 450\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrest_client\u001b[38;5;241m.\u001b[39mOPTIONS(\n\u001b[1;32m 451\u001b[0m url,\n\u001b[1;32m 452\u001b[0m query_params\u001b[38;5;241m=\u001b[39mquery_params,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 457\u001b[0m body\u001b[38;5;241m=\u001b[39mbody,\n\u001b[1;32m 458\u001b[0m )\n\u001b[1;32m 459\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m method \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPOST\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m--> 460\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrest_client\u001b[38;5;241m.\u001b[39mPOST(\n\u001b[1;32m 461\u001b[0m url,\n\u001b[1;32m 462\u001b[0m query_params\u001b[38;5;241m=\u001b[39mquery_params,\n\u001b[1;32m 463\u001b[0m headers\u001b[38;5;241m=\u001b[39mheaders,\n\u001b[1;32m 464\u001b[0m post_params\u001b[38;5;241m=\u001b[39mpost_params,\n\u001b[1;32m 465\u001b[0m _preload_content\u001b[38;5;241m=\u001b[39m_preload_content,\n\u001b[1;32m 466\u001b[0m _request_timeout\u001b[38;5;241m=\u001b[39m_request_timeout,\n\u001b[1;32m 467\u001b[0m body\u001b[38;5;241m=\u001b[39mbody,\n\u001b[1;32m 468\u001b[0m )\n\u001b[1;32m 469\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m method \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPUT\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 470\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrest_client\u001b[38;5;241m.\u001b[39mPUT(\n\u001b[1;32m 471\u001b[0m url,\n\u001b[1;32m 472\u001b[0m query_params\u001b[38;5;241m=\u001b[39mquery_params,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 477\u001b[0m body\u001b[38;5;241m=\u001b[39mbody,\n\u001b[1;32m 478\u001b[0m )\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pinecone/core/openapi/shared/rest.py:345\u001b[0m, in \u001b[0;36mRESTClientObject.POST\u001b[0;34m(self, url, headers, query_params, post_params, body, _preload_content, _request_timeout)\u001b[0m\n\u001b[1;32m 335\u001b[0m 
\u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mPOST\u001b[39m(\n\u001b[1;32m 336\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 337\u001b[0m url,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 343\u001b[0m _request_timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 344\u001b[0m ):\n\u001b[0;32m--> 345\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrequest(\n\u001b[1;32m 346\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPOST\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 347\u001b[0m url,\n\u001b[1;32m 348\u001b[0m headers\u001b[38;5;241m=\u001b[39mheaders,\n\u001b[1;32m 349\u001b[0m query_params\u001b[38;5;241m=\u001b[39mquery_params,\n\u001b[1;32m 350\u001b[0m post_params\u001b[38;5;241m=\u001b[39mpost_params,\n\u001b[1;32m 351\u001b[0m _preload_content\u001b[38;5;241m=\u001b[39m_preload_content,\n\u001b[1;32m 352\u001b[0m _request_timeout\u001b[38;5;241m=\u001b[39m_request_timeout,\n\u001b[1;32m 353\u001b[0m body\u001b[38;5;241m=\u001b[39mbody,\n\u001b[1;32m 354\u001b[0m )\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pinecone/core/openapi/shared/rest.py:279\u001b[0m, in \u001b[0;36mRESTClientObject.request\u001b[0;34m(self, method, url, query_params, headers, body, post_params, _preload_content, _request_timeout)\u001b[0m\n\u001b[1;32m 276\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;241m500\u001b[39m \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m r\u001b[38;5;241m.\u001b[39mstatus \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m599\u001b[39m:\n\u001b[1;32m 277\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ServiceException(http_resp\u001b[38;5;241m=\u001b[39mr)\n\u001b[0;32m--> 279\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m PineconeApiException(http_resp\u001b[38;5;241m=\u001b[39mr)\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m r\n", + "\u001b[0;31mPineconeApiException\u001b[0m: (400)\nReason: Bad Request\nHTTP response headers: HTTPHeaderDict({'Date': 'Thu, 14 Nov 2024 19:37:34 GMT', 'Content-Type': 'application/json', 'Content-Length': '116', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '155', 'x-pinecone-request-id': '2161036493328920558', 'x-envoy-upstream-service-time': '37', 'server': 'envoy'})\nHTTP response body: {\"code\":3,\"message\":\"Metadata size is 869788 bytes, which exceeds the limit of 40960 bytes per vector\",\"details\":[]}\n" + ] + } + ], "source": [ - "#!pip install langchain_community" + "vector_store.add_documents(documents=documents[0:15], ids=uuids)" ] }, { "cell_type": "code", - "execution_count": 25, - "id": "4cc9ce57-c079-4605-9430-0a6d98a63242", + "execution_count": 27, + "id": "bc900bbd-128c-40b0-b151-f77800fcb50b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'title': '1161521 T.H. Jones, Charlotte, N.C., autograph letter ...\\nName: title_info_primary_tsi, dtype: object', 'abstract': '1161521 Expects to make purchases in North Carolina an...\\nName: abstract_tsi, dtype: object'}\n" + "* [SIM=0.292190]\n", + " [deletion]Cha[/deletion] Graniteville Jany 15 1854\n", + "Mr Z. B. Oakes\n", + "Dr Sir\n", + "On my Return\n", + "from Charleston last week I stopd\n", + "and attended the sale of B. J. Godfrerys\n", + "at Black Creek and bought the Family\n", + "of Cash whom I purchased of you two\n", + "years ago. 
The woman is now in a\n", + "Family way and we think she will not\n", + "suit us for a Cook Woman, she is\n", + "a small young woman about 30 strong and\n", + "Healthy and prefers Field Work Cash\n", + "has proved himself an Excellent servant\n", + "a great Ax Man and not to be exceld\n", + "with the Hoe I can recommend him to be\n", + "a most Excellent general servant, and\n", + "now what do you think you can\n", + "get for the Family Consisting of himself\n", + "Wife and three Children say a girl of\n", + "10 or 11 year named Bella, Ceasar a boy about\n", + "4 or 5, and Rose 2 or three and Mother\n", + "in a fair way for another. Please write\n", + "and let me know what you think\n", + "you could get for them yours Truly\n", + "[underline]T. H. Marshall[/underline]\n", + "\n", + " [{'URL': 'https://www.digitalcommonwealth.org/search/commonwealth:ws859m407', 'abstract': '1161522 Asks value of family: man, woman, 3 children.\\nName: abstract_tsi, dtype: object', 'genre': \"1161522 ['Manuscripts', 'Correspondence']\\nName: genre_basic_ssim, dtype: object\", 'genre_specific': '1161522 NaN\\nName: genre_specific_ssim, dtype: object', 'subtitle': '1161522 NaN\\nName: title_info_primary_subtitle_tsi, dtype: object', 'title': 'T.H. Marshall, Graniteville, S.C., autograph l...', 'title_alt': '1161522 NaN\\nName: title_info_alternative_tsim, dtype: object'}]\n", + "----------------------------------\n", + "\n", + "* [SIM=0.243309]\n", + " Grumesville Sept 11 [insertion]th[/insertion] 1856\n", + "Dear Sir\n", + "The Deputy has been here since\n", + "day before yesterday, and it is impossible\n", + "to get hold of the negroes, I have advised\n", + "that the parties should go down & see you\n", + "they seem willing to consent that the fellow\n", + "Tom should go at $ 1000, and a smaller boy\n", + "at $ 300 - but a new party entered the\n", + "field and recommended a seperance\n", + "to Major Rhame - who wants the boys\n", + "himself, he is to be requested to go\n", + "down and confer with the judgement\n", + "creditors and buy them, or propose to\n", + "sell out the entire Estate and pay pro-ratio,\n", + "the creditors, I presume that\n", + "all this could be done better by yourself\n", + "and make no doubt a ballance\n", + "of some 2,000 dollars retained, I advise\n", + "that you should when applied\n", + "to take the whole Estate in hand by &\n", + "with the Consent of Judgement Creditors\n", + "and save the Commission, I could\n", + "then speculate on the sale if forced\n", + "under the [deletion]e[/deletion]sheriff -\n", + "in Lords - yours truly\n", + "Theo. C Tharin\n", + "\n", + " [{'URL': 'https://www.digitalcommonwealth.org/search/commonwealth:w3764957b', 'abstract': '1163238 On disposal of Negroes in contested estate.\\nName: abstract_tsi, dtype: object', 'genre': \"1163238 ['Manuscripts', 'Correspondence']\\nName: genre_basic_ssim, dtype: object\", 'genre_specific': '1163238 NaN\\nName: genre_specific_ssim, dtype: object', 'subtitle': '1163238 NaN\\nName: title_info_primary_subtitle_tsi, dtype: object', 'title': 'Theodore C. 
Tharin, Grumesville, S.C., autogra...', 'title_alt': '1163238 NaN\\nName: title_info_alternative_tsim, dtype: object'}]\n", + "----------------------------------\n", + "\n", + "* [SIM=0.230255]\n", + " Wednesday morn July 12 1854\n", + "Dear Sir\n", + "I drop you a line in a great\n", + "hurry by Mr McCulley to say my affairs\n", + "are going very well, and I have been\n", + "offered $ 500 or $ 1.00 per acre for the tract of\n", + "land you purchased at sherriff sale.\n", + "I think you can do better, the offer is made by\n", + "Mr Williams, who owns the adjoining tract\n", + "which he purchased of Mc Culley he\n", + "gets Serpentine and work about 30 Hands\n", + "yours truly in haste\n", + "Theod C Tharin\n", + "Z. B. Oakes Esq.\n", + "\n", + " [{'URL': 'https://www.digitalcommonwealth.org/search/commonwealth:w3764306m', 'abstract': '1163237 Offer to purchase land.\\nName: abstract_tsi, dtype: object', 'genre': \"1163237 ['Manuscripts', 'Correspondence']\\nName: genre_basic_ssim, dtype: object\", 'genre_specific': '1163237 NaN\\nName: genre_specific_ssim, dtype: object', 'subtitle': '1163237 NaN\\nName: title_info_primary_subtitle_tsi, dtype: object', 'title': 'Theodore C. Tharin, Grumesville, S.C. [?], aut...', 'title_alt': '1163237 NaN\\nName: title_info_alternative_tsim, dtype: object'}]\n", + "----------------------------------\n", + "\n" ] } ], "source": [ - "print(documents[1].metadata)" + "results = vector_store.similarity_search_with_score(\n", + "    \"What is the metadata of the Z.B. Oakes articles\",\n", + "    k=3\n", + ")\n", + "for res, score in results:\n", + "    #print(f\"* {res.page_content} [{res.metadata}]\")\n", + "    print(f\"* [SIM={score:3f}]\\n {res.page_content} [{res.metadata}]\")\n", + "    print(\"----------------------------------\\n\")" ] }, { "cell_type": "markdown", - "id": "4a7f8878-1967-4630-817a-9eb1d321701e", - "metadata": {}, - "source": [ - "### Using Chroma Vector Store" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "id": "e72f20d4-4870-44ae-be93-66b3e16d53bd", + "id": "f5545e52-8142-4c0d-a127-a1a3711e5ef3", "metadata": {}, - "outputs": [], "source": [ - "# import os\n", - "# os.chmod('mydatabase.db', 0o666)" + "### Conclusions on Pinecone and HuggingFace Embedding Model\n", + "Unfortunately, it seems like Pinecone enforces a hard limit of 40KB (40,960 bytes) of metadata per vector. The metadata for one of our Document objects is 869,788 bytes (over 800KB), which is why the upsert above fails.\n", + "\n", + "The HuggingFace embedding model that I used, though, works like a charm. We just have to ensure that the index dimensions line up with the embedding model's output.\n", + "\n", + "FAISS seems like our best option.\n",
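 + "\n", + "As a rough illustration (a sketch added for clarity, not from the original run; the `PINECONE_METADATA_LIMIT` constant and `metadata_bytes` helper are our own names, and Pinecone's internal size accounting may differ slightly from a plain JSON encoding), we could check which Documents would fit under that 40,960-byte cap before attempting the upsert:\n", + "```python\n", + "import json\n", + "\n", + "PINECONE_METADATA_LIMIT = 40960  # bytes per vector, taken from the error above\n", + "\n", + "def metadata_bytes(doc):\n", + "    # Approximate the metadata payload as its JSON-encoded byte length\n", + "    return len(json.dumps(doc.metadata).encode(\"utf-8\"))\n", + "\n", + "small_enough = [d for d in documents if metadata_bytes(d) <= PINECONE_METADATA_LIMIT]\n", + "print(f\"{len(small_enough)} of {len(documents)} documents fit under the limit\")\n", + "```"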
] }, { - "cell_type": "code", - "execution_count": 67, - "id": "4c53d93b-165e-4f9b-8a6f-cd9329551a69", + "cell_type": "markdown", + "id": "4a7f8878-1967-4630-817a-9eb1d321701e", "metadata": {}, - "outputs": [], "source": [ - "#!pip install chromadb==0.5.0\n", - "#!pip install --upgrade openai langchain\n", - "# !pip install --upgrade langchain langchain_community langchain_openai openai python-dotenv chromadb\n", - "# !pip install --upgrade transformers\n", - "#!pip install --upgrade transformers torchvision\n", - "\n", - "# !pip install openai==1.37.1\n", - "# !pip install langchain==0.2.11\n", - "# !pip install langchain-openai==0.1.19\n", - "# !pip install langchain-community==0.2.10\n", - "# !pip install langchain-experimental==0.0.63\n", - "# !pip install transformers" + "### Using Chroma Vector Store" ] }, { @@ -1351,7 +1578,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 93, "id": "c281018a-089b-4ad0-8f4c-efb4667c8780", "metadata": {}, "outputs": [ @@ -1360,42 +1587,7 @@ "output_type": "stream", "text": [ "Split 133 documents into 13931 chunks.\n", - "Then again, perhaps as we become accustomed\n", - "to the inflated value of the dollar, in time\n", - "that price may not seem as exhorbitant as it\n", - "does to me now.\n", - "\n", - "When I was a child, soup was never served as\n", - "an appetizer, but always as a meal. It didn't\n", - "come from a can, either. It was made from\n", - "bones left over from Sunday's roast and had\n", - "some strength to it. When cooled, it jelled,\n", - "and suspended in it were bits and pieces of\n", - "meat and vegetables.\n", - "\n", - "With soup, corn or clam chowder, or oyster\n", - "stew came crackers. Not those skimpy skinny\n", - "saltines, but thick rich common or pilot cra-\n", - "ckers. Three or four of those, along with your\n", - "soup, and you had a meal.\n", - "\n", - "Father liked pie, and Mother baked them fre-\n", - "quently. All kinds: mince, apple, pumpkin,\n", - "squash, and lemon meringue. When I think of\n", - "the calories we consumed, I'm surprised we\n", - "weren't chubby. Instead, we were all thin as\n", - "rails.\n", - "{'title': 'Thanksgiving', 'abstract': '1161754 NaN\\nName: abstract_tsi, dtype: object', 'subtitle': '1161754 NaN\\nName: title_info_primary_subtitle_tsi, dtype: object', 'ID': 'commonwealth:jd478671b', 'start_index': 3435}\n", - "Removed existing database at /var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/tmpcp1qkd0k.\n", - "Saved 13931 chunks to /var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/tmpcp1qkd0k.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/ipykernel_26237/3571056397.py:60: LangChainDeprecationWarning: Since Chroma 0.4.x the manual persistence method is no longer supported as docs are automatically persisted.\n", - " db.persist()\n" + "Saved 13931 chunks to ./chroma_try.\n" ] } ], @@ -1413,11 +1605,8 @@ "import time\n", "from langchain.embeddings import HuggingFaceEmbeddings\n", "\n", - "# Load environment variables. 
Assumes that project contains .env file with API keys\n", - "load_dotenv()\n", - "\n", "import tempfile\n", - "CHROMA_PATH = tempfile.mkdtemp() # Use a temporary directory\n", + "CHROMA_PATH = \"./chroma_try\"\n", "\n", "def main(documents):\n", " generate_data_store(documents)\n", @@ -1439,13 +1628,13 @@ " print(f\"Split {len(documents)} documents into {len(chunks)} chunks.\")\n", "\n", " document = chunks[10]\n", - " print(document.page_content)\n", - " print(document.metadata)\n", + " #print(document.page_content)\n", + " #print(document.metadata)\n", "\n", " return chunks\n", "\n", "def save_to_chroma(chunks):\n", - " # Clear out the database first.\n", + " #Clear out the database first.\n", " if os.path.exists(CHROMA_PATH):\n", " shutil.rmtree(CHROMA_PATH)\n", " print(f\"Removed existing database at {CHROMA_PATH}.\")\n", @@ -1455,6 +1644,7 @@ "\n", " #embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n", " embeddings = OpenAIEmbeddings(model=\"text-embedding-3-large\", dimensions=3072)\n", + " #embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", " try:\n", " db = Chroma.from_documents(\n", " chunks, embeddings, persist_directory=CHROMA_PATH\n", @@ -1478,27 +1668,20 @@ "### Making the Query" ] }, - { - "cell_type": "markdown", - "id": "7bd15c30-251e-4460-a598-9e1fcf7aa3f5", - "metadata": {}, - "source": [ - "We'll download langserve to make a sample UI for our app:" - ] - }, { "cell_type": "code", - "execution_count": null, + "execution_count": 82, "id": "5b01faae-fdbf-420d-9b79-e1f31e84baf8", "metadata": {}, "outputs": [], "source": [ - "#!pip install \"langserve[all]\"" + "import os \n", + "os.environ[\"OPENAI_API_KEY\"] = \"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\"" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 94, "id": "103cf9f0-e116-4e3a-a33c-accdf4246332", "metadata": { "scrolled": true @@ -1559,11 +1742,51 @@ "\n", "---\n", "\n", + "Charleston 1-6 Dec 1853\n", + "Mr Z. B. Oakes,\n", + "Dear Sir\n", + "I would be glad\n", + "to know what you have\n", + "determined on in the case\n", + "of M Alpine, I bot the\n", + "negro from you, it appears\n", + "and of course you are\n", + "liable to me, I am called\n", + "on for the amount of the\n", + "verdict, It appears that the\n", + "negro was a stolen one. I\n", + "of course lay no charge to\n", + "you on this score, being accessory\n", + "to the affair, but I\n", + "do call on on you to\n", + "hold me Harmless. - Please\n", + "send me your written answer\n", + "[underline]this day.[/underline]\n", + "Respectfully\n", + "[underline]Tho: N. Gadsden[/underline]\n", + "\n", + "---\n", + "\n", + "Summerville So Ca\n", + "April 26th 1854\n", + "Mr Z B Oakes\n", + "Dear Sir\n", + "Your letter\n", + "in reply to mine was duly received.\n", + "I am willing to value the woman at\n", + "$ 800. & be refunded $ 150. or return her\n", + "at once to you. Do inform me as soon\n", + "as possible as to the decision of\n", + "her owner. Yours respectfully\n", + "Thos. L Gelzia\n", + "\n", + "---\n", + "\n", "Answer the question based on the above context: Who did Z.B Oakes receive a letter from?\n", "\n", - "Response: Z.B. Oakes received a letter from Theo. C Tharin.\n", + "Response: Z.B Oakes received a letter from Tho: N. 
Gadsden.\n", "\n", - "Sources: ['Thomas Taylor, Columbia, Tenn., autograph lett...: commonwealth:5q47sj75b', 'Theodore C. Tharin, Grumesville, S.C., autogra...: commonwealth:w3764352q', 'Theodore C. Tharin, Grumesville, S.C., autogra...: commonwealth:w3764286b']\n" + "Sources: ['Thomas Taylor, Columbia, Tenn., autograph lett...: https://www.digitalcommonwealth.org/search/commonwealth:5q47sj75b', 'Theodore C. Tharin, Grumesville, S.C., autogra...: https://www.digitalcommonwealth.org/search/commonwealth:w3764352q', 'Theodore C. Tharin, Grumesville, S.C., autogra...: https://www.digitalcommonwealth.org/search/commonwealth:w3764286b', 'Theodore N. Gadsden, Charleston, S.C., autogra...: https://www.digitalcommonwealth.org/search/commonwealth:9k41zk125', 'Thomas L. Gelzer, Summerville, autograph lette...: https://www.digitalcommonwealth.org/search/commonwealth:ws859j61k']\n" ] } ], @@ -1578,18 +1801,10 @@ "from langchain_core.messages import HumanMessage, SystemMessage\n", "from langchain_core.output_parsers import StrOutputParser\n", "\n", - "# For LangServe\n", - "from fastapi import FastAPI\n", - "from langchain_core.prompts import ChatPromptTemplate\n", - "from langchain_core.output_parsers import StrOutputParser\n", - "from langchain_openai import ChatOpenAI\n", - "from langserve import add_routes\n", - "import nest_asyncio\n", - "import uvicorn\n", - "\n", "\n", "# copy from above\n", - "CHROMA_PATH = \"/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/tmpcp1qkd0k\"\n", + "#CHROMA_PATH = \"/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/tmpaxd8t1dv\"\n", + "CHROMA_PATH = \"./chroma_try\"\n", "\n", "PROMPT_TEMPLATE = \"\"\"\n", "Answer the question based only on the following context:\n", @@ -1601,10 +1816,6 @@ "Answer the question based on the above context: {question}\n", "\"\"\"\n", "\n", - "# Initialize LangSmith App\n", - "# app = App()\n", - "\n", - "# @langsmith_route(\"/answer-question\")\n", "def main(query: str):\n", " # Create CLI with a default value for Jupyter testing\n", " parser = argparse.ArgumentParser()\n", @@ -1614,17 +1825,15 @@ "\n", " # Prepare the database\n", " embedding_function = OpenAIEmbeddings(model=\"text-embedding-3-large\", dimensions=3072)\n", - " #embedding_function = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n", + " #embedding_function = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", " db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)\n", "\n", - " results = db.similarity_search_with_relevance_scores(query_text, k=3)\n", + " results = db.similarity_search_with_relevance_scores(query_text, k=5)\n", " for i in range(len(results)):\n", - " if len(results) == 0 or results[0][1] < 0.1:\n", + " if len(results) == 0 or results[0][1] < 0.3:\n", " print(f\"Unable to find matching results for \\\"{query_text}\\\"\")\n", " print(results[0][1])\n", " return\n", - "\n", - " #print(results)\n", " \n", " context_text = \"\\n\\n---\\n\\n\".join([doc.page_content for doc, _score in results])\n", " prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)\n", @@ -1634,7 +1843,7 @@ " model = ChatOpenAI()\n", " response_text = model.predict(prompt)\n", "\n", - " sources = [doc.metadata.get(\"title\") + \": \" + str(doc.metadata.get(\"ID\")) for doc, _score in results]\n", + " sources = [doc.metadata.get(\"title\") + \": \" + str(doc.metadata.get(\"URL\")) for doc, _score in results]\n", " formatted_response = f\"Response: {response_text}\\n\\nSources: 
{sources}\"\n", " # response with context, sources, and answer to my query\n", " print(formatted_response)\n", @@ -1642,34 +1851,50 @@ "if __name__ == \"__main__\":\n", " query1 = \"Who did Z.B Oakes receive a letter from?\"\n", " query2 = \"What did Henry M. Sikes say about India Goods?\"\n", - " query3 = \"What are some of the most controversial topics in this database?\"\n", - " query4 = \"What happened in World War II?\"\n", - " query5 = \"Who critiqued India Goods?\"\n", - " query6 = \"Tell me about Barnstable Public Schools\"\n", - " #query7 = \"What did Thos. L Gelzia talk about in their letter to Mr Z. B. Oakes, but not in the Tocsin of Liberty?\"\n", - " queries = [query1, query2, query3, query4, query5, query6]\n", - " # print(\"-------------------New Query-------------------\")\n", - " # for query in queries:\n", - " # main(query)\n", - " # print(\"-------------------New Query-------------------\")\n", - " main(query1)\n", - " \n" + " query3 = \"What happened in World War II?\"\n", + " main(query1)" ] }, { "cell_type": "markdown", - "id": "9bba8e8f-14e3-48c0-9fc8-c07328271da3", + "id": "805b793e-16e3-4eca-9b2a-baaa91fde961", "metadata": {}, "source": [ - "# Notes from Gardos\n", + "### Conclusions about Chroma and OpenAI Embedding Model\n", + "Chroma seems to be a great option as a vector store, however it is immensely lightweight and requires us to have the vector store as an embedding in the machine that we use. Unfortunatenly we have not found many projects that is dealing with embedding a vast amount of data like we are.\n", "\n", - "These are the list of fields, if you need any clarification about these fields ask about them.\n", - "\n", - "Vectorize all of the fields\n", - "\n", - "Give this to the LLM as a preface prompt.\n", - "\n", - "Maybe two vector stores?" + "The OpenAIEmbeddings is great, but it also costs money, so that is a no-go." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "4eded946-6f7f-440b-a892-80f8c37db3ab", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_content='Oglethorpe December 8 53\n", + "Z B Oaks esqr\n", + "Dr Sir you will please\n", + "inform me how negros are selling\n", + "& how your market is supplied\n", + "I think I will be in Charleston\n", + "the latter part of this month, &\n", + "will want to buy a Cook &\n", + "good active boy & perhaps two negro fellows\n", + "Very Respectfully\n", + "Yours T G Hudson\n", + "\n", + "\n", + "' metadata={'title': 'T.G. 
Hudson, Oglethorpe, Ga., autograph letter...', 'abstract': '1161475 Asks market price at Charleston.\\nName: abstract_tsi, dtype: object', 'subtitle': '1161475 NaN\\nName: title_info_primary_subtitle_tsi, dtype: object', 'URL': 'https://www.digitalcommonwealth.org/search/commonwealth:9k41zk460', 'title_alt': '1161475 NaN\\nName: title_info_alternative_tsim, dtype: object', 'genre': \"1161475 ['Manuscripts', 'Correspondence']\\nName: genre_basic_ssim, dtype: object\", 'genre_specific': '1161475 NaN\\nName: genre_specific_ssim, dtype: object'}\n" + ] + } + ], + "source": [ + "print(documents[0])" ] } ], From a67116a1629bb1afc9af0bf1483d4a8b1b1296b7 Mon Sep 17 00:00:00 2001 From: Brandon Vargus <45298256+b3v@users.noreply.github.com> Date: Thu, 21 Nov 2024 13:50:02 -0500 Subject: [PATCH 3/6] Update rerank.py --- PoC/rerank.py | 196 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 195 insertions(+), 1 deletion(-) diff --git a/PoC/rerank.py b/PoC/rerank.py index a0aabbb..6948c59 100644 --- a/PoC/rerank.py +++ b/PoC/rerank.py @@ -1 +1,195 @@ -import numpy as np import matplotlib.pyplot as plt import seaborn as sns import json import pandas as pd from langchain_openai import ChatOpenAI # replace with file of your choosing file = open("sample_full_text.json") full_text = json.load(file) # metadata csv file; should be included in repo df_attributes = pd.read_csv("metadata_attributes.csv") model = ChatOpenAI() import re def get_title(text): match = re.search(r'\d+\s+(.+?)\n', text) # Extracting and printing the title if there's a match if match: title = match.group(1) return title # Turn the BPL data into a Document from langchain.schema import Document documents = [] for doc in full_text: # Extract metadata fields and apply get_title() title = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "title_info_primary_tsi"])) title_subtitle = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "title_info_primary_subtitle_tsi"])) title_alt = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "title_info_alternative_tsim"])) abstract = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "abstract_tsi"])) subject_facet = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "subject_facet_ssim"])) subject_geographic = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "subject_geographic_sim"])) genre = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "genre_basic_ssim"])) genre_specific = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "genre_specific_ssim"])) name_facet = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "name_facet_ssim"])) name_role = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "name_role_tsim"])) date_human = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "date_tsim"])) date_start = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "date_start_dtsi"])) date_end = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "date_end_dtsi"])) publisher = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "publisher_tsi"])) collection_name = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "collection_name_ssim"])) physical_location = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "physical_location_ssim"])) related_item_host = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "related_item_host_ssim"])) type_of_resource = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "type_of_resource_ssim"])) 
URL = "https://www.digitalcommonwealth.org/search/" + get_title(str(df_attributes.loc[df_attributes["id"] == doc, "id"])) # Create Document with metadata documents.append(Document( page_content=full_text[doc]['text'], metadata={ "title": title, "subtitle": title_subtitle, "title_alt": title_alt, "abstract": abstract, "subject_facet": subject_facet, "subject_geographic": subject_geographic, "genre": genre, "genre_specific": genre_specific, "name_facet": name_facet, "name_role": name_role, "date_human": date_human, "date_start": date_start, "date_end": date_end, "publisher": publisher, "collection_name": collection_name, "physical_location": physical_location, "related_item_host": related_item_host, "type_of_resource": type_of_resource, "URL": URL } )) # Now for all of the vector store and reranking stuff import faiss from langchain_community.docstore.in_memory import InMemoryDocstore from langchain_community.vectorstores import FAISS from langchain.embeddings import HuggingFaceEmbeddings # embeddings model embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2") # creating the vector store index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world"))) vector_store = FAISS( embedding_function=embeddings, index=index, docstore=InMemoryDocstore(), index_to_docstore_id={}, ) # now for the reranking step weights = { "title": 1.0, "subtitle": 0.95, "title_alt": 0.9, "abstract": 0.85, "subject_facet": 0.8, "subject_geographic": 0.75, "genre": 0.7, "genre_specific": 0.65, "name_facet": 0.6, "name_role": 0.55, "date_human": 0.5, "date_start": 0.45, "date_end": 0.4, "publisher": 0.35, "collection_name": 0.3, "physical_location": 0.25, "related_item_host": 0.2, "type_of_resource": 0.15, "URL": 0.1 } from sentence_transformers import SentenceTransformer from sklearn.metrics.pairwise import cosine_similarity import numpy as np from langchain.embeddings import HuggingFaceEmbeddings # our vector store: # embedding model embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2") def compute_relevance_score(metadata_value, query): """ Compute cosine similarity between the query and a metadata value using sentence-transformers. Args: metadata_value (str): The metadata value to compare. query (str): The query string. Returns: float: Cosine similarity score (between 0 and 1). """ if not metadata_value or not query: return 0 # Return 0 if either the metadata or query is empty # Encode the metadata value and query into embeddings embeddings = model.encode([metadata_value, query], convert_to_tensor=False) # Convert to NumPy metadata_embedding, query_embedding = embeddings # Compute cosine similarity similarity = cosine_similarity([metadata_embedding], [query_embedding]) return similarity[0][0] # Extract the scalar similarity value def rerank_documents(documents, query, weights, vector_store, k=10): """ Rerank documents based on metadata relevance scores and FAISS vector similarity scores. Args: documents (list): List of Document objects. query (str): The query string used for retrieval. weights (dict): Weights for each metadata field. vector_store (str): The vector store itself to get the similarity score Returns: list: Reranked documents in descending order of relevance. 
""" final_score = 0 reranked_results = [] returned_docs = vector_store.similarity_search_with_score(query, k) for doc in returned_docs: final_score = doc[1] # Add weighted relevance scores for each metadata field for field, weight in weights.items(): metadata_value = doc[0].metadata.get(field, "") # Safely get metadata field value relevance_score = compute_relevance_score(metadata_value, query) final_score += weight * relevance_score reranked_results.append((doc, final_score)) # Sort documents by the final score in descending order reranked_results.sort(key=lambda x: x[1], reverse=True) return [doc for doc, score in reranked_results] docs = rerank_documents(documents, "Newspaper", weights, vector_store) # now we should get an output like this for some k value: # ('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:gf06jp23d', 'Reranked score: 1.1741459369659424') docs_list = [(docs[i][0].metadata['title'], docs[i][0].metadata['URL'], f"Reranked score: {docs[i][1]}") for i in range(len(docs))] docs_list.sort(key=lambda x: x[2], reverse=True) for doc in docs_list: print(doc) \ No newline at end of file +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns +import json +import pandas as pd +from langchain_openai import ChatOpenAI + +# replace with file of your choosing +file = open("sample_full_text.json") +full_text = json.load(file) + +# metadata csv file; should be included in repo +df_attributes = pd.read_csv("metadata_attributes.csv") + +model = ChatOpenAI() + +import re +def get_title(text): + match = re.search(r'\d+\s+(.+?)\n', text) + + # Extracting and printing the title if there's a match + if match: + title = match.group(1) + return title + +# Turn the BPL data into a Document +from langchain.schema import Document + +documents = [] + +for doc in full_text: + # Extract metadata fields and apply get_title() + title = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "title_info_primary_tsi"])) + title_subtitle = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "title_info_primary_subtitle_tsi"])) + title_alt = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "title_info_alternative_tsim"])) + abstract = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "abstract_tsi"])) + subject_facet = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "subject_facet_ssim"])) + subject_geographic = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "subject_geographic_sim"])) + genre = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "genre_basic_ssim"])) + genre_specific = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "genre_specific_ssim"])) + name_facet = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "name_facet_ssim"])) + name_role = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "name_role_tsim"])) + date_human = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "date_tsim"])) + date_start = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "date_start_dtsi"])) + date_end = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "date_end_dtsi"])) + publisher = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "publisher_tsi"])) + collection_name = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "collection_name_ssim"])) + physical_location = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "physical_location_ssim"])) + related_item_host = 
get_title(str(df_attributes.loc[df_attributes["id"] == doc, "related_item_host_ssim"])) + type_of_resource = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "type_of_resource_ssim"])) + URL = "https://www.digitalcommonwealth.org/search/" + get_title(str(df_attributes.loc[df_attributes["id"] == doc, "id"])) + + # Create Document with metadata + documents.append(Document( + page_content=full_text[doc]['text'], + metadata={ + "title": title, + "subtitle": title_subtitle, + "title_alt": title_alt, + "abstract": abstract, + "subject_facet": subject_facet, + "subject_geographic": subject_geographic, + "genre": genre, + "genre_specific": genre_specific, + "name_facet": name_facet, + "name_role": name_role, + "date_human": date_human, + "date_start": date_start, + "date_end": date_end, + "publisher": publisher, + "collection_name": collection_name, + "physical_location": physical_location, + "related_item_host": related_item_host, + "type_of_resource": type_of_resource, + "URL": URL + } + )) + +# Now for all of the vector store and reranking stuff +import faiss +from langchain_community.docstore.in_memory import InMemoryDocstore +from langchain_community.vectorstores import FAISS +from langchain.embeddings import HuggingFaceEmbeddings + +# embeddings model +embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2") + +# creating the vector store +index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world"))) + +vector_store = FAISS( + embedding_function=embeddings, + index=index, + docstore=InMemoryDocstore(), + index_to_docstore_id={}, +) + +# now for the reranking step +weights = { + "title": 1.0, + "subtitle": 0.95, + "title_alt": 0.9, + "abstract": 0.85, + "subject_facet": 0.8, + "subject_geographic": 0.75, + "genre": 0.7, + "genre_specific": 0.65, + "name_facet": 0.6, + "name_role": 0.55, + "date_human": 0.5, + "date_start": 0.45, + "date_end": 0.4, + "publisher": 0.35, + "collection_name": 0.3, + "physical_location": 0.25, + "related_item_host": 0.2, + "type_of_resource": 0.15, + "URL": 0.1 +} + +from sentence_transformers import SentenceTransformer +from sklearn.metrics.pairwise import cosine_similarity +import numpy as np +from langchain.embeddings import HuggingFaceEmbeddings + +# our vector store: + +# embedding model +embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2") + +def compute_relevance_score(metadata_value, query): + """ + Compute cosine similarity between the query and a metadata value using sentence-transformers. + + Args: + metadata_value (str): The metadata value to compare. + query (str): The query string. + + Returns: + float: Cosine similarity score (between 0 and 1). + """ + if not metadata_value or not query: + return 0 # Return 0 if either the metadata or query is empty + + # Encode the metadata value and query into embeddings + embeddings = model.encode([metadata_value, query], convert_to_tensor=False) # Convert to NumPy + metadata_embedding, query_embedding = embeddings + + # Compute cosine similarity + similarity = cosine_similarity([metadata_embedding], [query_embedding]) + return similarity[0][0] # Extract the scalar similarity value + + + +def rerank_documents(documents, query, weights, vector_store, k=10): + """ + Rerank documents based on metadata relevance scores and FAISS vector similarity scores. + + Args: + documents (list): List of Document objects. + query (str): The query string used for retrieval. + weights (dict): Weights for each metadata field. 
+ vector_store (str): The vector store itself to get the similarity score + + Returns: + list: Reranked documents in descending order of relevance. + """ + + final_score = 0 + + reranked_results = [] + returned_docs = vector_store.similarity_search_with_score(query, k) + for doc in returned_docs: + final_score = doc[1] + # Add weighted relevance scores for each metadata field + for field, weight in weights.items(): + metadata_value = doc[0].metadata.get(field, "") # Safely get metadata field value + relevance_score = compute_relevance_score(metadata_value, query) + final_score += weight * relevance_score + + reranked_results.append((doc, final_score)) + + # Sort documents by the final score in descending order + reranked_results.sort(key=lambda x: x[1], reverse=True) + return [doc for doc, score in reranked_results] + + +docs = rerank_documents(documents, "Newspaper", weights, vector_store) + +# now we should get an output like this for some k value: +# ('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:gf06jp23d', 'Reranked score: 1.1741459369659424') +docs_list = [(docs[i][0].metadata['title'], docs[i][0].metadata['URL'], f"Reranked score: {docs[i][1]}") for i in range(len(docs))] +docs_list.sort(key=lambda x: x[2], reverse=True) +for doc in docs_list: + print(doc) From 0acc75005b77eaa88312b576102ac7b85104229b Mon Sep 17 00:00:00 2001 From: Brandon Vargus Date: Tue, 26 Nov 2024 18:59:26 -0500 Subject: [PATCH 4/6] adjusted reranking algo and included normalization --- PoC/POC.ipynb | 734 ++++++++++++++++++++++---------------------------- 1 file changed, 322 insertions(+), 412 deletions(-) diff --git a/PoC/POC.ipynb b/PoC/POC.ipynb index 4c2f270..a027a64 100644 --- a/PoC/POC.ipynb +++ b/PoC/POC.ipynb @@ -10,10 +10,21 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 12, "id": "3934dc57-c4c3-4892-bcf4-bce4aa5c48af", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + } + ], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", @@ -33,19 +44,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "30d31b31-2c2a-40ae-99b6-5502395f8bc7", - "metadata": {}, - "outputs": [], - "source": [ - "# file path for metadata file on SCC: /projectnb/sparkgrp/ml-bpl-rag-data/full_data/bpl_data.json\n", - "meta = open(\"../EDA Phase/bpl-digital-commonwealth/bpl_data.json\")\n", - "bpl_metadata = json.load(meta)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, + "execution_count": 16, "id": "b50d8c5f-dfea-454f-860e-13315a9c2fea", "metadata": {}, "outputs": [], @@ -58,10 +57,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "1e4c5601-648f-4ff1-a5ce-d8823c46f884", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "133\n" + ] + } + ], "source": [ "print(len(full_text))" ] @@ -76,51 +83,10 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "id": "da05e8ae-71f6-4ab3-b2fa-0193b77d6262", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Charlotte N.C.\n", - "Feb 21st/57\n", - "Z.B. Oaks Esq\n", - "Charleston S.C.\n", - "Dr Sr\n", - "I take the\n", - "Liberty to Address you as Regards\n", - "your Negro Market & your\n", - "Opinion as to how it will continue\n", - "through the Spring & Summer\n", - "I have an Idea of Trying To puchase\n", - "in the Mountians of N.C. & Va\n", - "& Selling in your Market or in\n", - "Richmond Va. I expect to Trade\n", - "on the Small Scale for Some\n", - "Market & if I can Sell in\n", - "your Market for a fair profit\n", - "I Shall do my Buisiness with\n", - "you & C\n", - "\n", - "I wish To no what Boys from 18 to\n", - "20 yrs old [deletion]and[/deletion] both no 1 & no 2 Boys\n", - "also Boys 12 ys old Say weigs 80 to 90\n", - "lbs & girls 12 ys old weighs Say 60 to\n", - "80 lb & from 14 To ys old To 20 ys old\n", - "Please write me by Return Mail\n", - "& give me the Market prices of\n", - "above Negros & [deletion][/deletion] are they [deletion][/deletion]\n", - "Brisk Sale or dull Address\n", - "me Charlotte N.C.\n", - "Yours Respectfully\n", - "T.H. Jones\n", - "\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "print(full_text['commonwealth:w3764603d']['text'])" ] @@ -579,7 +545,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 13, "id": "49e1b410-a7b0-4ef3-bbb8-2c240e568a73", "metadata": {}, "outputs": [ @@ -587,7 +553,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/ipykernel_12916/3787498575.py:1: DtypeWarning: Columns (10,16,17,18,20,21,22,23,24,25,27,29,33,34,36,41,42,43,44,54,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,128,129,130,131,132,133,134,135,136,137,138,139,140) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", + "/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/ipykernel_8401/3787498575.py:1: DtypeWarning: Columns (10,16,17,18,20,21,22,23,24,25,27,29,33,34,36,41,42,43,44,54,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,128,129,130,131,132,133,134,135,136,137,138,139,140) have mixed types. Specify dtype option on import or set low_memory=False.\n", " df_attributes = pd.read_csv(\"metadata_attributes.csv\")\n" ] } @@ -596,98 +562,6 @@ "df_attributes = pd.read_csv(\"metadata_attributes.csv\")" ] }, - { - "cell_type": "markdown", - "id": "972493d9-63a1-477d-811e-c1d951a2d63c", - "metadata": {}, - "source": [ - "Writing the get_title function to clear away any whitespace and newline characters from the title of each document." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "d819646f-51cc-4817-9542-ecfc9ea4af33", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['海员们', ':', '要警惕航运事故\\nName:', 'title_info_primary_tsi,', 'dtype:', 'object']" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "str(df_attributes.loc[df_attributes[\"id\"] == \"commonwealth-oai:xp68m844v\"][\"title_info_primary_tsi\"]).split(\" \")[4:]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "754c22c6-5d35-436a-922a-0f5f6cafa6c9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'1199893 NaN\\nName: title_info_alternative_tsim, dtype: object'" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "str(df_attributes.loc[df_attributes[\"id\"] == \"commonwealth:1j92ng13k\"][\"title_info_alternative_tsim\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "id": "09331a9b-135a-46ec-8b0a-70c70ba1c261", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "585812" - ] - }, - "execution_count": 81, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_attributes[\"genre_specific_ssim\"].isna().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "id": "914df792-9343-43fd-83cd-7678e5a56f8c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1303800" - ] - }, - "execution_count": 76, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_attributes.shape[0]" - ] - }, { "cell_type": "markdown", "id": "1c6d1c2d-16ad-43a0-844a-72238901c07d", @@ -696,44 +570,17 @@ "### Turn full text into Documents type" ] }, - { - "cell_type": "code", - "execution_count": 5, - "id": "51f7f3b8-ed7a-43a1-949f-aa43e594af99", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1 Poster produced by the International Transport...\n", - "Name: abstract_tsi, dtype: object\n" - ] - } - ], - "source": [ - "print(str(df_attributes.loc[df_attributes[\"id\"] == \"commonwealth-oai:xp68m844v\", \"abstract_tsi\"]))" - ] - }, { "cell_type": "markdown", - "id": "b072fe0a-b538-4704-85a0-b6862b0653b6", + "id": "c74fe245-886d-4e9b-a554-77308c75e44b", "metadata": {}, "source": [ - "Important Metadata to Embed:\n", - "- title_info_primary_tsi\n", - "- title_info_primary_subtitle_tsi\n", - "- title_info_alternative_tsim\n", - "- abstract_tsi\n", - "- subject_facet_ssim\n", - "- 
subject_geographic_sim\n", - "- genre_basic_ssim\n", - "- genre_specific_ssim" + "Now we will make Document objects with the important metadata attributes" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 14, "id": "15d83b93-6840-4618-ac40-1c7de9e5cc1e", "metadata": {}, "outputs": [], @@ -750,7 +597,25 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, + "id": "c5947005-6e02-4130-9d4e-5487cd389dd5", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.schema import Document\n", + "documents = []\n", + "for doc in full_text:\n", + " title = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"title_info_primary_tsi\"]))\n", + " ID = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"id\"]))\n", + " abstract = str(df_attributes.loc[df_attributes[\"id\"] == doc, \"abstract_tsi\"])\n", + " title_subtitle = str(df_attributes.loc[df_attributes[\"id\"] == doc, \"title_info_primary_subtitle_tsi\"])\n", + "\n", + " documents.append(Document(page_content=\"\", metadata={'title':title, 'ID':ID, 'abstract':abstract, 'title_subtitle':title_subtitle}))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, "id": "6c3f5614-cc45-4fb4-9b13-1fd0361da1b6", "metadata": {}, "outputs": [], @@ -784,7 +649,7 @@ " \n", " # Create Document with metadata\n", " documents.append(Document(\n", - " page_content=full_text[doc]['text'],\n", + " page_content=\"\",\n", " metadata={\n", " \"title\": title,\n", " \"subtitle\": title_subtitle,\n", @@ -811,7 +676,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 18, "id": "b58b4530-27e4-4ed4-80be-4ee240892480", "metadata": {}, "outputs": [ @@ -821,7 +686,7 @@ "dict_keys(['title', 'subtitle', 'title_alt', 'abstract', 'subject_facet', 'subject_geographic', 'genre', 'genre_specific', 'name_facet', 'name_role', 'date_human', 'date_start', 'date_end', 'publisher', 'collection_name', 'physical_location', 'related_item_host', 'type_of_resource', 'URL'])" ] }, - "execution_count": 11, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -838,18 +703,6 @@ "# RAG Pipeline" ] }, - { - "cell_type": "code", - "execution_count": 7, - "id": "648466aa-3142-4ece-aa02-4454c7f6ee41", - "metadata": {}, - "outputs": [], - "source": [ - "# set openai api key\n", - "import os\n", - "os.environ[\"OPENAI_API_KEY\"] = \"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\"" - ] - }, { "cell_type": "markdown", "id": "895258be-7bf4-4d8d-8f5a-79d80dfe8c36", @@ -860,17 +713,7 @@ }, { "cell_type": "code", - "execution_count": 20, - "id": "1b537ce7-2eb2-4392-bc71-5a33ede503df", - "metadata": {}, - "outputs": [], - "source": [ - "#!pip install langchain-ollama" - ] - }, - { - "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "c1b3815e-dec5-4f22-b863-d91a57b5ccad", "metadata": {}, "outputs": [], @@ -879,15 +722,27 @@ "from langchain_openai import OpenAIEmbeddings\n", "import faiss\n", "from langchain_community.docstore.in_memory import InMemoryDocstore\n", - "from langchain_community.vectorstores import FAISS" + "from langchain_community.vectorstores import FAISS\n", + "from langchain.embeddings import HuggingFaceEmbeddings" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 5, "id": "03554a37-d142-45eb-be33-a333929b927d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/ipykernel_7740/4140823313.py:2: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-huggingface package and should be used instead. To use it run `pip install -U :class:`~langchain-huggingface` and import as `from :class:`~langchain_huggingface import HuggingFaceEmbeddings``.\n", + " embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", + "2024-11-26 15:04:19.450227: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + ] + } + ], "source": [ "#embeddings = OpenAIEmbeddings(model=\"text-embedding-3-large\", dimensions=3072)\n", "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")" @@ -895,7 +750,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 19, "id": "0bea78d8-4b72-4da0-a50d-89e0fe509454", "metadata": {}, "outputs": [], @@ -904,157 +759,157 @@ "\n", "vector_store = FAISS(\n", " embedding_function=embeddings,\n", - " index=index,\n", " docstore=InMemoryDocstore(),\n", + " index=index,\n", " index_to_docstore_id={},\n", ")" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 20, "id": "b1bee292-778a-480e-8eb3-5cc37587ce85", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['68762a4a-dbcb-411e-9047-ed04e44a794a',\n", - " 'f622dbf6-aee1-43e4-aba1-59fde531a7ad',\n", - " '927c3752-7746-4c88-a58b-e01a6547d857',\n", - " '70bf52c8-b79e-459b-9aa0-c1f12e12d842',\n", - " 'ade0d10f-c301-4ac1-99b8-6e26562ac259',\n", - " '01721bef-99a0-4034-8bc0-655ebec123d1',\n", - " '5c0969cd-a297-4b3f-b73e-9b3e5ed60c1e',\n", - " 'f707ab88-6b09-48fc-94ed-93ac650931a2',\n", - " 'bd269b63-b143-4535-83f8-27515883647f',\n", - " '5b0f71e5-520e-4090-8aa7-fc4f152f42f3',\n", - " 'a97737a2-ba3c-4a6f-b169-967ea2856f57',\n", - " '5e9a4add-7aae-45c1-a071-bd8510e81829',\n", - " '53b49886-6e08-4c6e-b69d-d12fa9accc12',\n", - " 'cdb37b1a-8ff1-462c-8052-90b8681d1700',\n", - " '44eca254-78b8-4ce5-aa31-ee02b8895734',\n", - " '15cb7183-2758-468a-88d0-35348ccf357a',\n", - " 'fd3e2244-e666-4d73-9cac-d980ce9b49ee',\n", - " 'b48501c1-c52d-4cbf-bcc0-2b669017736c',\n", - " '3eda50a4-0269-44ea-8594-e1ed21662dc1',\n", - " 'd6d96535-021c-44c8-89c8-796ebd3ebc0d',\n", - " 'e20dfb8b-7e99-4299-82ae-b69c4b2aa5e8',\n", - " '89cbeb0c-253a-4c01-aba0-b3ef82622381',\n", - " 'fa325062-a886-499d-8377-2feedc5a8262',\n", - " '379080c0-03f9-4ff7-89b4-be8edcd7be96',\n", - " '06ff8bdc-975a-4ec2-8273-e2ca4f489df4',\n", - " '2ab5e13c-9051-4022-a476-92a97b05c5c1',\n", - " '7e144845-3350-4871-9404-fa7e3f734b78',\n", - " '2de7c819-546d-4470-8f0a-b91a7903b0ed',\n", - " 'a05149dd-2b01-4f03-9cd0-01fcea8780d2',\n", - " 'd9d462c0-d928-49cc-9172-7152328e4d51',\n", - " '8edb060d-8959-418d-9470-5da965ad2b9d',\n", - " '19d25b97-34b0-4f53-9ec3-4060239242dc',\n", - " '49a6e2ef-7f85-4a3a-98d6-3b3dd8b4c3b6',\n", - " '54913f79-d38b-40b3-b251-86c59721fa3d',\n", - " 'e87af048-11d2-4977-b4aa-6465d5017fba',\n", - " 'de44abe6-739b-44c5-806b-06d52aa1e56b',\n", - " '82898991-0951-40f6-8926-309db4a807b7',\n", - " '02c3a940-74db-4d60-a173-ca9bbd702cd0',\n", - " 
'20f881bb-7d3b-4f49-82a3-5ac8e7578322',\n", - " '141d3d1e-6d5a-4dd9-95bb-4ae07c81e39c',\n", - " 'b8b415f5-670b-4b56-b1ce-fe424769a182',\n", - " '55de7042-5452-4ea2-820a-6b35a594cbf4',\n", - " '404565f4-ee23-423f-a4d8-550b6f8b6e41',\n", - " '300e8b0b-2b5a-4618-aec7-8d5324de106f',\n", - " 'cab508a6-25f1-43ca-9f46-835af4e922af',\n", - " '2d370174-8c8f-418b-a083-84f52cfe9a36',\n", - " '65ff5f75-cc47-4610-999d-9a11f9894b32',\n", - " '3995b720-99e0-4f68-a740-46e7f2e5c59f',\n", - " '40af7846-ba78-48eb-83e0-6b6b844f8c79',\n", - " 'c1f51576-1e99-4f77-be03-ce202f238dbe',\n", - " 'ec7d4da2-e7a5-40d9-9e67-a37c12e03bf9',\n", - " '270222bf-57d7-4cfe-a268-1f6167ff6d7e',\n", - " '9a8373ee-c2db-4a1d-b5ac-6a04c0d2becd',\n", - " '4504f7c3-6820-49c7-be23-43d9b8ba58aa',\n", - " 'b181eab2-587c-4de7-92a4-6f8b0f3f2b00',\n", - " '7e889a1d-c1b8-4c53-af90-de3eaa6db021',\n", - " '6007601e-7b9d-4064-b16c-655169e9d72a',\n", - " '11e280e2-dfce-4a52-8867-296155f78eb8',\n", - " '821de396-47f8-4606-934c-13c1bf884473',\n", - " '9b7e6753-61d1-446d-8a2b-344b0e41f84c',\n", - " '5c2d20de-9344-441a-a1d8-0ca0f4b1fca4',\n", - " 'aee572eb-ec36-4f61-b35e-9dd2c6ab36f2',\n", - " '94ac76ef-42f8-4a03-bffe-f86cbab72408',\n", - " '1be0d43f-ee71-419c-ba50-3aaf024d27c5',\n", - " 'bfd48d92-f28e-48f1-815b-756642cf2b8e',\n", - " '8023464e-5507-4192-ac25-f06fb585e68d',\n", - " '75bde2f0-1b5f-4e05-aa8a-ff2d315a06b4',\n", - " '0b34222f-1dda-4d19-9810-11e75d1272e5',\n", - " '97500c01-1c8f-42ef-8efd-19223c68c46a',\n", - " 'cf8c5ea5-d750-4641-bb5b-e971494b7666',\n", - " '3fd5a660-ab78-4897-a700-1a73b533b0cc',\n", - " '17368981-b502-45e1-abaa-4a9b09c096a8',\n", - " 'de201d3e-8d95-41a6-b8ac-4afba7909331',\n", - " '192f929f-1bba-4e75-b05e-aa8ae937a092',\n", - " '86b39447-2fb0-4899-b1c1-6a89e6bfee0f',\n", - " 'ff5d3a8e-4da6-47f0-b573-036670c5a936',\n", - " 'ef0a7589-2792-43b9-9895-b40d175e5ff9',\n", - " '97338842-0c45-4e94-ac34-81dc74d259fd',\n", - " 'f24aca1a-8ea3-40fc-a868-c300bd6c7c5a',\n", - " '41af6f16-89f8-46c5-89ed-a0853385c24c',\n", - " 'a2f0344b-6ca1-4c2c-ac08-de839b7a61cd',\n", - " '8d23a995-124f-40c5-820a-b86eacb7589f',\n", - " '8846fe55-3eda-4af1-aee1-4c8faa9a5c73',\n", - " 'ae4e13a7-db9f-43d6-b27b-913db7f23b48',\n", - " '05846bed-e2ba-4b45-84eb-aef0fbdb1052',\n", - " 'ac778fa3-e3b2-42e3-8152-c63701895dfe',\n", - " 'c697142b-3c95-440d-b61a-7183c08c171d',\n", - " 'caa1e83f-ae87-4704-9f2e-d1dad34a9cd2',\n", - " '43847c84-8fe5-4457-9f82-526695e2e97a',\n", - " '5521bbc6-cdff-4703-bf3e-63e01596d21c',\n", - " '3c275f68-8f64-452b-bece-3949e2c25b22',\n", - " 'bf581ede-0913-424d-8a2d-216ae9cadcb0',\n", - " '117e9f7d-7c8b-4366-9f84-fbd5dd1171dd',\n", - " 'c116254f-1415-4188-bc93-0487bcbe47ee',\n", - " '40fac159-6763-4a30-addb-975a6e4e69a3',\n", - " '7a181599-2847-4d9b-957f-f4fc7468aea4',\n", - " '4df69e1f-2e1b-4be0-a161-a374f493c4cb',\n", - " '37813f0f-33a6-4adc-ac55-cbca0b74471f',\n", - " 'b0dbb8ef-e049-4393-a159-2e0f9081b0a3',\n", - " '57da8d7d-e0fc-4868-938d-a85f6d6663ff',\n", - " '618dad07-a666-4827-b188-55d5bf31aed0',\n", - " '3e2724b7-4f24-42fd-9def-ca0f9edcd615',\n", - " '41bf1d9b-da94-4e8f-9805-18bf93ff1100',\n", - " 'c5c85271-1e66-4dd2-a338-a2b48a9b07e4',\n", - " '80646915-6af3-423c-a2de-71786272f086',\n", - " 'bf824464-3eae-4343-b0ab-5b565445daec',\n", - " 'd3127b71-3763-4532-8863-0ce42170dfdb',\n", - " '700f2cb1-85bd-4d28-b95f-68f4536185c1',\n", - " 'f79d14f8-9eaa-4f3b-af7c-62095dae4497',\n", - " '196c8830-7cea-485e-a61c-af5863922733',\n", - " '989f5923-c37f-4a18-b61f-3e0b912f01d2',\n", - " '36ea71ca-36e3-4fcd-bdb9-0f41a83378a3',\n", - " 
'3567b36a-f249-4ab2-9392-a734303ce5c1',\n", - " '569ed9cf-6b14-4822-892c-4d19205490f2',\n", - " '7db05e1e-0406-4d9f-bdde-e7351932c25e',\n", - " '818a69ab-de3d-42f4-92d0-2accb701ef8a',\n", - " '96fc14b8-4930-4177-b20c-dba396297577',\n", - " 'ecca8db2-b629-4f05-8018-4aa99dc26842',\n", - " '2cc41514-6e02-4cba-bbb4-c2a69454559b',\n", - " 'b41fa3b0-ea8e-40e5-a0be-1abd27cbc23e',\n", - " '1254e098-a9dc-4eba-a0b9-929ed848ed91',\n", - " '1a9ea687-3318-4ad7-a55a-2e0044583667',\n", - " 'ff5a57dd-69a9-4d9a-9d26-56b9271f1b8e',\n", - " '593a9427-03ae-4b3b-9215-2d14f10341f3',\n", - " '5903b7e0-489f-4011-bbd7-cac82b19423a',\n", - " '48072b10-faae-4609-a298-5c6193488ed9',\n", - " 'ec7cd14c-1248-4285-a090-fc8d0ce96fd4',\n", - " 'a9c6016b-b100-4e46-8b42-a232b9be6459',\n", - " '7e851d53-1a48-4289-8b0a-27834e3e044d',\n", - " '77dde80d-7b70-4ab6-b042-c1e67932f36a',\n", - " '4642025a-d842-4fb3-83ef-76ce90a6a2bb',\n", - " '256c37b7-38b7-4642-a501-7a611e0763ac',\n", - " '65962070-4375-4479-872b-fc3300c3f1af']" + "['809ac718-6a36-4ddc-adc7-adb0f7816a47',\n", + " '2c391b69-896b-4a2f-bf9e-00872a5f31ff',\n", + " 'f548658e-65eb-4653-95c1-485c8176cbf5',\n", + " 'e1fc2f95-c575-46f1-b323-13bdd0da1da4',\n", + " '5049ac41-e24c-43d4-90c4-950ba120fa9c',\n", + " 'd664e558-6dc3-4607-86f1-32868c9c9f47',\n", + " '7c3ad58c-a311-422c-afc6-8217f301a6cd',\n", + " 'cf0542b5-a070-4567-8159-ee948a77fe8e',\n", + " '26c38bfc-243d-499d-8b3c-e5d71aebf588',\n", + " 'a6658e4b-0c19-400c-a682-52b6de933d8e',\n", + " 'dbd05713-bbde-43b9-a89a-dd7a815317a4',\n", + " '430bd79f-7a37-4f72-8e32-7db7d2c039dc',\n", + " '5f2c9a44-855c-4f25-813b-76bf204c4d8b',\n", + " '84ae00a7-9673-4adb-8cc2-6d53774be25a',\n", + " 'dd3f5c10-6ff2-4a7a-b16a-5c2ae4a46cba',\n", + " 'bf6e1bf8-4373-4dfb-9205-c5bf53eb76f8',\n", + " '6aec476a-2550-45bd-9aee-012e913a074e',\n", + " '82c725c6-7cdd-430e-b900-f783034adfb2',\n", + " '45ff73db-3396-4c9a-8e87-448fe4f8d5ac',\n", + " '43a9a36d-ba1d-4a5f-993c-0328c25996e8',\n", + " '82970ba9-e8c7-490d-9013-c5cf62c7f60f',\n", + " '9a1633ba-90d5-4374-a578-c99351086246',\n", + " 'dd67de24-51b7-4ea5-b453-f8e52f209a38',\n", + " 'dfda00dc-e531-44d0-8f65-69cbb9db459b',\n", + " 'd3a06fb7-1ed1-44ee-8722-04e50fe8c05d',\n", + " '72af4cf3-287f-49cc-bba9-72b5cc4a5a0a',\n", + " '95e46dae-ee37-4f0c-818f-83ad28bc9a8e',\n", + " 'a19ce574-47af-4dbf-a136-77d308ff00d6',\n", + " '75762289-e5d0-420e-9754-f800153f82a5',\n", + " '384c7132-6755-4abf-ac1f-05e7b80f329a',\n", + " '63f2ed10-4299-442b-b528-5a575d72e7d6',\n", + " 'f2b3305b-5847-4039-affa-351130bce422',\n", + " '511db69b-a7bb-49b3-903f-2474db1e8bed',\n", + " 'c565e6f2-d42b-4fa4-873b-3214ccc8359e',\n", + " '45aa23c9-e1b7-4b7e-9e84-8e29a52ce6ff',\n", + " 'fa017f87-1b7b-46ed-b111-2dcb296ec68e',\n", + " 'bcb30bbc-a09d-47dc-a3d7-8162f231d006',\n", + " 'bbbc81f1-b06b-4636-ac44-cb4486af15f0',\n", + " '384b77cf-a6c4-4e30-9358-4d77b9592fb4',\n", + " '88151c65-9d1c-4b48-bc85-93d817936aba',\n", + " '2b8afaaa-9094-4d0e-b671-53f8018a03af',\n", + " '59b0d7f3-2e77-469c-9cf7-82be345d7472',\n", + " '0a91fa62-95ea-4cb7-ba1c-b95453b7df31',\n", + " 'aac0cf31-ad27-4148-9149-11a0fb65c274',\n", + " '504e49be-1436-4034-8af3-261a454a5363',\n", + " 'd1ebbf37-979c-4343-a69b-b292f01e2cc4',\n", + " 'd188aac2-bbad-4359-b53c-74282a4017c4',\n", + " '3f21253c-f928-40be-a0a4-a2b182a9f578',\n", + " 'aa7f8bac-6385-4078-8c10-30b7a99ff032',\n", + " '774dffec-5096-4b4d-aa79-d0be8f16f3d9',\n", + " '289f7e36-9700-4661-ad80-7926052ed1f3',\n", + " 'dccf192d-f623-446e-b9be-6d6df98f2fc7',\n", + " 'cbb848d4-196e-4108-9963-fcccb4a8b513',\n", + " 
'b796c0f9-c883-47c3-afd5-0b16d56ca2f5',\n", + " '718c6742-e9b7-45e4-9c89-95727baa2cb3',\n", + " '1b2c307a-b713-474e-93e7-c40f9bc57eab',\n", + " '9ac86b88-fdeb-4c5c-b88c-ba143167b584',\n", + " '30ce78f2-0a09-4f35-abb2-ae48bf2da3f0',\n", + " 'ee6bafe4-9557-401e-9e8e-cad451b8cba0',\n", + " '4df4c508-e67a-4e8b-9446-22996c051538',\n", + " '608d862f-243a-4686-83a0-7726850a7703',\n", + " '28622e79-6257-46ad-8d0b-227078b0192a',\n", + " '797a8ec5-dd36-4892-9e64-8bdc912f1f08',\n", + " '17e43ec9-b702-4f33-a5fc-014ac988d05f',\n", + " 'daccbc57-bcfa-479a-abe7-367d07c93b26',\n", + " '0a3ecebf-afdc-4124-9bac-8132c2b4958a',\n", + " 'a5bdca3f-cc24-470f-b49a-54c435de582a',\n", + " 'f0ccf63a-03f4-4c44-8a90-cc31b68fbf37',\n", + " '0a9427d3-7e33-4068-91c0-95d7bf2484cf',\n", + " 'e4778209-1a5a-45ff-8c1e-f94f839255f1',\n", + " '461c81b5-9fa3-4622-9a10-af8664fee897',\n", + " '850b68b2-eab6-412a-9bc5-c9e971533891',\n", + " '9fb4e8f9-3e0c-4b0e-8365-62bf44fab2bf',\n", + " '9fc852eb-c189-43b7-a61c-358d6c8e6c93',\n", + " 'a5702378-e662-4bd3-be67-4e587495f41f',\n", + " '9ba0b5af-9c8c-4d58-ba9a-bff26ca2571f',\n", + " '53e83397-0b6c-4f6f-8ebf-c7dee136db51',\n", + " 'bc09b5e1-76a2-4904-9959-680a23bf98e4',\n", + " '236cbaf3-c36b-44dd-afbe-6fe396a12f0f',\n", + " 'a9ad4fbe-6366-451a-ae3e-b4fee2bf320f',\n", + " 'ccfcfd3e-ef70-42cd-95f9-c628bfae7747',\n", + " '2f81758a-d8c8-4a1b-852c-67d9ad8f6a54',\n", + " 'f04d8ace-8666-4ec7-a095-a7cc42fca152',\n", + " '18906e6f-a1ea-4bc9-8ce9-11884e320c1c',\n", + " '9077b482-c838-498f-a290-7c38f0df520e',\n", + " 'dd0fe10e-5954-4a87-b0e7-47eb5ed17635',\n", + " '689cc11b-9a6e-4c9f-a4ef-59dd473cae7c',\n", + " '9e58c8ee-ef08-4aba-9cca-92e505503f89',\n", + " '683848ad-8c37-403c-9992-38a219d2239c',\n", + " '9e705e7a-99e4-4914-803c-aefa0f40edbd',\n", + " '717d8e41-250d-47e0-b723-7224fed80f22',\n", + " '8a9efc67-579f-4dcf-ac0e-caeb80245052',\n", + " '2c08568f-ce5d-4ecd-87a6-9baec410cb39',\n", + " 'ef73a66a-32ab-4945-b644-6db8c29562be',\n", + " 'e9da66a8-13d7-40d2-ae36-1689770173c3',\n", + " '99b918bb-4e37-4a46-b8f8-9167fe95e618',\n", + " '8b589cf8-286b-4836-bfe7-909947401920',\n", + " 'd7b6853c-772e-4621-ac41-543ea43ace9d',\n", + " 'c9e79c0d-966c-4e01-b962-ac893773ad75',\n", + " '2e8a083e-6307-4428-83cd-65b436cebbf9',\n", + " '932c94ed-36b5-40e7-a270-7bea414ad537',\n", + " '65c85bd7-0338-4e1a-954f-794f8bf85902',\n", + " '7c438b8a-f7e7-4d0b-b376-9a2194a1246c',\n", + " '6a5616f9-075c-47d0-8531-2a7dab39bd57',\n", + " '59d6b5e3-b2a6-4aa5-bad3-60fa38236e86',\n", + " '93d6af54-7141-4616-8894-e5440446f3df',\n", + " 'd1c00188-e315-41a4-bb97-e3c37b7d2beb',\n", + " '9d6fb7c5-b366-4516-9d36-be3854360df3',\n", + " '6a5eac17-180c-474e-927a-66b2c0f880e3',\n", + " '3a68ad92-8ab8-44f3-8f1c-1c852575620b',\n", + " '0346130f-92a6-4d52-b746-741adfe8e604',\n", + " 'dcf5b04b-5990-40ac-9685-d3f67c896ba4',\n", + " '93a12904-61bd-4f2a-af50-691be41d14b5',\n", + " '8dc1d6b6-af25-42f7-b2bb-77dcfc1127ee',\n", + " 'e301f0dc-5399-4c8c-bf10-a87f344015cc',\n", + " 'b6a26a83-e757-4b57-bcd3-6e7bd9e5b1bf',\n", + " '6013f6a2-c398-46c6-954b-53882646f78b',\n", + " '96e4c7f7-5a57-4815-810a-5e3646704397',\n", + " '9f48b86c-becd-44de-989f-540764d8299b',\n", + " '59b136fa-4b48-45ed-b951-fc514085e95d',\n", + " 'ce095f0f-f663-423e-a289-32657bc386a9',\n", + " '584ccb6f-e675-4879-a439-a68c17afd92d',\n", + " '455466cd-ce88-4faa-966d-955b3144611e',\n", + " '3527b28d-f65e-4a90-ac86-aac1ec29aea2',\n", + " 'fc59f9d9-e2c6-4a6c-becd-64cb9fdc120f',\n", + " '32df0391-7b16-4f60-8a22-08ab3aaa6777',\n", + " '426c06c3-5550-46dd-bf4c-dc91c30de60a',\n", + " 
'146a4059-7fb6-4a7b-8665-2bf2b2805d3e',\n", + " '321bb4f3-eb98-4742-beb0-70b4496a8ab7',\n", + " 'b39d2d7c-4eea-4df1-85fd-eb286dee41f3',\n", + " 'c858d614-98f3-4c96-9be1-5ba3088643b8',\n", + " '355a6b60-f7b1-4d68-96da-45d03b1f1bb5',\n", + " '6634ee11-e63f-46f1-82f1-5d596ba4f832']" ] }, - "execution_count": 23, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1069,17 +924,21 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 33, "id": "0c3a0a6b-4640-4036-863e-77a1a715a0e3", "metadata": { "scrolled": true }, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "['https://www.digitalcommonwealth.org/search/commonwealth:dv144791c', 'https://www.digitalcommonwealth.org/search/commonwealth:rv048f292', 'https://www.digitalcommonwealth.org/search/commonwealth:9019vm69m', 'https://www.digitalcommonwealth.org/search/commonwealth:dv1441451', 'https://www.digitalcommonwealth.org/search/commonwealth:w3764472d', 'https://www.digitalcommonwealth.org/search/commonwealth:6w924s12f', 'https://www.digitalcommonwealth.org/search/commonwealth:6w924r91w', 'https://www.digitalcommonwealth.org/search/commonwealth:wm118g867']\n" + "ename": "AttributeError", + "evalue": "'list' object has no attribute 'invoke'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[33], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m retriever \u001b[38;5;241m=\u001b[39m vector_store\u001b[38;5;241m.\u001b[39msimilarity_search_with_score(\n\u001b[1;32m 2\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mManuscripts\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 3\u001b[0m k\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m3\u001b[39m\n\u001b[1;32m 4\u001b[0m )\n\u001b[0;32m----> 5\u001b[0m r \u001b[38;5;241m=\u001b[39m retriever\u001b[38;5;241m.\u001b[39minvoke(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mJohn Bishop Estlin\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 6\u001b[0m r_list \u001b[38;5;241m=\u001b[39m [x\u001b[38;5;241m.\u001b[39mmetadata[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mURL\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m r]\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28mprint\u001b[39m(r_list)\n", + "\u001b[0;31mAttributeError\u001b[0m: 'list' object has no attribute 'invoke'" ] } ], @@ -1093,6 +952,51 @@ "print(r_list)" ] }, + { + "cell_type": "markdown", + "id": "71698bba-0e8d-4bbc-8412-81c225c8cb6e", + "metadata": {}, + "source": [ + "Save and load the vector store" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "273778f9-cb7d-4bb9-acd0-16841adf8344", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'metadata' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[13], line 12\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;66;03m# Save metadata\u001b[39;00m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(store_dir, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmetadata.pkl\u001b[39m\u001b[38;5;124m\"\u001b[39m), 
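The AttributeError shown above comes from mixing two retrieval styles: similarity_search_with_score runs the search immediately and returns a plain list of (Document, score) tuples, so there is nothing left to .invoke. A small sketch of both working patterns, assuming the vector_store built above and the URL metadata field attached to each Document:

# Pattern 1: direct search. Returns [(Document, L2 distance), ...]; lower distance = closer.
hits = vector_store.similarity_search_with_score("John Bishop Estlin", k=3)
print([doc.metadata["URL"] for doc, _distance in hits])

# Pattern 2: wrap the store in a retriever object, then invoke it with the query string.
retriever = vector_store.as_retriever(search_kwargs={"k": 3})
docs = retriever.invoke("John Bishop Estlin")
print([doc.metadata["URL"] for doc in docs])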
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwb\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[0;32m---> 12\u001b[0m pickle\u001b[38;5;241m.\u001b[39mdump(metadata, f)\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFAISS index and metadata stored in \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstore_dir\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mNameError\u001b[0m: name 'metadata' is not defined" + ] + } + ], + "source": [ + "vector_store.save_local(\"faiss_index\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "98740c69-5687-4b0c-a4ae-4233208f6e22", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings import HuggingFaceEmbeddings\n", + "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", + "\n", + "vector_store = FAISS.load_local(\n", + " \"faiss_index\", embeddings, allow_dangerous_deserialization=True\n", + ")" + ] + }, { "cell_type": "markdown", "id": "d3a257cc-c687-4fc5-91a7-0c3048a49e70", @@ -1103,37 +1007,37 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 55, "id": "013d5fca-c621-4a7e-ac0e-ad5c9aba577f", "metadata": {}, "outputs": [], "source": [ - "weights2 = {\n", - " \"title\": 1.0,\n", - " \"subtitle\": 0.95,\n", - " \"title_alt\": 0.9,\n", - " \"abstract\": 0.85,\n", - " \"subject_facet\": 0.8,\n", - " \"subject_geographic\": 0.75,\n", - " \"genre\": 0.7,\n", - " \"genre_specific\": 0.65,\n", - " \"name_facet\": 0.6,\n", - " \"name_role\": 0.55,\n", - " \"date_human\": 0.5,\n", - " \"date_start\": 0.45,\n", - " \"date_end\": 0.4,\n", - " \"publisher\": 0.35,\n", - " \"collection_name\": 0.3,\n", - " \"physical_location\": 0.25,\n", - " \"related_item_host\": 0.2,\n", - " \"type_of_resource\": 0.15,\n", - " \"URL\": 0.1\n", + "weights = {\n", + " \"title\": 1000,\n", + " \"subtitle\": 500,\n", + " \"title_alt\": 500,\n", + " \"abstract\": 30,\n", + " \"subject_facet\": 1,\n", + " \"subject_geographic\": 1,\n", + " \"genre\": 1,\n", + " \"genre_specific\": 1,\n", + " \"name_facet\": 1,\n", + " \"name_role\": 1,\n", + " \"date_human\": 1,\n", + " \"date_start\": 1,\n", + " \"date_end\": 1,\n", + " \"publisher\": 1,\n", + " \"collection_name\": 1,\n", + " \"physical_location\": 1,\n", + " \"related_item_host\": 1,\n", + " \"type_of_resource\": 1,\n", + " \"URL\": 0.0\n", "}\n" ] }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 78, "id": "cc1f04d1-f553-46ae-bfc6-248125f62423", "metadata": {}, "outputs": [], @@ -1143,9 +1047,7 @@ "import numpy as np\n", "from langchain.embeddings import HuggingFaceEmbeddings\n", "\n", - "# our vector store:\n", - "\n", - "# embedding model\n", + "model = SentenceTransformer('all-MiniLM-L6-v2')\n", "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", "\n", "def compute_relevance_score(metadata_value, query):\n", @@ -1172,7 +1074,7 @@ "\n", "\n", "\n", - "def rerank_documents(documents, query, weights, vector_store, k=10):\n", + "def rerank_documents(query, weights, vector_store, k=10):\n", " \"\"\"\n", " Rerank documents based on metadata relevance scores and FAISS vector similarity scores.\n", "\n", @@ -1186,31 +1088,32 @@ " list: Reranked documents in descending order of relevance.\n", " \"\"\"\n", "\n", - " final_score = 0\n", - "\n", " reranked_results = []\n", + " total = sum(weights.values())\n", + " # returns the 
relevant documents from the query\n", " returned_docs = vector_store.similarity_search_with_score(query, k)\n", " for doc in returned_docs:\n", - " final_score = doc[1]\n", + " final_score = 0\n", " # Add weighted relevance scores for each metadata field\n", " for field, weight in weights.items():\n", " metadata_value = doc[0].metadata.get(field, \"\") # Safely get metadata field value\n", " relevance_score = compute_relevance_score(metadata_value, query)\n", - " final_score += weight * relevance_score\n", + " #print(f\"relevance_score: {relevance_score}\")\n", + " final_score += (weight * relevance_score) \n", "\n", - " reranked_results.append((doc, final_score))\n", + " reranked_results.append((doc, final_score / total))\n", "\n", " # Sort documents by the final score in descending order\n", " reranked_results.sort(key=lambda x: x[1], reverse=True)\n", - " return [doc for doc, score in reranked_results]\n", + " return [(doc, score) for doc, score in reranked_results]\n", "\n", "\n", - "docs = rerank_documents(documents, \"Newspaper\", weights2, vector_store)" + "docs = rerank_documents(\"Newspaper\", weights, vector_store)" ] }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 79, "id": "2d9172aa-6c15-4c90-856e-d0ee53100721", "metadata": {}, "outputs": [ @@ -1218,27 +1121,45 @@ "name": "stdout", "output_type": "stream", "text": [ - "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:gf06jp23d', 'Reranked score: 1.1741459369659424')\n", - "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:k356cp803', 'Reranked score: 1.161521077156067')\n", - "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:t435k083s', 'Reranked score: 1.1445826292037964')\n", - "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:8s45sw212', 'Reranked score: 1.1416451930999756')\n", - "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:9p292v62n', 'Reranked score: 1.1416230201721191')\n", - "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:z890vf594', 'Reranked score: 1.1343271732330322')\n", - "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:pv63jm38v', 'Reranked score: 1.0997507572174072')\n", - "('The Tocsin of Liberty', 'https://www.digitalcommonwealth.org/search/commonwealth:05744b168', 'Reranked score: 1.0604684352874756')\n", - "(\"Thomas's Massachusetts Spy, or, Worcester Gazette\", 'https://www.digitalcommonwealth.org/search/commonwealth:v405x072q', 'Reranked score: 1.0452649593353271')\n", - "(\"Thomas's Massachusetts Spy, or, Worcester Gazette\", 'https://www.digitalcommonwealth.org/search/commonwealth:1831h918x', 'Reranked score: 1.024101972579956')\n" + "('Thanksgiving', 'https://www.digitalcommonwealth.org/search/commonwealth:02876465m', 'Reranked score: 0.21417763510649684')\n", + "('Thanksgiving', 'https://www.digitalcommonwealth.org/search/commonwealth:jd478671b', 'Reranked score: 0.21414245699337317')\n", + "('Thanks for high school articles', 'https://www.digitalcommonwealth.org/search/commonwealth:8910r4424', 'Reranked score: 0.20574750553692622')\n", + "('T.H. Jones, Charlotte, N.C., autograph letter ...', 'https://www.digitalcommonwealth.org/search/commonwealth:w3764603d', 'Reranked score: 0.16508627951932514')\n", + "('Theodore C. 
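Because rerank_documents keeps each raw hit exactly as similarity_search_with_score returned it, every element of its result is nested: ((Document, FAISS distance), weighted_score). That is why the next cell indexes docs[i][0][0]. A short usage sketch that unpacks the structure and sorts by the numeric score, which is more robust than sorting the formatted "Reranked score: ..." strings:

docs = rerank_documents("Newspaper", weights, vector_store, k=10)

rows = []
for (document, faiss_distance), weighted_score in docs:
    rows.append((document.metadata["title"], document.metadata["URL"], weighted_score))

rows.sort(key=lambda row: row[2], reverse=True)   # sort on the float, not a string
for title, url, score in rows:
    print(f"{title} | {url} | reranked score: {score:.4f}")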
Tharin, Charleston, S.C., autograp...', 'https://www.digitalcommonwealth.org/search/commonwealth:9k41zk061', 'Reranked score: 0.14404614119555387')\n", + "('T.H. Marshall, Graniteville, S.C., autograph l...', 'https://www.digitalcommonwealth.org/search/commonwealth:ws859m407', 'Reranked score: 0.14002441968440776')\n", + "('Theodore C. Tharin, Grumesville, S.C., autogra...', 'https://www.digitalcommonwealth.org/search/commonwealth:w3764352q', 'Reranked score: 0.138896407891218')\n", + "('Theodore C. Tharin, Grumesville, S.C. [?], aut...', 'https://www.digitalcommonwealth.org/search/commonwealth:w3764306m', 'Reranked score: 0.13877071803271654')\n", + "('Theodore C. Tharin, Grumesville, S.C., autogra...', 'https://www.digitalcommonwealth.org/search/commonwealth:w3764957b', 'Reranked score: 0.1382296937873219')\n", + "('\"The Refuge of Oppression,\" from David S. Gran...', 'https://www.digitalcommonwealth.org/search/commonwealth:dv1441451', 'Reranked score: 0.12401763978186418')\n" ] } ], "source": [ "#print([docs[i].metadata['title'] for i in range(len(docs))])\n", - "docs_list = [(docs[i][0].metadata['title'], docs[i][0].metadata['URL'], f\"Reranked score: {docs[i][1]}\") for i in range(len(docs))]\n", + "docs_list = [(docs[i][0][0].metadata['title'], docs[i][0][0].metadata['URL'], f\"Reranked score: {docs[i][1]}\") for i in range(len(docs))]\n", "docs_list.sort(key=lambda x: x[2], reverse=True)\n", "for doc in docs_list:\n", " print(doc)" ] }, + { + "cell_type": "code", + "execution_count": 41, + "id": "83c1c3f2-91b4-4647-b93c-09a0af5b43a8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100\n" + ] + } + ], + "source": [ + "print(len(docs_list))" + ] + }, { "cell_type": "markdown", "id": "07cdf844-72c6-41ef-bade-9afb52bceed8", @@ -1668,17 +1589,6 @@ "### Making the Query" ] }, - { - "cell_type": "code", - "execution_count": 82, - "id": "5b01faae-fdbf-420d-9b79-e1f31e84baf8", - "metadata": {}, - "outputs": [], - "source": [ - "import os \n", - "os.environ[\"OPENAI_API_KEY\"] = \"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\"" - ] - }, { "cell_type": "code", "execution_count": 94, From d33e63675ac3434b551c168bfa3fadbd9a7f4b35 Mon Sep 17 00:00:00 2001 From: Brandon Vargus <45298256+b3v@users.noreply.github.com> Date: Thu, 5 Dec 2024 12:42:51 -0500 Subject: [PATCH 5/6] Create streamlit-rag-app.py --- streamlit-rag-app.py | 115 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 streamlit-rag-app.py diff --git a/streamlit-rag-app.py b/streamlit-rag-app.py new file mode 100644 index 0000000..3c7f55a --- /dev/null +++ b/streamlit-rag-app.py @@ -0,0 +1,115 @@ +import streamlit as st +import os +import json +from dotenv import load_dotenv + +from langchain.chains import RetrievalQA +from langchain_community.vectorstores import FAISS +from langchain.text_splitter import CharacterTextSplitter +from langchain_openai import ChatOpenAI +from langchain.schema import Document +from langchain_huggingface import HuggingFaceEmbeddings + +# Load environment variables +load_dotenv() + +# Get the OpenAI API key from the environment +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") +if not OPENAI_API_KEY: + st.error("OPENAI_API_KEY is not set. 
Please add it to your .env file.") + +# takes a few minutes to load +file = open("bpl_data.json") + +bpl = json.load(file) + +#file = open("bpl_data.json") + +#bpl = json.load(file) + +# Initialize session state variables +if 'vector_store' not in st.session_state: + st.session_state.vector_store = None +if 'qa_chain' not in st.session_state: + st.session_state.qa_chain = None + + + +def setup_qa_chain(vector_store): + """Set up the QA chain with a retriever.""" + retriever = vector_store.as_retriever(search_kwargs={"k": 3}) + llm = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY) + qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True) + + return qa_chain + +def setup_custom_chain(vector_store): + retriever = vector_store.as_retriever(search_kwargs={"k": 3}) + llm = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY) + docs = retriever.invoke() + +def main(): + + # Set page title and header + st.set_page_config(page_title="LibRAG", page_icon="📚") + st.title("Boston Public Library Database 📚") + + embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") + + # Sidebar for initialization + # st.sidebar.header("Initialize Knowledge Base") + # if st.sidebar.button("Load Data"): + # try: + # st.session_state.vector_store = FAISS.load_local( + # "vector-store", embeddings, allow_dangerous_deserialization=True + # ) + # st.session_state.qa_chain = setup_qa_chain(st.session_state.vector_store) + # st.sidebar.success("Knowledge base loaded successfully!") + # except Exception as e: + # st.sidebar.error(f"Error loading data: {e}") + + st.session_state.vector_store = FAISS.load_local("vector-store", embeddings, allow_dangerous_deserialization=True) + + st.session_state.qa_chain = setup_qa_chain(st.session_state.vector_store) + + # Query input and processing + st.header("Ask a Question") + query = st.text_input("Enter your question about BPL's database") + + if query: + # Check if vector store and QA chain are initialized + if st.session_state.qa_chain is None: + st.warning("Please load the knowledge base first using the sidebar.") + else: + # Run the query + try: + response = st.session_state.qa_chain({"query": query}) + + # Display answer + st.subheader("Answer") + st.write(response["result"]) + + # Display sources + st.subheader("Sources") + sources = response["source_documents"] + for i, doc in enumerate(sources, 1): + source = doc.metadata["source"] + + abstract = None + + # find the specific source: + for j in range(len(bpl["Data"])): + ID = bpl['Data'][j]["id"] + if doc.metadata['source'] == ID: + abstract = bpl["Data"][j]['attributes']['abstract_tsi'] + break + + with st.expander(f"Source {i}"): + st.write(f"**Content:** {abstract}") + st.write(f"**URL:** https://www.digitalcommonwealth.org/search/{doc.metadata['source']}") + + except Exception as e: + st.error(f"An error occurred: {e}") + +if __name__ == "__main__": + main() From 4808a1dc9c81880aabf924f6cab89a47f58b7e06 Mon Sep 17 00:00:00 2001 From: Brandon Vargus Date: Mon, 16 Dec 2024 19:09:38 -0500 Subject: [PATCH 6/6] added deployment notebook --- Deployment/Deployment.ipynb | 1832 +++++++++++++++++++++++++++++++++++ 1 file changed, 1832 insertions(+) create mode 100644 Deployment/Deployment.ipynb diff --git a/Deployment/Deployment.ipynb b/Deployment/Deployment.ipynb new file mode 100644 index 0000000..a027a64 --- /dev/null +++ b/Deployment/Deployment.ipynb @@ -0,0 +1,1832 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": 
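setup_custom_chain in streamlit-rag-app.py above is an unfinished stub: retriever.invoke() is called without a query and the function returns nothing, so the app only ever uses setup_qa_chain. The sketch below is a guess at the intent rather than the repo's method; it reuses the k=3 retriever and gpt-3.5-turbo model from the file, and the function name and prompt wording are ours.

from langchain_openai import ChatOpenAI

def answer_with_custom_chain(vector_store, query: str, openai_api_key: str):
    """Retrieve the top records, stuff them into a prompt, and ask the LLM directly."""
    retriever = vector_store.as_retriever(search_kwargs={"k": 3})
    docs = retriever.invoke(query)          # the stub omitted the query argument

    context = "\n\n".join(
        f"[{d.metadata.get('source', 'unknown')}]\n{d.page_content}" for d in docs
    )
    llm = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=openai_api_key)
    prompt = (
        "Answer the question using only the catalog records below.\n\n"
        f"{context}\n\nQuestion: {query}"
    )
    return llm.invoke(prompt).content, docs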
"2e7eb6e2-0a1b-42ea-8127-a51f13b4b4b0", + "metadata": {}, + "source": [ + "# LibRAG Deployment Phase" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "3934dc57-c4c3-4892-bcf4-bce4aa5c48af", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import json\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "5a24d843-3692-49d5-8167-af46cf4a1f5a", + "metadata": {}, + "source": [ + "### We are going to ensure that we have our data downloaded from the SCC.\n", + "### We are going to download one interval of the full text, as well as the entire metadata file" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "b50d8c5f-dfea-454f-860e-13315a9c2fea", + "metadata": {}, + "outputs": [], + "source": [ + "# replace with sample_full_text.json\n", + "file = open(\"../EDA Phase/bpl-digital-commonwealth/ft_13_checkpoint_10_133.json\")\n", + "\n", + "full_text = json.load(file)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1e4c5601-648f-4ff1-a5ce-d8823c46f884", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "133\n" + ] + } + ], + "source": [ + "print(len(full_text))" + ] + }, + { + "cell_type": "markdown", + "id": "1148a15c-0965-4613-bdb4-15e74467fd16", + "metadata": {}, + "source": [ + "Here's how to access the text documents:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "da05e8ae-71f6-4ab3-b2fa-0193b77d6262", + "metadata": {}, + "outputs": [], + "source": [ + "print(full_text['commonwealth:w3764603d']['text'])" + ] + }, + { + "cell_type": "markdown", + "id": "bb3f0d7e-a982-4e18-a9eb-783df449ff09", + "metadata": {}, + "source": [ + "### Create Metadata Dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "be5c55ae-c7a5-4aec-9fe5-d79ff6cc0592", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(bpl_metadata['Data'])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "91e29caa-9e83-4ddf-a206-53b66012c48a", + "metadata": {}, + "outputs": [], + "source": [ + "df.drop(columns=df.columns[0], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "7e84b8ed-2f7b-4a1c-bb95-693d9ca8a846", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
typelinksidsystem_create_dtsisystem_modified_dtsicurator_model_ssicurator_model_suffix_ssititle_info_primary_tsigenre_basic_ssimgenre_specific_ssim...storage_key_base_ssidentifier_issn_ssimfrequency_tsicontained_by_ssinote_credits_tsimidentifier_isbn_ssimidentifier_music_publisher_ssimnote_arrangement_tsimtranscription_ark_id_ssitranscription_key_base_ss
0DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68md23x2021-03-04T00:13:09Z2021-09-02T20:40:00ZCurator::DigitalObjectDigitalObjectمن فضلكم توقفوا الأشخاص الذين ارتكبوا أسوأ الج...[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68m844v2021-03-03T23:58:44Z2021-09-02T20:21:32ZCurator::DigitalObjectDigitalObject海员们 : 要警惕航运事故[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68mb49n2021-03-04T00:06:25Z2021-09-02T20:30:29ZCurator::DigitalObjectDigitalObject人間としての尊厳を保てる : 生活賃金を[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68mc60v2021-03-04T00:10:40Z2021-09-02T20:35:20ZCurator::DigitalObjectDigitalObject野火[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68mc72n2021-03-04T00:11:07Z2021-09-02T20:35:52ZCurator::DigitalObjectDigitalObject野火[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
5DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68mc9922021-03-04T00:12:14Z2021-09-02T20:36:59ZCurator::DigitalObjectDigitalObject團結 抗強權[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
6DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68m804w2021-03-03T23:57:00Z2021-09-02T20:19:35ZCurator::DigitalObjectDigitalObject大队的夜晩 (年画) 史惠芳作[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
7DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth-oai:xp68m83652021-03-03T23:58:27Z2021-09-02T20:21:12ZCurator::DigitalObjectDigitalObjectморякам[Posters][Political posters]...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
8DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth:8k71nz9662015-09-14T22:06:01Z2022-07-08T19:59:21ZCurator::DigitalObjectDigitalObjectA[Prints]NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
9DigitalObject{'self': 'https://www.digitalcommonwealth.org/...commonwealth:8k71p000r2015-09-14T22:06:33Z2022-07-08T19:59:21ZCurator::DigitalObjectDigitalObjectE[Prints]NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

10 rows × 140 columns

\n", + "
" + ], + "text/plain": [ + " type links \\\n", + "0 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", + "1 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", + "2 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", + "3 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", + "4 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", + "5 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", + "6 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", + "7 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", + "8 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", + "9 DigitalObject {'self': 'https://www.digitalcommonwealth.org/... \n", + "\n", + " id system_create_dtsi system_modified_dtsi \\\n", + "0 commonwealth-oai:xp68md23x 2021-03-04T00:13:09Z 2021-09-02T20:40:00Z \n", + "1 commonwealth-oai:xp68m844v 2021-03-03T23:58:44Z 2021-09-02T20:21:32Z \n", + "2 commonwealth-oai:xp68mb49n 2021-03-04T00:06:25Z 2021-09-02T20:30:29Z \n", + "3 commonwealth-oai:xp68mc60v 2021-03-04T00:10:40Z 2021-09-02T20:35:20Z \n", + "4 commonwealth-oai:xp68mc72n 2021-03-04T00:11:07Z 2021-09-02T20:35:52Z \n", + "5 commonwealth-oai:xp68mc992 2021-03-04T00:12:14Z 2021-09-02T20:36:59Z \n", + "6 commonwealth-oai:xp68m804w 2021-03-03T23:57:00Z 2021-09-02T20:19:35Z \n", + "7 commonwealth-oai:xp68m8365 2021-03-03T23:58:27Z 2021-09-02T20:21:12Z \n", + "8 commonwealth:8k71nz966 2015-09-14T22:06:01Z 2022-07-08T19:59:21Z \n", + "9 commonwealth:8k71p000r 2015-09-14T22:06:33Z 2022-07-08T19:59:21Z \n", + "\n", + " curator_model_ssi curator_model_suffix_ssi \\\n", + "0 Curator::DigitalObject DigitalObject \n", + "1 Curator::DigitalObject DigitalObject \n", + "2 Curator::DigitalObject DigitalObject \n", + "3 Curator::DigitalObject DigitalObject \n", + "4 Curator::DigitalObject DigitalObject \n", + "5 Curator::DigitalObject DigitalObject \n", + "6 Curator::DigitalObject DigitalObject \n", + "7 Curator::DigitalObject DigitalObject \n", + "8 Curator::DigitalObject DigitalObject \n", + "9 Curator::DigitalObject DigitalObject \n", + "\n", + " title_info_primary_tsi genre_basic_ssim \\\n", + "0 من فضلكم توقفوا الأشخاص الذين ارتكبوا أسوأ الج... [Posters] \n", + "1 海员们 : 要警惕航运事故 [Posters] \n", + "2 人間としての尊厳を保てる : 生活賃金を [Posters] \n", + "3 野火 [Posters] \n", + "4 野火 [Posters] \n", + "5 團結 抗強權 [Posters] \n", + "6 大队的夜晩 (年画) 史惠芳作 [Posters] \n", + "7 морякам [Posters] \n", + "8 A [Prints] \n", + "9 E [Prints] \n", + "\n", + " genre_specific_ssim ... storage_key_base_ss identifier_issn_ssim \\\n", + "0 [Political posters] ... NaN NaN \n", + "1 [Political posters] ... NaN NaN \n", + "2 [Political posters] ... NaN NaN \n", + "3 [Political posters] ... NaN NaN \n", + "4 [Political posters] ... NaN NaN \n", + "5 [Political posters] ... NaN NaN \n", + "6 [Political posters] ... NaN NaN \n", + "7 [Political posters] ... NaN NaN \n", + "8 NaN ... NaN NaN \n", + "9 NaN ... 
NaN NaN \n", + "\n", + " frequency_tsi contained_by_ssi note_credits_tsim identifier_isbn_ssim \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "5 NaN NaN NaN NaN \n", + "6 NaN NaN NaN NaN \n", + "7 NaN NaN NaN NaN \n", + "8 NaN NaN NaN NaN \n", + "9 NaN NaN NaN NaN \n", + "\n", + " identifier_music_publisher_ssim note_arrangement_tsim \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "5 NaN NaN \n", + "6 NaN NaN \n", + "7 NaN NaN \n", + "8 NaN NaN \n", + "9 NaN NaN \n", + "\n", + " transcription_ark_id_ssi transcription_key_base_ss \n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "5 NaN NaN \n", + "6 NaN NaN \n", + "7 NaN NaN \n", + "8 NaN NaN \n", + "9 NaN NaN \n", + "\n", + "[10 rows x 140 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_attributes = pd.json_normalize(df['attributes'])\n", + "df_attributes = pd.concat([df.drop(columns=['attributes']), df_attributes], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "eed31425-0d0d-46e7-ae69-45637110e2c7", + "metadata": {}, + "outputs": [], + "source": [ + "df_attributes.to_csv(\"metadata_attributes.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "ce755e72-348e-45b3-8cb4-7e0202818e16", + "metadata": {}, + "source": [ + "### Optionally, read the csv if it is already downloaded" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "49e1b410-a7b0-4ef3-bbb8-2c240e568a73", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/ipykernel_8401/3787498575.py:1: DtypeWarning: Columns (10,16,17,18,20,21,22,23,24,25,27,29,33,34,36,41,42,43,44,54,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,128,129,130,131,132,133,134,135,136,137,138,139,140) have mixed types. 
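The "Create Metadata Dataframe" cells above use bpl_metadata, which is never loaded anywhere in this notebook, so pd.DataFrame(bpl_metadata['Data']) raises a NameError on a fresh kernel. A sketch of the missing load step is below; the bpl_data.json filename is an assumption borrowed from streamlit-rag-app.py (which reads a file with the same {'Data': [...]} layout), and low_memory=False is the fix suggested by the DtypeWarning above when re-reading the saved CSV.

import json
import pandas as pd

# Assumed filename; substitute the metadata dump actually pulled from the SCC.
with open("bpl_data.json") as f:
    bpl_metadata = json.load(f)

df = pd.DataFrame(bpl_metadata["Data"])
df_attributes = pd.concat(
    [df.drop(columns=["attributes"]), pd.json_normalize(df["attributes"])], axis=1
)

# If metadata_attributes.csv already exists, reload it without the mixed-type warning:
# df_attributes = pd.read_csv("metadata_attributes.csv", low_memory=False)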
Specify dtype option on import or set low_memory=False.\n", + " df_attributes = pd.read_csv(\"metadata_attributes.csv\")\n" + ] + } + ], + "source": [ + "df_attributes = pd.read_csv(\"metadata_attributes.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "1c6d1c2d-16ad-43a0-844a-72238901c07d", + "metadata": {}, + "source": [ + "### Turn full text into Documents type" + ] + }, + { + "cell_type": "markdown", + "id": "c74fe245-886d-4e9b-a554-77308c75e44b", + "metadata": {}, + "source": [ + "Now we will make Document objects with the important metadata attributes" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "15d83b93-6840-4618-ac40-1c7de9e5cc1e", + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "def get_title(text):\n", + " match = re.search(r'\\d+\\s+(.+?)\\n', text)\n", + "\n", + " # Extracting and printing the title if there's a match\n", + " if match:\n", + " title = match.group(1)\n", + " return title" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c5947005-6e02-4130-9d4e-5487cd389dd5", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.schema import Document\n", + "documents = []\n", + "for doc in full_text:\n", + " title = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"title_info_primary_tsi\"]))\n", + " ID = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"id\"]))\n", + " abstract = str(df_attributes.loc[df_attributes[\"id\"] == doc, \"abstract_tsi\"])\n", + " title_subtitle = str(df_attributes.loc[df_attributes[\"id\"] == doc, \"title_info_primary_subtitle_tsi\"])\n", + "\n", + " documents.append(Document(page_content=\"\", metadata={'title':title, 'ID':ID, 'abstract':abstract, 'title_subtitle':title_subtitle}))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "6c3f5614-cc45-4fb4-9b13-1fd0361da1b6", + "metadata": {}, + "outputs": [], + "source": [ + "# Turn the BPL data into a Document\n", + "from langchain.schema import Document\n", + "\n", + "documents = []\n", + "\n", + "for doc in full_text:\n", + " # Extract metadata fields and apply get_title()\n", + " title = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"title_info_primary_tsi\"]))\n", + " title_subtitle = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"title_info_primary_subtitle_tsi\"]))\n", + " title_alt = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"title_info_alternative_tsim\"]))\n", + " abstract = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"abstract_tsi\"]))\n", + " subject_facet = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"subject_facet_ssim\"]))\n", + " subject_geographic = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"subject_geographic_sim\"]))\n", + " genre = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"genre_basic_ssim\"]))\n", + " genre_specific = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"genre_specific_ssim\"]))\n", + " name_facet = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"name_facet_ssim\"]))\n", + " name_role = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"name_role_tsim\"]))\n", + " date_human = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"date_tsim\"]))\n", + " date_start = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"date_start_dtsi\"]))\n", + " date_end = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"date_end_dtsi\"]))\n", + 
" publisher = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"publisher_tsi\"]))\n", + " collection_name = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"collection_name_ssim\"]))\n", + " physical_location = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"physical_location_ssim\"]))\n", + " related_item_host = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"related_item_host_ssim\"]))\n", + " type_of_resource = get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"type_of_resource_ssim\"]))\n", + " URL = \"https://www.digitalcommonwealth.org/search/\" + get_title(str(df_attributes.loc[df_attributes[\"id\"] == doc, \"id\"]))\n", + " \n", + " # Create Document with metadata\n", + " documents.append(Document(\n", + " page_content=\"\",\n", + " metadata={\n", + " \"title\": title,\n", + " \"subtitle\": title_subtitle,\n", + " \"title_alt\": title_alt,\n", + " \"abstract\": abstract,\n", + " \"subject_facet\": subject_facet,\n", + " \"subject_geographic\": subject_geographic,\n", + " \"genre\": genre,\n", + " \"genre_specific\": genre_specific,\n", + " \"name_facet\": name_facet,\n", + " \"name_role\": name_role,\n", + " \"date_human\": date_human,\n", + " \"date_start\": date_start,\n", + " \"date_end\": date_end,\n", + " \"publisher\": publisher,\n", + " \"collection_name\": collection_name,\n", + " \"physical_location\": physical_location,\n", + " \"related_item_host\": related_item_host,\n", + " \"type_of_resource\": type_of_resource,\n", + " \"URL\": URL\n", + " }\n", + " ))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "b58b4530-27e4-4ed4-80be-4ee240892480", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['title', 'subtitle', 'title_alt', 'abstract', 'subject_facet', 'subject_geographic', 'genre', 'genre_specific', 'name_facet', 'name_role', 'date_human', 'date_start', 'date_end', 'publisher', 'collection_name', 'physical_location', 'related_item_host', 'type_of_resource', 'URL'])" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents[-1].metadata.keys()" + ] + }, + { + "cell_type": "markdown", + "id": "6580d4c1-9ab0-44c8-9ea7-cbeb80934f4b", + "metadata": {}, + "source": [ + "# RAG Pipeline" + ] + }, + { + "cell_type": "markdown", + "id": "895258be-7bf4-4d8d-8f5a-79d80dfe8c36", + "metadata": {}, + "source": [ + "### Using FAISS Vector Store" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1b3815e-dec5-4f22-b863-d91a57b5ccad", + "metadata": {}, + "outputs": [], + "source": [ + "#from langchain_ollama import OllamaEmbeddings\n", + "from langchain_openai import OpenAIEmbeddings\n", + "import faiss\n", + "from langchain_community.docstore.in_memory import InMemoryDocstore\n", + "from langchain_community.vectorstores import FAISS\n", + "from langchain.embeddings import HuggingFaceEmbeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "03554a37-d142-45eb-be33-a333929b927d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/ipykernel_7740/4140823313.py:2: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-huggingface package and should be used instead. 
To use it run `pip install -U :class:`~langchain-huggingface` and import as `from :class:`~langchain_huggingface import HuggingFaceEmbeddings``.\n", + " embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", + "2024-11-26 15:04:19.450227: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + ] + } + ], + "source": [ + "#embeddings = OpenAIEmbeddings(model=\"text-embedding-3-large\", dimensions=3072)\n", + "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "0bea78d8-4b72-4da0-a50d-89e0fe509454", + "metadata": {}, + "outputs": [], + "source": [ + "index = faiss.IndexFlatL2(len(embeddings.embed_query(\"hello world\")))\n", + "\n", + "vector_store = FAISS(\n", + " embedding_function=embeddings,\n", + " docstore=InMemoryDocstore(),\n", + " index=index,\n", + " index_to_docstore_id={},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "b1bee292-778a-480e-8eb3-5cc37587ce85", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['809ac718-6a36-4ddc-adc7-adb0f7816a47',\n", + " '2c391b69-896b-4a2f-bf9e-00872a5f31ff',\n", + " 'f548658e-65eb-4653-95c1-485c8176cbf5',\n", + " 'e1fc2f95-c575-46f1-b323-13bdd0da1da4',\n", + " '5049ac41-e24c-43d4-90c4-950ba120fa9c',\n", + " 'd664e558-6dc3-4607-86f1-32868c9c9f47',\n", + " '7c3ad58c-a311-422c-afc6-8217f301a6cd',\n", + " 'cf0542b5-a070-4567-8159-ee948a77fe8e',\n", + " '26c38bfc-243d-499d-8b3c-e5d71aebf588',\n", + " 'a6658e4b-0c19-400c-a682-52b6de933d8e',\n", + " 'dbd05713-bbde-43b9-a89a-dd7a815317a4',\n", + " '430bd79f-7a37-4f72-8e32-7db7d2c039dc',\n", + " '5f2c9a44-855c-4f25-813b-76bf204c4d8b',\n", + " '84ae00a7-9673-4adb-8cc2-6d53774be25a',\n", + " 'dd3f5c10-6ff2-4a7a-b16a-5c2ae4a46cba',\n", + " 'bf6e1bf8-4373-4dfb-9205-c5bf53eb76f8',\n", + " '6aec476a-2550-45bd-9aee-012e913a074e',\n", + " '82c725c6-7cdd-430e-b900-f783034adfb2',\n", + " '45ff73db-3396-4c9a-8e87-448fe4f8d5ac',\n", + " '43a9a36d-ba1d-4a5f-993c-0328c25996e8',\n", + " '82970ba9-e8c7-490d-9013-c5cf62c7f60f',\n", + " '9a1633ba-90d5-4374-a578-c99351086246',\n", + " 'dd67de24-51b7-4ea5-b453-f8e52f209a38',\n", + " 'dfda00dc-e531-44d0-8f65-69cbb9db459b',\n", + " 'd3a06fb7-1ed1-44ee-8722-04e50fe8c05d',\n", + " '72af4cf3-287f-49cc-bba9-72b5cc4a5a0a',\n", + " '95e46dae-ee37-4f0c-818f-83ad28bc9a8e',\n", + " 'a19ce574-47af-4dbf-a136-77d308ff00d6',\n", + " '75762289-e5d0-420e-9754-f800153f82a5',\n", + " '384c7132-6755-4abf-ac1f-05e7b80f329a',\n", + " '63f2ed10-4299-442b-b528-5a575d72e7d6',\n", + " 'f2b3305b-5847-4039-affa-351130bce422',\n", + " '511db69b-a7bb-49b3-903f-2474db1e8bed',\n", + " 'c565e6f2-d42b-4fa4-873b-3214ccc8359e',\n", + " '45aa23c9-e1b7-4b7e-9e84-8e29a52ce6ff',\n", + " 'fa017f87-1b7b-46ed-b111-2dcb296ec68e',\n", + " 'bcb30bbc-a09d-47dc-a3d7-8162f231d006',\n", + " 'bbbc81f1-b06b-4636-ac44-cb4486af15f0',\n", + " '384b77cf-a6c4-4e30-9358-4d77b9592fb4',\n", + " '88151c65-9d1c-4b48-bc85-93d817936aba',\n", + " '2b8afaaa-9094-4d0e-b671-53f8018a03af',\n", + " '59b0d7f3-2e77-469c-9cf7-82be345d7472',\n", + " '0a91fa62-95ea-4cb7-ba1c-b95453b7df31',\n", + " 'aac0cf31-ad27-4148-9149-11a0fb65c274',\n", + " '504e49be-1436-4034-8af3-261a454a5363',\n", + " 
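The LangChainDeprecationWarning printed above already names the replacement: install langchain-huggingface and import HuggingFaceEmbeddings from there, which is what streamlit-rag-app.py in this repo does. A two-line sketch, assuming the package is installed with pip install -U langchain-huggingface:

# Replaces: from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")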
'd1ebbf37-979c-4343-a69b-b292f01e2cc4',\n", + " 'd188aac2-bbad-4359-b53c-74282a4017c4',\n", + " '3f21253c-f928-40be-a0a4-a2b182a9f578',\n", + " 'aa7f8bac-6385-4078-8c10-30b7a99ff032',\n", + " '774dffec-5096-4b4d-aa79-d0be8f16f3d9',\n", + " '289f7e36-9700-4661-ad80-7926052ed1f3',\n", + " 'dccf192d-f623-446e-b9be-6d6df98f2fc7',\n", + " 'cbb848d4-196e-4108-9963-fcccb4a8b513',\n", + " 'b796c0f9-c883-47c3-afd5-0b16d56ca2f5',\n", + " '718c6742-e9b7-45e4-9c89-95727baa2cb3',\n", + " '1b2c307a-b713-474e-93e7-c40f9bc57eab',\n", + " '9ac86b88-fdeb-4c5c-b88c-ba143167b584',\n", + " '30ce78f2-0a09-4f35-abb2-ae48bf2da3f0',\n", + " 'ee6bafe4-9557-401e-9e8e-cad451b8cba0',\n", + " '4df4c508-e67a-4e8b-9446-22996c051538',\n", + " '608d862f-243a-4686-83a0-7726850a7703',\n", + " '28622e79-6257-46ad-8d0b-227078b0192a',\n", + " '797a8ec5-dd36-4892-9e64-8bdc912f1f08',\n", + " '17e43ec9-b702-4f33-a5fc-014ac988d05f',\n", + " 'daccbc57-bcfa-479a-abe7-367d07c93b26',\n", + " '0a3ecebf-afdc-4124-9bac-8132c2b4958a',\n", + " 'a5bdca3f-cc24-470f-b49a-54c435de582a',\n", + " 'f0ccf63a-03f4-4c44-8a90-cc31b68fbf37',\n", + " '0a9427d3-7e33-4068-91c0-95d7bf2484cf',\n", + " 'e4778209-1a5a-45ff-8c1e-f94f839255f1',\n", + " '461c81b5-9fa3-4622-9a10-af8664fee897',\n", + " '850b68b2-eab6-412a-9bc5-c9e971533891',\n", + " '9fb4e8f9-3e0c-4b0e-8365-62bf44fab2bf',\n", + " '9fc852eb-c189-43b7-a61c-358d6c8e6c93',\n", + " 'a5702378-e662-4bd3-be67-4e587495f41f',\n", + " '9ba0b5af-9c8c-4d58-ba9a-bff26ca2571f',\n", + " '53e83397-0b6c-4f6f-8ebf-c7dee136db51',\n", + " 'bc09b5e1-76a2-4904-9959-680a23bf98e4',\n", + " '236cbaf3-c36b-44dd-afbe-6fe396a12f0f',\n", + " 'a9ad4fbe-6366-451a-ae3e-b4fee2bf320f',\n", + " 'ccfcfd3e-ef70-42cd-95f9-c628bfae7747',\n", + " '2f81758a-d8c8-4a1b-852c-67d9ad8f6a54',\n", + " 'f04d8ace-8666-4ec7-a095-a7cc42fca152',\n", + " '18906e6f-a1ea-4bc9-8ce9-11884e320c1c',\n", + " '9077b482-c838-498f-a290-7c38f0df520e',\n", + " 'dd0fe10e-5954-4a87-b0e7-47eb5ed17635',\n", + " '689cc11b-9a6e-4c9f-a4ef-59dd473cae7c',\n", + " '9e58c8ee-ef08-4aba-9cca-92e505503f89',\n", + " '683848ad-8c37-403c-9992-38a219d2239c',\n", + " '9e705e7a-99e4-4914-803c-aefa0f40edbd',\n", + " '717d8e41-250d-47e0-b723-7224fed80f22',\n", + " '8a9efc67-579f-4dcf-ac0e-caeb80245052',\n", + " '2c08568f-ce5d-4ecd-87a6-9baec410cb39',\n", + " 'ef73a66a-32ab-4945-b644-6db8c29562be',\n", + " 'e9da66a8-13d7-40d2-ae36-1689770173c3',\n", + " '99b918bb-4e37-4a46-b8f8-9167fe95e618',\n", + " '8b589cf8-286b-4836-bfe7-909947401920',\n", + " 'd7b6853c-772e-4621-ac41-543ea43ace9d',\n", + " 'c9e79c0d-966c-4e01-b962-ac893773ad75',\n", + " '2e8a083e-6307-4428-83cd-65b436cebbf9',\n", + " '932c94ed-36b5-40e7-a270-7bea414ad537',\n", + " '65c85bd7-0338-4e1a-954f-794f8bf85902',\n", + " '7c438b8a-f7e7-4d0b-b376-9a2194a1246c',\n", + " '6a5616f9-075c-47d0-8531-2a7dab39bd57',\n", + " '59d6b5e3-b2a6-4aa5-bad3-60fa38236e86',\n", + " '93d6af54-7141-4616-8894-e5440446f3df',\n", + " 'd1c00188-e315-41a4-bb97-e3c37b7d2beb',\n", + " '9d6fb7c5-b366-4516-9d36-be3854360df3',\n", + " '6a5eac17-180c-474e-927a-66b2c0f880e3',\n", + " '3a68ad92-8ab8-44f3-8f1c-1c852575620b',\n", + " '0346130f-92a6-4d52-b746-741adfe8e604',\n", + " 'dcf5b04b-5990-40ac-9685-d3f67c896ba4',\n", + " '93a12904-61bd-4f2a-af50-691be41d14b5',\n", + " '8dc1d6b6-af25-42f7-b2bb-77dcfc1127ee',\n", + " 'e301f0dc-5399-4c8c-bf10-a87f344015cc',\n", + " 'b6a26a83-e757-4b57-bcd3-6e7bd9e5b1bf',\n", + " '6013f6a2-c398-46c6-954b-53882646f78b',\n", + " '96e4c7f7-5a57-4815-810a-5e3646704397',\n", + " '9f48b86c-becd-44de-989f-540764d8299b',\n", + " 
'59b136fa-4b48-45ed-b951-fc514085e95d',\n", + " 'ce095f0f-f663-423e-a289-32657bc386a9',\n", + " '584ccb6f-e675-4879-a439-a68c17afd92d',\n", + " '455466cd-ce88-4faa-966d-955b3144611e',\n", + " '3527b28d-f65e-4a90-ac86-aac1ec29aea2',\n", + " 'fc59f9d9-e2c6-4a6c-becd-64cb9fdc120f',\n", + " '32df0391-7b16-4f60-8a22-08ab3aaa6777',\n", + " '426c06c3-5550-46dd-bf4c-dc91c30de60a',\n", + " '146a4059-7fb6-4a7b-8665-2bf2b2805d3e',\n", + " '321bb4f3-eb98-4742-beb0-70b4496a8ab7',\n", + " 'b39d2d7c-4eea-4df1-85fd-eb286dee41f3',\n", + " 'c858d614-98f3-4c96-9be1-5ba3088643b8',\n", + " '355a6b60-f7b1-4d68-96da-45d03b1f1bb5',\n", + " '6634ee11-e63f-46f1-82f1-5d596ba4f832']" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from uuid import uuid4\n", + "\n", + "uuids = [str(uuid4()) for _ in range(len(documents))]\n", + "\n", + "vector_store.add_documents(documents=documents, ids=uuids)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "0c3a0a6b-4640-4036-863e-77a1a715a0e3", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'list' object has no attribute 'invoke'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[33], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m retriever \u001b[38;5;241m=\u001b[39m vector_store\u001b[38;5;241m.\u001b[39msimilarity_search_with_score(\n\u001b[1;32m 2\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mManuscripts\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 3\u001b[0m k\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m3\u001b[39m\n\u001b[1;32m 4\u001b[0m )\n\u001b[0;32m----> 5\u001b[0m r \u001b[38;5;241m=\u001b[39m retriever\u001b[38;5;241m.\u001b[39minvoke(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mJohn Bishop Estlin\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 6\u001b[0m r_list \u001b[38;5;241m=\u001b[39m [x\u001b[38;5;241m.\u001b[39mmetadata[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mURL\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m r]\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28mprint\u001b[39m(r_list)\n", + "\u001b[0;31mAttributeError\u001b[0m: 'list' object has no attribute 'invoke'" + ] + } + ], + "source": [ + "retriever = vector_store.similarity_search_with_score(\n", + " \"Manuscripts\",\n", + " k=3\n", + ")\n", + "r = retriever.invoke(\"John Bishop Estlin\")\n", + "r_list = [x.metadata[\"URL\"] for x in r]\n", + "print(r_list)" + ] + }, + { + "cell_type": "markdown", + "id": "71698bba-0e8d-4bbc-8412-81c225c8cb6e", + "metadata": {}, + "source": [ + "Save and load the vector store" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "273778f9-cb7d-4bb9-acd0-16841adf8344", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'metadata' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[13], line 12\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;66;03m# Save metadata\u001b[39;00m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m 
\u001b[38;5;28mopen\u001b[39m(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(store_dir, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmetadata.pkl\u001b[39m\u001b[38;5;124m\"\u001b[39m), \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwb\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[0;32m---> 12\u001b[0m pickle\u001b[38;5;241m.\u001b[39mdump(metadata, f)\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFAISS index and metadata stored in \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstore_dir\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mNameError\u001b[0m: name 'metadata' is not defined" + ] + } + ], + "source": [ + "vector_store.save_local(\"faiss_index\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "98740c69-5687-4b0c-a4ae-4233208f6e22", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings import HuggingFaceEmbeddings\n", + "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", + "\n", + "vector_store = FAISS.load_local(\n", + " \"faiss_index\", embeddings, allow_dangerous_deserialization=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "d3a257cc-c687-4fc5-91a7-0c3048a49e70", + "metadata": {}, + "source": [ + "### Now for the Reranking Step:" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "013d5fca-c621-4a7e-ac0e-ad5c9aba577f", + "metadata": {}, + "outputs": [], + "source": [ + "weights = {\n", + " \"title\": 1000,\n", + " \"subtitle\": 500,\n", + " \"title_alt\": 500,\n", + " \"abstract\": 30,\n", + " \"subject_facet\": 1,\n", + " \"subject_geographic\": 1,\n", + " \"genre\": 1,\n", + " \"genre_specific\": 1,\n", + " \"name_facet\": 1,\n", + " \"name_role\": 1,\n", + " \"date_human\": 1,\n", + " \"date_start\": 1,\n", + " \"date_end\": 1,\n", + " \"publisher\": 1,\n", + " \"collection_name\": 1,\n", + " \"physical_location\": 1,\n", + " \"related_item_host\": 1,\n", + " \"type_of_resource\": 1,\n", + " \"URL\": 0.0\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "cc1f04d1-f553-46ae-bfc6-248125f62423", + "metadata": {}, + "outputs": [], + "source": [ + "from sentence_transformers import SentenceTransformer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "import numpy as np\n", + "from langchain.embeddings import HuggingFaceEmbeddings\n", + "\n", + "model = SentenceTransformer('all-MiniLM-L6-v2')\n", + "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", + "\n", + "def compute_relevance_score(metadata_value, query):\n", + " \"\"\"\n", + " Compute cosine similarity between the query and a metadata value using sentence-transformers.\n", + "\n", + " Args:\n", + " metadata_value (str): The metadata value to compare.\n", + " query (str): The query string.\n", + "\n", + " Returns:\n", + " float: Cosine similarity score (between 0 and 1).\n", + " \"\"\"\n", + " if not metadata_value or not query:\n", + " return 0 # Return 0 if either the metadata or query is empty\n", + " \n", + " # Encode the metadata value and query into embeddings\n", + " embeddings = model.encode([metadata_value, query], convert_to_tensor=False) # Convert to NumPy\n", + " metadata_embedding, query_embedding = embeddings\n", + "\n", + " # Compute cosine similarity\n", + " similarity = cosine_similarity([metadata_embedding], [query_embedding])\n", + 
" return similarity[0][0] # Extract the scalar similarity value\n", + "\n", + "\n", + "\n", + "def rerank_documents(query, weights, vector_store, k=10):\n", + " \"\"\"\n", + " Rerank documents based on metadata relevance scores and FAISS vector similarity scores.\n", + "\n", + " Args:\n", + " documents (list): List of Document objects.\n", + " query (str): The query string used for retrieval.\n", + " weights (dict): Weights for each metadata field.\n", + " vector_store (str): The vector store itself to get the similarity score\n", + "\n", + " Returns:\n", + " list: Reranked documents in descending order of relevance.\n", + " \"\"\"\n", + "\n", + " reranked_results = []\n", + " total = sum(weights.values())\n", + " # returns the relevant documents from the query\n", + " returned_docs = vector_store.similarity_search_with_score(query, k)\n", + " for doc in returned_docs:\n", + " final_score = 0\n", + " # Add weighted relevance scores for each metadata field\n", + " for field, weight in weights.items():\n", + " metadata_value = doc[0].metadata.get(field, \"\") # Safely get metadata field value\n", + " relevance_score = compute_relevance_score(metadata_value, query)\n", + " #print(f\"relevance_score: {relevance_score}\")\n", + " final_score += (weight * relevance_score) \n", + "\n", + " reranked_results.append((doc, final_score / total))\n", + "\n", + " # Sort documents by the final score in descending order\n", + " reranked_results.sort(key=lambda x: x[1], reverse=True)\n", + " return [(doc, score) for doc, score in reranked_results]\n", + "\n", + "\n", + "docs = rerank_documents(\"Newspaper\", weights, vector_store)" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "2d9172aa-6c15-4c90-856e-d0ee53100721", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('Thanksgiving', 'https://www.digitalcommonwealth.org/search/commonwealth:02876465m', 'Reranked score: 0.21417763510649684')\n", + "('Thanksgiving', 'https://www.digitalcommonwealth.org/search/commonwealth:jd478671b', 'Reranked score: 0.21414245699337317')\n", + "('Thanks for high school articles', 'https://www.digitalcommonwealth.org/search/commonwealth:8910r4424', 'Reranked score: 0.20574750553692622')\n", + "('T.H. Jones, Charlotte, N.C., autograph letter ...', 'https://www.digitalcommonwealth.org/search/commonwealth:w3764603d', 'Reranked score: 0.16508627951932514')\n", + "('Theodore C. Tharin, Charleston, S.C., autograp...', 'https://www.digitalcommonwealth.org/search/commonwealth:9k41zk061', 'Reranked score: 0.14404614119555387')\n", + "('T.H. Marshall, Graniteville, S.C., autograph l...', 'https://www.digitalcommonwealth.org/search/commonwealth:ws859m407', 'Reranked score: 0.14002441968440776')\n", + "('Theodore C. Tharin, Grumesville, S.C., autogra...', 'https://www.digitalcommonwealth.org/search/commonwealth:w3764352q', 'Reranked score: 0.138896407891218')\n", + "('Theodore C. Tharin, Grumesville, S.C. [?], aut...', 'https://www.digitalcommonwealth.org/search/commonwealth:w3764306m', 'Reranked score: 0.13877071803271654')\n", + "('Theodore C. Tharin, Grumesville, S.C., autogra...', 'https://www.digitalcommonwealth.org/search/commonwealth:w3764957b', 'Reranked score: 0.1382296937873219')\n", + "('\"The Refuge of Oppression,\" from David S. 
Gran...', 'https://www.digitalcommonwealth.org/search/commonwealth:dv1441451', 'Reranked score: 0.12401763978186418')\n" + ] + } + ], + "source": [ + "#print([docs[i].metadata['title'] for i in range(len(docs))])\n", + "docs_list = [(docs[i][0][0].metadata['title'], docs[i][0][0].metadata['URL'], f\"Reranked score: {docs[i][1]}\") for i in range(len(docs))]\n", + "docs_list.sort(key=lambda x: x[2], reverse=True)\n", + "for doc in docs_list:\n", + " print(doc)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "83c1c3f2-91b4-4647-b93c-09a0af5b43a8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100\n" + ] + } + ], + "source": [ + "print(len(docs_list))" + ] + }, + { + "cell_type": "markdown", + "id": "07cdf844-72c6-41ef-bade-9afb52bceed8", + "metadata": {}, + "source": [ + "Immediately we get much better performance because now only Newspapers are returned." + ] + }, + { + "cell_type": "markdown", + "id": "18719878-92c6-458c-ae81-21d9fe5f0bd8", + "metadata": {}, + "source": [ + "# Implementing Different Vector Store and Embedding Combos" + ] + }, + { + "cell_type": "markdown", + "id": "beaf9e61-3bea-4c31-9710-532a306d1023", + "metadata": {}, + "source": [ + "### Pinecone Vector Store w/OLlama Embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "28f8e253-cf14-4b37-8e54-bf114514ac60", + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install langchain-pinecone pinecone-notebooks\n", + "#!pip install pinecone-client" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ab3fb46d-267e-4815-b823-e978d4bf3edf", + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Enter your Pinecone API key: ········\n" + ] + } + ], + "source": [ + "import getpass\n", + "import os\n", + "import time\n", + "\n", + "from pinecone import Pinecone, ServerlessSpec\n", + "\n", + "if not os.getenv(\"PINECONE_API_KEY\"):\n", + " os.environ[\"PINECONE_API_KEY\"] = getpass.getpass(\"Enter your Pinecone API key: \")\n", + "\n", + "pinecone_api_key = os.environ.get(\"PINECONE_API_KEY\")\n", + "\n", + "pc = Pinecone(api_key=pinecone_api_key)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "0a6c480d-f2cd-4732-943a-8cd9d66417e2", + "metadata": {}, + "outputs": [], + "source": [ + "# setting up the index name\n", + "import time\n", + "\n", + "index_name = \"librag1\" # change if desired\n", + "\n", + "existing_indexes = [index_info[\"name\"] for index_info in pc.list_indexes()]\n", + "\n", + "if index_name not in existing_indexes:\n", + " pc.create_index(\n", + " name=index_name,\n", + " dimension=768,\n", + " metric=\"cosine\",\n", + " spec=ServerlessSpec(cloud=\"aws\", region=\"us-east-1\"),\n", + " )\n", + " while not pc.describe_index(index_name).status[\"ready\"]:\n", + " time.sleep(1)\n", + "\n", + "index = pc.Index(index_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "5c791f22-4909-447f-a1fe-ebc09a9bbe11", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_ollama import OllamaEmbeddings\n", + "from langchain.embeddings import HuggingFaceEmbeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "4aec6567-7950-4de2-8600-fe987f47a24a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/ipykernel_2752/1630880338.py:5: LangChainDeprecationWarning: The class 
`HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-huggingface package and should be used instead. To use it run `pip install -U :class:`~langchain-huggingface` and import as `from :class:`~langchain_huggingface import HuggingFaceEmbeddings``.\n", + " embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", + "2024-11-14 14:26:19.674480: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + ] + } + ], + "source": [ + "# embeddings = OllamaEmbeddings(\n", + "# model=\"llama3\",\n", + "# )\n", + "\n", + "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "f1ad1f25-69d2-4eb1-9f85-c7b0ccf13a53", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_pinecone import PineconeVectorStore\n", + "vector_store = PineconeVectorStore(index=index, embedding=embeddings)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "2a4b12ac-87bd-4ae3-90cc-4f2c6644256a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'1165601 Terms for disposal of woman.\\nName: abstract_tsi, dtype: object'" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents[18].metadata['abstract']" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "e80decf8-6ee5-48a2-bb39-5ea97ceaf7e2", + "metadata": {}, + "outputs": [], + "source": [ + "from uuid import uuid4\n", + "uuids = [str(uuid4()) for _ in range(len(documents))]" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "18bcd3e7-07f6-4a66-a629-126ffe340966", + "metadata": {}, + "outputs": [ + { + "ename": "PineconeApiException", + "evalue": "(400)\nReason: Bad Request\nHTTP response headers: HTTPHeaderDict({'Date': 'Thu, 14 Nov 2024 19:37:34 GMT', 'Content-Type': 'application/json', 'Content-Length': '116', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '155', 'x-pinecone-request-id': '2161036493328920558', 'x-envoy-upstream-service-time': '37', 'server': 'envoy'})\nHTTP response body: {\"code\":3,\"message\":\"Metadata size is 869788 bytes, which exceeds the limit of 40960 bytes per vector\",\"details\":[]}\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mPineconeApiException\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[44], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m vector_store\u001b[38;5;241m.\u001b[39madd_documents(documents\u001b[38;5;241m=\u001b[39mdocuments[\u001b[38;5;241m0\u001b[39m:\u001b[38;5;241m15\u001b[39m], ids\u001b[38;5;241m=\u001b[39muuids)\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/langchain_core/vectorstores/base.py:287\u001b[0m, in \u001b[0;36mVectorStore.add_documents\u001b[0;34m(self, documents, **kwargs)\u001b[0m\n\u001b[1;32m 285\u001b[0m texts \u001b[38;5;241m=\u001b[39m [doc\u001b[38;5;241m.\u001b[39mpage_content \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m documents]\n\u001b[1;32m 286\u001b[0m metadatas 
\u001b[38;5;241m=\u001b[39m [doc\u001b[38;5;241m.\u001b[39mmetadata \u001b[38;5;28;01mfor\u001b[39;00m doc \u001b[38;5;129;01min\u001b[39;00m documents]\n\u001b[0;32m--> 287\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39madd_texts(texts, metadatas, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 288\u001b[0m msg \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 289\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m`add_documents` and `add_texts` has not been implemented \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 290\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfor \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 291\u001b[0m )\n\u001b[1;32m 292\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(msg)\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/langchain_pinecone/vectorstores.py:292\u001b[0m, in \u001b[0;36mPineconeVectorStore.add_texts\u001b[0;34m(self, texts, metadatas, ids, namespace, batch_size, embedding_chunk_size, async_req, id_prefix, **kwargs)\u001b[0m\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m async_req:\n\u001b[1;32m 282\u001b[0m \u001b[38;5;66;03m# Runs the pinecone upsert asynchronously.\u001b[39;00m\n\u001b[1;32m 283\u001b[0m async_res \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 284\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_index\u001b[38;5;241m.\u001b[39mupsert(\n\u001b[1;32m 285\u001b[0m vectors\u001b[38;5;241m=\u001b[39mbatch_vector_tuples,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 290\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m batch_vector_tuples \u001b[38;5;129;01min\u001b[39;00m batch_iterate(batch_size, vector_tuples)\n\u001b[1;32m 291\u001b[0m ]\n\u001b[0;32m--> 292\u001b[0m [res\u001b[38;5;241m.\u001b[39mget() \u001b[38;5;28;01mfor\u001b[39;00m res \u001b[38;5;129;01min\u001b[39;00m async_res]\n\u001b[1;32m 293\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 294\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_index\u001b[38;5;241m.\u001b[39mupsert(\n\u001b[1;32m 295\u001b[0m vectors\u001b[38;5;241m=\u001b[39mvector_tuples,\n\u001b[1;32m 296\u001b[0m namespace\u001b[38;5;241m=\u001b[39mnamespace,\n\u001b[1;32m 297\u001b[0m async_req\u001b[38;5;241m=\u001b[39masync_req,\n\u001b[1;32m 298\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 299\u001b[0m )\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/langchain_pinecone/vectorstores.py:292\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m async_req:\n\u001b[1;32m 282\u001b[0m \u001b[38;5;66;03m# Runs the pinecone upsert asynchronously.\u001b[39;00m\n\u001b[1;32m 283\u001b[0m async_res \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 284\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_index\u001b[38;5;241m.\u001b[39mupsert(\n\u001b[1;32m 285\u001b[0m vectors\u001b[38;5;241m=\u001b[39mbatch_vector_tuples,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 290\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m batch_vector_tuples \u001b[38;5;129;01min\u001b[39;00m 
batch_iterate(batch_size, vector_tuples)\n\u001b[1;32m 291\u001b[0m ]\n\u001b[0;32m--> 292\u001b[0m [res\u001b[38;5;241m.\u001b[39mget() \u001b[38;5;28;01mfor\u001b[39;00m res \u001b[38;5;129;01min\u001b[39;00m async_res]\n\u001b[1;32m 293\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 294\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_index\u001b[38;5;241m.\u001b[39mupsert(\n\u001b[1;32m 295\u001b[0m vectors\u001b[38;5;241m=\u001b[39mvector_tuples,\n\u001b[1;32m 296\u001b[0m namespace\u001b[38;5;241m=\u001b[39mnamespace,\n\u001b[1;32m 297\u001b[0m async_req\u001b[38;5;241m=\u001b[39masync_req,\n\u001b[1;32m 298\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 299\u001b[0m )\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/multiprocessing/pool.py:774\u001b[0m, in \u001b[0;36mApplyResult.get\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 772\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_value\n\u001b[1;32m 773\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 774\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_value\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/multiprocessing/pool.py:125\u001b[0m, in \u001b[0;36mworker\u001b[0;34m(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)\u001b[0m\n\u001b[1;32m 123\u001b[0m job, i, func, args, kwds \u001b[38;5;241m=\u001b[39m task\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 125\u001b[0m result \u001b[38;5;241m=\u001b[39m (\u001b[38;5;28;01mTrue\u001b[39;00m, func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds))\n\u001b[1;32m 126\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 127\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m wrap_exception \u001b[38;5;129;01mand\u001b[39;00m func \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m _helper_reraises_exception:\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pinecone/core/openapi/shared/api_client.py:187\u001b[0m, in \u001b[0;36mApiClient.__call_api\u001b[0;34m(self, resource_path, method, path_params, query_params, header_params, body, post_params, files, response_type, auth_settings, _return_http_data_only, collection_formats, _preload_content, _request_timeout, _host, _check_type)\u001b[0m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m PineconeApiException \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 186\u001b[0m e\u001b[38;5;241m.\u001b[39mbody \u001b[38;5;241m=\u001b[39m e\u001b[38;5;241m.\u001b[39mbody\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 187\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[1;32m 189\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlast_response \u001b[38;5;241m=\u001b[39m response_data\n\u001b[1;32m 191\u001b[0m return_data \u001b[38;5;241m=\u001b[39m response_data\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pinecone/core/openapi/shared/api_client.py:175\u001b[0m, in \u001b[0;36mApiClient.__call_api\u001b[0;34m(self, resource_path, method, path_params, query_params, header_params, body, post_params, files, response_type, auth_settings, _return_http_data_only, collection_formats, 
_preload_content, _request_timeout, _host, _check_type)\u001b[0m\n\u001b[1;32m 171\u001b[0m url \u001b[38;5;241m=\u001b[39m _host \u001b[38;5;241m+\u001b[39m resource_path\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 174\u001b[0m \u001b[38;5;66;03m# perform request and return response\u001b[39;00m\n\u001b[0;32m--> 175\u001b[0m response_data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrequest(\n\u001b[1;32m 176\u001b[0m method,\n\u001b[1;32m 177\u001b[0m url,\n\u001b[1;32m 178\u001b[0m query_params\u001b[38;5;241m=\u001b[39mquery_params,\n\u001b[1;32m 179\u001b[0m headers\u001b[38;5;241m=\u001b[39mheader_params,\n\u001b[1;32m 180\u001b[0m post_params\u001b[38;5;241m=\u001b[39mpost_params,\n\u001b[1;32m 181\u001b[0m body\u001b[38;5;241m=\u001b[39mbody,\n\u001b[1;32m 182\u001b[0m _preload_content\u001b[38;5;241m=\u001b[39m_preload_content,\n\u001b[1;32m 183\u001b[0m _request_timeout\u001b[38;5;241m=\u001b[39m_request_timeout,\n\u001b[1;32m 184\u001b[0m )\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m PineconeApiException \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 186\u001b[0m e\u001b[38;5;241m.\u001b[39mbody \u001b[38;5;241m=\u001b[39m e\u001b[38;5;241m.\u001b[39mbody\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pinecone/core/openapi/shared/api_client.py:460\u001b[0m, in \u001b[0;36mApiClient.request\u001b[0;34m(self, method, url, query_params, headers, post_params, body, _preload_content, _request_timeout)\u001b[0m\n\u001b[1;32m 450\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrest_client\u001b[38;5;241m.\u001b[39mOPTIONS(\n\u001b[1;32m 451\u001b[0m url,\n\u001b[1;32m 452\u001b[0m query_params\u001b[38;5;241m=\u001b[39mquery_params,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 457\u001b[0m body\u001b[38;5;241m=\u001b[39mbody,\n\u001b[1;32m 458\u001b[0m )\n\u001b[1;32m 459\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m method \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPOST\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m--> 460\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrest_client\u001b[38;5;241m.\u001b[39mPOST(\n\u001b[1;32m 461\u001b[0m url,\n\u001b[1;32m 462\u001b[0m query_params\u001b[38;5;241m=\u001b[39mquery_params,\n\u001b[1;32m 463\u001b[0m headers\u001b[38;5;241m=\u001b[39mheaders,\n\u001b[1;32m 464\u001b[0m post_params\u001b[38;5;241m=\u001b[39mpost_params,\n\u001b[1;32m 465\u001b[0m _preload_content\u001b[38;5;241m=\u001b[39m_preload_content,\n\u001b[1;32m 466\u001b[0m _request_timeout\u001b[38;5;241m=\u001b[39m_request_timeout,\n\u001b[1;32m 467\u001b[0m body\u001b[38;5;241m=\u001b[39mbody,\n\u001b[1;32m 468\u001b[0m )\n\u001b[1;32m 469\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m method \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPUT\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 470\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrest_client\u001b[38;5;241m.\u001b[39mPUT(\n\u001b[1;32m 471\u001b[0m url,\n\u001b[1;32m 472\u001b[0m query_params\u001b[38;5;241m=\u001b[39mquery_params,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 477\u001b[0m body\u001b[38;5;241m=\u001b[39mbody,\n\u001b[1;32m 478\u001b[0m )\n", + 
"File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pinecone/core/openapi/shared/rest.py:345\u001b[0m, in \u001b[0;36mRESTClientObject.POST\u001b[0;34m(self, url, headers, query_params, post_params, body, _preload_content, _request_timeout)\u001b[0m\n\u001b[1;32m 335\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mPOST\u001b[39m(\n\u001b[1;32m 336\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 337\u001b[0m url,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 343\u001b[0m _request_timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 344\u001b[0m ):\n\u001b[0;32m--> 345\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrequest(\n\u001b[1;32m 346\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPOST\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 347\u001b[0m url,\n\u001b[1;32m 348\u001b[0m headers\u001b[38;5;241m=\u001b[39mheaders,\n\u001b[1;32m 349\u001b[0m query_params\u001b[38;5;241m=\u001b[39mquery_params,\n\u001b[1;32m 350\u001b[0m post_params\u001b[38;5;241m=\u001b[39mpost_params,\n\u001b[1;32m 351\u001b[0m _preload_content\u001b[38;5;241m=\u001b[39m_preload_content,\n\u001b[1;32m 352\u001b[0m _request_timeout\u001b[38;5;241m=\u001b[39m_request_timeout,\n\u001b[1;32m 353\u001b[0m body\u001b[38;5;241m=\u001b[39mbody,\n\u001b[1;32m 354\u001b[0m )\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pinecone/core/openapi/shared/rest.py:279\u001b[0m, in \u001b[0;36mRESTClientObject.request\u001b[0;34m(self, method, url, query_params, headers, body, post_params, _preload_content, _request_timeout)\u001b[0m\n\u001b[1;32m 276\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;241m500\u001b[39m \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m r\u001b[38;5;241m.\u001b[39mstatus \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m599\u001b[39m:\n\u001b[1;32m 277\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ServiceException(http_resp\u001b[38;5;241m=\u001b[39mr)\n\u001b[0;32m--> 279\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m PineconeApiException(http_resp\u001b[38;5;241m=\u001b[39mr)\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m r\n", + "\u001b[0;31mPineconeApiException\u001b[0m: (400)\nReason: Bad Request\nHTTP response headers: HTTPHeaderDict({'Date': 'Thu, 14 Nov 2024 19:37:34 GMT', 'Content-Type': 'application/json', 'Content-Length': '116', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '155', 'x-pinecone-request-id': '2161036493328920558', 'x-envoy-upstream-service-time': '37', 'server': 'envoy'})\nHTTP response body: {\"code\":3,\"message\":\"Metadata size is 869788 bytes, which exceeds the limit of 40960 bytes per vector\",\"details\":[]}\n" + ] + } + ], + "source": [ + "vector_store.add_documents(documents=documents[0:15], ids=uuids)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "bc900bbd-128c-40b0-b151-f77800fcb50b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "* [SIM=0.292190]\n", + " [deletion]Cha[/deletion] Graniteville Jany 15 1854\n", + "Mr Z. B. Oakes\n", + "Dr Sir\n", + "On my Return\n", + "from Charleston last week I stopd\n", + "and attended the sale of B. J. Godfrerys\n", + "at Black Creek and bought the Family\n", + "of Cash whom I purchased of you two\n", + "years ago. 
The woman is now in a\n", + "Family way and we think she will not\n", + "suit us for a Cook Woman, she is\n", + "a small young woman about 30 strong and\n", + "Healthy and prefers Field Work Cash\n", + "has proved himself an Excellent servant\n", + "a great Ax Man and not to be exceld\n", + "with the Hoe I can recommend him to be\n", + "a most Excellent general servant, and\n", + "now what do you think you can\n", + "get for the Family Consisting of himself\n", + "Wife and three Children say a girl of\n", + "10 or 11 year named Bella, Ceasar a boy about\n", + "4 or 5, and Rose 2 or three and Mother\n", + "in a fair way for another. Please write\n", + "and let me know what you think\n", + "you could get for them yours Truly\n", + "[underline]T. H. Marshall[/underline]\n", + "\n", + " [{'URL': 'https://www.digitalcommonwealth.org/search/commonwealth:ws859m407', 'abstract': '1161522 Asks value of family: man, woman, 3 children.\\nName: abstract_tsi, dtype: object', 'genre': \"1161522 ['Manuscripts', 'Correspondence']\\nName: genre_basic_ssim, dtype: object\", 'genre_specific': '1161522 NaN\\nName: genre_specific_ssim, dtype: object', 'subtitle': '1161522 NaN\\nName: title_info_primary_subtitle_tsi, dtype: object', 'title': 'T.H. Marshall, Graniteville, S.C., autograph l...', 'title_alt': '1161522 NaN\\nName: title_info_alternative_tsim, dtype: object'}]\n", + "----------------------------------\n", + "\n", + "* [SIM=0.243309]\n", + " Grumesville Sept 11 [insertion]th[/insertion] 1856\n", + "Dear Sir\n", + "The Deputy has been here since\n", + "day before yesterday, and it is impossible\n", + "to get hold of the negroes, I have advised\n", + "that the parties should go down & see you\n", + "they seem willing to consent that the fellow\n", + "Tom should go at $ 1000, and a smaller boy\n", + "at $ 300 - but a new party entered the\n", + "field and recommended a seperance\n", + "to Major Rhame - who wants the boys\n", + "himself, he is to be requested to go\n", + "down and confer with the judgement\n", + "creditors and buy them, or propose to\n", + "sell out the entire Estate and pay pro-ratio,\n", + "the creditors, I presume that\n", + "all this could be done better by yourself\n", + "and make no doubt a ballance\n", + "of some 2,000 dollars retained, I advise\n", + "that you should when applied\n", + "to take the whole Estate in hand by &\n", + "with the Consent of Judgement Creditors\n", + "and save the Commission, I could\n", + "then speculate on the sale if forced\n", + "under the [deletion]e[/deletion]sheriff -\n", + "in Lords - yours truly\n", + "Theo. C Tharin\n", + "\n", + " [{'URL': 'https://www.digitalcommonwealth.org/search/commonwealth:w3764957b', 'abstract': '1163238 On disposal of Negroes in contested estate.\\nName: abstract_tsi, dtype: object', 'genre': \"1163238 ['Manuscripts', 'Correspondence']\\nName: genre_basic_ssim, dtype: object\", 'genre_specific': '1163238 NaN\\nName: genre_specific_ssim, dtype: object', 'subtitle': '1163238 NaN\\nName: title_info_primary_subtitle_tsi, dtype: object', 'title': 'Theodore C. 
Tharin, Grumesville, S.C., autogra...', 'title_alt': '1163238 NaN\\nName: title_info_alternative_tsim, dtype: object'}]\n", + "----------------------------------\n", + "\n", + "* [SIM=0.230255]\n", + " Wednesday morn July 12 1854\n", + "Dear Sir\n", + "I drop you a line in a great\n", + "hurry by Mr McCulley to say my affairs\n", + "are going very well, and I have been\n", + "offered $ 500 or $ 1.00 per acre for the tract of\n", + "land you purchased at sherriff sale.\n", + "I think you can do better, the offer is made by\n", + "Mr Williams, who owns the adjoining tract\n", + "which he purchased of Mc Culley he\n", + "gets Serpentine and work about 30 Hands\n", + "yours truly in haste\n", + "Theod C Tharin\n", + "Z. B. Oakes Esq.\n", + "\n", + " [{'URL': 'https://www.digitalcommonwealth.org/search/commonwealth:w3764306m', 'abstract': '1163237 Offer to purchase land.\\nName: abstract_tsi, dtype: object', 'genre': \"1163237 ['Manuscripts', 'Correspondence']\\nName: genre_basic_ssim, dtype: object\", 'genre_specific': '1163237 NaN\\nName: genre_specific_ssim, dtype: object', 'subtitle': '1163237 NaN\\nName: title_info_primary_subtitle_tsi, dtype: object', 'title': 'Theodore C. Tharin, Grumesville, S.C. [?], aut...', 'title_alt': '1163237 NaN\\nName: title_info_alternative_tsim, dtype: object'}]\n", + "----------------------------------\n", + "\n" + ] + } + ], + "source": [ + "results = vector_store.similarity_search_with_score(\n", + " \"What is the metadata of the Z.B. Oakes articles\",\n", + " k=3\n", + ")\n", + "for res, score in results:\n", + " #print(f\"* {res.page_content} [{res.metadata}]\")\n", + " print(f\"* [SIM={score:3f}]\\n {res.page_content} [{res.metadata}]\")\n", + " print(\"----------------------------------\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "f5545e52-8142-4c0d-a127-a1a3711e5ef3", + "metadata": {}, + "source": [ + "### Conclusions on Pinecone and HuggingFace Embedding Model\n", + "Unfortunately, Pinecone imposes a hard limit of 40 KB of metadata per vector, and the metadata attached to one of our Document objects comes to roughly 870 KB (869,788 bytes, per the error above), so the upsert fails.\n", + "\n", + "The HuggingFace embedding model itself works well; we just have to make sure the index dimension matches the embedding model's output dimension (768 for all-mpnet-base-v2).\n", + "\n", + "FAISS seems like our best option."
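If Pinecone were still worth pursuing, the per-vector metadata would need to be slimmed down before upserting. The sketch below shows one hedged way to do that with the `documents` list built earlier in the notebook: keep a small whitelist of fields and truncate each value. The `KEEP_FIELDS` list and the 1,000-character cap are illustrative assumptions, not values taken from this notebook.

```python
import json

from langchain.schema import Document

# Assumptions: `documents` is the list of LangChain Documents built earlier in this
# notebook; the whitelist and the 1,000-character cap are illustrative, not tuned.
KEEP_FIELDS = ["title", "abstract", "genre", "date_human", "collection_name", "URL"]
MAX_FIELD_CHARS = 1_000  # keep each field comfortably under Pinecone's 40,960-byte cap


def slim_metadata(doc: Document) -> Document:
    """Return a copy of `doc` with only the whitelisted, truncated metadata fields."""
    slim = {
        field: str(doc.metadata.get(field, ""))[:MAX_FIELD_CHARS]
        for field in KEEP_FIELDS
    }
    return Document(page_content=doc.page_content, metadata=slim)


slim_documents = [slim_metadata(d) for d in documents]

# Rough sanity check against the 40,960-byte limit reported in the error above.
largest = max(len(json.dumps(d.metadata).encode("utf-8")) for d in slim_documents)
print(f"Largest metadata payload after trimming: {largest} bytes")
```

Note that langchain-pinecone also copies the page text into each vector's metadata when upserting, so records with very long OCR text would likely still need to be chunked first; that is another reason FAISS is the easier fit here.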
+ ] + }, + { + "cell_type": "markdown", + "id": "4a7f8878-1967-4630-817a-9eb1d321701e", + "metadata": {}, + "source": [ + "### Using Chroma Vector Store" + ] + }, + { + "cell_type": "markdown", + "id": "01f0c6e6-fb80-4b01-9673-a790017ce71b", + "metadata": {}, + "source": [ + "Now we can embed our data into a Chroma vector store:" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "c281018a-089b-4ad0-8f4c-efb4667c8780", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Split 133 documents into 13931 chunks.\n", + "Saved 13931 chunks to ./chroma_try.\n" + ] + } + ], + "source": [ + "# from langchain.document_loaders import DirectoryLoader\n", + "from langchain_community.document_loaders import DirectoryLoader\n", + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "from langchain.schema import Document\n", + "from langchain_openai import OpenAIEmbeddings\n", + "from langchain_community.vectorstores import Chroma\n", + "from dotenv import load_dotenv\n", + "import openai\n", + "import os\n", + "import shutil\n", + "import time\n", + "from langchain.embeddings import HuggingFaceEmbeddings\n", + "\n", + "import tempfile\n", + "CHROMA_PATH = \"./chroma_try\"\n", + "\n", + "def main(documents):\n", + " generate_data_store(documents)\n", + "\n", + "\n", + "def generate_data_store(documents):\n", + " chunks = split_text(documents)\n", + " save_to_chroma(chunks)\n", + "\n", + "\n", + "def split_text(documents: list[Document]):\n", + " text_splitter = RecursiveCharacterTextSplitter(\n", + " chunk_size=1000,\n", + " chunk_overlap=100,\n", + " length_function=len,\n", + " add_start_index=True,\n", + " )\n", + " chunks = text_splitter.split_documents(documents)\n", + " print(f\"Split {len(documents)} documents into {len(chunks)} chunks.\")\n", + "\n", + " document = chunks[10]\n", + " #print(document.page_content)\n", + " #print(document.metadata)\n", + "\n", + " return chunks\n", + "\n", + "def save_to_chroma(chunks):\n", + " #Clear out the database first.\n", + " if os.path.exists(CHROMA_PATH):\n", + " shutil.rmtree(CHROMA_PATH)\n", + " print(f\"Removed existing database at {CHROMA_PATH}.\")\n", + "\n", + " # Create a new DB from the documents.\n", + " os.makedirs(CHROMA_PATH, exist_ok=True) # Ensure the directory exists\n", + "\n", + " #embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n", + " embeddings = OpenAIEmbeddings(model=\"text-embedding-3-large\", dimensions=3072)\n", + " #embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", + " try:\n", + " db = Chroma.from_documents(\n", + " chunks, embeddings, persist_directory=CHROMA_PATH\n", + " )\n", + " db.persist()\n", + " print(f\"Saved {len(chunks)} chunks to {CHROMA_PATH}.\")\n", + " except Exception as e:\n", + " print(f\"An error occurred: {e}\")\n", + "\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " main(documents)" + ] + }, + { + "cell_type": "markdown", + "id": "b08316ec-6545-4af5-a92e-1f026f121e4f", + "metadata": {}, + "source": [ + "### Making the Query" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "id": "103cf9f0-e116-4e3a-a33c-accdf4246332", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Human: \n", + "Answer the question based only on the following context:\n", + "\n", + "Columbia Jany 8th / 55\n", + "Mr Z B. 
Oakes\n", + "Dear Sir\n", + "I read\n", + "your letter, desiring me to\n", + "pay the amount of the [unclear]\n", + "over to you, but I cannot do\n", + "so until Col. Bauskett gives\n", + "me notice to do so. I have\n", + "seen him since I read your\n", + "letter, I regret very much that\n", + "I cannot comply with your\n", + "request. The arrangement\n", + "which Mr Mazyck made with\n", + "the Bank, was to meet it\n", + "when we received notice to\n", + "do so,\n", + "Yours respectfully\n", + "Thos. Taylor\n", + "\n", + "---\n", + "\n", + "and the Ballance with all other obligatory and Kind\n", + "favors I will have to make straight when I see you\n", + "you will no doubt feel hurt at this step of\n", + "mine but when you consider all things, and that\n", + "upon this very step our mutual safety & welfare\n", + "depended, why like General Jackson at orleans\n", + "I take the Responsibility, and time I hope\n", + "will alike prove I was Right\n", + "with much Esteem & Regard\n", + "I Remain faithfully yours\n", + "Theo. C Tharin\n", + "\n", + "Mount Holly } {Three Mount Holly PCV\n", + "June 22 { Theo C Tharin PCV\n", + "Z.B. Oakes Esq\n", + "Charleston\n", + "\n", + "---\n", + "\n", + "till which time believe me yours\n", + "Most truly\n", + "Theo. C Tharin\n", + "Z. B. Oakes Esq\n", + "\n", + "---\n", + "\n", + "Charleston 1-6 Dec 1853\n", + "Mr Z. B. Oakes,\n", + "Dear Sir\n", + "I would be glad\n", + "to know what you have\n", + "determined on in the case\n", + "of M Alpine, I bot the\n", + "negro from you, it appears\n", + "and of course you are\n", + "liable to me, I am called\n", + "on for the amount of the\n", + "verdict, It appears that the\n", + "negro was a stolen one. I\n", + "of course lay no charge to\n", + "you on this score, being accessory\n", + "to the affair, but I\n", + "do call on on you to\n", + "hold me Harmless. - Please\n", + "send me your written answer\n", + "[underline]this day.[/underline]\n", + "Respectfully\n", + "[underline]Tho: N. Gadsden[/underline]\n", + "\n", + "---\n", + "\n", + "Summerville So Ca\n", + "April 26th 1854\n", + "Mr Z B Oakes\n", + "Dear Sir\n", + "Your letter\n", + "in reply to mine was duly received.\n", + "I am willing to value the woman at\n", + "$ 800. & be refunded $ 150. or return her\n", + "at once to you. Do inform me as soon\n", + "as possible as to the decision of\n", + "her owner. Yours respectfully\n", + "Thos. L Gelzia\n", + "\n", + "---\n", + "\n", + "Answer the question based on the above context: Who did Z.B Oakes receive a letter from?\n", + "\n", + "Response: Z.B Oakes received a letter from Tho: N. Gadsden.\n", + "\n", + "Sources: ['Thomas Taylor, Columbia, Tenn., autograph lett...: https://www.digitalcommonwealth.org/search/commonwealth:5q47sj75b', 'Theodore C. Tharin, Grumesville, S.C., autogra...: https://www.digitalcommonwealth.org/search/commonwealth:w3764352q', 'Theodore C. Tharin, Grumesville, S.C., autogra...: https://www.digitalcommonwealth.org/search/commonwealth:w3764286b', 'Theodore N. Gadsden, Charleston, S.C., autogra...: https://www.digitalcommonwealth.org/search/commonwealth:9k41zk125', 'Thomas L. 
Gelzer, Summerville, autograph lette...: https://www.digitalcommonwealth.org/search/commonwealth:ws859j61k']\n" + ] + } + ], + "source": [ + "import argparse\n", + "from langchain_community.vectorstores import Chroma\n", + "from langchain_openai import OpenAIEmbeddings\n", + "from langchain_openai import ChatOpenAI\n", + "from langchain.prompts import ChatPromptTemplate\n", + "from langchain.embeddings import HuggingFaceEmbeddings\n", + "from transformers import AutoTokenizer, AutoModel\n", + "from langchain_core.messages import HumanMessage, SystemMessage\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "\n", + "\n", + "# copy from above\n", + "#CHROMA_PATH = \"/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/tmpaxd8t1dv\"\n", + "CHROMA_PATH = \"./chroma_try\"\n", + "\n", + "PROMPT_TEMPLATE = \"\"\"\n", + "Answer the question based only on the following context:\n", + "\n", + "{context}\n", + "\n", + "---\n", + "\n", + "Answer the question based on the above context: {question}\n", + "\"\"\"\n", + "\n", + "def main(query: str):\n", + " # Create CLI with a default value for Jupyter testing\n", + " parser = argparse.ArgumentParser()\n", + " parser.add_argument(\"query_text\", type=str, help=\"The query text.\")\n", + " args = parser.parse_args(args=[query]) # Add a default value here for testing\n", + " query_text = args.query_text\n", + "\n", + " # Prepare the database\n", + " embedding_function = OpenAIEmbeddings(model=\"text-embedding-3-large\", dimensions=3072)\n", + " #embedding_function = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", + " db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)\n", + "\n", + " results = db.similarity_search_with_relevance_scores(query_text, k=5)\n", + " for i in range(len(results)):\n", + " if len(results) == 0 or results[0][1] < 0.3:\n", + " print(f\"Unable to find matching results for \\\"{query_text}\\\"\")\n", + " print(results[0][1])\n", + " return\n", + " \n", + " context_text = \"\\n\\n---\\n\\n\".join([doc.page_content for doc, _score in results])\n", + " prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)\n", + " prompt = prompt_template.format(context=context_text, question=query_text)\n", + " print(prompt)\n", + "\n", + " model = ChatOpenAI()\n", + " response_text = model.predict(prompt)\n", + "\n", + " sources = [doc.metadata.get(\"title\") + \": \" + str(doc.metadata.get(\"URL\")) for doc, _score in results]\n", + " formatted_response = f\"Response: {response_text}\\n\\nSources: {sources}\"\n", + " # response with context, sources, and answer to my query\n", + " print(formatted_response)\n", + "\n", + "if __name__ == \"__main__\":\n", + " query1 = \"Who did Z.B Oakes receive a letter from?\"\n", + " query2 = \"What did Henry M. Sikes say about India Goods?\"\n", + " query3 = \"What happened in World War II?\"\n", + " main(query1)" + ] + }, + { + "cell_type": "markdown", + "id": "805b793e-16e3-4eca-9b2a-baaa91fde961", + "metadata": {}, + "source": [ + "### Conclusions about Chroma and OpenAI Embedding Model\n", + "Chroma seems like a great option as a vector store, but it is an embedded, lightweight database, so the persisted index has to live on the same machine that runs the queries. Unfortunately, we have not found many examples of projects embedding data at the scale we are working with.\n", + "\n", + "OpenAIEmbeddings performs well, but every embedding call costs money, so it is a no-go."
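One way to keep Chroma in the running without the embedding cost is to persist the store with the same free, local `sentence-transformers/all-mpnet-base-v2` model already used for FAISS above. The sketch below is a minimal example under that assumption; the `./chroma_hf` directory name is arbitrary, and it reuses the `documents` list and the chunking settings from the cell that built `./chroma_try`.

```python
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

CHROMA_HF_PATH = "./chroma_hf"  # arbitrary directory, kept separate from ./chroma_try

# Same local embedding model used with FAISS earlier in the notebook (768 dims, no API cost).
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Reuse the chunking settings from the Chroma cell above.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100, add_start_index=True)
chunks = splitter.split_documents(documents)  # `documents` built earlier in the notebook

db = Chroma.from_documents(chunks, hf_embeddings, persist_directory=CHROMA_HF_PATH)

# Querying is unchanged; only the embedding function differs.
for doc, score in db.similarity_search_with_relevance_scores("Who did Z.B Oakes receive a letter from?", k=5):
    print(round(score, 3), doc.metadata.get("title"))
```

If the 0.3 relevance threshold from the query cell above is kept, it would probably need re-tuning, since different embedding models produce different relevance-score distributions.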
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "4eded946-6f7f-440b-a892-80f8c37db3ab", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_content='Oglethorpe December 8 53\n", + "Z B Oaks esqr\n", + "Dr Sir you will please\n", + "inform me how negros are selling\n", + "& how your market is supplied\n", + "I think I will be in Charleston\n", + "the latter part of this month, &\n", + "will want to buy a Cook &\n", + "good active boy & perhaps two negro fellows\n", + "Very Respectfully\n", + "Yours T G Hudson\n", + "\n", + "\n", + "' metadata={'title': 'T.G. Hudson, Oglethorpe, Ga., autograph letter...', 'abstract': '1161475 Asks market price at Charleston.\\nName: abstract_tsi, dtype: object', 'subtitle': '1161475 NaN\\nName: title_info_primary_subtitle_tsi, dtype: object', 'URL': 'https://www.digitalcommonwealth.org/search/commonwealth:9k41zk460', 'title_alt': '1161475 NaN\\nName: title_info_alternative_tsim, dtype: object', 'genre': \"1161475 ['Manuscripts', 'Correspondence']\\nName: genre_basic_ssim, dtype: object\", 'genre_specific': '1161475 NaN\\nName: genre_specific_ssim, dtype: object'}\n" + ] + } + ], + "source": [ + "print(documents[0])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}