diff --git a/CHANGELOG.md b/CHANGELOG.md
index b182abb..d6b7bff 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,15 +1,13 @@
 # [1.1.0](https://github.com/lpm0073/netec-llm/compare/v1.0.0...v1.1.0) (2023-12-01)
 
-
 ### Bug Fixes
 
-* fix load problem with existing index ([62cd18f](https://github.com/lpm0073/netec-llm/commit/62cd18f8088873a794ec363c4e18770dfdc41ea5))
-
+- fix load problem with existing index ([62cd18f](https://github.com/lpm0073/netec-llm/commit/62cd18f8088873a794ec363c4e18770dfdc41ea5))
 
 ### Features
 
-* perfect load(). revert rag() to openai only calls ([8de793d](https://github.com/lpm0073/netec-llm/commit/8de793dcca77ec23f09e68ca9e8dba7f64623b3c))
-* ssm.rag() w load, split, embed, store ([2335d22](https://github.com/lpm0073/netec-llm/commit/2335d22c5fd9092642ff1eb67a34fbcd712d7f9b))
+- perfect load(). revert rag() to openai only calls ([8de793d](https://github.com/lpm0073/netec-llm/commit/8de793dcca77ec23f09e68ca9e8dba7f64623b3c))
+- ssm.rag() w load, split, embed, store ([2335d22](https://github.com/lpm0073/netec-llm/commit/2335d22c5fd9092642ff1eb67a34fbcd712d7f9b))
 
 # 1.0.0 (2023-11-30)
 
diff --git a/README.md b/README.md
index d877472..b8ee553 100644
--- a/README.md
+++ b/README.md
@@ -54,6 +54,16 @@ export PINECONE_API_KEY=SET-ME-PLEASE
 export PINECONE_ENVIRONMENT=SET-ME-PLEASE
 ```
 
+### Pinecone setup
+
+You'll need to manually create an index with the following characteristics:
+
+- Environment: gcp-starter
+- Index name: netec-rag
+- Metric: dotproduct
+- Dimensions: 1536
+- Pod Type: starter
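+
+If you prefer to script this step, here is a minimal sketch using the same
+`pinecone` calls that `models/ssm.py` already makes (the index name and
+settings mirror the list above):
+
+```python
+import os
+
+import pinecone
+
+pinecone.init(
+    api_key=os.environ["PINECONE_API_KEY"],
+    environment=os.environ["PINECONE_ENVIRONMENT"],
+)
+pinecone.create_index("netec-rag", dimension=1536, metric="dotproduct")
+```
+
+Note that `load()` in `models/ssm.py` deletes and recreates this index on each
+run, so the manual step mainly matters for first-time setup.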
+
 ## Contributing
 
 This project uses a mostly automated pull request and unit testing process. See the resources in .github for additional details. You additionally should ensure that pre-commit is installed and working correctly on your dev machine by running the following command from the root of the repo.
diff --git a/models/__version__.py b/models/__version__.py
index a8b8563..2517c85 100644
--- a/models/__version__.py
+++ b/models/__version__.py
@@ -1,2 +1,2 @@
 # -*- coding: utf-8 -*-
-__version__ = "1.1.0-beta.1"
+__version__ = "1.1.1"
diff --git a/models/ssm.py b/models/ssm.py
index aaa7723..fc2ebf0 100644
--- a/models/ssm.py
+++ b/models/ssm.py
@@ -4,10 +4,12 @@ Sales Support Model (SSM) for the LangChain project.
 See: https://python.langchain.com/docs/modules/model_io/llms/llm_caching
      https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf
+     https://python.langchain.com/docs/integrations/retrievers/pinecone_hybrid_search
 """
 
 import glob
 import os
+import textwrap
 from typing import List  # ClassVar
 
 # pinecone integration
@@ -27,9 +29,11 @@ from langchain.globals import set_llm_cache
 from langchain.llms.openai import OpenAI
 from langchain.prompts import PromptTemplate
+from langchain.retrievers import PineconeHybridSearchRetriever
 from langchain.schema import HumanMessage, SystemMessage
-from langchain.text_splitter import Document, RecursiveCharacterTextSplitter
+from langchain.text_splitter import Document
 from langchain.vectorstores.pinecone import Pinecone
+from pinecone_text.sparse import BM25Encoder
 
 # this project
 from models.const import Credentials
@@ -46,6 +50,24 @@
 set_llm_cache(InMemoryCache())
 
 
+class TextSplitter:
+    """
+    Custom text splitter that adds metadata to the Document object,
+    which is required by PineconeHybridSearchRetriever.
+    """
+
+    # ...
+
+    def create_documents(self, texts):
+        """Create documents"""
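+        # Each text becomes a Document whose metadata repeats the text under the
+        # "context" key. Illustrative example (assumed input):
+        #   create_documents(["hello"]) ->
+        #   [Document(page_content="hello", metadata={"context": "hello"})]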
+        documents = []
+        for text in texts:
+            # Create a Document object with the text and metadata
+            document = Document(page_content=text, metadata={"context": text})
+            documents.append(document)
+        return documents
+
+
 class SalesSupportModel:
     """Sales Support Model (SSM)."""
 
@@ -60,15 +82,14 @@ class SalesSupportModel:
     )
 
     # embeddings
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=100,
-        chunk_overlap=0,
-    )
-    openai_embedding = OpenAIEmbeddings()
-    pinecone_index = Pinecone.from_existing_index(
-        Credentials.PINECONE_INDEX_NAME,
-        embedding=openai_embedding,
+    openai_embeddings = OpenAIEmbeddings(
+        api_key=Credentials.OPENAI_API_KEY, organization=Credentials.OPENAI_API_ORGANIZATION
     )
+    pinecone_index = pinecone.Index(index_name=Credentials.PINECONE_INDEX_NAME)
+    vector_store = Pinecone(index=pinecone_index, embedding=openai_embeddings, text_key="lc_id")
+
+    text_splitter = TextSplitter()
+    bm25_encoder = BM25Encoder().default()
 
     def cached_chat_request(self, system_message: str, human_message: str) -> SystemMessage:
         """Cached chat request."""
@@ -86,16 +107,27 @@ def prompt_with_template(self, prompt: PromptTemplate, concept: str, model: str
         retval = llm(prompt.format(concept=concept))
         return retval
 
-    # FIX NOTE: DEPRECATED
     def split_text(self, text: str) -> List[Document]:
-        """Split text."""
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=100,
-            chunk_overlap=0,
-        )
-        retval = text_splitter.create_documents([text])
+        """Split text. Leaving this here for now, since it exposes the return type."""
+        retval = self.text_splitter.create_documents([text])
         return retval
 
+    def fit_tf_idf_values(self, corpus: List[str]):
+        """Fit TF-IDF values.
+        1. Fit the BM25 encoder on the corpus
+        2. Persist the fitted values to a JSON file
+        3. Reload the encoder from the persisted values
+        """
+        # example corpus: ["foo", "bar", "world", "hello"]
+
+        # fit tf-idf values on your corpus
+        self.bm25_encoder.fit(corpus)
+
+        # persist the values to a json file
+        self.bm25_encoder.dump("bm25_values.json")
+        self.bm25_encoder = BM25Encoder().load("bm25_values.json")
+        # the reloaded encoder is ready for use
+
     def load(self, filepath: str):
         """
         Embed PDF.
@@ -103,7 +135,26 @@
         2. Split into pages
         3. Embed each page
         4. Store in Pinecone
+
+        Note: it's important to make sure that the "context" field that holds the document text
+        in the metadata is not indexed. Currently you need to specify explicitly the fields you
+        do want to index. For more information, check out
+        https://docs.pinecone.io/docs/manage-indexes#selective-metadata-indexing
         """
+        try:
+            print("Deleting index...")
+            pinecone.delete_index(Credentials.PINECONE_INDEX_NAME)
+        except pinecone.exceptions.PineconeException:
+            print("Index does not exist. Continuing...")
+
+        metadata_config = {
+            "indexed": ["lc_id", "lc_type"],
+            "context": ["lc_text"],
+        }
+        print("Creating index. This may take a few minutes...")
+        pinecone.create_index(
+            Credentials.PINECONE_INDEX_NAME, dimension=1536, metric="dotproduct", metadata_config=metadata_config
+        )
 
         pdf_files = glob.glob(os.path.join(filepath, "*.pdf"))
         i = 0
@@ -117,12 +168,10 @@
             for doc in docs:
                 k += 1
                 print(k * "-", end="\r")
-                texts_splitter_results = self.text_splitter.create_documents([doc.page_content])
-                self.pinecone_index.from_existing_index(
-                    index_name=Credentials.PINECONE_INDEX_NAME,
-                    embedding=self.openai_embedding,
-                    text_key=texts_splitter_results,
-                )
+                documents = self.text_splitter.create_documents([doc.page_content])
+                document_texts = [doc.page_content for doc in documents]
+                embeddings = self.openai_embeddings.embed_documents(document_texts)
+                self.vector_store.add_documents(documents=documents, embeddings=embeddings)
 
         print("Finished loading PDFs")
 
@@ -133,26 +182,42 @@ def rag(self, prompt: str):
            from storage using a Retriever.
         2. Generate: A ChatModel / LLM produces an answer using a prompt
            that includes the question and the retrieved data
-        """
 
-        # pylint: disable=unused-variable
-        def format_docs(docs):
-            """Format docs."""
-            return "\n\n".join(doc.page_content for doc in docs)
+        To prompt OpenAI's GPT-3 model to consider the embeddings from the Pinecone
+        vector database, you would typically need to convert the embeddings back
+        into a format that GPT-3 can understand, such as text. However, GPT-3 does
+        not natively support direct input of embeddings.
 
-        retriever = self.pinecone_index.as_retriever()
-
-        # Use the retriever to get relevant documents
+        The typical workflow is to use the embeddings to retrieve relevant documents,
+        and then use the text of these documents as part of the prompt for GPT-3.
+        """
+        retriever = PineconeHybridSearchRetriever(
+            embeddings=self.openai_embeddings, sparse_encoder=self.bm25_encoder, index=self.pinecone_index
+        )
         documents = retriever.get_relevant_documents(query=prompt)
         print(f"Retrieved {len(documents)} related documents from Pinecone")
 
-        # Generate a prompt from the retrieved documents
-        prompt += " ".join(doc.page_content for doc in documents)
-        print(f"Prompt contains {len(prompt.split())} words")
-        print("Prompt:", prompt)
-        print(doc for doc in documents)
+        # Extract the text from the documents
+        document_texts = [doc.page_content for doc in documents]
+        leader = textwrap.dedent(
+            """\
+            You can assume that the following is true,
+            and you should attempt to incorporate these facts
+            in your response:
+            """
+        )
+
+        # Create a prompt that includes the document texts
+        prompt_with_relevant_documents = f"{prompt}\n{leader} {'. '.join(document_texts)}"
+
+        print(f"Prompt contains {len(prompt_with_relevant_documents.split())} words")
+        print("Prompt:", prompt_with_relevant_documents)
 
         # Get a response from the GPT-3.5-turbo model
-        response = self.cached_chat_request(system_message="You are a helpful assistant.", human_message=prompt)
+        response = self.cached_chat_request(
+            system_message="You are a helpful assistant.", human_message=prompt_with_relevant_documents
+        )
+        print("Response:")
+        print("------------------------------------------------------")
         return response
diff --git a/models/tests/test_openai.py b/models/tests/test_openai.py
new file mode 100644
index 0000000..c9110a6
--- /dev/null
+++ b/models/tests/test_openai.py
@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+# flake8: noqa: F401
+# pylint: disable=too-few-public-methods
+"""
+Test OpenAI connectivity.
+"""
+import pytest  # pylint: disable=unused-import
+
+from ..ssm import SalesSupportModel
+
+
+class TestOpenAI:
+    """Test OpenAI connectivity via SalesSupportModel."""
+
+    def test_03_test_openai_connectivity(self):
+        """Ensure that we have connectivity to OpenAI."""
+
+        ssm = SalesSupportModel()
+        retval = ssm.cached_chat_request(
+            "you are a helpful assistant", "please return the value 'CORRECT' in all upper case."
+        )
+        assert retval.content == "CORRECT"
diff --git a/models/tests/test_pinecone.py b/models/tests/test_pinecone.py
new file mode 100644
index 0000000..8b5fc44
--- /dev/null
+++ b/models/tests/test_pinecone.py
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+# flake8: noqa: F401
+"""
+Test Pinecone connectivity and index setup.
+"""
+
+import pinecone
+import pytest  # pylint: disable=unused-import
+from langchain.embeddings import OpenAIEmbeddings
+from langchain.vectorstores.pinecone import Pinecone
+
+from ..const import Credentials
+
+
+class TestPinecone:
+    """Test Pinecone connectivity."""
+
+    def test_01_test_pinecone_connectivity(self):
+        """Ensure that we have connectivity to Pinecone."""
+        # pylint: disable=broad-except
+        try:
+            pinecone.init(api_key=Credentials.PINECONE_API_KEY, environment=Credentials.PINECONE_ENVIRONMENT)
+        except Exception as e:
+            assert False, f"pinecone.init() failed with exception: {e}"
+
+    def test_02_test_pinecone_index(self):
+        """Ensure that the Pinecone index exists and that we can connect to it."""
+        pinecone.init(api_key=Credentials.PINECONE_API_KEY, environment=Credentials.PINECONE_ENVIRONMENT)
+        openai_embedding = OpenAIEmbeddings()
+
+        # pylint: disable=broad-except
+        try:
+            Pinecone.from_existing_index(
+                Credentials.PINECONE_INDEX_NAME,
+                embedding=openai_embedding,
+            )
+        except Exception as e:
+            assert (
+                False
+            ), f"Pinecone initialization of index {Credentials.PINECONE_INDEX_NAME} failed with exception: {e}"
diff --git a/models/tests/test_prompts.py b/models/tests/test_prompts.py
index 63e3f36..6220b5f 100644
--- a/models/tests/test_prompts.py
+++ b/models/tests/test_prompts.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
 # flake8: noqa: F401
-# pylint: disable=too-few-public-methods
 """
 Test integrity of base class.
 """
diff --git a/models/tests/test_ssm.py b/models/tests/test_ssm.py
index 619c9bc..ebb8576 100644
--- a/models/tests/test_ssm.py
+++ b/models/tests/test_ssm.py
@@ -1,18 +1,14 @@
 # -*- coding: utf-8 -*-
 # flake8: noqa: F401
-# pylint: disable=too-few-public-methods
 """
 Test integrity of base class.
""" -import pinecone import pytest # pylint: disable=unused-import from langchain.chat_models import ChatOpenAI from langchain.embeddings import OpenAIEmbeddings -from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain.vectorstores.pinecone import Pinecone +from pinecone import Index -from ..const import Credentials -from ..ssm import SalesSupportModel +from models.ssm import SalesSupportModel, TextSplitter class TestSalesSupportModel: @@ -21,30 +17,17 @@ class TestSalesSupportModel: def test_01_basic(self): """Ensure that we can instantiate the class.""" - SalesSupportModel() + # pylint: disable=broad-except + try: + SalesSupportModel() + except Exception as e: + assert False, f"initialization of SalesSupportModel() failed with exception: {e}" def test_02_class_aatribute_types(self): """ensure that class attributes are of the correct type""" ssm = SalesSupportModel() assert isinstance(ssm.chat, ChatOpenAI) - assert isinstance(ssm.pinecone_index, Pinecone) - assert isinstance(ssm.text_splitter, RecursiveCharacterTextSplitter) - assert isinstance(ssm.openai_embedding, OpenAIEmbeddings) - - def test_03_test_openai_connectivity(self): - """Ensure that we have connectivity to OpenAI.""" - - ssm = SalesSupportModel() - retval = ssm.cached_chat_request( - "your are a helpful assistant", "please return the value 'CORRECT' in all upper case." - ) - assert retval == "CORRECT" - - def test_04_test_pinecone_connectivity(self): - """Ensure that we have connectivity to Pinecone.""" - # pylint: disable=broad-except - try: - pinecone.init(api_key=Credentials.PINECONE_API_KEY, environment=Credentials.PINECONE_ENVIRONMENT) - except Exception as e: - assert False, f"pinecone.init() failed with exception: {e}" + assert isinstance(ssm.pinecone_index, Index) + assert isinstance(ssm.text_splitter, TextSplitter) + assert isinstance(ssm.openai_embeddings, OpenAIEmbeddings) diff --git a/requirements.txt b/requirements.txt index 5592235..e3d81d5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,6 +23,7 @@ langchain==0.0.343 langchainhub==0.1.14 openai==1.3.5 pinecone-client==2.2.4 +pinecone-text==0.7.0 pydantic==2.5.2 pypdf==3.17.1 python-dotenv==1.0.0