From 9b6ae46e922e1f663ff8049929acaf092f5eadda Mon Sep 17 00:00:00 2001 From: asoderlind Date: Thu, 12 Oct 2023 20:24:25 +0200 Subject: [PATCH 1/9] raise more legible error if the embedding vector dims don't match --- application/vectorstore/faiss.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/application/vectorstore/faiss.py b/application/vectorstore/faiss.py index 217b04571..c5af0e6cb 100644 --- a/application/vectorstore/faiss.py +++ b/application/vectorstore/faiss.py @@ -2,6 +2,7 @@ from langchain.vectorstores import FAISS from application.core.settings import settings +HUGGINGFACE_MODEL_NAME = "huggingface_sentence-transformers/all-mpnet-base-v2" class FaissStore(BaseVectorStore): def __init__(self, path, embeddings_key, docs_init=None): @@ -12,9 +13,19 @@ def __init__(self, path, embeddings_key, docs_init=None): docs_init, self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key) ) else: + embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key) self.docsearch = FAISS.load_local( - self.path, self._get_embeddings(settings.EMBEDDINGS_NAME, settings.EMBEDDINGS_KEY) + self.path, embeddings ) + + # Check that the word_embedding_dimension of the index matches the word_embedding_dimension of the embeddings + if settings.EMBEDDINGS_NAME == HUGGINGFACE_MODEL_NAME: + try: + word_embedding_dimension = embeddings.client[1].word_embedding_dimension + except AttributeError as e: + raise AttributeError("word_embedding_dimension not found in embeddings.client[1]") from e + if word_embedding_dimension != self.docsearch.index.d: + raise ValueError("word_embedding_dimension != docsearch_index_word_embedding_dimension") def search(self, *args, **kwargs): return self.docsearch.similarity_search(*args, **kwargs) From 4752ce525008cc5d2d4c046a2a762dba4492de48 Mon Sep 17 00:00:00 2001 From: asoderlind Date: Sun, 15 Oct 2023 09:12:00 +0200 Subject: [PATCH 2/9] fix linting error --- application/vectorstore/faiss.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/application/vectorstore/faiss.py b/application/vectorstore/faiss.py index c5af0e6cb..856eaee2d 100644 --- a/application/vectorstore/faiss.py +++ b/application/vectorstore/faiss.py @@ -18,7 +18,8 @@ def __init__(self, path, embeddings_key, docs_init=None): self.path, embeddings ) - # Check that the word_embedding_dimension of the index matches the word_embedding_dimension of the embeddings + # Check that the word_embedding_dimension of the index matches + # the word_embedding_dimension of the embeddings if settings.EMBEDDINGS_NAME == HUGGINGFACE_MODEL_NAME: try: word_embedding_dimension = embeddings.client[1].word_embedding_dimension From 60cd6a455abf69719e6a82b5c34aa953e5008dd3 Mon Sep 17 00:00:00 2001 From: asoderlind Date: Sun, 15 Oct 2023 10:22:00 +0200 Subject: [PATCH 3/9] refactor --- application/vectorstore/faiss.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/application/vectorstore/faiss.py b/application/vectorstore/faiss.py index 856eaee2d..08502c57b 100644 --- a/application/vectorstore/faiss.py +++ b/application/vectorstore/faiss.py @@ -2,31 +2,21 @@ from langchain.vectorstores import FAISS from application.core.settings import settings -HUGGINGFACE_MODEL_NAME = "huggingface_sentence-transformers/all-mpnet-base-v2" class FaissStore(BaseVectorStore): def __init__(self, path, embeddings_key, docs_init=None): super().__init__() self.path = path + embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key) if docs_init: self.docsearch = FAISS.from_documents( - docs_init, self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key) + docs_init, embeddings ) else: - embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key) self.docsearch = FAISS.load_local( self.path, embeddings ) - - # Check that the word_embedding_dimension of the index matches - # the word_embedding_dimension of the embeddings - if settings.EMBEDDINGS_NAME == HUGGINGFACE_MODEL_NAME: - try: - word_embedding_dimension = embeddings.client[1].word_embedding_dimension - except AttributeError as e: - raise AttributeError("word_embedding_dimension not found in embeddings.client[1]") from e - if word_embedding_dimension != self.docsearch.index.d: - raise ValueError("word_embedding_dimension != docsearch_index_word_embedding_dimension") + self.assert_embedding_dimensions(embeddings) def search(self, *args, **kwargs): return self.docsearch.similarity_search(*args, **kwargs) @@ -36,3 +26,19 @@ def add_texts(self, *args, **kwargs): def save_local(self, *args, **kwargs): return self.docsearch.save_local(*args, **kwargs) + + def assert_embedding_dimensions(self, embeddings, *args, **kwargs): + """ + Check that the word embedding dimension of the docsearch index matches + the dimension of the word embeddings used + """ + if settings.EMBEDDINGS_NAME == "huggingface_sentence-transformers/all-mpnet-base-v2": + try: + word_embedding_dimension = embeddings.client[1].word_embedding_dimension + except AttributeError as e: + raise AttributeError("word_embedding_dimension not found in embeddings.client[1]") from e + docsearch_index_dimension = self.docsearch.index.d + if word_embedding_dimension != docsearch_index_dimension: + raise ValueError(f"word_embedding_dimension ({word_embedding_dimension}) " + + f"!= docsearch_index_word_embedding_dimension ({docsearch_index_dimension})") + From 09aa56b63daea00f1ef12142924d6c77a803ca2b Mon Sep 17 00:00:00 2001 From: asoderlind Date: Sun, 15 Oct 2023 10:22:07 +0200 Subject: [PATCH 4/9] add test --- tests/test_vector_store.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 tests/test_vector_store.py diff --git a/tests/test_vector_store.py b/tests/test_vector_store.py new file mode 100644 index 000000000..a4b2d3c73 --- /dev/null +++ b/tests/test_vector_store.py @@ -0,0 +1,16 @@ +import pytest +from flask import Flask +from application.error import bad_request, response_error +from application.vectorstore.faiss import FaissStore +from application.core.settings import settings + +def test_init_local_faiss_store_huggingface(): + """ + Test that asserts that trying to initialize a FaissStore with + the huggingface sentence transformer below together with the + index.faiss file in the application/ folder results in a + dimension mismatch error. + """ + settings.EMBEDDINGS_NAME = "huggingface_sentence-transformers/all-mpnet-base-v2" + with pytest.raises(ValueError): + FaissStore("application/", "", None) From 0ca96130c8274adcf85578979a436b484b662f59 Mon Sep 17 00:00:00 2001 From: asoderlind Date: Sun, 15 Oct 2023 10:23:09 +0200 Subject: [PATCH 5/9] remove trailing whitespace --- application/vectorstore/faiss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/vectorstore/faiss.py b/application/vectorstore/faiss.py index 08502c57b..653484c47 100644 --- a/application/vectorstore/faiss.py +++ b/application/vectorstore/faiss.py @@ -10,7 +10,7 @@ def __init__(self, path, embeddings_key, docs_init=None): embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key) if docs_init: self.docsearch = FAISS.from_documents( - docs_init, embeddings + docs_init, embeddings ) else: self.docsearch = FAISS.load_local( From 4e81f989274e341e3f2ae706fd8419a4db51b987 Mon Sep 17 00:00:00 2001 From: asoderlind Date: Mon, 16 Oct 2023 06:13:15 +0200 Subject: [PATCH 6/9] add dependency --- application/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/application/requirements.txt b/application/requirements.txt index b4c712f44..96dec9367 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -104,3 +104,4 @@ urllib3==1.26.17 vine==5.0.0 wcwidth==0.2.6 yarl==1.8.2 +sentence-transformers==2.2.2 \ No newline at end of file From e2a8ca143ab1456c716ecfe4a3f139a3ec28fc43 Mon Sep 17 00:00:00 2001 From: asoderlind Date: Mon, 16 Oct 2023 06:13:26 +0200 Subject: [PATCH 7/9] remove unused imports --- tests/test_vector_store.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_vector_store.py b/tests/test_vector_store.py index a4b2d3c73..97822cc98 100644 --- a/tests/test_vector_store.py +++ b/tests/test_vector_store.py @@ -1,6 +1,4 @@ import pytest -from flask import Flask -from application.error import bad_request, response_error from application.vectorstore.faiss import FaissStore from application.core.settings import settings From e73636bef3cea2212a91aa9755b275c2ba764ad5 Mon Sep 17 00:00:00 2001 From: asoderlind Date: Mon, 16 Oct 2023 11:22:42 +0200 Subject: [PATCH 8/9] remove trailing whitespace, sort imports, remove unused arguments --- application/vectorstore/faiss.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/application/vectorstore/faiss.py b/application/vectorstore/faiss.py index 653484c47..e8960fe0a 100644 --- a/application/vectorstore/faiss.py +++ b/application/vectorstore/faiss.py @@ -1,5 +1,5 @@ -from application.vectorstore.base import BaseVectorStore from langchain.vectorstores import FAISS +from application.vectorstore.base import BaseVectorStore from application.core.settings import settings class FaissStore(BaseVectorStore): @@ -23,11 +23,11 @@ def search(self, *args, **kwargs): def add_texts(self, *args, **kwargs): return self.docsearch.add_texts(*args, **kwargs) - + def save_local(self, *args, **kwargs): return self.docsearch.save_local(*args, **kwargs) - def assert_embedding_dimensions(self, embeddings, *args, **kwargs): + def assert_embedding_dimensions(self, embeddings): """ Check that the word embedding dimension of the docsearch index matches the dimension of the word embeddings used From d51cd8df89d801fa81c4d909a3f491dac15c61ff Mon Sep 17 00:00:00 2001 From: asoderlind Date: Mon, 16 Oct 2023 11:31:10 +0200 Subject: [PATCH 9/9] add file docstring --- tests/test_vector_store.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_vector_store.py b/tests/test_vector_store.py index 97822cc98..297a225ce 100644 --- a/tests/test_vector_store.py +++ b/tests/test_vector_store.py @@ -1,3 +1,8 @@ +""" +Tests regarding the vector store class, including checking +compatibility between different transformers and local vector +stores (index.faiss) +""" import pytest from application.vectorstore.faiss import FaissStore from application.core.settings import settings