From 5cf5b4c71cdde8083e4aace13994b940d1397d78 Mon Sep 17 00:00:00 2001 From: Yuce Tekol Date: Wed, 2 Oct 2024 19:51:43 +0300 Subject: [PATCH 1/7] LangChain integration docs --- .../pages/langchain-integration.adoc | 257 ++++++++++++++++++ 1 file changed, 257 insertions(+) create mode 100644 docs/modules/integrate/pages/langchain-integration.adoc diff --git a/docs/modules/integrate/pages/langchain-integration.adoc b/docs/modules/integrate/pages/langchain-integration.adoc new file mode 100644 index 000000000..3ec9dee21 --- /dev/null +++ b/docs/modules/integrate/pages/langchain-integration.adoc @@ -0,0 +1,257 @@ += LangChain Integration +:description: The Hazelcast integration for LangChain provides a Vector Store implementation that enables using Hazecast Vector Search with LangChain. + +{description} + +== Introduction + +LangChain is a Python framework that makes it easier to create large language model (LLM) based solutions, such as chat bots by linking various components. + +LangChain `VectorStore` interface makes it easier to incorporate RAGs (Retrieval Augmented Generation) in LLM solutions. + +`langchain-hazelcast` package provides the Hazelcast `VectorStore` implementation for LangChain. + +== Installing LangChain/Hazelcast Vector Store + +[source,bash] +---- +pip install langchain-hazelcast +---- + +== Creating a Vector Store + +`Hazelcast` class is the Hazelcast vector store implementation that lives in the `langchain_hazelcast.vectorstore` package. + +The constructor for the `Hazelcast` vector store class takes the following arguments: + +* `embedding: Embeddings`: The embedding producer. This is a required argument. +* `collection_name: str`: Hazelcast `VectorCollection` to use. By default `"langchain"`. +* `client: Optional[HazelcastClient]`: A Hazelcast client object. +* `client_config: Optional[Config]`: A Hazelcast client configuration object. + +`client` and `client_config` arguments are mutually exclusive, they must not be set together. 
+ +If you already have a Hazelcast client object, it is recommended to reuse it using the `client` argument. +Otherwise, you may prefer to create a Hazelcast configuration object first and pass it to the `Hazelcast` vector store constructor. + +The embedding producer must be an instance of LangChain `langchain_core.embeddings.Embeddings` class, such as `HuggingFaceEmbeddings`. +Here is an example: + +[source,python] +---- +from langchain_huggingface import HuggingFaceEmbeddings + +embeddings = HuggingFaceEmbeddings( + model_name="sentence-transformers/all-mpnet-base-v2", + model_kwargs={ + "device": "cpu", + "tokenizer_kwargs": { + "clean_up_tokenization_spaces": True, + }, + }, + encode_kwargs={"normalize_embeddings": False}, +) +---- + +Once you have the embedding producer, you can create the `Hazelcast` vector store instance. +Here's how to create a vector store which uses the default Hazelcast client that connects to the Hazelcast cluster `dev` at `localhost:5701`: + +[source,python] +---- +vector_store = Hazelcast(embeddings) +---- + +The same but with an explicitly created Hazelcast client: + +[source,python] +---- +from hazelcast import HazelcastClient +from hazelcast.config import Config + +config = Config() +config.cluster_members = ["localhost:5701"] +config.cluster_name = "dev" +client = HazelcastClient(config) +vector_store = Hazelcast(embeddings, client=client) +---- + +In case you would like to pass the client configuration without creating the client itself: +[source,python] +---- +from hazelcast import HazelcastClient +from hazelcast.config import Config + +config = Config() +config.cluster_members = ["localhost:5701"] +config.cluster_name = "dev" +vector_store = Hazelcast(embeddings, client_config=config) +---- + +You can find more about the various Hazelcast client configuration options in link:https://hazelcast.readthedocs.io/en/stable/client.html#hazelcast.client.HazelcastClient[Hazelcast Client documentation]. 
+ +Although there is a default name for the underlying Hazelcast VectorCollection, you may want to use a different name. +You can do that by passing the name in the `collection_name` argument to the vector store constructor: +[source,python] +---- +name = "customer-docs" +vector_store = Hazelcast(embeddings, collection_name=name, client=client) +---- + +== Updating the Vector Store + +Once the vector store is created, you can start adding LangChain documents or string data into it. +While adding the data, you have the option to associate identifiers and metadata with them. + +Hazelcast vector store has two methods to add data, `add_documents` and `add_texts`. +Using the former, you can add `langchain_core.documents.Document` objects, and using the latter, you can add strings. + +In the simplest case, you would add one or more strings to the vector store: + +[source,python] +---- +texts = [ + "Hazelcast Platform uniquely combines a distributed compute engine and a fast data store in one runtime.", + "It offers unmatched performance, resilience and scale for real-time and AI-driven applications.", + "It allows you to quickly build resource-efficient, real-time applications.", + "You can deploy it at any scale from small edge devices to a large cluster of cloud instances.", +] +ids = vector_store.add_texts(texts) +for id in ids: + print(id) +---- + +Outputs: +[source,output] +---- +8c28f820-d4ed-4cfa-bac4-89b2d110b380 +b235643b-62c0-4039-9856-1493f921e1a4 +083cc0a4-9221-48bd-b734-0de2b4754bb3 +94b524bd-cdcb-4327-92e9-488ea5d915fd +---- + +`Hazelcast.add_texts` method returns the IDs of the added texts. +If the IDs were not provided to the `add_texts` method, then they are automatically genereated, like in the example above. + +You can provide the IDs manually by passing them in the `ids` parameter. +That may be useful in case you would like to update data instead of extending the vector store. 
+ +[source,python] +---- +ids = vector_store.add_texts( + texts, + ids=["item1", "item2", "item3", "item4"] +) +for id in ids: + print(id) +---- + +If provided, the number of IDs must be equal to the number of texts. + +You can also pass metadata with the text or documents using the `metadatas` parameter. +Each item of the `metadatas` list must be a Python dictionary. +Like IDs, the number of metadata must be equal to the number of texts. + +[source,python] +---- +ids = vector_store.add_texts( + texts, + metadatas=[ + {"page": 1}, + {"page": 1}, + {"page": 1}, + {"page": 2}, + ] +) +---- + +In case you have `langchain_core.documents.Document` objects, you can use the `add_documents` methods to add them to the vector store: + +[source,python] +---- +from langchain_core.documents import Document + +docs = [ + Document( + id="item1", + metadata={"page": 1}, + page_content="Hazelcast Platform uniquely combines a distributed compute engine and a fast data store in one runtime."), + Document( + id="item2", + metadata={"page": 1}, + page_content="It offers unmatched performance, resilience and scale for real-time and AI-driven applications."), + Document( + id="item3", + metadata={"page": 1}, + page_content="It allows you to quickly build resource-efficient, real-time applications."), + Document( + id="item4", + metadata={"page": 2}, + page_content="You can deploy it at any scale from small edge devices to a large cluster of cloud instances."), +] +ids = vector_store.add_documents(docs) +---- + +`Hazelcast` vector store has two class methods that combine creating the vector store and adding texts or documents to it. +Those are `Hazelcast.from_texts` and `Hazelcast.from_documents` methods respectively. +Calling these methods return the `Hazelcast` vector store instance. 
+ +Here is an example that uses the `Hazelcast.from_texts` method: +[source,python] +---- +vector_store = Hazelcast.from_texts(texts, embedding=embeddings, client_config=config) +---- + +== Searching the Vector Store + +Once the vector store is populated, you can run vector similarity searches on it. +The `similarity_search` method of `Hazelcast` vector store takes a string to be used for the search and returns a list of Documents. + +[source,python] +---- +query = "Does Hazelcast enable real-time applications?" +docs = vector_store.similarity_search(query) +for doc in docs: + print(f"{doc.id}: {doc.page_content}") +---- + +You can optionally specify the maximum number of Documents to be returned using the `k` parameter: + +[source,python] +---- +docs = vector_store.similarity_search(query, k=10) +---- + +== Other Vector Store Operations + +You can retrieve Documents in the vector store using the `get_by_ids` method. +This method takes a sequence of IDs and returns the corresponding Documents if they exist. +Note that, the order of the IDs and the returned Documents may not be the same: + +[source,python] +---- +docs = vector_store.get_by_ids([ + "b235643b-62c0-4039-9856-1493f921e1a4", + "24d72bd3-e981-4701-a983-0a7800383fd1", +]) +---- + +To delete some or all Documents, you can use the `delete` method. +It deletes the Documents with the given IDs if one or more IDs provided, or deletes all Documents if no IDs are provided. +This method always returns `True`. 
+The example below deletes only two Documents: + +[source,python] +---- +vector_store.delete([ + "b235643b-62c0-4039-9856-1493f921e1a4", + "24d72bd3-e981-4701-a983-0a7800383fd1", +]) +---- + +And the following example deletes all Documents: + +[source,python] +---- +vector_store.delete() +---- + From 31d64e3604e6ba0f7bee3f14d9af7a6410b60173 Mon Sep 17 00:00:00 2001 From: Yuce Tekol Date: Wed, 2 Oct 2024 20:09:53 +0300 Subject: [PATCH 2/7] Updated nav --- docs/modules/ROOT/nav.adoc | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc index 758fba3e1..f9535f988 100644 --- a/docs/modules/ROOT/nav.adoc +++ b/docs/modules/ROOT/nav.adoc @@ -179,6 +179,7 @@ include::wan:partial$nav.adoc[] ** xref:spring:hibernate.adoc[] ** xref:spring:transaction-manager.adoc[] ** xref:spring:best-practices.adoc[] +* xref:integrate:langchain-integration.adoc[] * xref:integrate:integrate-with-feast.adoc[] ** xref:integrate:install-connect.adoc[Install and connect Feast] ** xref:integrate:feast-config.adoc[] From 86f7c7877cc8e82655a287828cbc3a39a6708dd0 Mon Sep 17 00:00:00 2001 From: Yuce Tekol Date: Thu, 10 Oct 2024 15:57:57 +0300 Subject: [PATCH 3/7] Renamed the page to conform to other page names --- docs/modules/ROOT/nav.adoc | 2 +- ...langchain-integration.adoc => integrate-with-langchain.adoc} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename docs/modules/integrate/pages/{langchain-integration.adoc => integrate-with-langchain.adoc} (99%) diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc index be665e4a7..085fc9770 100644 --- a/docs/modules/ROOT/nav.adoc +++ b/docs/modules/ROOT/nav.adoc @@ -179,7 +179,7 @@ include::wan:partial$nav.adoc[] ** xref:spring:hibernate.adoc[] ** xref:spring:transaction-manager.adoc[] ** xref:spring:best-practices.adoc[] -* xref:integrate:langchain-integration.adoc[] +* xref:integrate:integrate-with-langchain.adoc[] * xref:integrate:integrate-with-feast.adoc[] ** 
xref:integrate:install-connect.adoc[Install and connect Feast] ** xref:integrate:feast-config.adoc[] diff --git a/docs/modules/integrate/pages/langchain-integration.adoc b/docs/modules/integrate/pages/integrate-with-langchain.adoc similarity index 99% rename from docs/modules/integrate/pages/langchain-integration.adoc rename to docs/modules/integrate/pages/integrate-with-langchain.adoc index 3ec9dee21..d6235e989 100644 --- a/docs/modules/integrate/pages/langchain-integration.adoc +++ b/docs/modules/integrate/pages/integrate-with-langchain.adoc @@ -1,4 +1,4 @@ -= LangChain Integration += Integrate with LangChain :description: The Hazelcast integration for LangChain provides a Vector Store implementation that enables using Hazecast Vector Search with LangChain. {description} From cfce797f919233fd938130cf7394b6f76d8bc9c0 Mon Sep 17 00:00:00 2001 From: Yuce Tekol Date: Wed, 16 Oct 2024 17:16:49 +0300 Subject: [PATCH 4/7] Review comments --- .../integrate/pages/integrate-with-langchain.adoc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/modules/integrate/pages/integrate-with-langchain.adoc b/docs/modules/integrate/pages/integrate-with-langchain.adoc index d6235e989..41c82c42d 100644 --- a/docs/modules/integrate/pages/integrate-with-langchain.adoc +++ b/docs/modules/integrate/pages/integrate-with-langchain.adoc @@ -100,7 +100,7 @@ vector_store = Hazelcast(embeddings, collection_name=name, client=client) == Updating the Vector Store Once the vector store is created, you can start adding LangChain documents or string data into it. -While adding the data, you have the option to associate identifiers and metadata with them. +While adding the data, you have the option to associate identifiers and metadata with it. Hazelcast vector store has two methods to add data, `add_documents` and `add_texts`. Using the former, you can add `langchain_core.documents.Document` objects, and using the latter, you can add strings. 
@@ -130,10 +130,10 @@ b235643b-62c0-4039-9856-1493f921e1a4 ---- `Hazelcast.add_texts` method returns the IDs of the added texts. -If the IDs were not provided to the `add_texts` method, then they are automatically genereated, like in the example above. +If the IDs were not provided to the `add_texts` method, then they are automatically generated, like in the example above. You can provide the IDs manually by passing them in the `ids` parameter. -That may be useful in case you would like to update data instead of extending the vector store. +This is useful when you want to update data instead of extending the vector store. [source,python] ---- @@ -164,7 +164,7 @@ ids = vector_store.add_texts( ) ---- -In case you have `langchain_core.documents.Document` objects, you can use the `add_documents` methods to add them to the vector store: +If you have `langchain_core.documents.Document` objects, you can use the `add_documents` methods to add them to the vector store: [source,python] ---- @@ -192,8 +192,8 @@ ids = vector_store.add_documents(docs) ---- `Hazelcast` vector store has two class methods that combine creating the vector store and adding texts or documents to it. -Those are `Hazelcast.from_texts` and `Hazelcast.from_documents` methods respectively. -Calling these methods return the `Hazelcast` vector store instance. +These are the `Hazelcast.from_texts` and `Hazelcast.from_documents` methods respectively. +Calling these methods returns the `Hazelcast` vector store instance. Here is an example that uses the `Hazelcast.from_texts` method: [source,python] @@ -236,7 +236,7 @@ docs = vector_store.get_by_ids([ ---- To delete some or all Documents, you can use the `delete` method. -It deletes the Documents with the given IDs if one or more IDs provided, or deletes all Documents if no IDs are provided. +It deletes the Documents with the given IDs if one or more IDs are provided, or deletes all Documents if no IDs are provided. This method always returns `True`. 
The example below deletes only two Documents: From 9d2dd52aa6dfd96a4beaba61fa11d285b842ad96 Mon Sep 17 00:00:00 2001 From: Yuce Tekol Date: Fri, 25 Oct 2024 13:52:15 +0300 Subject: [PATCH 5/7] Added the initial Langchian4J doc --- .../pages/integrate-with-langchain-java.adoc | 257 ++++++++++++++++++ 1 file changed, 257 insertions(+) create mode 100644 docs/modules/integrate/pages/integrate-with-langchain-java.adoc diff --git a/docs/modules/integrate/pages/integrate-with-langchain-java.adoc b/docs/modules/integrate/pages/integrate-with-langchain-java.adoc new file mode 100644 index 000000000..7b58fa158 --- /dev/null +++ b/docs/modules/integrate/pages/integrate-with-langchain-java.adoc @@ -0,0 +1,257 @@ += Integrate with LangChain for Java +:description: The Hazelcast integration for LangChain provides a Vector Store implementation that enables using Hazelcast Vector Search with LangChain. + +{description} + +== Introduction + +LangChain is a Python framework that makes it easier to create large language model (LLM) based solutions, such as chat bots by linking various components. + +LangChain `VectorStore` interface makes it easier to incorporate RAGs (Retrieval Augmented Generation) in LLM solutions. + +`langchain-hazelcast` package provides the Hazelcast `VectorStore` implementation for LangChain. + +== Installing LangChain/Hazelcast Vector Store + +[source,bash] +---- +pip install langchain-hazelcast +---- + +== Creating a Vector Store + +`Hazelcast` class is the Hazelcast vector store implementation that lives in the `langchain_hazelcast.vectorstore` package. + +The constructor for the `Hazelcast` vector store class takes the following arguments: + +* `embedding: Embeddings`: The embedding producer. This is a required argument. +* `collection_name: str`: Hazelcast `VectorCollection` to use. By default `"langchain"`. +* `client: Optional[HazelcastClient]`: A Hazelcast client object. +* `client_config: Optional[Config]`: A Hazelcast client configuration object. 
+ +`client` and `client_config` arguments are mutually exclusive, they must not be set together. + +If you already have a Hazelcast client object, it is recommended to reuse it using the `client` argument. +Otherwise, you may prefer to create a Hazelcast configuration object first and pass it to the `Hazelcast` vector store constructor. + +The embedding producer must be an instance of LangChain `langchain_core.embeddings.Embeddings` class, such as `HuggingFaceEmbeddings`. +Here is an example: + +[source,python] +---- +from langchain_huggingface import HuggingFaceEmbeddings + +embeddings = HuggingFaceEmbeddings( + model_name="sentence-transformers/all-mpnet-base-v2", + model_kwargs={ + "device": "cpu", + "tokenizer_kwargs": { + "clean_up_tokenization_spaces": True, + }, + }, + encode_kwargs={"normalize_embeddings": False}, +) +---- + +Once you have the embedding producer, you can create the `Hazelcast` vector store instance. +Here's how to create a vector store which uses the default Hazelcast client that connects to the Hazelcast cluster `dev` at `localhost:5701`: + +[source,python] +---- +vector_store = Hazelcast(embeddings) +---- + +The same but with an explicitly created Hazelcast client: + +[source,python] +---- +from hazelcast import HazelcastClient +from hazelcast.config import Config + +config = Config() +config.cluster_members = ["localhost:5701"] +config.cluster_name = "dev" +client = HazelcastClient(config) +vector_store = Hazelcast(embeddings, client=client) +---- + +In case you would like to pass the client configuration without creating the client itself: +[source,python] +---- +from hazelcast import HazelcastClient +from hazelcast.config import Config + +config = Config() +config.cluster_members = ["localhost:5701"] +config.cluster_name = "dev" +vector_store = Hazelcast(embeddings, client_config=config) +---- + +You can find more about the various Hazelcast client configuration options in 
link:https://hazelcast.readthedocs.io/en/stable/client.html#hazelcast.client.HazelcastClient[Hazelcast Client documentation]. + +Although there is a default name for the underlying Hazelcast VectorCollection, you may want to use a different name. +You can do that by passing the name in the `collection_name` argument to the vector store constructor: +[source,python] +---- +name = "customer-docs" +vector_store = Hazelcast(embeddings, collection_name=name, client=client) +---- + +== Updating the Vector Store + +Once the vector store is created, you can start adding LangChain documents or string data into it. +While adding the data, you have the option to associate identifiers and metadata with it. + +Hazelcast vector store has two methods to add data, `add_documents` and `add_texts`. +Using the former, you can add `langchain_core.documents.Document` objects, and using the latter, you can add strings. + +In the simplest case, you would add one or more strings to the vector store: + +[source,python] +---- +texts = [ + "Hazelcast Platform uniquely combines a distributed compute engine and a fast data store in one runtime.", + "It offers unmatched performance, resilience and scale for real-time and AI-driven applications.", + "It allows you to quickly build resource-efficient, real-time applications.", + "You can deploy it at any scale from small edge devices to a large cluster of cloud instances.", +] +ids = vector_store.add_texts(texts) +for id in ids: + print(id) +---- + +Outputs: +[source,output] +---- +8c28f820-d4ed-4cfa-bac4-89b2d110b380 +b235643b-62c0-4039-9856-1493f921e1a4 +083cc0a4-9221-48bd-b734-0de2b4754bb3 +94b524bd-cdcb-4327-92e9-488ea5d915fd +---- + +`Hazelcast.add_texts` method returns the IDs of the added texts. +If the IDs were not provided to the `add_texts` method, then they are automatically generated, like in the example above. + +You can provide the IDs manually by passing them in the `ids` parameter. 
+This is useful when you want to update data instead of extending the vector store. + +[source,python] +---- +ids = vector_store.add_texts( + texts, + ids=["item1", "item2", "item3", "item4"] +) +for id in ids: + print(id) +---- + +If provided, the number of IDs must be equal to the number of texts. + +You can also pass metadata with the text or documents using the `metadatas` parameter. +Each item of the `metadatas` list must be a Python dictionary. +Like IDs, the number of metadata must be equal to the number of texts. + +[source,python] +---- +ids = vector_store.add_texts( + texts, + metadata=[ + {"page": 1}, + {"page": 1}, + {"page": 1}, + {"page": 2}, + ] +) +---- + +If you have `langchain_core.documents.Document` objects, you can use the `add_documents` methods to add them to the vector store: + +[source,python] +---- +from langchain_core.documents import Document + +docs = [ + Document( + id="item1", + metadata={"page": 1}, + page_content="Hazelcast Platform uniquely combines a distributed compute engine and a fast data store in one runtime."), + Document( + id="item2", + metadata={"page": 1}, + page_content="It offers unmatched performance, resilience and scale for real-time and AI-driven applications."), + Document( + id="item3", + metadata={"page": 1}, + page_content="It allows you to quickly build resource-efficient, real-time applications."), + Document( + id="item4", + metadata={"page": 2}, + page_content="You can deploy it at any scale from small edge devices to a large cluster of cloud instances."), +] +ids = vector_store.add_documents(docs) +---- + +`Hazelcast` vector store has two class methods that combine creating the vector store and adding texts or documents to it. +These are the `Hazelcast.from_texts` and `Hazelcast.from_documents` methods respectively. +Calling these methods returns the `Hazelcast` vector store instance. 
+ +Here is an example that uses the `Hazelcast.from_texts` method: +[source,python] +---- +vector_store = Hazelcast.from_texts(texts, embedding=embeddings, client_config=config) +---- + +== Searching the Vector Store + +Once the vector store is populated, you can run vector similarity searches on it. +The `similarity_search` method of `Hazelcast` vector store takes a string to be used for the search and returns a list of Documents. + +[source,python] +---- +query = "Does Hazelcast enable real-time applications?" +docs = vector_store.similarity_search(query) +for doc in docs: + print(f"{doc.id}: {doc.page_content}") +---- + +You can optionally specify the maximum number of Documents to be returned using the `k` parameter: + +[source,python] +---- +docs = vector_store.similarity_search(query, k=10) +---- + +== Other Vector Store Operations + +You can retrieve Documents in the vector store using the `get_by_ids` method. +This method takes a sequence of IDs and returns the corresponding Documents if they exist. +Note that, the order of the IDs and the returned Documents may not be the same: + +[source,python] +---- +docs = vector_store.get_by_ids([ + "b235643b-62c0-4039-9856-1493f921e1a4", + "24d72bd3-e981-4701-a983-0a7800383fd1", +]) +---- + +To delete some or all Documents, you can use the `delete` method. +It deletes the Documents with the given IDs if one or more IDs are provided, or deletes all Documents if no IDs are provided. +This method always returns `True`. 
+The example below deletes only two Documents: + +[source,python] +---- +vector_store.delete([ + "b235643b-62c0-4039-9856-1493f921e1a4", + "24d72bd3-e981-4701-a983-0a7800383fd1", +]) +---- + +And the following example deletes all Documents: + +[source,python] +---- +vector_store.delete() +---- + From e6495d69a4b4e4a7ef5e45012b00ac39042e1a78 Mon Sep 17 00:00:00 2001 From: Yuce Tekol Date: Fri, 1 Nov 2024 13:07:34 +0300 Subject: [PATCH 6/7] Added the Langchain4j document --- .../pages/integrate-with-langchain-java.adoc | 346 +++++++++--------- 1 file changed, 182 insertions(+), 164 deletions(-) diff --git a/docs/modules/integrate/pages/integrate-with-langchain-java.adoc b/docs/modules/integrate/pages/integrate-with-langchain-java.adoc index 7b58fa158..1ff470db0 100644 --- a/docs/modules/integrate/pages/integrate-with-langchain-java.adoc +++ b/docs/modules/integrate/pages/integrate-with-langchain-java.adoc @@ -5,253 +5,271 @@ == Introduction -LangChain is a Python framework that makes it easier to create large language model (LLM) based solutions, such as chat bots by linking various components. +LangChain4J is a Java framework that makes it easier to create large language model (LLM) based solutions, such as chat bots by linking various components. -LangChain `VectorStore` interface makes it easier to incorporate RAGs (Retrieval Augmented Generation) in LLM solutions. +LangChain4J `EmbeddingStore` interface makes it easier to incorporate RAGs (Retrieval Augmented Generation) in LLM solutions. -`langchain-hazelcast` package provides the Hazelcast `VectorStore` implementation for LangChain. +`hazelcast.com:langchain-hazelcast` package provides the Hazelcast `EmbeddingStore` implementation for LangChain. 
-== Installing LangChain/Hazelcast Vector Store +== Installing LangChain/Hazelcast Embedding Store -[source,bash] +Add the following to your `pom.xml`: + +[source,xml] ---- -pip install langchain-hazelcast +<dependency> + <groupId>com.hazelcast</groupId> + <artifactId>langchain-hazelcast</artifactId> + <version>6.0.0</version> +</dependency> ---- -== Creating a Vector Store +== Creating an Embedding Store -`Hazelcast` class is the Hazelcast vector store implementation that lives in the `langchain_hazelcast.vectorstore` package. +`HazelcastEmbeddingStore` class is the Hazelcast embedding store implementation that lives in the `com.hazelcast:langchain-hazelcast` package. +But before creating the embedding store, you must create an instance of the embedding model itself. +The model instance will be used to generate the embeddings for adding text documents and searching them. +In the sample below, we used `AllMiniLmL6V2QuantizedEmbeddingModel`, but you can use any embedding model. -The constructor for the `Hazelcast` vector store class takes the following arguments: +[source,java] +---- +var embeddingModel = new AllMiniLmL6V2QuantizedEmbeddingModel(); +---- -* `embedding: Embeddings`: The embedding producer. This is a required argument. -* `collection_name: str`: Hazelcast `VectorCollection` to use. By default `"langchain"`. -* `client: Optional[HazelcastClient]`: A Hazelcast client object. -* `client_config: Optional[Config]`: A Hazelcast client configuration object. +To create an instance of `HazelcastEmbeddingStore`, use its `builder` method, using the dimension of the embedding model: -`client` and `client_config` arguments are mutually exclusive, they must not be set together. +[source,java] +---- +var store = HazelcastEmbeddingStore.builder(embeddingModel.dimension()) + // ... + .build(); +---- -If you already have a Hazelcast client object, it is recommended to reuse it using the `client` argument. -Otherwise, you may prefer to create a Hazelcast configuration object first and pass it to the `Hazelcast` vector store constructor. 
+The `builder` method creates an instance of `HazelcastEmbeddingStore.Builder`. +`HazelcastEmbeddingStore` needs to communicate with a Hazelcast Enterprise cluster in order to send embeddings and retrieve search results. +Cluster configuration parameters can be supplied using one of the alternative methods below: -The embedding producer must be an instance of LangChain `langchain_core.embeddings.Embeddings` class, such as `HuggingFaceEmbeddings`. -Here is an example: +* Using Hazelcast Client XML configuration by calling `builder.clientConfigFromXml(path or stream)` +* Using Hazelcast Client YAML configuration by calling `builder.clientConfigFromYaml(path or stream)` +* Setting cluster configuration directly using `builder.clusterName` and one of `builder.address` or `builder.addresses`. -[source,python] ----- -from langchain_huggingface import HuggingFaceEmbeddings +The last method of setting the cluster configuration is useful during development and when the cluster requires very little configuration. +The following code snippet uses simple cluster configuration: -embeddings = HuggingFaceEmbeddings( - model_name="sentence-transformers/all-mpnet-base-v2", - model_kwargs={ - "device": "cpu", - "tokenizer_kwargs": { - "clean_up_tokenization_spaces": True, - }, - }, - encode_kwargs={"normalize_embeddings": False}, -) +[source,java] +---- +var store = HazelcastEmbeddingStore.builder(embeddingModel.dimension()) + .clusterName("dev") + .address("localhost:5701") + .build(); ---- -Once you have the embedding producer, you can create the `Hazelcast` vector store instance. 
-Here's how to create a vector store which uses the default Hazelcast client that connects to the Hazelcast cluster `dev` at `localhost:5701`: +Code above is equivalent to the one below, since it uses the defaults: -[source,python] +[source,java] ---- -vector_store = Hazelcast(embeddings) +var store = HazelcastEmbeddingStore.builder(embeddingModel.dimension()) + .build(); ---- -The same but with an explicitly created Hazelcast client: +Use the XML/YAML configuration method when you already have Hazelcast Client configuration in XML/YAML, or the cluster requires more advanced features, such as authentication, TLS etc. -[source,python] +The example below shows how to use a Hazelcast Client XML configuration: + +[source,java] +---- +var store = HazelcastEmbeddingStore.builder(embeddingModel.dimension()) + .clientConfigFromXml("client.xml") + .build(); ---- -from hazelcast import HazelcastClient -from hazelcast.config import Config -config = Config() -config.cluster_members = ["localhost:5701"] -config.cluster_name = "dev" -client = HazelcastClient(config) -vector_store = Hazelcast(embeddings, client=client) +`client.xml` looks like this: + +[source,xml] ---- + + + + dev -In case you would like to pass the client configuration without creating the client itself: -[source,python] + + +
localhost:5701
+
+
+ +
---- -from hazelcast import HazelcastClient -from hazelcast.config import Config -config = Config() -config.cluster_members = ["localhost:5701"] -config.cluster_name = "dev" -vector_store = Hazelcast(embeddings, client_config=config) +You can find more information about client XML configuration at xref:clients:java.adoc[] documentation. + +Using client YAML configuration with `clientConfigFromYaml` is similar to how XML configuration is used: + +[source,java] +---- +var store = HazelcastEmbeddingStore.builder(embeddingModel.dimension()) + .clientConfigFromYaml("client.yaml") + .build(); ---- -You can find more about the various Hazelcast client configuration options in link:https://hazelcast.readthedocs.io/en/stable/client.html#hazelcast.client.HazelcastClient[Hazelcast Client documentation]. +`client.yaml` used above looks like this: -Although there is a default name for the underlying Hazelcast VectorCollection, you may want to use a different name. -You can do that by passing the name in the `collection_name` argument to the vector store constructor: -[source,python] +[source,yaml] ---- -name = "customer-docs" -vector_store = Hazelcast(embeddings, collection_name=name, client=client) +hazelcast-client: + cluster-name: dev + network: + cluster-members: + - localhost:5701 ---- -== Updating the Vector Store +== Updating the Embedding Store Once the vector store is created, you can start adding LangChain documents or string data into it. While adding the data, you have the option to associate identifiers and metadata with it. +Hazelcast embedding store supports a few ways of adding embeddings and text documents. -Hazelcast vector store has two methods to add data, `add_documents` and `add_texts`. -Using the former, you can add `langchain_core.documents.Document` objects, and using the latter, you can add strings. - -In the simplest case, you would add one or more strings to the vector store: +The simplest case is adding a single embedding. 
+An identifier is randomly created in this case: -[source,python] +[source,java] ---- -texts = [ - "Hazelcast Platform uniquely combines a distributed compute engine and a fast data store in one runtime.", - "It offers unmatched performance, resilience and scale for real-time and AI-driven applications.", - "It allows you to quickly build resource-efficient, real-time applications.", - "You can deploy it at any scale from small edge devices to a large cluster of cloud instances.", -] -ids = vector_store.add_texts(texts) -for id in ids: - print(id) +var text = "Hazelcast provides a simple scheme for controlling which partitions data resides in."; +var embedding = embeddingModel.embed(text).content(); +var id = store.add(embedding); ---- -Outputs: -[source,output] +You can also add an embedding and associate an identifier with it: + +[source,java] ---- -8c28f820-d4ed-4cfa-bac4-89b2d110b380 -b235643b-62c0-4039-9856-1493f921e1a4 -083cc0a4-9221-48bd-b734-0de2b4754bb3 -94b524bd-cdcb-4327-92e9-488ea5d915fd +var id = UUID.randomUUID().toString(); +store.add(id, embedding); ---- -`Hazelcast.add_texts` method returns the IDs of the added texts. -If the IDs were not provided to the `add_texts` method, then they are automatically generated, like in the example above. +To store an embedding and the corresponding text document, pass them to the `add` method. +The corresponding identifier is randomly created: -You can provide the IDs manually by passing them in the `ids` parameter. -This is useful when you want to update data instead of extending the vector store.
+[source,java] +---- +var document = TextSegment.from(text); +var id = store.add(embedding, document); +---- + +You have the option to attach metadata to the document too: -[source,python] +[source,java] ---- -ids = vector_store.add_texts( - texts, - ids=["item1", "item2", "item3", "item4"] -) -for id in ids: - print(id) +var metadata = new Metadata(); +metadata.put("page", 7); +var document = TextSegment.from(text, metadata); +var id = store.add(embedding, document); ---- -If provided, the number of IDs must be equal to the number of texts. +Metadata keys must be of type `String`, but values can be one of the following types: + `String`, `Integer`, `Long`, `Float`, `Double`. + -You can also pass metadata with the text or documents using the `metadatas` parameter. -Each item of the `metadatas` list must be a Python dictionary. -Like IDs, the number of metadata must be equal to the number of texts. +You can add an embedding and document with a predefined identifier: -[source,python] +[source,java] ---- -ids = vector_store.add_texts( - texts, - metadata=[ - {"page": 1}, - {"page": 1}, - {"page": 1}, - {"page": 2}, - ] -) +store.add(id, embedding, document); ---- -If you have `langchain_core.documents.Document` objects, you can use the `add_documents` methods to add them to the vector store: +In case you have more than one embedding or document to add, it is more efficient to use one of the `addAll` methods.
-[source,python] +Calling `addAll` with only the list of embeddings stores those embeddings with autogenerated identifiers: + +[source,java] +---- +var embeddings = new ArrayList<Embedding>(); +for (String text : texts) { + var embedding = embeddingModel.embed(text).content(); + embeddings.add(embedding); +} +var ids = store.addAll(embeddings); ---- -from langchain_core.documents import Document -docs = [ - Document( - id="item1", - metadata={"page": 1}, - page_content="Hazelcast Platform uniquely combines a distributed compute engine and a fast data store in one runtime."), - Document( - id="item2", - metadata={"page": 1}, - page_content="It offers unmatched performance, resilience and scale for real-time and AI-driven applications."), - Document( - id="item3", - metadata={"page": 1}, - page_content="It allows you to quickly build resource-efficient, real-time applications."), - Document( - id="item4", - metadata={"page": 2}, - page_content="You can deploy it at any scale from small edge devices to a large cluster of cloud instances."), -] -ids = vector_store.add_documents(docs) +Similarly, calling `addAll` with the list of embeddings and documents stores them with autogenerated identifiers. +The number of items in those lists must be the same: + +[source,java] +---- +var documents = new ArrayList<TextSegment>(); +for (String text : texts) { + documents.add(TextSegment.from(text)); +} +var ids = store.addAll(embeddings, documents); ---- -`Hazelcast` vector store has two class methods that combine creating the vector store and adding texts or documents to it. -These are the `Hazelcast.from_texts` and `Hazelcast.from_documents` methods respectively. -Calling these methods returns the `Hazelcast` vector store instance. +You also have the option to specify the identifiers manually.
+The number of identifiers must match the number of items in the embeddings and documents lists: -Here is an example that uses the `Hazelcast.from_texts` method: -[source,python] +[source,java] ---- -vector_store = Hazelcast.from_texts(texts, embedding=embeddings, client_config=config) +var ids = new ArrayList<String>(); +for (int i = 0; i < texts.size(); i++) { + ids.add(String.valueOf(i)); +} +store.addAll(ids, embeddings, documents); ---- == Searching the Vector Store -Once the vector store is populated, you can run vector similarity searches on it. -The `similarity_search` method of `Hazelcast` vector store takes a string to be used for the search and returns a list of Documents. +Once the embedding store is populated, you can run vector similarity searches on it. +The `search` method of `Hazelcast` embedding store takes an `EmbeddingSearchRequest` instance to be used for the search and returns an `EmbeddingSearchResult` object: -[source,python] +[source,java] ---- -query = "Does Hazelcast enable real-time applications?"
-docs = vector_store.similarity_search(query) -for doc in docs: - print(f"{doc.id}: {doc.page_content}") +var query = "What was Hazelcast designed for?"; +var embedding = embeddingModel.embed(query).content(); +EmbeddingSearchRequest req = + EmbeddingSearchRequest.builder() + .queryEmbedding(embedding) + .build(); +var results = store.search(req).matches(); +for (var result : results) { + var document = result.embedded(); + System.out.println(document.text()); +} ---- -You can optionally specify the maximum number of Documents to be returned using the `k` parameter: +You can optionally specify the maximum number of Documents to be returned using the `maxResults` method of the search request builder: -[source,python] +[source,java] ---- -docs = vector_store.similarity_search(query, k=10) +EmbeddingSearchRequest req = + EmbeddingSearchRequest.builder() + .queryEmbedding(embedding) + .maxResults(3) + .build(); ---- -== Other Vector Store Operations +Currently, other methods of the search request builder are not supported. + +== Deleting Data From Embedding Store -You can retrieve Documents in the vector store using the `get_by_ids` method. -This method takes a sequence of IDs and returns the corresponding Documents if they exist. -Note that, the order of the IDs and the returned Documents may not be the same: +To delete a single embedding and the corresponding document, you can call the `remove` method of the embedding store with the identifier of the embedding: -[source,python] +[source,java] ---- -docs = vector_store.get_by_ids([ - "b235643b-62c0-4039-9856-1493f921e1a4", - "24d72bd3-e981-4701-a983-0a7800383fd1", -]) +store.remove(id); ---- -To delete some or all Documents, you can use the `delete` method. -It deletes the Documents with the given IDs if one or more IDs are provided, or deletes all Documents if no IDs are provided. -This method always returns `True`. 
-The example below deletes only two Documents: +If you have a number of embeddings to delete, using the `removeAll` method is more efficient: -[source,python] +[source,java] ---- -vector_store.delete([ - "b235643b-62c0-4039-9856-1493f921e1a4", - "24d72bd3-e981-4701-a983-0a7800383fd1", -]) +store.removeAll(ids); ---- -And the following example deletes all Documents: +To delete all embeddings from the embedding store, call `removeAll` with no arguments: -[source,python] +[source,java] ---- -vector_store.delete() +store.removeAll(); ---- - From ac042a2664293c88cb2285b91d946fe802ad8af8 Mon Sep 17 00:00:00 2001 From: Yuce Tekol Date: Thu, 14 Nov 2024 11:29:40 +0300 Subject: [PATCH 7/7] ADded langchain4j to nav --- docs/modules/ROOT/nav.adoc | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc index c03564d2a..4986173a7 100644 --- a/docs/modules/ROOT/nav.adoc +++ b/docs/modules/ROOT/nav.adoc @@ -180,6 +180,7 @@ include::wan:partial$nav.adoc[] ** xref:spring:transaction-manager.adoc[] ** xref:spring:best-practices.adoc[] * xref:integrate:integrate-with-langchain.adoc[] +* xref:integrate:integrate-with-langchain-java.adoc[] * xref:integrate:integrate-with-feast.adoc[] ** xref:integrate:install-connect.adoc[Install and connect Feast] ** xref:integrate:feast-config.adoc[]