feat: add support for variable chunk length (Mintplex-Labs#415)

fix: cleanup code for embedding length clarify resolves Mintplex-Labs#388
franzbischoff · Dec 8, 2023 · 8cc1455 · 8cc1455
1 parent 48dd99b
commit 8cc1455
Show file tree

Hide file tree

Showing 15 changed files with 89 additions and 24 deletions.
diff --git a/docker/.env.example b/docker/.env.example
@@ -47,6 +47,7 @@ GID='1000'
 # EMBEDDING_ENGINE='localai'
 # EMBEDDING_BASE_PATH='https://localhost:8080/v1'
 # EMBEDDING_MODEL_PREF='text-embedding-ada-002'
+# EMBEDDING_MODEL_MAX_CHUNK_LENGTH=1000 # The max length in chars of a single string sent for embedding
 
 ###########################################
 ######## Vector Database Selection ########

diff --git a/frontend/src/components/EmbeddingSelection/LocalAiOptions/index.jsx b/frontend/src/components/EmbeddingSelection/LocalAiOptions/index.jsx
@@ -30,6 +30,22 @@ export default function LocalAiOptions({ settings }) {
         />
       </div>
       <LocalAIModelSelection settings={settings} basePath={basePath} />
+      <div className="flex flex-col w-60">
+        <label className="text-white text-sm font-semibold block mb-4">
+          Max embedding chunk length
+        </label>
+        <input
+          type="number"
+          name="EmbeddingModelMaxChunkLength"
+          className="bg-zinc-900 text-white placeholder-white placeholder-opacity-60 text-sm rounded-lg focus:border-white block w-full p-2.5"
+          placeholder="1000"
+          min={1}
+          onScroll={(e) => e.target.blur()}
+          defaultValue={settings?.EmbeddingModelMaxChunkLength}
+          required={false}
+          autoComplete="off"
+        />
+      </div>
     </>
   );
 }

diff --git a/server/.env.example b/server/.env.example
@@ -44,6 +44,7 @@ JWT_SECRET="my-random-string-for-seeding" # Please generate random string at lea
 # EMBEDDING_ENGINE='localai'
 # EMBEDDING_BASE_PATH='https://localhost:8080/v1'
 # EMBEDDING_MODEL_PREF='text-embedding-ada-002'
+# EMBEDDING_MODEL_MAX_CHUNK_LENGTH=1000 # The max length in chars of a single string sent for embedding
 
 ###########################################
 ######## Vector Database Selection ########

diff --git a/server/models/systemSettings.js b/server/models/systemSettings.js
@@ -27,6 +27,8 @@ const SystemSettings = {
       EmbeddingEngine: process.env.EMBEDDING_ENGINE,
       EmbeddingBasePath: process.env.EMBEDDING_BASE_PATH,
       EmbeddingModelPref: process.env.EMBEDDING_MODEL_PREF,
+      EmbeddingModelMaxChunkLength:
+        process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH,
       ...(vectorDB === "pinecone"
         ? {
             PineConeEnvironment: process.env.PINECONE_ENVIRONMENT,

diff --git a/server/utils/EmbeddingEngines/azureOpenAi/index.js b/server/utils/EmbeddingEngines/azureOpenAi/index.js
@@ -16,7 +16,7 @@ class AzureOpenAiEmbedder {
 
     // The maximum amount of "inputs" that OpenAI API can process in a single call.
     // https://learn.microsoft.com/en-us/azure/ai-services/openai/faq#i-am-trying-to-use-embeddings-and-received-the-error--invalidrequesterror--too-many-inputs--the-max-number-of-inputs-is-1---how-do-i-fix-this-:~:text=consisting%20of%20up%20to%2016%20inputs%20per%20API%20request
-    this.embeddingChunkLimit = 16;
+    this.embeddingMaxChunkLength = 16;
   }
 
   async embedTextInput(textInput) {
@@ -34,9 +34,9 @@ class AzureOpenAiEmbedder {
 
     // Because there is a limit on how many chunks can be sent at once to Azure OpenAI
     // we concurrently execute each max batch of text chunks possible.
-    // Refer to constructor embeddingChunkLimit for more info.
+    // Refer to constructor embeddingMaxChunkLength for more info.
     const embeddingRequests = [];
-    for (const chunk of toChunks(textChunks, this.embeddingChunkLimit)) {
+    for (const chunk of toChunks(textChunks, this.embeddingMaxChunkLength)) {
       embeddingRequests.push(
         new Promise((resolve) => {
           this.openai

diff --git a/server/utils/EmbeddingEngines/localAi/index.js b/server/utils/EmbeddingEngines/localAi/index.js
@@ -1,4 +1,4 @@
-const { toChunks } = require("../../helpers");
+const { toChunks, maximumChunkLength } = require("../../helpers");
 
 class LocalAiEmbedder {
   constructor() {
@@ -12,8 +12,8 @@ class LocalAiEmbedder {
     });
     this.openai = new OpenAIApi(config);
 
-    // Arbitrary limit to ensure we stay within reasonable POST request size.
-    this.embeddingChunkLimit = 1_000;
+    // Arbitrary limit of string size in chars to ensure we stay within reasonable POST request size.
+    this.embeddingMaxChunkLength = maximumChunkLength();
   }
 
   async embedTextInput(textInput) {
@@ -23,7 +23,7 @@ class LocalAiEmbedder {
 
   async embedChunks(textChunks = []) {
     const embeddingRequests = [];
-    for (const chunk of toChunks(textChunks, this.embeddingChunkLimit)) {
+    for (const chunk of toChunks(textChunks, this.embeddingMaxChunkLength)) {
       embeddingRequests.push(
         new Promise((resolve) => {
           this.openai

diff --git a/server/utils/EmbeddingEngines/native/index.js b/server/utils/EmbeddingEngines/native/index.js
@@ -4,6 +4,7 @@ const { toChunks } = require("../../helpers");
 
 class NativeEmbedder {
   constructor() {
+    // Model Card: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
     this.model = "Xenova/all-MiniLM-L6-v2";
     this.cacheDir = path.resolve(
       process.env.STORAGE_DIR
@@ -12,8 +13,8 @@ class NativeEmbedder {
     );
     this.modelPath = path.resolve(this.cacheDir, "Xenova", "all-MiniLM-L6-v2");
 
-    // Limit the number of chunks to send per loop to not overload compute.
-    this.embeddingChunkLimit = 16;
+    // Max size in chars of a single string to embed. NOTE(review): embedChunks also passes this value to toChunks as the per-loop batch size (previously 16) - confirm this is intended.
+    this.embeddingMaxChunkLength = 1_000;
 
     // Make directory when it does not exist in existing installations
     if (!fs.existsSync(this.cacheDir)) fs.mkdirSync(this.cacheDir);
@@ -62,7 +63,7 @@ class NativeEmbedder {
   async embedChunks(textChunks = []) {
     const Embedder = await this.embedderClient();
     const embeddingResults = [];
-    for (const chunk of toChunks(textChunks, this.embeddingChunkLimit)) {
+    for (const chunk of toChunks(textChunks, this.embeddingMaxChunkLength)) {
       const output = await Embedder(chunk, {
         pooling: "mean",
         normalize: true,

diff --git a/server/utils/EmbeddingEngines/openAi/index.js b/server/utils/EmbeddingEngines/openAi/index.js
@@ -10,8 +10,8 @@ class OpenAiEmbedder {
     const openai = new OpenAIApi(config);
     this.openai = openai;
 
-    // Arbitrary limit to ensure we stay within reasonable POST request size.
-    this.embeddingChunkLimit = 1_000;
+    // Arbitrary limit of string size in chars to ensure we stay within reasonable POST request size.
+    this.embeddingMaxChunkLength = 1_000;
   }
 
   async embedTextInput(textInput) {
@@ -22,9 +22,9 @@ class OpenAiEmbedder {
   async embedChunks(textChunks = []) {
     // Because there is a hard POST limit on how many chunks can be sent at once to OpenAI (~8mb)
     // we concurrently execute each max batch of text chunks possible.
-    // Refer to constructor embeddingChunkLimit for more info.
+    // Refer to constructor embeddingMaxChunkLength for more info.
     const embeddingRequests = [];
-    for (const chunk of toChunks(textChunks, this.embeddingChunkLimit)) {
+    for (const chunk of toChunks(textChunks, this.embeddingMaxChunkLength)) {
       embeddingRequests.push(
         new Promise((resolve) => {
           this.openai

diff --git a/server/utils/helpers/index.js b/server/utils/helpers/index.js
@@ -70,6 +70,20 @@ function getEmbeddingEngineSelection() {
   }
 }
 
+// Returns the max length (in chars) of a single chunk to embed. Defaults to 1,000 chars, but
+// some models only accept shorter inputs in a single pass, so a numeric value greater than 1
+// in the EMBEDDING_MODEL_MAX_CHUNK_LENGTH env var overrides the default when embedding.
+function maximumChunkLength() {
+  if (
+    !!process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH && // must be set and non-empty
+    !isNaN(process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH) && // must coerce to a number
+    Number(process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH) > 1 // NOTE: exactly 1 is rejected and falls back to the default
+  )
+    return Number(process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH);
+
+  return 1_000; // default when the env var is unset or fails the checks above
+}
+
 function toChunks(arr, size) {
   return Array.from({ length: Math.ceil(arr.length / size) }, (_v, i) =>
     arr.slice(i * size, i * size + size)
@@ -78,6 +92,7 @@ function toChunks(arr, size) {
 
 module.exports = {
   getEmbeddingEngineSelection,
+  maximumChunkLength,
   getVectorDbClass,
   getLLMProvider,
   toChunks,

diff --git a/server/utils/helpers/updateENV.js b/server/utils/helpers/updateENV.js
@@ -90,6 +90,10 @@ const KEY_MAPPING = {
     envKey: "EMBEDDING_MODEL_PREF",
     checks: [isNotEmpty],
   },
+  EmbeddingModelMaxChunkLength: {
+    envKey: "EMBEDDING_MODEL_MAX_CHUNK_LENGTH",
+    checks: [nonZero],
+  },
 
   // Vector Database Selection Settings
   VectorDB: {

diff --git a/server/utils/vectorDbProviders/chroma/index.js b/server/utils/vectorDbProviders/chroma/index.js
@@ -2,7 +2,11 @@ const { ChromaClient } = require("chromadb");
 const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
-const { toChunks, getLLMProvider } = require("../../helpers");
+const {
+  toChunks,
+  getLLMProvider,
+  getEmbeddingEngineSelection,
+} = require("../../helpers");
 
 const Chroma = {
   name: "Chroma",
@@ -175,7 +179,8 @@ const Chroma = {
       // because we then cannot atomically control our namespace to granularly find/remove documents
       // from vectordb.
       const textSplitter = new RecursiveCharacterTextSplitter({
-        chunkSize: 1000,
+        chunkSize:
+          getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
         chunkOverlap: 20,
       });
       const textChunks = await textSplitter.splitText(pageContent);

diff --git a/server/utils/vectorDbProviders/lance/index.js b/server/utils/vectorDbProviders/lance/index.js
@@ -1,5 +1,9 @@
 const lancedb = require("vectordb");
-const { toChunks, getLLMProvider } = require("../../helpers");
+const {
+  toChunks,
+  getLLMProvider,
+  getEmbeddingEngineSelection,
+} = require("../../helpers");
 const { OpenAIEmbeddings } = require("langchain/embeddings/openai");
 const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
@@ -176,7 +180,8 @@ const LanceDb = {
       // because we then cannot atomically control our namespace to granularly find/remove documents
       // from vectordb.
       const textSplitter = new RecursiveCharacterTextSplitter({
-        chunkSize: 1000,
+        chunkSize:
+          getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
         chunkOverlap: 20,
       });
       const textChunks = await textSplitter.splitText(pageContent);

diff --git a/server/utils/vectorDbProviders/pinecone/index.js b/server/utils/vectorDbProviders/pinecone/index.js
@@ -2,7 +2,11 @@ const { PineconeClient } = require("@pinecone-database/pinecone");
 const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
-const { toChunks, getLLMProvider } = require("../../helpers");
+const {
+  toChunks,
+  getLLMProvider,
+  getEmbeddingEngineSelection,
+} = require("../../helpers");
 
 const Pinecone = {
   name: "Pinecone",
@@ -130,7 +134,8 @@ const Pinecone = {
       // from vectordb.
       // https://github.com/hwchase17/langchainjs/blob/2def486af734c0ca87285a48f1a04c057ab74bdf/langchain/src/vectorstores/pinecone.ts#L167
       const textSplitter = new RecursiveCharacterTextSplitter({
-        chunkSize: 1000,
+        chunkSize:
+          getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
         chunkOverlap: 20,
       });
       const textChunks = await textSplitter.splitText(pageContent);

diff --git a/server/utils/vectorDbProviders/qdrant/index.js b/server/utils/vectorDbProviders/qdrant/index.js
@@ -2,7 +2,11 @@ const { QdrantClient } = require("@qdrant/js-client-rest");
 const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
-const { toChunks, getLLMProvider } = require("../../helpers");
+const {
+  toChunks,
+  getLLMProvider,
+  getEmbeddingEngineSelection,
+} = require("../../helpers");
 
 const QDrant = {
   name: "QDrant",
@@ -174,7 +178,8 @@ const QDrant = {
       // because we then cannot atomically control our namespace to granularly find/remove documents
       // from vectordb.
       const textSplitter = new RecursiveCharacterTextSplitter({
-        chunkSize: 1000,
+        chunkSize:
+          getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
         chunkOverlap: 20,
       });
       const textChunks = await textSplitter.splitText(pageContent);

diff --git a/server/utils/vectorDbProviders/weaviate/index.js b/server/utils/vectorDbProviders/weaviate/index.js
@@ -2,7 +2,11 @@ const { default: weaviate } = require("weaviate-ts-client");
 const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
-const { toChunks, getLLMProvider } = require("../../helpers");
+const {
+  toChunks,
+  getLLMProvider,
+  getEmbeddingEngineSelection,
+} = require("../../helpers");
 const { camelCase } = require("../../helpers/camelcase");
 
 const Weaviate = {
@@ -237,7 +241,8 @@ const Weaviate = {
       // because we then cannot atomically control our namespace to granularly find/remove documents
       // from vectordb.
       const textSplitter = new RecursiveCharacterTextSplitter({
-        chunkSize: 1000,
+        chunkSize:
+          getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
         chunkOverlap: 20,
       });
       const textChunks = await textSplitter.splitText(pageContent);