feat: groups sentences that have fewer than a given number of tokens

oramasearch · Feb 12, 2024 · eb1cf82 · eb1cf82
1 parent c714f10
commit eb1cf82
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 4 deletions.
diff --git a/src/nlp.ts b/src/nlp.ts
@@ -14,19 +14,19 @@ export class NLPChunker extends Chunker {
       ])
 
       if (sentenceTokenCount + currentChunkTokenCount <= maxTokensPerChunk) {
-        currentChunk += (currentChunk ? ' ' : '') + sentence
+        currentChunk += (currentChunk ? ' ' : '') + sentence // Ensure space between sentences
       } else {
         if (currentChunk) {
           chunks.push(currentChunk)
         }
+        currentChunk = sentenceTokenCount > maxTokensPerChunk ? '' : sentence
+
         if (sentenceTokenCount > maxTokensPerChunk) {
           chunks.push(sentence)
-          currentChunk = ''
-        } else {
-          currentChunk = sentence
         }
       }
     }
+
     if (currentChunk) {
       chunks.push(currentChunk)
     }

diff --git a/tests/nlp.test.ts b/tests/nlp.test.ts
@@ -16,3 +16,17 @@ Deno.test('NLP chunker', async () => {
 
   assertEquals(chunks.length, 4)
 })
+
+Deno.test('NLP chunker should combine multiple sentences when their total number of token is less than a given threshold', async () => {
+  const input = dedent`
+    This is an example.
+    Every sentence here has just a few tokens.
+  `
+
+  const chunker = new NLPChunker()
+  const singleChunk = await chunker.chunk(input, 50)
+  const multipleChunks = await chunker.chunk(input, 10)
+
+  assertEquals(singleChunk.length, 1)
+  assertEquals(multipleChunks.length, 2)
+})