Skip to content

Commit

Permalink
feat: groups sentences that have less than a given number of tokens
Browse files Browse the repository at this point in the history
  • Loading branch information
micheleriva committed Feb 12, 2024
1 parent c714f10 commit eb1cf82
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 4 deletions.
8 changes: 4 additions & 4 deletions src/nlp.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,19 @@ export class NLPChunker extends Chunker {
])

if (sentenceTokenCount + currentChunkTokenCount <= maxTokensPerChunk) {
currentChunk += (currentChunk ? ' ' : '') + sentence
currentChunk += (currentChunk ? ' ' : '') + sentence // Ensure space between sentences
} else {
if (currentChunk) {
chunks.push(currentChunk)
}
currentChunk = sentenceTokenCount > maxTokensPerChunk ? '' : sentence

if (sentenceTokenCount > maxTokensPerChunk) {
chunks.push(sentence)
currentChunk = ''
} else {
currentChunk = sentence
}
}
}

if (currentChunk) {
chunks.push(currentChunk)
}
Expand Down
14 changes: 14 additions & 0 deletions tests/nlp.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,17 @@ Deno.test('NLP chunker', async () => {

assertEquals(chunks.length, 4)
})

// Chunks the same two-sentence text twice with different second arguments to
// `chunk` (50, then 10) and checks how many chunks come back each time.
// NOTE(review): going by the test title, that argument is the per-chunk token
// threshold — confirm against the NLPChunker implementation.
Deno.test('NLP chunker should combine multiple sentences when their total number of token is less than a given threshold', async () => {
  const text = dedent`
This is an example.
Every sentence here has just a few tokens.
`

  const nlpChunker = new NLPChunker()

  // Run both chunking passes before asserting, so a failure in the first
  // expectation does not skip the second call.
  const { length: chunkCountAt50 } = await nlpChunker.chunk(text, 50)
  const { length: chunkCountAt10 } = await nlpChunker.chunk(text, 10)

  // With 50, both sentences are expected to end up in a single chunk.
  assertEquals(chunkCountAt50, 1)
  // With 10, the sentences are expected to be emitted as two separate chunks.
  assertEquals(chunkCountAt10, 2)
})

0 comments on commit eb1cf82

Please sign in to comment.