Skip to content
This repository has been archived by the owner on Oct 29, 2024. It is now read-only.

Commit

Permalink
fix: bug of duplicate document (#256)
Browse files Browse the repository at this point in the history
Because

- the document-building logic emitted a duplicate document, which is a bug

This commit

- fixes the bug by no longer executing the same code both before and inside the loop
- adds test code that specifically exercises the markdown chunking logic
  • Loading branch information
chuang8511 authored Jul 30, 2024
1 parent 4bfd81b commit e028a6e
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 5 deletions.
7 changes: 2 additions & 5 deletions operator/text/v0/markdown_splitter.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,8 @@ func (sp MarkdownTextSplitter) buildDocuments(rawRunes []rune) []MarkdownDocumen

documents := []MarkdownDocument{}

startPosition := 0
document, startPosition := sp.buildDocument(rawRunes, MarkdownDocument{}, startPosition)

documents = append(documents, document)

var startPosition int
var document MarkdownDocument
for startPosition < len(rawRunes) {
document, startPosition = sp.buildDocument(rawRunes, document, startPosition)

Expand Down
74 changes: 74 additions & 0 deletions operator/text/v0/markdown_splitter_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
package text

import (
"testing"

"github.com/frankban/quicktest"
"github.com/tmc/langchaingo/textsplitter"
)

// Test_MarkdownSplitter checks how many chunks the markdown text splitter
// returns for inputs taken from reported bug cases.
//
// Each case runs as its own named subtest so that a failure points at the
// input that produced it.
func Test_MarkdownSplitter(t *testing.T) {
	c := quicktest.New(t)

	testCases := []struct {
		name      string
		input     ChunkTextInput
		outputLen int
	}{
		{
			name: "nested headers with a short body",
			input: ChunkTextInput{
				Text: `# asf65463
## 654654
fasdflj`,
				Strategy: Strategy{
					Setting: Setting{
						ChunkMethod:  "Markdown",
						ChunkSize:    800,
						ChunkOverlap: 200,
					},
				},
			},
			outputLen: 1,
		},
		{
			name: "three top-level sections with CJK text",
			input: ChunkTextInput{
				Text: `# 醫囑
檢驗 : Urine routine(急) [尿液] [有蓋定量離心管(尿液收集管)] STAT 【註:Foley】
=> **尿液檢查採樣來源為Foley,表示有裝置導尿管**
# 護理
病人2way尿管存,管路引流順暢,尿液呈淡黃色,管路固定於右大腿,無滑脫,續觀
=> **尿管存,表示有裝置導尿管**
# 個案泌尿道感染判定
1. 有導尿管
2. 病患無UTI感染症狀
=> **判斷為:非泌尿道感染,僅無症狀菌尿症(not UTI; asymptomatic bacteuria only)**`,
				Strategy: Strategy{
					Setting: Setting{
						ChunkMethod:  "Markdown",
						ChunkSize:    800,
						ChunkOverlap: 200,
					},
				},
			},
			outputLen: 3,
		},
	}

	for _, testCase := range testCases {
		c.Run(testCase.name, func(c *quicktest.C) {
			// Only the chunk size and overlap are forwarded to the splitter;
			// ChunkMethod is not read by this test.
			setting := testCase.input.Strategy.Setting
			split := NewMarkdownTextSplitter(
				textsplitter.WithChunkSize(setting.ChunkSize),
				textsplitter.WithChunkOverlap(setting.ChunkOverlap),
			)

			chunks, err := split.SplitText(testCase.input.Text)
			c.Assert(err, quicktest.IsNil)

			// HasLen reports the chunks themselves on failure, which makes a
			// duplicated or missing chunk visible in the test output.
			c.Assert(chunks, quicktest.HasLen, testCase.outputLen)
		})
	}
}

0 comments on commit e028a6e

Please sign in to comment.