This repository has been archived by the owner on Oct 29, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: bug of duplicate document (#256)
Because - the chunking logic produced duplicate documents, which is a bug. This commit - fixes the bug by no longer executing the same code both inside and outside the loop - adds test code that specifically exercises the markdown chunking logic
- Loading branch information
1 parent
4bfd81b
commit e028a6e
Showing
2 changed files
with
76 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
package text | ||
|
||
import ( | ||
"testing" | ||
|
||
"github.com/frankban/quicktest" | ||
"github.com/tmc/langchaingo/textsplitter" | ||
) | ||
|
||
func Test_MarkdownSplitter(t *testing.T) { | ||
|
||
c := quicktest.New(t) | ||
|
||
testCases := []struct { | ||
input ChunkTextInput | ||
outputLen int | ||
}{ | ||
{ | ||
input: ChunkTextInput{ | ||
Text: `# asf65463 | ||
## 654654 | ||
fasdflj`, | ||
Strategy: Strategy{ | ||
Setting: Setting{ | ||
ChunkMethod: "Markdown", | ||
ChunkSize: 800, | ||
ChunkOverlap: 200, | ||
}, | ||
}, | ||
}, | ||
outputLen: 1, | ||
}, | ||
{ | ||
input: ChunkTextInput{ | ||
Text: `# 醫囑 | ||
檢驗 : Urine routine(急) [尿液] [有蓋定量離心管(尿液收集管)] STAT 【註:Foley】 | ||
=> **尿液檢查採樣來源為Foley,表示有裝置導尿管** | ||
# 護理 | ||
病人2way尿管存,管路引流順暢,尿液呈淡黃色,管路固定於右大腿,無滑脫,續觀 | ||
=> **尿管存,表示有裝置導尿管** | ||
# 個案泌尿道感染判定 | ||
1. 有導尿管 | ||
2. 病患無UTI感染症狀 | ||
=> **判斷為:非泌尿道感染,僅無症狀菌尿症(not UTI; asymptomatic bacteuria only)**`, | ||
Strategy: Strategy{ | ||
Setting: Setting{ | ||
ChunkMethod: "Markdown", | ||
ChunkSize: 800, | ||
ChunkOverlap: 200, | ||
}, | ||
}, | ||
}, | ||
outputLen: 3, | ||
}, | ||
} | ||
for _, testCase := range testCases { | ||
c.Run("Test bug cases reported", func(c *quicktest.C) { | ||
inputStruct := testCase.input | ||
setting := inputStruct.Strategy.Setting | ||
split := NewMarkdownTextSplitter( | ||
textsplitter.WithChunkSize(setting.ChunkSize), | ||
textsplitter.WithChunkOverlap(setting.ChunkOverlap), | ||
) | ||
|
||
chunks, err := split.SplitText(inputStruct.Text) | ||
|
||
c.Assert(err, quicktest.IsNil) | ||
|
||
c.Assert(len(chunks), quicktest.DeepEquals, testCase.outputLen) | ||
}) | ||
} | ||
} |