fix(text): markdown chunking bug (#325)

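Because - the markdown parser missed a case. This commit - adds the missing case: plaintext collected before a header is now appended to the document contents, and the splitter skips (and logs) when there is no previous chunk to scan back from.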
instill-ai · Sep 11, 2024 · 9866a91
1 parent 57d8050
commit 9866a91
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 2 deletions.
diff --git a/operator/text/v0/markdown_document.go b/operator/text/v0/markdown_document.go
@@ -128,14 +128,19 @@ func buildDocument(rawRunes []rune, previousDocument *MarkdownDocument, startPos
 				currentContent.Type = "plaintext"
 				currentContent.BlockStartPosition = currentPosition
 				currentContent.BlockEndPosition = currentPosition
-
+				meetHeaderTimes := 0
 				for currentPosition < endPositionOfBlock {
 
 					line := readLine(rawRunes, &currentPosition)
 					currentContent.BlockEndPosition += sizeOfString(line) + 1
 
 					if isHeader(line) {
-						header := parseHeader(line)
+						meetHeaderTimes++
+
+						if meetHeaderTimes > 0 && len(paragraph) > 0 {
+							currentContent.PlainText = paragraph
+							doc.Contents = append(doc.Contents, currentContent)
+						}
 						if endOfDocument(doc) {
 							currentPosition -= sizeOfString(line) + 1
 							currentContent.PlainText = paragraph
@@ -145,6 +150,7 @@ func buildDocument(rawRunes []rune, previousDocument *MarkdownDocument, startPos
 							end = true
 							break
 						}
+						header := parseHeader(line)
 						currentHeaderLevel = header.Level
 						headers[header.Level-1] = &header
 					} else {

diff --git a/operator/text/v0/markdown_splitter.go b/operator/text/v0/markdown_splitter.go
@@ -5,6 +5,8 @@ import (
 	"reflect"
 	"strings"
 
+	"log"
+
 	"github.com/tmc/langchaingo/textsplitter"
 )
 
@@ -432,6 +434,10 @@ func (sp MarkdownTextSplitter) chunkPlainText(content Content, headers []Header)
 
 		if shouldScanRawTextFromPreviousChunk(startPosition, endPosition) {
 			previousChunkIndex := len(contentChunks) - 1
+			if previousChunkIndex < 0 {
+				log.Println("There may be missing chunks in the content because of parsing errors in the markdown_document")
+				continue
+			}
 			previousChunk := contentChunks[previousChunkIndex]
 			startPosition, endPosition = getChunkPositions(rawRunes, chunkRunes, previousChunk.ContentStartPosition+1)
 		}