Skip to content
This repository has been archived by the owner on Oct 29, 2024. It is now read-only.

Commit

Permalink
feat(document): repair pdf with libreoffice
Browse files Browse the repository at this point in the history
  • Loading branch information
chuang8511 committed Sep 26, 2024
1 parent 162ddad commit 656d902
Show file tree
Hide file tree
Showing 5 changed files with 90 additions and 2 deletions.
12 changes: 11 additions & 1 deletion operator/document/v0/convert_to_images.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,18 @@ func ConvertDocumentToImage(inputStruct *ConvertDocumentToImagesInput) (*Convert
base64PDF = strings.Split(inputStruct.Document, ",")[1]
}

var base64PDFWithoutMime string
if RequiredToRepair(base64PDF) {
base64PDFWithoutMime, err = RepairPDF(base64PDF)
if err != nil {
return nil, fmt.Errorf("failed to repair PDF: %w", err)
}
} else {
base64PDFWithoutMime = base.TrimBase64Mime(base64PDF)
}

paramsJSON := map[string]interface{}{
"PDF": base.TrimBase64Mime(base64PDF),
"PDF": base64PDFWithoutMime,
"filename": inputStruct.Filename,
}

Expand Down
21 changes: 21 additions & 0 deletions operator/document/v0/execution/pdf_checker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from io import BytesIO
import json
import base64
import sys

# TODO: Deal with the import error when running the code in the docker container
# from pdf_to_markdown import PDFTransformer

if __name__ == "__main__":
    # Parameters arrive on stdin as a UTF-8 JSON object; the "PDF" entry
    # carries the base64-encoded document.
    request = json.loads(sys.stdin.buffer.read().decode('utf-8'))
    pdf_stream = BytesIO(base64.b64decode(request["PDF"]))

    # PDFTransformer is not imported in this file (see the TODO above); it is
    # expected to be defined by code prepended to this script before it runs.
    transformer = PDFTransformer(x=pdf_stream)

    # Repair is reported as required when no raw pages could be read.
    needs_repair = len(transformer.raw_pages) == 0
    print(json.dumps({"required": needs_repair}))
43 changes: 43 additions & 0 deletions operator/document/v0/helper.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package document

import (
"encoding/json"
"log"

"github.com/instill-ai/component/base"
"github.com/instill-ai/component/internal/util"
)

// RequiredToRepair reports whether the given base64-encoded PDF should be
// repaired before further processing.
//
// It strips the MIME prefix with base.TrimBase64Mime, runs the embedded
// checker script (pdfTransformer followed by pdfChecker) through
// util.ExecutePythonCode, and returns the "required" field of the script's
// JSON output.
//
// The check is best-effort: if the script fails or its output cannot be
// decoded, the error is logged and false is returned so the caller's main
// flow is not blocked.
func RequiredToRepair(pdfBase64 string) bool {
	paramsJSON := map[string]interface{}{
		"PDF": base.TrimBase64Mime(pdfBase64),
	}

	// The checker relies on PDFTransformer, so the transformer source is
	// prepended to form a single runnable script.
	pythonCode := pdfTransformer + pdfChecker

	outputBytes, err := util.ExecutePythonCode(pythonCode, paramsJSON)
	if err != nil {
		// It shouldn't block the original process.
		log.Printf("failed to run python script: %v", err)
		return false
	}

	var output struct {
		Repair bool `json:"required"`
	}

	if err := json.Unmarshal(outputBytes, &output); err != nil {
		// It shouldn't block the original process.
		log.Printf("failed to unmarshal output: %v", err)
		return false
	}

	return output.Repair
}

// RepairPDF passes the base64-encoded PDF through ConvertToPDF with "pdf" as
// the second argument and returns the result unchanged.
// NOTE(review): presumably the round trip through the converter rewrites the
// file into a readable PDF — confirm against ConvertToPDF.
func RepairPDF(pdfBase64 string) (string, error) {
	const sourceFormat = "pdf"

	repaired, err := ConvertToPDF(pdfBase64, sourceFormat)
	return repaired, err
}
3 changes: 3 additions & 0 deletions operator/document/v0/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ var (
//go:embed execution/task_convert_to_images.py
taskConvertToImagesExecution string

//go:embed execution/pdf_checker.py
pdfChecker string

once sync.Once
comp *component
)
Expand Down
13 changes: 12 additions & 1 deletion operator/document/v0/pdf_to_markdown_converter.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,19 @@ type converterOutput struct {

func convertPDFToMarkdownWithPDFPlumber(base64Text string, displayImageTag bool, displayAllPage bool) (converterOutput, error) {

var pdfBase64 string
var err error
if RequiredToRepair(base64Text) {
pdfBase64, err = RepairPDF(base64Text)
if err != nil {
return converterOutput{}, fmt.Errorf("failed to repair PDF: %w", err)
}
} else {
pdfBase64 = base.TrimBase64Mime(base64Text)
}

paramsJSON, err := json.Marshal(map[string]interface{}{
"PDF": base.TrimBase64Mime(base64Text),
"PDF": pdfBase64,
"display-image-tag": displayImageTag,
"display-all-page-image": displayAllPage,
})
Expand Down

0 comments on commit 656d902

Please sign in to comment.