Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(catalog): allow external service to store file metadata #123

Merged
merged 1 commit into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ database:
host: pg-sql
port: 5432
name: artifact
version: 17
version: 18
timezone: Etc/UTC
pool:
idleconnections: 5
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ require (
github.com/grpc-ecosystem/go-grpc-middleware v1.3.0
github.com/grpc-ecosystem/grpc-gateway/v2 v2.19.1
github.com/influxdata/influxdb-client-go/v2 v2.12.3
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20241024152819-5ed9f53b5c8a
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20241029162707-1398399a24ee
github.com/instill-ai/usage-client v0.3.0-alpha.0.20240319060111-4a3a39f2fd61
github.com/instill-ai/x v0.3.0-alpha.0.20231219052200-6230a89e386c
github.com/knadh/koanf v1.5.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -344,8 +344,8 @@ github.com/influxdata/influxdb-client-go/v2 v2.12.3 h1:28nRlNMRIV4QbtIUvxhWqaxn0
github.com/influxdata/influxdb-client-go/v2 v2.12.3/go.mod h1:IrrLUbCjjfkmRuaCiGQg4m2GbkaeJDcuWoxiWdQEbA0=
github.com/influxdata/line-protocol v0.0.0-20200327222509-2487e7298839 h1:W9WBk7wlPfJLvMCdtV4zPulc4uCPrlywQOmbFOhgQNU=
github.com/influxdata/line-protocol v0.0.0-20200327222509-2487e7298839/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo=
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20241024152819-5ed9f53b5c8a h1:FAj25JbB8CUfUbyPAj1PupoFHo9sW5kffjQyVjkXDg4=
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20241024152819-5ed9f53b5c8a/go.mod h1:rf0UY7VpEgpaLudYEcjx5rnbuwlBaaLyD4FQmWLtgAY=
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20241029162707-1398399a24ee h1:onnzrn5jabO3jDLPo2193Ql6YMRyDWDx9K834Bfi8V0=
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20241029162707-1398399a24ee/go.mod h1:rf0UY7VpEgpaLudYEcjx5rnbuwlBaaLyD4FQmWLtgAY=
github.com/instill-ai/usage-client v0.3.0-alpha.0.20240319060111-4a3a39f2fd61 h1:smPTvmXDhn/QC7y/TPXyMTqbbRd0gvzmFgWBChwTfhE=
github.com/instill-ai/usage-client v0.3.0-alpha.0.20240319060111-4a3a39f2fd61/go.mod h1:/TAHs4ybuylk5icuy+MQtHRc4XUnIyXzeNKxX9qDFhw=
github.com/instill-ai/x v0.3.0-alpha.0.20231219052200-6230a89e386c h1:a2RVkpIV2QcrGnSHAou+t/L+vBsaIfFvk5inVg5Uh4s=
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
BEGIN;

-- Remove external_metadata column from knowledge_base_file table
ALTER TABLE knowledge_base_file
DROP COLUMN external_metadata;

COMMIT;
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
BEGIN;

-- Add external_metadata column to knowledge_base_file table
ALTER TABLE knowledge_base_file
ADD COLUMN external_metadata JSONB;

-- Add comment for the new column
COMMENT ON COLUMN knowledge_base_file.external_metadata IS 'External metadata stored as JSON, serialized from protobuf Struct';

COMMIT;
67 changes: 35 additions & 32 deletions pkg/handler/knowledgebasefiles.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,14 +117,15 @@ func (ph *PublicHandler) UploadCatalogFile(ctx context.Context, req *artifactpb.

destination := ph.service.MinIO.GetUploadedFilePathInKnowledgeBase(kb.UID.String(), req.File.Name)
kbFile := repository.KnowledgeBaseFile{
Name: req.File.Name,
Type: artifactpb.FileType_name[int32(req.File.Type)],
Owner: ns.NsUID,
CreatorUID: creatorUID,
KnowledgeBaseUID: kb.UID,
Destination: destination,
ProcessStatus: artifactpb.FileProcessStatus_name[int32(artifactpb.FileProcessStatus_FILE_PROCESS_STATUS_NOTSTARTED)],
Size: fileSize,
Name: req.File.Name,
Type: artifactpb.FileType_name[int32(req.File.Type)],
Owner: ns.NsUID,
CreatorUID: creatorUID,
KnowledgeBaseUID: kb.UID,
Destination: destination,
ProcessStatus: artifactpb.FileProcessStatus_name[int32(artifactpb.FileProcessStatus_FILE_PROCESS_STATUS_NOTSTARTED)],
Size: fileSize,
ExternalMetadataUnmarshal: req.File.ExternalMetadata,
}

// create catalog file in database
Expand Down Expand Up @@ -154,18 +155,19 @@ func (ph *PublicHandler) UploadCatalogFile(ctx context.Context, req *artifactpb.

return &artifactpb.UploadCatalogFileResponse{
File: &artifactpb.File{
FileUid: res.UID.String(),
OwnerUid: res.Owner.String(),
CreatorUid: res.CreatorUID.String(),
CatalogUid: res.KnowledgeBaseUID.String(),
Name: res.Name,
Type: req.File.Type,
CreateTime: timestamppb.New(*res.CreateTime),
UpdateTime: timestamppb.New(*res.UpdateTime),
ProcessStatus: artifactpb.FileProcessStatus_FILE_PROCESS_STATUS_NOTSTARTED,
Size: res.Size,
TotalChunks: 0,
TotalTokens: 0,
FileUid: res.UID.String(),
OwnerUid: res.Owner.String(),
CreatorUid: res.CreatorUID.String(),
CatalogUid: res.KnowledgeBaseUID.String(),
Name: res.Name,
Type: req.File.Type,
CreateTime: timestamppb.New(*res.CreateTime),
UpdateTime: timestamppb.New(*res.UpdateTime),
ProcessStatus: artifactpb.FileProcessStatus_FILE_PROCESS_STATUS_NOTSTARTED,
Size: res.Size,
TotalChunks: 0,
TotalTokens: 0,
ExternalMetadata: res.ExternalMetadataUnmarshal,
},
}, nil
}
Expand Down Expand Up @@ -290,18 +292,19 @@ func (ph *PublicHandler) ListCatalogFiles(ctx context.Context, req *artifactpb.L
nextPageToken = nextToken
for _, kbFile := range kbFiles {
files = append(files, &artifactpb.File{
FileUid: kbFile.UID.String(),
OwnerUid: kbFile.Owner.String(),
CreatorUid: kbFile.CreatorUID.String(),
CatalogUid: kbFile.KnowledgeBaseUID.String(),
Name: kbFile.Name,
Type: artifactpb.FileType(artifactpb.FileType_value[kbFile.Type]),
CreateTime: timestamppb.New(*kbFile.CreateTime),
UpdateTime: timestamppb.New(*kbFile.UpdateTime),
ProcessStatus: artifactpb.FileProcessStatus(artifactpb.FileProcessStatus_value[kbFile.ProcessStatus]),
Size: kbFile.Size,
TotalChunks: int32(totalChunks[kbFile.UID]),
TotalTokens: int32(totalTokens[kbFile.UID]),
FileUid: kbFile.UID.String(),
OwnerUid: kbFile.Owner.String(),
CreatorUid: kbFile.CreatorUID.String(),
CatalogUid: kbFile.KnowledgeBaseUID.String(),
Name: kbFile.Name,
Type: artifactpb.FileType(artifactpb.FileType_value[kbFile.Type]),
CreateTime: timestamppb.New(*kbFile.CreateTime),
UpdateTime: timestamppb.New(*kbFile.UpdateTime),
ProcessStatus: artifactpb.FileProcessStatus(artifactpb.FileProcessStatus_value[kbFile.ProcessStatus]),
Size: kbFile.Size,
ExternalMetadata: kbFile.ExternalMetadataUnmarshal,
TotalChunks: int32(totalChunks[kbFile.UID]),
TotalTokens: int32(totalTokens[kbFile.UID]),
})
}
}
Expand Down
61 changes: 57 additions & 4 deletions pkg/repository/knowledgebasefile.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ import (
"github.com/instill-ai/artifact-backend/pkg/logger"
artifactpb "github.com/instill-ai/protogen-go/artifact/artifact/v1alpha"
"go.uber.org/zap"
"google.golang.org/protobuf/encoding/protojson"
"google.golang.org/protobuf/types/known/structpb"
"gorm.io/gorm"
"gorm.io/gorm/clause"
)
Expand Down Expand Up @@ -66,6 +68,7 @@ type KnowledgeBaseFile struct {
// Process status is defined in the grpc proto file
ProcessStatus string `gorm:"column:process_status;size:100;not null" json:"process_status"`
// Note: use ExtraMetaDataMarshal method to marshal and unmarshal. do not populate this field directly
// this field is used internally for the extra meta data of the file
ExtraMetaData string `gorm:"column:extra_meta_data;type:jsonb" json:"extra_meta_data"`
// Content not used yet
Content []byte `gorm:"column:content;type:bytea" json:"content"`
Expand All @@ -78,6 +81,10 @@ type KnowledgeBaseFile struct {
RequesterUID uuid.UUID `gorm:"column:requester_uid;type:uuid;"`
// This filed is not stored in the database. It is used to unmarshal the ExtraMetaData field
ExtraMetaDataUnmarshal *ExtraMetaData `gorm:"-" json:"extra_meta_data_unmarshal"`
// this field is used to let external service store the external metadata of file.
ExternalMetadata string `gorm:"column:external_metadata;type:jsonb" json:"external_metadata"`
// This field is not stored in the database. It is used to unmarshal the ExternalMetadata field
ExternalMetadataUnmarshal *structpb.Struct `gorm:"-" json:"external_metadata_unmarshal"`
}

type ExtraMetaData struct {
Expand Down Expand Up @@ -107,6 +114,7 @@ type KnowledgeBaseFileColumns struct {
DeleteTime string
RequesterUID string
Size string
ExternalMetadata string
}

var KnowledgeBaseFileColumn = KnowledgeBaseFileColumns{
Expand All @@ -124,6 +132,7 @@ var KnowledgeBaseFileColumn = KnowledgeBaseFileColumns{
DeleteTime: "delete_time",
Size: "size",
RequesterUID: "requester_uid",
ExternalMetadata: "external_metadata",
}

// ExtraMetaDataMarshal marshals the ExtraMetaData struct to a JSON string
Expand Down Expand Up @@ -154,21 +163,65 @@ func (kf *KnowledgeBaseFile) ExtraMetaDataUnmarshalFunc() error {
return nil
}

// ExternalMetadataToJSON converts structpb.Struct to JSON string for DB storage
func (kf *KnowledgeBaseFile) ExternalMetadataToJSON() error {
if kf.ExternalMetadataUnmarshal == nil {
kf.ExternalMetadata = "{}"
return nil
}

jsonBytes, err := protojson.Marshal(kf.ExternalMetadataUnmarshal)
if err != nil {
return fmt.Errorf("failed to marshal external metadata to JSON: %v", err)
}

kf.ExternalMetadata = string(jsonBytes)
return nil
}

// JSONToExternalMetadata converts JSON string from DB to structpb.Struct
func (kf *KnowledgeBaseFile) JSONToExternalMetadata() error {
if kf.ExternalMetadata == "" {
kf.ExternalMetadataUnmarshal = nil
return nil
}

s := &structpb.Struct{}
if err := protojson.Unmarshal([]byte(kf.ExternalMetadata), s); err != nil {
return fmt.Errorf("failed to unmarshal external metadata from JSON: %v", err)
}

kf.ExternalMetadataUnmarshal = s
return nil
}

// GORM hooks
func (kf *KnowledgeBaseFile) BeforeCreate(tx *gorm.DB) (err error) {
return kf.ExtraMetaDataMarshal()
if err := kf.ExtraMetaDataMarshal(); err != nil {
return err
}
return kf.ExternalMetadataToJSON()
}

func (kf *KnowledgeBaseFile) BeforeSave(tx *gorm.DB) (err error) {
return kf.ExtraMetaDataMarshal()
if err := kf.ExtraMetaDataMarshal(); err != nil {
return err
}
return kf.ExternalMetadataToJSON()
}

func (kf *KnowledgeBaseFile) BeforeUpdate(tx *gorm.DB) (err error) {
return kf.ExtraMetaDataMarshal()
if err := kf.ExtraMetaDataMarshal(); err != nil {
return err
}
return kf.ExternalMetadataToJSON()
}

func (kf *KnowledgeBaseFile) AfterFind(tx *gorm.DB) (err error) {
return kf.ExtraMetaDataUnmarshalFunc()
if err := kf.ExtraMetaDataUnmarshalFunc(); err != nil {
return err
}
return kf.JSONToExternalMetadata()
}

// KnowledgeBaseFileTableName returns the table name of the KnowledgeBaseFile
Expand Down
2 changes: 2 additions & 0 deletions pkg/worker/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,8 @@ func (wp *fileToEmbWorkerPool) processChunkingFile(ctx context.Context, file rep
logger.Error("Failed to get converted file from minIO.", zap.String("Converted file uid", convertedFile.UID.String()))
return nil, artifactpb.FileProcessStatus_FILE_PROCESS_STATUS_UNSPECIFIED, err
}

// TODO: some file use splitTextPipe and some use splitMarkdownPipe
// call the markdown chunking pipeline
requesterUID := file.RequesterUID
chunks, err := wp.svc.SplitMarkdownPipe(ctx, file.CreatorUID, requesterUID, string(convertedFileData))
Expand Down
Loading