Skip to content

Commit

Permalink
feat(catalog): allow external service to store file metadata (#123)
Browse files Browse the repository at this point in the history
Because

when an external service uploads a file, it includes specific metadata
for the file. It also needs to check this metadata during the next
retrieval.

This commit

allows external_metadata in the file upload API and returns the metadata
in the file list API.
  • Loading branch information
Yougigun authored Oct 31, 2024
1 parent 84dbc55 commit 6c97540
Show file tree
Hide file tree
Showing 8 changed files with 115 additions and 40 deletions.
2 changes: 1 addition & 1 deletion config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ database:
host: pg-sql
port: 5432
name: artifact
version: 17
version: 18
timezone: Etc/UTC
pool:
idleconnections: 5
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ require (
github.com/grpc-ecosystem/go-grpc-middleware v1.3.0
github.com/grpc-ecosystem/grpc-gateway/v2 v2.19.1
github.com/influxdata/influxdb-client-go/v2 v2.12.3
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20241024152819-5ed9f53b5c8a
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20241029162707-1398399a24ee
github.com/instill-ai/usage-client v0.3.0-alpha.0.20240319060111-4a3a39f2fd61
github.com/instill-ai/x v0.3.0-alpha.0.20231219052200-6230a89e386c
github.com/knadh/koanf v1.5.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -344,8 +344,8 @@ github.com/influxdata/influxdb-client-go/v2 v2.12.3 h1:28nRlNMRIV4QbtIUvxhWqaxn0
github.com/influxdata/influxdb-client-go/v2 v2.12.3/go.mod h1:IrrLUbCjjfkmRuaCiGQg4m2GbkaeJDcuWoxiWdQEbA0=
github.com/influxdata/line-protocol v0.0.0-20200327222509-2487e7298839 h1:W9WBk7wlPfJLvMCdtV4zPulc4uCPrlywQOmbFOhgQNU=
github.com/influxdata/line-protocol v0.0.0-20200327222509-2487e7298839/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo=
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20241024152819-5ed9f53b5c8a h1:FAj25JbB8CUfUbyPAj1PupoFHo9sW5kffjQyVjkXDg4=
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20241024152819-5ed9f53b5c8a/go.mod h1:rf0UY7VpEgpaLudYEcjx5rnbuwlBaaLyD4FQmWLtgAY=
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20241029162707-1398399a24ee h1:onnzrn5jabO3jDLPo2193Ql6YMRyDWDx9K834Bfi8V0=
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20241029162707-1398399a24ee/go.mod h1:rf0UY7VpEgpaLudYEcjx5rnbuwlBaaLyD4FQmWLtgAY=
github.com/instill-ai/usage-client v0.3.0-alpha.0.20240319060111-4a3a39f2fd61 h1:smPTvmXDhn/QC7y/TPXyMTqbbRd0gvzmFgWBChwTfhE=
github.com/instill-ai/usage-client v0.3.0-alpha.0.20240319060111-4a3a39f2fd61/go.mod h1:/TAHs4ybuylk5icuy+MQtHRc4XUnIyXzeNKxX9qDFhw=
github.com/instill-ai/x v0.3.0-alpha.0.20231219052200-6230a89e386c h1:a2RVkpIV2QcrGnSHAou+t/L+vBsaIfFvk5inVg5Uh4s=
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
BEGIN;

-- Remove external_metadata column from knowledge_base_file table
ALTER TABLE knowledge_base_file
DROP COLUMN external_metadata;

COMMIT;
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
BEGIN;

-- Add external_metadata column to knowledge_base_file table
ALTER TABLE knowledge_base_file
ADD COLUMN external_metadata JSONB;

-- Add comment for the new column
COMMENT ON COLUMN knowledge_base_file.external_metadata IS 'External metadata stored as JSON, serialized from protobuf Struct';

COMMIT;
67 changes: 35 additions & 32 deletions pkg/handler/knowledgebasefiles.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,14 +117,15 @@ func (ph *PublicHandler) UploadCatalogFile(ctx context.Context, req *artifactpb.

destination := ph.service.MinIO.GetUploadedFilePathInKnowledgeBase(kb.UID.String(), req.File.Name)
kbFile := repository.KnowledgeBaseFile{
Name: req.File.Name,
Type: artifactpb.FileType_name[int32(req.File.Type)],
Owner: ns.NsUID,
CreatorUID: creatorUID,
KnowledgeBaseUID: kb.UID,
Destination: destination,
ProcessStatus: artifactpb.FileProcessStatus_name[int32(artifactpb.FileProcessStatus_FILE_PROCESS_STATUS_NOTSTARTED)],
Size: fileSize,
Name: req.File.Name,
Type: artifactpb.FileType_name[int32(req.File.Type)],
Owner: ns.NsUID,
CreatorUID: creatorUID,
KnowledgeBaseUID: kb.UID,
Destination: destination,
ProcessStatus: artifactpb.FileProcessStatus_name[int32(artifactpb.FileProcessStatus_FILE_PROCESS_STATUS_NOTSTARTED)],
Size: fileSize,
ExternalMetadataUnmarshal: req.File.ExternalMetadata,
}

// create catalog file in database
Expand Down Expand Up @@ -154,18 +155,19 @@ func (ph *PublicHandler) UploadCatalogFile(ctx context.Context, req *artifactpb.

return &artifactpb.UploadCatalogFileResponse{
File: &artifactpb.File{
FileUid: res.UID.String(),
OwnerUid: res.Owner.String(),
CreatorUid: res.CreatorUID.String(),
CatalogUid: res.KnowledgeBaseUID.String(),
Name: res.Name,
Type: req.File.Type,
CreateTime: timestamppb.New(*res.CreateTime),
UpdateTime: timestamppb.New(*res.UpdateTime),
ProcessStatus: artifactpb.FileProcessStatus_FILE_PROCESS_STATUS_NOTSTARTED,
Size: res.Size,
TotalChunks: 0,
TotalTokens: 0,
FileUid: res.UID.String(),
OwnerUid: res.Owner.String(),
CreatorUid: res.CreatorUID.String(),
CatalogUid: res.KnowledgeBaseUID.String(),
Name: res.Name,
Type: req.File.Type,
CreateTime: timestamppb.New(*res.CreateTime),
UpdateTime: timestamppb.New(*res.UpdateTime),
ProcessStatus: artifactpb.FileProcessStatus_FILE_PROCESS_STATUS_NOTSTARTED,
Size: res.Size,
TotalChunks: 0,
TotalTokens: 0,
ExternalMetadata: res.ExternalMetadataUnmarshal,
},
}, nil
}
Expand Down Expand Up @@ -290,18 +292,19 @@ func (ph *PublicHandler) ListCatalogFiles(ctx context.Context, req *artifactpb.L
nextPageToken = nextToken
for _, kbFile := range kbFiles {
files = append(files, &artifactpb.File{
FileUid: kbFile.UID.String(),
OwnerUid: kbFile.Owner.String(),
CreatorUid: kbFile.CreatorUID.String(),
CatalogUid: kbFile.KnowledgeBaseUID.String(),
Name: kbFile.Name,
Type: artifactpb.FileType(artifactpb.FileType_value[kbFile.Type]),
CreateTime: timestamppb.New(*kbFile.CreateTime),
UpdateTime: timestamppb.New(*kbFile.UpdateTime),
ProcessStatus: artifactpb.FileProcessStatus(artifactpb.FileProcessStatus_value[kbFile.ProcessStatus]),
Size: kbFile.Size,
TotalChunks: int32(totalChunks[kbFile.UID]),
TotalTokens: int32(totalTokens[kbFile.UID]),
FileUid: kbFile.UID.String(),
OwnerUid: kbFile.Owner.String(),
CreatorUid: kbFile.CreatorUID.String(),
CatalogUid: kbFile.KnowledgeBaseUID.String(),
Name: kbFile.Name,
Type: artifactpb.FileType(artifactpb.FileType_value[kbFile.Type]),
CreateTime: timestamppb.New(*kbFile.CreateTime),
UpdateTime: timestamppb.New(*kbFile.UpdateTime),
ProcessStatus: artifactpb.FileProcessStatus(artifactpb.FileProcessStatus_value[kbFile.ProcessStatus]),
Size: kbFile.Size,
ExternalMetadata: kbFile.ExternalMetadataUnmarshal,
TotalChunks: int32(totalChunks[kbFile.UID]),
TotalTokens: int32(totalTokens[kbFile.UID]),
})
}
}
Expand Down
61 changes: 57 additions & 4 deletions pkg/repository/knowledgebasefile.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ import (
"github.com/instill-ai/artifact-backend/pkg/logger"
artifactpb "github.com/instill-ai/protogen-go/artifact/artifact/v1alpha"
"go.uber.org/zap"
"google.golang.org/protobuf/encoding/protojson"
"google.golang.org/protobuf/types/known/structpb"
"gorm.io/gorm"
"gorm.io/gorm/clause"
)
Expand Down Expand Up @@ -66,6 +68,7 @@ type KnowledgeBaseFile struct {
// Process status is defined in the grpc proto file
ProcessStatus string `gorm:"column:process_status;size:100;not null" json:"process_status"`
// Note: use ExtraMetaDataMarshal method to marshal and unmarshal. do not populate this field directly
// this field is used internally for the extra meta data of the file
ExtraMetaData string `gorm:"column:extra_meta_data;type:jsonb" json:"extra_meta_data"`
// Content not used yet
Content []byte `gorm:"column:content;type:bytea" json:"content"`
Expand All @@ -78,6 +81,10 @@ type KnowledgeBaseFile struct {
RequesterUID uuid.UUID `gorm:"column:requester_uid;type:uuid;"`
// This filed is not stored in the database. It is used to unmarshal the ExtraMetaData field
ExtraMetaDataUnmarshal *ExtraMetaData `gorm:"-" json:"extra_meta_data_unmarshal"`
// this field is used to let external service store the external metadata of file.
ExternalMetadata string `gorm:"column:external_metadata;type:jsonb" json:"external_metadata"`
// This field is not stored in the database. It is used to unmarshal the ExternalMetadata field
ExternalMetadataUnmarshal *structpb.Struct `gorm:"-" json:"external_metadata_unmarshal"`
}

type ExtraMetaData struct {
Expand Down Expand Up @@ -107,6 +114,7 @@ type KnowledgeBaseFileColumns struct {
DeleteTime string
RequesterUID string
Size string
ExternalMetadata string
}

var KnowledgeBaseFileColumn = KnowledgeBaseFileColumns{
Expand All @@ -124,6 +132,7 @@ var KnowledgeBaseFileColumn = KnowledgeBaseFileColumns{
DeleteTime: "delete_time",
Size: "size",
RequesterUID: "requester_uid",
ExternalMetadata: "external_metadata",
}

// ExtraMetaDataMarshal marshals the ExtraMetaData struct to a JSON string
Expand Down Expand Up @@ -154,21 +163,65 @@ func (kf *KnowledgeBaseFile) ExtraMetaDataUnmarshalFunc() error {
return nil
}

// ExternalMetadataToJSON converts structpb.Struct to JSON string for DB storage
func (kf *KnowledgeBaseFile) ExternalMetadataToJSON() error {
if kf.ExternalMetadataUnmarshal == nil {
kf.ExternalMetadata = "{}"
return nil
}

jsonBytes, err := protojson.Marshal(kf.ExternalMetadataUnmarshal)
if err != nil {
return fmt.Errorf("failed to marshal external metadata to JSON: %v", err)
}

kf.ExternalMetadata = string(jsonBytes)
return nil
}

// JSONToExternalMetadata converts JSON string from DB to structpb.Struct
func (kf *KnowledgeBaseFile) JSONToExternalMetadata() error {
if kf.ExternalMetadata == "" {
kf.ExternalMetadataUnmarshal = nil
return nil
}

s := &structpb.Struct{}
if err := protojson.Unmarshal([]byte(kf.ExternalMetadata), s); err != nil {
return fmt.Errorf("failed to unmarshal external metadata from JSON: %v", err)
}

kf.ExternalMetadataUnmarshal = s
return nil
}

// GORM hooks
func (kf *KnowledgeBaseFile) BeforeCreate(tx *gorm.DB) (err error) {
return kf.ExtraMetaDataMarshal()
if err := kf.ExtraMetaDataMarshal(); err != nil {
return err
}
return kf.ExternalMetadataToJSON()
}

func (kf *KnowledgeBaseFile) BeforeSave(tx *gorm.DB) (err error) {
return kf.ExtraMetaDataMarshal()
if err := kf.ExtraMetaDataMarshal(); err != nil {
return err
}
return kf.ExternalMetadataToJSON()
}

func (kf *KnowledgeBaseFile) BeforeUpdate(tx *gorm.DB) (err error) {
return kf.ExtraMetaDataMarshal()
if err := kf.ExtraMetaDataMarshal(); err != nil {
return err
}
return kf.ExternalMetadataToJSON()
}

func (kf *KnowledgeBaseFile) AfterFind(tx *gorm.DB) (err error) {
return kf.ExtraMetaDataUnmarshalFunc()
if err := kf.ExtraMetaDataUnmarshalFunc(); err != nil {
return err
}
return kf.JSONToExternalMetadata()
}

// KnowledgeBaseFileTableName returns the table name of the KnowledgeBaseFile
Expand Down
2 changes: 2 additions & 0 deletions pkg/worker/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,8 @@ func (wp *fileToEmbWorkerPool) processChunkingFile(ctx context.Context, file rep
logger.Error("Failed to get converted file from minIO.", zap.String("Converted file uid", convertedFile.UID.String()))
return nil, artifactpb.FileProcessStatus_FILE_PROCESS_STATUS_UNSPECIFIED, err
}

// TODO: some file use splitTextPipe and some use splitMarkdownPipe
// call the markdown chunking pipeline
requesterUID := file.RequesterUID
chunks, err := wp.svc.SplitMarkdownPipe(ctx, file.CreatorUID, requesterUID, string(convertedFileData))
Expand Down

0 comments on commit 6c97540

Please sign in to comment.