diff --git a/config/config.yaml b/config/config.yaml index 91cb274..7c57bb9 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -22,7 +22,7 @@ database: host: pg-sql port: 5432 name: artifact - version: 17 + version: 18 timezone: Etc/UTC pool: idleconnections: 5 diff --git a/go.mod b/go.mod index 3fd080c..cf347b9 100644 --- a/go.mod +++ b/go.mod @@ -13,7 +13,7 @@ require ( github.com/grpc-ecosystem/go-grpc-middleware v1.3.0 github.com/grpc-ecosystem/grpc-gateway/v2 v2.19.1 github.com/influxdata/influxdb-client-go/v2 v2.12.3 - github.com/instill-ai/protogen-go v0.3.3-alpha.0.20241024152819-5ed9f53b5c8a + github.com/instill-ai/protogen-go v0.3.3-alpha.0.20241029162707-1398399a24ee github.com/instill-ai/usage-client v0.3.0-alpha.0.20240319060111-4a3a39f2fd61 github.com/instill-ai/x v0.3.0-alpha.0.20231219052200-6230a89e386c github.com/knadh/koanf v1.5.0 diff --git a/go.sum b/go.sum index c5e4f83..00e78e5 100644 --- a/go.sum +++ b/go.sum @@ -344,8 +344,8 @@ github.com/influxdata/influxdb-client-go/v2 v2.12.3 h1:28nRlNMRIV4QbtIUvxhWqaxn0 github.com/influxdata/influxdb-client-go/v2 v2.12.3/go.mod h1:IrrLUbCjjfkmRuaCiGQg4m2GbkaeJDcuWoxiWdQEbA0= github.com/influxdata/line-protocol v0.0.0-20200327222509-2487e7298839 h1:W9WBk7wlPfJLvMCdtV4zPulc4uCPrlywQOmbFOhgQNU= github.com/influxdata/line-protocol v0.0.0-20200327222509-2487e7298839/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo= -github.com/instill-ai/protogen-go v0.3.3-alpha.0.20241024152819-5ed9f53b5c8a h1:FAj25JbB8CUfUbyPAj1PupoFHo9sW5kffjQyVjkXDg4= -github.com/instill-ai/protogen-go v0.3.3-alpha.0.20241024152819-5ed9f53b5c8a/go.mod h1:rf0UY7VpEgpaLudYEcjx5rnbuwlBaaLyD4FQmWLtgAY= +github.com/instill-ai/protogen-go v0.3.3-alpha.0.20241029162707-1398399a24ee h1:onnzrn5jabO3jDLPo2193Ql6YMRyDWDx9K834Bfi8V0= +github.com/instill-ai/protogen-go v0.3.3-alpha.0.20241029162707-1398399a24ee/go.mod h1:rf0UY7VpEgpaLudYEcjx5rnbuwlBaaLyD4FQmWLtgAY= github.com/instill-ai/usage-client v0.3.0-alpha.0.20240319060111-4a3a39f2fd61 h1:smPTvmXDhn/QC7y/TPXyMTqbbRd0gvzmFgWBChwTfhE= github.com/instill-ai/usage-client v0.3.0-alpha.0.20240319060111-4a3a39f2fd61/go.mod h1:/TAHs4ybuylk5icuy+MQtHRc4XUnIyXzeNKxX9qDFhw= github.com/instill-ai/x v0.3.0-alpha.0.20231219052200-6230a89e386c h1:a2RVkpIV2QcrGnSHAou+t/L+vBsaIfFvk5inVg5Uh4s= diff --git a/pkg/db/migration/000018_add_external_metadata_in_kb_file_table.down.sql b/pkg/db/migration/000018_add_external_metadata_in_kb_file_table.down.sql new file mode 100644 index 0000000..f865cd8 --- /dev/null +++ b/pkg/db/migration/000018_add_external_metadata_in_kb_file_table.down.sql @@ -0,0 +1,7 @@ +BEGIN; + +-- Remove external_metadata column from knowledge_base_file table +ALTER TABLE knowledge_base_file +DROP COLUMN external_metadata; + +COMMIT; diff --git a/pkg/db/migration/000018_add_external_metadata_in_kb_file_table.up.sql b/pkg/db/migration/000018_add_external_metadata_in_kb_file_table.up.sql new file mode 100644 index 0000000..d53ea24 --- /dev/null +++ b/pkg/db/migration/000018_add_external_metadata_in_kb_file_table.up.sql @@ -0,0 +1,10 @@ +BEGIN; + +-- Add external_metadata column to knowledge_base_file table +ALTER TABLE knowledge_base_file +ADD COLUMN external_metadata JSONB; + +-- Add comment for the new column +COMMENT ON COLUMN knowledge_base_file.external_metadata IS 'External metadata stored as JSON, serialized from protobuf Struct'; + +COMMIT; diff --git a/pkg/handler/knowledgebasefiles.go b/pkg/handler/knowledgebasefiles.go index e9f9b01..8a2dc47 100644 --- a/pkg/handler/knowledgebasefiles.go +++ b/pkg/handler/knowledgebasefiles.go @@ -117,14 +117,15 @@ func (ph *PublicHandler) UploadCatalogFile(ctx context.Context, req *artifactpb. destination := ph.service.MinIO.GetUploadedFilePathInKnowledgeBase(kb.UID.String(), req.File.Name) kbFile := repository.KnowledgeBaseFile{ - Name: req.File.Name, - Type: artifactpb.FileType_name[int32(req.File.Type)], - Owner: ns.NsUID, - CreatorUID: creatorUID, - KnowledgeBaseUID: kb.UID, - Destination: destination, - ProcessStatus: artifactpb.FileProcessStatus_name[int32(artifactpb.FileProcessStatus_FILE_PROCESS_STATUS_NOTSTARTED)], - Size: fileSize, + Name: req.File.Name, + Type: artifactpb.FileType_name[int32(req.File.Type)], + Owner: ns.NsUID, + CreatorUID: creatorUID, + KnowledgeBaseUID: kb.UID, + Destination: destination, + ProcessStatus: artifactpb.FileProcessStatus_name[int32(artifactpb.FileProcessStatus_FILE_PROCESS_STATUS_NOTSTARTED)], + Size: fileSize, + ExternalMetadataUnmarshal: req.File.ExternalMetadata, } // create catalog file in database @@ -154,18 +155,19 @@ func (ph *PublicHandler) UploadCatalogFile(ctx context.Context, req *artifactpb. return &artifactpb.UploadCatalogFileResponse{ File: &artifactpb.File{ - FileUid: res.UID.String(), - OwnerUid: res.Owner.String(), - CreatorUid: res.CreatorUID.String(), - CatalogUid: res.KnowledgeBaseUID.String(), - Name: res.Name, - Type: req.File.Type, - CreateTime: timestamppb.New(*res.CreateTime), - UpdateTime: timestamppb.New(*res.UpdateTime), - ProcessStatus: artifactpb.FileProcessStatus_FILE_PROCESS_STATUS_NOTSTARTED, - Size: res.Size, - TotalChunks: 0, - TotalTokens: 0, + FileUid: res.UID.String(), + OwnerUid: res.Owner.String(), + CreatorUid: res.CreatorUID.String(), + CatalogUid: res.KnowledgeBaseUID.String(), + Name: res.Name, + Type: req.File.Type, + CreateTime: timestamppb.New(*res.CreateTime), + UpdateTime: timestamppb.New(*res.UpdateTime), + ProcessStatus: artifactpb.FileProcessStatus_FILE_PROCESS_STATUS_NOTSTARTED, + Size: res.Size, + TotalChunks: 0, + TotalTokens: 0, + ExternalMetadata: res.ExternalMetadataUnmarshal, }, }, nil } @@ -290,18 +292,19 @@ func (ph *PublicHandler) ListCatalogFiles(ctx context.Context, req *artifactpb.L nextPageToken = nextToken for _, kbFile := range kbFiles { files = append(files, &artifactpb.File{ - FileUid: kbFile.UID.String(), - OwnerUid: kbFile.Owner.String(), - CreatorUid: kbFile.CreatorUID.String(), - CatalogUid: kbFile.KnowledgeBaseUID.String(), - Name: kbFile.Name, - Type: artifactpb.FileType(artifactpb.FileType_value[kbFile.Type]), - CreateTime: timestamppb.New(*kbFile.CreateTime), - UpdateTime: timestamppb.New(*kbFile.UpdateTime), - ProcessStatus: artifactpb.FileProcessStatus(artifactpb.FileProcessStatus_value[kbFile.ProcessStatus]), - Size: kbFile.Size, - TotalChunks: int32(totalChunks[kbFile.UID]), - TotalTokens: int32(totalTokens[kbFile.UID]), + FileUid: kbFile.UID.String(), + OwnerUid: kbFile.Owner.String(), + CreatorUid: kbFile.CreatorUID.String(), + CatalogUid: kbFile.KnowledgeBaseUID.String(), + Name: kbFile.Name, + Type: artifactpb.FileType(artifactpb.FileType_value[kbFile.Type]), + CreateTime: timestamppb.New(*kbFile.CreateTime), + UpdateTime: timestamppb.New(*kbFile.UpdateTime), + ProcessStatus: artifactpb.FileProcessStatus(artifactpb.FileProcessStatus_value[kbFile.ProcessStatus]), + Size: kbFile.Size, + ExternalMetadata: kbFile.ExternalMetadataUnmarshal, + TotalChunks: int32(totalChunks[kbFile.UID]), + TotalTokens: int32(totalTokens[kbFile.UID]), }) } } diff --git a/pkg/repository/knowledgebasefile.go b/pkg/repository/knowledgebasefile.go index 8390c15..df7692d 100644 --- a/pkg/repository/knowledgebasefile.go +++ b/pkg/repository/knowledgebasefile.go @@ -11,6 +11,8 @@ import ( "github.com/instill-ai/artifact-backend/pkg/logger" artifactpb "github.com/instill-ai/protogen-go/artifact/artifact/v1alpha" "go.uber.org/zap" + "google.golang.org/protobuf/encoding/protojson" + "google.golang.org/protobuf/types/known/structpb" "gorm.io/gorm" "gorm.io/gorm/clause" ) @@ -66,6 +68,7 @@ type KnowledgeBaseFile struct { // Process status is defined in the grpc proto file ProcessStatus string `gorm:"column:process_status;size:100;not null" json:"process_status"` // Note: use ExtraMetaDataMarshal method to marshal and unmarshal. do not populate this field directly + // this field is used internally for the extra meta data of the file ExtraMetaData string `gorm:"column:extra_meta_data;type:jsonb" json:"extra_meta_data"` // Content not used yet Content []byte `gorm:"column:content;type:bytea" json:"content"` @@ -78,6 +81,10 @@ type KnowledgeBaseFile struct { RequesterUID uuid.UUID `gorm:"column:requester_uid;type:uuid;"` // This filed is not stored in the database. It is used to unmarshal the ExtraMetaData field ExtraMetaDataUnmarshal *ExtraMetaData `gorm:"-" json:"extra_meta_data_unmarshal"` + // this field is used to let external service store the external metadata of file. + ExternalMetadata string `gorm:"column:external_metadata;type:jsonb" json:"external_metadata"` + // This field is not stored in the database. It is used to unmarshal the ExternalMetadata field + ExternalMetadataUnmarshal *structpb.Struct `gorm:"-" json:"external_metadata_unmarshal"` } type ExtraMetaData struct { @@ -107,6 +114,7 @@ type KnowledgeBaseFileColumns struct { DeleteTime string RequesterUID string Size string + ExternalMetadata string } var KnowledgeBaseFileColumn = KnowledgeBaseFileColumns{ @@ -124,6 +132,7 @@ var KnowledgeBaseFileColumn = KnowledgeBaseFileColumns{ DeleteTime: "delete_time", Size: "size", RequesterUID: "requester_uid", + ExternalMetadata: "external_metadata", } // ExtraMetaDataMarshal marshals the ExtraMetaData struct to a JSON string @@ -154,21 +163,65 @@ func (kf *KnowledgeBaseFile) ExtraMetaDataUnmarshalFunc() error { return nil } +// ExternalMetadataToJSON converts structpb.Struct to JSON string for DB storage +func (kf *KnowledgeBaseFile) ExternalMetadataToJSON() error { + if kf.ExternalMetadataUnmarshal == nil { + kf.ExternalMetadata = "{}" + return nil + } + + jsonBytes, err := protojson.Marshal(kf.ExternalMetadataUnmarshal) + if err != nil { + return fmt.Errorf("failed to marshal external metadata to JSON: %v", err) + } + + kf.ExternalMetadata = string(jsonBytes) + return nil +} + +// JSONToExternalMetadata converts JSON string from DB to structpb.Struct +func (kf *KnowledgeBaseFile) JSONToExternalMetadata() error { + if kf.ExternalMetadata == "" { + kf.ExternalMetadataUnmarshal = nil + return nil + } + + s := &structpb.Struct{} + if err := protojson.Unmarshal([]byte(kf.ExternalMetadata), s); err != nil { + return fmt.Errorf("failed to unmarshal external metadata from JSON: %v", err) + } + + kf.ExternalMetadataUnmarshal = s + return nil +} + // GORM hooks func (kf *KnowledgeBaseFile) BeforeCreate(tx *gorm.DB) (err error) { - return kf.ExtraMetaDataMarshal() + if err := kf.ExtraMetaDataMarshal(); err != nil { + return err + } + return kf.ExternalMetadataToJSON() } func (kf *KnowledgeBaseFile) BeforeSave(tx *gorm.DB) (err error) { - return kf.ExtraMetaDataMarshal() + if err := kf.ExtraMetaDataMarshal(); err != nil { + return err + } + return kf.ExternalMetadataToJSON() } func (kf *KnowledgeBaseFile) BeforeUpdate(tx *gorm.DB) (err error) { - return kf.ExtraMetaDataMarshal() + if err := kf.ExtraMetaDataMarshal(); err != nil { + return err + } + return kf.ExternalMetadataToJSON() } func (kf *KnowledgeBaseFile) AfterFind(tx *gorm.DB) (err error) { - return kf.ExtraMetaDataUnmarshalFunc() + if err := kf.ExtraMetaDataUnmarshalFunc(); err != nil { + return err + } + return kf.JSONToExternalMetadata() } // KnowledgeBaseFileTableName returns the table name of the KnowledgeBaseFile diff --git a/pkg/worker/worker.go b/pkg/worker/worker.go index e630398..9590bf7 100644 --- a/pkg/worker/worker.go +++ b/pkg/worker/worker.go @@ -526,6 +526,8 @@ func (wp *fileToEmbWorkerPool) processChunkingFile(ctx context.Context, file rep logger.Error("Failed to get converted file from minIO.", zap.String("Converted file uid", convertedFile.UID.String())) return nil, artifactpb.FileProcessStatus_FILE_PROCESS_STATUS_UNSPECIFIED, err } + + // TODO: some file use splitTextPipe and some use splitMarkdownPipe // call the markdown chunking pipeline requesterUID := file.RequesterUID chunks, err := wp.svc.SplitMarkdownPipe(ctx, file.CreatorUID, requesterUID, string(convertedFileData))