diff --git a/packages/service/common/file/read/utils.ts b/packages/service/common/file/read/utils.ts
index 00335e9e40b1..a1a336d0d00b 100644
--- a/packages/service/common/file/read/utils.ts
+++ b/packages/service/common/file/read/utils.ts
@@ -6,6 +6,7 @@ import { addHours } from 'date-fns';
 import { WorkerNameEnum, runWorker } from '../../../worker/utils';
 import { ReadFileResponse } from '../../../worker/file/type';
 import {DatasetSchemaType} from "@fastgpt/global/core/dataset/type";
+import {initPdfText} from "../../../worker/file/extension/unstructured";
 
 export const initMarkdownText = ({
   teamId,
@@ -55,6 +56,16 @@ export const readFileRawContent = async ({
     dataset
   });
 
+  // pdf image query
+  if (['pdf'].includes(extension)) {
+    result.rawText = await initPdfText({
+      teamId: teamId,
+      metadata: metadata,
+      dataset: dataset,
+      pageElements: result?.metadata ? result?.metadata["elements"] : []
+    });
+  }
+
   // markdown data format
   if (['md', 'html', 'docx'].includes(extension)) {
     result.rawText = await initMarkdownText({
diff --git a/packages/service/core/ai/functions/queryImageDescription.ts b/packages/service/core/ai/functions/queryImageDescription.ts
index 3aa460d2da8d..f280275a120e 100644
--- a/packages/service/core/ai/functions/queryImageDescription.ts
+++ b/packages/service/core/ai/functions/queryImageDescription.ts
@@ -20,11 +20,13 @@ export async function queryImageDescription({
   rawTex,
   image_base64,
   model,
+  ai,
   language="eng"
 }: {
   rawTex: string;
   image_base64: string;
   model: string;
+  ai: any;
   language?: string;
 }) {
   // base64 string without the "data:image/jpeg;base64," prefix
@@ -49,9 +51,6 @@ export async function queryImageDescription({
     }
   ];
 
-  const ai = getAIApi({
-    timeout: 480000
-  });
   const data = await ai.chat.completions.create({
     model: getLLMModel(model).model,
     temperature: 0.1,
diff --git a/packages/service/core/dataset/unstructured/config.ts b/packages/service/core/dataset/unstructured/config.ts
index d3b5e7121599..d9c1ee448be2 100644
--- a/packages/service/core/dataset/unstructured/config.ts
+++ b/packages/service/core/dataset/unstructured/config.ts
@@ -16,29 +16,32 @@ export type UnstructuredEnvType = {
 
 let client: UnstructuredClient | null = null;
 
-function initClient(){
+function initClient(config?: UnstructuredEnvType){
+  if (!config){
+    config = global.unstructuredConfigs;
+  }
   const httpClient = axios.create({
-    timeout: global.unstructuredConfigs.timeout,
+    timeout: config.timeout,
   })
   // httpClient.interceptors.request.use((config) => {
   //   return config;
   // })
   client = new UnstructuredClient({
-    serverURL: global.unstructuredConfigs.baseUrl || 'http://localhost:8000',
+    serverURL: config.baseUrl || 'http://localhost:8000',
     security: {
       apiKeyAuth: ""
     },
     defaultClient: httpClient,
     retryConfig: {
       logger: addLog,
-      strategy: global.unstructuredConfigs.retryConfig.strategy,
+      strategy: config.retryConfig.strategy,
       retryConnectionErrors: true,
       backoff: {
-        initialInterval: global.unstructuredConfigs.retryConfig.initialInterval,
-        maxInterval: global.unstructuredConfigs.retryConfig.maxInterval,
-        maxElapsedTime: global.unstructuredConfigs.retryConfig.maxElapsedTime,
-        exponent: global.unstructuredConfigs.retryConfig.exponent,
+        initialInterval: config.retryConfig.initialInterval,
+        maxInterval: config.retryConfig.maxInterval,
+        maxElapsedTime: config.retryConfig.maxElapsedTime,
+        exponent: config.retryConfig.exponent,
       }
     }
   });
@@ -46,11 +49,9 @@
 
 
 
-export const getClient = ({
-
-}) => {
+export const getClient = (config?: UnstructuredEnvType) => {
   if (!client) {
-    initClient();
+    initClient(config);
  }
  return client;
 }
\ No newline at end of file
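
With the new `initClient(config?)` signature, callers that cannot rely on `global.unstructuredConfigs` being populated (for example code running inside a worker thread) can hand the config to `getClient` directly. A minimal sketch of such a call, assuming `UnstructuredEnvType` carries roughly the fields that `initClient` reads above (`baseUrl`, `timeout`, and a `retryConfig` block); the literal values and the relative import path are placeholders, not part of the patch:

import { getClient } from "../../core/dataset/unstructured/config"; // path is illustrative

// Field names mirror the usages in initClient; the concrete values are placeholders.
const unstructuredConfigs = {
  baseUrl: "http://localhost:8000",
  timeout: 60_000,
  retryConfig: {
    strategy: "backoff",
    initialInterval: 500,
    maxInterval: 60_000,
    maxElapsedTime: 900_000,
    exponent: 1.5
  }
};

// The first call initializes the module-level singleton with this config;
// later calls return the same client and ignore the argument.
const client = getClient(unstructuredConfigs);
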
diff --git a/packages/service/worker/file/extension/unstructured.ts b/packages/service/worker/file/extension/unstructured.ts
index a076f6309088..89bf37790328 100644
--- a/packages/service/worker/file/extension/unstructured.ts
+++ b/packages/service/worker/file/extension/unstructured.ts
@@ -1,23 +1,14 @@
-import { ReadFileByBufferParams, ReadFileResponse } from "../../../common/file/read/type";
-import { initMarkdownText } from '../../../common/file/read/utils';
-import { getDownloadStream, getFileById } from "../../../common/file/gridfs/controller";
-import { queryImageDescription } from "../../../core/ai/functions/queryImageDescription";
-import { getLLMModel } from "../../../core/ai/model";
-import { getClient } from "../../../core/dataset/unstructured/config";
-import { addLog } from "../../../common/system/log";
-import { PDFDocument } from "pdf-lib"
+import {ReadFileResponse} from "../../../common/file/read/type";
+import {initMarkdownText} from '../../../common/file/read/utils';
+import {queryImageDescription} from "../../../core/ai/functions/queryImageDescription";
+import {getClient} from "../../../core/dataset/unstructured/config";
+import {addLog} from "../../../common/system/log";
+import {PDFDocument} from "pdf-lib"
 import pLimit from "p-limit";
 import {ReadRawTextByBuffer} from "../type";
-
-type TokenType = {
-  str: string;
-  dir: string;
-  width: number;
-  height: number;
-  transform: number[];
-  fontName: string;
-  hasEOL: boolean;
-};
+import {workerData} from "worker_threads"
+import {DatasetSchemaType} from "@fastgpt/global/core/dataset/type";
+import {getAIApi} from "../../../core/ai/config";
 
 type UnstructuredElementType = {
   type: string;
@@ -32,7 +23,7 @@
 const limit = pLimit(3);
 
 // Partition the file with Unstructured; currently supports pdf and word
-export const readUnFile = async ({ buffer, preview, metadata, teamId, dataset }: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
+export const readUnFile = async ({ buffer, preview, metadata }: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
 
   if (preview) {
     const pdfDoc = await PDFDocument.load(buffer);
@@ -47,7 +38,7 @@
 
   //1. Request Unstructured to partition the pdf
   addLog.info(`File ${metadata?.relatedId} partition started.`);
-  const client = getClient({})
+  const client = getClient(workerData.globalConfig.unstructuredConfigs)
   const res = await client?.general.partition({
     files: {
       content: buffer,
@@ -67,7 +58,23 @@
   if (!pageElements || pageElements.length == 0) {
     pageElements = []
   }
+  if (metadata) {
+    metadata["elements"] = pageElements;
+  }
+  return {
+    formatText: "", metadata: metadata, rawText: ""
+  }
+}
+export const initPdfText = async ({ metadata, teamId, dataset, pageElements }: {
+  metadata: any;
+  teamId: string;
+  dataset: DatasetSchemaType | undefined;
+  pageElements: any[];
+}): Promise<string> => {
+  const ai = getAIApi({
+    timeout: 480000
+  })
 
   //3. Ask the vision LLM to describe images and tables  4. Insert the images/tables into MongoDB
   const asyncOperation = async (element: UnstructuredElementType) => {
     if (["Image", "Table"].includes(element.type) && element.text.length >= 2 && element.metadata.image_base64) {
@@ -77,6 +84,7 @@
         rawTex: element.text,
         image_base64: "data:image/jpeg;base64," + element.metadata.image_base64,
         model: (dataset?.agentModel || "gemini-pro-vision"),
+        ai: ai,
         language: element.metadata.languages[0],
       }).catch(error => {
         addLog.error(`Llm image ${element.element_id} error:`, error)
@@ -105,7 +113,5 @@
   }).join('');
 
   addLog.info(`Join ${metadata?.relatedId} pdf text end.`);
-  return {
-    formatText: "", metadata: metadata, rawText: finalText
-  }
+  return finalText
 }
\ No newline at end of file
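
The per-element fan-out above leans on `p-limit` so that at most three vision requests are in flight at once. A stripped-down sketch of that concurrency pattern in isolation; the `ElementLike` type and `describe` function are stand-ins, not the real `UnstructuredElementType` or `queryImageDescription`:

import pLimit from "p-limit";

type ElementLike = { element_id: string; text: string };

// Stand-in for the vision call; in the real code this is queryImageDescription.
async function describe(el: ElementLike): Promise<string> {
  return `described: ${el.text}`;
}

// Allow at most 3 describe() calls to run concurrently.
const limit = pLimit(3);

async function describeAll(elements: ElementLike[]): Promise<string[]> {
  // Results come back in input order even though execution order may differ.
  return Promise.all(elements.map((el) => limit(() => describe(el))));
}
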
diff --git a/packages/service/worker/file/type.d.ts b/packages/service/worker/file/type.d.ts
index 1992e3395af4..2b0563de862a 100644
--- a/packages/service/worker/file/type.d.ts
+++ b/packages/service/worker/file/type.d.ts
@@ -8,7 +8,6 @@ export type ReadRawTextProps = {
   encoding: string;
   preview?: boolean;
   teamId: string;
-  dataset?: DatasetSchemaType;
   metadata?: Record<string, any>;
 };
 
@@ -17,4 +16,5 @@ export type ReadRawTextByBuffer = ReadRawTextProps;
 export type ReadFileResponse = {
   rawText: string;
   formatText?: string;
+  metadata?: Record<string, any>;
 };
diff --git a/packages/service/worker/utils.ts b/packages/service/worker/utils.ts
index f9ab4be72f9b..8390ccd2b0e7 100644
--- a/packages/service/worker/utils.ts
+++ b/packages/service/worker/utils.ts
@@ -9,7 +9,11 @@
 
 export const getWorker = (name: WorkerNameEnum) => {
   const workerPath = path.join(process.cwd(), '.next', 'server', 'worker', `${name}.js`);
-  return new Worker(workerPath);
+  return new Worker(workerPath, {workerData: {
+    globalConfig: {
+      unstructuredConfigs: global.unstructuredConfigs
+    }
+  }});
 };
 
 export const runWorker = (name: WorkerNameEnum, params?: Record<string, any>) => {
diff --git a/projects/app/src/global/common/api/systemRes.d.ts b/projects/app/src/global/common/api/systemRes.d.ts
index 8ddf3369766c..f224615cc868 100644
--- a/projects/app/src/global/common/api/systemRes.d.ts
+++ b/projects/app/src/global/common/api/systemRes.d.ts
@@ -8,6 +8,7 @@
 import type { FastGPTFeConfigsType } from '@fastgpt/global/common/system/types/index.d';
 import { SubPlanType } from '@fastgpt/global/support/wallet/sub/type';
+import {UnstructuredEnvType} from "@fastgpt/service/core/dataset/unstructured/config";
 
 export type InitDateResponse = {
   llmModels: LLMModelItemType[];
@@ -15,6 +16,7 @@ export type InitDateResponse = {
   audioSpeechModels: AudioSpeechModels[];
   reRankModels: ReRankModelItemType[];
   whisperModel: WhisperModelType;
+  unstructuredConfigs: UnstructuredEnvType;
   feConfigs: FastGPTFeConfigsType;
   subPlans?: SubPlanType;
   systemVersion: string;
diff --git a/projects/app/src/pages/api/common/system/getInitData.ts b/projects/app/src/pages/api/common/system/getInitData.ts
index 9bd032964b7e..6b0e45f93fd6 100644
--- a/projects/app/src/pages/api/common/system/getInitData.ts
+++ b/projects/app/src/pages/api/common/system/getInitData.ts
@@ -19,6 +19,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
   jsonRes(res, {
     data: {
       feConfigs: global.feConfigs,
+      unstructuredConfigs: global.unstructuredConfigs,
      subPlans: global.subPlans,
      llmModels: global.llmModels,
      vectorModels: global.vectorModels,
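
Worker threads do not share the parent process's `global`, which is why `getWorker` now threads `unstructuredConfigs` through `workerData` and the worker reads it back via `worker_threads`. A self-contained, single-file illustration of that hand-off; the file name, config values, and log line are invented for the sketch:

// workerDataDemo.ts
import { Worker, isMainThread, workerData } from "worker_threads";

if (isMainThread) {
  // Parent: clone the config into the worker at spawn time, much like getWorker() does.
  new Worker(__filename, {
    workerData: {
      globalConfig: {
        unstructuredConfigs: { baseUrl: "http://localhost:8000", timeout: 60_000 }
      }
    }
  });
} else {
  // Worker: global.unstructuredConfigs from the parent is not visible here,
  // but the structured-cloned workerData is.
  const configs = workerData.globalConfig.unstructuredConfigs;
  console.log("unstructured baseUrl inside worker:", configs.baseUrl);
}
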
diff --git a/projects/app/src/service/events/generateFileChunk.ts b/projects/app/src/service/events/generateFileChunk.ts
index dd8528af00cc..2e0455a3438f 100644
--- a/projects/app/src/service/events/generateFileChunk.ts
+++ b/projects/app/src/service/events/generateFileChunk.ts
@@ -1,7 +1,7 @@
 import {readFileContentFromMongo} from "@fastgpt/service/common/file/gridfs/controller";
 import {BucketNameEnum} from "@fastgpt/global/common/file/constants";
 import {splitText2Chunks} from "@fastgpt/global/common/string/textSplitter";
-import {DatasetCollectionTypeEnum, TrainingModeEnum} from "@fastgpt/global/core/dataset/constants";
+import {TrainingModeEnum} from "@fastgpt/global/core/dataset/constants";
 import {checkDatasetLimit} from "@fastgpt/service/support/permission/teamLimit";
 import {predictDataLimitLength} from "@fastgpt/global/core/dataset/utils";
 import {createTrainingUsage} from "@fastgpt/service/support/wallet/usage/controller";
@@ -9,17 +9,12 @@ import {UsageSourceEnum} from "@fastgpt/global/support/wallet/usage/constants";
 import {getLLMModel, getVectorModel} from "@fastgpt/service/core/ai/model";
 import {pushDataListToTrainingQueue} from "@fastgpt/service/core/dataset/training/controller";
 import {MongoImage} from "@fastgpt/service/common/file/image/schema";
-import {jsonRes} from "@fastgpt/service/common/response";
 import {addLog} from "@fastgpt/service/common/system/log";
 import {startTrainingQueue} from "@/service/core/dataset/training/utils";
 import {DatasetSchemaType} from "@fastgpt/global/core/dataset/type";
-import {ClientSession} from "@fastgpt/service/common/mongo";
 import {mongoSessionRun} from "@fastgpt/service/common/mongo/sessionRun";
-import {createOneCollection} from "@fastgpt/service/core/dataset/collection/controller";
-import {putDatasetCollectionById} from "@/web/core/dataset/api";
 import {hashStr} from "@fastgpt/global/common/string/tools";
 import {MongoDatasetCollection} from "@fastgpt/service/core/dataset/collection/schema";
-import {getCollectionUpdateTime} from "@fastgpt/service/core/dataset/collection/utils";
 
 export const generateFileChunk = async ({
   teamId,
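
For context on the hand-off the whole change revolves around: the worker stashes the partition output on `ReadFileResponse.metadata["elements"]`, and `initPdfText` later decides which of those elements go to the vision model. A small sketch of reading that field back defensively; the element shape is inferred from the property accesses in the worker file, and `getPageElements`/`needsVisionDescription` are made-up helper names, not part of the patch:

type UnstructuredElementLike = {
  type: string;
  text: string;
  element_id: string;
  metadata: { languages: string[]; image_base64?: string };
};

// Made-up helper: pull the elements the worker stored on ReadFileResponse.metadata.
function getPageElements(metadata?: Record<string, any>): UnstructuredElementLike[] {
  const elements = metadata?.["elements"];
  return Array.isArray(elements) ? (elements as UnstructuredElementLike[]) : [];
}

// Mirrors the guard used before calling the vision model: only Image/Table elements
// that actually carry an inline base64 payload are described.
function needsVisionDescription(el: UnstructuredElementLike): boolean {
  return ["Image", "Table"].includes(el.type) && el.text.length >= 2 && !!el.metadata.image_base64;
}
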