Add image index and pdf parse (#3956)

* feat: think tag parse

* feat: parse think tag test

* feat: pdf parse ux

* feat: doc2x parse

* perf: rewrite training mode setting

* feat: image parse queue

* perf: image index

* feat: image parse process

* feat: add init sh

* fix: ts
This commit is contained in:
Archer
2025-03-03 23:08:29 +08:00
committed by archer
parent 08b6f594df
commit adf5377ebe
106 changed files with 2337 additions and 1454 deletions

View File

@@ -0,0 +1,65 @@
import { NextAPI } from '@/service/middleware/entry';
import { authCert } from '@fastgpt/service/support/permission/auth/common';
import { NextApiRequest, NextApiResponse } from 'next';
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
import { DatasetCollectionDataProcessModeEnum } from '@fastgpt/global/core/dataset/constants';
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
// Migrate every collection with trainingType=auto to trainingType=chunk (keeping auto-index behavior via the autoIndexes flag)
/**
 * Migration step 1: convert every dataset collection whose trainingType is
 * `auto` to `chunk`, and set `autoIndexes: true` so the previous automatic
 * indexing behavior is preserved under the new mode.
 */
const updateCollections = async () => {
  const legacyAutoFilter = {
    trainingType: DatasetCollectionDataProcessModeEnum.auto
  };
  const chunkWithAutoIndexes = {
    $set: {
      trainingType: DatasetCollectionDataProcessModeEnum.chunk,
      autoIndexes: true
    }
  };
  await MongoDatasetCollection.updateMany(legacyAutoFilter, chunkWithAutoIndexes);
};
/**
 * Migration step 2: backfill an explicit `type` on every element of each
 * data record's `indexes` array. Indexes flagged `defaultIndex: true` get
 * `type = default`; all others get `type = custom`. Implemented as an
 * aggregation-pipeline update so the rewrite happens server-side in a
 * single updateMany.
 */
const updateData = async () => {
  // Pipeline stage: map over `indexes`, merging the derived `type` into each element.
  const addIndexTypeStage = {
    $set: {
      indexes: {
        $map: {
          input: '$indexes',
          as: 'index',
          in: {
            $mergeObjects: [
              '$$index',
              {
                type: {
                  $cond: {
                    if: { $eq: ['$$index.defaultIndex', true] },
                    then: DatasetDataIndexTypeEnum.default,
                    else: DatasetDataIndexTypeEnum.custom
                  }
                }
              }
            ]
          }
        }
      }
    }
  };

  await MongoDatasetData.updateMany({ indexes: { $exists: true } }, [addIndexTypeStage]);
};
/**
 * One-off migration endpoint (root credential required).
 * Runs the two migration steps above in order:
 *   1. collections: trainingType auto -> chunk (with autoIndexes: true);
 *   2. data: tag each index with type 'default' or 'custom'.
 * Returns { success: true } when both updates complete.
 */
async function handler(req: NextApiRequest, _res: NextApiResponse) {
  // Only the root user may trigger this migration.
  await authCert({ req, authRoot: true });

  console.log('变更所有 collection 的 trainingType 为 chunk');
  await updateCollections();

  console.log(
    "更新所有 data 的 index, autoIndex=true 的增加type='default',其他的增加 type='custom'"
  );
  await updateData();

  return { success: true };
}
export default NextAPI(handler);

View File

@@ -1,78 +0,0 @@
/*
  Read the dataset source file content and respond with the first 3000 characters.
*/
import type { NextApiResponse } from 'next';
import { authCollectionFile } from '@fastgpt/service/support/permission/auth/file';
import { NextAPI } from '@/service/middleware/entry';
import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read';
import { ApiRequestProps } from '@fastgpt/service/type/next';
import {
OwnerPermissionVal,
WritePermissionVal
} from '@fastgpt/global/support/permission/constant';
import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
// Request body for the file-content preview endpoint.
export type PreviewContextProps = {
  datasetId: string; // dataset the source belongs to (used for auth on non-local sources)
  type: DatasetSourceReadTypeEnum; // where the raw content comes from (local file, link, API, ...)
  sourceId: string; // file id / link / external id, interpreted according to `type`
  isQAImport?: boolean; // presumably marks a QA-format import — TODO confirm against callers
  selector?: string; // CSS selector applied when the source is a web page — TODO confirm
  externalFileId?: string; // id of the file within an external API dataset source
};
/**
 * Preview a dataset source's raw content.
 * Authorizes the caller according to the source type, reads the raw text,
 * and returns the first 3000 characters plus the total length.
 */
async function handler(req: ApiRequestProps<PreviewContextProps>, res: NextApiResponse<any>) {
  const { type, sourceId, isQAImport, selector, datasetId, externalFileId } = req.body;

  if (!sourceId) {
    throw new Error('fileId is empty');
  }

  // Resolve the owning team (and, for API datasets, the remote server configs)
  // based on what kind of source is being previewed.
  const resolveSource = async () => {
    if (type === DatasetSourceReadTypeEnum.fileLocal) {
      // Locally uploaded file: caller must own the stored file.
      const fileAuth = await authCollectionFile({
        req,
        authToken: true,
        authApiKey: true,
        fileId: sourceId,
        per: OwnerPermissionVal
      });
      return {
        teamId: fileAuth.teamId
      };
    }

    // Any other source type: caller needs write permission on the dataset.
    const { dataset } = await authDataset({
      req,
      authApiKey: true,
      authToken: true,
      datasetId,
      per: WritePermissionVal
    });
    return {
      teamId: dataset.teamId,
      apiServer: dataset.apiServer,
      feishuServer: dataset.feishuServer,
      yuqueServer: dataset.yuqueServer
    };
  };

  const { teamId, apiServer, feishuServer, yuqueServer } = await resolveSource();

  const rawText = await readDatasetSourceRawText({
    teamId,
    type,
    sourceId,
    isQAImport,
    selector,
    apiServer,
    feishuServer,
    yuqueServer,
    externalFileId
  });

  // Cap the preview at the first 3000 characters.
  return {
    previewContent: rawText.slice(0, 3000),
    totalLength: rawText.length
  };
}
export default NextAPI(handler);

View File

@@ -4,7 +4,8 @@ import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller';
import {
TrainingModeEnum,
DatasetCollectionTypeEnum
DatasetCollectionTypeEnum,
DatasetCollectionDataProcessModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { NextAPI } from '@/service/middleware/entry';
@@ -15,15 +16,7 @@ import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection
import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset';
async function handler(req: NextApiRequest): CreateCollectionResponse {
const {
name,
apiFileId,
trainingType = TrainingModeEnum.chunk,
chunkSize = 512,
chunkSplitter,
qaPrompt,
...body
} = req.body as ApiDatasetCreateDatasetCollectionParams;
const { name, apiFileId, ...body } = req.body as ApiDatasetCreateDatasetCollectionParams;
const { teamId, tmbId, dataset } = await authDataset({
req,
@@ -56,7 +49,8 @@ async function handler(req: NextApiRequest): CreateCollectionResponse {
feishuServer,
yuqueServer,
apiFileId,
teamId
teamId,
tmbId
});
const { collectionId, insertResults } = await createCollectionAndInsertData({
@@ -69,10 +63,6 @@ async function handler(req: NextApiRequest): CreateCollectionResponse {
tmbId,
type: DatasetCollectionTypeEnum.apiFile,
name: name,
trainingType,
chunkSize,
chunkSplitter,
qaPrompt,
apiFileId,
metadata: {
relatedImgId: apiFileId

View File

@@ -4,6 +4,7 @@ import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
import { FileIdCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api';
import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller';
import {
DatasetCollectionDataProcessModeEnum,
DatasetCollectionTypeEnum,
TrainingModeEnum
} from '@fastgpt/global/core/dataset/constants';
@@ -15,7 +16,6 @@ import { MongoRawTextBuffer } from '@fastgpt/service/common/buffer/rawText/schem
async function handler(req: NextApiRequest): CreateCollectionResponse {
const { datasetId, parentId, fileId, ...body } = req.body as FileIdCreateDatasetCollectionParams;
const trainingType = TrainingModeEnum.chunk;
const { teamId, tmbId, dataset } = await authDataset({
req,
authToken: true,
@@ -27,6 +27,7 @@ async function handler(req: NextApiRequest): CreateCollectionResponse {
// 1. read file
const { rawText, filename } = await readFileContentFromMongo({
teamId,
tmbId,
bucketName: BucketNameEnum.dataset,
fileId,
isQAImport: true
@@ -47,7 +48,7 @@ async function handler(req: NextApiRequest): CreateCollectionResponse {
fileId,
// special metadata
trainingType,
trainingType: DatasetCollectionDataProcessModeEnum.chunk,
chunkSize: 0
}
});

View File

@@ -2,12 +2,8 @@ import { readFileContentFromMongo } from '@fastgpt/service/common/file/gridfs/co
import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
import { FileIdCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api';
import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller';
import {
DatasetCollectionTypeEnum,
TrainingModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { hashStr } from '@fastgpt/global/common/string/tools';
import { MongoRawTextBuffer } from '@fastgpt/service/common/buffer/rawText/schema';
import { NextAPI } from '@/service/middleware/entry';
import { ApiRequestProps } from '@fastgpt/service/type/next';
@@ -17,14 +13,7 @@ import { CreateCollectionResponse } from '@/global/core/dataset/api';
async function handler(
req: ApiRequestProps<FileIdCreateDatasetCollectionParams>
): CreateCollectionResponse {
const {
fileId,
trainingType = TrainingModeEnum.chunk,
chunkSize = 512,
chunkSplitter,
qaPrompt,
...body
} = req.body;
const { fileId, customPdfParse, ...body } = req.body;
const { teamId, tmbId, dataset } = await authDataset({
req,
@@ -37,8 +26,10 @@ async function handler(
// 1. read file
const { rawText, filename } = await readFileContentFromMongo({
teamId,
tmbId,
bucketName: BucketNameEnum.dataset,
fileId
fileId,
customPdfParse
});
const { collectionId, insertResults } = await createCollectionAndInsertData({
@@ -54,12 +45,7 @@ async function handler(
metadata: {
relatedImgId: fileId
},
// special metadata
trainingType,
chunkSize,
chunkSplitter,
qaPrompt
customPdfParse
},
relatedId: fileId

View File

@@ -13,14 +13,7 @@ import { urlsFetch } from '@fastgpt/service/common/string/cheerio';
import { hashStr } from '@fastgpt/global/common/string/tools';
async function handler(req: NextApiRequest): CreateCollectionResponse {
const {
link,
trainingType = TrainingModeEnum.chunk,
chunkSize = 512,
chunkSplitter,
qaPrompt,
...body
} = req.body as LinkCreateDatasetCollectionParams;
const { link, ...body } = req.body as LinkCreateDatasetCollectionParams;
const { teamId, tmbId, dataset } = await authDataset({
req,
@@ -53,12 +46,6 @@ async function handler(req: NextApiRequest): CreateCollectionResponse {
relatedImgId: link,
webPageSelector: body?.metadata?.webPageSelector
},
trainingType,
chunkSize,
chunkSplitter,
qaPrompt,
rawLink: link
},

View File

@@ -6,7 +6,7 @@ import { FileCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/
import { removeFilesByPaths } from '@fastgpt/service/common/file/utils';
import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller';
import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { getNanoid, hashStr } from '@fastgpt/global/common/string/tools';
import { getNanoid } from '@fastgpt/global/common/string/tools';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { readRawTextByLocalFile } from '@fastgpt/service/common/file/read/utils';
import { NextAPI } from '@/service/middleware/entry';
@@ -48,8 +48,10 @@ async function handler(req: NextApiRequest, res: NextApiResponse<any>): CreateCo
// 1. read file
const { rawText } = await readRawTextByLocalFile({
teamId,
tmbId,
path: file.path,
encoding: file.encoding,
customPdfParse: collectionData.customPdfParse,
metadata: {
...fileMetadata,
relatedId: relatedImgId

View File

@@ -24,20 +24,14 @@ type RetrainingCollectionResponse = {
async function handler(
req: ApiRequestProps<reTrainingDatasetFileCollectionParams>
): Promise<RetrainingCollectionResponse> {
const {
collectionId,
trainingType = TrainingModeEnum.chunk,
chunkSize = 512,
chunkSplitter,
qaPrompt
} = req.body;
const { collectionId, customPdfParse, ...data } = req.body;
if (!collectionId) {
return Promise.reject(CommonErrEnum.missingParams);
}
// 凭证校验
const { collection } = await authDatasetCollection({
const { collection, teamId, tmbId } = await authDatasetCollection({
req,
authToken: true,
authApiKey: true,
@@ -84,7 +78,9 @@ async function handler(
})();
const rawText = await readDatasetSourceRawText({
teamId: collection.teamId,
teamId,
tmbId,
customPdfParse,
...sourceReadType
});
@@ -100,12 +96,15 @@ async function handler(
dataset: collection.dataset,
rawText,
createCollectionParams: {
...data,
teamId: collection.teamId,
tmbId: collection.tmbId,
datasetId: collection.dataset._id,
name: collection.name,
type: collection.type,
customPdfParse,
fileId: collection.fileId,
rawLink: collection.rawLink,
externalFileId: collection.externalFileId,
@@ -121,10 +120,6 @@ async function handler(
parentId: collection.parentId,
// special metadata
trainingType,
chunkSize,
chunkSplitter,
qaPrompt,
metadata: collection.metadata
}
});

View File

@@ -2,25 +2,13 @@ import type { NextApiRequest } from 'next';
import type { TextCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller';
import {
TrainingModeEnum,
DatasetCollectionTypeEnum
} from '@fastgpt/global/core/dataset/constants';
import { hashStr } from '@fastgpt/global/common/string/tools';
import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { NextAPI } from '@/service/middleware/entry';
import { WritePermissionVal } from '@fastgpt/global/support/permission/constant';
import { CreateCollectionResponse } from '@/global/core/dataset/api';
async function handler(req: NextApiRequest): CreateCollectionResponse {
const {
name,
text,
trainingType = TrainingModeEnum.chunk,
chunkSize = 512,
chunkSplitter,
qaPrompt,
...body
} = req.body as TextCreateDatasetCollectionParams;
const { name, text, ...body } = req.body as TextCreateDatasetCollectionParams;
const { teamId, tmbId, dataset } = await authDataset({
req,
@@ -39,11 +27,7 @@ async function handler(req: NextApiRequest): CreateCollectionResponse {
tmbId,
type: DatasetCollectionTypeEnum.virtual,
name,
trainingType,
chunkSize,
chunkSplitter,
qaPrompt
name
}
});

View File

@@ -6,12 +6,12 @@ import {
getLLMModel,
getEmbeddingModel,
getDatasetModel,
getDefaultEmbeddingModel
getDefaultEmbeddingModel,
getVlmModel
} from '@fastgpt/service/core/ai/model';
import { checkTeamDatasetLimit } from '@fastgpt/service/support/permission/teamLimit';
import { WritePermissionVal } from '@fastgpt/global/support/permission/constant';
import { NextAPI } from '@/service/middleware/entry';
import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset';
import type { ApiRequestProps } from '@fastgpt/service/type/next';
import { parseParentIdInMongo } from '@fastgpt/global/common/parentFolder/utils';
import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
@@ -32,8 +32,9 @@ async function handler(
intro,
type = DatasetTypeEnum.dataset,
avatar,
vectorModel = getDefaultEmbeddingModel().model,
agentModel = getDatasetModel().model,
vectorModel = getDefaultEmbeddingModel()?.model,
agentModel = getDatasetModel()?.model,
vlmModel,
apiServer,
feishuServer,
yuqueServer
@@ -63,8 +64,11 @@ async function handler(
// check model valid
const vectorModelStore = getEmbeddingModel(vectorModel);
const agentModelStore = getLLMModel(agentModel);
if (!vectorModelStore || !agentModelStore) {
return Promise.reject(DatasetErrEnum.invalidVectorModelOrQAModel);
if (!vectorModelStore) {
return Promise.reject(`System not embedding model`);
}
if (!agentModelStore) {
return Promise.reject(`System not llm model`);
}
// check limit
@@ -81,6 +85,7 @@ async function handler(
tmbId,
vectorModel,
agentModel,
vlmModel,
avatar,
type,
apiServer,

View File

@@ -7,9 +7,13 @@ import { predictDataLimitLength } from '@fastgpt/global/core/dataset/utils';
import { pushDataListToTrainingQueue } from '@fastgpt/service/core/dataset/training/controller';
import { NextAPI } from '@/service/middleware/entry';
import { WritePermissionVal } from '@fastgpt/global/support/permission/constant';
import { getTrainingModeByCollection } from '@fastgpt/service/core/dataset/collection/utils';
async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
const body = req.body as PushDatasetDataProps;
// Adapter 4.9.0
body.trainingType = body.trainingType || body.trainingMode;
const { collectionId, data } = body;
if (!collectionId || !Array.isArray(data)) {
@@ -32,7 +36,7 @@ async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
// auth dataset limit
await checkDatasetLimit({
teamId,
insertLen: predictDataLimitLength(collection.trainingType, data)
insertLen: predictDataLimitLength(getTrainingModeByCollection(collection), data)
});
return pushDataListToTrainingQueue({
@@ -40,8 +44,9 @@ async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
teamId,
tmbId,
datasetId: collection.datasetId,
vectorModel: collection.dataset.vectorModel,
agentModel: collection.dataset.agentModel,
vectorModel: collection.dataset.vectorModel
vlmModel: collection.dataset.vlmModel
});
}

View File

@@ -1,4 +1,4 @@
import { getLLMModel, getEmbeddingModel } from '@fastgpt/service/core/ai/model';
import { getLLMModel, getEmbeddingModel, getVlmModel } from '@fastgpt/service/core/ai/model';
import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
import { ReadPermissionVal } from '@fastgpt/global/support/permission/constant';
import { NextAPI } from '@/service/middleware/entry';
@@ -51,7 +51,8 @@ async function handler(req: ApiRequestProps<Query>): Promise<DatasetItemType> {
: undefined,
permission,
vectorModel: getEmbeddingModel(dataset.vectorModel),
agentModel: getLLMModel(dataset.agentModel)
agentModel: getLLMModel(dataset.agentModel),
vlmModel: getVlmModel(dataset.vlmModel)
};
}

View File

@@ -17,6 +17,7 @@ export type PostPreviewFilesChunksProps = {
chunkSize: number;
overlapRatio: number;
customSplitChar?: string;
customPdfParse?: boolean;
// Read params
selector?: string;
@@ -40,7 +41,8 @@ async function handler(
selector,
isQAImport,
datasetId,
externalFileId
externalFileId,
customPdfParse = false
} = req.body;
if (!sourceId) {
@@ -50,7 +52,7 @@ async function handler(
throw new Error('chunkSize is too large, should be less than 30000');
}
const { teamId, apiServer, feishuServer, yuqueServer } = await (async () => {
const { teamId, tmbId, apiServer, feishuServer, yuqueServer } = await (async () => {
if (type === DatasetSourceReadTypeEnum.fileLocal) {
const res = await authCollectionFile({
req,
@@ -60,10 +62,11 @@ async function handler(
per: OwnerPermissionVal
});
return {
teamId: res.teamId
teamId: res.teamId,
tmbId: res.tmbId
};
}
const { dataset } = await authDataset({
const { dataset, teamId, tmbId } = await authDataset({
req,
authApiKey: true,
authToken: true,
@@ -71,7 +74,8 @@ async function handler(
per: WritePermissionVal
});
return {
teamId: dataset.teamId,
teamId,
tmbId,
apiServer: dataset.apiServer,
feishuServer: dataset.feishuServer,
yuqueServer: dataset.yuqueServer
@@ -80,6 +84,7 @@ async function handler(
const rawText = await readDatasetSourceRawText({
teamId,
tmbId,
type,
sourceId,
selector,
@@ -87,7 +92,8 @@ async function handler(
apiServer,
feishuServer,
yuqueServer,
externalFileId
externalFileId,
customPdfParse
});
return rawText2Chunks({
@@ -96,6 +102,6 @@ async function handler(
overlapRatio,
customReg: customSplitChar ? [customSplitChar] : [],
isQAImport: isQAImport
}).slice(0, 15);
}).slice(0, 10);
}
export default NextAPI(handler);

View File

@@ -6,7 +6,7 @@ import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';
import { createTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller';
import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants';
import { getLLMModel, getEmbeddingModel } from '@fastgpt/service/core/ai/model';
import { getLLMModel, getEmbeddingModel, getVlmModel } from '@fastgpt/service/core/ai/model';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { ApiRequestProps } from '@fastgpt/service/type/next';
import { OwnerPermissionVal } from '@fastgpt/global/support/permission/constant';
@@ -50,7 +50,8 @@ async function handler(req: ApiRequestProps<rebuildEmbeddingBody>): Promise<Resp
appName: '切换索引模型',
billSource: UsageSourceEnum.training,
vectorModel: getEmbeddingModel(dataset.vectorModel)?.name,
agentModel: getLLMModel(dataset.agentModel)?.name
agentModel: getLLMModel(dataset.agentModel)?.name,
vllmModel: getVlmModel(dataset.vlmModel)?.name
});
// update vector model and dataset.data rebuild field

View File

@@ -56,6 +56,7 @@ async function handler(
avatar,
intro,
agentModel,
vlmModel,
websiteConfig,
externalReadUrl,
apiServer,
@@ -109,7 +110,7 @@ async function handler(
updateTraining({
teamId: dataset.teamId,
datasetId: id,
agentModel: agentModel?.model
agentModel
});
const onUpdate = async (session: ClientSession) => {
@@ -119,7 +120,8 @@ async function handler(
...parseParentIdInMongo(parentId),
...(name && { name }),
...(avatar && { avatar }),
...(agentModel && { agentModel: agentModel.model }),
...(agentModel && { agentModel }),
...(vlmModel && { vlmModel }),
...(websiteConfig && { websiteConfig }),
...(status && { status }),
...(intro !== undefined && { intro }),
@@ -212,7 +214,7 @@ const updateTraining = async ({
$set: {
model: agentModel,
retryCount: 5,
lockTime: new Date()
lockTime: new Date('2000/1/1')
}
}
);

View File

@@ -1,7 +1,7 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import type { NextApiRequest } from 'next';
import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants';
import { CreateTrainingUsageProps } from '@fastgpt/global/support/wallet/usage/api.d';
import { getLLMModel, getEmbeddingModel } from '@fastgpt/service/core/ai/model';
import { getLLMModel, getEmbeddingModel, getVlmModel } from '@fastgpt/service/core/ai/model';
import { createTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller';
import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
import { WritePermissionVal } from '@fastgpt/global/support/permission/constant';
@@ -24,7 +24,8 @@ async function handler(req: NextApiRequest) {
appName: name,
billSource: UsageSourceEnum.training,
vectorModel: getEmbeddingModel(dataset.vectorModel).name,
agentModel: getLLMModel(dataset.agentModel).name
agentModel: getLLMModel(dataset.agentModel).name,
vllmModel: getVlmModel(dataset.vlmModel)?.name
});
return billId;