feat: chunk index independent config (#4271)

* sync collection

* remove lock

* feat: chunk index independent config

* feat: add max chunksize to split chunk function

* remove log

* update doc

* remove

* remove log
Author: Archer
Date: 2025-03-21 16:44:25 +08:00
Committed by: archer
Parent: 222ff0d49a
Commit: e812ad6e84

47 changed files with 784 additions and 443 deletions
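
The diffs below all orbit a small set of new chunk-setting primitives that now travel together through the dataset APIs. As a reading aid, here is a rough TypeScript sketch of the shape they imply. The enum and field names are taken from the imports and type changes further down; the concrete enum string values are assumptions for illustration only:

// Sketch only: names mirror the diff below; the string values are assumed.
enum ChunkSettingModeEnum {
  auto = 'auto', // derive chunk size/splitter from the dataset's agent model
  custom = 'custom' // caller supplies chunk settings explicitly
}

enum DataChunkSplitModeEnum {
  size = 'size', // split by a token/character budget
  char = 'char' // split on a custom delimiter string
}

// The chunk settings that the preview and insert endpoints now share.
type ChunkSettings = {
  chunkSettingMode: ChunkSettingModeEnum;
  chunkSplitMode: DataChunkSplitModeEnum;
  chunkSize: number;
  chunkSplitter?: string;
  overlapRatio: number;
};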

View File

@@ -294,7 +294,7 @@ const MyInfo = ({ onOpenContact }: { onOpenContact: () => void }) => {
         title={t('account_info:click_modify_nickname')}
         borderColor={'transparent'}
         transform={'translateX(-11px)'}
-        maxLength={20}
+        maxLength={100}
         onBlur={async (e) => {
           const val = e.target.value;
           if (val === userInfo?.team?.memberName) return;

View File

@@ -2,8 +2,7 @@ import { reTrainingDatasetFileCollectionParams } from '@fastgpt/global/core/data
 import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller';
 import {
   DatasetCollectionTypeEnum,
-  DatasetSourceReadTypeEnum,
-  TrainingModeEnum
+  DatasetSourceReadTypeEnum
 } from '@fastgpt/global/core/dataset/constants';
 import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
 import { hashStr } from '@fastgpt/global/common/string/tools';

View File

@@ -4,7 +4,7 @@
  */
 import type { NextApiRequest } from 'next';
 import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken/index';
-import { getEmbeddingModel } from '@fastgpt/service/core/ai/model';
+import { getEmbeddingModel, getLLMModel } from '@fastgpt/service/core/ai/model';
 import { hasSameValue } from '@/service/core/dataset/data/utils';
 import { insertData2Dataset } from '@/service/core/dataset/data/controller';
 import { authDatasetCollection } from '@fastgpt/service/support/permission/dataset/auth';
@@ -16,6 +16,7 @@ import { checkDatasetLimit } from '@fastgpt/service/support/permission/teamLimit
 import { NextAPI } from '@/service/middleware/entry';
 import { WritePermissionVal } from '@fastgpt/global/support/permission/constant';
 import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
+import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';

 async function handler(req: NextApiRequest) {
   const { collectionId, q, a, indexes } = req.body as InsertOneDatasetDataProps;
@@ -45,7 +46,7 @@ async function handler(req: NextApiRequest) {
   // auth collection and get dataset
   const [
     {
-      dataset: { _id: datasetId, vectorModel }
+      dataset: { _id: datasetId, vectorModel, agentModel }
     }
   ] = await Promise.all([getCollectionWithDataset(collectionId)]);
@@ -60,9 +61,11 @@ async function handler(req: NextApiRequest) {
   // token check
   const token = await countPromptTokens(formatQ + formatA, '');
   const vectorModelData = getEmbeddingModel(vectorModel);
+  const llmModelData = getLLMModel(agentModel);
+  const maxChunkSize = getLLMMaxChunkSize(llmModelData);

-  if (token > vectorModelData.maxToken) {
-    return Promise.reject('Q Over Tokens');
+  if (token > maxChunkSize) {
+    return Promise.reject(`Content over max chunk size: ${maxChunkSize}`);
   }

   // Duplicate data check
@@ -82,7 +85,7 @@ async function handler(req: NextApiRequest) {
       q: formatQ,
       a: formatA,
       chunkIndex: 0,
-      model: vectorModelData.model,
+      embeddingModel: vectorModelData.model,
       indexes: formatIndexes
     });
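
The rewritten token check above caps a manually inserted Q/A pair by the agent model's chunk limit instead of the embedding model's maxToken. getLLMMaxChunkSize itself is not part of this diff; a plausible sketch of its contract, where the fallback and headroom constants are assumptions:

// Hypothetical sketch; the real helper lives in
// @fastgpt/global/core/dataset/training/utils and may differ in its constants.
type LLMModelLike = { maxContext?: number };

const getLLMMaxChunkSizeSketch = (llmModel?: LLMModelLike): number => {
  // Assumed fallback when no agent model is configured.
  if (!llmModel?.maxContext) return 8000;
  // Leave headroom below the model's context window for the prompt wrapper
  // around each chunk (both numbers are assumptions).
  return Math.max(2000, llmModel.maxContext - 4000);
};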

View File

@@ -1,4 +1,9 @@
-import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
+import {
+  ChunkSettingModeEnum,
+  DataChunkSplitModeEnum,
+  DatasetCollectionDataProcessModeEnum,
+  DatasetSourceReadTypeEnum
+} from '@fastgpt/global/core/dataset/constants';
 import { rawText2Chunks, readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read';
 import { NextAPI } from '@/service/middleware/entry';
 import { ApiRequestProps } from '@fastgpt/service/type/next';
@@ -8,17 +13,30 @@ import {
 } from '@fastgpt/global/support/permission/constant';
 import { authCollectionFile } from '@fastgpt/service/support/permission/auth/file';
 import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
+import {
+  computeChunkSize,
+  computeChunkSplitter,
+  getLLMMaxChunkSize
+} from '@fastgpt/global/core/dataset/training/utils';
+import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
+import { getLLMModel } from '@fastgpt/service/core/ai/model';

 export type PostPreviewFilesChunksProps = {
   datasetId: string;
   type: DatasetSourceReadTypeEnum;
   sourceId: string;
-  chunkSize: number;
-  overlapRatio: number;
-  customSplitChar?: string;
   customPdfParse?: boolean;
+
+  trainingType: DatasetCollectionDataProcessModeEnum;
+
+  // Chunk settings
+  chunkSettingMode: ChunkSettingModeEnum;
+  chunkSplitMode: DataChunkSplitModeEnum;
+  chunkSize: number;
+  chunkSplitter?: string;
+  overlapRatio: number;
+
+  // Read params
   selector?: string;
   isQAImport?: boolean;
@@ -32,55 +50,64 @@ export type PreviewChunksResponse = {

 async function handler(
   req: ApiRequestProps<PostPreviewFilesChunksProps>
 ): Promise<PreviewChunksResponse> {
-  const {
+  let {
     type,
     sourceId,
+    customPdfParse = false,
+
+    trainingType,
+    chunkSettingMode,
+    chunkSplitMode,
     chunkSize,
-    customSplitChar,
+    chunkSplitter,
     overlapRatio,
+
     selector,
     isQAImport,
     datasetId,
-    externalFileId,
-    customPdfParse = false
+    externalFileId
   } = req.body;

   if (!sourceId) {
     throw new Error('sourceId is empty');
   }

-  if (chunkSize > 30000) {
-    throw new Error('chunkSize is too large, should be less than 30000');
-  }
+  const fileAuthRes =
+    type === DatasetSourceReadTypeEnum.fileLocal
+      ? await authCollectionFile({
+          req,
+          authToken: true,
+          authApiKey: true,
+          fileId: sourceId,
+          per: OwnerPermissionVal
+        })
+      : undefined;
+
+  const { dataset, teamId, tmbId } = await authDataset({
+    req,
+    authApiKey: true,
+    authToken: true,
+    datasetId,
+    per: WritePermissionVal
+  });
+
+  if (fileAuthRes && (String(fileAuthRes.tmbId) !== String(tmbId) || !fileAuthRes.isRoot)) {
+    return Promise.reject(CommonErrEnum.unAuthFile);
+  }

-  const { teamId, tmbId, apiServer, feishuServer, yuqueServer } = await (async () => {
-    if (type === DatasetSourceReadTypeEnum.fileLocal) {
-      const res = await authCollectionFile({
-        req,
-        authToken: true,
-        authApiKey: true,
-        fileId: sourceId,
-        per: OwnerPermissionVal
-      });
-      return {
-        teamId: res.teamId,
-        tmbId: res.tmbId
-      };
-    }
-
-    const { dataset, teamId, tmbId } = await authDataset({
-      req,
-      authApiKey: true,
-      authToken: true,
-      datasetId,
-      per: WritePermissionVal
-    });
-
-    return {
-      teamId,
-      tmbId,
-      apiServer: dataset.apiServer,
-      feishuServer: dataset.feishuServer,
-      yuqueServer: dataset.yuqueServer
-    };
-  })();
+  chunkSize = computeChunkSize({
+    trainingType,
+    chunkSettingMode,
+    chunkSplitMode,
+    chunkSize,
+    llmModel: getLLMModel(dataset.agentModel)
+  });
+
+  chunkSplitter = computeChunkSplitter({
+    chunkSettingMode,
+    chunkSplitMode,
+    chunkSplitter
+  });

   const { rawText } = await readDatasetSourceRawText({
     teamId,
@@ -89,18 +116,19 @@ async function handler(
     sourceId,
     selector,
     isQAImport,
-    apiServer,
-    feishuServer,
-    yuqueServer,
+    apiServer: dataset.apiServer,
+    feishuServer: dataset.feishuServer,
+    yuqueServer: dataset.yuqueServer,
     externalFileId,
     customPdfParse
   });

   return rawText2Chunks({
     rawText,
-    chunkLen: chunkSize,
+    chunkSize,
+    maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
     overlapRatio,
-    customReg: customSplitChar ? [customSplitChar] : [],
+    customReg: chunkSplitter ? [chunkSplitter] : [],
     isQAImport: isQAImport
   }).slice(0, 10);
 }
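
computeChunkSize and computeChunkSplitter replace the old hard "chunkSize > 30000" guard with model-aware clamping, which is what makes the chunk config independent of a fixed server-side limit. Their bodies are not shown in this diff; a sketch of the behavior the call sites imply, with the auto-mode default size as an assumption:

// Hypothetical sketch inferred from the call sites above; the real helpers
// live in @fastgpt/global/core/dataset/training/utils.
type ChunkComputeParams = {
  chunkSettingMode: 'auto' | 'custom'; // stands in for ChunkSettingModeEnum
  chunkSplitMode: 'size' | 'char'; // stands in for DataChunkSplitModeEnum
  chunkSize: number;
  chunkSplitter?: string;
  llmModel?: { maxContext?: number };
};

const computeChunkSizeSketch = (p: ChunkComputeParams): number => {
  const maxSize = p.llmModel?.maxContext ?? 8000; // assumed model ceiling
  if (p.chunkSettingMode === 'auto') return Math.min(512, maxSize); // assumed default
  return Math.min(p.chunkSize, maxSize); // custom sizes are clamped, never rejected
};

const computeChunkSplitterSketch = (p: ChunkComputeParams): string | undefined => {
  // Auto mode and size-based splitting ignore any custom delimiter.
  if (p.chunkSettingMode === 'auto') return undefined;
  if (p.chunkSplitMode !== 'char') return undefined;
  return p.chunkSplitter;
};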