feat: chunk index independent config (#4271)
* sync collection
* remove lock
* feat: chunk index independent config
* feat: add max chunksize to split chunk function
* remove log
* update doc
* remove
* remove log
@@ -294,7 +294,7 @@ const MyInfo = ({ onOpenContact }: { onOpenContact: () => void }) => {
           title={t('account_info:click_modify_nickname')}
           borderColor={'transparent'}
           transform={'translateX(-11px)'}
-          maxLength={20}
+          maxLength={100}
           onBlur={async (e) => {
             const val = e.target.value;
             if (val === userInfo?.team?.memberName) return;
@@ -2,8 +2,7 @@ import { reTrainingDatasetFileCollectionParams } from '@fastgpt/global/core/data
 import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller';
 import {
   DatasetCollectionTypeEnum,
-  DatasetSourceReadTypeEnum,
-  TrainingModeEnum
+  DatasetSourceReadTypeEnum
 } from '@fastgpt/global/core/dataset/constants';
 import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
 import { hashStr } from '@fastgpt/global/common/string/tools';
@@ -4,7 +4,7 @@
 */
 import type { NextApiRequest } from 'next';
 import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken/index';
-import { getEmbeddingModel } from '@fastgpt/service/core/ai/model';
+import { getEmbeddingModel, getLLMModel } from '@fastgpt/service/core/ai/model';
 import { hasSameValue } from '@/service/core/dataset/data/utils';
 import { insertData2Dataset } from '@/service/core/dataset/data/controller';
 import { authDatasetCollection } from '@fastgpt/service/support/permission/dataset/auth';
@@ -16,6 +16,7 @@ import { checkDatasetLimit } from '@fastgpt/service/support/permission/teamLimit
 import { NextAPI } from '@/service/middleware/entry';
 import { WritePermissionVal } from '@fastgpt/global/support/permission/constant';
 import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
+import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';

 async function handler(req: NextApiRequest) {
   const { collectionId, q, a, indexes } = req.body as InsertOneDatasetDataProps;
@@ -45,7 +46,7 @@ async function handler(req: NextApiRequest) {
   // auth collection and get dataset
   const [
     {
-      dataset: { _id: datasetId, vectorModel }
+      dataset: { _id: datasetId, vectorModel, agentModel }
     }
   ] = await Promise.all([getCollectionWithDataset(collectionId)]);
@@ -60,9 +61,11 @@ async function handler(req: NextApiRequest) {
   // token check
   const token = await countPromptTokens(formatQ + formatA, '');
   const vectorModelData = getEmbeddingModel(vectorModel);
+  const llmModelData = getLLMModel(agentModel);
+  const maxChunkSize = getLLMMaxChunkSize(llmModelData);

-  if (token > vectorModelData.maxToken) {
-    return Promise.reject('Q Over Tokens');
+  if (token > maxChunkSize) {
+    return Promise.reject(`Content over max chunk size: ${maxChunkSize}`);
   }

   // Duplicate data check
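The hunk above swaps the embedding model's maxToken ceiling for a limit derived from the dataset's agent LLM. A minimal sketch of that guard pattern, assuming getLLMMaxChunkSize falls back to a default when the model declares no context limit; its real body is not part of this diff, and the token counter below is a crude stand-in for countPromptTokens:

// Sketch only: LLMModelLike, DEFAULT_MAX_CHUNK_SIZE and the length-based
// token estimate are assumptions; the diff only shows the call sites.
type LLMModelLike = { maxContext?: number };

const DEFAULT_MAX_CHUNK_SIZE = 8192; // assumed fallback, not from this diff

function getLLMMaxChunkSizeSketch(model?: LLMModelLike): number {
  // Use the model's context window as the ceiling when one is declared.
  return model?.maxContext ?? DEFAULT_MAX_CHUNK_SIZE;
}

async function checkChunkTokens(formatQ: string, formatA: string, llm?: LLMModelLike) {
  const token = Math.ceil((formatQ + formatA).length / 4); // stand-in for countPromptTokens
  const maxChunkSize = getLLMMaxChunkSizeSketch(llm);
  if (token > maxChunkSize) {
    return Promise.reject(`Content over max chunk size: ${maxChunkSize}`);
  }
}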
@@ -82,7 +85,7 @@ async function handler(req: NextApiRequest) {
     q: formatQ,
     a: formatA,
     chunkIndex: 0,
-    model: vectorModelData.model,
+    embeddingModel: vectorModelData.model,
     indexes: formatIndexes
   });
@@ -1,4 +1,9 @@
-import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
+import {
+  ChunkSettingModeEnum,
+  DataChunkSplitModeEnum,
+  DatasetCollectionDataProcessModeEnum,
+  DatasetSourceReadTypeEnum
+} from '@fastgpt/global/core/dataset/constants';
 import { rawText2Chunks, readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read';
 import { NextAPI } from '@/service/middleware/entry';
 import { ApiRequestProps } from '@fastgpt/service/type/next';
@@ -8,17 +13,30 @@ import {
 } from '@fastgpt/global/support/permission/constant';
 import { authCollectionFile } from '@fastgpt/service/support/permission/auth/file';
 import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
+import {
+  computeChunkSize,
+  computeChunkSplitter,
+  getLLMMaxChunkSize
+} from '@fastgpt/global/core/dataset/training/utils';
+import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
+import { getLLMModel } from '@fastgpt/service/core/ai/model';

 export type PostPreviewFilesChunksProps = {
   datasetId: string;
   type: DatasetSourceReadTypeEnum;
   sourceId: string;
-
-  chunkSize: number;
-  overlapRatio: number;
-  customSplitChar?: string;
   customPdfParse?: boolean;

+  trainingType: DatasetCollectionDataProcessModeEnum;
+
+  // Chunk settings
+  chunkSettingMode: ChunkSettingModeEnum;
+  chunkSplitMode: DataChunkSplitModeEnum;
+  chunkSize: number;
+  chunkSplitter?: string;
+  overlapRatio: number;
+
+  // Read params
   selector?: string;
   isQAImport?: boolean;
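For orientation, a request body for the updated preview endpoint might look like the following. The field names come from PostPreviewFilesChunksProps above, while the concrete enum string values and ids are assumptions, since the enum definitions themselves are not shown in this diff:

// Hypothetical preview request; the enum values ('fileLocal', 'chunk',
// 'auto', 'size') are assumed readings of the imported enums.
const previewBody = {
  datasetId: '000000000000000000000000', // hypothetical dataset id
  type: 'fileLocal',                     // DatasetSourceReadTypeEnum
  sourceId: 'uploaded-file-id',          // hypothetical file id
  customPdfParse: false,

  trainingType: 'chunk',                 // DatasetCollectionDataProcessModeEnum
  chunkSettingMode: 'auto',              // ChunkSettingModeEnum
  chunkSplitMode: 'size',                // DataChunkSplitModeEnum
  chunkSize: 512,
  overlapRatio: 0.2,

  selector: undefined,
  isQAImport: false
};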
@@ -32,55 +50,64 @@ export type PreviewChunksResponse = {
 async function handler(
   req: ApiRequestProps<PostPreviewFilesChunksProps>
 ): Promise<PreviewChunksResponse> {
-  const {
+  let {
     type,
     sourceId,
+    customPdfParse = false,
+
+    trainingType,
+    chunkSettingMode,
+    chunkSplitMode,
     chunkSize,
-    customSplitChar,
+    chunkSplitter,
+
     overlapRatio,
     selector,
     isQAImport,
     datasetId,
-    externalFileId,
-    customPdfParse = false
+    externalFileId
   } = req.body;

   if (!sourceId) {
     throw new Error('sourceId is empty');
   }
-  if (chunkSize > 30000) {
-    throw new Error('chunkSize is too large, should be less than 30000');
-  }

-  const { teamId, tmbId, apiServer, feishuServer, yuqueServer } = await (async () => {
-    if (type === DatasetSourceReadTypeEnum.fileLocal) {
-      const res = await authCollectionFile({
-        req,
-        authToken: true,
-        authApiKey: true,
-        fileId: sourceId,
-        per: OwnerPermissionVal
-      });
-      return {
-        teamId: res.teamId,
-        tmbId: res.tmbId
-      };
-    }
-    const { dataset, teamId, tmbId } = await authDataset({
-      req,
-      authApiKey: true,
-      authToken: true,
-      datasetId,
-      per: WritePermissionVal
-    });
-    return {
-      teamId,
-      tmbId,
-      apiServer: dataset.apiServer,
-      feishuServer: dataset.feishuServer,
-      yuqueServer: dataset.yuqueServer
-    };
-  })();
+  const fileAuthRes =
+    type === DatasetSourceReadTypeEnum.fileLocal
+      ? await authCollectionFile({
+          req,
+          authToken: true,
+          authApiKey: true,
+          fileId: sourceId,
+          per: OwnerPermissionVal
+        })
+      : undefined;
+
+  const { dataset, teamId, tmbId } = await authDataset({
+    req,
+    authApiKey: true,
+    authToken: true,
+    datasetId,
+    per: WritePermissionVal
+  });
+
+  if (fileAuthRes && (String(fileAuthRes.tmbId) !== String(tmbId) || !fileAuthRes.isRoot)) {
+    return Promise.reject(CommonErrEnum.unAuthFile);
+  }
+
+  chunkSize = computeChunkSize({
+    trainingType,
+    chunkSettingMode,
+    chunkSplitMode,
+    chunkSize,
+    llmModel: getLLMModel(dataset.agentModel)
+  });
+
+  chunkSplitter = computeChunkSplitter({
+    chunkSettingMode,
+    chunkSplitMode,
+    chunkSplitter
+  });

   const { rawText } = await readDatasetSourceRawText({
     teamId,
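computeChunkSize and computeChunkSplitter are imported from @fastgpt/global/core/dataset/training/utils, but their bodies are not part of this diff. A plausible sketch of the dispatch they perform, assuming an 'auto' setting mode that overrides user input and a clamp against the LLM's max chunk size; every name except the two helper names is an assumption:

// Sketch of the assumed dispatch; the real helpers may differ.
type AgentModelLike = { maxContext?: number };

function computeChunkSizeSketch(params: {
  chunkSettingMode: 'auto' | 'custom'; // assumed enum values
  chunkSize: number;
  llmModel?: AgentModelLike;
}): number {
  const autoDefault = 1000; // assumed auto-mode default
  const ceiling = params.llmModel?.maxContext ?? 8192; // assumed fallback
  // Auto mode ignores the user-supplied size; both paths respect the ceiling.
  const size = params.chunkSettingMode === 'auto' ? autoDefault : params.chunkSize;
  return Math.min(size, ceiling);
}

function computeChunkSplitterSketch(params: {
  chunkSettingMode: 'auto' | 'custom';
  chunkSplitter?: string;
}): string | undefined {
  // Auto mode relies on the built-in split rules, so no custom splitter.
  return params.chunkSettingMode === 'auto' ? undefined : params.chunkSplitter;
}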
@@ -89,18 +116,19 @@ async function handler(
     sourceId,
     selector,
     isQAImport,
-    apiServer,
-    feishuServer,
-    yuqueServer,
+    apiServer: dataset.apiServer,
+    feishuServer: dataset.feishuServer,
+    yuqueServer: dataset.yuqueServer,
     externalFileId,
     customPdfParse
   });

   return rawText2Chunks({
     rawText,
-    chunkLen: chunkSize,
+    chunkSize,
+    maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
     overlapRatio,
-    customReg: customSplitChar ? [customSplitChar] : [],
+    customReg: chunkSplitter ? [chunkSplitter] : [],
     isQAImport: isQAImport
   }).slice(0, 10);
 }
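This is where the commit message's "add max chunksize to split chunk function" lands: rawText2Chunks now receives both a target chunkSize and a hard maxSize ceiling, and the preview endpoint still returns only the first 10 chunks. A hedged stand-in, with the signature reconstructed from this call site alone:

// Reconstructed from the call above; the real rawText2Chunks lives in
// @fastgpt/service/core/dataset/read and may accept more options.
type Chunk = { q: string; a: string };

function rawText2ChunksSketch(params: {
  rawText: string;
  chunkSize: number; // target size for each chunk
  maxSize: number;   // hard ceiling from the LLM model config
  overlapRatio: number; // accepted but ignored in this naive stand-in
  customReg: string[];
  isQAImport?: boolean; // likewise ignored here
}): Chunk[] {
  // Naive stand-in: split on the custom separator if given, else by size.
  const pieces = params.customReg.length
    ? params.rawText.split(params.customReg[0])
    : params.rawText.match(new RegExp(`[\\s\\S]{1,${params.chunkSize}}`, 'g')) ?? [];
  return pieces.map((q) => ({ q: q.slice(0, params.maxSize), a: '' }));
}

// Mirrors the endpoint's preview behavior: only the first 10 chunks are kept.
const preview = rawText2ChunksSketch({
  rawText: 'long document text...',
  chunkSize: 512,
  maxSize: 8192, // assumed model ceiling
  overlapRatio: 0.2,
  customReg: []
}).slice(0, 10);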