feat: chunk index independent config (#4271)

* sync collection

* remove lock

* feat: chunk index independent config

* feat: add max chunksize to split chunk function

* remove log

* update doc

* remove

* remove log
This commit is contained in:
Archer
2025-03-21 16:44:25 +08:00
committed by archer
parent 222ff0d49a
commit e812ad6e84
47 changed files with 784 additions and 443 deletions

View File

@@ -5,25 +5,63 @@ import {
UpdateDatasetDataProps
} from '@fastgpt/global/core/dataset/controller';
import { insertDatasetDataVector } from '@fastgpt/service/common/vectorStore/controller';
import { getDefaultIndex } from '@fastgpt/global/core/dataset/utils';
import { jiebaSplit } from '@fastgpt/service/common/string/jieba/index';
import { deleteDatasetDataVector } from '@fastgpt/service/common/vectorStore/controller';
import { DatasetDataIndexItemType, DatasetDataItemType } from '@fastgpt/global/core/dataset/type';
import { getEmbeddingModel } from '@fastgpt/service/core/ai/model';
import { getEmbeddingModel, getLLMModel } from '@fastgpt/service/core/ai/model';
import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
import { ClientSession } from '@fastgpt/service/common/mongo';
import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken';
import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';
const formatIndexes = ({
const formatIndexes = async ({
indexes,
q,
a = ''
a = '',
indexSize
}: {
indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & { dataId?: string })[];
q: string;
a?: string;
}) => {
indexSize: number;
}): Promise<
{
type: `${DatasetDataIndexTypeEnum}`;
text: string;
dataId?: string;
}[]
> => {
/* get dataset data default index */
const getDefaultIndex = ({
  q = '',
  a,
  indexSize
}: {
  q?: string;
  a?: string;
  indexSize: number;
}) => {
  // Split one text into chunks no larger than indexSize and tag every chunk
  // as a default-type index entry.
  const toDefaultIndexes = (text: string) =>
    splitText2Chunks({ text, chunkSize: indexSize }).chunks.map((chunk) => ({
      text: chunk,
      type: DatasetDataIndexTypeEnum.default
    }));

  // q is always indexed; a contributes indexes only when it is non-empty.
  const sources = a ? [q, a] : [q];
  return sources.flatMap(toDefaultIndexes);
};
indexes = indexes || [];
// If index not type, set it to custom
indexes = indexes
@@ -35,7 +73,7 @@ const formatIndexes = ({
.filter((item) => !!item.text.trim());
// Recompute default indexes, Merge ids of the same index, reduce the number of rebuilds
const defaultIndexes = getDefaultIndex({ q, a });
const defaultIndexes = getDefaultIndex({ q, a, indexSize });
const concatDefaultIndexes = defaultIndexes.map((item) => {
const oldIndex = indexes!.find((index) => index.text === item.text);
if (oldIndex) {
@@ -56,11 +94,24 @@ const formatIndexes = ({
(item, index, self) => index === self.findIndex((t) => t.text === item.text)
);
return indexes.map((index) => ({
type: index.type,
text: index.text,
dataId: index.dataId
}));
const chekcIndexes = (
await Promise.all(
indexes.map(async (item) => {
// If oversize tokens, split it
const tokens = await countPromptTokens(item.text);
if (tokens > indexSize) {
const splitText = splitText2Chunks({ text: item.text, chunkSize: 512 }).chunks;
return splitText.map((text) => ({
text,
type: item.type
}));
}
return item;
})
)
).flat();
return chekcIndexes;
};
/* insert data.
* 1. create data id
@@ -75,30 +126,40 @@ export async function insertData2Dataset({
q,
a = '',
chunkIndex = 0,
indexSize = 512,
indexes,
model,
embeddingModel,
session
}: CreateDatasetDataProps & {
model: string;
embeddingModel: string;
indexSize?: number;
session?: ClientSession;
}) {
if (!q || !datasetId || !collectionId || !model) {
return Promise.reject('q, datasetId, collectionId, model is required');
if (!q || !datasetId || !collectionId || !embeddingModel) {
return Promise.reject('q, datasetId, collectionId, embeddingModel is required');
}
if (String(teamId) === String(tmbId)) {
return Promise.reject("teamId and tmbId can't be the same");
}
const embModel = getEmbeddingModel(embeddingModel);
indexSize = Math.min(embModel.maxToken, indexSize);
// 1. Get vector indexes and insert
// Empty indexes check, if empty, create default index
const newIndexes = formatIndexes({ indexes, q, a });
const newIndexes = await formatIndexes({
indexes,
q,
a,
indexSize
});
// insert to vector store
const result = await Promise.all(
newIndexes.map(async (item) => {
const result = await insertDatasetDataVector({
query: item.text,
model: getEmbeddingModel(model),
model: embModel,
teamId,
datasetId,
collectionId
@@ -163,8 +224,9 @@ export async function updateData2Dataset({
q = '',
a,
indexes,
model
}: UpdateDatasetDataProps & { model: string }) {
model,
indexSize = 512
}: UpdateDatasetDataProps & { model: string; indexSize?: number }) {
if (!Array.isArray(indexes)) {
return Promise.reject('indexes is required');
}
@@ -174,7 +236,7 @@ export async function updateData2Dataset({
if (!mongoData) return Promise.reject('core.dataset.error.Data not found');
// 2. Compute indexes
const formatIndexesResult = formatIndexes({ indexes, q, a });
const formatIndexesResult = await formatIndexes({ indexes, q, a, indexSize });
// 3. Patch indexes, create, update, delete
const patchResult: PatchIndexesProps[] = [];

View File

@@ -21,6 +21,11 @@ import {
llmCompletionsBodyFormat,
llmStreamResponseToAnswerText
} from '@fastgpt/service/core/ai/utils';
import { LLMModelItemType } from '@fastgpt/global/core/ai/model.d';
import {
chunkAutoChunkSize,
getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
const reduceQueue = () => {
global.qaQueueLen = global.qaQueueLen > 0 ? global.qaQueueLen - 1 : 0;
@@ -129,7 +134,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
});
const answer = await llmStreamResponseToAnswerText(chatResponse);
const qaArr = formatSplitText(answer, text); // 格式化后的QA对
const qaArr = formatSplitText({ answer, rawText: text, llmModel: modelData }); // 格式化后的QA对
addLog.info(`[QA Queue] Finish`, {
time: Date.now() - startTime,
@@ -180,10 +185,18 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
}
// Format qa answer
function formatSplitText(text: string, rawText: string) {
text = text.replace(/\\n/g, '\n'); // 将换行符替换为空格
function formatSplitText({
answer,
rawText,
llmModel
}: {
answer: string;
rawText: string;
llmModel: LLMModelItemType;
}) {
answer = answer.replace(/\\n/g, '\n'); // 将换行符替换为空格
const regex = /Q\d+:(\s*)(.*)(\s*)A\d+:(\s*)([\s\S]*?)(?=Q\d|$)/g; // 匹配Q和A的正则表达式
const matches = text.matchAll(regex); // 获取所有匹配到的结果
const matches = answer.matchAll(regex); // 获取所有匹配到的结果
const result: PushDatasetDataChunkProps[] = []; // 存储最终的结果
for (const match of matches) {
@@ -199,7 +212,11 @@ function formatSplitText(text: string, rawText: string) {
// empty result. direct split chunk
if (result.length === 0) {
const { chunks } = splitText2Chunks({ text: rawText, chunkLen: 512 });
const { chunks } = splitText2Chunks({
text: rawText,
chunkSize: chunkAutoChunkSize,
maxSize: getLLMMaxChunkSize(llmModel)
});
chunks.forEach((chunk) => {
result.push({
q: chunk,

View File

@@ -245,7 +245,7 @@ const insertData = async ({
a: trainingData.a,
chunkIndex: trainingData.chunkIndex,
indexes: trainingData.indexes,
model: trainingData.model,
embeddingModel: trainingData.model,
session
});
// delete data from training