feat: chunk index independent config (#4271)
* sync collection
* remove lock
* feat: chunk index independent config
* feat: add max chunksize to split chunk function
* remove log
* update doc
* remove
* remove log
This commit is contained in:
@@ -5,25 +5,63 @@ import {
|
||||
UpdateDatasetDataProps
|
||||
} from '@fastgpt/global/core/dataset/controller';
|
||||
import { insertDatasetDataVector } from '@fastgpt/service/common/vectorStore/controller';
|
||||
import { getDefaultIndex } from '@fastgpt/global/core/dataset/utils';
|
||||
import { jiebaSplit } from '@fastgpt/service/common/string/jieba/index';
|
||||
import { deleteDatasetDataVector } from '@fastgpt/service/common/vectorStore/controller';
|
||||
import { DatasetDataIndexItemType, DatasetDataItemType } from '@fastgpt/global/core/dataset/type';
|
||||
import { getEmbeddingModel } from '@fastgpt/service/core/ai/model';
|
||||
import { getEmbeddingModel, getLLMModel } from '@fastgpt/service/core/ai/model';
|
||||
import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
|
||||
import { ClientSession } from '@fastgpt/service/common/mongo';
|
||||
import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema';
|
||||
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
|
||||
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
|
||||
import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken';
|
||||
import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';
|
||||
|
||||
const formatIndexes = ({
|
||||
const formatIndexes = async ({
|
||||
indexes,
|
||||
q,
|
||||
a = ''
|
||||
a = '',
|
||||
indexSize
|
||||
}: {
|
||||
indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & { dataId?: string })[];
|
||||
q: string;
|
||||
a?: string;
|
||||
}) => {
|
||||
indexSize: number;
|
||||
}): Promise<
|
||||
{
|
||||
type: `${DatasetDataIndexTypeEnum}`;
|
||||
text: string;
|
||||
dataId?: string;
|
||||
}[]
|
||||
> => {
|
||||
/**
 * Build the default indexes for one dataset data item.
 *
 * The question text is always split with `splitText2Chunks`, using `indexSize`
 * as the chunk size; the answer text is split the same way only when it is
 * non-empty. Each resulting chunk becomes one index of type
 * `DatasetDataIndexTypeEnum.default`, question chunks first, then answer chunks.
 */
const getDefaultIndex = ({
  q = '',
  a,
  indexSize
}: {
  q?: string;
  a?: string;
  indexSize: number;
}) => {
  // Shared splitter so question and answer are chunked with identical settings.
  const toChunks = (text: string) => splitText2Chunks({ text, chunkSize: indexSize }).chunks;

  const questionChunks = toChunks(q);
  // An empty or missing answer contributes no chunks at all.
  const answerChunks = a ? toChunks(a) : [];

  return [...questionChunks, ...answerChunks].map((text) => ({
    text,
    type: DatasetDataIndexTypeEnum.default
  }));
};
|
||||
|
||||
indexes = indexes || [];
|
||||
// If an index has no type, set it to custom
|
||||
indexes = indexes
|
||||
@@ -35,7 +73,7 @@ const formatIndexes = ({
|
||||
.filter((item) => !!item.text.trim());
|
||||
|
||||
// Recompute the default indexes and merge the ids of identical indexes to reduce the number of rebuilds
|
||||
const defaultIndexes = getDefaultIndex({ q, a });
|
||||
const defaultIndexes = getDefaultIndex({ q, a, indexSize });
|
||||
const concatDefaultIndexes = defaultIndexes.map((item) => {
|
||||
const oldIndex = indexes!.find((index) => index.text === item.text);
|
||||
if (oldIndex) {
|
||||
@@ -56,11 +94,24 @@ const formatIndexes = ({
|
||||
(item, index, self) => index === self.findIndex((t) => t.text === item.text)
|
||||
);
|
||||
|
||||
return indexes.map((index) => ({
|
||||
type: index.type,
|
||||
text: index.text,
|
||||
dataId: index.dataId
|
||||
}));
|
||||
const chekcIndexes = (
|
||||
await Promise.all(
|
||||
indexes.map(async (item) => {
|
||||
// If the index text has more tokens than indexSize, split it
|
||||
const tokens = await countPromptTokens(item.text);
|
||||
if (tokens > indexSize) {
|
||||
const splitText = splitText2Chunks({ text: item.text, chunkSize: 512 }).chunks;
|
||||
return splitText.map((text) => ({
|
||||
text,
|
||||
type: item.type
|
||||
}));
|
||||
}
|
||||
return item;
|
||||
})
|
||||
)
|
||||
).flat();
|
||||
|
||||
return chekcIndexes;
|
||||
};
|
||||
/* insert data.
|
||||
* 1. create data id
|
||||
@@ -75,30 +126,40 @@ export async function insertData2Dataset({
|
||||
q,
|
||||
a = '',
|
||||
chunkIndex = 0,
|
||||
indexSize = 512,
|
||||
indexes,
|
||||
model,
|
||||
embeddingModel,
|
||||
session
|
||||
}: CreateDatasetDataProps & {
|
||||
model: string;
|
||||
embeddingModel: string;
|
||||
indexSize?: number;
|
||||
session?: ClientSession;
|
||||
}) {
|
||||
if (!q || !datasetId || !collectionId || !model) {
|
||||
return Promise.reject('q, datasetId, collectionId, model is required');
|
||||
if (!q || !datasetId || !collectionId || !embeddingModel) {
|
||||
return Promise.reject('q, datasetId, collectionId, embeddingModel is required');
|
||||
}
|
||||
if (String(teamId) === String(tmbId)) {
|
||||
return Promise.reject("teamId and tmbId can't be the same");
|
||||
}
|
||||
|
||||
const embModel = getEmbeddingModel(embeddingModel);
|
||||
indexSize = Math.min(embModel.maxToken, indexSize);
|
||||
|
||||
// 1. Get vector indexes and insert
|
||||
// Empty indexes check, if empty, create default index
|
||||
const newIndexes = formatIndexes({ indexes, q, a });
|
||||
const newIndexes = await formatIndexes({
|
||||
indexes,
|
||||
q,
|
||||
a,
|
||||
indexSize
|
||||
});
|
||||
|
||||
// insert to vector store
|
||||
const result = await Promise.all(
|
||||
newIndexes.map(async (item) => {
|
||||
const result = await insertDatasetDataVector({
|
||||
query: item.text,
|
||||
model: getEmbeddingModel(model),
|
||||
model: embModel,
|
||||
teamId,
|
||||
datasetId,
|
||||
collectionId
|
||||
@@ -163,8 +224,9 @@ export async function updateData2Dataset({
|
||||
q = '',
|
||||
a,
|
||||
indexes,
|
||||
model
|
||||
}: UpdateDatasetDataProps & { model: string }) {
|
||||
model,
|
||||
indexSize = 512
|
||||
}: UpdateDatasetDataProps & { model: string; indexSize?: number }) {
|
||||
if (!Array.isArray(indexes)) {
|
||||
return Promise.reject('indexes is required');
|
||||
}
|
||||
@@ -174,7 +236,7 @@ export async function updateData2Dataset({
|
||||
if (!mongoData) return Promise.reject('core.dataset.error.Data not found');
|
||||
|
||||
// 2. Compute indexes
|
||||
const formatIndexesResult = formatIndexes({ indexes, q, a });
|
||||
const formatIndexesResult = await formatIndexes({ indexes, q, a, indexSize });
|
||||
|
||||
// 3. Patch indexes, create, update, delete
|
||||
const patchResult: PatchIndexesProps[] = [];
|
||||
|
||||
@@ -21,6 +21,11 @@ import {
|
||||
llmCompletionsBodyFormat,
|
||||
llmStreamResponseToAnswerText
|
||||
} from '@fastgpt/service/core/ai/utils';
|
||||
import { LLMModelItemType } from '@fastgpt/global/core/ai/model.d';
|
||||
import {
|
||||
chunkAutoChunkSize,
|
||||
getLLMMaxChunkSize
|
||||
} from '@fastgpt/global/core/dataset/training/utils';
|
||||
|
||||
const reduceQueue = () => {
|
||||
global.qaQueueLen = global.qaQueueLen > 0 ? global.qaQueueLen - 1 : 0;
|
||||
@@ -129,7 +134,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
|
||||
});
|
||||
const answer = await llmStreamResponseToAnswerText(chatResponse);
|
||||
|
||||
const qaArr = formatSplitText(answer, text); // 格式化后的QA对
|
||||
const qaArr = formatSplitText({ answer, rawText: text, llmModel: modelData }); // 格式化后的QA对
|
||||
|
||||
addLog.info(`[QA Queue] Finish`, {
|
||||
time: Date.now() - startTime,
|
||||
@@ -180,10 +185,18 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
|
||||
}
|
||||
|
||||
// Format qa answer
|
||||
function formatSplitText(text: string, rawText: string) {
|
||||
text = text.replace(/\\n/g, '\n'); // 将换行符替换为空格
|
||||
function formatSplitText({
|
||||
answer,
|
||||
rawText,
|
||||
llmModel
|
||||
}: {
|
||||
answer: string;
|
||||
rawText: string;
|
||||
llmModel: LLMModelItemType;
|
||||
}) {
|
||||
answer = answer.replace(/\\n/g, '\n'); // Replace literal "\n" escape sequences with real newline characters
|
||||
const regex = /Q\d+:(\s*)(.*)(\s*)A\d+:(\s*)([\s\S]*?)(?=Q\d|$)/g; // 匹配Q和A的正则表达式
|
||||
const matches = text.matchAll(regex); // 获取所有匹配到的结果
|
||||
const matches = answer.matchAll(regex); // 获取所有匹配到的结果
|
||||
|
||||
const result: PushDatasetDataChunkProps[] = []; // 存储最终的结果
|
||||
for (const match of matches) {
|
||||
@@ -199,7 +212,11 @@ function formatSplitText(text: string, rawText: string) {
|
||||
|
||||
// empty result. direct split chunk
|
||||
if (result.length === 0) {
|
||||
const { chunks } = splitText2Chunks({ text: rawText, chunkLen: 512 });
|
||||
const { chunks } = splitText2Chunks({
|
||||
text: rawText,
|
||||
chunkSize: chunkAutoChunkSize,
|
||||
maxSize: getLLMMaxChunkSize(llmModel)
|
||||
});
|
||||
chunks.forEach((chunk) => {
|
||||
result.push({
|
||||
q: chunk,
|
||||
|
||||
@@ -245,7 +245,7 @@ const insertData = async ({
|
||||
a: trainingData.a,
|
||||
chunkIndex: trainingData.chunkIndex,
|
||||
indexes: trainingData.indexes,
|
||||
model: trainingData.model,
|
||||
embeddingModel: trainingData.model,
|
||||
session
|
||||
});
|
||||
// delete data from training
|
||||
|
||||
Reference in New Issue
Block a user