perf: generate queue

This commit is contained in:
archer
2023-05-27 04:38:00 +08:00
parent f05b12975c
commit 741381ecb0
19 changed files with 288 additions and 265 deletions

View File

@@ -0,0 +1,37 @@
// Next.js API route support: https://nextjs.org/docs/api-routes/introduction
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@/service/response';
import { authUser } from '@/service/utils/auth';
import { connectToDatabase, TrainingData } from '@/service/mongo';
import { TrainingTypeEnum } from '@/constants/plugin';
/**
 * API route that reports how many TrainingData records are queued per training mode.
 *
 * The request is authenticated through `authUser` with `authRoot: true`.
 * On success the response data carries:
 *   - `qaListLen`: number of records whose `mode` equals `TrainingTypeEnum.qa`
 *   - `vectorListLen`: number of records whose `mode` equals `TrainingTypeEnum.index`
 * Any thrown error is logged and returned through `jsonRes` with code 500.
 */
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
  try {
    await authUser({ req, authRoot: true });
    await connectToDatabase();

    // Bucket every record by its `mode` field and count the records in each bucket.
    const buckets = await TrainingData.aggregate([
      { $group: { _id: '$mode', count: { $sum: 1 } } }
    ]);

    // A mode with no records produces no bucket, so the count falls back to 0.
    const qaBucket = buckets.find((bucket) => bucket._id === TrainingTypeEnum.qa);
    const indexBucket = buckets.find((bucket) => bucket._id === TrainingTypeEnum.index);
    const data = {
      qaListLen: qaBucket?.count || 0,
      vectorListLen: indexBucket?.count || 0
    };

    jsonRes(res, { data });
  } catch (error) {
    console.log(error);
    jsonRes(res, { code: 500, error });
  }
}

View File

@@ -3,19 +3,21 @@ import type { KbDataItemType } from '@/types/plugin';
import { jsonRes } from '@/service/response';
import { connectToDatabase, TrainingData } from '@/service/mongo';
import { authUser } from '@/service/utils/auth';
import { generateVector } from '@/service/events/generateVector';
import { PgClient } from '@/service/pg';
import { authKb } from '@/service/utils/auth';
import { withNextCors } from '@/service/utils/tools';
import { TrainingTypeEnum } from '@/constants/plugin';
import { startQueue } from '@/service/utils/tools';
interface Props {
export type Props = {
kbId: string;
data: { a: KbDataItemType['a']; q: KbDataItemType['q'] }[];
}
mode: `${TrainingTypeEnum}`;
prompt?: string;
};
export default withNextCors(async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
try {
const { kbId, data } = req.body as Props;
const { kbId, data, mode, prompt } = req.body as Props;
if (!kbId || !Array.isArray(data)) {
throw new Error('缺少参数');
@@ -29,7 +31,9 @@ export default withNextCors(async function handler(req: NextApiRequest, res: Nex
data: await pushDataToKb({
kbId,
data,
userId
userId,
mode,
prompt
})
});
} catch (err) {
@@ -40,36 +44,43 @@ export default withNextCors(async function handler(req: NextApiRequest, res: Nex
}
});
export async function pushDataToKb({ userId, kbId, data }: { userId: string } & Props) {
export async function pushDataToKb({
userId,
kbId,
data,
mode,
prompt
}: { userId: string } & Props) {
await authKb({
userId,
kbId
});
if (data.length === 0) {
return {
trainingId: ''
};
return {};
}
// 插入记录
const { _id } = await TrainingData.create({
userId,
kbId,
vectorList: data
});
await TrainingData.insertMany(
data.map((item) => ({
q: item.q,
a: item.a,
userId,
kbId,
mode,
prompt
}))
);
generateVector(_id);
startQueue();
return {
trainingId: _id
};
return {};
}
export const config = {
api: {
bodyParser: {
sizeLimit: '100mb'
sizeLimit: '20mb'
}
}
};

View File

@@ -33,7 +33,15 @@ export default withNextCors(async function handler(req: NextApiRequest, res: Nex
// 更新 pg 内容.仅修改a不需要更新向量。
await PgClient.update('modelData', {
where: [['id', dataId], 'AND', ['user_id', userId]],
values: [{ key: 'a', value: a }, ...(q ? [{ key: 'q', value: `${vector[0]}` }] : [])]
values: [
{ key: 'a', value: a },
...(q
? [
{ key: 'q', value: q },
{ key: 'vector', value: `[${vector[0]}]` }
]
: [])
]
});
jsonRes(res);

View File

@@ -1,69 +0,0 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@/service/response';
import { connectToDatabase, TrainingData } from '@/service/mongo';
import { authKb, authUser } from '@/service/utils/auth';
import { generateQA } from '@/service/events/generateQA';
import { TrainingTypeEnum } from '@/constants/plugin';
import { withNextCors } from '@/service/utils/tools';
import { pushDataToKb } from '../kb/pushData';
/* split text */
/**
 * API route that queues text chunks of a knowledge base for training.
 *
 * Request body: `kbId`, `chunks` (array of text segments), `prompt`, `mode`.
 *   - `TrainingTypeEnum.qa`: stores the chunks as the `qaList` of a single
 *     TrainingData record and triggers `generateQA` for that record.
 *   - `TrainingTypeEnum.subsection`: forwards each chunk as `{ q: chunk, a: '' }`
 *     to `pushDataToKb` and responds with its result.
 *
 * Invalid parameters (missing fields, non-array `chunks`, unsupported `mode`)
 * and any other failure are reported through `jsonRes` with code 500.
 */
export default withNextCors(async function handler(req: NextApiRequest, res: NextApiResponse) {
  try {
    const { chunks, kbId, prompt, mode } = req.body as {
      kbId: string;
      chunks: string[];
      prompt: string;
      mode: `${TrainingTypeEnum}`;
    };

    // The body is unchecked client input: require a real array so that a
    // string (or other truthy value) is never stored as `qaList`.
    if (!Array.isArray(chunks) || !kbId || !prompt) {
      throw new Error('参数错误');
    }
    // Reject unsupported modes up front instead of falling through and
    // answering with success while nothing has been queued.
    if (mode !== TrainingTypeEnum.qa && mode !== TrainingTypeEnum.subsection) {
      throw new Error('参数错误');
    }

    await connectToDatabase();

    const { userId } = await authUser({ req });

    // Verify that the knowledge base can be accessed by this user.
    await authKb({
      kbId,
      userId
    });

    if (mode === TrainingTypeEnum.subsection) {
      // Direct segmentation: push the chunks straight into the vector queue.
      const response = await pushDataToKb({
        kbId,
        data: chunks.map((item) => ({ q: item, a: '' })),
        userId
      });

      return jsonRes(res, {
        data: response
      });
    }

    // QA mode: store all chunks in one record and start QA generation for it.
    const { _id } = await TrainingData.create({
      userId,
      kbId,
      qaList: chunks,
      prompt
    });
    generateQA(_id);

    jsonRes(res);
  } catch (err) {
    jsonRes(res, {
      code: 500,
      error: err
    });
  }
});
// Route-level config: sets the body parser's size limit for this route to 100mb.
export const config = { api: { bodyParser: { sizeLimit: '100mb' } } };

View File

@@ -2,9 +2,10 @@ import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@/service/response';
import { connectToDatabase, TrainingData } from '@/service/mongo';
import { authUser } from '@/service/utils/auth';
import { Types } from 'mongoose';
import { generateQA } from '@/service/events/generateQA';
import { generateVector } from '@/service/events/generateVector';
import { TrainingTypeEnum } from '@/constants/plugin';
import { Types } from 'mongoose';
/* 拆分数据成QA */
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
@@ -19,26 +20,24 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
// split queue data
const result = await TrainingData.aggregate([
{ $match: { userId: new Types.ObjectId(userId), kbId: new Types.ObjectId(kbId) } },
{
$project: {
qaListLength: { $size: { $ifNull: ['$qaList', []] } },
vectorListLength: { $size: { $ifNull: ['$vectorList', []] } }
$match: {
userId: new Types.ObjectId(userId),
kbId: new Types.ObjectId(kbId)
}
},
{
$group: {
_id: null,
totalQaListLength: { $sum: '$qaListLength' },
totalVectorListLength: { $sum: '$vectorListLength' }
_id: '$mode',
count: { $sum: 1 }
}
}
]);
jsonRes(res, {
data: {
qaListLen: result[0]?.totalQaListLength || 0,
vectorListLen: result[0]?.totalVectorListLength || 0
qaListLen: result.find((item) => item._id === TrainingTypeEnum.qa)?.count || 0,
vectorListLen: result.find((item) => item._id === TrainingTypeEnum.index)?.count || 0
}
});
@@ -49,10 +48,10 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
kbId
},
'_id'
);
).limit(10);
list.forEach((item) => {
generateQA(item._id);
generateVector(item._id);
generateQA();
generateVector();
});
}
} catch (err) {

View File

@@ -13,6 +13,7 @@ import {
import { useForm } from 'react-hook-form';
import { postKbDataFromList, putKbDataById } from '@/api/plugins/kb';
import { useToast } from '@/hooks/useToast';
import { TrainingTypeEnum } from '@/constants/plugin';
export type FormData = { dataId?: string; a: string; q: string };
@@ -59,7 +60,8 @@ const InputDataModal = ({
a: e.a,
q: e.q
}
]
],
mode: TrainingTypeEnum.index
});
toast({

View File

@@ -19,6 +19,7 @@ import { postKbDataFromList } from '@/api/plugins/kb';
import Markdown from '@/components/Markdown';
import { useMarkdown } from '@/hooks/useMarkdown';
import { fileDownload } from '@/utils/file';
import { TrainingTypeEnum } from '@/constants/plugin';
const csvTemplate = `question,answer\n"什么是 laf","laf 是一个云函数开发平台……"\n"什么是 sealos","Sealos 是以 kubernetes 为内核的云操作系统发行版,可以……"`;
@@ -72,7 +73,8 @@ const SelectJsonModal = ({
const res = await postKbDataFromList({
kbId,
data: fileData
data: fileData,
mode: TrainingTypeEnum.index
});
toast({

View File

@@ -17,7 +17,7 @@ import { useSelectFile } from '@/hooks/useSelectFile';
import { useConfirm } from '@/hooks/useConfirm';
import { readTxtContent, readPdfContent, readDocContent } from '@/utils/file';
import { useMutation } from '@tanstack/react-query';
import { postSplitData } from '@/api/plugins/kb';
import { postKbDataFromList } from '@/api/plugins/kb';
import Radio from '@/components/Radio';
import { splitText_token } from '@/utils/file';
import { TrainingTypeEnum } from '@/constants/plugin';
@@ -32,7 +32,7 @@ const modeMap = {
price: 4,
isPrompt: true
},
subsection: {
index: {
maxLen: 800,
slideLen: 300,
price: 0.4,
@@ -53,7 +53,7 @@ const SelectFileModal = ({
const { toast } = useToast();
const [prompt, setPrompt] = useState('');
const { File, onOpen } = useSelectFile({ fileType: fileExtension, multiple: true });
const [mode, setMode] = useState<`${TrainingTypeEnum}`>(TrainingTypeEnum.subsection);
const [mode, setMode] = useState<`${TrainingTypeEnum}`>(TrainingTypeEnum.index);
const [fileTextArr, setFileTextArr] = useState<string[]>(['']);
const [splitRes, setSplitRes] = useState<{ tokens: number; chunks: string[] }>({
tokens: 0,
@@ -108,9 +108,9 @@ const SelectFileModal = ({
mutationFn: async () => {
if (splitRes.chunks.length === 0) return;
await postSplitData({
await postKbDataFromList({
kbId,
chunks: splitRes.chunks,
data: splitRes.chunks.map((text) => ({ q: text, a: '' })),
prompt: `下面是"${prompt || '一段长文本'}"`,
mode
});
@@ -195,11 +195,11 @@ const SelectFileModal = ({
<Radio
ml={3}
list={[
{ label: '直接分段', value: 'subsection' },
{ label: '直接分段', value: 'index' },
{ label: 'QA拆分', value: 'qa' }
]}
value={mode}
onChange={(e) => setMode(e as 'subsection' | 'qa')}
onChange={(e) => setMode(e as 'index' | 'qa')}
/>
</Flex>
{/* 内容介绍 */}