training queue

This commit is contained in:
archer
2023-05-26 23:08:25 +08:00
parent 69f32a0861
commit dc1c1d1355
32 changed files with 528 additions and 493 deletions

View File

@@ -1,75 +1,55 @@
import { SplitData } from '@/service/mongo';
import { TrainingData } from '@/service/mongo';
import { getApiKey } from '../utils/auth';
import { OpenAiChatEnum } from '@/constants/model';
import { pushSplitDataBill } from '@/service/events/pushBill';
import { generateVector } from './generateVector';
import { openaiError2 } from '../errorCode';
import { insertKbItem } from '@/service/pg';
import { SplitDataSchema } from '@/types/mongoSchema';
import { modelServiceToolMap } from '../utils/chat';
import { ChatRoleEnum } from '@/constants/chat';
import { getErrText } from '@/utils/tools';
import { BillTypeEnum } from '@/constants/user';
import { pushDataToKb } from '@/pages/api/openapi/kb/pushData';
import { ERROR_ENUM } from '../errorCode';
export async function generateQA(next = false): Promise<any> {
if (process.env.queueTask !== '1') {
try {
fetch(process.env.parentUrl || '');
} catch (error) {
console.log('parentUrl fetch error', error);
}
return;
}
if (global.generatingQA === true && !next) return;
global.generatingQA = true;
let dataId = null;
// 每次最多选 1 组
const listLen = 1;
export async function generateQA(trainingId: string): Promise<any> {
try {
// 找出一个需要生成的 dataItem
const data = await SplitData.aggregate([
{ $match: { textList: { $exists: true, $ne: [] } } },
{ $sample: { size: 1 } }
]);
// 找出一个需要生成的 dataItem (4分钟锁)
const data = await TrainingData.findOneAndUpdate(
{
_id: trainingId,
lockTime: { $lte: Date.now() - 4 * 60 * 1000 }
},
{
lockTime: new Date()
}
);
const dataItem: SplitDataSchema = data[0];
if (!dataItem) {
console.log('没有需要生成 QA 的数据');
global.generatingQA = false;
return;
}
dataId = dataItem._id;
// 获取 5 个源文本
const textList: string[] = dataItem.textList.slice(-5);
// 获取 openapi Key
let userOpenAiKey = '',
systemAuthKey = '';
try {
const key = await getApiKey({ model: OpenAiChatEnum.GPT35, userId: dataItem.userId });
userOpenAiKey = key.userOpenAiKey;
systemAuthKey = key.systemAuthKey;
} catch (err: any) {
// 余额不够了, 清空该记录
await SplitData.findByIdAndUpdate(dataItem._id, {
textList: [],
errorText: getErrText(err, '获取 OpenAi Key 失败')
if (!data || data.qaList.length === 0) {
await TrainingData.findOneAndDelete({
_id: trainingId,
qaList: [],
vectorList: []
});
generateQA(true);
return;
}
console.log(`正在生成一组QA, 包含 ${textList.length} 组文本。ID: ${dataItem._id}`);
const qaList: string[] = data.qaList.slice(-listLen);
// 余额校验并获取 openapi Key
const { userOpenAiKey, systemAuthKey } = await getApiKey({
model: OpenAiChatEnum.GPT35,
userId: data.userId,
type: 'training'
});
console.log(`正在生成一组QA, 包含 ${qaList.length} 组文本。ID: ${data._id}`);
const startTime = Date.now();
// 请求 chatgpt 获取回答
const response = await Promise.allSettled(
textList.map((text) =>
const response = await Promise.all(
qaList.map((text) =>
modelServiceToolMap[OpenAiChatEnum.GPT35]
.chatCompletion({
apiKey: userOpenAiKey || systemAuthKey,
@@ -78,7 +58,7 @@ export async function generateQA(next = false): Promise<any> {
{
obj: ChatRoleEnum.System,
value: `你是出题人
${dataItem.prompt || '下面是"一段长文本"'}
${data.prompt || '下面是"一段长文本"'}
从中选出5至20个题目和答案.答案详细.按格式返回: Q1:
A1:
Q2:
@@ -98,7 +78,7 @@ A2:
// 计费
pushSplitDataBill({
isPay: !userOpenAiKey && result.length > 0,
userId: dataItem.userId,
userId: data.userId,
type: BillTypeEnum.QA,
textLen: responseMessages.map((item) => item.value).join('').length,
totalTokens
@@ -116,57 +96,59 @@ A2:
)
);
// 获取成功的回答
const successResponse: {
rawContent: string;
result: {
q: string;
a: string;
}[];
}[] = response.filter((item) => item.status === 'fulfilled').map((item: any) => item.value);
const responseList = response.map((item) => item.result).flat();
const resultList = successResponse.map((item) => item.result).flat();
// 创建 向量生成 队列
pushDataToKb({
kbId: data.kbId,
data: responseList,
userId: data.userId
});
await Promise.allSettled([
// 删掉后5个数据
SplitData.findByIdAndUpdate(dataItem._id, {
textList: dataItem.textList.slice(0, -5)
}),
// 生成的内容插入 pg
insertKbItem({
userId: dataItem.userId,
kbId: dataItem.kbId,
data: resultList
})
]);
console.log('生成QA成功time:', `${(Date.now() - startTime) / 1000}s`);
generateQA(true);
generateVector();
} catch (error: any) {
// log
if (error?.response) {
console.log('openai error: 生成QA错误');
console.log(error.response?.status, error.response?.statusText, error.response?.data);
// 删除 QA 队列。如果小于 n 条,整个数据删掉。 如果大于 n 条,仅删数组后 n 个
if (data.vectorList.length <= listLen) {
await TrainingData.findByIdAndDelete(data._id);
} else {
console.log('生成QA错误:', error);
await TrainingData.findByIdAndUpdate(data._id, {
qaList: data.qaList.slice(0, -listLen),
lockTime: new Date('2000/1/1')
});
}
// 没有余额或者凭证错误时,拒绝任务
if (dataId && openaiError2[error?.response?.data?.error?.type]) {
console.log(openaiError2[error?.response?.data?.error?.type], '删除QA任务');
console.log('生成QA成功time:', `${(Date.now() - startTime) / 1000}s`);
await SplitData.findByIdAndUpdate(dataId, {
textList: [],
errorText: 'api 余额不足'
});
generateQA(trainingId);
} catch (err: any) {
// log
if (err?.response) {
console.log('openai error: 生成QA错误');
console.log(err.response?.status, err.response?.statusText, err.response?.data);
} else {
console.log('生成QA错误:', err);
}
generateQA(true);
// openai 账号异常或者账号余额不足,删除任务
if (openaiError2[err?.response?.data?.error?.type] || err === ERROR_ENUM.insufficientQuota) {
console.log('余额不足,删除向量生成任务');
await TrainingData.findByIdAndDelete(trainingId);
return;
}
// unlock
await TrainingData.findByIdAndUpdate(trainingId, {
lockTime: new Date('2000/1/1')
});
// 频率限制
if (err?.response?.statusText === 'Too Many Requests') {
console.log('生成向量次数限制30s后尝试');
return setTimeout(() => {
generateQA(trainingId);
}, 30000);
}
setTimeout(() => {
generateQA(true);
generateQA(trainingId);
}, 1000);
}
}