feat: 摘要拆分

This commit is contained in:
archer
2023-03-26 22:09:59 +08:00
parent 888642f154
commit 3e4487ad9a
20 changed files with 397 additions and 83 deletions

View File

@@ -0,0 +1,177 @@
import { DataItem } from '@/service/mongo';
import { getOpenAIApi } from '@/service/utils/chat';
import { httpsAgent, getOpenApiKey } from '@/service/utils/tools';
import type { ChatCompletionRequestMessage } from 'openai';
import { DataItemSchema } from '@/types/mongoSchema';
import { ChatModelNameEnum } from '@/constants/model';
import { pushSplitDataBill } from '@/service/events/pushBill';
/**
 * Background worker that turns one pending "abstract" data item into a list
 * of summaries plus their embedding vectors, then schedules itself again.
 *
 * Flow for a single item:
 *  1. Pick a DataItem with `type: 'abstract'`, `status != 0` and `times > 0`.
 *  2. Mark it as in progress (`status: 2`).
 *  3. Resolve the OpenAI key for the owning user.
 *  4. Ask the chat model for summaries at two temperatures.
 *  5. Split the raw answers into individual abstracts and embed each one.
 *  6. Store raw answers and results, set `status: 0`, and bill the user when
 *     the system key was used.
 *
 * On any error the item's `times` counter is decremented and its status is
 * set to 1 while `times` was still positive (0 otherwise).
 *
 * @param next - `true` when invoked by the worker itself to continue the loop;
 *   external callers omit it, so a second concurrent loop is never started
 *   while `global.generatingAbstract` is set.
 */
export async function generateAbstract(next = false): Promise<any> {
  // Only one loop may run at a time; recursive continuation passes next = true.
  if (global.generatingAbstract && !next) return;
  global.generatingAbstract = true;

  const systemPrompt: ChatCompletionRequestMessage = {
    role: 'system',
    content: `我会向你发送一段长文本请从中总结出3~10个摘要尽量详细请按以下格式返回: "(1):"\n"(2):"\n"(3):"\n`
  };

  let dataItem: DataItemSchema | null = null;

  try {
    // Find one item that still needs an abstract.
    dataItem = await DataItem.findOne({
      status: { $ne: 0 },
      times: { $gt: 0 },
      type: 'abstract'
    });

    if (!dataItem) {
      console.log('没有需要生成 【摘要】 的数据');
      // Nothing left to do: release the flag and stop the loop.
      global.generatingAbstract = false;
      return;
    }

    // Mark the item as "generating".
    await DataItem.findByIdAndUpdate(dataItem._id, {
      status: 2
    });

    // Resolve the OpenAI key (user-provided or system).
    let userApiKey, systemKey;
    try {
      const key = await getOpenApiKey(dataItem.userId);
      userApiKey = key.userApiKey;
      systemKey = key.systemKey;
    } catch (error) {
      // No usable key (e.g. insufficient balance): set every record of this
      // user to idle. The filter and the update must be separate arguments;
      // the previous single-object call did not express "where userId = X,
      // set status = 0".
      await DataItem.updateMany({ userId: dataItem.userId }, { status: 0 });
      throw new Error('获取 openai key 失败');
    }

    console.log('正在生成一组摘要, ID:', dataItem._id);

    const startTime = Date.now();

    // OpenAI client bound to whichever key is available.
    const chatAPI = getOpenAIApi(userApiKey || systemKey);

    // Request summaries at two temperatures; individual failures are tolerated.
    const abstractResponse = await Promise.allSettled(
      [0.5, 1].map((temperature) =>
        chatAPI.createChatCompletion(
          {
            model: ChatModelNameEnum.GPT35,
            temperature: temperature,
            n: 1,
            messages: [
              systemPrompt,
              {
                role: 'user',
                content: dataItem?.text || ''
              }
            ]
          },
          {
            timeout: 120000,
            httpsAgent
          }
        )
      )
    );

    // Keep only the requests that succeeded.
    const successAbstracts = abstractResponse.filter((item) => item.status === 'fulfilled');

    // Raw text of each successful completion.
    const rawContents: string[] = successAbstracts.map(
      (item: any) => item?.value?.data.choices[0].message?.content || ''
    );

    // Break every raw answer into its individual "(n): ..." abstracts.
    const splitContents = rawContents.map((content) => splitText(content)).flat();

    // Create an embedding for every abstract.
    const vectorResponse = await Promise.allSettled(
      splitContents.map((item) =>
        chatAPI.createEmbedding({
          model: 'text-embedding-ada-002',
          input: item.abstract
        })
      )
    );

    // Pair each abstract with its vector, dropping failed embedding requests.
    const vectorSuccessResponse = vectorResponse
      .map((item: any, i) => {
        if (item.status !== 'fulfilled') return '';
        return {
          abstract: splitContents[i].abstract,
          abstractVector: item?.value?.data?.data?.[0]?.embedding
        };
      })
      .filter((item) => item);

    // Persist the results and mark the item as finished.
    await DataItem.findByIdAndUpdate(dataItem._id, {
      status: 0,
      $push: {
        rawResponse: {
          $each: rawContents
        },
        result: {
          $each: vectorSuccessResponse
        }
      }
    });

    // Bill only when the system key was used and something was produced.
    // The call is intentionally not awaited, as in the original.
    if (!userApiKey && splitContents.length > 0) {
      pushSplitDataBill({
        userId: dataItem.userId,
        type: 'abstract',
        text:
          systemPrompt.content +
          dataItem.text +
          rawContents.join('') +
          // Embedding price is 1/10 of gpt-3.5, so add a tenth-length slice.
          rawContents.join('').substring(0, Math.floor(dataItem.text.length / 10))
      });
    }

    console.log(
      '生成摘要成功time:',
      `${(Date.now() - startTime) / 1000}s`,
      '摘要数量:',
      splitContents.length
    );
  } catch (error: any) {
    console.log('error: 生成摘要错误', dataItem?._id);
    console.log('response:', error);

    if (dataItem?._id) {
      await DataItem.findByIdAndUpdate(dataItem._id, {
        // Status 1 while retries were still available, otherwise 0.
        status: dataItem.times > 0 ? 1 : 0,
        $inc: {
          // One retry consumed.
          times: -1
        }
      });
    }
  }

  // Continue with the next pending item.
  generateAbstract(true);
}
/**
 * Extract the individual abstracts from a model answer.
 *
 * Every occurrence of a "(<digits>):" marker starts an entry; the entry body
 * is the text captured after the marker (and any whitespace following it) up
 * to the end of that line. Entries with an empty body are dropped.
 *
 * @param text - raw completion text returned by the chat model
 * @returns one `{ abstract }` object per non-empty entry, in order of appearance
 */
function splitText(text: string) {
  const pattern = /\(\d+\):(\s*)(.*)(\s*)/g;

  return Array.from(text.matchAll(pattern))
    .map((found) => found[2])
    .filter((body) => Boolean(body))
    .map((body) => ({ abstract: body as string }));
}

View File

@@ -20,7 +20,8 @@ export async function generateQA(next = false): Promise<any> {
// 找出一个需要生成的 dataItem
dataItem = await DataItem.findOne({
status: { $ne: 0 },
times: { $gt: 0 }
times: { $gt: 0 },
type: 'QA'
});
if (!dataItem) {
@@ -49,62 +50,72 @@ export async function generateQA(next = false): Promise<any> {
throw new Error('获取 openai key 失败');
}
console.log('正在生成一QA, ID:', dataItem._id, 'temperature: ', dataItem.temperature / 100);
console.log('正在生成一QA, ID:', dataItem._id);
const startTime = Date.now();
// 获取 openai 请求实例
const chatAPI = getOpenAIApi(userApiKey || systemKey);
// 请求 chatgpt 获取回答
const response = await chatAPI.createChatCompletion(
{
model: ChatModelNameEnum.GPT35,
temperature: dataItem.temperature / 100,
n: 1,
messages: [
systemPrompt,
const response = await Promise.allSettled(
[0, 0.5, 0.8].map((temperature) =>
chatAPI.createChatCompletion(
{
role: 'user',
content: dataItem.text
model: ChatModelNameEnum.GPT35,
temperature: temperature,
n: 1,
messages: [
systemPrompt,
{
role: 'user',
content: dataItem?.text || ''
}
]
},
{
timeout: 120000,
httpsAgent
}
]
},
{
timeout: 120000,
httpsAgent
}
)
)
);
// 过滤出成功的响应
const successResponse = response.filter((item) => item.status === 'fulfilled');
// 提取响应内容
const rawContents: string[] = successResponse.map(
(item: any) => item?.value?.data.choices[0].message?.content || ''
);
const content = response.data.choices[0].message?.content;
// 从 content 中提取 QA
const splitResponse = splitText(content || '');
const splitResponses = rawContents.map((content) => splitText(content)).flat();
// 插入数据库,并修改状态
await DataItem.findByIdAndUpdate(dataItem._id, {
status: dataItem.temperature >= 90 ? 0 : 1, // 需要生成 4 组内容。0,0.3,0.6,0.9
temperature: dataItem.temperature >= 90 ? dataItem.temperature : dataItem.temperature + 30,
status: 0,
$push: {
rawResponse: content,
rawResponse: {
$each: rawContents
},
result: {
$each: splitResponse
$each: splitResponses
}
}
});
// 计费
!userApiKey &&
splitResponse.length > 0 &&
splitResponses.length > 0 &&
pushSplitDataBill({
userId: dataItem.userId,
text: systemPrompt.content + dataItem.text + content
type: 'QA',
text: systemPrompt.content + dataItem.text + rawContents.join('')
});
console.log(
'生成QA成功time:',
`${(Date.now() - startTime) / 1000}s`,
'QA数量',
splitResponse.length
splitResponses.length
);
} catch (error: any) {
console.log('error: 生成QA错误', dataItem?._id);
console.log('response:', error?.response);
// 重置状态
if (dataItem?._id) {
await DataItem.findByIdAndUpdate(dataItem._id, {
status: dataItem.times > 0 ? 1 : 0, // 还有重试次数则可以继续进行

View File

@@ -2,6 +2,7 @@ import { connectToDatabase, Bill, User } from '../mongo';
import { modelList, ChatModelNameEnum } from '@/constants/model';
import { encode } from 'gpt-token-utils';
import { formatPrice } from '@/utils/user';
import type { DataType } from '@/types/data';
export const pushChatBill = async ({
modelName,
@@ -59,7 +60,15 @@ export const pushChatBill = async ({
}
};
export const pushSplitDataBill = async ({ userId, text }: { userId: string; text: string }) => {
export const pushSplitDataBill = async ({
userId,
text,
type
}: {
userId: string;
text: string;
type: DataType;
}) => {
await connectToDatabase();
let billId;
@@ -83,7 +92,7 @@ export const pushSplitDataBill = async ({ userId, text }: { userId: string; text
// 插入 Bill 记录
const res = await Bill.create({
userId,
type: 'splitData',
type,
modelName: ChatModelNameEnum.GPT35,
textLen: text.length,
tokenLen: tokens.length,

View File

@@ -1,5 +1,6 @@
import { Schema, model, models, Model } from 'mongoose';
import { DataItemSchema as Datatype } from '@/types/mongoSchema';
import { DataSchema as Datatype } from '@/types/mongoSchema';
import { DataTypeTextMap } from '@/constants/data';
const DataSchema = new Schema({
userId: {
@@ -15,6 +16,11 @@ const DataSchema = new Schema({
type: Date,
default: () => new Date()
},
type: {
type: String,
required: true,
enum: Object.keys(DataTypeTextMap)
},
isDeleted: {
type: Boolean,
default: false

View File

@@ -1,5 +1,6 @@
import type { DataItemSchema as DataItemType } from '@/types/mongoSchema';
import { Schema, model, models, Model } from 'mongoose';
import { DataTypeTextMap } from '@/constants/data';
const DataItemSchema = new Schema({
userId: {
@@ -12,19 +13,23 @@ const DataItemSchema = new Schema({
ref: 'data',
required: true
},
type: {
type: String,
required: true,
enum: Object.keys(DataTypeTextMap)
},
times: {
// 剩余重试次数
type: Number,
default: 3
},
text: {
// 文本内容
type: String,
required: true
},
temperature: {
type: Number,
required: true
},
rawResponse: {
// 原始拆分结果
type: [String],
default: []
},
@@ -33,11 +38,21 @@ const DataItemSchema = new Schema({
{
q: {
type: String,
required: true
default: ''
},
a: {
type: String,
required: true
default: ''
},
abstract: {
// 摘要
type: String,
default: ''
},
abstractVector: {
// 摘要对应的向量
type: [Number],
default: []
}
}
],

View File

@@ -1,5 +1,7 @@
import mongoose from 'mongoose';
import { generateQA } from './events/generateQA';
import { generateAbstract } from './events/generateAbstract';
/**
* 连接 MongoDB 数据库
*/
@@ -24,8 +26,8 @@ export async function connectToDatabase(): Promise<void> {
global.mongodb = null;
}
// 递归 QA 生成
generateQA();
generateAbstract();
}
export * from './models/authCode';