feat: 摘要拆分
This commit is contained in:
177
src/service/events/generateAbstract.ts
Normal file
177
src/service/events/generateAbstract.ts
Normal file
@@ -0,0 +1,177 @@
|
||||
import { DataItem } from '@/service/mongo';
|
||||
import { getOpenAIApi } from '@/service/utils/chat';
|
||||
import { httpsAgent, getOpenApiKey } from '@/service/utils/tools';
|
||||
import type { ChatCompletionRequestMessage } from 'openai';
|
||||
import { DataItemSchema } from '@/types/mongoSchema';
|
||||
import { ChatModelNameEnum } from '@/constants/model';
|
||||
import { pushSplitDataBill } from '@/service/events/pushBill';
|
||||
|
||||
export async function generateAbstract(next = false): Promise<any> {
|
||||
if (global.generatingAbstract && !next) return;
|
||||
global.generatingAbstract = true;
|
||||
|
||||
const systemPrompt: ChatCompletionRequestMessage = {
|
||||
role: 'system',
|
||||
content: `我会向你发送一段长文本,请从中总结出3~10个摘要,尽量详细,请按以下格式返回: "(1):"\n"(2):"\n"(3):"\n`
|
||||
};
|
||||
let dataItem: DataItemSchema | null = null;
|
||||
|
||||
try {
|
||||
// 找出一个需要生成的 dataItem
|
||||
dataItem = await DataItem.findOne({
|
||||
status: { $ne: 0 },
|
||||
times: { $gt: 0 },
|
||||
type: 'abstract'
|
||||
});
|
||||
|
||||
if (!dataItem) {
|
||||
console.log('没有需要生成 【摘要】 的数据');
|
||||
global.generatingAbstract = false;
|
||||
return;
|
||||
}
|
||||
|
||||
// 更新状态为生成中
|
||||
await DataItem.findByIdAndUpdate(dataItem._id, {
|
||||
status: 2
|
||||
});
|
||||
|
||||
// 获取 openapi Key
|
||||
let userApiKey, systemKey;
|
||||
try {
|
||||
const key = await getOpenApiKey(dataItem.userId);
|
||||
userApiKey = key.userApiKey;
|
||||
systemKey = key.systemKey;
|
||||
} catch (error) {
|
||||
// 余额不够了, 把用户所有记录改成闲置
|
||||
await DataItem.updateMany({
|
||||
userId: dataItem.userId,
|
||||
status: 0
|
||||
});
|
||||
throw new Error('获取 openai key 失败');
|
||||
}
|
||||
|
||||
console.log('正在生成一组摘要, ID:', dataItem._id);
|
||||
|
||||
const startTime = Date.now();
|
||||
|
||||
// 获取 openai 请求实例
|
||||
const chatAPI = getOpenAIApi(userApiKey || systemKey);
|
||||
// 请求 chatgpt 获取摘要
|
||||
const abstractResponse = await Promise.allSettled(
|
||||
[0.5, 1].map((temperature) =>
|
||||
chatAPI.createChatCompletion(
|
||||
{
|
||||
model: ChatModelNameEnum.GPT35,
|
||||
temperature: temperature,
|
||||
n: 1,
|
||||
messages: [
|
||||
systemPrompt,
|
||||
{
|
||||
role: 'user',
|
||||
content: dataItem?.text || ''
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
timeout: 120000,
|
||||
httpsAgent
|
||||
}
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
// 过滤出成功的响应
|
||||
const successAbstracts = abstractResponse.filter((item) => item.status === 'fulfilled');
|
||||
// 提取摘要内容
|
||||
const rawContents: string[] = successAbstracts.map(
|
||||
(item: any) => item?.value?.data.choices[0].message?.content || ''
|
||||
);
|
||||
// 从 content 中提取摘要内容
|
||||
const splitContents = rawContents.map((content) => splitText(content)).flat();
|
||||
|
||||
// 生成词向量
|
||||
const vectorResponse = await Promise.allSettled(
|
||||
splitContents.map((item) =>
|
||||
chatAPI.createEmbedding({
|
||||
model: 'text-embedding-ada-002',
|
||||
input: item.abstract
|
||||
})
|
||||
)
|
||||
);
|
||||
// 筛选成功的向量请求
|
||||
const vectorSuccessResponse = vectorResponse
|
||||
.map((item: any, i) => {
|
||||
if (item.status !== 'fulfilled') return '';
|
||||
return {
|
||||
abstract: splitContents[i].abstract,
|
||||
abstractVector: item?.value?.data?.data?.[0]?.embedding
|
||||
};
|
||||
})
|
||||
.filter((item) => item);
|
||||
|
||||
// 插入数据库,并修改状态
|
||||
await DataItem.findByIdAndUpdate(dataItem._id, {
|
||||
status: 0,
|
||||
$push: {
|
||||
rawResponse: {
|
||||
$each: rawContents
|
||||
},
|
||||
result: {
|
||||
$each: vectorSuccessResponse
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// 计费
|
||||
!userApiKey &&
|
||||
splitContents.length > 0 &&
|
||||
pushSplitDataBill({
|
||||
userId: dataItem.userId,
|
||||
type: 'abstract',
|
||||
text:
|
||||
systemPrompt.content +
|
||||
dataItem.text +
|
||||
rawContents.join('') +
|
||||
rawContents.join('').substring(0, Math.floor(dataItem.text.length / 10)) // 向量价格是gpt35的1/10
|
||||
});
|
||||
console.log(
|
||||
'生成摘要成功,time:',
|
||||
`${(Date.now() - startTime) / 1000}s`,
|
||||
'摘要数量:',
|
||||
splitContents.length
|
||||
);
|
||||
} catch (error: any) {
|
||||
console.log('error: 生成摘要错误', dataItem?._id);
|
||||
console.log('response:', error);
|
||||
if (dataItem?._id) {
|
||||
await DataItem.findByIdAndUpdate(dataItem._id, {
|
||||
status: dataItem.times > 0 ? 1 : 0, // 还有重试次数则可以继续进行
|
||||
$inc: {
|
||||
// 剩余尝试次数-1
|
||||
times: -1
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
generateAbstract(true);
|
||||
}
|
||||
|
||||
/**
|
||||
* 检查文本是否按格式返回
|
||||
*/
|
||||
function splitText(text: string) {
|
||||
const regex = /\(\d+\):(\s*)(.*)(\s*)/g;
|
||||
const matches = text.matchAll(regex); // 获取所有匹配到的结果
|
||||
|
||||
const result = []; // 存储最终的结果
|
||||
for (const match of matches) {
|
||||
if (match[2]) {
|
||||
result.push({
|
||||
abstract: match[2] as string
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
@@ -20,7 +20,8 @@ export async function generateQA(next = false): Promise<any> {
|
||||
// 找出一个需要生成的 dataItem
|
||||
dataItem = await DataItem.findOne({
|
||||
status: { $ne: 0 },
|
||||
times: { $gt: 0 }
|
||||
times: { $gt: 0 },
|
||||
type: 'QA'
|
||||
});
|
||||
|
||||
if (!dataItem) {
|
||||
@@ -49,62 +50,72 @@ export async function generateQA(next = false): Promise<any> {
|
||||
throw new Error('获取 openai key 失败');
|
||||
}
|
||||
|
||||
console.log('正在生成一个QA, ID:', dataItem._id, 'temperature: ', dataItem.temperature / 100);
|
||||
console.log('正在生成一组QA, ID:', dataItem._id);
|
||||
|
||||
const startTime = Date.now();
|
||||
|
||||
// 获取 openai 请求实例
|
||||
const chatAPI = getOpenAIApi(userApiKey || systemKey);
|
||||
// 请求 chatgpt 获取回答
|
||||
const response = await chatAPI.createChatCompletion(
|
||||
{
|
||||
model: ChatModelNameEnum.GPT35,
|
||||
temperature: dataItem.temperature / 100,
|
||||
n: 1,
|
||||
messages: [
|
||||
systemPrompt,
|
||||
const response = await Promise.allSettled(
|
||||
[0, 0.5, 0.8].map((temperature) =>
|
||||
chatAPI.createChatCompletion(
|
||||
{
|
||||
role: 'user',
|
||||
content: dataItem.text
|
||||
model: ChatModelNameEnum.GPT35,
|
||||
temperature: temperature,
|
||||
n: 1,
|
||||
messages: [
|
||||
systemPrompt,
|
||||
{
|
||||
role: 'user',
|
||||
content: dataItem?.text || ''
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
timeout: 120000,
|
||||
httpsAgent
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
timeout: 120000,
|
||||
httpsAgent
|
||||
}
|
||||
)
|
||||
)
|
||||
);
|
||||
// 过滤出成功的响应
|
||||
const successResponse = response.filter((item) => item.status === 'fulfilled');
|
||||
// 提取响应内容
|
||||
const rawContents: string[] = successResponse.map(
|
||||
(item: any) => item?.value?.data.choices[0].message?.content || ''
|
||||
);
|
||||
const content = response.data.choices[0].message?.content;
|
||||
// 从 content 中提取 QA
|
||||
const splitResponse = splitText(content || '');
|
||||
const splitResponses = rawContents.map((content) => splitText(content)).flat();
|
||||
// 插入数据库,并修改状态
|
||||
await DataItem.findByIdAndUpdate(dataItem._id, {
|
||||
status: dataItem.temperature >= 90 ? 0 : 1, // 需要生成 4 组内容。0,0.3,0.6,0.9
|
||||
temperature: dataItem.temperature >= 90 ? dataItem.temperature : dataItem.temperature + 30,
|
||||
status: 0,
|
||||
$push: {
|
||||
rawResponse: content,
|
||||
rawResponse: {
|
||||
$each: rawContents
|
||||
},
|
||||
result: {
|
||||
$each: splitResponse
|
||||
$each: splitResponses
|
||||
}
|
||||
}
|
||||
});
|
||||
// 计费
|
||||
!userApiKey &&
|
||||
splitResponse.length > 0 &&
|
||||
splitResponses.length > 0 &&
|
||||
pushSplitDataBill({
|
||||
userId: dataItem.userId,
|
||||
text: systemPrompt.content + dataItem.text + content
|
||||
type: 'QA',
|
||||
text: systemPrompt.content + dataItem.text + rawContents.join('')
|
||||
});
|
||||
console.log(
|
||||
'生成QA成功,time:',
|
||||
`${(Date.now() - startTime) / 1000}s`,
|
||||
'QA数量:',
|
||||
splitResponse.length
|
||||
splitResponses.length
|
||||
);
|
||||
} catch (error: any) {
|
||||
console.log('error: 生成QA错误', dataItem?._id);
|
||||
console.log('response:', error?.response);
|
||||
// 重置状态
|
||||
if (dataItem?._id) {
|
||||
await DataItem.findByIdAndUpdate(dataItem._id, {
|
||||
status: dataItem.times > 0 ? 1 : 0, // 还有重试次数则可以继续进行
|
||||
|
||||
@@ -2,6 +2,7 @@ import { connectToDatabase, Bill, User } from '../mongo';
|
||||
import { modelList, ChatModelNameEnum } from '@/constants/model';
|
||||
import { encode } from 'gpt-token-utils';
|
||||
import { formatPrice } from '@/utils/user';
|
||||
import type { DataType } from '@/types/data';
|
||||
|
||||
export const pushChatBill = async ({
|
||||
modelName,
|
||||
@@ -59,7 +60,15 @@ export const pushChatBill = async ({
|
||||
}
|
||||
};
|
||||
|
||||
export const pushSplitDataBill = async ({ userId, text }: { userId: string; text: string }) => {
|
||||
export const pushSplitDataBill = async ({
|
||||
userId,
|
||||
text,
|
||||
type
|
||||
}: {
|
||||
userId: string;
|
||||
text: string;
|
||||
type: DataType;
|
||||
}) => {
|
||||
await connectToDatabase();
|
||||
|
||||
let billId;
|
||||
@@ -83,7 +92,7 @@ export const pushSplitDataBill = async ({ userId, text }: { userId: string; text
|
||||
// 插入 Bill 记录
|
||||
const res = await Bill.create({
|
||||
userId,
|
||||
type: 'splitData',
|
||||
type,
|
||||
modelName: ChatModelNameEnum.GPT35,
|
||||
textLen: text.length,
|
||||
tokenLen: tokens.length,
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import { Schema, model, models, Model } from 'mongoose';
|
||||
import { DataItemSchema as Datatype } from '@/types/mongoSchema';
|
||||
import { DataSchema as Datatype } from '@/types/mongoSchema';
|
||||
import { DataTypeTextMap } from '@/constants/data';
|
||||
|
||||
const DataSchema = new Schema({
|
||||
userId: {
|
||||
@@ -15,6 +16,11 @@ const DataSchema = new Schema({
|
||||
type: Date,
|
||||
default: () => new Date()
|
||||
},
|
||||
type: {
|
||||
type: String,
|
||||
required: true,
|
||||
enum: Object.keys(DataTypeTextMap)
|
||||
},
|
||||
isDeleted: {
|
||||
type: Boolean,
|
||||
default: false
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import type { DataItemSchema as DataItemType } from '@/types/mongoSchema';
|
||||
import { Schema, model, models, Model } from 'mongoose';
|
||||
import { DataTypeTextMap } from '@/constants/data';
|
||||
|
||||
const DataItemSchema = new Schema({
|
||||
userId: {
|
||||
@@ -12,19 +13,23 @@ const DataItemSchema = new Schema({
|
||||
ref: 'data',
|
||||
required: true
|
||||
},
|
||||
type: {
|
||||
type: String,
|
||||
required: true,
|
||||
enum: Object.keys(DataTypeTextMap)
|
||||
},
|
||||
times: {
|
||||
// 剩余重试次数
|
||||
type: Number,
|
||||
default: 3
|
||||
},
|
||||
text: {
|
||||
// 文本内容
|
||||
type: String,
|
||||
required: true
|
||||
},
|
||||
temperature: {
|
||||
type: Number,
|
||||
required: true
|
||||
},
|
||||
rawResponse: {
|
||||
// 原始拆分结果
|
||||
type: [String],
|
||||
default: []
|
||||
},
|
||||
@@ -33,11 +38,21 @@ const DataItemSchema = new Schema({
|
||||
{
|
||||
q: {
|
||||
type: String,
|
||||
required: true
|
||||
default: ''
|
||||
},
|
||||
a: {
|
||||
type: String,
|
||||
required: true
|
||||
default: ''
|
||||
},
|
||||
abstract: {
|
||||
// 摘要
|
||||
type: String,
|
||||
default: ''
|
||||
},
|
||||
abstractVector: {
|
||||
// 摘要对应的向量
|
||||
type: [Number],
|
||||
default: []
|
||||
}
|
||||
}
|
||||
],
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import mongoose from 'mongoose';
|
||||
import { generateQA } from './events/generateQA';
|
||||
import { generateAbstract } from './events/generateAbstract';
|
||||
|
||||
/**
|
||||
* 连接 MongoDB 数据库
|
||||
*/
|
||||
@@ -24,8 +26,8 @@ export async function connectToDatabase(): Promise<void> {
|
||||
global.mongodb = null;
|
||||
}
|
||||
|
||||
// 递归 QA 生成
|
||||
generateQA();
|
||||
generateAbstract();
|
||||
}
|
||||
|
||||
export * from './models/authCode';
|
||||
|
||||
Reference in New Issue
Block a user