This commit is contained in:
Archer
2023-11-15 21:35:50 +08:00
committed by GitHub
parent bfd8be5df0
commit cd3acb44ab
39 changed files with 457 additions and 160 deletions

View File

@@ -11,6 +11,8 @@ import {
import { authCert } from '@fastgpt/service/support/permission/auth/common';
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
import { getUserDefaultTeam } from '@fastgpt/service/support/user/team/controller';
import { MongoDataset } from '@fastgpt/service/core/dataset/schema';
import { defaultQAModels } from '@fastgpt/global/core/ai/model';
let success = 0;
/* pg 中的数据搬到 mongo dataset.datas 中,并做映射 */
@@ -41,6 +43,13 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
await initPgData();
await MongoDataset.updateMany(
{},
{
agentModel: defaultQAModels[0].model
}
);
jsonRes(res, {
data: await init(limit),
message:
@@ -76,14 +85,19 @@ async function initPgData() {
for (let i = 0; i < limit; i++) {
init(i);
}
async function init(index: number): Promise<any> {
const userId = rows[index]?.user_id;
if (!userId) return;
try {
const tmb = await getUserDefaultTeam({ userId });
console.log(tmb);
// update pg
await PgClient.query(
`Update ${PgDatasetTableName} set team_id = '${tmb.teamId}', tmb_id = '${tmb.tmbId}' where user_id = '${userId}' AND team_id='null';`
`Update ${PgDatasetTableName} set team_id = '${String(tmb.teamId)}', tmb_id = '${String(
tmb.tmbId
)}' where user_id = '${userId}' AND team_id='null';`
);
console.log(++success);
init(index + limit);

View File

@@ -0,0 +1,101 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { delay } from '@/utils/tools';
import { PgClient } from '@fastgpt/service/common/pg';
import {
DatasetDataIndexTypeEnum,
PgDatasetTableName
} from '@fastgpt/global/core/dataset/constant';
import { authCert } from '@fastgpt/service/support/permission/auth/common';
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
let success = 0;
/* pg 中的数据搬到 mongo dataset.datas 中,并做映射 */
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
try {
const { limit = 50 } = req.body as { limit: number };
await authCert({ req, authRoot: true });
await connectToDatabase();
success = 0;
jsonRes(res, {
data: await init(limit)
});
} catch (error) {
console.log(error);
jsonRes(res, {
code: 500,
error
});
}
}
type PgItemType = {
id: string;
q: string;
a: string;
dataset_id: string;
collection_id: string;
data_id: string;
};
async function init(limit: number): Promise<any> {
const { rows: idList } = await PgClient.query<{ id: string }>(
`SELECT id FROM ${PgDatasetTableName} WHERE inited=1`
);
console.log('totalCount', idList.length);
await delay(2000);
if (idList.length === 0) return;
for (let i = 0; i < limit; i++) {
initData(i);
}
async function initData(index: number): Promise<any> {
const dataId = idList[index]?.id;
if (!dataId) {
console.log('done');
return;
}
// get limit data where data_id is null
const { rows } = await PgClient.query<PgItemType>(
`SELECT id,q,a,dataset_id,collection_id,data_id FROM ${PgDatasetTableName} WHERE id=${dataId};`
);
const data = rows[0];
if (!data) {
console.log('done');
return;
}
try {
// update mongo data and update inited
await MongoDatasetData.findByIdAndUpdate(data.data_id, {
q: data.q,
a: data.a,
indexes: [
{
defaultIndex: !data.a,
type: data.a ? DatasetDataIndexTypeEnum.qa : DatasetDataIndexTypeEnum.chunk,
dataId: data.id,
text: data.q
}
]
});
// update pg data_id
await PgClient.query(`UPDATE ${PgDatasetTableName} SET inited=0 WHERE id=${dataId};`);
return initData(index + limit);
} catch (error) {
console.log(error);
console.log(data);
await delay(500);
return initData(index);
}
}
}

View File

@@ -2,7 +2,7 @@ import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { MongoDataset } from '@fastgpt/service/core/dataset/schema';
import { getVectorModel } from '@/service/core/ai/model';
import { getQAModel, getVectorModel } from '@/service/core/ai/model';
import type { DatasetItemType } from '@fastgpt/global/core/dataset/type.d';
import { mongoRPermission } from '@fastgpt/global/support/permission/utils';
import { authUserRole } from '@fastgpt/service/support/permission/auth/user';
@@ -22,6 +22,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
const data = datasets.map((item) => ({
...item.toJSON(),
vectorModel: getVectorModel(item.vectorModel),
agentModel: getQAModel(item.agentModel),
canWrite: String(item.tmbId) === tmbId,
isOwner: teamOwner || String(item.tmbId) === tmbId
}));

View File

@@ -0,0 +1,73 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { MongoUser } from '@fastgpt/service/support/user/schema';
import { addLog } from '@fastgpt/service/common/mongo/controller';
import { authDataset } from '@fastgpt/service/support/permission/auth/dataset';
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
import { findDatasetIdTreeByTopDatasetId } from '@fastgpt/service/core/dataset/controller';
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
try {
await connectToDatabase();
let { datasetId } = req.query as {
datasetId: string;
};
if (!datasetId) {
throw new Error('缺少参数');
}
// 凭证校验
const { userId } = await authDataset({ req, authToken: true, datasetId, per: 'w' });
await limitCheck({
datasetId,
userId
});
jsonRes(res);
} catch (err) {
res.status(500);
jsonRes(res, {
code: 500,
error: err
});
}
}
export async function limitCheck({ datasetId, userId }: { datasetId: string; userId: string }) {
const exportIds = await findDatasetIdTreeByTopDatasetId(datasetId);
const limitMinutesAgo = new Date(
Date.now() - (global.feConfigs?.limit?.exportLimitMinutes || 0) * 60 * 1000
);
// auth export times
const authTimes = await MongoUser.findOne(
{
_id: userId,
$or: [
{ 'limit.exportKbTime': { $exists: false } },
{ 'limit.exportKbTime': { $lte: limitMinutesAgo } }
]
},
'_id limit'
);
if (!authTimes) {
const minutes = `${global.feConfigs?.limit?.exportLimitMinutes || 0} 分钟`;
return Promise.reject(`上次导出未到 ${minutes},每 ${minutes}仅可导出一次。`);
}
// auth max data
const total = await MongoDatasetData.countDocuments({
datasetId: { $in: exportIds }
});
addLog.info(`export datasets: ${datasetId}`, { total });
if (total > 100000) {
return Promise.reject('数据量超出 10 万,无法导出');
}
}

View File

@@ -9,7 +9,8 @@ import { authUserNotVisitor } from '@fastgpt/service/support/permission/auth/use
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
try {
await connectToDatabase();
const { name, tags, avatar, vectorModel, parentId, type } = req.body as CreateDatasetParams;
const { name, tags, avatar, vectorModel, agentModel, parentId, type } =
req.body as CreateDatasetParams;
// 凭证校验
const { teamId, tmbId } = await authUserNotVisitor({ req, authToken: true });
@@ -20,6 +21,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
tmbId,
tags,
vectorModel,
agentModel,
avatar,
parentId: parentId || null,
type

View File

@@ -10,7 +10,7 @@ import { countPromptTokens } from '@fastgpt/global/common/string/tiktoken';
import type { PushDataResponse } from '@/global/core/api/datasetRes.d';
import type { PushDatasetDataProps } from '@/global/core/dataset/api.d';
import { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api';
import { getVectorModel } from '@/service/core/ai/model';
import { getQAModel, getVectorModel } from '@/service/core/ai/model';
import { authDatasetCollection } from '@fastgpt/service/support/permission/auth/dataset';
import { getCollectionWithDataset } from '@fastgpt/service/core/dataset/controller';
@@ -63,24 +63,14 @@ export async function pushDataToDatasetCollection({
mode,
prompt,
billId
}: { teamId: string; tmbId: string } & PushDatasetDataProps): Promise<PushDataResponse> {
// get dataset vector model
const {
datasetId: { _id: datasetId, vectorModel }
} = await getCollectionWithDataset(collectionId);
const vectorModelData = getVectorModel(vectorModel);
const modeMap = {
[TrainingModeEnum.chunk]: {
maxToken: vectorModelData.maxToken * 1.5,
model: vectorModelData.model
},
[TrainingModeEnum.qa]: {
maxToken: global.qaModels[0].maxContext * 0.8,
model: global.qaModels[0].model
}
};
}: {
teamId: string;
tmbId: string;
} & PushDatasetDataProps): Promise<PushDataResponse> {
const { datasetId, model, maxToken } = await checkModelValid({
mode,
collectionId
});
// filter repeat or equal content
const set = new Set();
@@ -102,12 +92,13 @@ export async function pushDataToDatasetCollection({
// count q token
const token = countPromptTokens(item.q);
if (token > modeMap[mode].maxToken) {
if (token > maxToken) {
filterResult.overToken.push(item);
return;
}
if (set.has(text)) {
console.log('repeat', item);
filterResult.repeat.push(item);
} else {
filterResult.success.push(item);
@@ -126,7 +117,7 @@ export async function pushDataToDatasetCollection({
billId,
mode,
prompt,
model: modeMap[mode].model,
model,
q: item.q,
a: item.a,
indexes: item.indexes
@@ -142,6 +133,44 @@ export async function pushDataToDatasetCollection({
};
}
export async function checkModelValid({
mode,
collectionId
}: {
mode: `${TrainingModeEnum}`;
collectionId: string;
}) {
const {
datasetId: { _id: datasetId, vectorModel, agentModel }
} = await getCollectionWithDataset(collectionId);
if (mode === TrainingModeEnum.chunk) {
if (!collectionId) return Promise.reject(`CollectionId is empty`);
const vectorModelData = getVectorModel(vectorModel);
if (!vectorModelData) {
return Promise.reject(`Model ${vectorModel} is inValid`);
}
return {
datasetId,
maxToken: vectorModelData.maxToken * 1.5,
model: vectorModelData.model
};
}
if (mode === TrainingModeEnum.qa) {
const qaModelData = getQAModel(agentModel);
if (!qaModelData) {
return Promise.reject(`Model ${agentModel} is inValid`);
}
return {
datasetId,
maxToken: qaModelData.maxContext * 0.8,
model: qaModelData.model
};
}
return Promise.reject(`Mode ${mode} is inValid`);
}
export const config = {
api: {
bodyParser: {

View File

@@ -1,7 +1,7 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { getVectorModel } from '@/service/core/ai/model';
import { getQAModel, getVectorModel } from '@/service/core/ai/model';
import type { DatasetItemType } from '@fastgpt/global/core/dataset/type.d';
import { authDataset } from '@fastgpt/service/support/permission/auth/dataset';
@@ -28,6 +28,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
data: {
...dataset,
vectorModel: getVectorModel(dataset.vectorModel),
agentModel: getQAModel(dataset.agentModel),
canWrite,
isOwner
}

View File

@@ -1,5 +1,5 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { jsonRes, responseWriteController } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { MongoUser } from '@fastgpt/service/support/user/schema';
import { addLog } from '@fastgpt/service/common/mongo/controller';
@@ -8,6 +8,7 @@ import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
import { findDatasetIdTreeByTopDatasetId } from '@fastgpt/service/core/dataset/controller';
import { Readable } from 'stream';
import type { Cursor } from '@fastgpt/service/common/mongo';
import { limitCheck } from './checkExportLimit';
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
try {
@@ -23,39 +24,12 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
// 凭证校验
const { userId } = await authDataset({ req, authToken: true, datasetId, per: 'w' });
const exportIds = await findDatasetIdTreeByTopDatasetId(datasetId);
const limitMinutesAgo = new Date(
Date.now() - (global.feConfigs?.limit?.exportLimitMinutes || 0) * 60 * 1000
);
// auth export times
const authTimes = await MongoUser.findOne(
{
_id: userId,
$or: [
{ 'limit.exportKbTime': { $exists: false } },
{ 'limit.exportKbTime': { $lte: limitMinutesAgo } }
]
},
'_id limit'
);
if (!authTimes) {
const minutes = `${global.feConfigs?.limit?.exportLimitMinutes || 0} 分钟`;
throw new Error(`上次导出未到 ${minutes},每 ${minutes}仅可导出一次。`);
}
// auth max data
const total = await MongoDatasetData.countDocuments({
datasetId: { $in: exportIds }
await limitCheck({
userId,
datasetId
});
addLog.info(`export datasets: ${datasetId}`, { total });
if (total > 100000) {
throw new Error('数据量超出 10 万,无法导出');
}
const exportIds = await findDatasetIdTreeByTopDatasetId(datasetId);
res.setHeader('Content-Type', 'text/csv; charset=utf-8;');
res.setHeader('Content-Disposition', 'attachment; filename=dataset.csv; ');
@@ -72,35 +46,27 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
'q a'
).cursor();
function cursorToReadableStream(cursor: Cursor) {
const readable = new Readable({
objectMode: true,
read() {}
const write = responseWriteController({
res,
readStream: cursor
});
write(`\uFEFFindex,content`);
cursor.on('data', (doc) => {
const q = doc.q.replace(/"/g, '""') || '';
const a = doc.a.replace(/"/g, '""') || '';
write(`\n"${q}","${a}"`);
});
cursor.on('end', async () => {
cursor.close();
res.end();
await MongoUser.findByIdAndUpdate(userId, {
'limit.exportKbTime': new Date()
});
readable.push(`\uFEFFindex,content`);
cursor.on('data', (doc) => {
const q = doc.q.replace(/"/g, '""') || '';
const a = doc.a.replace(/"/g, '""') || '';
readable.push(`\n"${q}","${a}"`);
});
cursor.on('end', async () => {
readable.push(null);
cursor.close();
await MongoUser.findByIdAndUpdate(userId, {
'limit.exportKbTime': new Date()
});
});
return readable;
}
// @ts-ignore
const stream = cursorToReadableStream(cursor);
stream.pipe(res);
});
} catch (err) {
res.status(500);
jsonRes(res, {

View File

@@ -1,7 +1,7 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { getVectorModel } from '@/service/core/ai/model';
import { getQAModel, getVectorModel } from '@/service/core/ai/model';
import type { DatasetItemType } from '@fastgpt/global/core/dataset/type.d';
import { DatasetTypeEnum } from '@fastgpt/global/core/dataset/constant';
import { MongoDataset } from '@fastgpt/service/core/dataset/schema';
@@ -28,6 +28,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
datasets.map(async (item) => ({
...item.toJSON(),
vectorModel: getVectorModel(item.vectorModel),
agentModel: getQAModel(item.agentModel),
canWrite,
isOwner: teamOwner || String(item.tmbId) === tmbId
}))

View File

@@ -8,7 +8,8 @@ import { authDataset } from '@fastgpt/service/support/permission/auth/dataset';
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
try {
await connectToDatabase();
const { id, parentId, name, avatar, tags, permission } = req.body as DatasetUpdateParams;
const { id, parentId, name, avatar, tags, permission, agentModel } =
req.body as DatasetUpdateParams;
if (!id) {
throw new Error('缺少参数');
@@ -26,7 +27,8 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
...(name && { name }),
...(avatar && { avatar }),
...(tags && { tags }),
...(permission && { permission })
...(permission && { permission }),
...(agentModel && { agentModel: agentModel.model })
}
);

View File

@@ -5,15 +5,17 @@ import { MongoBill } from '@fastgpt/service/support/wallet/bill/schema';
import { authCert } from '@fastgpt/service/support/permission/auth/common';
import { BillSourceEnum } from '@fastgpt/global/support/wallet/bill/constants';
import { CreateTrainingBillProps } from '@fastgpt/global/support/wallet/bill/api.d';
import { getQAModel, getVectorModel } from '@/service/core/ai/model';
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
try {
await connectToDatabase();
const { name } = req.body as CreateTrainingBillProps;
const { name, vectorModel, agentModel } = req.body as CreateTrainingBillProps;
const { teamId, tmbId } = await authCert({ req, authToken: true, authApiKey: true });
const qaModel = global.qaModels[0];
const vectorModelData = getVectorModel(vectorModel);
const agentModelData = getQAModel(agentModel);
const { _id } = await MongoBill.create({
teamId,
@@ -23,13 +25,13 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
list: [
{
moduleName: '索引生成',
model: 'embedding',
model: vectorModelData.name,
amount: 0,
tokenLen: 0
},
{
moduleName: 'QA 拆分',
model: qaModel?.name,
model: agentModelData.name,
amount: 0,
tokenLen: 0
}