4.6.7 first pr (#726)
@@ -29,6 +29,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
     await MongoChatItem.findOneAndUpdate(
       {
         chatId,
         dataId: chatItemId
       },
       {
@@ -0,0 +1,88 @@
+/*
+    Create one dataset collection
+*/
+import type { NextApiRequest, NextApiResponse } from 'next';
+import { jsonRes } from '@fastgpt/service/common/response';
+import { connectToDatabase } from '@/service/mongo';
+import type { LinkCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
+import { authDataset } from '@fastgpt/service/support/permission/auth/dataset';
+import { createOneCollection } from '@fastgpt/service/core/dataset/collection/controller';
+import { TrainingModeEnum, DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
+import { checkDatasetLimit } from '@fastgpt/service/support/permission/limit/dataset';
+import { predictDataLimitLength } from '@fastgpt/global/core/dataset/utils';
+import { createTrainingBill } from '@fastgpt/service/support/wallet/bill/controller';
+import { BillSourceEnum } from '@fastgpt/global/support/wallet/bill/constants';
+import { getQAModel, getVectorModel } from '@/service/core/ai/model';
+import { reloadCollectionChunks } from '@fastgpt/service/core/dataset/collection/utils';
+import { startQueue } from '@/service/utils/tools';
+
+export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
+  try {
+    await connectToDatabase();
+    const {
+      link,
+      trainingType = TrainingModeEnum.chunk,
+      chunkSize = 512,
+      chunkSplitter,
+      qaPrompt,
+      ...body
+    } = req.body as LinkCreateDatasetCollectionParams;
+
+    const { teamId, tmbId, dataset } = await authDataset({
+      req,
+      authToken: true,
+      authApiKey: true,
+      datasetId: body.datasetId,
+      per: 'w'
+    });
+
+    // 1. check dataset limit
+    await checkDatasetLimit({
+      teamId,
+      freeSize: global.feConfigs?.subscription?.datasetStoreFreeSize,
+      insertLen: predictDataLimitLength(trainingType, new Array(10))
+    });
+
+    // 2. create collection
+    const collectionId = await createOneCollection({
+      ...body,
+      name: link,
+      teamId,
+      tmbId,
+      type: DatasetCollectionTypeEnum.link,
+
+      trainingType,
+      chunkSize,
+      chunkSplitter,
+      qaPrompt,
+
+      rawLink: link
+    });
+
+    // 3. create bill and start sync
+    const { billId } = await createTrainingBill({
+      teamId,
+      tmbId,
+      appName: 'core.dataset.collection.Sync Collection',
+      billSource: BillSourceEnum.training,
+      vectorModel: getVectorModel(dataset.vectorModel).name,
+      agentModel: getQAModel(dataset.agentModel).name
+    });
+    await reloadCollectionChunks({
+      collectionId,
+      tmbId,
+      billId
+    });
+
+    startQueue();
+
+    jsonRes(res, {
+      data: { collectionId }
+    });
+  } catch (err) {
+    jsonRes(res, {
+      code: 500,
+      error: err
+    });
+  }
+}
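For reference, a request against this new link-import endpoint might look like the sketch below. The route path and the Bearer token are illustrative assumptions (the diff does not show where the handler file is mounted); the body fields mirror the LinkCreateDatasetCollectionParams destructuring above.

// Hypothetical usage sketch; adjust the URL to the actual route of this handler.
const createLinkCollection = async (datasetId: string, link: string) => {
  const res = await fetch('/api/core/dataset/collection/create/link', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      // authApiKey: true means an API key is accepted as well as a session token
      Authorization: 'Bearer YOUR_API_KEY'
    },
    body: JSON.stringify({
      datasetId,
      link,
      trainingType: 'chunk', // handler default: TrainingModeEnum.chunk
      chunkSize: 512 // handler default
    })
  });
  const { data } = await res.json();
  return data.collectionId as string;
};

Note that the limit check here uses a fixed placeholder (new Array(10)) because the linked page has not been fetched yet; the real chunk count is only known after reloadCollectionChunks runs.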
@@ -0,0 +1,90 @@
+/*
+    Create one dataset collection
+*/
+import type { NextApiRequest, NextApiResponse } from 'next';
+import { jsonRes } from '@fastgpt/service/common/response';
+import { connectToDatabase } from '@/service/mongo';
+import type { TextCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
+import { authDataset } from '@fastgpt/service/support/permission/auth/dataset';
+import { createOneCollection } from '@fastgpt/service/core/dataset/collection/controller';
+import { TrainingModeEnum, DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
+import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
+import { checkDatasetLimit } from '@fastgpt/service/support/permission/limit/dataset';
+import { predictDataLimitLength } from '@fastgpt/global/core/dataset/utils';
+import { pushDataToDatasetCollection } from '@/service/core/dataset/data/controller';
+import { hashStr } from '@fastgpt/global/common/string/tools';
+
+export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
+  try {
+    await connectToDatabase();
+    const {
+      text,
+      trainingType = TrainingModeEnum.chunk,
+      chunkSize = 512,
+      chunkSplitter,
+      qaPrompt,
+      ...body
+    } = req.body as TextCreateDatasetCollectionParams;
+
+    const { teamId, tmbId } = await authDataset({
+      req,
+      authToken: true,
+      authApiKey: true,
+      datasetId: body.datasetId,
+      per: 'w'
+    });
+
+    // 1. split text to chunks
+    const { chunks } = splitText2Chunks({
+      text,
+      chunkLen: chunkSize,
+      overlapRatio: trainingType === TrainingModeEnum.chunk ? 0.2 : 0,
+      customReg: chunkSplitter ? [chunkSplitter] : [],
+      countTokens: false
+    });
+
+    // 2. check dataset limit
+    await checkDatasetLimit({
+      teamId,
+      freeSize: global.feConfigs?.subscription?.datasetStoreFreeSize,
+      insertLen: predictDataLimitLength(trainingType, chunks)
+    });
+
+    // 3. create collection
+    const collectionId = await createOneCollection({
+      ...body,
+      teamId,
+      tmbId,
+      type: DatasetCollectionTypeEnum.virtual,
+
+      trainingType,
+      chunkSize,
+      chunkSplitter,
+      qaPrompt,
+
+      hashRawText: hashStr(text),
+      rawTextLength: text.length
+    });
+
+    // 4. push chunks to training queue
+    const insertResults = await pushDataToDatasetCollection({
+      teamId,
+      tmbId,
+      collectionId,
+      trainingMode: trainingType,
+      data: chunks.map((text, index) => ({
+        q: text,
+        chunkIndex: index
+      }))
+    });
+
+    jsonRes(res, {
+      data: { collectionId, results: insertResults }
+    });
+  } catch (err) {
+    jsonRes(res, {
+      code: 500,
+      error: err
+    });
+  }
+}
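The text-import twin of the endpoint above splits the payload up front, so its limit check runs against the real chunk count rather than a placeholder. A hedged usage sketch, with the same caveat that the route path is assumed and not shown in this diff:

// Hypothetical usage sketch for the text-create endpoint.
const createTextCollection = async (datasetId: string, name: string, text: string) => {
  const res = await fetch('/api/core/dataset/collection/create/text', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      Authorization: 'Bearer YOUR_API_KEY' // accepted because authApiKey: true
    },
    body: JSON.stringify({ datasetId, name, text })
  });
  const { data } = await res.json();
  // data.results reports how the chunks were queued (insertLen, etc.)
  return data as { collectionId: string; results: { insertLen: number } };
};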
@@ -5,7 +5,6 @@ import type { NextApiRequest, NextApiResponse } from 'next';
 import { jsonRes } from '@fastgpt/service/common/response';
 import { connectToDatabase } from '@/service/mongo';
 import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
-import { authUserNotVisitor } from '@fastgpt/service/support/permission/auth/user';
 import { authDataset } from '@fastgpt/service/support/permission/auth/dataset';
 import { createOneCollection } from '@fastgpt/service/core/dataset/collection/controller';

@@ -14,13 +13,12 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
     await connectToDatabase();
     const body = req.body as CreateDatasetCollectionParams;

-    // auth. not visitor and dataset is public
-    const { teamId, tmbId } = await authUserNotVisitor({ req, authToken: true });
-    await authDataset({
+    const { teamId, tmbId } = await authDataset({
       req,
       authToken: true,
+      authApiKey: true,
       datasetId: body.datasetId,
-      per: 'r'
+      per: 'w'
     });

     jsonRes(res, {
@@ -4,13 +4,12 @@ import { connectToDatabase } from '@/service/mongo';
 import { findCollectionAndChild } from '@fastgpt/service/core/dataset/collection/utils';
 import { delCollectionRelevantData } from '@fastgpt/service/core/dataset/data/controller';
 import { authDatasetCollection } from '@fastgpt/service/support/permission/auth/dataset';
 import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';

 export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
   try {
     await connectToDatabase();

-    const { collectionId } = req.query as { collectionId: string };
+    const { id: collectionId } = req.query as { id: string };

     if (!collectionId) {
       throw new Error('CollectionIdId is required');
@@ -19,6 +18,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
     await authDatasetCollection({
       req,
       authToken: true,
+      authApiKey: true,
       collectionId,
       per: 'w'
     });
@@ -22,6 +22,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
     const { collection, canWrite } = await authDatasetCollection({
       req,
       authToken: true,
+      authApiKey: true,
       collectionId: id,
       per: 'r'
     });
@@ -11,7 +11,6 @@ import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant
 import { startQueue } from '@/service/utils/tools';
 import { authDataset } from '@fastgpt/service/support/permission/auth/dataset';
 import { DatasetDataCollectionName } from '@fastgpt/service/core/dataset/data/schema';
-import { authUserRole } from '@fastgpt/service/support/permission/auth/user';

 export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
   try {
@@ -27,12 +26,19 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
       simple = false
     } = req.body as GetDatasetCollectionsProps;
     searchText = searchText?.replace(/'/g, '');
+    pageSize = Math.min(pageSize, 30);

     // auth dataset and get my role
-    const { tmbId } = await authDataset({ req, authToken: true, datasetId, per: 'r' });
-    const { canWrite } = await authUserRole({ req, authToken: true });
+    const { teamId, tmbId, canWrite } = await authDataset({
+      req,
+      authToken: true,
+      authApiKey: true,
+      datasetId,
+      per: 'r'
+    });

     const match = {
+      teamId: new Types.ObjectId(teamId),
       datasetId: new Types.ObjectId(datasetId),
       parentId: parentId ? new Types.ObjectId(parentId) : null,
       ...(selectFolder ? { type: DatasetCollectionTypeEnum.folder } : {}),
@@ -85,9 +91,9 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
             }
           }
         },
-        { $project: { _id: 1 } }
+        { $count: 'count' }
       ],
-      as: 'trainings'
+      as: 'trainingCount'
     }
   },
   // count collection total data
@@ -103,9 +109,9 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
             }
           }
         },
-        { $project: { _id: 1 } }
+        { $count: 'count' }
       ],
-      as: 'datas'
+      as: 'dataCount'
     }
   },
   {
@@ -117,10 +123,14 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
       type: 1,
       status: 1,
       updateTime: 1,
-      dataAmount: { $size: '$datas' },
-      trainingAmount: { $size: '$trainings' },
       fileId: 1,
-      rawLink: 1
+      rawLink: 1,
+      dataAmount: {
+        $ifNull: [{ $arrayElemAt: ['$dataCount.count', 0] }, 0]
+      },
+      trainingAmount: {
+        $ifNull: [{ $arrayElemAt: ['$trainingCount.count', 0] }, 0]
+      }
     }
   },
   {
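The aggregation change above is worth a note. The old $lookup sub-pipelines ended with { $project: { _id: 1 } }, materializing one array element per matching training/data document just so a later $size could count them. Ending the sub-pipeline with { $count: 'count' } makes the server return at most one element, [{ count: N }] or [] when nothing matches, which is why the $project stage now unwraps it with $arrayElemAt and defaults it with $ifNull. A minimal standalone illustration of the pattern (the collection name in `from` is an assumption for illustration):

// Sketch of the counting pattern; field names match the diff.
const pipeline = [
  {
    $lookup: {
      from: 'dataset_trainings', // assumed name, for illustration only
      let: { id: '$_id' },
      pipeline: [
        { $match: { $expr: { $eq: ['$collectionId', '$$id'] } } },
        { $count: 'count' } // yields [{ count: N }] or []
      ],
      as: 'trainingCount'
    }
  },
  {
    $project: {
      trainingAmount: {
        // unwrap the single-element array, defaulting to 0 when empty
        $ifNull: [{ $arrayElemAt: ['$trainingCount.count', 0] }, 0]
      }
    }
  }
];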
@@ -144,7 +154,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
   );

   if (data.find((item) => item.trainingAmount > 0)) {
-    startQueue(1);
+    startQueue();
   }

   // count collections
@@ -38,7 +38,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
     return Promise.reject(DatasetErrEnum.unLinkCollection);
   }

-  const { rawText, isSameRawText } = await getCollectionAndRawText({
+  const { title, rawText, isSameRawText } = await getCollectionAndRawText({
     collection
   });

@@ -68,7 +68,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
     tmbId: collection.tmbId,
     parentId: collection.parentId,
     datasetId: collection.datasetId._id,
-    name: collection.name,
+    name: title || collection.name,
     type: collection.type,
     trainingType: collection.trainingType,
     chunkSize: collection.chunkSize,
@@ -16,7 +16,13 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
   }

   // credential check
-  await authDatasetCollection({ req, authToken: true, collectionId: id, per: 'w' });
+  await authDatasetCollection({
+    req,
+    authToken: true,
+    authApiKey: true,
+    collectionId: id,
+    per: 'w'
+  });

   const updateFields: Record<string, any> = {
     ...(parentId !== undefined && { parentId: parentId || null }),
@@ -16,12 +16,28 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
     type,
     avatar,
     vectorModel = global.vectorModels[0].model,
-    agentModel
+    agentModel = global.qaModels[0].model
   } = req.body as CreateDatasetParams;

-  // credential check
+  // auth
   const { teamId, tmbId } = await authUserNotVisitor({ req, authToken: true });

+  // check model valid
+  const vectorModelStore = global.vectorModels.find((item) => item.model === vectorModel);
+  const agentModelStore = global.qaModels.find((item) => item.model === agentModel);
+  if (!vectorModelStore || !agentModelStore) {
+    throw new Error('vectorModel or qaModel is invalid');
+  }
+
+  // check limit
+  const authCount = await MongoDataset.countDocuments({
+    teamId,
+    type: DatasetTypeEnum.dataset
+  });
+  if (authCount >= 50) {
+    throw new Error('每个团队上限 50 个知识库'); // i.e. "each team is limited to 50 datasets"
+  }
+
   const { _id } = await MongoDataset.create({
     name,
     teamId,
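Two behavioral changes land in the hunk above: agentModel now falls back to the first configured QA model instead of staying undefined, and both model names are validated against the deployment's model config before anything is created. A hedged request-body sketch (the model names are placeholders; valid values depend on the instance's configuration):

// Hypothetical create-dataset body; both models must appear in
// global.vectorModels / global.qaModels or the handler throws.
const createDatasetBody = {
  name: 'My dataset',
  vectorModel: 'text-embedding-ada-002' // placeholder; must be configured
  // agentModel may now be omitted; it defaults to global.qaModels[0].model
};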
@@ -8,8 +8,8 @@ import { delDatasetDataByDataId } from '@fastgpt/service/core/dataset/data/contr
 export default withNextCors(async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
   try {
     await connectToDatabase();
-    const { dataId } = req.query as {
-      dataId: string;
+    const { id: dataId } = req.query as {
+      id: string;
     };

     if (!dataId) {
@@ -17,9 +17,18 @@ export default withNextCors(async function handler(req: NextApiRequest, res: Nex
     }

     // credential check
-    await authDatasetData({ req, authToken: true, dataId, per: 'w' });
+    const { datasetData } = await authDatasetData({
+      req,
+      authToken: true,
+      authApiKey: true,
+      dataId,
+      per: 'w'
+    });

-    await delDatasetDataByDataId(dataId);
+    await delDatasetDataByDataId({
+      collectionId: datasetData.collectionId,
+      mongoDataId: dataId
+    });

     jsonRes(res, {
       data: 'success'
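Note the interface change in this hunk and the one before it: the delete-data route now reads the record id from the query key id (previously dataId), and an API key is accepted alongside a session token. A sketch of the updated call, with the route path assumed since the diff does not show the file location:

// Hypothetical usage sketch for the renamed query parameter.
await fetch(`/api/core/dataset/data/delete?id=${dataId}`, {
  method: 'DELETE',
  headers: { Authorization: 'Bearer YOUR_API_KEY' } // works since authApiKey: true
});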
@@ -13,12 +13,18 @@ export type Response = {
 export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
   try {
     await connectToDatabase();
-    const { dataId } = req.query as {
-      dataId: string;
+    const { id: dataId } = req.query as {
+      id: string;
     };

     // credential check
-    const { datasetData } = await authDatasetData({ req, authToken: true, dataId, per: 'r' });
+    const { datasetData } = await authDatasetData({
+      req,
+      authToken: true,
+      authApiKey: true,
+      dataId,
+      per: 'r'
+    });

     jsonRes(res, {
       data: datasetData
@@ -16,6 +16,7 @@ import { authTeamBalance } from '@/service/support/permission/auth/bill';
 import { pushGenerateVectorBill } from '@/service/support/wallet/bill/push';
 import { InsertOneDatasetDataProps } from '@/global/core/dataset/api';
 import { simpleText } from '@fastgpt/global/common/string/tools';
+import { checkDatasetLimit } from '@fastgpt/service/support/permission/limit/dataset';

 export default withNextCors(async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
   try {
@@ -39,6 +40,12 @@ export default withNextCors(async function handler(req: NextApiRequest, res: Nex
       per: 'w'
     });

+    await checkDatasetLimit({
+      teamId,
+      freeSize: global.feConfigs?.subscription?.datasetStoreFreeSize,
+      insertLen: 1
+    });
+
     // auth collection and get dataset
     const [
       {
@@ -17,8 +17,10 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
     collectionId
   } = req.body as GetDatasetDataListProps;

+  pageSize = Math.min(pageSize, 30);
+
   // credential check
-  await authDatasetCollection({ req, authToken: true, collectionId, per: 'r' });
+  await authDatasetCollection({ req, authToken: true, authApiKey: true, collectionId, per: 'r' });

   searchText = searchText.replace(/'/g, '');

@@ -32,7 +34,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
   };

   const [data, total] = await Promise.all([
-    MongoDatasetData.find(match, '_id datasetId collectionId q a chunkIndex indexes')
+    MongoDatasetData.find(match, '_id datasetId collectionId q a chunkIndex')
       .sort({ chunkIndex: 1, updateTime: -1 })
       .skip((pageNum - 1) * pageSize)
       .limit(pageSize)
@@ -2,38 +2,30 @@
 import type { NextApiRequest, NextApiResponse } from 'next';
 import { jsonRes } from '@fastgpt/service/common/response';
 import { connectToDatabase } from '@/service/mongo';
-import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';
 import { withNextCors } from '@fastgpt/service/common/middle/cors';
-import { TrainingModeEnum, TrainingTypeMap } from '@fastgpt/global/core/dataset/constant';
-import { startQueue } from '@/service/utils/tools';
-import { countPromptTokens } from '@fastgpt/global/common/string/tiktoken';
 import type { PushDataResponse } from '@/global/core/api/datasetRes.d';
 import type { PushDatasetDataProps } from '@/global/core/dataset/api.d';
-import { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api';
-import { getQAModel, getVectorModel } from '@/service/core/ai/model';
 import { authDatasetCollection } from '@fastgpt/service/support/permission/auth/dataset';
-import { getCollectionWithDataset } from '@fastgpt/service/core/dataset/controller';
-import { simpleText } from '@fastgpt/global/common/string/tools';
+import { checkDatasetLimit } from '@fastgpt/service/support/permission/limit/dataset';
+import { predictDataLimitLength } from '@fastgpt/global/core/dataset/utils';
+import { pushDataToDatasetCollection } from '@/service/core/dataset/data/controller';

 export default withNextCors(async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
   try {
     await connectToDatabase();
-    const { collectionId, data, mode = TrainingModeEnum.chunk } = req.body as PushDatasetDataProps;
+    const { collectionId, data } = req.body as PushDatasetDataProps;

     if (!collectionId || !Array.isArray(data)) {
       throw new Error('collectionId or data is empty');
     }

-    if (!TrainingTypeMap[mode]) {
-      throw new Error(`Mode is not ${Object.keys(TrainingTypeMap).join(', ')}`);
-    }
-
     if (data.length > 200) {
       throw new Error('Data is too long, max 200');
     }

     // credential check
-    const { teamId, tmbId } = await authDatasetCollection({
+    const { teamId, tmbId, collection } = await authDatasetCollection({
       req,
       authToken: true,
       authApiKey: true,
@@ -41,6 +33,13 @@ export default withNextCors(async function handler(req: NextApiRequest, res: Nex
       per: 'w'
     });

+    // auth dataset limit
+    await checkDatasetLimit({
+      teamId,
+      freeSize: global.feConfigs?.subscription?.datasetStoreFreeSize,
+      insertLen: predictDataLimitLength(collection.trainingType, data)
+    });
+
     jsonRes<PushDataResponse>(res, {
       data: await pushDataToDatasetCollection({
         ...req.body,
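Callers of the push endpoint no longer send a mode field: the training mode is taken from the collection itself (collection.trainingType, fetched during auth), and the quota check sizes the request with predictDataLimitLength before anything is queued. A hedged request-body sketch under the new contract:

// Hypothetical push body (no `mode` field anymore); ids are placeholders.
const pushBody = {
  collectionId: 'someCollectionId',
  data: [
    { q: 'What is FastGPT?', a: 'An LLM knowledge-base platform.', chunkIndex: 0 }
    // at most 200 items per request, per the length check above
  ]
};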
@@ -56,141 +55,6 @@ export default withNextCors(async function handler(req: NextApiRequest, res: Nex
   }
 });

-export async function pushDataToDatasetCollection({
-  teamId,
-  tmbId,
-  collectionId,
-  data,
-  mode,
-  prompt,
-  billId
-}: {
-  teamId: string;
-  tmbId: string;
-} & PushDatasetDataProps): Promise<PushDataResponse> {
-  const { datasetId, model, maxToken, weight } = await checkModelValid({
-    mode,
-    collectionId
-  });
-
-  // format q and a, remove empty char
-  data.forEach((item) => {
-    item.q = simpleText(item.q);
-    item.a = simpleText(item.a);
-
-    item.indexes = item.indexes
-      ?.map((index) => {
-        return {
-          ...index,
-          text: simpleText(index.text)
-        };
-      })
-      .filter(Boolean);
-  });
-
-  // filter repeat or equal content
-  const set = new Set();
-  const filterResult: Record<string, PushDatasetDataChunkProps[]> = {
-    success: [],
-    overToken: [],
-    repeat: [],
-    error: []
-  };
-
-  data.forEach((item) => {
-    if (!item.q) {
-      filterResult.error.push(item);
-      return;
-    }
-
-    const text = item.q + item.a;
-
-    // count q token
-    const token = countPromptTokens(item.q);
-
-    if (token > maxToken) {
-      filterResult.overToken.push(item);
-      return;
-    }
-
-    if (set.has(text)) {
-      console.log('repeat', item);
-      filterResult.repeat.push(item);
-    } else {
-      filterResult.success.push(item);
-      set.add(text);
-    }
-  });
-
-  // insert records
-  const insertRes = await MongoDatasetTraining.insertMany(
-    filterResult.success.map((item, i) => ({
-      teamId,
-      tmbId,
-      datasetId,
-      collectionId,
-      billId,
-      mode,
-      prompt,
-      model,
-      q: item.q,
-      a: item.a,
-      chunkIndex: item.chunkIndex ?? i,
-      weight: weight ?? 0,
-      indexes: item.indexes
-    }))
-  );
-
-  insertRes.length > 0 && startQueue();
-  delete filterResult.success;
-
-  return {
-    insertLen: insertRes.length,
-    ...filterResult
-  };
-}
-
-export async function checkModelValid({
-  mode,
-  collectionId
-}: {
-  mode: `${TrainingModeEnum}`;
-  collectionId: string;
-}) {
-  const {
-    datasetId: { _id: datasetId, vectorModel, agentModel }
-  } = await getCollectionWithDataset(collectionId);
-
-  if (mode === TrainingModeEnum.chunk) {
-    if (!collectionId) return Promise.reject(`CollectionId is empty`);
-    const vectorModelData = getVectorModel(vectorModel);
-    if (!vectorModelData) {
-      return Promise.reject(`Model ${vectorModel} is inValid`);
-    }
-
-    return {
-      datasetId,
-      maxToken: vectorModelData.maxToken * 1.5,
-      model: vectorModelData.model,
-      weight: vectorModelData.weight
-    };
-  }
-
-  if (mode === TrainingModeEnum.qa) {
-    const qaModelData = getQAModel(agentModel);
-    if (!qaModelData) {
-      return Promise.reject(`Model ${agentModel} is inValid`);
-    }
-    return {
-      datasetId,
-      maxToken: qaModelData.maxContext * 0.8,
-      model: qaModelData.model,
-      weight: 0
-    };
-  }
-  return Promise.reject(`Mode ${mode} is inValid`);
-}

 export const config = {
   api: {
     bodyParser: {
@@ -11,7 +11,7 @@ import { UpdateDatasetDataProps } from '@/global/core/dataset/api';
 export default withNextCors(async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
   try {
     await connectToDatabase();
-    const { id, q = '', a, indexes } = req.body as UpdateDatasetDataProps;
+    const { id, q = '', a, indexes = [] } = req.body as UpdateDatasetDataProps;

     // auth data permission
     const {
@@ -23,6 +23,7 @@ export default withNextCors(async function handler(req: NextApiRequest, res: Nex
     } = await authDatasetData({
       req,
       authToken: true,
+      authApiKey: true,
       dataId: id,
       per: 'w'
     });
@@ -20,6 +20,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
     const { dataset, canWrite, isOwner } = await authDataset({
       req,
       authToken: true,
+      authApiKey: true,
       datasetId,
       per: 'r'
     });
@@ -15,7 +15,8 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
   // credential check
   const { teamId, tmbId, teamOwner, role, canWrite } = await authUserRole({
     req,
-    authToken: true
+    authToken: true,
+    authApiKey: true
   });

   const datasets = await MongoDataset.find({
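This last hunk completes the running theme of the PR: nearly every dataset-facing route now passes authApiKey: true, so the same endpoints work with an API key where previously only a logged-in session token was accepted. For example (route path assumed, not shown in this diff):

// Hypothetical sketch: listing datasets with an API key instead of a session cookie.
const res = await fetch('/api/core/dataset/list', {
  headers: { Authorization: 'Bearer YOUR_API_KEY' }
});
const { data: datasets } = await res.json();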