Change embedding (#1463)

* rebuild embedding queue

* dataset menu

* feat: rebuild data api

* feat: ui change embedding model

* dataset ui

* feat: rebuild index ui

* rename collection
This commit is contained in:
Archer
2024-05-13 14:51:42 +08:00
committed by GitHub
parent 59fd94384d
commit 80a84a5733
37 changed files with 1260 additions and 419 deletions

View File

@@ -0,0 +1,178 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { authCert } from '@fastgpt/service/support/permission/auth/common';
import { PgClient } from '@fastgpt/service/common/vectorStore/pg';
import { NextAPI } from '@/service/middle/entry';
import { PgDatasetTableName } from '@fastgpt/global/common/vectorStore/constants';
import { connectionMongo } from '@fastgpt/service/common/mongo';
import { addLog } from '@fastgpt/service/common/system/log';
/* Rename legacy dot-named Mongo collections (e.g. "dataset.trainings")
 * to their underscore-named replacements (e.g. "dataset_trainings").
 * Requires root auth; each rename is best-effort and only logs on failure. */
async function handler(req: NextApiRequest, res: NextApiResponse) {
  await authCert({ req, authRoot: true });

  /* Rename collection `from` -> `to` when `from` exists.
   * If the target already holds documents the rename is skipped and a
   * manual-migration notice is logged, so existing data is never dropped. */
  const renameCollection = async (from: string, to: string) => {
    try {
      const db = connectionMongo.connection.db;
      const collections = await db.listCollections({ name: from }).toArray();
      if (collections.length === 0) return;

      const sourceCol = db.collection(from);
      const targetCol = db.collection(to);

      if ((await targetCol.countDocuments()) > 0) {
        // Target is non-empty: refuse to overwrite, ask for manual migration.
        console.log(`${to} 中有数据,无法自动将 ${from} 迁移到 ${to}请手动操作`);
        return;
      }

      await sourceCol.rename(to, { dropTarget: true });
      console.log(`success rename ${from} -> ${to}`);
    } catch (error) {
      console.log(`error rename ${from} -> ${to}`, error);
    }
  };

  await renameCollection('dataset.trainings', 'dataset_trainings');
  await renameCollection('dataset.collections', 'dataset_collections');
  await renameCollection('dataset.datas', 'dataset_datas');
  await renameCollection('app.versions', 'app_versions');
  await renameCollection('buffer.rawtexts', 'buffer_rawtexts');
  await renameCollection('buffer.tts', 'buffer_tts');
  // NOTE(review): the original renamed team.members with dropTarget and no
  // non-empty guard, which could drop a populated team_members collection;
  // the shared helper now applies the same guard to it as to every other one.
  await renameCollection('team.members', 'team_members');
  await renameCollection('team.tags', 'team_tags');

  jsonRes(res, {
    message: 'success'
  });
}
export default NextAPI(handler);

View File

@@ -0,0 +1,39 @@
import type { ApiRequestProps, ApiResponseType } from '@fastgpt/service/type/next';
import { NextAPI } from '@/service/middle/entry';
import { authDataset } from '@fastgpt/service/support/permission/auth/dataset';
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';
export type getDatasetTrainingQueueResponse = {
  /** number of dataset.data documents currently flagged rebuilding: true */
  rebuildingCount: number;
  /** number of pending training tasks queued for this dataset */
  trainingCount: number;
};

/**
 * Return the rebuild/training queue sizes for a dataset.
 * Requires read permission on the dataset (token or API-key auth).
 */
async function handler(
  req: ApiRequestProps<any, { datasetId: string }>,
  res: ApiResponseType<any>
): Promise<getDatasetTrainingQueueResponse> {
  const { datasetId } = req.query;

  // Authorize the caller and resolve their team scope for the queries below.
  const { teamId } = await authDataset({
    req,
    authToken: true,
    authApiKey: true,
    datasetId,
    per: 'r'
  });

  // The two counts are independent — run them in parallel.
  const [rebuildingCount, trainingCount] = await Promise.all([
    MongoDatasetData.countDocuments({ teamId, datasetId, rebuilding: true }),
    MongoDatasetTraining.countDocuments({ teamId, datasetId })
  ]);

  return {
    rebuildingCount,
    trainingCount
  };
}
export default NextAPI(handler);

View File

@@ -0,0 +1,133 @@
import { NextAPI } from '@/service/middle/entry';
import { authDataset } from '@fastgpt/service/support/permission/auth/dataset';
import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
import { MongoDataset } from '@fastgpt/service/core/dataset/schema';
import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';
import { createTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller';
import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants';
import { getLLMModel, getVectorModel } from '@fastgpt/service/core/ai/model';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { ApiRequestProps, ApiResponseType } from '@fastgpt/service/type/next';
export type rebuildEmbeddingBody = {
  datasetId: string;
  vectorModel: string;
};
export type Response = {};

// How many flagged dataset.data rows are seeded into the training queue up
// front; the training workers pull the remaining rebuilding rows themselves.
const INIT_TRAINING_COUNT = 10;

/**
 * Switch a dataset to a new vector (embedding) model and start a full rebuild:
 * flags every dataset.data row with `rebuilding: true`, then seeds the first
 * batch of training tasks. Owner permission is required; the request is
 * rejected while a rebuild or training run is already in flight.
 */
async function handler(
  req: ApiRequestProps<rebuildEmbeddingBody>,
  res: ApiResponseType<any>
): Promise<Response> {
  const { datasetId, vectorModel } = req.body;

  const { teamId, tmbId, dataset } = await authDataset({
    req,
    authToken: true,
    authApiKey: true,
    datasetId,
    per: 'owner'
  });

  // check vector model: must be provided and actually different from current
  if (!vectorModel || dataset.vectorModel === vectorModel) {
    return Promise.reject('vectorModel 不合法');
  }

  // check rebuilding or training: refuse to start a second concurrent run
  const [rebuilding, training] = await Promise.all([
    MongoDatasetData.findOne({ teamId, datasetId, rebuilding: true }),
    MongoDatasetTraining.findOne({ teamId, datasetId })
  ]);
  if (rebuilding || training) {
    return Promise.reject('数据集正在训练或者重建中,请稍后再试');
  }

  // Create one usage/billing record covering the whole rebuild.
  const { billId } = await createTrainingUsage({
    teamId,
    tmbId,
    appName: '切换索引模型',
    billSource: UsageSourceEnum.training,
    vectorModel: getVectorModel(dataset.vectorModel)?.name,
    agentModel: getLLMModel(dataset.agentModel)?.name
  });

  // Atomically switch the dataset's model and flag all its data for rebuild.
  await mongoSessionRun(async (session) => {
    await MongoDataset.findByIdAndUpdate(datasetId, { vectorModel }, { session });
    await MongoDatasetData.updateMany(
      { teamId, datasetId },
      { $set: { rebuilding: true } },
      { session }
    );
  });

  // Seed the initial batch: pop up to INIT_TRAINING_COUNT flagged rows (clearing
  // their flag) and enqueue a chunk-mode training task for each, one transaction
  // per row so a partial failure leaves the rest of the queue consistent.
  for (let i = 0; i < INIT_TRAINING_COUNT; i++) {
    await mongoSessionRun(async (session) => {
      const data = await MongoDatasetData.findOneAndUpdate(
        {
          teamId,
          datasetId,
          rebuilding: true
        },
        {
          $unset: { rebuilding: null },
          // Explicit $set; the original relied on Mongoose folding the bare
          // top-level field into $set alongside the $unset operator.
          $set: { updateTime: new Date() }
        },
        { session }
      ).select({
        _id: 1,
        collectionId: 1
      });

      if (data) {
        await MongoDatasetTraining.create(
          [
            {
              teamId,
              tmbId,
              datasetId,
              collectionId: data.collectionId,
              billId,
              mode: TrainingModeEnum.chunk,
              model: vectorModel,
              q: '1',
              dataId: data._id
            }
          ],
          { session }
        );
      }
    });
  }

  return {};
}
export default NextAPI(handler);