4.6.7-alpha commit (#743)
Co-authored-by: Archer <545436317@qq.com> Co-authored-by: heheer <71265218+newfish-cmyk@users.noreply.github.com>
This commit is contained in:
@@ -1,6 +1,20 @@
|
||||
import { TrainingModeEnum, DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
|
||||
import {
|
||||
TrainingModeEnum,
|
||||
DatasetCollectionTypeEnum
|
||||
} from '@fastgpt/global/core/dataset/constants';
|
||||
import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
|
||||
import { MongoDatasetCollection } from './schema';
|
||||
import {
|
||||
CollectionWithDatasetType,
|
||||
DatasetCollectionSchemaType
|
||||
} from '@fastgpt/global/core/dataset/type';
|
||||
import { MongoDatasetTraining } from '../training/schema';
|
||||
import { delay } from '@fastgpt/global/common/system/utils';
|
||||
import { MongoDatasetData } from '../data/schema';
|
||||
import { delImgByRelatedId } from '../../../common/file/image/controller';
|
||||
import { deleteDatasetDataVector } from '../../../common/vectorStore/controller';
|
||||
import { delFileByFileIdList } from '../../../common/file/gridfs/controller';
|
||||
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
|
||||
|
||||
export async function createOneCollection({
|
||||
teamId,
|
||||
@@ -85,20 +99,50 @@ export function createDefaultCollection({
|
||||
});
|
||||
}
|
||||
|
||||
// check same collection
|
||||
export const getSameRawTextCollection = async ({
|
||||
datasetId,
|
||||
hashRawText
|
||||
/**
|
||||
* delete collection and its related data
|
||||
*/
|
||||
export async function delCollectionAndRelatedSources({
|
||||
collections
|
||||
}: {
|
||||
datasetId: string;
|
||||
hashRawText?: string;
|
||||
}) => {
|
||||
if (!hashRawText) return undefined;
|
||||
collections: (CollectionWithDatasetType | DatasetCollectionSchemaType)[];
|
||||
}) {
|
||||
if (collections.length === 0) return;
|
||||
|
||||
const collection = await MongoDatasetCollection.findOne({
|
||||
datasetId,
|
||||
hashRawText
|
||||
const teamId = collections[0].teamId;
|
||||
|
||||
if (!teamId) return Promise.reject('teamId is not exist');
|
||||
|
||||
const collectionIds = collections.map((item) => String(item._id));
|
||||
const fileIdList = collections.map((item) => item?.fileId || '').filter(Boolean);
|
||||
const relatedImageIds = collections
|
||||
.map((item) => item?.metadata?.relatedImgId || '')
|
||||
.filter(Boolean);
|
||||
|
||||
// delete training data
|
||||
await MongoDatasetTraining.deleteMany({
|
||||
teamId,
|
||||
collectionId: { $in: collectionIds }
|
||||
});
|
||||
|
||||
return collection;
|
||||
};
|
||||
await delay(2000);
|
||||
|
||||
// delete dataset.datas
|
||||
await MongoDatasetData.deleteMany({ teamId, collectionId: { $in: collectionIds } });
|
||||
// delete pg data
|
||||
await deleteDatasetDataVector({ teamId, collectionIds });
|
||||
|
||||
// delete file and imgs
|
||||
await Promise.all([
|
||||
delImgByRelatedId(relatedImageIds),
|
||||
delFileByFileIdList({
|
||||
bucketName: BucketNameEnum.dataset,
|
||||
fileIdList
|
||||
})
|
||||
]);
|
||||
|
||||
// delete collections
|
||||
await MongoDatasetCollection.deleteMany({
|
||||
_id: { $in: collectionIds }
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import { connectionMongo, type Model } from '../../../common/mongo';
|
||||
const { Schema, model, models } = connectionMongo;
|
||||
import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
|
||||
import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constant';
|
||||
import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constants';
|
||||
import { DatasetCollectionName } from '../schema';
|
||||
import {
|
||||
TeamCollectionName,
|
||||
@@ -91,11 +91,19 @@ const DatasetCollectionSchema = new Schema({
|
||||
});
|
||||
|
||||
try {
|
||||
DatasetCollectionSchema.index({ teamId: 1 });
|
||||
DatasetCollectionSchema.index({ datasetId: 1 });
|
||||
DatasetCollectionSchema.index({ teamId: 1, datasetId: 1, parentId: 1 });
|
||||
DatasetCollectionSchema.index({ updateTime: -1 });
|
||||
DatasetCollectionSchema.index({ hashRawText: -1 });
|
||||
// auth file
|
||||
DatasetCollectionSchema.index({ teamId: 1, fileId: 1 }, { background: true });
|
||||
|
||||
// list collection; deep find collections
|
||||
DatasetCollectionSchema.index(
|
||||
{
|
||||
teamId: 1,
|
||||
datasetId: 1,
|
||||
parentId: 1,
|
||||
updateTime: -1
|
||||
},
|
||||
{ background: true }
|
||||
);
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
}
|
||||
|
||||
@@ -4,16 +4,32 @@ import type { ParentTreePathItemType } from '@fastgpt/global/common/parentFolder
|
||||
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
|
||||
import { MongoDatasetTraining } from '../training/schema';
|
||||
import { urlsFetch } from '../../../common/string/cheerio';
|
||||
import { DatasetCollectionTypeEnum, TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';
|
||||
import {
|
||||
DatasetCollectionTypeEnum,
|
||||
TrainingModeEnum
|
||||
} from '@fastgpt/global/core/dataset/constants';
|
||||
import { hashStr } from '@fastgpt/global/common/string/tools';
|
||||
|
||||
/**
|
||||
* get all collections by top collectionId
|
||||
*/
|
||||
export async function findCollectionAndChild(id: string, fields = '_id parentId name metadata') {
|
||||
export async function findCollectionAndChild({
|
||||
teamId,
|
||||
datasetId,
|
||||
collectionId,
|
||||
fields = '_id parentId name metadata'
|
||||
}: {
|
||||
teamId: string;
|
||||
datasetId: string;
|
||||
collectionId: string;
|
||||
fields?: string;
|
||||
}) {
|
||||
async function find(id: string) {
|
||||
// find children
|
||||
const children = await MongoDatasetCollection.find({ parentId: id }, fields);
|
||||
const children = await MongoDatasetCollection.find(
|
||||
{ teamId, datasetId, parentId: id },
|
||||
fields
|
||||
).lean();
|
||||
|
||||
let collections = children;
|
||||
|
||||
@@ -25,8 +41,8 @@ export async function findCollectionAndChild(id: string, fields = '_id parentId
|
||||
return collections;
|
||||
}
|
||||
const [collection, childCollections] = await Promise.all([
|
||||
MongoDatasetCollection.findById(id, fields),
|
||||
find(id)
|
||||
MongoDatasetCollection.findById(collectionId, fields),
|
||||
find(collectionId)
|
||||
]);
|
||||
|
||||
if (!collection) {
|
||||
@@ -107,8 +123,8 @@ export const getCollectionAndRawText = async ({
|
||||
});
|
||||
|
||||
return {
|
||||
title: result[0].title,
|
||||
rawText: result[0].content
|
||||
title: result[0]?.title,
|
||||
rawText: result[0]?.content
|
||||
};
|
||||
}
|
||||
|
||||
@@ -121,7 +137,7 @@ export const getCollectionAndRawText = async ({
|
||||
})();
|
||||
|
||||
const hashRawText = hashStr(rawText);
|
||||
const isSameRawText = col.hashRawText === hashRawText;
|
||||
const isSameRawText = rawText && col.hashRawText === hashRawText;
|
||||
|
||||
return {
|
||||
collection: col,
|
||||
@@ -161,8 +177,7 @@ export const reloadCollectionChunks = async ({
|
||||
// split data
|
||||
const { chunks } = splitText2Chunks({
|
||||
text: newRawText,
|
||||
chunkLen: col.chunkSize || 512,
|
||||
countTokens: false
|
||||
chunkLen: col.chunkSize || 512
|
||||
});
|
||||
|
||||
// insert to training queue
|
||||
|
||||
@@ -1,24 +1,47 @@
|
||||
import { CollectionWithDatasetType } from '@fastgpt/global/core/dataset/type';
|
||||
import { CollectionWithDatasetType, DatasetSchemaType } from '@fastgpt/global/core/dataset/type';
|
||||
import { MongoDatasetCollection } from './collection/schema';
|
||||
import { MongoDataset } from './schema';
|
||||
import { delCollectionAndRelatedSources } from './collection/controller';
|
||||
|
||||
/* ============= dataset ========== */
|
||||
/* find all datasetId by top datasetId */
|
||||
export async function findDatasetIdTreeByTopDatasetId(
|
||||
id: string,
|
||||
result: string[] = []
|
||||
): Promise<string[]> {
|
||||
let allChildrenIds = [...result];
|
||||
export async function findDatasetAndAllChildren({
|
||||
teamId,
|
||||
datasetId,
|
||||
fields
|
||||
}: {
|
||||
teamId: string;
|
||||
datasetId: string;
|
||||
fields?: string;
|
||||
}): Promise<DatasetSchemaType[]> {
|
||||
const find = async (id: string) => {
|
||||
const children = await MongoDataset.find(
|
||||
{
|
||||
teamId,
|
||||
parentId: id
|
||||
},
|
||||
fields
|
||||
).lean();
|
||||
|
||||
// find children
|
||||
const children = await MongoDataset.find({ parentId: id });
|
||||
let datasets = children;
|
||||
|
||||
for (const child of children) {
|
||||
const grandChildrenIds = await findDatasetIdTreeByTopDatasetId(child._id, result);
|
||||
allChildrenIds = allChildrenIds.concat(grandChildrenIds);
|
||||
for (const child of children) {
|
||||
const grandChildrenIds = await find(child._id);
|
||||
datasets = datasets.concat(grandChildrenIds);
|
||||
}
|
||||
|
||||
return datasets;
|
||||
};
|
||||
const [dataset, childDatasets] = await Promise.all([
|
||||
MongoDataset.findById(datasetId),
|
||||
find(datasetId)
|
||||
]);
|
||||
|
||||
if (!dataset) {
|
||||
return Promise.reject('Dataset not found');
|
||||
}
|
||||
|
||||
return [String(id), ...allChildrenIds];
|
||||
return [dataset, ...childDatasets];
|
||||
}
|
||||
|
||||
export async function getCollectionWithDataset(collectionId: string) {
|
||||
@@ -30,3 +53,22 @@ export async function getCollectionWithDataset(collectionId: string) {
|
||||
}
|
||||
return data;
|
||||
}
|
||||
|
||||
/* delete all data by datasetIds */
|
||||
export async function delDatasetRelevantData({ datasets }: { datasets: DatasetSchemaType[] }) {
|
||||
if (!datasets.length) return;
|
||||
|
||||
const teamId = datasets[0].teamId;
|
||||
const datasetIds = datasets.map((item) => String(item._id));
|
||||
|
||||
// Get _id, teamId, fileId, metadata.relatedImgId for all collections
|
||||
const collections = await MongoDatasetCollection.find(
|
||||
{
|
||||
teamId,
|
||||
datasetId: { $in: datasetIds }
|
||||
},
|
||||
'_id teamId fileId metadata'
|
||||
).lean();
|
||||
|
||||
await delCollectionAndRelatedSources({ collections });
|
||||
}
|
||||
|
||||
@@ -1,87 +1,2 @@
|
||||
import { MongoDatasetData } from './schema';
|
||||
import { MongoDatasetTraining } from '../training/schema';
|
||||
import { delFileByFileIdList, delFileByMetadata } from '../../../common/file/gridfs/controller';
|
||||
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
|
||||
import { MongoDatasetCollection } from '../collection/schema';
|
||||
import { delay } from '@fastgpt/global/common/system/utils';
|
||||
import { delImgByFileIdList } from '../../../common/file/image/controller';
|
||||
import { deleteDatasetDataVector } from '../../../common/vectorStore/controller';
|
||||
|
||||
/* delete all data by datasetIds */
|
||||
export async function delDatasetRelevantData({ datasetIds }: { datasetIds: string[] }) {
|
||||
datasetIds = datasetIds.map((item) => String(item));
|
||||
|
||||
// delete training data(There could be a training mission)
|
||||
await MongoDatasetTraining.deleteMany({
|
||||
datasetId: { $in: datasetIds }
|
||||
});
|
||||
|
||||
await delay(2000);
|
||||
|
||||
// delete dataset.datas
|
||||
await MongoDatasetData.deleteMany({ datasetId: { $in: datasetIds } });
|
||||
// delete pg data
|
||||
await deleteDatasetDataVector({ datasetIds });
|
||||
|
||||
// delete collections
|
||||
await MongoDatasetCollection.deleteMany({
|
||||
datasetId: { $in: datasetIds }
|
||||
});
|
||||
|
||||
// delete related files
|
||||
await Promise.all(
|
||||
datasetIds.map((id) => delFileByMetadata({ bucketName: BucketNameEnum.dataset, datasetId: id }))
|
||||
);
|
||||
}
|
||||
/**
|
||||
* delete all data by collectionIds
|
||||
*/
|
||||
export async function delCollectionRelevantData({
|
||||
collectionIds,
|
||||
fileIds
|
||||
}: {
|
||||
collectionIds: string[];
|
||||
fileIds: string[];
|
||||
}) {
|
||||
collectionIds = collectionIds.filter(Boolean).map((item) => String(item));
|
||||
const filterFileIds = fileIds.filter(Boolean).map((item) => String(item));
|
||||
|
||||
// delete training data
|
||||
await MongoDatasetTraining.deleteMany({
|
||||
collectionId: { $in: collectionIds }
|
||||
});
|
||||
|
||||
await delay(2000);
|
||||
|
||||
// delete dataset.datas
|
||||
await MongoDatasetData.deleteMany({ collectionId: { $in: collectionIds } });
|
||||
// delete pg data
|
||||
await deleteDatasetDataVector({ collectionIds });
|
||||
|
||||
// delete collections
|
||||
await MongoDatasetCollection.deleteMany({
|
||||
_id: { $in: collectionIds }
|
||||
});
|
||||
|
||||
// delete file and imgs
|
||||
await Promise.all([
|
||||
delImgByFileIdList(filterFileIds),
|
||||
delFileByFileIdList({
|
||||
bucketName: BucketNameEnum.dataset,
|
||||
fileIdList: filterFileIds
|
||||
})
|
||||
]);
|
||||
}
|
||||
/**
|
||||
* delete one data by mongoDataId
|
||||
*/
|
||||
export async function delDatasetDataByDataId({
|
||||
collectionId,
|
||||
mongoDataId
|
||||
}: {
|
||||
collectionId: string;
|
||||
mongoDataId: string;
|
||||
}) {
|
||||
await deleteDatasetDataVector({ collectionId, dataIds: [mongoDataId] });
|
||||
await MongoDatasetData.findByIdAndDelete(mongoDataId);
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ import { DatasetColCollectionName } from '../collection/schema';
|
||||
import {
|
||||
DatasetDataIndexTypeEnum,
|
||||
DatasetDataIndexTypeMap
|
||||
} from '@fastgpt/global/core/dataset/constant';
|
||||
} from '@fastgpt/global/core/dataset/constants';
|
||||
|
||||
export const DatasetDataCollectionName = 'dataset.datas';
|
||||
|
||||
@@ -71,6 +71,7 @@ const DatasetDataSchema = new Schema({
|
||||
],
|
||||
default: []
|
||||
},
|
||||
|
||||
updateTime: {
|
||||
type: Date,
|
||||
default: () => new Date()
|
||||
@@ -85,13 +86,18 @@ const DatasetDataSchema = new Schema({
|
||||
});
|
||||
|
||||
try {
|
||||
DatasetDataSchema.index({ teamId: 1 });
|
||||
DatasetDataSchema.index({ datasetId: 1 });
|
||||
DatasetDataSchema.index({ collectionId: 1 });
|
||||
DatasetDataSchema.index({ updateTime: -1 });
|
||||
DatasetDataSchema.index({ collectionId: 1, q: 1, a: 1 });
|
||||
// same data check
|
||||
DatasetDataSchema.index({ teamId: 1, collectionId: 1, q: 1, a: 1 }, { background: true });
|
||||
// list collection and count data; list data
|
||||
DatasetDataSchema.index(
|
||||
{ teamId: 1, datasetId: 1, collectionId: 1, chunkIndex: 1, updateTime: -1 },
|
||||
{ background: true }
|
||||
);
|
||||
// full text index
|
||||
DatasetDataSchema.index({ datasetId: 1, fullTextToken: 'text' });
|
||||
DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' }, { background: true });
|
||||
// Recall vectors after data matching
|
||||
DatasetDataSchema.index({ teamId: 1, datasetId: 1, 'indexes.dataId': 1 }, { background: true });
|
||||
DatasetDataSchema.index({ updateTime: 1 }, { background: true });
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ import {
|
||||
DatasetStatusEnum,
|
||||
DatasetStatusMap,
|
||||
DatasetTypeMap
|
||||
} from '@fastgpt/global/core/dataset/constant';
|
||||
} from '@fastgpt/global/core/dataset/constants';
|
||||
import {
|
||||
TeamCollectionName,
|
||||
TeamMemberCollectionName
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
import { connectionMongo, type Model } from '../../../common/mongo';
|
||||
const { Schema, model, models } = connectionMongo;
|
||||
import { DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type';
|
||||
import { DatasetDataIndexTypeMap, TrainingTypeMap } from '@fastgpt/global/core/dataset/constant';
|
||||
import { DatasetDataIndexTypeMap, TrainingTypeMap } from '@fastgpt/global/core/dataset/constants';
|
||||
import { DatasetColCollectionName } from '../collection/schema';
|
||||
import { DatasetCollectionName } from '../schema';
|
||||
import {
|
||||
@@ -102,11 +102,11 @@ const TrainingDataSchema = new Schema({
|
||||
});
|
||||
|
||||
try {
|
||||
TrainingDataSchema.index({ teamId: 1 });
|
||||
// lock training data; delete training data
|
||||
TrainingDataSchema.index({ teamId: 1, collectionId: 1 });
|
||||
// get training data and sort
|
||||
TrainingDataSchema.index({ weight: -1 });
|
||||
TrainingDataSchema.index({ lockTime: 1 });
|
||||
TrainingDataSchema.index({ datasetId: 1 });
|
||||
TrainingDataSchema.index({ collectionId: 1 });
|
||||
TrainingDataSchema.index({ expireAt: 1 }, { expireAfterSeconds: 7 * 24 * 60 });
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
|
||||
Reference in New Issue
Block a user