Feat: Image dataset collection (#4941)

* New pic (#4858)

* Update dataset-related types to add image file ID and preview URL support; improve the dataset import flow with a new image dataset handling component; fix some i18n strings; and update the file upload logic to support the new features.

* Differences from the original code

* Add V4.9.10 release notes: support the PG `systemEnv.hnswMaxScanTuples` setting, optimize LLM stream call timeouts, and fix result ordering when full-text search spans multiple datasets. Also update the dataset indexes, removing the datasetId field to simplify queries.
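
  A note on that setting: `systemEnv.hnswMaxScanTuples` presumably maps onto pgvector's `hnsw.max_scan_tuples` parameter (introduced with pgvector's iterative index scans), which caps how many tuples an HNSW index scan may visit. A minimal sketch of applying such a cap per query with a plain `pg` client; the `dataset_vectors` table and the `searchWithScanCap` helper are illustrative, not FastGPT's actual code:

  import { Pool } from 'pg';

  const pool = new Pool({ connectionString: process.env.PG_URL });

  // Run a nearest-neighbour query with an upper bound on how many tuples the
  // HNSW index scan may visit (requires pgvector >= 0.8.0).
  export async function searchWithScanCap(vector: number[], maxScanTuples = 100000) {
    const client = await pool.connect();
    try {
      await client.query('BEGIN');
      // SET LOCAL applies only to the current transaction.
      await client.query("SET LOCAL hnsw.iterative_scan = 'relaxed_order'");
      await client.query(`SET LOCAL hnsw.max_scan_tuples = ${Math.floor(maxScanTuples)}`);
      const { rows } = await client.query(
        'SELECT id FROM dataset_vectors ORDER BY vector <=> $1 LIMIT 10',
        [`[${vector.join(',')}]`]
      );
      await client.query('COMMIT');
      return rows;
    } catch (err) {
      await client.query('ROLLBACK');
      throw err;
    } finally {
      client.release();
    }
  }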

* Switch to the fileId_image logic and add matching logic for the training queue.

* Add an image-collection check and streamline preview URL generation so that preview URLs are only generated when the collection is an image collection; add related log output for debugging.
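
  As a rough illustration of that guard (hypothetical names and route; the real check lives in the dataset collection code):

  // Only image collections get a preview URL; other collection types return undefined.
  function getImagePreviewUrl(collectionType: string, imageId?: string): string | undefined {
    if (collectionType !== 'images' || !imageId) return undefined;
    return `/api/core/dataset/image/${imageId}`; // hypothetical endpoint
  }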

* Refactor Docker Compose configuration to comment out exposed ports for production environments, update image versions for pgvector, fastgpt, and mcp_server, and enhance Redis service with a health check. Additionally, standardize dataset collection labels in constants and improve internationalization strings across multiple languages.

* Enhance TrainingStates component by adding internationalization support for the imageParse training mode and update defaultCounts to include imageParse mode in trainingDetail API.
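
  In practice this means the training-detail response always reports a counter for the new mode, roughly like the following (field names are illustrative, not the exact API shape):

  const defaultCounts: Record<string, number> = {
    chunk: 0,
    qa: 0,
    auto: 0,
    image: 0,
    imageParse: 0 // the newly reported image-parse training mode
  };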

* Enhance dataset import context by adding additional steps for image dataset import process and improve internationalization strings for modal buttons in the useEditTitle hook.

* Update DatasetImportContext to conditionally render MyStep component based on data source type, improving the import process for non-image datasets.

* Refactor image dataset handling by improving internationalization strings, enhancing error messages, and streamlining the preview URL generation process.

* Upload images to the new dataset_collection_images table and adjust the related logic accordingly.

* Fix issues in everything except the controller.

* Consolidate the image dataset logic into the controller.

* Add missing i18n strings.

* Add missing i18n strings.

* Resolve review comments: mainly upload-logic changes and component reuse.

* Display an icon next to the image name.

* Fix naming issues that caused build errors.

* Remove the unneeded collectionId parts.

* Clean up redundant files and tweak a delete button.

* Resolve everything except the loading state and the unified imageId.

* Fix icon errors.

* Reuse MyPhotoView and rename imageFileId to imageId with a global replace.

* Remove unnecessary file changes.

* Fix errors and update fields.

* Add logic to delete temporary files after a successful upload, and revert some changes.

* Remove the path field, store images in GridFS, and update the create/delete operations accordingly.
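
  Conceptually, the storage change looks like the sketch below, using the MongoDB driver's standard GridFSBucket API; the dataset_image bucket name, the saveDatasetImage helper, and its options are illustrative rather than the repo's actual code (the temp-file cleanup from the previous item is folded in as well):

  import { MongoClient, GridFSBucket, type ObjectId } from 'mongodb';
  import { unlink } from 'fs/promises';

  // Store an uploaded image in GridFS (no on-disk `path` field) and remove the
  // temporary upload file once the write succeeds.
  export async function saveDatasetImage(opts: {
    mongoUri: string;
    tmpFilePath: string;
    buffer: Buffer;
    filename: string;
    metadata: Record<string, unknown>;
  }) {
    const client = await MongoClient.connect(opts.mongoUri);
    try {
      const bucket = new GridFSBucket(client.db(), { bucketName: 'dataset_image' });

      const imageId = await new Promise<ObjectId>((resolve, reject) => {
        const upload = bucket.openUploadStream(opts.filename, { metadata: opts.metadata });
        upload.once('finish', () => resolve(upload.id));
        upload.once('error', reject);
        upload.end(opts.buffer);
      });

      // The temp file is no longer needed once the image is safely in GridFS.
      await unlink(opts.tmpFilePath).catch(() => {});

      return String(imageId); // collections and the training queue reference this id
    } finally {
      await client.close();
    }
  }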

* Fix build errors.

---------

Co-authored-by: archer <545436317@qq.com>

* perf: image dataset

* feat: insert image

* perf: image icon

* fix: training state

---------

Co-authored-by: Zhuangzai fa <143257420+ctrlz526@users.noreply.github.com>
Author: Archer
Date: 2025-06-03 16:30:59 +08:00
Committed by: archer
Parent: 9fb5d05865
Commit: 92c38d9d2f
104 changed files with 2341 additions and 693 deletions


@@ -5,9 +5,10 @@ import {
} from '@fastgpt/global/core/dataset/constants';
import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
import { MongoDatasetCollection } from './schema';
-import {
-  type DatasetCollectionSchemaType,
-  type DatasetSchemaType
+import type {
+  DatasetCollectionSchemaType,
+  DatasetDataFieldType,
+  DatasetSchemaType
} from '@fastgpt/global/core/dataset/type';
import { MongoDatasetTraining } from '../training/schema';
import { MongoDatasetData } from '../data/schema';
@@ -15,7 +16,7 @@ import { delImgByRelatedId } from '../../../common/file/image/controller';
import { deleteDatasetDataVector } from '../../../common/vectorDB/controller';
import { delFileByFileIdList } from '../../../common/file/gridfs/controller';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
-import { type ClientSession } from '../../../common/mongo';
+import type { ClientSession } from '../../../common/mongo';
import { createOrGetCollectionTags } from './utils';
import { rawText2Chunks } from '../read';
import { checkDatasetLimit } from '../../../support/permission/teamLimit';
@@ -38,20 +39,25 @@ import {
  getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
+import { deleteDatasetImage } from '../image/controller';
+import { clearCollectionImages, removeDatasetImageExpiredTime } from '../image/utils';

export const createCollectionAndInsertData = async ({
  dataset,
  rawText,
  relatedId,
+  imageIds,
  createCollectionParams,
  backupParse = false,
  billId,
  session
}: {
  dataset: DatasetSchemaType;
-  rawText: string;
+  rawText?: string;
  relatedId?: string;
+  imageIds?: string[];
  createCollectionParams: CreateOneCollectionParams;
  backupParse?: boolean;
  billId?: string;
@@ -69,13 +75,13 @@ export const createCollectionAndInsertData = async ({
  // Set default params
  const trainingType =
    createCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk;
-  const chunkSize = computeChunkSize({
-    ...createCollectionParams,
-    trainingType,
-    llmModel: getLLMModel(dataset.agentModel)
-  });
  const chunkSplitter = computeChunkSplitter(createCollectionParams);
  const paragraphChunkDeep = computeParagraphChunkDeep(createCollectionParams);
+  const trainingMode = getTrainingModeByCollection({
+    trainingType: trainingType,
+    autoIndexes: createCollectionParams.autoIndexes,
+    imageIndex: createCollectionParams.imageIndex
+  });

  if (
    trainingType === DatasetCollectionDataProcessModeEnum.qa ||
@@ -90,35 +96,60 @@ export const createCollectionAndInsertData = async ({
    delete createCollectionParams.qaPrompt;
  }

-  // 1. split chunks
-  const chunks = rawText2Chunks({
-    rawText,
-    chunkTriggerType: createCollectionParams.chunkTriggerType,
-    chunkTriggerMinSize: createCollectionParams.chunkTriggerMinSize,
-    chunkSize,
-    paragraphChunkDeep,
-    paragraphChunkMinSize: createCollectionParams.paragraphChunkMinSize,
-    maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
-    overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
-    customReg: chunkSplitter ? [chunkSplitter] : [],
-    backupParse
-  });
+  // 1. split chunks or create image chunks
+  const {
+    chunks,
+    chunkSize
+  }: {
+    chunks: Array<{
+      q?: string;
+      a?: string; // answer or custom content
+      imageId?: string;
+      indexes?: string[];
+    }>;
+    chunkSize?: number;
+  } = (() => {
+    if (rawText) {
+      const chunkSize = computeChunkSize({
+        ...createCollectionParams,
+        trainingType,
+        llmModel: getLLMModel(dataset.agentModel)
+      });
+      // Process text chunks
+      const chunks = rawText2Chunks({
+        rawText,
+        chunkTriggerType: createCollectionParams.chunkTriggerType,
+        chunkTriggerMinSize: createCollectionParams.chunkTriggerMinSize,
+        chunkSize,
+        paragraphChunkDeep,
+        paragraphChunkMinSize: createCollectionParams.paragraphChunkMinSize,
+        maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
+        overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
+        customReg: chunkSplitter ? [chunkSplitter] : [],
+        backupParse
+      });
+      return { chunks, chunkSize };
+    }
+    if (imageIds) {
+      // Process image chunks
+      const chunks = imageIds.map((imageId: string) => ({
+        imageId,
+        indexes: []
+      }));
+      return { chunks };
+    }
+    throw new Error('Either rawText or imageIdList must be provided');
+  })();

  // 2. auth limit
  await checkDatasetLimit({
    teamId,
-    insertLen: predictDataLimitLength(
-      getTrainingModeByCollection({
-        trainingType: trainingType,
-        autoIndexes: createCollectionParams.autoIndexes,
-        imageIndex: createCollectionParams.imageIndex
-      }),
-      chunks
-    )
+    insertLen: predictDataLimitLength(trainingMode, chunks)
  });

  const fn = async (session: ClientSession) => {
-    // 3. create collection
+    // 3. Create collection
    const { _id: collectionId } = await createOneCollection({
      ...createCollectionParams,
      trainingType,
@@ -126,8 +157,8 @@ export const createCollectionAndInsertData = async ({
      chunkSize,
      chunkSplitter,
-      hashRawText: hashStr(rawText),
-      rawTextLength: rawText.length,
+      hashRawText: rawText ? hashStr(rawText) : undefined,
+      rawTextLength: rawText?.length,
      nextSyncTime: (() => {
        // ignore auto collections sync for website datasets
        if (!dataset.autoSync && dataset.type === DatasetTypeEnum.websiteDataset) return undefined;
@@ -169,11 +200,7 @@ export const createCollectionAndInsertData = async ({
      vectorModel: dataset.vectorModel,
      vlmModel: dataset.vlmModel,
      indexSize: createCollectionParams.indexSize,
-      mode: getTrainingModeByCollection({
-        trainingType: trainingType,
-        autoIndexes: createCollectionParams.autoIndexes,
-        imageIndex: createCollectionParams.imageIndex
-      }),
+      mode: trainingMode,
      prompt: createCollectionParams.qaPrompt,
      billId: traingBillId,
      data: chunks.map((item, index) => ({
@@ -187,7 +214,12 @@ export const createCollectionAndInsertData = async ({
      session
    });

-    // 6. remove related image ttl
+    // 6. Remove images ttl index
+    await removeDatasetImageExpiredTime({
+      ids: imageIds,
+      collectionId,
+      session
+    });

    if (relatedId) {
      await MongoImage.updateMany(
        {
@@ -207,7 +239,7 @@ export const createCollectionAndInsertData = async ({
    }

    return {
-      collectionId,
+      collectionId: String(collectionId),
      insertResults
    };
  };
@@ -288,17 +320,20 @@ export const delCollectionRelatedSource = async ({
    .map((item) => item?.metadata?.relatedImgId || '')
    .filter(Boolean);

-  // Delete files
-  await delFileByFileIdList({
-    bucketName: BucketNameEnum.dataset,
-    fileIdList
-  });
-  // Delete images
-  await delImgByRelatedId({
-    teamId,
-    relateIds: relatedImageIds,
-    session
-  });
+  // Delete files and images in parallel
+  await Promise.all([
+    // Delete files
+    delFileByFileIdList({
+      bucketName: BucketNameEnum.dataset,
+      fileIdList
+    }),
+    // Delete images
+    delImgByRelatedId({
+      teamId,
+      relateIds: relatedImageIds,
+      session
+    })
+  ]);
};

/**
 * delete collection and it related data
@@ -343,16 +378,16 @@ export async function delCollection({
        datasetId: { $in: datasetIds },
        collectionId: { $in: collectionIds }
      }),
+      // Delete dataset_images
+      clearCollectionImages(collectionIds),
      // Delete images if needed
      ...(delImg
-        ? [
-            delImgByRelatedId({
-              teamId,
-              relateIds: collections
-                .map((item) => item?.metadata?.relatedImgId || '')
-                .filter(Boolean)
-            })
-          ]
+        ? collections
+            .map((item) => item?.metadata?.relatedImgId || '')
+            .filter(Boolean)
+            .map((imageId) => deleteDatasetImage(imageId))
        : []),
      // Delete files if needed
      ...(delFile
        ? [
            delFileByFileIdList({