Feat: Images dataset collection (#4941)

* New pic (#4858)

* 更新数据集相关类型,添加图像文件ID和预览URL支持;优化数据集导入功能,新增图像数据集处理组件;修复部分国际化文本;更新文件上传逻辑以支持新功能。

* 与原先代码的差别

* 新增 V4.9.10 更新说明,支持 PG 设置`systemEnv.hnswMaxScanTuples`参数,优化 LLM stream 调用超时,修复全文检索多知识库排序问题。同时更新数据集索引,移除 datasetId 字段以简化查询。

* 更换成fileId_image逻辑,并增加训练队列匹配的逻辑

* 新增图片集合判断逻辑,优化预览URL生成流程,确保仅在数据集为图片集合时生成预览URL,并添加相关日志输出以便调试。

* Refactor Docker Compose configuration to comment out exposed ports for production environments, update image versions for pgvector, fastgpt, and mcp_server, and enhance Redis service with a health check. Additionally, standardize dataset collection labels in constants and improve internationalization strings across multiple languages.

* Enhance TrainingStates component by adding internationalization support for the imageParse training mode and update defaultCounts to include imageParse mode in trainingDetail API.

* Enhance dataset import context by adding additional steps for image dataset import process and improve internationalization strings for modal buttons in the useEditTitle hook.

* Update DatasetImportContext to conditionally render MyStep component based on data source type, improving the import process for non-image datasets.

* Refactor image dataset handling by improving internationalization strings, enhancing error messages, and streamlining the preview URL generation process.

* 图片上传到新建的 dataset_collection_images 表,逻辑跟随更改

* 修改了除了controller的其他部分问题

* 把图片数据集的逻辑整合到controller里面

* 补充i18n

* 补充i18n

* resolve评论:主要是上传逻辑的更改和组件复用

* 图片名称的图标显示

* 修改编译报错的命名问题

* 删除不需要的collectionid部分

* 多余文件的处理和改动一个删除按钮

* 除了loading和统一的imageId,其他都resolve掉的

* 处理图标报错

* 复用了MyPhotoView并采用全部替换的方式将imageFileId变成imageId

* 去除不必要文件修改

* 报错和字段修改

* 增加上传成功后删除临时文件的逻辑以及回退一些修改

* 删除path字段,将图片保存到gridfs内,并修改增删等操作的代码

* 修正编译错误

---------

Co-authored-by: archer <545436317@qq.com>

* perf: image dataset

* feat: insert image

* perf: image icon

* fix: training state

---------

Co-authored-by: Zhuangzai fa <143257420+ctrlz526@users.noreply.github.com>
This commit is contained in:
Archer
2025-06-03 16:30:59 +08:00
committed by archer
parent 9fb5d05865
commit 92c38d9d2f
104 changed files with 2341 additions and 693 deletions

View File

@@ -1,4 +1,9 @@
import type { ChunkSettingsType, DatasetDataIndexItemType, DatasetSchemaType } from './type';
import type {
ChunkSettingsType,
DatasetDataIndexItemType,
DatasetDataFieldType,
DatasetSchemaType
} from './type';
import type {
DatasetCollectionTypeEnum,
DatasetCollectionDataProcessModeEnum,
@@ -7,8 +12,7 @@ import type {
ChunkTriggerConfigTypeEnum,
ParagraphChunkAIModeEnum
} from './constants';
import type { LLMModelItemType } from '../ai/model.d';
import type { ParentIdType } from 'common/parentFolder/type';
import type { ParentIdType } from '../../common/parentFolder/type';
/* ================= dataset ===================== */
export type DatasetUpdateBody = {
@@ -100,6 +104,9 @@ export type ExternalFileCreateDatasetCollectionParams = ApiCreateDatasetCollecti
externalFileUrl: string;
filename?: string;
};
/**
 * Request parameters for creating an image collection.
 * Combines the shared `ApiCreateDatasetCollectionParams` with a required
 * `collectionName`.
 */
export type ImageCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
// Required here; NOTE(review): presumably the display name of the new collection — confirm against the handler.
collectionName: string;
};
/* ================= tag ===================== */
export type CreateDatasetCollectionTagParams = {
@@ -125,8 +132,9 @@ export type PgSearchRawType = {
score: number;
};
export type PushDatasetDataChunkProps = {
q: string; // embedding content
a?: string; // bonus content
q?: string;
a?: string;
imageId?: string;
chunkIndex?: number;
indexes?: Omit<DatasetDataIndexItemType, 'dataId'>[];
};

View File

@@ -77,7 +77,8 @@ export enum DatasetCollectionTypeEnum {
file = 'file',
link = 'link', // one link
externalFile = 'externalFile',
apiFile = 'apiFile'
apiFile = 'apiFile',
images = 'images'
}
export const DatasetCollectionTypeMap = {
[DatasetCollectionTypeEnum.folder]: {
@@ -97,6 +98,9 @@ export const DatasetCollectionTypeMap = {
},
[DatasetCollectionTypeEnum.apiFile]: {
name: i18nT('common:core.dataset.apiFile')
},
[DatasetCollectionTypeEnum.images]: {
name: i18nT('dataset:core.dataset.Image collection')
}
};
@@ -120,6 +124,7 @@ export const DatasetCollectionSyncResultMap = {
export enum DatasetCollectionDataProcessModeEnum {
chunk = 'chunk',
qa = 'qa',
imageParse = 'imageParse',
backup = 'backup',
auto = 'auto' // abandon
@@ -133,6 +138,10 @@ export const DatasetCollectionDataProcessModeMap = {
label: i18nT('common:core.dataset.training.QA mode'),
tooltip: i18nT('common:core.dataset.import.QA Import Tip')
},
[DatasetCollectionDataProcessModeEnum.imageParse]: {
label: i18nT('dataset:training.Image mode'),
tooltip: i18nT('common:core.dataset.import.Chunk Split Tip')
},
[DatasetCollectionDataProcessModeEnum.backup]: {
label: i18nT('dataset:backup_mode'),
tooltip: i18nT('dataset:backup_mode')
@@ -172,14 +181,16 @@ export enum ImportDataSourceEnum {
fileCustom = 'fileCustom',
externalFile = 'externalFile',
apiDataset = 'apiDataset',
reTraining = 'reTraining'
reTraining = 'reTraining',
imageDataset = 'imageDataset'
}
export enum TrainingModeEnum {
chunk = 'chunk',
qa = 'qa',
auto = 'auto',
image = 'image'
image = 'image',
imageParse = 'imageParse'
}
/* ------------ search -------------- */

View File

@@ -8,17 +8,19 @@ export type CreateDatasetDataProps = {
chunkIndex?: number;
q: string;
a?: string;
imageId?: string;
indexes?: Omit<DatasetDataIndexItemType, 'dataId'>[];
};
export type UpdateDatasetDataProps = {
dataId: string;
q?: string;
q: string;
a?: string;
indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & {
dataId?: string; // pg data id
})[];
imageId?: string;
};
export type PatchIndexesProps =

View File

@@ -0,0 +1,13 @@
/**
 * Shape of a dataset image record.
 * NOTE(review): presumably mirrors the stored document for uploaded dataset
 * images — confirm against the corresponding model definition.
 */
export type DatasetImageSchema = {
_id: string;
teamId: string;
datasetId: string;
// Optional: the type permits an image record with no collection id.
collectionId?: string;
name: string;
// NOTE(review): presumably a MIME type string — confirm.
contentType: string;
// NOTE(review): presumably the file size in bytes — confirm.
size: number;
// Free-form extra data; values are untyped (`any`).
metadata?: Record<string, any>;
// NOTE(review): presumably used to expire images that are never attached to a collection — confirm.
expiredTime?: Date;
createdAt: Date;
updatedAt: Date;
};

View File

@@ -16,6 +16,7 @@ import type { DatasetPermission } from '../../support/permission/dataset/control
import type { APIFileServer, FeishuServer, YuqueServer } from './apiDataset';
import type { SourceMemberType } from 'support/user/type';
import type { DatasetDataIndexTypeEnum } from './data/constants';
import type { ParentIdType } from 'common/parentFolder/type';
export type ChunkSettingsType = {
trainingType?: DatasetCollectionDataProcessModeEnum;
@@ -49,7 +50,7 @@ export type ChunkSettingsType = {
export type DatasetSchemaType = {
_id: string;
parentId?: string;
parentId: ParentIdType;
userId: string;
teamId: string;
tmbId: string;
@@ -132,7 +133,13 @@ export type DatasetDataIndexItemType = {
dataId: string; // pg data id
text: string;
};
export type DatasetDataSchemaType = {
/**
 * Content fields shared by dataset data types. Intersected into
 * `DatasetDataSchemaType`, its `history` entries, and `DatasetDataItemType`
 * so the three stay in sync.
 */
export type DatasetDataFieldType = {
q: string; // large chunks or question
a?: string; // answer or custom content
// NOTE(review): presumably the id of the image associated with this data item — confirm.
imageId?: string;
};
export type DatasetDataSchemaType = DatasetDataFieldType & {
_id: string;
userId: string;
teamId: string;
@@ -141,13 +148,9 @@ export type DatasetDataSchemaType = {
collectionId: string;
chunkIndex: number;
updateTime: Date;
q: string; // large chunks or question
a: string; // answer or custom content
history?: {
q: string;
a: string;
history?: (DatasetDataFieldType & {
updateTime: Date;
}[];
})[];
forbid?: boolean;
fullTextToken: string;
indexes: DatasetDataIndexItemType[];
@@ -179,6 +182,7 @@ export type DatasetTrainingSchemaType = {
dataId?: string;
q: string;
a: string;
imageId?: string;
chunkIndex: number;
indexSize?: number;
weight: number;
@@ -244,20 +248,18 @@ export type DatasetCollectionItemType = CollectionWithDatasetType & {
};
/* ================= data ===================== */
export type DatasetDataItemType = {
export type DatasetDataItemType = DatasetDataFieldType & {
id: string;
teamId: string;
datasetId: string;
imagePreivewUrl?: string;
updateTime: Date;
collectionId: string;
sourceName: string;
sourceId?: string;
q: string;
a: string;
chunkIndex: number;
indexes: DatasetDataIndexItemType[];
isOwner: boolean;
// permission: DatasetPermission;
};
/* --------------- file ---------------------- */

View File

@@ -2,10 +2,15 @@ import { TrainingModeEnum, DatasetCollectionTypeEnum } from './constants';
import { getFileIcon } from '../../common/file/icon';
import { strIsLink } from '../../common/string/tools';
export function getCollectionIcon(
type: DatasetCollectionTypeEnum = DatasetCollectionTypeEnum.file,
name = ''
) {
export function getCollectionIcon({
type = DatasetCollectionTypeEnum.file,
name = '',
sourceId
}: {
type?: DatasetCollectionTypeEnum;
name?: string;
sourceId?: string;
}) {
if (type === DatasetCollectionTypeEnum.folder) {
return 'common/folderFill';
}
@@ -15,7 +20,10 @@ export function getCollectionIcon(
if (type === DatasetCollectionTypeEnum.virtual) {
return 'file/fill/manual';
}
return getFileIcon(name);
if (type === DatasetCollectionTypeEnum.images) {
return 'core/dataset/imageFill';
}
return getSourceNameIcon({ sourceName: name, sourceId });
}
export function getSourceNameIcon({
sourceName,