feat: model config required check;feat: dataset text model default setting (#3866)

* feat: model config required check * feat: dataset text model default setting * perf: collection list count * fix: ts * remove index count
2025-02-24 19:55:49 +08:00
parent 3bfe802c48
commit 255764400f
32 changed files with 356 additions and 192 deletions
--- a/packages/global/core/ai/model.d.ts
+++ b/packages/global/core/ai/model.d.ts
@@ -17,6 +17,8 @@ type BaseModelItemType = {
  isActive?: boolean;
  isCustom?: boolean;
  isDefault?: boolean;
+  isDefaultDatasetTextModel?: boolean;
+  isDefaultDatasetImageModel?: boolean;

  // If has requestUrl, it will request the model directly
  requestUrl?: string;
--- a/packages/global/core/dataset/type.d.ts
+++ b/packages/global/core/dataset/type.d.ts
@@ -192,6 +192,7 @@ export type DatasetCollectionItemType = CollectionWithDatasetType & {
  sourceId?: string;
  file?: DatasetFileSchema;
  permission: DatasetPermission;
+  indexAmount: number;
 };

 /* ================= data ===================== */
--- a/packages/service/common/system/constants.ts
+++ b/packages/service/common/system/constants.ts
@@ -1 +1,4 @@
 export const FastGPTProUrl = process.env.PRO_URL ? `${process.env.PRO_URL}/api` : '';
+export const isFastGPTMainService = !!process.env.PRO_URL; // true when the PRO_URL env var is set (evaluated once at module load)
+// @ts-ignore
+export const isFastGPTProService = () => !!global.systemConfig; // a function, not a constant: re-reads global.systemConfig on every call
--- a/packages/service/common/vectorStore/controller.ts
+++ b/packages/service/common/vectorStore/controller.ts
@@ -21,6 +21,7 @@ export const recallFromVectorStore = Vector.embRecall;
 export const getVectorDataByTime = Vector.getVectorDataByTime;
 export const getVectorCountByTeamId = Vector.getVectorCountByTeamId;
 export const getVectorCountByDatasetId = Vector.getVectorCountByDatasetId;
+export const getVectorCountByCollectionId = Vector.getVectorCountByCollectionId;

 export const insertDatasetDataVector = async ({
  model,
--- a/packages/service/common/vectorStore/milvus/class.ts
+++ b/packages/service/common/vectorStore/milvus/class.ts
@@ -321,6 +321,23 @@ export class MilvusCtrl {

    return total;
  };
+  getVectorCountByCollectionId = async (
+    teamId: string,
+    datasetId: string,
+    collectionId: string
+  ) => { // Count the rows of DatasetVectorTableName that belong to one collection
+    const client = await this.getClient();
+    // Aggregate query: request count(*) over rows whose teamId, datasetId and collectionId all match
+    const result = await client.query({
+      collection_name: DatasetVectorTableName,
+      output_fields: ['count(*)'],
+      filter: `(teamId == "${String(teamId)}") and (datasetId == "${String(datasetId)}") and (collectionId == "${String(collectionId)}")`
+    });
+    // NOTE(review): the optional chaining yields undefined when no row comes back; the cast below does not guard against that
+    const total = result.data?.[0]?.['count(*)'] as number;
+
+    return total;
+  };

  getVectorDataByTime = async (start: Date, end: Date) => {
    const client = await this.getClient();
--- a/packages/service/common/vectorStore/pg/class.ts
+++ b/packages/service/common/vectorStore/pg/class.ts
@@ -240,6 +240,23 @@ export class PgVectorCtrl {
      where: [['team_id', String(teamId)], 'and', ['dataset_id', String(datasetId)]]
    });

+    return total;
+  };
+  getVectorCountByCollectionId = async (
+    teamId: string,
+    datasetId: string,
+    collectionId: string
+  ) => { // Count the rows of DatasetVectorTableName that belong to one collection
+    const total = await PgClient.count(DatasetVectorTableName, {
+      where: [ // three column conditions joined by 'and', so every id must match
+        ['team_id', String(teamId)],
+        'and',
+        ['dataset_id', String(datasetId)],
+        'and',
+        ['collection_id', String(collectionId)] // ids are stringified before being handed to PgClient
+      ]
+    });
+
    return total;
  };
 }
--- a/packages/service/core/ai/config/utils.ts
+++ b/packages/service/core/ai/config/utils.ts
@@ -52,6 +52,12 @@ export const loadSystemModels = async (init = false) => {
        if (model.isDefault) {
          global.systemDefaultModel.llm = model;
        }
+        if (model.isDefaultDatasetTextModel) {
+          global.systemDefaultModel.datasetTextLLM = model;
+        }
+        if (model.isDefaultDatasetImageModel) {
+          global.systemDefaultModel.datasetImageLLM = model;
+        }
      } else if (model.type === ModelTypeEnum.embedding) {
        global.embeddingModelMap.set(model.model, model);
        global.embeddingModelMap.set(model.name, model);
@@ -134,6 +140,16 @@ export const loadSystemModels = async (init = false) => {
    if (!global.systemDefaultModel.llm) {
      global.systemDefaultModel.llm = Array.from(global.llmModelMap.values())[0];
    }
+    if (!global.systemDefaultModel.datasetTextLLM) {
+      global.systemDefaultModel.datasetTextLLM = Array.from(global.llmModelMap.values()).find(
+        (item) => item.datasetProcess
+      );
+    }
+    if (!global.systemDefaultModel.datasetImageLLM) {
+      global.systemDefaultModel.datasetImageLLM = Array.from(global.llmModelMap.values()).find(
+        (item) => item.vision
+      );
+    }
    if (!global.systemDefaultModel.embedding) {
      global.systemDefaultModel.embedding = Array.from(global.embeddingModelMap.values())[0];
    }
--- a/packages/service/core/ai/type.d.ts
+++ b/packages/service/core/ai/type.d.ts
@@ -22,6 +22,9 @@ export type SystemModelItemType =

 export type SystemDefaultModelType = {
  [ModelTypeEnum.llm]?: LLMModelItemType;
+  datasetTextLLM?: LLMModelItemType;
+  datasetImageLLM?: LLMModelItemType;
+
  [ModelTypeEnum.embedding]?: EmbeddingModelItemType;
  [ModelTypeEnum.tts]?: TTSModelType;
  [ModelTypeEnum.stt]?: STTModelType;
--- a/packages/service/core/dataset/search/controller.ts
+++ b/packages/service/core/dataset/search/controller.ts
@@ -201,61 +201,6 @@ export async function searchDatasetData(
    };
  };

-  async function getAllCollectionIds({
-    teamId,
-    datasetIds,
-    parentCollectionIds
-  }: {
-    teamId: string;
-    datasetIds: string[];
-    parentCollectionIds: string[];
-  }): Promise<string[]> {
-    if (!parentCollectionIds.length) {
-      return [];
-    }
-    const collections = await MongoDatasetCollection.find(
-      {
-        teamId,
-        datasetId: { $in: datasetIds },
-        _id: { $in: parentCollectionIds }
-      },
-      '_id type',
-      {
-        ...readFromSecondary
-      }
-    ).lean();
-
-    const resultIds = new Set(collections.map((item) => String(item._id)));
-
-    const folderIds = collections
-      .filter((item) => item.type === 'folder')
-      .map((item) => String(item._id));
-
-    // Get all child collection ids
-    if (folderIds.length) {
-      const childCollections = await MongoDatasetCollection.find(
-        {
-          teamId,
-          datasetId: { $in: datasetIds },
-          parentId: { $in: folderIds }
-        },
-        '_id',
-        {
-          ...readFromSecondary
-        }
-      ).lean();
-
-      const childIds = await getAllCollectionIds({
-        teamId,
-        datasetIds,
-        parentCollectionIds: childCollections.map((item) => String(item._id))
-      });
-
-      childIds.forEach((id) => resultIds.add(id));
-    }
-
-    return Array.from(resultIds);
-  }
  /* 
    Collection metadata filter
    标签过滤：
@@ -263,6 +208,63 @@ export async function searchDatasetData(
    2. and 标签和 null 不能共存，否则返回空数组
  */
  const filterCollectionByMetadata = async (): Promise<string[] | undefined> => {
+    const getAllCollectionIds = async ({ // Expand ids so that every folder is replaced by its non-folder descendants
+      parentCollectionIds
+    }: {
+      parentCollectionIds?: string[];
+    }): Promise<string[] | undefined> => {
+      if (!parentCollectionIds) return; // undefined in, undefined out
+      if (parentCollectionIds.length === 0) {
+        return []; // empty input: skip the database round trip
+      }
+      // Load the requested collections (_id and type only); teamId and datasetIds come from the enclosing scope
+      const collections = await MongoDatasetCollection.find(
+        {
+          teamId,
+          datasetId: { $in: datasetIds },
+          _id: { $in: parentCollectionIds }
+        },
+        '_id type',
+        {
+          ...readFromSecondary
+        }
+      ).lean();
+      // Only non-folder collections go into the result; folder ids are collected separately below
+      const resultIds = new Set<string>();
+      collections.forEach((item) => {
+        if (item.type !== 'folder') {
+          resultIds.add(String(item._id));
+        }
+      });
+
+      const folderIds = collections
+        .filter((item) => item.type === 'folder')
+        .map((item) => String(item._id));
+
+      // Get all child collection ids
+      if (folderIds.length) {
+        const childCollections = await MongoDatasetCollection.find(
+          {
+            teamId,
+            datasetId: { $in: datasetIds },
+            parentId: { $in: folderIds } // direct children of the folders found above
+          },
+          '_id type',
+          {
+            ...readFromSecondary
+          }
+        ).lean();
+        // Recurse on the children so that nested folders are expanded as well
+        const childIds = await getAllCollectionIds({
+          parentCollectionIds: childCollections.map((item) => String(item._id))
+        });
+
+        childIds?.forEach((id) => resultIds.add(id)); // the Set removes duplicate ids
+      }
+
+      return Array.from(resultIds);
+    };
+
    if (!collectionFilterMatch || !global.feConfigs.isPlus) return;

    let tagCollectionIdList: string[] | undefined = undefined;
@@ -382,7 +384,7 @@ export async function searchDatasetData(
      }

      // Concat tag and time
-      const finalIds = (() => {
+      const collectionIds = (() => {
        if (tagCollectionIdList && createTimeCollectionIdList) {
          return tagCollectionIdList.filter((id) =>
            (createTimeCollectionIdList as string[]).includes(id)
@@ -392,13 +394,9 @@ export async function searchDatasetData(
        return tagCollectionIdList || createTimeCollectionIdList;
      })();

-      return finalIds
-        ? await getAllCollectionIds({
-            teamId,
-            datasetIds,
-            parentCollectionIds: finalIds
-          })
-        : undefined;
+      return await getAllCollectionIds({
+        parentCollectionIds: collectionIds
+      });
    } catch (error) {}
  };
  const embeddingRecall = async ({
--- a/packages/service/support/wallet/usage/controller.ts
+++ b/packages/service/support/wallet/usage/controller.ts
@@ -8,12 +8,12 @@ import { i18nT } from '../../../../web/i18n/utils';
 import { pushConcatBillTask, pushReduceTeamAiPointsTask } from './utils';

 import { POST } from '../../../common/api/plusRequest';
-import { FastGPTProUrl } from '../../../common/system/constants';
+import { isFastGPTMainService } from '../../../common/system/constants';

 export async function createUsage(data: CreateUsageProps) {
  try {
    // In FastGPT server
-    if (FastGPTProUrl) {
+    if (isFastGPTMainService) {
      await POST('/support/wallet/usage/createUsage', data);
    } else if (global.reduceAiPointsQueue) {
      // In FastGPT pro server
@@ -31,7 +31,7 @@ export async function createUsage(data: CreateUsageProps) {
 export async function concatUsage(data: ConcatUsageProps) {
  try {
    // In FastGPT server
-    if (FastGPTProUrl) {
+    if (isFastGPTMainService) {
      await POST('/support/wallet/usage/concatUsage', data);
    } else if (global.reduceAiPointsQueue) {
      const {
--- a/packages/web/i18n/en/common.json
+++ b/packages/web/i18n/en/common.json
@@ -547,7 +547,6 @@
  "core.dataset.data.Main Content": "Main Content",
  "core.dataset.data.Search data placeholder": "Search Related Data",
  "core.dataset.data.Too Long": "Total Length Exceeded",
-  "core.dataset.data.Total Amount": "{{total}} Groups",
  "core.dataset.data.group": "Group",
  "core.dataset.data.unit": "Items",
  "core.dataset.embedding model tip": "The index model can convert natural language into vectors for semantic search.\nNote that different index models cannot be used together. Once an index model is selected, it cannot be changed.",
@@ -860,7 +859,6 @@
  "dataset.collections.Collection Embedding": "{{total}} Indexes",
  "dataset.collections.Confirm to delete the folder": "Confirm to Delete This Folder and All Its Contents?",
  "dataset.collections.Create And Import": "Create/Import",
-  "dataset.collections.Data Amount": "Total Data",
  "dataset.collections.Select Collection": "Select File",
  "dataset.collections.Select One Collection To Store": "Select a File to Store",
  "dataset.data.Can not edit": "No Edit Permission",
@@ -876,6 +874,7 @@
  "dataset.dataset_name": "Dataset Name",
  "dataset.deleteFolderTips": "Confirm to Delete This Folder and All Its Contained Datasets? Data Cannot Be Recovered After Deletion, Please Confirm!",
  "dataset.test.noResult": "No Search Results",
+  "dataset_text_model_tip": "Used for text processing in the knowledge base preprocessing stage, such as automatic supplementary indexing, Q&A pair extraction.",
  "deep_rag_search": "In-depth search",
  "delete_api": "Are you sure you want to delete this API key? \nAfter deletion, the key will become invalid immediately and the corresponding conversation log will not be deleted. Please confirm!",
  "embedding_model_not_config": "No index model is detected",
--- a/packages/web/i18n/en/dataset.json
+++ b/packages/web/i18n/en/dataset.json
@@ -7,6 +7,7 @@
  "close_auto_sync": "Are you sure you want to turn off automatic sync?",
  "collection.Create update time": "Creation/Update Time",
  "collection.Training type": "Training",
+  "collection_data_count": "Data amount",
  "collection_not_support_retraining": "This collection type does not support retuning parameters",
  "collection_not_support_sync": "This collection does not support synchronization",
  "collection_sync": "Sync data",
@@ -20,6 +21,7 @@
  "custom_data_process_params": "Custom",
  "custom_data_process_params_desc": "Customize data processing rules",
  "data.ideal_chunk_length": "ideal block length",
+  "data_amount": "{{dataAmount}} Data Items, {{indexAmount}} Indexes",
  "data_process_params": "Params",
  "data_process_setting": "Processing config",
  "dataset.Unsupported operation": "dataset.Unsupported operation",
--- a/packages/web/i18n/zh-CN/common.json
+++ b/packages/web/i18n/zh-CN/common.json
@@ -550,7 +550,6 @@
  "core.dataset.data.Main Content": "主要内容",
  "core.dataset.data.Search data placeholder": "搜索相关数据",
  "core.dataset.data.Too Long": "总长度超长了",
-  "core.dataset.data.Total Amount": "{{total}} 组",
  "core.dataset.data.group": "组",
  "core.dataset.data.unit": "条",
  "core.dataset.embedding model tip": "索引模型可以将自然语言转成向量，用于进行语义检索。\n注意，不同索引模型无法一起使用，选择完索引模型后将无法修改。",
@@ -863,7 +862,6 @@
  "dataset.collections.Collection Embedding": "{{total}} 组索引中",
  "dataset.collections.Confirm to delete the folder": "确认删除该文件夹及里面所有内容？",
  "dataset.collections.Create And Import": "新建/导入",
-  "dataset.collections.Data Amount": "数据总量",
  "dataset.collections.Select Collection": "选择文件",
  "dataset.collections.Select One Collection To Store": "选择一个文件进行存储",
  "dataset.data.Can not edit": "无编辑权限",
@@ -879,6 +877,7 @@
  "dataset.dataset_name": "知识库名称",
  "dataset.deleteFolderTips": "确认删除该文件夹及其包含的所有知识库？删除后数据无法恢复，请确认！",
  "dataset.test.noResult": "搜索结果为空",
+  "dataset_text_model_tip": "用于知识库预处理阶段的文本处理，例如自动补充索引、问答对提取。",
  "deep_rag_search": "深度搜索",
  "delete_api": "确认删除该API密钥？删除后该密钥立即失效，对应的对话日志不会删除，请确认！",
  "embedding_model_not_config": "检测到没有可用的索引模型",
@@ -944,9 +943,9 @@
  "model_moka": "Moka-AI",
  "model_moonshot": "月之暗面",
  "model_other": "其他",
+  "model_ppio": "PPIO 派欧云",
  "model_qwen": "阿里千问",
  "model_siliconflow": "硅基流动",
-  "model_ppio": "PPIO 派欧云",
  "model_sparkdesk": "讯飞星火",
  "model_stepfun": "阶跃星辰",
  "model_yi": "零一万物",
--- a/packages/web/i18n/zh-CN/dataset.json
+++ b/packages/web/i18n/zh-CN/dataset.json
@@ -7,6 +7,7 @@
  "close_auto_sync": "确认关闭自动同步功能？",
  "collection.Create update time": "创建/更新时间",
  "collection.Training type": "训练模式",
+  "collection_data_count": "数据量",
  "collection_not_support_retraining": "该集合类型不支持重新调整参数",
  "collection_not_support_sync": "该集合不支持同步",
  "collection_sync": "立即同步",
@@ -20,6 +21,7 @@
  "custom_data_process_params": "自定义",
  "custom_data_process_params_desc": "自定义设置数据处理规则",
  "data.ideal_chunk_length": "理想分块长度",
+  "data_amount": "{{dataAmount}} 组数据, {{indexAmount}} 组索引",
  "data_process_params": "处理参数",
  "data_process_setting": "数据处理配置",
  "dataset.Unsupported operation": "操作不支持",
--- a/packages/web/i18n/zh-Hant/common.json
+++ b/packages/web/i18n/zh-Hant/common.json
@@ -546,7 +546,6 @@
  "core.dataset.data.Main Content": "主要內容",
  "core.dataset.data.Search data placeholder": "搜尋相關資料",
  "core.dataset.data.Too Long": "總長度超出上限",
-  "core.dataset.data.Total Amount": "{{total}} 組",
  "core.dataset.data.group": "組",
  "core.dataset.data.unit": "筆",
  "core.dataset.embedding model tip": "索引模型可以將自然語言轉換成向量，用於進行語意搜尋。\n注意，不同索引模型無法一起使用。選擇索引模型後就無法修改。",
@@ -860,7 +859,6 @@
  "dataset.collections.Collection Embedding": "{{total}} 個索引",
  "dataset.collections.Confirm to delete the folder": "確認刪除此資料夾及其所有內容？",
  "dataset.collections.Create And Import": "建立或匯入",
-  "dataset.collections.Data Amount": "資料總量",
  "dataset.collections.Select Collection": "選擇檔案",
  "dataset.collections.Select One Collection To Store": "選擇一個檔案進行儲存",
  "dataset.data.Can not edit": "無編輯權限",
@@ -876,6 +874,7 @@
  "dataset.dataset_name": "知識庫名稱",
  "dataset.deleteFolderTips": "確認刪除此資料夾及其包含的所有知識庫？刪除後資料無法復原，請確認！",
  "dataset.test.noResult": "搜尋結果為空",
+  "dataset_text_model_tip": "用於知識庫預處理階段的文本處理，例如自動補充索引、問答對提取。",
  "deep_rag_search": "深度搜索",
  "delete_api": "確認刪除此 API 金鑰？\n刪除後該金鑰將立即失效，對應的對話記錄不會被刪除，請確認！",
  "embedding_model_not_config": "檢測到沒有可用的索引模型",
--- a/packages/web/i18n/zh-Hant/dataset.json
+++ b/packages/web/i18n/zh-Hant/dataset.json
@@ -7,6 +7,7 @@
  "close_auto_sync": "確認關閉自動同步功能？",
  "collection.Create update time": "建立／更新時間",
  "collection.Training type": "分段模式",
+  "collection_data_count": "資料量",
  "collection_not_support_retraining": "此集合類型不支援重新調整參數",
  "collection_not_support_sync": "該集合不支援同步",
  "collection_sync": "立即同步",
@@ -20,6 +21,7 @@
  "custom_data_process_params": "自訂",
  "custom_data_process_params_desc": "自訂資料處理規則",
  "data.ideal_chunk_length": "理想分塊長度",
+  "data_amount": "{{dataAmount}} 組資料, {{indexAmount}} 組索引",
  "data_process_params": "處理參數",
  "data_process_setting": "資料處理設定",
  "dataset.Unsupported operation": "操作不支持",