perf: file encoding;perf: leave team code;@c121914yu perf: full text search code (#3528)

* perf: text encoding

* perf: leave team code

* perf: full text search code

* fix: http status

* perf: embedding search and vector avatar
This commit is contained in:
Archer
2025-01-05 14:40:02 +08:00
committed by archer
parent 5465ca642f
commit 2bf1fce32a
24 changed files with 345 additions and 100 deletions

View File

@@ -2,6 +2,7 @@ import { detect } from 'jschardet';
import { documentFileType, imageFileType } from './constants';
import { ChatFileTypeEnum } from '../../core/chat/constants';
import { UserChatItemValueItemType } from '../../core/chat/type';
import * as fs from 'fs';
export const formatFileSize = (bytes: number): string => {
if (bytes === 0) return '0 B';
@@ -16,6 +17,22 @@ export const formatFileSize = (bytes: number): string => {
/**
 * Detect the text encoding of an in-memory buffer.
 * Only the first 200 bytes are sampled — enough for jschardet to guess the
 * charset without scanning the whole buffer.
 * Returns the lowercased encoding name (e.g. 'utf-8'), or undefined if
 * detection fails.
 */
export const detectFileEncoding = (buffer: Buffer) => {
  // subarray: zero-copy view (Buffer.slice is a deprecated alias for it).
  // toLowerCase, not toLocaleLowerCase: encoding names are ASCII identifiers;
  // locale-aware lowercasing corrupts them under e.g. the Turkish locale ('I' -> 'ı').
  return detect(buffer.subarray(0, 200))?.encoding?.toLowerCase();
};
/**
 * Detect the text encoding of a file on disk without loading it fully.
 * Reads at most the first 64KB of the file head and runs charset detection
 * on the bytes actually read.
 * @param path - filesystem path of the file to probe
 * @returns lowercased encoding name (e.g. 'utf-8'), or undefined if
 *          detection fails
 */
export const detectFileEncodingByPath = async (path: string) => {
  // Get 64KB file head
  const MAX_BYTES = 64 * 1024;
  const buffer = Buffer.alloc(MAX_BYTES);

  const fd = await fs.promises.open(path, 'r');
  try {
    // Read file head
    const { bytesRead } = await fd.read(buffer, 0, MAX_BYTES, 0);
    // Trim to the bytes actually read so a short file isn't padded with
    // NUL bytes that would skew detection. subarray is a zero-copy view
    // (Buffer.slice is a deprecated alias for it).
    const actualBuffer = buffer.subarray(0, bytesRead);

    // toLowerCase, not toLocaleLowerCase: encoding names are ASCII
    // identifiers; locale-aware lowercasing corrupts them under e.g. the
    // Turkish locale ('I' -> 'ı').
    return detect(actualBuffer)?.encoding?.toLowerCase();
  } finally {
    // Always release the file descriptor, even if read/detect throws.
    await fd.close();
  }
};
// Url => user upload file type
export const parseUrlToFileType = (url: string): UserChatItemValueItemType['file'] | undefined => {

View File

@@ -4,7 +4,7 @@ import fsp from 'fs/promises';
import fs from 'fs';
import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
import { MongoChatFileSchema, MongoDatasetFileSchema } from './schema';
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
import { detectFileEncoding, detectFileEncodingByPath } from '@fastgpt/global/common/file/tools';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import { MongoRawTextBuffer } from '../../buffer/rawText/schema';
import { readRawContentByFileBuffer } from '../read/utils';
@@ -36,7 +36,6 @@ export async function uploadFile({
path,
filename,
contentType,
encoding,
metadata = {}
}: {
bucketName: `${BucketNameEnum}`;
@@ -45,7 +44,6 @@ export async function uploadFile({
path: string;
filename: string;
contentType?: string;
encoding: string;
metadata?: Record<string, any>;
}) {
if (!path) return Promise.reject(`filePath is empty`);
@@ -59,7 +57,7 @@ export async function uploadFile({
// Add default metadata
metadata.teamId = teamId;
metadata.uid = uid;
metadata.encoding = encoding;
metadata.encoding = await detectFileEncodingByPath(path);
// create a gridfs bucket
const bucket = getGridBucket(bucketName);

View File

@@ -22,7 +22,6 @@ export function useReqFrequencyLimit(seconds: number, limit: number, force = fal
expiredTime: addSeconds(new Date(), seconds)
});
} catch (_) {
res.status(429);
jsonRes(res, {
code: 429,
error: ERROR_ENUM.tooManyRequest

View File

@@ -33,8 +33,7 @@ export const jsonRes = <T = any>(
addLog.error(`Api response error: ${url}`, ERROR_RESPONSE[errResponseKey]);
res.status(ERROR_RESPONSE[errResponseKey].code);
return res.json(ERROR_RESPONSE[errResponseKey]);
return res.status(code).json(ERROR_RESPONSE[errResponseKey]);
}
// another error

View File

@@ -25,7 +25,7 @@ const DatasetDataTextSchema = new Schema({
required: true
},
dataId: {
type: String,
type: Schema.Types.ObjectId,
ref: DatasetDataCollectionName,
required: true
},
@@ -37,7 +37,7 @@ const DatasetDataTextSchema = new Schema({
try {
DatasetDataTextSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
DatasetDataTextSchema.index({ dataId: 'hashed' });
DatasetDataTextSchema.index({ dataId: 1 }, { unique: true });
} catch (error) {
console.log(error);
}

View File

@@ -39,10 +39,6 @@ const DatasetDataSchema = new Schema({
type: String,
default: ''
},
fullTextToken: {
type: String,
default: ''
},
indexes: {
type: [
{
@@ -72,7 +68,13 @@ const DatasetDataSchema = new Schema({
default: 0
},
rebuilding: Boolean,
inited: Boolean
// Abandon
fullTextToken: {
type: String,
default: ''
},
initFullText: Boolean
});
try {
@@ -85,13 +87,14 @@ try {
updateTime: -1
});
// full text index
DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
// DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
// Recall vectors after data matching
DatasetDataSchema.index({ teamId: 1, datasetId: 1, collectionId: 1, 'indexes.dataId': 1 });
DatasetDataSchema.index({ updateTime: 1 });
// rebuild data
DatasetDataSchema.index({ rebuilding: 1, teamId: 1, datasetId: 1 });
DatasetDataSchema.index({ inited: 'hashed' });
DatasetDataSchema.index({ initFullText: 1 });
} catch (error) {
console.log(error);
}

View File

@@ -289,20 +289,22 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
).lean()
]);
const formatResult = dataList
.map((data, index) => {
const collection = collections.find((col) => String(col._id) === String(data.collectionId));
const formatResult = results
.map((item, index) => {
const collection = collections.find((col) => String(col._id) === String(item.collectionId));
if (!collection) {
console.log('Collection is not found', data);
console.log('Collection is not found', item);
return;
}
const data = dataList.find((data) =>
data.indexes.some((index) => index.dataId === item.id)
);
if (!data) {
console.log('Data is not found', item);
return;
}
// add score to data(It's already sorted. The first one is the one with the most points)
const dataIdList = data.indexes.map((item) => item.dataId);
const maxScoreResult = results.find((item) => {
return dataIdList.includes(item.id);
});
const score = maxScoreResult?.score || 0;
const score = item?.score || 0;
const result: SearchDataResponseItemType = {
id: String(data._id),
@@ -320,8 +322,6 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
})
.filter(Boolean) as SearchDataResponseItemType[];
formatResult.sort((a, b) => b.score[0].value - a.score[0].value);
return {
embeddingRecallResults: formatResult,
tokens
@@ -411,22 +411,6 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
'_id name fileId rawLink externalFileId externalFileUrl',
{ ...readFromSecondary }
).lean();
// const [dataList, collections] = await Promise.all([
// MongoDatasetData.find(
// {
// _id: { $in: searchResults.map((item) => item.dataId) }
// },
// '_id datasetId collectionId updateTime q a chunkIndex indexes',
// { ...readFromSecondary }
// ).lean(),
// MongoDatasetCollection.find(
// {
// _id: { $in: searchResults.map((item) => item.collectionId) }
// },
// '_id name fileId rawLink externalFileId externalFileUrl',
// { ...readFromSecondary }
// ).lean()
// ]);
return {
fullTextRecallResults: searchResults
@@ -439,9 +423,6 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
return;
}
// const score =
// searchResults.find((item) => String(item.dataId) === String(data._id))?.score || 0;
return {
id: String(data._id),
datasetId: String(data.datasetId),
@@ -459,6 +440,135 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
tokenLen: 0
};
};
/**
 * Full-text recall over the dedicated MongoDatasetDataText collection.
 * Runs one $text aggregation per dataset id (closes over the outer
 * `datasetIds` / `teamId`), merges the per-dataset hits, then joins them
 * back to their MongoDatasetData rows and collections to build the
 * standard search-result items.
 *
 * @param query - raw user query; tokenized via jiebaSplit before $text search
 * @param limit - max hits per dataset; 0 short-circuits to an empty result
 * @param filterCollectionIdList - if set, restrict hits to these collections
 * @param forbidCollectionIdList - collections to exclude from recall
 * @returns recall items plus tokenLen (always 0 here — no token accounting
 *          is done in this path)
 */
const fullTextRecall2 = async ({
  query,
  limit,
  filterCollectionIdList,
  forbidCollectionIdList
}: {
  query: string;
  limit: number;
  filterCollectionIdList?: string[];
  forbidCollectionIdList: string[];
}): Promise<{
  fullTextRecallResults: SearchDataResponseItemType[];
  tokenLen: number;
}> => {
  // Nothing requested — skip the aggregation entirely.
  if (limit === 0) {
    return {
      fullTextRecallResults: [],
      tokenLen: 0
    };
  }

  // One aggregation per dataset, run in parallel, results flattened.
  // NOTE(review): the $limit is applied per dataset, so the merged list can
  // hold up to datasetIds.length * limit items — confirm that is intended.
  const searchResults = (
    await Promise.all(
      datasetIds.map(async (id) => {
        return MongoDatasetDataText.aggregate(
          [
            {
              $match: {
                teamId: new Types.ObjectId(teamId),
                datasetId: new Types.ObjectId(id),
                // $text requires the collection's text index; query is
                // pre-tokenized (e.g. for CJK) by jiebaSplit.
                $text: { $search: jiebaSplit({ text: query }) },
                // Optional whitelist of collections.
                ...(filterCollectionIdList
                  ? {
                      collectionId: {
                        $in: filterCollectionIdList.map((id) => new Types.ObjectId(id))
                      }
                    }
                  : {}),
                // Optional blacklist of collections.
                ...(forbidCollectionIdList && forbidCollectionIdList.length > 0
                  ? {
                      collectionId: {
                        $nin: forbidCollectionIdList.map((id) => new Types.ObjectId(id))
                      }
                    }
                  : {})
              }
            },
            {
              // Best text-relevance first.
              $sort: {
                score: { $meta: 'textScore' }
              }
            },
            {
              $limit: limit
            },
            {
              // Keep only what the join below needs, plus the relevance score.
              $project: {
                _id: 1,
                collectionId: 1,
                dataId: 1,
                score: { $meta: 'textScore' }
              }
            }
          ],
          {
            ...readFromSecondary
          }
        );
      })
    )
  ).flat() as (DatasetDataTextSchemaType & { score: number })[];

  // Get data and collections
  const [dataList, collections] = await Promise.all([
    MongoDatasetData.find(
      {
        _id: { $in: searchResults.map((item) => item.dataId) }
      },
      '_id datasetId collectionId updateTime q a chunkIndex indexes',
      { ...readFromSecondary }
    ).lean(),
    MongoDatasetCollection.find(
      {
        _id: { $in: searchResults.map((item) => item.collectionId) }
      },
      '_id name fileId rawLink externalFileId externalFileUrl',
      { ...readFromSecondary }
    ).lean()
  ]);

  return {
    fullTextRecallResults: searchResults
      .map((item, index) => {
        // Drop hits whose parent collection has disappeared (e.g. deleted
        // after indexing); logged rather than thrown so one orphan doesn't
        // fail the whole recall.
        const collection = collections.find(
          (col) => String(col._id) === String(item.collectionId)
        );
        if (!collection) {
          console.log('Collection is not found', item);
          return;
        }
        // Same for orphaned data rows.
        const data = dataList.find((data) => String(data._id) === String(item.dataId));
        if (!data) {
          console.log('Data is not found', item);
          return;
        }

        return {
          id: String(data._id),
          datasetId: String(data.datasetId),
          collectionId: String(data.collectionId),
          updateTime: data.updateTime,
          q: data.q,
          a: data.a,
          chunkIndex: data.chunkIndex,
          indexes: data.indexes,
          ...getCollectionSourceData(collection),
          // `index` is the position in the merged (per-dataset sorted) list,
          // used downstream for rank-based fusion.
          score: [
            {
              type: SearchScoreTypeEnum.fullText,
              value: item.score || 0,
              index
            }
          ]
        };
      })
      // filter(Boolean) removes the `undefined` entries produced above.
      .filter(Boolean) as SearchDataResponseItemType[],
    tokenLen: 0
  };
};
const reRankSearchResult = async ({
data,
query
@@ -526,7 +636,7 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
forbidCollectionIdList,
filterCollectionIdList
}),
fullTextRecall({
fullTextRecall2({
query,
limit: fullTextLimit,
filterCollectionIdList,

View File

@@ -47,26 +47,32 @@ export const getTeamDefaultGroup = async ({
export const getGroupsByTmbId = async ({
tmbId,
teamId,
role
role,
session
}: {
tmbId: string;
teamId: string;
role?: `${GroupMemberRole}`[];
session?: ClientSession;
}) =>
(
await Promise.all([
(
await MongoGroupMemberModel.find({
tmbId,
groupId: {
$exists: true
await MongoGroupMemberModel.find(
{
tmbId,
groupId: {
$exists: true
},
...(role ? { role: { $in: role } } : {})
},
...(role ? { role: { $in: role } } : {})
})
undefined,
{ session }
)
.populate<{ group: MemberGroupSchemaType }>('group')
.lean()
).map((item) => item.group),
role ? [] : getTeamDefaultGroup({ teamId })
role ? [] : getTeamDefaultGroup({ teamId, session })
])
).flat();

View File

@@ -115,6 +115,7 @@ try {
}
);
// Delete tmb permission
ResourcePermissionSchema.index({
resourceType: 1,
teamId: 1,

View File

@@ -78,9 +78,6 @@ const UserSchema = new Schema({
});
try {
// login
UserSchema.index({ username: 1 });
// Admin charts
UserSchema.index({ createTime: -1 });
} catch (error) {

View File

@@ -23,10 +23,6 @@ const TeamMemberSchema = new Schema({
type: String,
default: 'Member'
},
role: {
type: String
// enum: Object.keys(TeamMemberRoleMap) // disable enum validation for old data
},
status: {
type: String,
enum: Object.keys(TeamMemberStatusMap)
@@ -38,6 +34,12 @@ const TeamMemberSchema = new Schema({
defaultTeam: {
type: Boolean,
default: false
},
// Abandoned
role: {
type: String
// enum: Object.keys(TeamMemberRoleMap) // disable enum validation for old data
}
});

View File

@@ -1,9 +1,27 @@
import TurndownService from 'turndown';
import { ImageType } from '../readFile/type';
import { matchMdImgTextAndUpload } from '@fastgpt/global/common/string/markdown';
import { getNanoid } from '@fastgpt/global/common/string/tools';
// @ts-ignore
const turndownPluginGfm = require('joplin-turndown-plugin-gfm');
/**
 * Replace inline base64 images in HTML with lightweight placeholder ids,
 * so the (potentially huge) base64 payloads don't travel through the
 * HTML->markdown conversion and occupy memory.
 * @param htmlContent - raw HTML possibly containing `src="data:...;base64,..."`
 * @returns processedHtml with each data-URI src replaced by a unique
 *          `IMAGE_<nanoid>_IMAGE` token, plus the extracted images
 *          (uuid / base64 / mime) for later re-insertion.
 */
const processBase64Images = (htmlContent: string) => {
  // Match src attributes quoted with either " or ' (both are valid HTML);
  // capture the quote char so the replacement preserves it.
  const base64Regex = /src=(["'])data:([^;"']+);base64,([^"']+)\1/g;
  const images: ImageType[] = [];

  const processedHtml = htmlContent.replace(base64Regex, (match, quote, mime, base64Data) => {
    const uuid = `IMAGE_${getNanoid(12)}_IMAGE`;
    images.push({
      uuid,
      base64: base64Data,
      mime
    });
    return `src=${quote}${uuid}${quote}`;
  });

  return { processedHtml, images };
};
export const html2md = (
html: string
): {
@@ -25,11 +43,14 @@ export const html2md = (
turndownService.remove(['i', 'script', 'iframe', 'style']);
turndownService.use(turndownPluginGfm.gfm);
const { text, imageList } = matchMdImgTextAndUpload(html);
// Base64 img to id, otherwise it will occupy memory when going to md
const { processedHtml, images } = processBase64Images(html);
const md = turndownService.turndown(processedHtml);
const { text, imageList } = matchMdImgTextAndUpload(md);
return {
rawText: turndownService.turndown(text),
imageList
rawText: text,
imageList: [...images, ...imageList]
};
} catch (error) {
console.log('html 2 markdown error', error);

View File

@@ -24,7 +24,11 @@ export const readFileRawText = ({ buffer, encoding }: ReadRawTextByBuffer): Read
return buffer.toString(encoding as BufferEncoding);
}
return iconv.decode(buffer, encoding);
if (encoding) {
return iconv.decode(buffer, encoding);
}
return buffer.toString('utf-8');
} catch (error) {
return buffer.toString('utf-8');
}

View File

@@ -1,31 +1,31 @@
{
"action": "operate",
"confirm_delete_group": "Confirm to delete group?",
"confirm_leave_team": "Confirmed to leave the team? \n \nAfter you log out, all your resources in the team (applications, knowledge bases, folders, managed groups, etc.) will be transferred to the team owner.",
"confirm_delete_org": "Confirm to delete organization?",
"confirm_delete_member": "Confirm to delete member?",
"confirm_delete_org": "Confirm to delete organization?",
"confirm_leave_team": "Confirm to leave the team? \nAfter exiting, all your resources in the team are transferred to the team owner.",
"create_group": "Create group",
"delete": "delete",
"edit_info": "Edit information",
"group": "group",
"group_name": "Group name",
"org": "organization",
"org_name": "Organization name",
"org_description": "Organization description",
"create_org": "Create organization",
"create_sub_org": "Create sub-organization",
"edit_org_info": "Edit organization information",
"move_org": "Move organization",
"move_member": "Move member",
"delete": "delete",
"delete_org": "Delete organization",
"remark": "remark",
"edit_info": "Edit information",
"edit_org_info": "Edit organization information",
"group": "group",
"group_name": "Group name",
"label_sync": "Tag sync",
"leave_team_failed": "Failed to leave the team",
"manage_member": "Managing members",
"member": "member",
"member_group": "Belonging to member group",
"move_member": "Move member",
"move_org": "Move organization",
"org": "organization",
"org_description": "Organization description",
"org_name": "Organization name",
"owner": "owner",
"permission": "Permissions",
"remark": "remark",
"remove_tip": "Confirm to remove {{username}} from the team?",
"retain_admin_permissions": "Keep administrator rights",
"search_member_group_name": "Search member/group name",

View File

@@ -3,7 +3,7 @@
"confirm_delete_group": "确认删除群组?",
"confirm_delete_member": "确认删除成员?",
"confirm_delete_org": "确认删除该部门?",
"confirm_leave_team": "确认离开该团队? \n 退出后,您在该团队所有的资源 应用、知识库、文件夹、管理的群组等)均转让给团队所有者。",
"confirm_leave_team": "确认离开该团队? \n退出后您在该团队所有的资源均转让给团队所有者。",
"create_group": "创建群组",
"create_org": "创建部门",
"create_sub_org": "创建子部门",

View File

@@ -3,7 +3,7 @@
"confirm_delete_group": "確認刪除群組?",
"confirm_delete_member": "確認刪除成員?",
"confirm_delete_org": "確認刪除該部門?",
"confirm_leave_team": "確認離開該團隊? \n \n退出後您在該團隊所有的資源 應用程式、知識庫、資料夾、管理的群組等)均轉讓給團隊所有者。",
"confirm_leave_team": "確認離開該團隊? \n退出後您在該團隊所有的資源轉讓給團隊所有者。",
"create_group": "建立群組",
"create_org": "創建部門",
"create_sub_org": "創建子部門",