perf: file encoding;perf: leave team code;@c121914yu perf: full text search code (#3528)

* perf: text encoding

* perf: leave team code

* perf: full text search code

* fix: http status

* perf: embedding search and vector avatar
This commit is contained in:
Archer
2025-01-05 14:40:02 +08:00
committed by archer
parent 5465ca642f
commit 2bf1fce32a
24 changed files with 345 additions and 100 deletions

View File

@@ -2,6 +2,7 @@ import { detect } from 'jschardet';
import { documentFileType, imageFileType } from './constants';
import { ChatFileTypeEnum } from '../../core/chat/constants';
import { UserChatItemValueItemType } from '../../core/chat/type';
import * as fs from 'fs';
export const formatFileSize = (bytes: number): string => {
if (bytes === 0) return '0 B';
@@ -16,6 +17,22 @@ export const formatFileSize = (bytes: number): string => {
/**
 * Detect the text encoding of an in-memory buffer.
 * Only the first 200 bytes are sampled — enough for jschardet to guess the
 * charset without scanning the whole buffer.
 * Returns the lowercased encoding name (e.g. 'utf-8'), or undefined if
 * detection fails.
 */
export const detectFileEncoding = (buffer: Buffer) => {
  // subarray: zero-copy view (Buffer.slice is a deprecated alias for it).
  // toLowerCase, not toLocaleLowerCase: encoding names are ASCII identifiers;
  // locale-aware lowercasing corrupts them under e.g. the Turkish locale ('I' -> 'ı').
  return detect(buffer.subarray(0, 200))?.encoding?.toLowerCase();
};
/**
 * Detect the text encoding of a file on disk without loading it fully.
 * Reads at most the first 64KB of the file head and runs charset detection
 * on the bytes actually read.
 * @param path - filesystem path of the file to probe
 * @returns lowercased encoding name (e.g. 'utf-8'), or undefined if
 *          detection fails
 */
export const detectFileEncodingByPath = async (path: string) => {
  // Get 64KB file head
  const MAX_BYTES = 64 * 1024;
  const buffer = Buffer.alloc(MAX_BYTES);

  const fd = await fs.promises.open(path, 'r');
  try {
    // Read file head
    const { bytesRead } = await fd.read(buffer, 0, MAX_BYTES, 0);
    // Trim to the bytes actually read so a short file isn't padded with
    // NUL bytes that would skew detection. subarray is a zero-copy view
    // (Buffer.slice is a deprecated alias for it).
    const actualBuffer = buffer.subarray(0, bytesRead);

    // toLowerCase, not toLocaleLowerCase: encoding names are ASCII
    // identifiers; locale-aware lowercasing corrupts them under e.g. the
    // Turkish locale ('I' -> 'ı').
    return detect(actualBuffer)?.encoding?.toLowerCase();
  } finally {
    // Always release the file descriptor, even if read/detect throws.
    await fd.close();
  }
};
// Url => user upload file type
export const parseUrlToFileType = (url: string): UserChatItemValueItemType['file'] | undefined => {

View File

@@ -4,7 +4,7 @@ import fsp from 'fs/promises';
import fs from 'fs';
import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
import { MongoChatFileSchema, MongoDatasetFileSchema } from './schema';
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
import { detectFileEncoding, detectFileEncodingByPath } from '@fastgpt/global/common/file/tools';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import { MongoRawTextBuffer } from '../../buffer/rawText/schema';
import { readRawContentByFileBuffer } from '../read/utils';
@@ -36,7 +36,6 @@ export async function uploadFile({
path,
filename,
contentType,
encoding,
metadata = {}
}: {
bucketName: `${BucketNameEnum}`;
@@ -45,7 +44,6 @@ export async function uploadFile({
path: string;
filename: string;
contentType?: string;
encoding: string;
metadata?: Record<string, any>;
}) {
if (!path) return Promise.reject(`filePath is empty`);
@@ -59,7 +57,7 @@ export async function uploadFile({
// Add default metadata
metadata.teamId = teamId;
metadata.uid = uid;
metadata.encoding = encoding;
metadata.encoding = await detectFileEncodingByPath(path);
// create a gridfs bucket
const bucket = getGridBucket(bucketName);

View File

@@ -22,7 +22,6 @@ export function useReqFrequencyLimit(seconds: number, limit: number, force = fal
expiredTime: addSeconds(new Date(), seconds)
});
} catch (_) {
res.status(429);
jsonRes(res, {
code: 429,
error: ERROR_ENUM.tooManyRequest

View File

@@ -33,8 +33,7 @@ export const jsonRes = <T = any>(
addLog.error(`Api response error: ${url}`, ERROR_RESPONSE[errResponseKey]);
res.status(ERROR_RESPONSE[errResponseKey].code);
return res.json(ERROR_RESPONSE[errResponseKey]);
return res.status(code).json(ERROR_RESPONSE[errResponseKey]);
}
// another error

View File

@@ -25,7 +25,7 @@ const DatasetDataTextSchema = new Schema({
required: true
},
dataId: {
type: String,
type: Schema.Types.ObjectId,
ref: DatasetDataCollectionName,
required: true
},
@@ -37,7 +37,7 @@ const DatasetDataTextSchema = new Schema({
try {
DatasetDataTextSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
DatasetDataTextSchema.index({ dataId: 'hashed' });
DatasetDataTextSchema.index({ dataId: 1 }, { unique: true });
} catch (error) {
console.log(error);
}

View File

@@ -39,10 +39,6 @@ const DatasetDataSchema = new Schema({
type: String,
default: ''
},
fullTextToken: {
type: String,
default: ''
},
indexes: {
type: [
{
@@ -72,7 +68,13 @@ const DatasetDataSchema = new Schema({
default: 0
},
rebuilding: Boolean,
inited: Boolean
// Abandon
fullTextToken: {
type: String,
default: ''
},
initFullText: Boolean
});
try {
@@ -85,13 +87,14 @@ try {
updateTime: -1
});
// full text index
DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
// DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
// Recall vectors after data matching
DatasetDataSchema.index({ teamId: 1, datasetId: 1, collectionId: 1, 'indexes.dataId': 1 });
DatasetDataSchema.index({ updateTime: 1 });
// rebuild data
DatasetDataSchema.index({ rebuilding: 1, teamId: 1, datasetId: 1 });
DatasetDataSchema.index({ inited: 'hashed' });
DatasetDataSchema.index({ initFullText: 1 });
} catch (error) {
console.log(error);
}

View File

@@ -289,20 +289,22 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
).lean()
]);
const formatResult = dataList
.map((data, index) => {
const collection = collections.find((col) => String(col._id) === String(data.collectionId));
const formatResult = results
.map((item, index) => {
const collection = collections.find((col) => String(col._id) === String(item.collectionId));
if (!collection) {
console.log('Collection is not found', data);
console.log('Collection is not found', item);
return;
}
const data = dataList.find((data) =>
data.indexes.some((index) => index.dataId === item.id)
);
if (!data) {
console.log('Data is not found', item);
return;
}
// add score to data(It's already sorted. The first one is the one with the most points)
const dataIdList = data.indexes.map((item) => item.dataId);
const maxScoreResult = results.find((item) => {
return dataIdList.includes(item.id);
});
const score = maxScoreResult?.score || 0;
const score = item?.score || 0;
const result: SearchDataResponseItemType = {
id: String(data._id),
@@ -320,8 +322,6 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
})
.filter(Boolean) as SearchDataResponseItemType[];
formatResult.sort((a, b) => b.score[0].value - a.score[0].value);
return {
embeddingRecallResults: formatResult,
tokens
@@ -411,22 +411,6 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
'_id name fileId rawLink externalFileId externalFileUrl',
{ ...readFromSecondary }
).lean();
// const [dataList, collections] = await Promise.all([
// MongoDatasetData.find(
// {
// _id: { $in: searchResults.map((item) => item.dataId) }
// },
// '_id datasetId collectionId updateTime q a chunkIndex indexes',
// { ...readFromSecondary }
// ).lean(),
// MongoDatasetCollection.find(
// {
// _id: { $in: searchResults.map((item) => item.collectionId) }
// },
// '_id name fileId rawLink externalFileId externalFileUrl',
// { ...readFromSecondary }
// ).lean()
// ]);
return {
fullTextRecallResults: searchResults
@@ -439,9 +423,6 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
return;
}
// const score =
// searchResults.find((item) => String(item.dataId) === String(data._id))?.score || 0;
return {
id: String(data._id),
datasetId: String(data.datasetId),
@@ -459,6 +440,135 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
tokenLen: 0
};
};
/**
 * Full-text recall over the dedicated MongoDatasetDataText collection.
 * Runs one $text aggregation per dataset id (closes over the outer
 * `datasetIds` / `teamId`), merges the per-dataset hits, then joins them
 * back to their MongoDatasetData rows and collections to build the
 * standard search-result items.
 *
 * @param query - raw user query; tokenized via jiebaSplit before $text search
 * @param limit - max hits per dataset; 0 short-circuits to an empty result
 * @param filterCollectionIdList - if set, restrict hits to these collections
 * @param forbidCollectionIdList - collections to exclude from recall
 * @returns recall items plus tokenLen (always 0 here — no token accounting
 *          is done in this path)
 */
const fullTextRecall2 = async ({
  query,
  limit,
  filterCollectionIdList,
  forbidCollectionIdList
}: {
  query: string;
  limit: number;
  filterCollectionIdList?: string[];
  forbidCollectionIdList: string[];
}): Promise<{
  fullTextRecallResults: SearchDataResponseItemType[];
  tokenLen: number;
}> => {
  // Nothing requested — skip the aggregation entirely.
  if (limit === 0) {
    return {
      fullTextRecallResults: [],
      tokenLen: 0
    };
  }

  // One aggregation per dataset, run in parallel, results flattened.
  // NOTE(review): the $limit is applied per dataset, so the merged list can
  // hold up to datasetIds.length * limit items — confirm that is intended.
  const searchResults = (
    await Promise.all(
      datasetIds.map(async (id) => {
        return MongoDatasetDataText.aggregate(
          [
            {
              $match: {
                teamId: new Types.ObjectId(teamId),
                datasetId: new Types.ObjectId(id),
                // $text requires the collection's text index; query is
                // pre-tokenized (e.g. for CJK) by jiebaSplit.
                $text: { $search: jiebaSplit({ text: query }) },
                // Optional whitelist of collections.
                ...(filterCollectionIdList
                  ? {
                      collectionId: {
                        $in: filterCollectionIdList.map((id) => new Types.ObjectId(id))
                      }
                    }
                  : {}),
                // Optional blacklist of collections.
                ...(forbidCollectionIdList && forbidCollectionIdList.length > 0
                  ? {
                      collectionId: {
                        $nin: forbidCollectionIdList.map((id) => new Types.ObjectId(id))
                      }
                    }
                  : {})
              }
            },
            {
              // Best text-relevance first.
              $sort: {
                score: { $meta: 'textScore' }
              }
            },
            {
              $limit: limit
            },
            {
              // Keep only what the join below needs, plus the relevance score.
              $project: {
                _id: 1,
                collectionId: 1,
                dataId: 1,
                score: { $meta: 'textScore' }
              }
            }
          ],
          {
            ...readFromSecondary
          }
        );
      })
    )
  ).flat() as (DatasetDataTextSchemaType & { score: number })[];

  // Get data and collections
  const [dataList, collections] = await Promise.all([
    MongoDatasetData.find(
      {
        _id: { $in: searchResults.map((item) => item.dataId) }
      },
      '_id datasetId collectionId updateTime q a chunkIndex indexes',
      { ...readFromSecondary }
    ).lean(),
    MongoDatasetCollection.find(
      {
        _id: { $in: searchResults.map((item) => item.collectionId) }
      },
      '_id name fileId rawLink externalFileId externalFileUrl',
      { ...readFromSecondary }
    ).lean()
  ]);

  return {
    fullTextRecallResults: searchResults
      .map((item, index) => {
        // Drop hits whose parent collection has disappeared (e.g. deleted
        // after indexing); logged rather than thrown so one orphan doesn't
        // fail the whole recall.
        const collection = collections.find(
          (col) => String(col._id) === String(item.collectionId)
        );
        if (!collection) {
          console.log('Collection is not found', item);
          return;
        }
        // Same for orphaned data rows.
        const data = dataList.find((data) => String(data._id) === String(item.dataId));
        if (!data) {
          console.log('Data is not found', item);
          return;
        }

        return {
          id: String(data._id),
          datasetId: String(data.datasetId),
          collectionId: String(data.collectionId),
          updateTime: data.updateTime,
          q: data.q,
          a: data.a,
          chunkIndex: data.chunkIndex,
          indexes: data.indexes,
          ...getCollectionSourceData(collection),
          // `index` is the position in the merged (per-dataset sorted) list,
          // used downstream for rank-based fusion.
          score: [
            {
              type: SearchScoreTypeEnum.fullText,
              value: item.score || 0,
              index
            }
          ]
        };
      })
      // filter(Boolean) removes the `undefined` entries produced above.
      .filter(Boolean) as SearchDataResponseItemType[],
    tokenLen: 0
  };
};
const reRankSearchResult = async ({
data,
query
@@ -526,7 +636,7 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
forbidCollectionIdList,
filterCollectionIdList
}),
fullTextRecall({
fullTextRecall2({
query,
limit: fullTextLimit,
filterCollectionIdList,

View File

@@ -47,26 +47,32 @@ export const getTeamDefaultGroup = async ({
export const getGroupsByTmbId = async ({
tmbId,
teamId,
role
role,
session
}: {
tmbId: string;
teamId: string;
role?: `${GroupMemberRole}`[];
session?: ClientSession;
}) =>
(
await Promise.all([
(
await MongoGroupMemberModel.find({
tmbId,
groupId: {
$exists: true
await MongoGroupMemberModel.find(
{
tmbId,
groupId: {
$exists: true
},
...(role ? { role: { $in: role } } : {})
},
...(role ? { role: { $in: role } } : {})
})
undefined,
{ session }
)
.populate<{ group: MemberGroupSchemaType }>('group')
.lean()
).map((item) => item.group),
role ? [] : getTeamDefaultGroup({ teamId })
role ? [] : getTeamDefaultGroup({ teamId, session })
])
).flat();

View File

@@ -115,6 +115,7 @@ try {
}
);
// Delete tmb permission
ResourcePermissionSchema.index({
resourceType: 1,
teamId: 1,

View File

@@ -78,9 +78,6 @@ const UserSchema = new Schema({
});
try {
// login
UserSchema.index({ username: 1 });
// Admin charts
UserSchema.index({ createTime: -1 });
} catch (error) {

View File

@@ -23,10 +23,6 @@ const TeamMemberSchema = new Schema({
type: String,
default: 'Member'
},
role: {
type: String
// enum: Object.keys(TeamMemberRoleMap) // disable enum validation for old data
},
status: {
type: String,
enum: Object.keys(TeamMemberStatusMap)
@@ -38,6 +34,12 @@ const TeamMemberSchema = new Schema({
defaultTeam: {
type: Boolean,
default: false
},
// Abandoned
role: {
type: String
// enum: Object.keys(TeamMemberRoleMap) // disable enum validation for old data
}
});

View File

@@ -1,9 +1,27 @@
import TurndownService from 'turndown';
import { ImageType } from '../readFile/type';
import { matchMdImgTextAndUpload } from '@fastgpt/global/common/string/markdown';
import { getNanoid } from '@fastgpt/global/common/string/tools';
// @ts-ignore
const turndownPluginGfm = require('joplin-turndown-plugin-gfm');
/**
 * Replace inline base64 images in HTML with lightweight placeholder ids,
 * so the (potentially huge) base64 payloads don't travel through the
 * HTML->markdown conversion and occupy memory.
 * @param htmlContent - raw HTML possibly containing `src="data:...;base64,..."`
 * @returns processedHtml with each data-URI src replaced by a unique
 *          `IMAGE_<nanoid>_IMAGE` token, plus the extracted images
 *          (uuid / base64 / mime) for later re-insertion.
 */
const processBase64Images = (htmlContent: string) => {
  // Match src attributes quoted with either " or ' (both are valid HTML);
  // capture the quote char so the replacement preserves it.
  const base64Regex = /src=(["'])data:([^;"']+);base64,([^"']+)\1/g;
  const images: ImageType[] = [];

  const processedHtml = htmlContent.replace(base64Regex, (match, quote, mime, base64Data) => {
    const uuid = `IMAGE_${getNanoid(12)}_IMAGE`;
    images.push({
      uuid,
      base64: base64Data,
      mime
    });
    return `src=${quote}${uuid}${quote}`;
  });

  return { processedHtml, images };
};
export const html2md = (
html: string
): {
@@ -25,11 +43,14 @@ export const html2md = (
turndownService.remove(['i', 'script', 'iframe', 'style']);
turndownService.use(turndownPluginGfm.gfm);
const { text, imageList } = matchMdImgTextAndUpload(html);
// Base64 img to id, otherwise it will occupy memory when going to md
const { processedHtml, images } = processBase64Images(html);
const md = turndownService.turndown(processedHtml);
const { text, imageList } = matchMdImgTextAndUpload(md);
return {
rawText: turndownService.turndown(text),
imageList
rawText: text,
imageList: [...images, ...imageList]
};
} catch (error) {
console.log('html 2 markdown error', error);

View File

@@ -24,7 +24,11 @@ export const readFileRawText = ({ buffer, encoding }: ReadRawTextByBuffer): Read
return buffer.toString(encoding as BufferEncoding);
}
return iconv.decode(buffer, encoding);
if (encoding) {
return iconv.decode(buffer, encoding);
}
return buffer.toString('utf-8');
} catch (error) {
return buffer.toString('utf-8');
}

View File

@@ -1,31 +1,31 @@
{
"action": "operate",
"confirm_delete_group": "Confirm to delete group?",
"confirm_leave_team": "Confirmed to leave the team? \n \nAfter you log out, all your resources in the team (applications, knowledge bases, folders, managed groups, etc.) will be transferred to the team owner.",
"confirm_delete_org": "Confirm to delete organization?",
"confirm_delete_member": "Confirm to delete member?",
"confirm_delete_org": "Confirm to delete organization?",
"confirm_leave_team": "Confirm to leave the team? \nAfter exiting, all your resources in the team are transferred to the team owner.",
"create_group": "Create group",
"delete": "delete",
"edit_info": "Edit information",
"group": "group",
"group_name": "Group name",
"org": "organization",
"org_name": "Organization name",
"org_description": "Organization description",
"create_org": "Create organization",
"create_sub_org": "Create sub-organization",
"edit_org_info": "Edit organization information",
"move_org": "Move organization",
"move_member": "Move member",
"delete": "delete",
"delete_org": "Delete organization",
"remark": "remark",
"edit_info": "Edit information",
"edit_org_info": "Edit organization information",
"group": "group",
"group_name": "Group name",
"label_sync": "Tag sync",
"leave_team_failed": "Failed to leave the team",
"manage_member": "Managing members",
"member": "member",
"member_group": "Belonging to member group",
"move_member": "Move member",
"move_org": "Move organization",
"org": "organization",
"org_description": "Organization description",
"org_name": "Organization name",
"owner": "owner",
"permission": "Permissions",
"remark": "remark",
"remove_tip": "Confirm to remove {{username}} from the team?",
"retain_admin_permissions": "Keep administrator rights",
"search_member_group_name": "Search member/group name",

View File

@@ -3,7 +3,7 @@
"confirm_delete_group": "确认删除群组?",
"confirm_delete_member": "确认删除成员?",
"confirm_delete_org": "确认删除该部门?",
"confirm_leave_team": "确认离开该团队? \n 退出后,您在该团队所有的资源 应用、知识库、文件夹、管理的群组等)均转让给团队所有者。",
"confirm_leave_team": "确认离开该团队? \n退出后您在该团队所有的资源均转让给团队所有者。",
"create_group": "创建群组",
"create_org": "创建部门",
"create_sub_org": "创建子部门",

View File

@@ -3,7 +3,7 @@
"confirm_delete_group": "確認刪除群組?",
"confirm_delete_member": "確認刪除成員?",
"confirm_delete_org": "確認刪除該部門?",
"confirm_leave_team": "確認離開該團隊? \n \n退出後您在該團隊所有的資源 應用程式、知識庫、資料夾、管理的群組等)均轉讓給團隊所有者。",
"confirm_leave_team": "確認離開該團隊? \n退出後您在該團隊所有的資源轉讓給團隊所有者。",
"create_group": "建立群組",
"create_org": "創建部門",
"create_sub_org": "創建子部門",