perf: file encoding;perf: leave team code;@c121914yu perf: full text search code (#3528)
* perf: text encoding * perf: leave team code * perf: full text search code * fix: http status * perf: embedding search and vector avatar
This commit is contained in:
@@ -2,6 +2,7 @@ import { detect } from 'jschardet';
|
||||
import { documentFileType, imageFileType } from './constants';
|
||||
import { ChatFileTypeEnum } from '../../core/chat/constants';
|
||||
import { UserChatItemValueItemType } from '../../core/chat/type';
|
||||
import * as fs from 'fs';
|
||||
|
||||
export const formatFileSize = (bytes: number): string => {
|
||||
if (bytes === 0) return '0 B';
|
||||
@@ -16,6 +17,22 @@ export const formatFileSize = (bytes: number): string => {
|
||||
/**
 * Guess the character encoding of an in-memory buffer.
 *
 * Only the first 200 bytes are handed to `detect`.
 *
 * @param buffer - Raw file content.
 * @returns The lower-cased encoding name reported by `detect`, or `undefined`
 *          when no encoding is reported.
 */
export const detectFileEncoding = (buffer: Buffer) => {
  // Encoding names are ASCII identifiers, so lower-case them with the
  // locale-independent `toLowerCase`. `toLocaleLowerCase` follows the host
  // locale and, under Turkish/Azeri locales, maps "I" to a dotless "ı",
  // turning e.g. "ISO-8859-1" into an unknown encoding name.
  // `subarray` replaces the deprecated `Buffer.slice` (same view semantics).
  return detect(buffer.subarray(0, 200))?.encoding?.toLowerCase();
};
|
||||
/**
 * Guess the character encoding of a file on disk by sampling its head.
 *
 * At most the first 64KB of the file are read; the file handle is always
 * closed, even when reading or detection throws.
 *
 * @param path - Path of the file to inspect.
 * @returns The lower-cased encoding name reported by `detect`, or `undefined`
 *          when no encoding is reported.
 */
export const detectFileEncodingByPath = async (path: string) => {
  // Sample size: 64KB file head
  const MAX_BYTES = 64 * 1024;
  const buffer = Buffer.alloc(MAX_BYTES);

  const fd = await fs.promises.open(path, 'r');
  try {
    // Read from offset 0 of the file into the start of the buffer
    const { bytesRead } = await fd.read(buffer, 0, MAX_BYTES, 0);
    // Keep only the bytes actually read so the zero-filled tail of the
    // buffer does not take part in detection.
    // `subarray` replaces the deprecated `Buffer.slice` (same view semantics).
    const head = buffer.subarray(0, bytesRead);

    // Locale-independent lower-casing: `toLocaleLowerCase` would corrupt
    // names such as "ISO-8859-1" under Turkish/Azeri host locales.
    return detect(head)?.encoding?.toLowerCase();
  } finally {
    await fd.close();
  }
};
|
||||
|
||||
// Url => user upload file type
|
||||
export const parseUrlToFileType = (url: string): UserChatItemValueItemType['file'] | undefined => {
|
||||
|
||||
@@ -4,7 +4,7 @@ import fsp from 'fs/promises';
|
||||
import fs from 'fs';
|
||||
import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
|
||||
import { MongoChatFileSchema, MongoDatasetFileSchema } from './schema';
|
||||
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
|
||||
import { detectFileEncoding, detectFileEncodingByPath } from '@fastgpt/global/common/file/tools';
|
||||
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
|
||||
import { MongoRawTextBuffer } from '../../buffer/rawText/schema';
|
||||
import { readRawContentByFileBuffer } from '../read/utils';
|
||||
@@ -36,7 +36,6 @@ export async function uploadFile({
|
||||
path,
|
||||
filename,
|
||||
contentType,
|
||||
encoding,
|
||||
metadata = {}
|
||||
}: {
|
||||
bucketName: `${BucketNameEnum}`;
|
||||
@@ -45,7 +44,6 @@ export async function uploadFile({
|
||||
path: string;
|
||||
filename: string;
|
||||
contentType?: string;
|
||||
encoding: string;
|
||||
metadata?: Record<string, any>;
|
||||
}) {
|
||||
if (!path) return Promise.reject(`filePath is empty`);
|
||||
@@ -59,7 +57,7 @@ export async function uploadFile({
|
||||
// Add default metadata
|
||||
metadata.teamId = teamId;
|
||||
metadata.uid = uid;
|
||||
metadata.encoding = encoding;
|
||||
metadata.encoding = await detectFileEncodingByPath(path);
|
||||
|
||||
// create a gridfs bucket
|
||||
const bucket = getGridBucket(bucketName);
|
||||
|
||||
@@ -22,7 +22,6 @@ export function useReqFrequencyLimit(seconds: number, limit: number, force = fal
|
||||
expiredTime: addSeconds(new Date(), seconds)
|
||||
});
|
||||
} catch (_) {
|
||||
res.status(429);
|
||||
jsonRes(res, {
|
||||
code: 429,
|
||||
error: ERROR_ENUM.tooManyRequest
|
||||
|
||||
@@ -33,8 +33,7 @@ export const jsonRes = <T = any>(
|
||||
|
||||
addLog.error(`Api response error: ${url}`, ERROR_RESPONSE[errResponseKey]);
|
||||
|
||||
res.status(ERROR_RESPONSE[errResponseKey].code);
|
||||
return res.json(ERROR_RESPONSE[errResponseKey]);
|
||||
return res.status(code).json(ERROR_RESPONSE[errResponseKey]);
|
||||
}
|
||||
|
||||
// another error
|
||||
|
||||
@@ -25,7 +25,7 @@ const DatasetDataTextSchema = new Schema({
|
||||
required: true
|
||||
},
|
||||
dataId: {
|
||||
type: String,
|
||||
type: Schema.Types.ObjectId,
|
||||
ref: DatasetDataCollectionName,
|
||||
required: true
|
||||
},
|
||||
@@ -37,7 +37,7 @@ const DatasetDataTextSchema = new Schema({
|
||||
|
||||
try {
|
||||
DatasetDataTextSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
|
||||
DatasetDataTextSchema.index({ dataId: 'hashed' });
|
||||
DatasetDataTextSchema.index({ dataId: 1 }, { unique: true });
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
}
|
||||
|
||||
@@ -39,10 +39,6 @@ const DatasetDataSchema = new Schema({
|
||||
type: String,
|
||||
default: ''
|
||||
},
|
||||
fullTextToken: {
|
||||
type: String,
|
||||
default: ''
|
||||
},
|
||||
indexes: {
|
||||
type: [
|
||||
{
|
||||
@@ -72,7 +68,13 @@ const DatasetDataSchema = new Schema({
|
||||
default: 0
|
||||
},
|
||||
rebuilding: Boolean,
|
||||
inited: Boolean
|
||||
|
||||
// Abandoned
|
||||
fullTextToken: {
|
||||
type: String,
|
||||
default: ''
|
||||
},
|
||||
initFullText: Boolean
|
||||
});
|
||||
|
||||
try {
|
||||
@@ -85,13 +87,14 @@ try {
|
||||
updateTime: -1
|
||||
});
|
||||
// full text index
|
||||
DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
|
||||
// DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
|
||||
// Recall vectors after data matching
|
||||
DatasetDataSchema.index({ teamId: 1, datasetId: 1, collectionId: 1, 'indexes.dataId': 1 });
|
||||
DatasetDataSchema.index({ updateTime: 1 });
|
||||
// rebuild data
|
||||
DatasetDataSchema.index({ rebuilding: 1, teamId: 1, datasetId: 1 });
|
||||
DatasetDataSchema.index({ inited: 'hashed' });
|
||||
|
||||
DatasetDataSchema.index({ initFullText: 1 });
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
}
|
||||
|
||||
@@ -289,20 +289,22 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
|
||||
).lean()
|
||||
]);
|
||||
|
||||
const formatResult = dataList
|
||||
.map((data, index) => {
|
||||
const collection = collections.find((col) => String(col._id) === String(data.collectionId));
|
||||
const formatResult = results
|
||||
.map((item, index) => {
|
||||
const collection = collections.find((col) => String(col._id) === String(item.collectionId));
|
||||
if (!collection) {
|
||||
console.log('Collection is not found', data);
|
||||
console.log('Collection is not found', item);
|
||||
return;
|
||||
}
|
||||
const data = dataList.find((data) =>
|
||||
data.indexes.some((index) => index.dataId === item.id)
|
||||
);
|
||||
if (!data) {
|
||||
console.log('Data is not found', item);
|
||||
return;
|
||||
}
|
||||
|
||||
// add score to data(It's already sorted. The first one is the one with the most points)
|
||||
const dataIdList = data.indexes.map((item) => item.dataId);
|
||||
const maxScoreResult = results.find((item) => {
|
||||
return dataIdList.includes(item.id);
|
||||
});
|
||||
const score = maxScoreResult?.score || 0;
|
||||
const score = item?.score || 0;
|
||||
|
||||
const result: SearchDataResponseItemType = {
|
||||
id: String(data._id),
|
||||
@@ -320,8 +322,6 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
|
||||
})
|
||||
.filter(Boolean) as SearchDataResponseItemType[];
|
||||
|
||||
formatResult.sort((a, b) => b.score[0].value - a.score[0].value);
|
||||
|
||||
return {
|
||||
embeddingRecallResults: formatResult,
|
||||
tokens
|
||||
@@ -411,22 +411,6 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
|
||||
'_id name fileId rawLink externalFileId externalFileUrl',
|
||||
{ ...readFromSecondary }
|
||||
).lean();
|
||||
// const [dataList, collections] = await Promise.all([
|
||||
// MongoDatasetData.find(
|
||||
// {
|
||||
// _id: { $in: searchResults.map((item) => item.dataId) }
|
||||
// },
|
||||
// '_id datasetId collectionId updateTime q a chunkIndex indexes',
|
||||
// { ...readFromSecondary }
|
||||
// ).lean(),
|
||||
// MongoDatasetCollection.find(
|
||||
// {
|
||||
// _id: { $in: searchResults.map((item) => item.collectionId) }
|
||||
// },
|
||||
// '_id name fileId rawLink externalFileId externalFileUrl',
|
||||
// { ...readFromSecondary }
|
||||
// ).lean()
|
||||
// ]);
|
||||
|
||||
return {
|
||||
fullTextRecallResults: searchResults
|
||||
@@ -439,9 +423,6 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
|
||||
return;
|
||||
}
|
||||
|
||||
// const score =
|
||||
// searchResults.find((item) => String(item.dataId) === String(data._id))?.score || 0;
|
||||
|
||||
return {
|
||||
id: String(data._id),
|
||||
datasetId: String(data.datasetId),
|
||||
@@ -459,6 +440,135 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
|
||||
tokenLen: 0
|
||||
};
|
||||
};
|
||||
/**
 * Full-text recall backed by the `MongoDatasetDataText` collection.
 *
 * For every id in the enclosing `datasetIds`, runs a `$text` search limited to
 * `limit` hits, then loads the matching dataset data rows and collections and
 * formats each hit as a `SearchDataResponseItemType` carrying a full-text score.
 * Hits whose collection or data row cannot be found are logged and dropped.
 *
 * @param query - User query; tokenized with `jiebaSplit` before searching.
 * @param limit - Max hits per dataset; `0` short-circuits to an empty result.
 * @param filterCollectionIdList - Optional allow-list of collection ids.
 * @param forbidCollectionIdList - Deny-list of collection ids (ignored when empty).
 * @returns The formatted hits and a `tokenLen` of 0.
 */
const fullTextRecall2 = async ({
  query,
  limit,
  filterCollectionIdList,
  forbidCollectionIdList
}: {
  query: string;
  limit: number;
  filterCollectionIdList?: string[];
  forbidCollectionIdList: string[];
}): Promise<{
  fullTextRecallResults: SearchDataResponseItemType[];
  tokenLen: number;
}> => {
  if (limit === 0) {
    return {
      fullTextRecallResults: [],
      tokenLen: 0
    };
  }

  // Build ONE `collectionId` condition. Spreading two separate
  // `{ collectionId: ... }` objects into `$match` lets the later key overwrite
  // the earlier one, so the allow-list was silently lost whenever a deny-list
  // was also supplied.
  const collectionIdCondition = {
    ...(filterCollectionIdList
      ? { $in: filterCollectionIdList.map((id) => new Types.ObjectId(id)) }
      : {}),
    ...(forbidCollectionIdList && forbidCollectionIdList.length > 0
      ? { $nin: forbidCollectionIdList.map((id) => new Types.ObjectId(id)) }
      : {})
  };
  const collectionIdMatch =
    Object.keys(collectionIdCondition).length > 0 ? { collectionId: collectionIdCondition } : {};

  // The search text is identical for every dataset: tokenize it once.
  const searchText = jiebaSplit({ text: query });

  const searchResults = (
    await Promise.all(
      datasetIds.map(async (id) => {
        return MongoDatasetDataText.aggregate(
          [
            {
              $match: {
                teamId: new Types.ObjectId(teamId),
                datasetId: new Types.ObjectId(id),
                $text: { $search: searchText },
                ...collectionIdMatch
              }
            },
            {
              $sort: {
                score: { $meta: 'textScore' }
              }
            },
            {
              $limit: limit
            },
            {
              $project: {
                _id: 1,
                collectionId: 1,
                dataId: 1,
                score: { $meta: 'textScore' }
              }
            }
          ],
          {
            ...readFromSecondary
          }
        );
      })
    )
  ).flat() as (DatasetDataTextSchemaType & { score: number })[];

  // Get data and collections
  const [dataList, collections] = await Promise.all([
    MongoDatasetData.find(
      {
        _id: { $in: searchResults.map((item) => item.dataId) }
      },
      '_id datasetId collectionId updateTime q a chunkIndex indexes',
      { ...readFromSecondary }
    ).lean(),
    MongoDatasetCollection.find(
      {
        _id: { $in: searchResults.map((item) => item.collectionId) }
      },
      '_id name fileId rawLink externalFileId externalFileUrl',
      { ...readFromSecondary }
    ).lean()
  ]);

  // Index both lists by stringified `_id` so each hit is resolved with a map
  // lookup instead of a linear scan.
  const collectionById = new Map(collections.map((col) => [String(col._id), col] as const));
  const dataById = new Map(dataList.map((data) => [String(data._id), data] as const));

  return {
    fullTextRecallResults: searchResults
      .map((item, index) => {
        const collection = collectionById.get(String(item.collectionId));
        if (!collection) {
          console.log('Collection is not found', item);
          return;
        }
        const data = dataById.get(String(item.dataId));
        if (!data) {
          console.log('Data is not found', item);
          return;
        }

        return {
          id: String(data._id),
          datasetId: String(data.datasetId),
          collectionId: String(data.collectionId),
          updateTime: data.updateTime,
          q: data.q,
          a: data.a,
          chunkIndex: data.chunkIndex,
          indexes: data.indexes,
          ...getCollectionSourceData(collection),
          score: [
            {
              type: SearchScoreTypeEnum.fullText,
              value: item.score || 0,
              // Position of the hit in the flattened result list (before filtering)
              index
            }
          ]
        };
      })
      .filter(Boolean) as SearchDataResponseItemType[],
    tokenLen: 0
  };
};
|
||||
const reRankSearchResult = async ({
|
||||
data,
|
||||
query
|
||||
@@ -526,7 +636,7 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
|
||||
forbidCollectionIdList,
|
||||
filterCollectionIdList
|
||||
}),
|
||||
fullTextRecall({
|
||||
fullTextRecall2({
|
||||
query,
|
||||
limit: fullTextLimit,
|
||||
filterCollectionIdList,
|
||||
|
||||
@@ -47,26 +47,32 @@ export const getTeamDefaultGroup = async ({
|
||||
export const getGroupsByTmbId = async ({
|
||||
tmbId,
|
||||
teamId,
|
||||
role
|
||||
role,
|
||||
session
|
||||
}: {
|
||||
tmbId: string;
|
||||
teamId: string;
|
||||
role?: `${GroupMemberRole}`[];
|
||||
session?: ClientSession;
|
||||
}) =>
|
||||
(
|
||||
await Promise.all([
|
||||
(
|
||||
await MongoGroupMemberModel.find({
|
||||
tmbId,
|
||||
groupId: {
|
||||
$exists: true
|
||||
await MongoGroupMemberModel.find(
|
||||
{
|
||||
tmbId,
|
||||
groupId: {
|
||||
$exists: true
|
||||
},
|
||||
...(role ? { role: { $in: role } } : {})
|
||||
},
|
||||
...(role ? { role: { $in: role } } : {})
|
||||
})
|
||||
undefined,
|
||||
{ session }
|
||||
)
|
||||
.populate<{ group: MemberGroupSchemaType }>('group')
|
||||
.lean()
|
||||
).map((item) => item.group),
|
||||
role ? [] : getTeamDefaultGroup({ teamId })
|
||||
role ? [] : getTeamDefaultGroup({ teamId, session })
|
||||
])
|
||||
).flat();
|
||||
|
||||
|
||||
@@ -115,6 +115,7 @@ try {
|
||||
}
|
||||
);
|
||||
|
||||
// Delete tmb permission
|
||||
ResourcePermissionSchema.index({
|
||||
resourceType: 1,
|
||||
teamId: 1,
|
||||
|
||||
@@ -78,9 +78,6 @@ const UserSchema = new Schema({
|
||||
});
|
||||
|
||||
try {
|
||||
// login
|
||||
UserSchema.index({ username: 1 });
|
||||
|
||||
// Admin charts
|
||||
UserSchema.index({ createTime: -1 });
|
||||
} catch (error) {
|
||||
|
||||
@@ -23,10 +23,6 @@ const TeamMemberSchema = new Schema({
|
||||
type: String,
|
||||
default: 'Member'
|
||||
},
|
||||
role: {
|
||||
type: String
|
||||
// enum: Object.keys(TeamMemberRoleMap) // disable enum validation for old data
|
||||
},
|
||||
status: {
|
||||
type: String,
|
||||
enum: Object.keys(TeamMemberStatusMap)
|
||||
@@ -38,6 +34,12 @@ const TeamMemberSchema = new Schema({
|
||||
defaultTeam: {
|
||||
type: Boolean,
|
||||
default: false
|
||||
},
|
||||
|
||||
// Abandoned
|
||||
role: {
|
||||
type: String
|
||||
// enum: Object.keys(TeamMemberRoleMap) // disable enum validation for old data
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
@@ -1,9 +1,27 @@
|
||||
import TurndownService from 'turndown';
|
||||
import { ImageType } from '../readFile/type';
|
||||
import { matchMdImgTextAndUpload } from '@fastgpt/global/common/string/markdown';
|
||||
import { getNanoid } from '@fastgpt/global/common/string/tools';
|
||||
// @ts-ignore
|
||||
const turndownPluginGfm = require('joplin-turndown-plugin-gfm');
|
||||
|
||||
/**
 * Replace every inline `src="data:<mime>;base64,<payload>"` attribute in the
 * HTML with a short `IMAGE_<id>_IMAGE` placeholder and collect the extracted
 * payloads, so the large base64 strings are not carried through later steps.
 *
 * @param htmlContent - HTML text that may contain base64-embedded images.
 * @returns The rewritten HTML plus one `{ uuid, base64, mime }` entry per
 *          replaced attribute, in document order.
 */
const processBase64Images = (htmlContent: string) => {
  const images: ImageType[] = [];

  const processedHtml = htmlContent.replace(
    /src="data:([^;]+);base64,([^"]+)"/g,
    (_whole: string, mime: string, base64: string) => {
      // The placeholder doubles as the image's lookup key
      const uuid = `IMAGE_${getNanoid(12)}_IMAGE`;
      images.push({ uuid, base64, mime });
      return `src="${uuid}"`;
    }
  );

  return { processedHtml, images };
};
|
||||
|
||||
export const html2md = (
|
||||
html: string
|
||||
): {
|
||||
@@ -25,11 +43,14 @@ export const html2md = (
|
||||
turndownService.remove(['i', 'script', 'iframe', 'style']);
|
||||
turndownService.use(turndownPluginGfm.gfm);
|
||||
|
||||
const { text, imageList } = matchMdImgTextAndUpload(html);
|
||||
// Base64 img to id, otherwise it will occupy memory when going to md
|
||||
const { processedHtml, images } = processBase64Images(html);
|
||||
const md = turndownService.turndown(processedHtml);
|
||||
const { text, imageList } = matchMdImgTextAndUpload(md);
|
||||
|
||||
return {
|
||||
rawText: turndownService.turndown(text),
|
||||
imageList
|
||||
rawText: text,
|
||||
imageList: [...images, ...imageList]
|
||||
};
|
||||
} catch (error) {
|
||||
console.log('html 2 markdown error', error);
|
||||
|
||||
@@ -24,7 +24,11 @@ export const readFileRawText = ({ buffer, encoding }: ReadRawTextByBuffer): Read
|
||||
return buffer.toString(encoding as BufferEncoding);
|
||||
}
|
||||
|
||||
return iconv.decode(buffer, encoding);
|
||||
if (encoding) {
|
||||
return iconv.decode(buffer, encoding);
|
||||
}
|
||||
|
||||
return buffer.toString('utf-8');
|
||||
} catch (error) {
|
||||
return buffer.toString('utf-8');
|
||||
}
|
||||
|
||||
@@ -1,31 +1,31 @@
|
||||
{
|
||||
"action": "operate",
|
||||
"confirm_delete_group": "Confirm to delete group?",
|
||||
"confirm_leave_team": "Confirmed to leave the team? \n \nAfter you log out, all your resources in the team (applications, knowledge bases, folders, managed groups, etc.) will be transferred to the team owner.",
|
||||
"confirm_delete_org": "Confirm to delete organization?",
|
||||
"confirm_delete_member": "Confirm to delete member?",
|
||||
"confirm_delete_org": "Confirm to delete organization?",
|
||||
"confirm_leave_team": "Are you sure you want to leave the team? \nAfter you leave, all your resources in the team will be transferred to the team owner.",
|
||||
"create_group": "Create group",
|
||||
"delete": "delete",
|
||||
"edit_info": "Edit information",
|
||||
"group": "group",
|
||||
"group_name": "Group name",
|
||||
"org": "organization",
|
||||
"org_name": "Organization name",
|
||||
"org_description": "Organization description",
|
||||
"create_org": "Create organization",
|
||||
"create_sub_org": "Create sub-organization",
|
||||
"edit_org_info": "Edit organization information",
|
||||
"move_org": "Move organization",
|
||||
"move_member": "Move member",
|
||||
"delete": "delete",
|
||||
"delete_org": "Delete organization",
|
||||
"remark": "remark",
|
||||
"edit_info": "Edit information",
|
||||
"edit_org_info": "Edit organization information",
|
||||
"group": "group",
|
||||
"group_name": "Group name",
|
||||
"label_sync": "Tag sync",
|
||||
"leave_team_failed": "Failed to leave the team",
|
||||
"manage_member": "Managing members",
|
||||
"member": "member",
|
||||
"member_group": "Belonging to member group",
|
||||
"move_member": "Move member",
|
||||
"move_org": "Move organization",
|
||||
"org": "organization",
|
||||
"org_description": "Organization description",
|
||||
"org_name": "Organization name",
|
||||
"owner": "owner",
|
||||
"permission": "Permissions",
|
||||
"remark": "remark",
|
||||
"remove_tip": "Confirm to remove {{username}} from the team?",
|
||||
"retain_admin_permissions": "Keep administrator rights",
|
||||
"search_member_group_name": "Search member/group name",
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
"confirm_delete_group": "确认删除群组?",
|
||||
"confirm_delete_member": "确认删除成员?",
|
||||
"confirm_delete_org": "确认删除该部门?",
|
||||
"confirm_leave_team": "确认离开该团队? \n 退出后,您在该团队所有的资源( 应用、知识库、文件夹、管理的群组等)均转让给团队所有者。",
|
||||
"confirm_leave_team": "确认离开该团队? \n退出后,您在该团队所有的资源均转让给团队所有者。",
|
||||
"create_group": "创建群组",
|
||||
"create_org": "创建部门",
|
||||
"create_sub_org": "创建子部门",
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
"confirm_delete_group": "確認刪除群組?",
|
||||
"confirm_delete_member": "確認刪除成員?",
|
||||
"confirm_delete_org": "確認刪除該部門?",
|
||||
"confirm_leave_team": "確認離開該團隊? \n \n退出後,您在該團隊所有的資源( 應用程式、知識庫、資料夾、管理的群組等)均轉讓給團隊所有者。",
|
||||
"confirm_leave_team": "確認離開該團隊? \n退出後,您在該團隊所有的資源轉讓給團隊所有者。",
|
||||
"create_group": "建立群組",
|
||||
"create_org": "創建部門",
|
||||
"create_sub_org": "創建子部門",
|
||||
|
||||
Reference in New Issue
Block a user