perf: file encoding;perf: leave team code;@c121914yu perf: full text search code (#3528)

* perf: text encoding

* perf: leave team code

* perf: full text search code

* fix: http status

* perf: embedding search and vector avatar
This commit is contained in:
Archer
2025-01-05 14:40:02 +08:00
committed by archer
parent 5465ca642f
commit 2bf1fce32a
24 changed files with 345 additions and 100 deletions

View File

@@ -4,7 +4,7 @@ import fsp from 'fs/promises';
import fs from 'fs';
import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
import { MongoChatFileSchema, MongoDatasetFileSchema } from './schema';
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
import { detectFileEncoding, detectFileEncodingByPath } from '@fastgpt/global/common/file/tools';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import { MongoRawTextBuffer } from '../../buffer/rawText/schema';
import { readRawContentByFileBuffer } from '../read/utils';
@@ -36,7 +36,6 @@ export async function uploadFile({
path,
filename,
contentType,
encoding,
metadata = {}
}: {
bucketName: `${BucketNameEnum}`;
@@ -45,7 +44,6 @@ export async function uploadFile({
path: string;
filename: string;
contentType?: string;
encoding: string;
metadata?: Record<string, any>;
}) {
if (!path) return Promise.reject(`filePath is empty`);
@@ -59,7 +57,7 @@ export async function uploadFile({
// Add default metadata
metadata.teamId = teamId;
metadata.uid = uid;
metadata.encoding = encoding;
metadata.encoding = await detectFileEncodingByPath(path);
// create a gridfs bucket
const bucket = getGridBucket(bucketName);

View File

@@ -22,7 +22,6 @@ export function useReqFrequencyLimit(seconds: number, limit: number, force = fal
expiredTime: addSeconds(new Date(), seconds)
});
} catch (_) {
res.status(429);
jsonRes(res, {
code: 429,
error: ERROR_ENUM.tooManyRequest

View File

@@ -33,8 +33,7 @@ export const jsonRes = <T = any>(
addLog.error(`Api response error: ${url}`, ERROR_RESPONSE[errResponseKey]);
res.status(ERROR_RESPONSE[errResponseKey].code);
return res.json(ERROR_RESPONSE[errResponseKey]);
return res.status(code).json(ERROR_RESPONSE[errResponseKey]);
}
// another error

View File

@@ -25,7 +25,7 @@ const DatasetDataTextSchema = new Schema({
required: true
},
dataId: {
type: String,
type: Schema.Types.ObjectId,
ref: DatasetDataCollectionName,
required: true
},
@@ -37,7 +37,7 @@ const DatasetDataTextSchema = new Schema({
try {
DatasetDataTextSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
DatasetDataTextSchema.index({ dataId: 'hashed' });
DatasetDataTextSchema.index({ dataId: 1 }, { unique: true });
} catch (error) {
console.log(error);
}

View File

@@ -39,10 +39,6 @@ const DatasetDataSchema = new Schema({
type: String,
default: ''
},
fullTextToken: {
type: String,
default: ''
},
indexes: {
type: [
{
@@ -72,7 +68,13 @@ const DatasetDataSchema = new Schema({
default: 0
},
rebuilding: Boolean,
inited: Boolean
// Abandoned
fullTextToken: {
type: String,
default: ''
},
initFullText: Boolean
});
try {
@@ -85,13 +87,14 @@ try {
updateTime: -1
});
// full text index
DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
// DatasetDataSchema.index({ teamId: 1, datasetId: 1, fullTextToken: 'text' });
// Recall vectors after data matching
DatasetDataSchema.index({ teamId: 1, datasetId: 1, collectionId: 1, 'indexes.dataId': 1 });
DatasetDataSchema.index({ updateTime: 1 });
// rebuild data
DatasetDataSchema.index({ rebuilding: 1, teamId: 1, datasetId: 1 });
DatasetDataSchema.index({ inited: 'hashed' });
DatasetDataSchema.index({ initFullText: 1 });
} catch (error) {
console.log(error);
}

View File

@@ -289,20 +289,22 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
).lean()
]);
const formatResult = dataList
.map((data, index) => {
const collection = collections.find((col) => String(col._id) === String(data.collectionId));
const formatResult = results
.map((item, index) => {
const collection = collections.find((col) => String(col._id) === String(item.collectionId));
if (!collection) {
console.log('Collection is not found', data);
console.log('Collection is not found', item);
return;
}
const data = dataList.find((data) =>
data.indexes.some((index) => index.dataId === item.id)
);
if (!data) {
console.log('Data is not found', item);
return;
}
// add score to data(It's already sorted. The first one is the one with the most points)
const dataIdList = data.indexes.map((item) => item.dataId);
const maxScoreResult = results.find((item) => {
return dataIdList.includes(item.id);
});
const score = maxScoreResult?.score || 0;
const score = item?.score || 0;
const result: SearchDataResponseItemType = {
id: String(data._id),
@@ -320,8 +322,6 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
})
.filter(Boolean) as SearchDataResponseItemType[];
formatResult.sort((a, b) => b.score[0].value - a.score[0].value);
return {
embeddingRecallResults: formatResult,
tokens
@@ -411,22 +411,6 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
'_id name fileId rawLink externalFileId externalFileUrl',
{ ...readFromSecondary }
).lean();
// const [dataList, collections] = await Promise.all([
// MongoDatasetData.find(
// {
// _id: { $in: searchResults.map((item) => item.dataId) }
// },
// '_id datasetId collectionId updateTime q a chunkIndex indexes',
// { ...readFromSecondary }
// ).lean(),
// MongoDatasetCollection.find(
// {
// _id: { $in: searchResults.map((item) => item.collectionId) }
// },
// '_id name fileId rawLink externalFileId externalFileUrl',
// { ...readFromSecondary }
// ).lean()
// ]);
return {
fullTextRecallResults: searchResults
@@ -439,9 +423,6 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
return;
}
// const score =
// searchResults.find((item) => String(item.dataId) === String(data._id))?.score || 0;
return {
id: String(data._id),
datasetId: String(data.datasetId),
@@ -459,6 +440,135 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
tokenLen: 0
};
};
/**
 * Full-text recall that queries `MongoDatasetDataText` and then resolves the
 * matching data rows and their collections.
 *
 * Each dataset in `datasetIds` is searched separately and contributes at most
 * `limit` hits, ordered by MongoDB text score. The per-dataset hit lists are
 * concatenated in dataset order (they are not re-sorted across datasets), and
 * the `index` stored in each score entry is the position in that concatenated
 * list.
 *
 * @param query - Raw user query; it is tokenized with `jiebaSplit` before being
 *   used as the `$text` search string.
 * @param limit - Maximum number of hits per dataset. `0` short-circuits to an
 *   empty result without touching the database.
 * @param filterCollectionIdList - Optional allow-list of collection ids. When
 *   provided (even if empty) only these collections can match.
 * @param forbidCollectionIdList - Collection ids that must never match. Ignored
 *   when empty.
 * @returns The formatted hits plus `tokenLen`, which is always `0` here.
 */
const fullTextRecall2 = async ({
  query,
  limit,
  filterCollectionIdList,
  forbidCollectionIdList
}: {
  query: string;
  limit: number;
  filterCollectionIdList?: string[];
  forbidCollectionIdList: string[];
}): Promise<{
  fullTextRecallResults: SearchDataResponseItemType[];
  tokenLen: number;
}> => {
  if (limit === 0) {
    return {
      fullTextRecallResults: [],
      tokenLen: 0
    };
  }

  const toObjectIds = (ids: string[]) => ids.map((id) => new Types.ObjectId(id));

  // Build ONE `collectionId` condition. Spreading two separate `collectionId`
  // keys into `$match` lets the later `$nin` object replace the earlier `$in`
  // object, which silently drops the allow-list whenever both lists are set.
  const collectionIdCondition = {
    ...(filterCollectionIdList ? { $in: toObjectIds(filterCollectionIdList) } : {}),
    ...(forbidCollectionIdList && forbidCollectionIdList.length > 0
      ? { $nin: toObjectIds(forbidCollectionIdList) }
      : {})
  };
  const collectionIdMatch =
    Object.keys(collectionIdCondition).length > 0 ? { collectionId: collectionIdCondition } : {};

  // The tokenized query is identical for every dataset, so compute it once.
  const searchText = jiebaSplit({ text: query });

  const searchResults = (
    await Promise.all(
      datasetIds.map(async (id) => {
        return MongoDatasetDataText.aggregate(
          [
            {
              $match: {
                teamId: new Types.ObjectId(teamId),
                datasetId: new Types.ObjectId(id),
                $text: { $search: searchText },
                ...collectionIdMatch
              }
            },
            {
              $sort: {
                score: { $meta: 'textScore' }
              }
            },
            {
              $limit: limit
            },
            {
              $project: {
                _id: 1,
                collectionId: 1,
                dataId: 1,
                score: { $meta: 'textScore' }
              }
            }
          ],
          {
            ...readFromSecondary
          }
        );
      })
    )
  ).flat() as (DatasetDataTextSchemaType & { score: number })[];

  // Get data and collections
  const [dataList, collections] = await Promise.all([
    MongoDatasetData.find(
      {
        _id: { $in: searchResults.map((item) => item.dataId) }
      },
      '_id datasetId collectionId updateTime q a chunkIndex indexes',
      { ...readFromSecondary }
    ).lean(),
    MongoDatasetCollection.find(
      {
        _id: { $in: searchResults.map((item) => item.collectionId) }
      },
      '_id name fileId rawLink externalFileId externalFileUrl',
      { ...readFromSecondary }
    ).lean()
  ]);

  // Index both lookups by stringified id so each hit resolves in O(1)
  // instead of scanning the arrays once per hit.
  const collectionById = new Map<string, (typeof collections)[number]>();
  for (const col of collections) {
    collectionById.set(String(col._id), col);
  }
  const dataById = new Map<string, (typeof dataList)[number]>();
  for (const data of dataList) {
    dataById.set(String(data._id), data);
  }

  return {
    fullTextRecallResults: searchResults
      .map((item, index) => {
        const collection = collectionById.get(String(item.collectionId));
        if (!collection) {
          console.log('Collection is not found', item);
          return;
        }

        const data = dataById.get(String(item.dataId));
        if (!data) {
          console.log('Data is not found', item);
          return;
        }

        return {
          id: String(data._id),
          datasetId: String(data.datasetId),
          collectionId: String(data.collectionId),
          updateTime: data.updateTime,
          q: data.q,
          a: data.a,
          chunkIndex: data.chunkIndex,
          indexes: data.indexes,
          ...getCollectionSourceData(collection),
          score: [
            {
              type: SearchScoreTypeEnum.fullText,
              value: item.score || 0,
              index
            }
          ]
        };
      })
      // Hits whose collection or data row could not be resolved were mapped to undefined.
      .filter(Boolean) as SearchDataResponseItemType[],
    tokenLen: 0
  };
};
const reRankSearchResult = async ({
data,
query
@@ -526,7 +636,7 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
forbidCollectionIdList,
filterCollectionIdList
}),
fullTextRecall({
fullTextRecall2({
query,
limit: fullTextLimit,
filterCollectionIdList,

View File

@@ -47,26 +47,32 @@ export const getTeamDefaultGroup = async ({
export const getGroupsByTmbId = async ({
tmbId,
teamId,
role
role,
session
}: {
tmbId: string;
teamId: string;
role?: `${GroupMemberRole}`[];
session?: ClientSession;
}) =>
(
await Promise.all([
(
await MongoGroupMemberModel.find({
tmbId,
groupId: {
$exists: true
await MongoGroupMemberModel.find(
{
tmbId,
groupId: {
$exists: true
},
...(role ? { role: { $in: role } } : {})
},
...(role ? { role: { $in: role } } : {})
})
undefined,
{ session }
)
.populate<{ group: MemberGroupSchemaType }>('group')
.lean()
).map((item) => item.group),
role ? [] : getTeamDefaultGroup({ teamId })
role ? [] : getTeamDefaultGroup({ teamId, session })
])
).flat();

View File

@@ -115,6 +115,7 @@ try {
}
);
// Delete tmb permission
ResourcePermissionSchema.index({
resourceType: 1,
teamId: 1,

View File

@@ -78,9 +78,6 @@ const UserSchema = new Schema({
});
try {
// login
UserSchema.index({ username: 1 });
// Admin charts
UserSchema.index({ createTime: -1 });
} catch (error) {

View File

@@ -23,10 +23,6 @@ const TeamMemberSchema = new Schema({
type: String,
default: 'Member'
},
role: {
type: String
// enum: Object.keys(TeamMemberRoleMap) // disable enum validation for old data
},
status: {
type: String,
enum: Object.keys(TeamMemberStatusMap)
@@ -38,6 +34,12 @@ const TeamMemberSchema = new Schema({
defaultTeam: {
type: Boolean,
default: false
},
// Abandoned
role: {
type: String
// enum: Object.keys(TeamMemberRoleMap) // disable enum validation for old data
}
});

View File

@@ -1,9 +1,27 @@
import TurndownService from 'turndown';
import { ImageType } from '../readFile/type';
import { matchMdImgTextAndUpload } from '@fastgpt/global/common/string/markdown';
import { getNanoid } from '@fastgpt/global/common/string/tools';
// @ts-ignore
const turndownPluginGfm = require('joplin-turndown-plugin-gfm');
/**
 * Replace inline base64 images in an HTML string with short placeholder ids.
 *
 * Every `src` attribute whose value is a `data:<mime>;base64,<payload>` URL is
 * rewritten to `src="IMAGE_<id>_IMAGE"`, and the removed payload is collected
 * together with its mime type and placeholder id.
 *
 * @param htmlContent - HTML that may embed base64 images.
 * @returns `processedHtml` with the payloads replaced by placeholders, and
 *   `images` listing the extracted payloads in the order they were matched.
 */
const processBase64Images = (htmlContent: string) => {
  // HTML attribute values may be wrapped in double OR single quotes. Group 1
  // captures the opening quote and `\1` requires the same closing quote; the
  // payload is "anything up to that closing quote", so double-quoted input is
  // matched exactly as before while single-quoted input is now handled too.
  const base64Regex = /src=(["'])data:([^;]+);base64,((?:(?!\1)[\s\S])+)\1/g;
  const images: ImageType[] = [];

  const processedHtml = htmlContent.replace(
    base64Regex,
    (_match: string, _quote: string, mime: string, base64Data: string) => {
      const uuid = `IMAGE_${getNanoid(12)}_IMAGE`;
      images.push({
        uuid,
        base64: base64Data,
        mime
      });
      return `src="${uuid}"`;
    }
  );

  return { processedHtml, images };
};
export const html2md = (
html: string
): {
@@ -25,11 +43,14 @@ export const html2md = (
turndownService.remove(['i', 'script', 'iframe', 'style']);
turndownService.use(turndownPluginGfm.gfm);
const { text, imageList } = matchMdImgTextAndUpload(html);
// Base64 img to id, otherwise it will occupy memory when going to md
const { processedHtml, images } = processBase64Images(html);
const md = turndownService.turndown(processedHtml);
const { text, imageList } = matchMdImgTextAndUpload(md);
return {
rawText: turndownService.turndown(text),
imageList
rawText: text,
imageList: [...images, ...imageList]
};
} catch (error) {
console.log('html 2 markdown error', error);

View File

@@ -24,7 +24,11 @@ export const readFileRawText = ({ buffer, encoding }: ReadRawTextByBuffer): Read
return buffer.toString(encoding as BufferEncoding);
}
return iconv.decode(buffer, encoding);
if (encoding) {
return iconv.decode(buffer, encoding);
}
return buffer.toString('utf-8');
} catch (error) {
return buffer.toString('utf-8');
}