add env to check internal ip (#4187)

* fix: ts

* update jieba package

* add env to check internal ip

* package

* fix: jieba

* reset package

* update config

* fix: jieba package

* init shell

* init version

* change team reload
This commit is contained in:
Archer
2025-03-17 18:21:27 +08:00
committed by archer
parent 9d43edb75c
commit 02813f3a47
17 changed files with 121 additions and 26 deletions

File diff suppressed because one or more lines are too long

View File

@@ -1,13 +1,13 @@
import { Jieba } from '@node-rs/jieba';
import fs from 'fs';
import path from 'path';
// Use require.resolve to locate the package directory, then join it with the dictionary file name
const jiebaPath = path.dirname(require.resolve('@node-rs/jieba/package.json'));
const dictPath = path.join(jiebaPath, 'dict.txt');
let jieba: Jieba | undefined;
// Load the dictionary from the resolved file path
const jieba = Jieba.withDict(fs.readFileSync(dictPath));
(async () => {
const dictData = await import('./dict.json');
// @ts-ignore
const dictBuffer = Buffer.from(dictData.dict?.replace(/\\n/g, '\n'), 'utf-8');
jieba = Jieba.withDict(dictBuffer);
})();
const stopWords = new Set([
'--',
@@ -1519,7 +1519,9 @@ const stopWords = new Set([
]);
export async function jiebaSplit({ text }: { text: string }) {
const tokens = (await jieba.cutAsync(text, true)) as string[];
text = text.replace(/[#*`_~>[\](){}|]/g, '').replace(/\S*https?\S*/gi, '');
const tokens = (await jieba!.cutAsync(text, true)) as string[];
return (
tokens

View File

@@ -30,6 +30,8 @@ export const isInternalAddress = (url: string): boolean => {
return true;
}
if (process.env.CHECK_INTERNAL_IP !== 'true') return false;
// For IP addresses, check if they are internal
const ipv4Pattern = /^(\d{1,3}\.){3}\d{1,3}$/;
if (!ipv4Pattern.test(hostname)) {

View File

@@ -41,7 +41,7 @@ try {
}
);
DatasetDataTextSchema.index({ teamId: 1, datasetId: 1, collectionId: 1 });
DatasetDataTextSchema.index({ dataId: 1 }, { unique: true });
DatasetDataTextSchema.index({ dataId: 'hashed' });
} catch (error) {
console.log(error);
}

View File

@@ -86,7 +86,8 @@ const DatasetDataSchema = new Schema({
// Abandon
fullTextToken: String,
initFullText: Boolean
initFullText: Boolean,
initJieba: Boolean
});
try {
@@ -103,6 +104,9 @@ try {
DatasetDataSchema.index({ updateTime: 1 });
// rebuild data
DatasetDataSchema.index({ rebuilding: 1, teamId: 1, datasetId: 1 });
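// Index intended for finding documents where the initJieba field does not exist.
// NOTE(review): a sparse index omits documents that lack the indexed field, so MongoDB
// will not use it for `{ initJieba: { $exists: false } }` unless hinted — confirm this
// index actually serves the intended query.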
DatasetDataSchema.index({ initJieba: 1 }, { sparse: true });
} catch (error) {
console.log(error);
}

View File

@@ -16,7 +16,7 @@ import { reRankRecall } from '../../../core/ai/rerank';
import { countPromptTokens } from '../../../common/string/tiktoken/index';
import { datasetSearchResultConcat } from '@fastgpt/global/core/dataset/search/utils';
import { hashStr } from '@fastgpt/global/common/string/tools';
import { jiebaSplit } from '../../../common/string/jieba';
import { jiebaSplit } from '../../../common/string/jieba/index';
import { getCollectionSourceData } from '@fastgpt/global/core/dataset/collection/utils';
import { Types } from '../../../common/mongo';
import json5 from 'json5';