add env to check internal ip (#4187)
* fix: ts
* update jieba package
* add env to check internal ip
* package
* fix: jieba
* reset package
* update config
* fix: jieba package
* init shell
* init version
* change team reload
This commit is contained in:
3
packages/service/common/string/jieba/dict.json
Normal file
3
packages/service/common/string/jieba/dict.json
Normal file
File diff suppressed because one or more lines are too long
@@ -1,13 +1,13 @@
|
||||
import { Jieba } from '@node-rs/jieba';
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
|
||||
// 使用 require.resolve 获取包的路径,然后拼接字典文件路径
|
||||
const jiebaPath = path.dirname(require.resolve('@node-rs/jieba/package.json'));
|
||||
const dictPath = path.join(jiebaPath, 'dict.txt');
|
||||
let jieba: Jieba | undefined;
|
||||
|
||||
// 使用正确的文件路径加载字典
|
||||
const jieba = Jieba.withDict(fs.readFileSync(dictPath));
|
||||
(async () => {
|
||||
const dictData = await import('./dict.json');
|
||||
// @ts-ignore
|
||||
const dictBuffer = Buffer.from(dictData.dict?.replace(/\\n/g, '\n'), 'utf-8');
|
||||
jieba = Jieba.withDict(dictBuffer);
|
||||
})();
|
||||
|
||||
const stopWords = new Set([
|
||||
'--',
|
||||
@@ -1519,7 +1519,9 @@ const stopWords = new Set([
|
||||
]);
|
||||
|
||||
export async function jiebaSplit({ text }: { text: string }) {
|
||||
const tokens = (await jieba.cutAsync(text, true)) as string[];
|
||||
text = text.replace(/[#*`_~>[\](){}|]/g, '').replace(/\S*https?\S*/gi, '');
|
||||
|
||||
const tokens = (await jieba!.cutAsync(text, true)) as string[];
|
||||
|
||||
return (
|
||||
tokens
|
||||
@@ -30,6 +30,8 @@ export const isInternalAddress = (url: string): boolean => {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (process.env.CHECK_INTERNAL_IP !== 'true') return false;
|
||||
|
||||
// For IP addresses, check if they are internal
|
||||
const ipv4Pattern = /^(\d{1,3}\.){3}\d{1,3}$/;
|
||||
if (!ipv4Pattern.test(hostname)) {
|
||||
|
||||
@@ -41,7 +41,7 @@ try {
|
||||
}
|
||||
);
|
||||
DatasetDataTextSchema.index({ teamId: 1, datasetId: 1, collectionId: 1 });
|
||||
DatasetDataTextSchema.index({ dataId: 1 }, { unique: true });
|
||||
DatasetDataTextSchema.index({ dataId: 'hashed' });
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
}
|
||||
|
||||
@@ -86,7 +86,8 @@ const DatasetDataSchema = new Schema({
|
||||
|
||||
// Abandon
|
||||
fullTextToken: String,
|
||||
initFullText: Boolean
|
||||
initFullText: Boolean,
|
||||
initJieba: Boolean
|
||||
});
|
||||
|
||||
try {
|
||||
@@ -103,6 +104,9 @@ try {
|
||||
DatasetDataSchema.index({ updateTime: 1 });
|
||||
// rebuild data
|
||||
DatasetDataSchema.index({ rebuilding: 1, teamId: 1, datasetId: 1 });
|
||||
|
||||
// 为查询 initJieba 字段不存在的数据添加索引
|
||||
DatasetDataSchema.index({ initJieba: 1 }, { sparse: true });
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
}
|
||||
|
||||
@@ -16,7 +16,7 @@ import { reRankRecall } from '../../../core/ai/rerank';
|
||||
import { countPromptTokens } from '../../../common/string/tiktoken/index';
|
||||
import { datasetSearchResultConcat } from '@fastgpt/global/core/dataset/search/utils';
|
||||
import { hashStr } from '@fastgpt/global/common/string/tools';
|
||||
import { jiebaSplit } from '../../../common/string/jieba';
|
||||
import { jiebaSplit } from '../../../common/string/jieba/index';
|
||||
import { getCollectionSourceData } from '@fastgpt/global/core/dataset/collection/utils';
|
||||
import { Types } from '../../../common/mongo';
|
||||
import json5 from 'json5';
|
||||
|
||||
Reference in New Issue
Block a user