add env to check internal ip (#4187)

* fix: ts * update jieba package * add env to check internal ip * package * fix: jieba * reset package * update config * fix: jieba package * init shell * init version * change team reload
2025-03-17 18:21:27 +08:00
parent 9d43edb75c
commit 02813f3a47
17 changed files with 121 additions and 26 deletions
--- a/packages/service/common/string/jieba/dict.json
+++ b/packages/service/common/string/jieba/dict.json
--- a/packages/service/common/string/jieba/index.ts
+++ b/packages/service/common/string/jieba/index.ts
@@ -1,13 +1,13 @@
 import { Jieba } from '@node-rs/jieba';
-import fs from 'fs';
-import path from 'path';

-// 使用 require.resolve 获取包的路径，然后拼接字典文件路径
-const jiebaPath = path.dirname(require.resolve('@node-rs/jieba/package.json'));
-const dictPath = path.join(jiebaPath, 'dict.txt');
+let jieba: Jieba | undefined;

-// 使用正确的文件路径加载字典
-const jieba = Jieba.withDict(fs.readFileSync(dictPath));
+// Load the bundled dictionary in the background at module load and build the
+// tokenizer from it. `jieba` stays undefined until this async work finishes.
+(async () => {
+  const dictData = await import('./dict.json');
+  // Replace literal backslash-n sequences in the stored text with real newlines
+  // before handing the bytes to Jieba.
+  // @ts-ignore
+  const dictBuffer = Buffer.from(dictData.dict?.replace(/\\n/g, '\n'), 'utf-8');
+  jieba = Jieba.withDict(dictBuffer);
+})().catch((error) => {
+  // Without a handler, a failed load surfaces as an unhandled promise rejection.
+  console.error('Failed to load jieba dictionary', error);
+});

 const stopWords = new Set([
  '--',
@@ -1519,7 +1519,9 @@ const stopWords = new Set([
 ]);

 export async function jiebaSplit({ text }: { text: string }) {
-  const tokens = (await jieba.cutAsync(text, true)) as string[];
+  text = text.replace(/[#*`_~>[\](){}|]/g, '').replace(/\S*https?\S*/gi, '');
+
+  const tokens = (await jieba!.cutAsync(text, true)) as string[];

  return (
    tokens
--- a/packages/service/common/system/utils.ts
+++ b/packages/service/common/system/utils.ts
@@ -30,6 +30,8 @@ export const isInternalAddress = (url: string): boolean => {
      return true;
    }

+    if (process.env.CHECK_INTERNAL_IP !== 'true') return false;
+
    // For IP addresses, check if they are internal
    const ipv4Pattern = /^(\d{1,3}\.){3}\d{1,3}$/;
    if (!ipv4Pattern.test(hostname)) {
--- a/packages/service/core/dataset/data/dataTextSchema.ts
+++ b/packages/service/core/dataset/data/dataTextSchema.ts
@@ -41,7 +41,7 @@ try {
    }
  );
  DatasetDataTextSchema.index({ teamId: 1, datasetId: 1, collectionId: 1 });
-  DatasetDataTextSchema.index({ dataId: 1 }, { unique: true });
+  DatasetDataTextSchema.index({ dataId: 'hashed' });
 } catch (error) {
  console.log(error);
 }
--- a/packages/service/core/dataset/data/schema.ts
+++ b/packages/service/core/dataset/data/schema.ts
@@ -86,7 +86,8 @@ const DatasetDataSchema = new Schema({

  // Abandon
  fullTextToken: String,
-  initFullText: Boolean
+  initFullText: Boolean,
+  initJieba: Boolean
 });

 try {
@@ -103,6 +104,9 @@ try {
  DatasetDataSchema.index({ updateTime: 1 });
  // rebuild data
  DatasetDataSchema.index({ rebuilding: 1, teamId: 1, datasetId: 1 });
+
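+  // Index added for queries that look up documents where initJieba does not exist.
+  // NOTE(review): a sparse index omits documents lacking the field, so it may not serve such queries — confirm.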
+  DatasetDataSchema.index({ initJieba: 1 }, { sparse: true });
 } catch (error) {
  console.log(error);
 }
--- a/packages/service/core/dataset/search/controller.ts
+++ b/packages/service/core/dataset/search/controller.ts
@@ -16,7 +16,7 @@ import { reRankRecall } from '../../../core/ai/rerank';
 import { countPromptTokens } from '../../../common/string/tiktoken/index';
 import { datasetSearchResultConcat } from '@fastgpt/global/core/dataset/search/utils';
 import { hashStr } from '@fastgpt/global/common/string/tools';
-import { jiebaSplit } from '../../../common/string/jieba';
+import { jiebaSplit } from '../../../common/string/jieba/index';
 import { getCollectionSourceData } from '@fastgpt/global/core/dataset/collection/utils';
 import { Types } from '../../../common/mongo';
 import json5 from 'json5';