Compare commits

..

1 Commits

Author SHA1 Message Date
gru-agent[bot]
3338be6650 Add tests for getFlatAppResponses function in chat utils. 2025-05-25 12:55:13 +00:00
37 changed files with 305 additions and 932 deletions

View File

@@ -132,15 +132,15 @@ services:
# fastgpt # fastgpt
sandbox: sandbox:
container_name: sandbox container_name: sandbox
image: ghcr.io/labring/fastgpt-sandbox:v4.9.10 # git image: ghcr.io/labring/fastgpt-sandbox:v4.9.9 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.10 # 阿里云 # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.9 # 阿里云
networks: networks:
- fastgpt - fastgpt
restart: always restart: always
fastgpt-mcp-server: fastgpt-mcp-server:
container_name: fastgpt-mcp-server container_name: fastgpt-mcp-server
image: ghcr.io/labring/fastgpt-mcp_server:v4.9.10 # git image: ghcr.io/labring/fastgpt-mcp_server:v4.9.9 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-mcp_server:v4.9.10 # 阿里云 # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-mcp_server:v4.9.9 # 阿里云
ports: ports:
- 3005:3000 - 3005:3000
networks: networks:
@@ -150,8 +150,8 @@ services:
- FASTGPT_ENDPOINT=http://fastgpt:3000 - FASTGPT_ENDPOINT=http://fastgpt:3000
fastgpt: fastgpt:
container_name: fastgpt container_name: fastgpt
image: ghcr.io/labring/fastgpt:v4.9.10 # git image: ghcr.io/labring/fastgpt:v4.9.9 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.10 # 阿里云 # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.9 # 阿里云
ports: ports:
- 3000:3000 - 3000:3000
networks: networks:

View File

@@ -109,15 +109,15 @@ services:
# fastgpt # fastgpt
sandbox: sandbox:
container_name: sandbox container_name: sandbox
image: ghcr.io/labring/fastgpt-sandbox:v4.9.10 # git image: ghcr.io/labring/fastgpt-sandbox:v4.9.9 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.10 # 阿里云 # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.9 # 阿里云
networks: networks:
- fastgpt - fastgpt
restart: always restart: always
fastgpt-mcp-server: fastgpt-mcp-server:
container_name: fastgpt-mcp-server container_name: fastgpt-mcp-server
image: ghcr.io/labring/fastgpt-mcp_server:v4.9.10 # git image: ghcr.io/labring/fastgpt-mcp_server:v4.9.9 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-mcp_server:v4.9.10 # 阿里云 # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-mcp_server:v4.9.9 # 阿里云
ports: ports:
- 3005:3000 - 3005:3000
networks: networks:
@@ -127,8 +127,8 @@ services:
- FASTGPT_ENDPOINT=http://fastgpt:3000 - FASTGPT_ENDPOINT=http://fastgpt:3000
fastgpt: fastgpt:
container_name: fastgpt container_name: fastgpt
image: ghcr.io/labring/fastgpt:v4.9.10 # git image: ghcr.io/labring/fastgpt:v4.9.9 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.10 # 阿里云 # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.9 # 阿里云
ports: ports:
- 3000:3000 - 3000:3000
networks: networks:

View File

@@ -96,15 +96,15 @@ services:
# fastgpt # fastgpt
sandbox: sandbox:
container_name: sandbox container_name: sandbox
image: ghcr.io/labring/fastgpt-sandbox:v4.9.10 # git image: ghcr.io/labring/fastgpt-sandbox:v4.9.9 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.10 # 阿里云 # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.9 # 阿里云
networks: networks:
- fastgpt - fastgpt
restart: always restart: always
fastgpt-mcp-server: fastgpt-mcp-server:
container_name: fastgpt-mcp-server container_name: fastgpt-mcp-server
image: ghcr.io/labring/fastgpt-mcp_server:v4.9.10 # git image: ghcr.io/labring/fastgpt-mcp_server:v4.9.9 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-mcp_server:v4.9.10 # 阿里云 # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-mcp_server:v4.9.9 # 阿里云
ports: ports:
- 3005:3000 - 3005:3000
networks: networks:
@@ -114,8 +114,8 @@ services:
- FASTGPT_ENDPOINT=http://fastgpt:3000 - FASTGPT_ENDPOINT=http://fastgpt:3000
fastgpt: fastgpt:
container_name: fastgpt container_name: fastgpt
image: ghcr.io/labring/fastgpt:v4.9.10 # git image: ghcr.io/labring/fastgpt:v4.9.9 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.10 # 阿里云 # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.9 # 阿里云
ports: ports:
- 3000:3000 - 3000:3000
networks: networks:

View File

@@ -72,15 +72,15 @@ services:
sandbox: sandbox:
container_name: sandbox container_name: sandbox
image: ghcr.io/labring/fastgpt-sandbox:v4.9.10 # git image: ghcr.io/labring/fastgpt-sandbox:v4.9.9 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.10 # 阿里云 # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.9 # 阿里云
networks: networks:
- fastgpt - fastgpt
restart: always restart: always
fastgpt-mcp-server: fastgpt-mcp-server:
container_name: fastgpt-mcp-server container_name: fastgpt-mcp-server
image: ghcr.io/labring/fastgpt-mcp_server:v4.9.10 # git image: ghcr.io/labring/fastgpt-mcp_server:v4.9.9 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-mcp_server:v4.9.10 # 阿里云 # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-mcp_server:v4.9.9 # 阿里云
ports: ports:
- 3005:3000 - 3005:3000
networks: networks:
@@ -90,8 +90,8 @@ services:
- FASTGPT_ENDPOINT=http://fastgpt:3000 - FASTGPT_ENDPOINT=http://fastgpt:3000
fastgpt: fastgpt:
container_name: fastgpt container_name: fastgpt
image: ghcr.io/labring/fastgpt:v4.9.10 # git image: ghcr.io/labring/fastgpt:v4.9.9 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.10 # 阿里云 # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.9 # 阿里云
ports: ports:
- 3000:3000 - 3000:3000
networks: networks:

View File

@@ -7,28 +7,13 @@ toc: true
weight: 790 weight: 790
--- ---
## 升级指南
重要提示:本次更新会重新构建全文索引,构建期间全文检索结果会为空,4c16g 700 万组全文索引大致消耗 25 分钟。如需无缝升级,需自行做表同步工程。
### 1. 做好数据备份
### 2. 更新镜像 tag
- 更新 FastGPT 镜像 tag: v4.9.10
- 更新 FastGPT 商业版镜像 tag: v4.9.10
- mcp_server 无需更新
- Sandbox 无需更新
- AIProxy 无需更新
## 🚀 新增内容 ## 🚀 新增内容
1. 支持 PG 设置`systemEnv.hnswMaxScanTuples`参数,提高迭代搜索的数据总量。 1. 支持 PG 设置`systemEnv.hnswMaxScanTuples`参数,提高迭代搜索的数据总量。
2. 知识库预处理参数增加 “分块条件”,可控制某些情况下不进行分块处理 2. 工作流调整为单向接入和接出,支持快速的添加下一步节点
3. 知识库预处理参数增加 “段落优先” 模式,可控制最大段落深度。原“长度优先”模式,不再内嵌段落优先逻辑 3. 开放飞书和语雀知识库到开源版
4. 工作流调整为单向接入和接出,支持快速的添加下一步节点 4. gemini 和 claude 最新模型预设
5. 开放飞书和语雀知识库到开源版。
6. gemini 和 claude 最新模型预设。
## ⚙️ 优化 ## ⚙️ 优化
@@ -37,14 +22,10 @@ weight: 790
3. 纠正原先知识库的“表格数据集”名称,改成“备份导入”。同时支持知识库索引的导出和导入。 3. 纠正原先知识库的“表格数据集”名称,改成“备份导入”。同时支持知识库索引的导出和导入。
4. 工作流知识库引用上限,如果工作流中没有相关 AI 节点,则交互模式改成纯手动输入,并且上限为 1000万。 4. 工作流知识库引用上限,如果工作流中没有相关 AI 节点,则交互模式改成纯手动输入,并且上限为 1000万。
5. 语音输入,移动端判断逻辑,准确判断是否为手机,而不是小屏。 5. 语音输入,移动端判断逻辑,准确判断是否为手机,而不是小屏。
6. 优化上下文截取算法,至少保证留下一组 Human 信息。
## 🐛 修复 ## 🐛 修复
1. 全文检索多知识库时排序得分排序不正确。 1. 全文检索多知识库时排序得分排序不正确。
2. 流响应捕获 finish_reason 可能不正确。 2. 流响应捕获 finish_reason 可能不正确。
3. 工具调用模式,未保存思考输出。 3. 工具调用模式,未保存思考输出。
4. 知识库 indexSize 参数未生效。 4. 知识库 indexSize 参数未生效。
5. 工作流嵌套 2 层后,获取预览引用、上下文不正确。
6. xlsx 转成 Markdown 时候,前面会多出一个空格。
7. 读取 Markdown 文件时,Base64 图片未进行额外转换保存。

View File

@@ -28,6 +28,7 @@ FastGPT 商业版是基于 FastGPT 开源版的增强版本,增加了一些独
| 应用发布安全配置 | ❌ | ✅ | ✅ | | 应用发布安全配置 | ❌ | ✅ | ✅ |
| 内容审核 | ❌ | ✅ | ✅ | | 内容审核 | ❌ | ✅ | ✅ |
| web站点同步 | ❌ | ✅ | ✅ | | web站点同步 | ❌ | ✅ | ✅ |
| 主流文档库接入(目前支持:语雀、飞书) | ❌ | ✅ | ✅ |
| 增强训练模式 | ❌ | ✅ | ✅ | | 增强训练模式 | ❌ | ✅ | ✅ |
| 第三方应用快速接入(飞书、公众号) | ❌ | ✅ | ✅ | | 第三方应用快速接入(飞书、公众号) | ❌ | ✅ | ✅ |
| 管理后台 | ❌ | ✅ | 不需要 | | 管理后台 | ❌ | ✅ | 不需要 |

View File

@@ -7,10 +7,6 @@ export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----';
type SplitProps = { type SplitProps = {
text: string; text: string;
chunkSize: number; chunkSize: number;
paragraphChunkDeep?: number; // Paragraph deep
paragraphChunkMinSize?: number; // Paragraph min size, if too small, it will merge
maxSize?: number; maxSize?: number;
overlapRatio?: number; overlapRatio?: number;
customReg?: string[]; customReg?: string[];
@@ -112,8 +108,6 @@ const commonSplit = (props: SplitProps): SplitResponse => {
let { let {
text = '', text = '',
chunkSize, chunkSize,
paragraphChunkDeep = 5,
paragraphChunkMinSize = 100,
maxSize = defaultMaxChunkSize, maxSize = defaultMaxChunkSize,
overlapRatio = 0.15, overlapRatio = 0.15,
customReg = [] customReg = []
@@ -129,7 +123,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
text = text.replace(/(```[\s\S]*?```|~~~[\s\S]*?~~~)/g, function (match) { text = text.replace(/(```[\s\S]*?```|~~~[\s\S]*?~~~)/g, function (match) {
return match.replace(/\n/g, codeBlockMarker); return match.replace(/\n/g, codeBlockMarker);
}); });
// 2. Markdown 表格处理 - 单独提取表格出来,进行表头合并 // 2. 表格处理 - 单独提取表格出来,进行表头合并
const tableReg = const tableReg =
/(\n\|(?:(?:[^\n|]+\|){1,})\n\|(?:[:\-\s]+\|){1,}\n(?:\|(?:[^\n|]+\|)*\n?)*)(?:\n|$)/g; /(\n\|(?:(?:[^\n|]+\|){1,})\n\|(?:[:\-\s]+\|){1,}\n(?:\|(?:[^\n|]+\|)*\n?)*)(?:\n|$)/g;
const tableDataList = text.match(tableReg); const tableDataList = text.match(tableReg);
@@ -149,40 +143,25 @@ const commonSplit = (props: SplitProps): SplitResponse => {
text = text.replace(/(\r?\n|\r){3,}/g, '\n\n\n'); text = text.replace(/(\r?\n|\r){3,}/g, '\n\n\n');
// The larger maxLen is, the next sentence is less likely to trigger splitting // The larger maxLen is, the next sentence is less likely to trigger splitting
const customRegLen = customReg.length; const markdownIndex = 4;
const markdownIndex = paragraphChunkDeep - 1; const forbidOverlapIndex = 8;
const forbidOverlapIndex = customRegLen + markdownIndex + 4;
const markdownHeaderRules = ((deep?: number): { reg: RegExp; maxLen: number }[] => {
if (!deep || deep === 0) return [];
const maxDeep = Math.min(deep, 8); // Maximum 8 levels
const rules: { reg: RegExp; maxLen: number }[] = [];
for (let i = 1; i <= maxDeep; i++) {
const hashSymbols = '#'.repeat(i);
rules.push({
reg: new RegExp(`^(${hashSymbols}\\s[^\\n]+\\n)`, 'gm'),
maxLen: chunkSize
});
}
return rules;
})(paragraphChunkDeep);
const stepReges: { reg: RegExp | string; maxLen: number }[] = [ const stepReges: { reg: RegExp | string; maxLen: number }[] = [
...customReg.map((text) => ({ ...customReg.map((text) => ({
reg: text.replaceAll('\\n', '\n'), reg: text.replaceAll('\\n', '\n'),
maxLen: chunkSize maxLen: chunkSize
})), })),
...markdownHeaderRules, { reg: /^(#\s[^\n]+\n)/gm, maxLen: chunkSize },
{ reg: /^(##\s[^\n]+\n)/gm, maxLen: chunkSize },
{ reg: /^(###\s[^\n]+\n)/gm, maxLen: chunkSize },
{ reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkSize },
{ reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkSize },
{ reg: /([\n](```[\s\S]*?```|~~~[\s\S]*?~~~))/g, maxLen: maxSize }, // code block { reg: /([\n](```[\s\S]*?```|~~~[\s\S]*?~~~))/g, maxLen: maxSize }, // code block
// HTML Table tag 尽可能保障完整
{ {
reg: /(\n\|(?:(?:[^\n|]+\|){1,})\n\|(?:[:\-\s]+\|){1,}\n(?:\|(?:[^\n|]+\|)*\n)*)/g, reg: /(\n\|(?:(?:[^\n|]+\|){1,})\n\|(?:[:\-\s]+\|){1,}\n(?:\|(?:[^\n|]+\|)*\n)*)/g,
maxLen: chunkSize maxLen: Math.min(chunkSize * 1.5, maxSize)
}, // Markdown Table 尽可能保证完整性 }, // Table 尽可能保证完整性
{ reg: /(\n{2,})/g, maxLen: chunkSize }, { reg: /(\n{2,})/g, maxLen: chunkSize },
{ reg: /([\n])/g, maxLen: chunkSize }, { reg: /([\n])/g, maxLen: chunkSize },
// ------ There's no overlap on the top // ------ There's no overlap on the top
@@ -193,10 +172,12 @@ const commonSplit = (props: SplitProps): SplitResponse => {
{ reg: /([]|,\s)/g, maxLen: chunkSize } { reg: /([]|,\s)/g, maxLen: chunkSize }
]; ];
const customRegLen = customReg.length;
const checkIsCustomStep = (step: number) => step < customRegLen; const checkIsCustomStep = (step: number) => step < customRegLen;
const checkIsMarkdownSplit = (step: number) => const checkIsMarkdownSplit = (step: number) =>
step >= customRegLen && step <= markdownIndex + customRegLen; step >= customRegLen && step <= markdownIndex + customRegLen;
const checkForbidOverlap = (step: number) => step <= forbidOverlapIndex;
const checkForbidOverlap = (step: number) => step <= forbidOverlapIndex + customRegLen;
// if use markdown title split, Separate record title // if use markdown title split, Separate record title
const getSplitTexts = ({ text, step }: { text: string; step: number }) => { const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
@@ -320,7 +301,6 @@ const commonSplit = (props: SplitProps): SplitResponse => {
const splitTexts = getSplitTexts({ text, step }); const splitTexts = getSplitTexts({ text, step });
const chunks: string[] = []; const chunks: string[] = [];
for (let i = 0; i < splitTexts.length; i++) { for (let i = 0; i < splitTexts.length; i++) {
const item = splitTexts[i]; const item = splitTexts[i];
@@ -463,6 +443,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
*/ */
export const splitText2Chunks = (props: SplitProps): SplitResponse => { export const splitText2Chunks = (props: SplitProps): SplitResponse => {
let { text = '' } = props; let { text = '' } = props;
const start = Date.now();
const splitWithCustomSign = text.split(CUSTOM_SPLIT_SIGN); const splitWithCustomSign = text.split(CUSTOM_SPLIT_SIGN);
const splitResult = splitWithCustomSign.map((item) => { const splitResult = splitWithCustomSign.map((item) => {

View File

@@ -120,6 +120,7 @@ export const computeChunkSize = (params: {
return Math.min(params.chunkSize ?? chunkAutoChunkSize, getLLMMaxChunkSize(params.llmModel)); return Math.min(params.chunkSize ?? chunkAutoChunkSize, getLLMMaxChunkSize(params.llmModel));
}; };
export const computeChunkSplitter = (params: { export const computeChunkSplitter = (params: {
chunkSettingMode?: ChunkSettingModeEnum; chunkSettingMode?: ChunkSettingModeEnum;
chunkSplitMode?: DataChunkSplitModeEnum; chunkSplitMode?: DataChunkSplitModeEnum;
@@ -128,21 +129,8 @@ export const computeChunkSplitter = (params: {
if (params.chunkSettingMode === ChunkSettingModeEnum.auto) { if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
return undefined; return undefined;
} }
if (params.chunkSplitMode !== DataChunkSplitModeEnum.char) { if (params.chunkSplitMode === DataChunkSplitModeEnum.size) {
return undefined; return undefined;
} }
return params.chunkSplitter; return params.chunkSplitter;
}; };
/**
 * Resolve the paragraph (heading) depth to use when chunking.
 *
 * - Auto chunk-setting mode: always 5.
 * - Paragraph split mode: the caller-supplied `paragraphChunkDeep`
 *   (returned as-is, so it may be `undefined`).
 * - Any other split mode: 0.
 */
export const computeParagraphChunkDeep = (params: {
  chunkSettingMode?: ChunkSettingModeEnum;
  chunkSplitMode?: DataChunkSplitModeEnum;
  paragraphChunkDeep?: number;
}) =>
  params.chunkSettingMode === ChunkSettingModeEnum.auto
    ? 5
    : params.chunkSplitMode === DataChunkSplitModeEnum.paragraph
      ? params.paragraphChunkDeep
      : 0;

View File

@@ -9,8 +9,7 @@ import type {
DatasetTypeEnum, DatasetTypeEnum,
SearchScoreTypeEnum, SearchScoreTypeEnum,
TrainingModeEnum, TrainingModeEnum,
ChunkSettingModeEnum, ChunkSettingModeEnum
ChunkTriggerConfigTypeEnum
} from './constants'; } from './constants';
import type { DatasetPermission } from '../../support/permission/dataset/controller'; import type { DatasetPermission } from '../../support/permission/dataset/controller';
import type { APIFileServer, FeishuServer, YuqueServer } from './apiDataset'; import type { APIFileServer, FeishuServer, YuqueServer } from './apiDataset';
@@ -38,10 +37,11 @@ export type ChunkSettingsType = {
paragraphChunkAIMode?: ParagraphChunkAIModeEnum; paragraphChunkAIMode?: ParagraphChunkAIModeEnum;
paragraphChunkDeep?: number; // Paragraph deep paragraphChunkDeep?: number; // Paragraph deep
paragraphChunkMinSize?: number; // Paragraph min size, if too small, it will merge paragraphChunkMinSize?: number; // Paragraph min size, if too small, it will merge
paragraphChunkMaxSize?: number; // Paragraph max size, if too large, it will split
// Size split // Size split
chunkSize?: number; // chunk/qa chunk size, Paragraph max chunk size. chunkSize?: number;
// Char split // Char split
chunkSplitter?: string; // chunk/qa chunk splitter chunkSplitter?: string;
indexSize?: number; indexSize?: number;
qaPrompt?: string; qaPrompt?: string;

View File

@@ -65,8 +65,8 @@ export const filterGPTMessageByMaxContext = async ({
if (lastMessage.role === ChatCompletionRequestMessageRoleEnum.User) { if (lastMessage.role === ChatCompletionRequestMessageRoleEnum.User) {
const tokens = await countGptMessagesTokens([lastMessage, ...tmpChats]); const tokens = await countGptMessagesTokens([lastMessage, ...tmpChats]);
maxContext -= tokens; maxContext -= tokens;
// 该轮信息整体 tokens 超出范围,这段数据不要了。但是至少保证一组。 // 该轮信息整体 tokens 超出范围,这段数据不要了
if (maxContext < 0 && chats.length > 0) { if (maxContext < 0) {
break; break;
} }

View File

@@ -34,7 +34,6 @@ import { getTrainingModeByCollection } from './utils';
import { import {
computeChunkSize, computeChunkSize,
computeChunkSplitter, computeChunkSplitter,
computeParagraphChunkDeep,
getLLMMaxChunkSize getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils'; } from '@fastgpt/global/core/dataset/training/utils';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants'; import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
@@ -75,8 +74,6 @@ export const createCollectionAndInsertData = async ({
llmModel: getLLMModel(dataset.agentModel) llmModel: getLLMModel(dataset.agentModel)
}); });
const chunkSplitter = computeChunkSplitter(createCollectionParams); const chunkSplitter = computeChunkSplitter(createCollectionParams);
const paragraphChunkDeep = computeParagraphChunkDeep(createCollectionParams);
if (trainingType === DatasetCollectionDataProcessModeEnum.qa) { if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
delete createCollectionParams.chunkTriggerType; delete createCollectionParams.chunkTriggerType;
delete createCollectionParams.chunkTriggerMinSize; delete createCollectionParams.chunkTriggerMinSize;
@@ -90,11 +87,7 @@ export const createCollectionAndInsertData = async ({
// 1. split chunks // 1. split chunks
const chunks = rawText2Chunks({ const chunks = rawText2Chunks({
rawText, rawText,
chunkTriggerType: createCollectionParams.chunkTriggerType,
chunkTriggerMinSize: createCollectionParams.chunkTriggerMinSize,
chunkSize, chunkSize,
paragraphChunkDeep,
paragraphChunkMinSize: createCollectionParams.paragraphChunkMinSize,
maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)), maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0, overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
customReg: chunkSplitter ? [chunkSplitter] : [], customReg: chunkSplitter ? [chunkSplitter] : [],
@@ -119,7 +112,6 @@ export const createCollectionAndInsertData = async ({
const { _id: collectionId } = await createOneCollection({ const { _id: collectionId } = await createOneCollection({
...createCollectionParams, ...createCollectionParams,
trainingType, trainingType,
paragraphChunkDeep,
chunkSize, chunkSize,
chunkSplitter, chunkSplitter,
@@ -220,19 +212,46 @@ export type CreateOneCollectionParams = CreateDatasetCollectionParams & {
tmbId: string; tmbId: string;
session?: ClientSession; session?: ClientSession;
}; };
export async function createOneCollection({ session, ...props }: CreateOneCollectionParams) { export async function createOneCollection({
const { teamId,
teamId, tmbId,
parentId, name,
datasetId, parentId,
tags, datasetId,
type,
fileId, createTime,
rawLink, updateTime,
externalFileId,
externalFileUrl, hashRawText,
apiFileId rawTextLength,
} = props; metadata = {},
tags,
nextSyncTime,
fileId,
rawLink,
externalFileId,
externalFileUrl,
apiFileId,
// Parse settings
customPdfParse,
imageIndex,
autoIndexes,
// Chunk settings
trainingType,
chunkSettingMode,
chunkSplitMode,
chunkSize,
indexSize,
chunkSplitter,
qaPrompt,
session
}: CreateOneCollectionParams) {
// Create collection tags // Create collection tags
const collectionTags = await createOrGetCollectionTags({ tags, teamId, datasetId, session }); const collectionTags = await createOrGetCollectionTags({ tags, teamId, datasetId, session });
@@ -240,18 +259,41 @@ export async function createOneCollection({ session, ...props }: CreateOneCollec
const [collection] = await MongoDatasetCollection.create( const [collection] = await MongoDatasetCollection.create(
[ [
{ {
...props,
teamId, teamId,
tmbId,
parentId: parentId || null, parentId: parentId || null,
datasetId, datasetId,
name,
type,
rawTextLength,
hashRawText,
tags: collectionTags, tags: collectionTags,
metadata,
createTime,
updateTime,
nextSyncTime,
...(fileId ? { fileId } : {}), ...(fileId ? { fileId } : {}),
...(rawLink ? { rawLink } : {}), ...(rawLink ? { rawLink } : {}),
...(externalFileId ? { externalFileId } : {}), ...(externalFileId ? { externalFileId } : {}),
...(externalFileUrl ? { externalFileUrl } : {}), ...(externalFileUrl ? { externalFileUrl } : {}),
...(apiFileId ? { apiFileId } : {}) ...(apiFileId ? { apiFileId } : {}),
// Parse settings
customPdfParse,
imageIndex,
autoIndexes,
// Chunk settings
trainingType,
chunkSettingMode,
chunkSplitMode,
chunkSize,
indexSize,
chunkSplitter,
qaPrompt
} }
], ],
{ session, ordered: true } { session, ordered: true }

View File

@@ -1,8 +1,5 @@
import { BucketNameEnum } from '@fastgpt/global/common/file/constants'; import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
ChunkTriggerConfigTypeEnum,
DatasetSourceReadTypeEnum
} from '@fastgpt/global/core/dataset/constants';
import { readFileContentFromMongo } from '../../common/file/gridfs/controller'; import { readFileContentFromMongo } from '../../common/file/gridfs/controller';
import { urlsFetch } from '../../common/string/cheerio'; import { urlsFetch } from '../../common/string/cheerio';
import { type TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter'; import { type TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
@@ -182,17 +179,11 @@ export const readApiServerFileContent = async ({
export const rawText2Chunks = ({ export const rawText2Chunks = ({
rawText, rawText,
chunkTriggerType = ChunkTriggerConfigTypeEnum.minSize,
chunkTriggerMinSize = 1000,
backupParse, backupParse,
chunkSize = 512, chunkSize = 512,
...splitProps ...splitProps
}: { }: {
rawText: string; rawText: string;
chunkTriggerType?: ChunkTriggerConfigTypeEnum;
chunkTriggerMinSize?: number; // maxSize from agent model, not store
backupParse?: boolean; backupParse?: boolean;
tableParse?: boolean; tableParse?: boolean;
} & TextSplitProps): { } & TextSplitProps): {
@@ -218,28 +209,6 @@ export const rawText2Chunks = ({
}; };
}; };
// Chunk condition
// 1. 选择最大值条件,只有超过了最大值(默认为模型的最大值*0.7),才会触发分块
if (chunkTriggerType === ChunkTriggerConfigTypeEnum.maxSize) {
const textLength = rawText.trim().length;
const maxSize = splitProps.maxSize ? splitProps.maxSize * 0.7 : 16000;
if (textLength < maxSize) {
return [
{
q: rawText,
a: ''
}
];
}
}
// 2. 选择最小值条件,只有超过最小值(手动决定)才会触发分块
if (chunkTriggerType !== ChunkTriggerConfigTypeEnum.forceChunk) {
const textLength = rawText.trim().length;
if (textLength < chunkTriggerMinSize) {
return [{ q: rawText, a: '' }];
}
}
if (backupParse) { if (backupParse) {
return parseDatasetBackup2Chunks(rawText).chunks; return parseDatasetBackup2Chunks(rawText).chunks;
} }

View File

@@ -47,6 +47,7 @@ export const ChunkSettings = {
}, },
paragraphChunkDeep: Number, paragraphChunkDeep: Number,
paragraphChunkMinSize: Number, paragraphChunkMinSize: Number,
paragraphChunkMaxSize: Number,
chunkSize: Number, chunkSize: Number,
chunkSplitter: String, chunkSplitter: String,

View File

@@ -658,7 +658,7 @@ export async function searchDatasetData(
tokenLen: 0 tokenLen: 0
}; };
} catch (error) { } catch (error) {
addLog.error('Full text search error', error); addLog.error('multiQueryRecall error', error);
return { return {
fullTextRecallResults: [], fullTextRecallResults: [],
tokenLen: 0 tokenLen: 0

View File

@@ -1,6 +1,5 @@
import iconv from 'iconv-lite'; import iconv from 'iconv-lite';
import { type ReadRawTextByBuffer, type ReadFileResponse } from '../type'; import { type ReadRawTextByBuffer, type ReadFileResponse } from '../type';
import { matchMdImg } from '@fastgpt/global/common/string/markdown';
const rawEncodingList = [ const rawEncodingList = [
'ascii', 'ascii',
@@ -35,10 +34,7 @@ export const readFileRawText = ({ buffer, encoding }: ReadRawTextByBuffer): Read
} }
})(); })();
const { text, imageList } = matchMdImg(content);
return { return {
rawText: text, rawText: content
imageList
}; };
}; };

View File

@@ -28,11 +28,11 @@ export const readXlsxRawText = async ({
if (!header) return; if (!header) return;
const formatText = `| ${header.join(' | ')} | const formatText = `| ${header.join(' | ')} |
| ${header.map(() => '---').join(' | ')} | | ${header.map(() => '---').join(' | ')} |
${csvArr ${csvArr
.slice(1) .slice(1)
.map((row) => `| ${row.map((item) => item.replace(/\n/g, '\\n')).join(' | ')} |`) .map((row) => `| ${row.map((item) => item.replace(/\n/g, '\\n')).join(' | ')} |`)
.join('\n')}`; .join('\n')}`;
return formatText; return formatText;
}) })

View File

@@ -54,7 +54,7 @@ const RadioGroup = <T = any,>({ list, value, onChange, ...props }: Props<T>) =>
/> />
</Flex> </Flex>
</Box> </Box>
<HStack spacing={0.5} color={'myGray.900'} whiteSpace={'nowrap'} fontSize={'sm'}> <HStack spacing={1} color={'myGray.900'} whiteSpace={'nowrap'} fontSize={'sm'}>
<Box>{typeof item.title === 'string' ? t(item.title as any) : item.title}</Box> <Box>{typeof item.title === 'string' ? t(item.title as any) : item.title}</Box>
{!!item.tooltip && <QuestionTip label={item.tooltip} color={'myGray.600'} />} {!!item.tooltip && <QuestionTip label={item.tooltip} color={'myGray.600'} />}
</HStack> </HStack>

View File

@@ -15,13 +15,7 @@
"backup_dataset_tip": "You can reimport the downloaded csv file when exporting the knowledge base.", "backup_dataset_tip": "You can reimport the downloaded csv file when exporting the knowledge base.",
"backup_mode": "Backup import", "backup_mode": "Backup import",
"chunk_max_tokens": "max_tokens", "chunk_max_tokens": "max_tokens",
"chunk_process_params": "Block processing parameters",
"chunk_size": "Block size", "chunk_size": "Block size",
"chunk_trigger": "Blocking conditions",
"chunk_trigger_force_chunk": "Forced chunking",
"chunk_trigger_max_size": "The original text length is less than the maximum context 70% of the file processing model",
"chunk_trigger_min_size": "The original text is greater than",
"chunk_trigger_tips": "Block storage is triggered when certain conditions are met, otherwise the original text will be stored in full directly",
"close_auto_sync": "Are you sure you want to turn off automatic sync?", "close_auto_sync": "Are you sure you want to turn off automatic sync?",
"collection.Create update time": "Creation/Update Time", "collection.Create update time": "Creation/Update Time",
"collection.Training type": "Training", "collection.Training type": "Training",
@@ -35,7 +29,6 @@
"collection_tags": "Collection Tags", "collection_tags": "Collection Tags",
"common_dataset": "General Dataset", "common_dataset": "General Dataset",
"common_dataset_desc": "Building a knowledge base by importing files, web page links, or manual entry", "common_dataset_desc": "Building a knowledge base by importing files, web page links, or manual entry",
"condition": "condition",
"config_sync_schedule": "Configure scheduled synchronization", "config_sync_schedule": "Configure scheduled synchronization",
"confirm_to_rebuild_embedding_tip": "Are you sure you want to switch the index for the Dataset?\nSwitching the index is a significant operation that requires re-indexing all data in your Dataset, which may take a long time. Please ensure your account has sufficient remaining points.\n\nAdditionally, you need to update the applications that use this Dataset to avoid conflicts with other indexed model Datasets.", "confirm_to_rebuild_embedding_tip": "Are you sure you want to switch the index for the Dataset?\nSwitching the index is a significant operation that requires re-indexing all data in your Dataset, which may take a long time. Please ensure your account has sufficient remaining points.\n\nAdditionally, you need to update the applications that use this Dataset to avoid conflicts with other indexed model Datasets.",
"core.dataset.import.Adjust parameters": "Adjust parameters", "core.dataset.import.Adjust parameters": "Adjust parameters",
@@ -107,7 +100,6 @@
"is_open_schedule": "Enable scheduled synchronization", "is_open_schedule": "Enable scheduled synchronization",
"keep_image": "Keep the picture", "keep_image": "Keep the picture",
"loading": "Loading...", "loading": "Loading...",
"max_chunk_size": "Maximum chunk size",
"move.hint": "After moving, the selected knowledge base/folder will inherit the permission settings of the new folder, and the original permission settings will become invalid.", "move.hint": "After moving, the selected knowledge base/folder will inherit the permission settings of the new folder, and the original permission settings will become invalid.",
"noChildren": "No subdirectories", "noChildren": "No subdirectories",
"noSelectedFolder": "No selected folder", "noSelectedFolder": "No selected folder",
@@ -115,10 +107,8 @@
"noValidId": "No valid ID", "noValidId": "No valid ID",
"open_auto_sync": "After scheduled synchronization is turned on, the system will try to synchronize the collection from time to time every day. During the collection synchronization period, the collection data will not be searched.", "open_auto_sync": "After scheduled synchronization is turned on, the system will try to synchronize the collection from time to time every day. During the collection synchronization period, the collection data will not be searched.",
"other_dataset": "Third-party knowledge base", "other_dataset": "Third-party knowledge base",
"paragraph_max_deep": "Maximum paragraph depth",
"paragraph_split": "Partition by paragraph",
"paragraph_split_tip": "Priority is given to chunking according to the Makdown title paragraph. If the chunking is too long, then chunking is done according to the length.",
"params_config": "Config", "params_config": "Config",
"params_setting": "Parameter settings",
"pdf_enhance_parse": "PDF enhancement analysis", "pdf_enhance_parse": "PDF enhancement analysis",
"pdf_enhance_parse_price": "{{price}} points/page", "pdf_enhance_parse_price": "{{price}} points/page",
"pdf_enhance_parse_tips": "Calling PDF recognition model for parsing, you can convert it into Markdown and retain pictures in the document. At the same time, you can also identify scanned documents, which will take a long time to identify them.", "pdf_enhance_parse_tips": "Calling PDF recognition model for parsing, you can convert it into Markdown and retain pictures in the document. At the same time, you can also identify scanned documents, which will take a long time to identify them.",

View File

@@ -554,7 +554,7 @@
"core.dataset.training.Agent queue": "QA 训练排队", "core.dataset.training.Agent queue": "QA 训练排队",
"core.dataset.training.Auto mode": "补充索引", "core.dataset.training.Auto mode": "补充索引",
"core.dataset.training.Auto mode Tip": "通过子索引以及调用模型生成相关问题与摘要,来增加数据块的语义丰富度,更利于检索。需要消耗更多的存储空间和增加 AI 调用次数。", "core.dataset.training.Auto mode Tip": "通过子索引以及调用模型生成相关问题与摘要,来增加数据块的语义丰富度,更利于检索。需要消耗更多的存储空间和增加 AI 调用次数。",
"core.dataset.training.Chunk mode": "分块存储", "core.dataset.training.Chunk mode": "直接分块",
"core.dataset.training.Full": "预计 20 分钟以上", "core.dataset.training.Full": "预计 20 分钟以上",
"core.dataset.training.Leisure": "空闲", "core.dataset.training.Leisure": "空闲",
"core.dataset.training.QA mode": "问答对提取", "core.dataset.training.QA mode": "问答对提取",

View File

@@ -15,13 +15,7 @@
"backup_dataset_tip": "可以将导出知识库时,下载的 csv 文件重新导入。", "backup_dataset_tip": "可以将导出知识库时,下载的 csv 文件重新导入。",
"backup_mode": "备份导入", "backup_mode": "备份导入",
"chunk_max_tokens": "分块上限", "chunk_max_tokens": "分块上限",
"chunk_process_params": "分块处理参数",
"chunk_size": "分块大小", "chunk_size": "分块大小",
"chunk_trigger": "分块条件",
"chunk_trigger_force_chunk": "强制分块",
"chunk_trigger_max_size": "原文长度小于文件处理模型最大上下文70%",
"chunk_trigger_min_size": "原文长度大于",
"chunk_trigger_tips": "当满足一定条件时才触发分块存储,否则会直接完整存储原文",
"close_auto_sync": "确认关闭自动同步功能?", "close_auto_sync": "确认关闭自动同步功能?",
"collection.Create update time": "创建/更新时间", "collection.Create update time": "创建/更新时间",
"collection.Training type": "训练模式", "collection.Training type": "训练模式",
@@ -35,7 +29,6 @@
"collection_tags": "集合标签", "collection_tags": "集合标签",
"common_dataset": "通用知识库", "common_dataset": "通用知识库",
"common_dataset_desc": "通过导入文件、网页链接或手动录入形式构建知识库", "common_dataset_desc": "通过导入文件、网页链接或手动录入形式构建知识库",
"condition": "条件",
"config_sync_schedule": "配置定时同步", "config_sync_schedule": "配置定时同步",
"confirm_to_rebuild_embedding_tip": "确认为知识库切换索引?\n切换索引是一个非常重量的操作需要对您知识库内所有数据进行重新索引时间可能较长请确保账号内剩余积分充足。\n\n此外你还需要注意修改选择该知识库的应用避免它们与其他索引模型知识库混用。", "confirm_to_rebuild_embedding_tip": "确认为知识库切换索引?\n切换索引是一个非常重量的操作需要对您知识库内所有数据进行重新索引时间可能较长请确保账号内剩余积分充足。\n\n此外你还需要注意修改选择该知识库的应用避免它们与其他索引模型知识库混用。",
"core.dataset.import.Adjust parameters": "调整参数", "core.dataset.import.Adjust parameters": "调整参数",
@@ -107,7 +100,6 @@
"is_open_schedule": "启用定时同步", "is_open_schedule": "启用定时同步",
"keep_image": "保留图片", "keep_image": "保留图片",
"loading": "加载中...", "loading": "加载中...",
"max_chunk_size": "最大分块大小",
"move.hint": "移动后,所选知识库/文件夹将继承新文件夹的权限设置,原先的权限设置失效。", "move.hint": "移动后,所选知识库/文件夹将继承新文件夹的权限设置,原先的权限设置失效。",
"noChildren": "无子目录", "noChildren": "无子目录",
"noSelectedFolder": "没有选择文件夹", "noSelectedFolder": "没有选择文件夹",
@@ -115,10 +107,8 @@
"noValidId": "没有有效的 ID", "noValidId": "没有有效的 ID",
"open_auto_sync": "开启定时同步后,系统将会每天不定时尝试同步集合,集合同步期间,会出现无法搜索到该集合数据现象。", "open_auto_sync": "开启定时同步后,系统将会每天不定时尝试同步集合,集合同步期间,会出现无法搜索到该集合数据现象。",
"other_dataset": "第三方知识库", "other_dataset": "第三方知识库",
"paragraph_max_deep": "最大段落深度",
"paragraph_split": "按段落分块",
"paragraph_split_tip": "优先按 Makdown 标题段落进行分块,如果分块过长,再按长度进行二次分块",
"params_config": "配置", "params_config": "配置",
"params_setting": "参数设置",
"pdf_enhance_parse": "PDF增强解析", "pdf_enhance_parse": "PDF增强解析",
"pdf_enhance_parse_price": "{{price}}积分/页", "pdf_enhance_parse_price": "{{price}}积分/页",
"pdf_enhance_parse_tips": "调用 PDF 识别模型进行解析,可以将其转换成 Markdown 并保留文档中的图片,同时也可以对扫描件进行识别,识别时间较长。", "pdf_enhance_parse_tips": "调用 PDF 识别模型进行解析,可以将其转换成 Markdown 并保留文档中的图片,同时也可以对扫描件进行识别,识别时间较长。",

View File

@@ -554,7 +554,7 @@
"core.dataset.training.Agent queue": "問答訓練排隊中", "core.dataset.training.Agent queue": "問答訓練排隊中",
"core.dataset.training.Auto mode": "補充索引", "core.dataset.training.Auto mode": "補充索引",
"core.dataset.training.Auto mode Tip": "透過子索引以及呼叫模型產生相關問題與摘要,來增加資料區塊的語意豐富度,更有利於檢索。需要消耗更多的儲存空間並增加 AI 呼叫次數。", "core.dataset.training.Auto mode Tip": "透過子索引以及呼叫模型產生相關問題與摘要,來增加資料區塊的語意豐富度,更有利於檢索。需要消耗更多的儲存空間並增加 AI 呼叫次數。",
"core.dataset.training.Chunk mode": "分塊存儲", "core.dataset.training.Chunk mode": "直接分塊",
"core.dataset.training.Full": "預計 20 分鐘以上", "core.dataset.training.Full": "預計 20 分鐘以上",
"core.dataset.training.Leisure": "閒置", "core.dataset.training.Leisure": "閒置",
"core.dataset.training.QA mode": "問答對提取", "core.dataset.training.QA mode": "問答對提取",

View File

@@ -14,12 +14,7 @@
"backup_dataset_tip": "可以將導出知識庫時,下載的 csv 文件重新導入。", "backup_dataset_tip": "可以將導出知識庫時,下載的 csv 文件重新導入。",
"backup_mode": "備份導入", "backup_mode": "備份導入",
"chunk_max_tokens": "分塊上限", "chunk_max_tokens": "分塊上限",
"chunk_process_params": "分塊處理參數",
"chunk_size": "分塊大小", "chunk_size": "分塊大小",
"chunk_trigger": "分塊條件",
"chunk_trigger_force_chunk": "強制分塊",
"chunk_trigger_max_size": "原文長度小於文件處理模型最大上下文 70%",
"chunk_trigger_min_size": "原文長度大於",
"close_auto_sync": "確認關閉自動同步功能?", "close_auto_sync": "確認關閉自動同步功能?",
"collection.Create update time": "建立/更新時間", "collection.Create update time": "建立/更新時間",
"collection.Training type": "分段模式", "collection.Training type": "分段模式",
@@ -33,7 +28,6 @@
"collection_tags": "集合標籤", "collection_tags": "集合標籤",
"common_dataset": "通用資料集", "common_dataset": "通用資料集",
"common_dataset_desc": "通過導入文件、網頁鏈接或手動錄入形式構建知識庫", "common_dataset_desc": "通過導入文件、網頁鏈接或手動錄入形式構建知識庫",
"condition": "條件",
"config_sync_schedule": "設定定時同步", "config_sync_schedule": "設定定時同步",
"confirm_to_rebuild_embedding_tip": "確定要為資料集切換索引嗎?\n切換索引是一個重要的操作需要對您資料集內所有資料重新建立索引可能需要較長時間請確保帳號內剩餘點數充足。\n\n此外您還需要注意修改使用此資料集的應用程式避免與其他索引模型資料集混用。", "confirm_to_rebuild_embedding_tip": "確定要為資料集切換索引嗎?\n切換索引是一個重要的操作需要對您資料集內所有資料重新建立索引可能需要較長時間請確保帳號內剩餘點數充足。\n\n此外您還需要注意修改使用此資料集的應用程式避免與其他索引模型資料集混用。",
"core.dataset.import.Adjust parameters": "調整參數", "core.dataset.import.Adjust parameters": "調整參數",
@@ -105,7 +99,6 @@
"is_open_schedule": "啟用定時同步", "is_open_schedule": "啟用定時同步",
"keep_image": "保留圖片", "keep_image": "保留圖片",
"loading": "加載中...", "loading": "加載中...",
"max_chunk_size": "最大分塊大小",
"move.hint": "移動後,所選資料集/資料夾將繼承新資料夾的權限設定,原先的權限設定將失效。", "move.hint": "移動後,所選資料集/資料夾將繼承新資料夾的權限設定,原先的權限設定將失效。",
"noChildren": "無子目錄", "noChildren": "無子目錄",
"noSelectedFolder": "沒有選擇文件夾", "noSelectedFolder": "沒有選擇文件夾",
@@ -113,10 +106,8 @@
"noValidId": "沒有有效的 ID", "noValidId": "沒有有效的 ID",
"open_auto_sync": "開啟定時同步後,系統將每天不定時嘗試同步集合,集合同步期間,會出現無法搜尋到該集合資料現象。", "open_auto_sync": "開啟定時同步後,系統將每天不定時嘗試同步集合,集合同步期間,會出現無法搜尋到該集合資料現象。",
"other_dataset": "第三方知識庫", "other_dataset": "第三方知識庫",
"paragraph_max_deep": "最大段落深度",
"paragraph_split": "按段落分塊",
"paragraph_split_tip": "優先按 Makdown 標題段落進行分塊,如果分塊過長,再按長度進行二次分塊",
"params_config": "設定", "params_config": "設定",
"params_setting": "參數設定",
"pdf_enhance_parse": "PDF 增強解析", "pdf_enhance_parse": "PDF 增強解析",
"pdf_enhance_parse_price": "{{price}}積分/頁", "pdf_enhance_parse_price": "{{price}}積分/頁",
"pdf_enhance_parse_tips": "呼叫 PDF 識別模型進行解析,可以將其轉換成 Markdown 並保留文件中的圖片,同時也可以對掃描件進行識別,識別時間較長。", "pdf_enhance_parse_tips": "呼叫 PDF 識別模型進行解析,可以將其轉換成 Markdown 並保留文件中的圖片,同時也可以對掃描件進行識別,識別時間較長。",

View File

@@ -1,6 +1,6 @@
{ {
"name": "app", "name": "app",
"version": "4.9.10", "version": "4.9.9",
"private": false, "private": false,
"scripts": { "scripts": {
"dev": "next dev", "dev": "next dev",

View File

@@ -7,7 +7,6 @@ import { type ChatHistoryItemResType } from '@fastgpt/global/core/chat/type';
import { FlowNodeTypeEnum } from '@fastgpt/global/core/workflow/node/constant'; import { FlowNodeTypeEnum } from '@fastgpt/global/core/workflow/node/constant';
import { useRequest2 } from '@fastgpt/web/hooks/useRequest'; import { useRequest2 } from '@fastgpt/web/hooks/useRequest';
import { useTranslation } from 'next-i18next'; import { useTranslation } from 'next-i18next';
import { getFlatAppResponses } from '@/global/core/chat/utils';
const isLLMNode = (item: ChatHistoryItemResType) => const isLLMNode = (item: ChatHistoryItemResType) =>
item.moduleType === FlowNodeTypeEnum.chatNode || item.moduleType === FlowNodeTypeEnum.tools; item.moduleType === FlowNodeTypeEnum.chatNode || item.moduleType === FlowNodeTypeEnum.tools;
@@ -17,7 +16,17 @@ const ContextModal = ({ onClose, dataId }: { onClose: () => void; dataId: string
const { loading: isLoading, data: contextModalData } = useRequest2( const { loading: isLoading, data: contextModalData } = useRequest2(
() => () =>
getHistoryResponseData({ dataId }).then((res) => { getHistoryResponseData({ dataId }).then((res) => {
const flatResData = getFlatAppResponses(res || []); const flatResData: ChatHistoryItemResType[] =
res
?.map((item) => {
return [
item,
...(item.pluginDetail || []),
...(item.toolDetail || []),
...(item.loopDetail || [])
];
})
.flat() || [];
return flatResData.find(isLLMNode)?.historyPreview || []; return flatResData.find(isLLMNode)?.historyPreview || [];
}), }),
{ manual: false } { manual: false }

View File

@@ -19,25 +19,23 @@ export function transformPreviewHistories(
}); });
} }
export const getFlatAppResponses = (res: ChatHistoryItemResType[]): ChatHistoryItemResType[] => {
return res
.map((item) => {
return [
item,
...getFlatAppResponses(item.pluginDetail || []),
...getFlatAppResponses(item.toolDetail || []),
...getFlatAppResponses(item.loopDetail || [])
];
})
.flat();
};
export function addStatisticalDataToHistoryItem(historyItem: ChatItemType) { export function addStatisticalDataToHistoryItem(historyItem: ChatItemType) {
if (historyItem.obj !== ChatRoleEnum.AI) return historyItem; if (historyItem.obj !== ChatRoleEnum.AI) return historyItem;
if (historyItem.totalQuoteList !== undefined) return historyItem; if (historyItem.totalQuoteList !== undefined) return historyItem;
if (!historyItem.responseData) return historyItem; if (!historyItem.responseData) return historyItem;
// Flat children // Flat children
const flatResData = getFlatAppResponses(historyItem.responseData || []); const flatResData: ChatHistoryItemResType[] =
historyItem.responseData
?.map((item) => {
return [
item,
...(item.pluginDetail || []),
...(item.toolDetail || []),
...(item.loopDetail || [])
];
})
.flat() || [];
return { return {
...historyItem, ...historyItem,

View File

@@ -100,6 +100,8 @@ const WebsiteConfigModal = ({
paragraphChunkDeep: chunkSettings?.paragraphChunkDeep || defaultFormData.paragraphChunkDeep, paragraphChunkDeep: chunkSettings?.paragraphChunkDeep || defaultFormData.paragraphChunkDeep,
paragraphChunkMinSize: paragraphChunkMinSize:
chunkSettings?.paragraphChunkMinSize || defaultFormData.paragraphChunkMinSize, chunkSettings?.paragraphChunkMinSize || defaultFormData.paragraphChunkMinSize,
paragraphChunkMaxSize:
chunkSettings?.paragraphChunkMaxSize || defaultFormData.paragraphChunkMaxSize,
chunkSize: chunkSettings?.chunkSize || defaultFormData.chunkSize, chunkSize: chunkSettings?.chunkSize || defaultFormData.chunkSize,

View File

@@ -17,8 +17,10 @@ import {
} from '@chakra-ui/react'; } from '@chakra-ui/react';
import MyIcon from '@fastgpt/web/components/common/Icon'; import MyIcon from '@fastgpt/web/components/common/Icon';
import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio'; import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio';
import type { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants'; import type {
import { ChunkTriggerConfigTypeEnum } from '@fastgpt/global/core/dataset/constants'; ChunkTriggerConfigTypeEnum,
ParagraphChunkAIModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { import {
DataChunkSplitModeEnum, DataChunkSplitModeEnum,
DatasetCollectionDataProcessModeEnum, DatasetCollectionDataProcessModeEnum,
@@ -106,6 +108,7 @@ export type CollectionChunkFormType = {
paragraphChunkAIMode: ParagraphChunkAIModeEnum; paragraphChunkAIMode: ParagraphChunkAIModeEnum;
paragraphChunkDeep: number; // Paragraph deep paragraphChunkDeep: number; // Paragraph deep
paragraphChunkMinSize: number; // Paragraph min size, if too small, it will merge paragraphChunkMinSize: number; // Paragraph min size, if too small, it will merge
paragraphChunkMaxSize: number; // Paragraph max size, if too large, it will split
// Size split // Size split
chunkSize: number; chunkSize: number;
// Char split // Char split
@@ -127,7 +130,6 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
const { setValue, register, watch, getValues } = form; const { setValue, register, watch, getValues } = form;
const trainingType = watch('trainingType'); const trainingType = watch('trainingType');
const chunkTriggerType = watch('chunkTriggerType');
const chunkSettingMode = watch('chunkSettingMode'); const chunkSettingMode = watch('chunkSettingMode');
const chunkSplitMode = watch('chunkSplitMode'); const chunkSplitMode = watch('chunkSplitMode');
const autoIndexes = watch('autoIndexes'); const autoIndexes = watch('autoIndexes');
@@ -149,14 +151,6 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
})); }));
}, [t]); }, [t]);
// Chunk trigger
const chunkTriggerSelectList = [
{ label: t('dataset:chunk_trigger_min_size'), value: ChunkTriggerConfigTypeEnum.minSize },
{ label: t('dataset:chunk_trigger_max_size'), value: ChunkTriggerConfigTypeEnum.maxSize },
{ label: t('dataset:chunk_trigger_force_chunk'), value: ChunkTriggerConfigTypeEnum.forceChunk }
];
// Form max or min value
const { const {
maxChunkSize, maxChunkSize,
minChunkSize: minChunkSizeValue, minChunkSize: minChunkSizeValue,
@@ -195,11 +189,14 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
{ label: '=====', value: '=====' }, { label: '=====', value: '=====' },
{ label: t('dataset:split_sign_custom'), value: 'Other' } { label: t('dataset:split_sign_custom'), value: 'Other' }
]; ];
const [customListSelectValue, setCustomListSelectValue] = useState( const [customListSelectValue, setCustomListSelectValue] = useState(getValues('chunkSplitter'));
customSplitList.some((item) => item.value === getValues('chunkSplitter')) useEffect(() => {
? getValues('chunkSplitter') if (customListSelectValue === 'Other') {
: 'Other' setValue('chunkSplitter', '');
); } else {
setValue('chunkSplitter', customListSelectValue);
}
}, [customListSelectValue, setValue]);
// Index size // Index size
const indexSizeSeletorList = useMemo(() => getIndexSizeSelectList(maxIndexSize), [maxIndexSize]); const indexSizeSeletorList = useMemo(() => getIndexSizeSelectList(maxIndexSize), [maxIndexSize]);
@@ -246,41 +243,6 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
gridTemplateColumns={'repeat(2, 1fr)'} gridTemplateColumns={'repeat(2, 1fr)'}
/> />
</Box> </Box>
{trainingType === DatasetCollectionDataProcessModeEnum.chunk && (
<Box mt={6}>
<HStack fontSize={'sm'} mb={2} color={'myGray.600'} spacing={1}>
<Box>{t('dataset:chunk_trigger')}</Box>
<QuestionTip label={t('dataset:chunk_trigger_tips')} />
</HStack>
<HStack>
<Box flex={'1 0 0'} h={'34px'}>
<MySelect
borderRadius={'md'}
list={chunkTriggerSelectList}
value={chunkTriggerType}
onChange={(e) => {
setValue('chunkTriggerType', e);
}}
/>
</Box>
{chunkTriggerType === ChunkTriggerConfigTypeEnum.minSize && (
<Box flex={'1 0 0'}>
<MyNumberInput
h={'34px'}
bg={'white'}
min={100}
max={100000}
register={register}
name={'chunkTriggerMinSize'}
step={100}
/>
</Box>
)}
</HStack>
</Box>
)}
{trainingType === DatasetCollectionDataProcessModeEnum.chunk && {trainingType === DatasetCollectionDataProcessModeEnum.chunk &&
feConfigs?.show_dataset_enhance !== false && ( feConfigs?.show_dataset_enhance !== false && (
<Box mt={6}> <Box mt={6}>
@@ -325,7 +287,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
)} )}
<Box mt={6}> <Box mt={6}>
<Box fontSize={'sm'} mb={2} color={'myGray.600'}> <Box fontSize={'sm'} mb={2} color={'myGray.600'}>
{t('dataset:chunk_process_params')} {t('dataset:params_setting')}
</Box> </Box>
<LeftRadio<ChunkSettingModeEnum> <LeftRadio<ChunkSettingModeEnum>
list={[ list={[
@@ -343,11 +305,6 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
<Box> <Box>
<RadioGroup<DataChunkSplitModeEnum> <RadioGroup<DataChunkSplitModeEnum>
list={[ list={[
{
title: t('dataset:paragraph_split'),
value: DataChunkSplitModeEnum.paragraph,
tooltip: t('dataset:paragraph_split_tip')
},
{ {
title: t('dataset:split_chunk_size'), title: t('dataset:split_chunk_size'),
value: DataChunkSplitModeEnum.size value: DataChunkSplitModeEnum.size
@@ -364,76 +321,30 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
}} }}
/> />
{chunkSplitMode === DataChunkSplitModeEnum.paragraph && (
<>
<Box mt={1.5}>
<Box>{t('dataset:paragraph_max_deep')}</Box>
<MyNumberInput
size={'sm'}
bg={'myGray.50'}
register={register}
name={'paragraphChunkDeep'}
min={1}
max={8}
step={1}
h={'32px'}
/>
</Box>
<Box mt={1.5}>
<Box>{t('dataset:max_chunk_size')}</Box>
<Box
css={{
'& > span': {
display: 'block'
}
}}
>
<MyTooltip
label={t('common:core.dataset.import.Chunk Range', {
min: minChunkSizeValue,
max: maxChunkSize
})}
>
<MyNumberInput
register={register}
name={'chunkSize'}
min={minChunkSizeValue}
max={maxChunkSize}
size={'sm'}
step={100}
/>
</MyTooltip>
</Box>
</Box>
</>
)}
{chunkSplitMode === DataChunkSplitModeEnum.size && ( {chunkSplitMode === DataChunkSplitModeEnum.size && (
<Box mt={1.5}> <Box
<Box>{t('dataset:chunk_size')}</Box> mt={1.5}
<Box css={{
css={{ '& > span': {
'& > span': { display: 'block'
display: 'block' }
} }}
}} >
<MyTooltip
label={t('common:core.dataset.import.Chunk Range', {
min: minChunkSizeValue,
max: maxChunkSize
})}
> >
<MyTooltip <MyNumberInput
label={t('common:core.dataset.import.Chunk Range', { register={register}
min: minChunkSizeValue, name={'chunkSize'}
max: maxChunkSize min={minChunkSizeValue}
})} max={maxChunkSize}
> size={'sm'}
<MyNumberInput step={100}
register={register} />
name={'chunkSize'} </MyTooltip>
min={minChunkSizeValue}
max={maxChunkSize}
size={'sm'}
step={100}
/>
</MyTooltip>
</Box>
</Box> </Box>
)} )}
@@ -447,11 +358,6 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
value={customListSelectValue} value={customListSelectValue}
h={'32px'} h={'32px'}
onChange={(val) => { onChange={(val) => {
if (val === 'Other') {
setValue('chunkSplitter', '');
} else {
setValue('chunkSplitter', val);
}
setCustomListSelectValue(val); setCustomListSelectValue(val);
}} }}
/> />

View File

@@ -51,10 +51,11 @@ export const defaultFormData: ImportFormType = {
autoIndexes: false, autoIndexes: false,
chunkSettingMode: ChunkSettingModeEnum.auto, chunkSettingMode: ChunkSettingModeEnum.auto,
chunkSplitMode: DataChunkSplitModeEnum.paragraph, chunkSplitMode: DataChunkSplitModeEnum.size,
paragraphChunkAIMode: ParagraphChunkAIModeEnum.auto, paragraphChunkAIMode: ParagraphChunkAIModeEnum.auto,
paragraphChunkDeep: 5, paragraphChunkDeep: 4,
paragraphChunkMinSize: 100, paragraphChunkMinSize: 100,
paragraphChunkMaxSize: chunkAutoChunkSize,
chunkSize: chunkAutoChunkSize, chunkSize: chunkAutoChunkSize,
chunkSplitter: '', chunkSplitter: '',

View File

@@ -8,8 +8,10 @@ import { useRouter } from 'next/router';
import { useRequest2 } from '@fastgpt/web/hooks/useRequest'; import { useRequest2 } from '@fastgpt/web/hooks/useRequest';
import { getDatasetCollectionById } from '@/web/core/dataset/api'; import { getDatasetCollectionById } from '@/web/core/dataset/api';
import MyBox from '@fastgpt/web/components/common/MyBox'; import MyBox from '@fastgpt/web/components/common/MyBox';
import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { getCollectionIcon } from '@fastgpt/global/core/dataset/utils'; import { getCollectionIcon } from '@fastgpt/global/core/dataset/utils';
import { Box } from '@chakra-ui/react'; import { Box } from '@chakra-ui/react';
import { DataChunkSplitModeEnum } from '@fastgpt/global/core/dataset/constants';
import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent'; import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';
const Upload = dynamic(() => import('../commonProgress/Upload')); const Upload = dynamic(() => import('../commonProgress/Upload'));
@@ -66,6 +68,8 @@ const ReTraining = () => {
paragraphChunkDeep: collection.paragraphChunkDeep || defaultFormData.paragraphChunkDeep, paragraphChunkDeep: collection.paragraphChunkDeep || defaultFormData.paragraphChunkDeep,
paragraphChunkMinSize: paragraphChunkMinSize:
collection.paragraphChunkMinSize || defaultFormData.paragraphChunkMinSize, collection.paragraphChunkMinSize || defaultFormData.paragraphChunkMinSize,
paragraphChunkMaxSize:
collection.paragraphChunkMaxSize || defaultFormData.paragraphChunkMaxSize,
chunkSize: collection.chunkSize || defaultFormData.chunkSize, chunkSize: collection.chunkSize || defaultFormData.chunkSize,
@@ -81,13 +85,11 @@ const ReTraining = () => {
return ( return (
<MyBox isLoading={loading} h={'100%'}> <MyBox isLoading={loading} h={'100%'}>
{!loading && ( <Box h={'100%'} overflow={'auto'}>
<Box h={'100%'} overflow={'auto'}> {activeStep === 0 && <DataProcess />}
{activeStep === 0 && <DataProcess />} {activeStep === 1 && <PreviewData />}
{activeStep === 1 && <PreviewData />} {activeStep === 2 && <Upload />}
{activeStep === 2 && <Upload />} </Box>
</Box>
)}
</MyBox> </MyBox>
); );
}; };

View File

@@ -48,7 +48,7 @@ async function handler(req: ApiRequestProps<backupBody, backupQuery>, res: ApiRe
encoding: file.encoding, encoding: file.encoding,
getFormatText: false getFormatText: false
}); });
if (!rawText.trim().startsWith('q,a,indexes')) { if (!rawText.startsWith('q,a,indexes')) {
return Promise.reject('Backup file start with "q,a,indexes"'); return Promise.reject('Backup file start with "q,a,indexes"');
} }
// 2. delete tmp file // 2. delete tmp file

View File

@@ -50,10 +50,7 @@ async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
}); });
res.setHeader('Content-Type', 'text/csv; charset=utf-8;'); res.setHeader('Content-Type', 'text/csv; charset=utf-8;');
res.setHeader( res.setHeader('Content-Disposition', `attachment; filename=${dataset.name}-backup.csv;`);
'Content-Disposition',
`attachment; filename=${encodeURIComponent(dataset.name)}-backup.csv;`
);
const cursor = MongoDatasetData.find<DataItemType>( const cursor = MongoDatasetData.find<DataItemType>(
{ {

View File

@@ -1,6 +1,7 @@
import { import {
ChunkSettingModeEnum, type ChunkSettingModeEnum,
DatasetCollectionDataProcessModeEnum type DataChunkSplitModeEnum,
type DatasetCollectionDataProcessModeEnum
} from '@fastgpt/global/core/dataset/constants'; } from '@fastgpt/global/core/dataset/constants';
import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants'; import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { rawText2Chunks, readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read'; import { rawText2Chunks, readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read';
@@ -15,21 +16,25 @@ import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
import { import {
computeChunkSize, computeChunkSize,
computeChunkSplitter, computeChunkSplitter,
computeParagraphChunkDeep,
getLLMMaxChunkSize getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils'; } from '@fastgpt/global/core/dataset/training/utils';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common'; import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import { getLLMModel } from '@fastgpt/service/core/ai/model'; import { getLLMModel } from '@fastgpt/service/core/ai/model';
import type { ChunkSettingsType } from '@fastgpt/global/core/dataset/type';
export type PostPreviewFilesChunksProps = ChunkSettingsType & { export type PostPreviewFilesChunksProps = {
datasetId: string; datasetId: string;
type: DatasetSourceReadTypeEnum; type: DatasetSourceReadTypeEnum;
sourceId: string; sourceId: string;
customPdfParse?: boolean; customPdfParse?: boolean;
trainingType: DatasetCollectionDataProcessModeEnum;
// Chunk settings // Chunk settings
chunkSettingMode: ChunkSettingModeEnum;
chunkSplitMode: DataChunkSplitModeEnum;
chunkSize: number;
chunkSplitter?: string;
overlapRatio: number; overlapRatio: number;
// Read params // Read params
@@ -52,15 +57,9 @@ async function handler(
sourceId, sourceId,
customPdfParse = false, customPdfParse = false,
trainingType = DatasetCollectionDataProcessModeEnum.chunk, trainingType,
chunkSettingMode,
chunkTriggerType,
chunkTriggerMinSize,
chunkSettingMode = ChunkSettingModeEnum.auto,
chunkSplitMode, chunkSplitMode,
paragraphChunkDeep,
paragraphChunkMinSize,
chunkSize, chunkSize,
chunkSplitter, chunkSplitter,
@@ -104,16 +103,12 @@ async function handler(
chunkSize, chunkSize,
llmModel: getLLMModel(dataset.agentModel) llmModel: getLLMModel(dataset.agentModel)
}); });
chunkSplitter = computeChunkSplitter({ chunkSplitter = computeChunkSplitter({
chunkSettingMode, chunkSettingMode,
chunkSplitMode, chunkSplitMode,
chunkSplitter chunkSplitter
}); });
paragraphChunkDeep = computeParagraphChunkDeep({
chunkSettingMode,
chunkSplitMode,
paragraphChunkDeep
});
const { rawText } = await readDatasetSourceRawText({ const { rawText } = await readDatasetSourceRawText({
teamId, teamId,
@@ -130,11 +125,7 @@ async function handler(
const chunks = rawText2Chunks({ const chunks = rawText2Chunks({
rawText, rawText,
chunkTriggerType,
chunkTriggerMinSize,
chunkSize, chunkSize,
paragraphChunkDeep,
paragraphChunkMinSize,
maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)), maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
overlapRatio, overlapRatio,
customReg: chunkSplitter ? [chunkSplitter] : [] customReg: chunkSplitter ? [chunkSplitter] : []

View File

@@ -1,4 +1,8 @@
import { type ChatHistoryItemResType, type ChatSchema } from '@fastgpt/global/core/chat/type'; import {
type AIChatItemType,
type ChatHistoryItemResType,
type ChatSchema
} from '@fastgpt/global/core/chat/type';
import { MongoChat } from '@fastgpt/service/core/chat/chatSchema'; import { MongoChat } from '@fastgpt/service/core/chat/chatSchema';
import { type AuthModeType } from '@fastgpt/service/support/permission/type'; import { type AuthModeType } from '@fastgpt/service/support/permission/type';
import { authOutLink } from './outLink'; import { authOutLink } from './outLink';
@@ -8,7 +12,6 @@ import { AuthUserTypeEnum, ReadPermissionVal } from '@fastgpt/global/support/per
import { authApp } from '@fastgpt/service/support/permission/app/auth'; import { authApp } from '@fastgpt/service/support/permission/app/auth';
import { MongoChatItem } from '@fastgpt/service/core/chat/chatItemSchema'; import { MongoChatItem } from '@fastgpt/service/core/chat/chatItemSchema';
import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset'; import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset';
import { getFlatAppResponses } from '@/global/core/chat/utils';
/* /*
检查chat的权限 检查chat的权限
@@ -218,7 +221,18 @@ export const authCollectionInChat = async ({
if (!chatItem) return Promise.reject(DatasetErrEnum.unAuthDatasetCollection); if (!chatItem) return Promise.reject(DatasetErrEnum.unAuthDatasetCollection);
// 找 responseData 里,是否有该文档 id // 找 responseData 里,是否有该文档 id
const flatResData = getFlatAppResponses(chatItem.responseData || []); const responseData = chatItem.responseData || [];
const flatResData: ChatHistoryItemResType[] =
responseData
?.map((item) => {
return [
item,
...(item.pluginDetail || []),
...(item.toolDetail || []),
...(item.loopDetail || [])
];
})
.flat() || [];
const quoteListSet = new Set( const quoteListSet = new Set(
flatResData flatResData

View File

@@ -16,6 +16,7 @@ import type {
ApiDatasetCreateDatasetCollectionParams, ApiDatasetCreateDatasetCollectionParams,
CreateDatasetCollectionParams, CreateDatasetCollectionParams,
CreateDatasetCollectionTagParams, CreateDatasetCollectionTagParams,
CsvTableCreateDatasetCollectionParams,
DatasetUpdateBody, DatasetUpdateBody,
ExternalFileCreateDatasetCollectionParams, ExternalFileCreateDatasetCollectionParams,
FileIdCreateDatasetCollectionParams, FileIdCreateDatasetCollectionParams,

View File

@@ -1,6 +1,5 @@
import { it, expect } from 'vitest'; // 必须显式导入 import { it, expect } from 'vitest'; // 必须显式导入
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter'; import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import fs from 'fs';
const simpleChunks = (chunks: string[]) => { const simpleChunks = (chunks: string[]) => {
return chunks.map((chunk) => chunk.replace(/\s+/g, '')); return chunks.map((chunk) => chunk.replace(/\s+/g, ''));
@@ -635,83 +634,9 @@ it(`Test splitText2Chunks 9`, () => {
| 10012 | 杨一 | 34 | 程序员 | 厦门 | | 10012 | 杨一 | 34 | 程序员 | 厦门 |
`, `,
result: [ result: [
`测试的呀,第一个表格 '测试的呀,第一个表格\n\n| 序号 | 姓名 | 年龄 | 职业 | 城市 |\n| --- | --- | --- | --- | --- |\n| 1 | 张三 | 25 | 工程师 | 北京 |\n| 2 | 李四 | 30 | 教师 | 上海 |\n| 3 | 王五 | 28 | 医生 | 广州 |\n| 6 | 周八 | 32 | 会计 | 成都 |\n| 4 | 赵六 | 35 | 律师 | 深圳 |\n| 5 | 孙七 | 27 | 设计师 | 杭州 |\n| 6 | 周八 | 32 | 会计 | 成都 |\n| 6 | 周八 | 32 | 会计 | 成都 |\n| 7 | 吴九 | 29 | 销售 | 武汉 |\n| 8 | 郑十 | 31 | 记者 | 南京 |\n| 9 | 刘一 | 33 | 建筑师 | 天津 |\n| 10 | 陈二 | 26 | 程序员 | 重庆 |\n| 1000 | 黄末 | 28 | 作家 | 厦门 |\n| 1001 | 杨一 | 34 | 程序员 | 厦门 |\n| 1002 | 杨二 | 34 | 程序员 | 厦门 |\n| 1003 | 杨三 | 34 | 程序员 | 厦门 |',
'| 序号 | 姓名 | 年龄 | 职业 | 城市 |\n| --- | --- | --- | --- | --- |\n| 6 | 周八 | 32 | 会计 | 成都 |\n| 1004 | 杨四 | 34 | 程序员 | 厦门 |\n| 1005 | 杨五 | 34 | 程序员 | 厦门 |\n| 1000 | 黄末 | 28 | 作家 | 厦门 |\n| 1000 | 黄末 | 28 | 作家 | 厦门 |\n| 1000 | 黄末 | 28 | 作家 | 厦门 |\n| 9 | 刘一 | 33 | 建筑师 | 天津 |\n| 10 | 陈二 | 26 | 程序员 | 重庆 |\n| 1000 | 黄末 | 28 | 作家 | 厦门 |\n| 1001 | 杨一 | 34 | 程序员 | 厦门 |\n| 1002 | 杨二 | 34 | 程序员 | 厦门 |\n| 1003 | 杨三 | 34 | 程序员 | 厦门 |\n| 1004 | 杨四 | 34 | 程序员 | 厦门 |\n| 1005 | 杨五 | 34 | 程序员 | 厦门 |\n\n| 序号 | 姓名 | 年龄 | 职业 | 城市 |\n| --- | --- | --- | --- | --- |\n| 6 | 周八 | 32 | 会计 | 成都 |\n| 1000 | 黄末 | 28 | 作家 | 厦门 |\n| 1000 | 黄末 | 28 | 作家 | 厦门 |\n| 1000 | 黄末 | 28 | 作家 | 厦门 |',
| 序号 | 姓名 | 年龄 | 职业 | 城市 | '这是第二段了,第二表格\n\n| 序号 | 姓名 | 年龄 | 职业 | 城市 |\n| --- | --- | --- | --- | --- |\n| 1 | 张三 | 25 | 工程师 | 北京 |\n| 6 | 周八 | 32 | 会计 | 成都 |\n| 2 | 李四 | 30 | 教师 | 上海 |\n| 3 | 王五 | 28 | 医生 | 广州 |\n| 4 | 赵六 | 35 | 律师 | 深圳 |\n| 5 | 孙七 | 27 | 设计师 | 杭州 |\n| 6 | 周八 | 32 | 会计 | 成都 |\n| 7 | 吴九 | 29 | 销售 | 武汉 |\n| 8 | 郑十 | 31 | 记者 | 南京 |\n| 9 | 刘一 | 33 | 建筑师 | 天津 |\n| 10 | 陈二 | 26 | 程序员 | 重庆 |\n| 10004 | 黄末 | 28 | 作家 | 厦门 |\n| 10013 | 杨一 | 34 | 程序员 | 厦门 |\n\n\n结束了\n\n| 序号22 | 姓名 | 年龄 | 职业 | 城市 |\n| --- | --- | --- | --- | --- |\n| 1 | 张三 | 25 | 工程师 | 北京 |\n| 2 | 李四 | 30 | 教师 | 上海 |\n| 3 | 王五 | 28 | 医生 | 广州 |\n| 4 | 赵六 | 35 | 律师 | 深圳 |\n| 5 | 孙七 | 27 | 设计师 | 杭州 |\n| 6 | 周八 | 32 | 会计 | 成都 |\n| 6 | 周八 | 32 | 会计 | 成都 |\n| 7 | 吴九 | 29 | 销售 | 武汉 |\n| 8 | 郑十 | 31 | 记者 | 南京 |\n| 9 | 刘一 | 33 | 建筑师 | 天津 |\n| 10 | 陈二 | 26 | 程序员 | 重庆 |\n| 10002 | 黄末 | 28 | 作家 | 厦门 |\n| 10012 | 杨一 | 34 | 程序员 | 厦门 |'
| --- | --- | --- | --- | --- |
| 1 | 张三 | 25 | 工程师 | 北京 |
| 2 | 李四 | 30 | 教师 | 上海 |
| 3 | 王五 | 28 | 医生 | 广州 |
| 6 | 周八 | 32 | 会计 | 成都 |
| 4 | 赵六 | 35 | 律师 | 深圳 |
| 5 | 孙七 | 27 | 设计师 | 杭州 |
| 6 | 周八 | 32 | 会计 | 成都 |
| 6 | 周八 | 32 | 会计 | 成都 |
| 7 | 吴九 | 29 | 销售 | 武汉 |
| 8 | 郑十 | 31 | 记者 | 南京 |
| 9 | 刘一 | 33 | 建筑师 | 天津 |
| 10 | 陈二 | 26 | 程序员 | 重庆 |
| 1000 | 黄末 | 28 | 作家 | 厦门 |
| 1001 | 杨一 | 34 | 程序员 | 厦门 |
| 1002 | 杨二 | 34 | 程序员 | 厦门 |
| 1003 | 杨三 | 34 | 程序员 | 厦门 |`,
`| 序号 | 姓名 | 年龄 | 职业 | 城市 |
| --- | --- | --- | --- | --- |
| 6 | 周八 | 32 | 会计 | 成都 |
| 1004 | 杨四 | 34 | 程序员 | 厦门 |
| 1005 | 杨五 | 34 | 程序员 | 厦门 |
| 1000 | 黄末 | 28 | 作家 | 厦门 |
| 1000 | 黄末 | 28 | 作家 | 厦门 |
| 1000 | 黄末 | 28 | 作家 | 厦门 |
| 9 | 刘一 | 33 | 建筑师 | 天津 |
| 10 | 陈二 | 26 | 程序员 | 重庆 |
| 1000 | 黄末 | 28 | 作家 | 厦门 |
| 1001 | 杨一 | 34 | 程序员 | 厦门 |
| 1002 | 杨二 | 34 | 程序员 | 厦门 |
| 1003 | 杨三 | 34 | 程序员 | 厦门 |
| 1004 | 杨四 | 34 | 程序员 | 厦门 |
| 1005 | 杨五 | 34 | 程序员 | 厦门 |`,
`| 序号 | 姓名 | 年龄 | 职业 | 城市 |
| --- | --- | --- | --- | --- |
| 6 | 周八 | 32 | 会计 | 成都 |
| 1000 | 黄末 | 28 | 作家 | 厦门 |
| 1000 | 黄末 | 28 | 作家 | 厦门 |
| 1000 | 黄末 | 28 | 作家 | 厦门 |
这是第二段了,第二表格
| 序号 | 姓名 | 年龄 | 职业 | 城市 |
| --- | --- | --- | --- | --- |
| 1 | 张三 | 25 | 工程师 | 北京 |
| 6 | 周八 | 32 | 会计 | 成都 |
| 2 | 李四 | 30 | 教师 | 上海 |
| 3 | 王五 | 28 | 医生 | 广州 |
| 4 | 赵六 | 35 | 律师 | 深圳 |
| 5 | 孙七 | 27 | 设计师 | 杭州 |
| 6 | 周八 | 32 | 会计 | 成都 |
| 7 | 吴九 | 29 | 销售 | 武汉 |
| 8 | 郑十 | 31 | 记者 | 南京 |
| 9 | 刘一 | 33 | 建筑师 | 天津 |
| 10 | 陈二 | 26 | 程序员 | 重庆 |
| 10004 | 黄末 | 28 | 作家 | 厦门 |
| 10013 | 杨一 | 34 | 程序员 | 厦门 |`,
`结束了
| 序号22 | 姓名 | 年龄 | 职业 | 城市 |
| --- | --- | --- | --- | --- |
| 1 | 张三 | 25 | 工程师 | 北京 |
| 2 | 李四 | 30 | 教师 | 上海 |
| 3 | 王五 | 28 | 医生 | 广州 |
| 4 | 赵六 | 35 | 律师 | 深圳 |
| 5 | 孙七 | 27 | 设计师 | 杭州 |
| 6 | 周八 | 32 | 会计 | 成都 |
| 6 | 周八 | 32 | 会计 | 成都 |
| 7 | 吴九 | 29 | 销售 | 武汉 |
| 8 | 郑十 | 31 | 记者 | 南京 |
| 9 | 刘一 | 33 | 建筑师 | 天津 |
| 10 | 陈二 | 26 | 程序员 | 重庆 |
| 10002 | 黄末 | 28 | 作家 | 厦门 |
| 10012 | 杨一 | 34 | 程序员 | 厦门 |`
] ]
}; };
@@ -719,91 +644,3 @@ it(`Test splitText2Chunks 9`, () => {
expect(chunks).toEqual(mock.result); expect(chunks).toEqual(mock.result);
}); });
// 段落优化先测试 - 段落深度 0
it(`Test splitText2Chunks 10`, () => {
const mock = {
text: `# A
af da da fda a a
## B
段落 2
### D
段落 3
## E
段落 4`,
result: [
`# A
af da da fda a a
## B
段落 2
### D
段落 3
## E
段落 4`
]
};
const { chunks } = splitText2Chunks({ text: mock.text, chunkSize: 2000, paragraphChunkDeep: 0 });
expect(chunks).toEqual(mock.result);
});
// 段落优化先测试 - 段落深度 1
it(`Test splitText2Chunks 11`, () => {
const mock = {
text: `# A
af da da fda a a
## B
段落 2
### D
段落 3
## E
段落 4`,
result: [
`# A
af da da fda a a
## B
段落 2
### D
段落 3
## E
段落 4`
]
};
const { chunks } = splitText2Chunks({ text: mock.text, chunkSize: 2000, paragraphChunkDeep: 1 });
expect(chunks).toEqual(mock.result);
});
// Paragraph-aware splitting, paragraphChunkDeep = 2.
// Expected output: three chunks - the body directly under `# A`, then one
// chunk per level-2 section (`## B` together with its `### D` child, and
// `## E`), each section chunk starting with the parent heading `# A`.
it(`Test splitText2Chunks 12`, () => {
  const text = `# A
af da da fda a a
## B
段落 2
### D
段落 3
## E
段落 4`;
  const expected = [
    `# A
af da da fda a a`,
    `# A
## B
段落 2
### D
段落 3`,
    `# A
## E
段落 4`
  ];

  const { chunks } = splitText2Chunks({ text, chunkSize: 2000, paragraphChunkDeep: 2 });

  expect(chunks).toEqual(expected);
});

View File

@@ -1,380 +0,0 @@
import { it, expect } from 'vitest'; // 必须显式导入
import { rawText2Chunks } from '@fastgpt/service/core/dataset/read';
import { ChunkTriggerConfigTypeEnum } from '@fastgpt/global/core/dataset/constants';
/**
 * Reduces chunks to their `q` text with every whitespace run removed, so that
 * comparisons ignore spacing and line breaks. `a` and `indexes` are not used.
 *
 * @param chunks Chunk objects to normalize.
 * @returns One whitespace-free string per chunk, in input order.
 */
const formatChunks = (
  chunks: {
    q: string;
    a: string;
    indexes?: string[];
  }[]
) => {
  const normalized: string[] = [];
  for (const { q } of chunks) {
    normalized.push(q.replace(/\s+/g, ''));
  }
  return normalized;
};
/**
 * Removes every whitespace run from each expected string, so expectations can
 * be compared without regard to spacing or line breaks.
 *
 * @param result Expected chunk texts.
 * @returns The same strings, in order, with all whitespace removed.
 */
const formatResult = (result: string[]) => {
  const stripWhitespace = (text: string) => text.replace(/\s+/g, '');
  return Array.from(result, (text) => stripWhitespace(text));
};
// maxSize trigger - input shorter than maxSize: not chunked.
// The text is far below maxSize (20000) and the expected output is the whole
// text as a single chunk. Chunks are compared after whitespace is stripped
// (formatChunks / formatResult).
// The title names rawText2Chunks, the function actually under test.
it(`Test rawText2Chunks 1`, () => {
  const mock = {
    text: `# A
af da da fda a a
## B
阿凡撒发生的都是发大水
### c
dsgsgfsgs22
#### D
dsgsgfsgs22
##### E
dsgsgfsgs22sddddddd
`,
    result: [
      `# A
af da da fda a a
## B
阿凡撒发生的都是发大水
### c
dsgsgfsgs22
#### D
dsgsgfsgs22
##### E
dsgsgfsgs22sddddddd`
    ]
  };

  const data = rawText2Chunks({
    rawText: mock.text,
    chunkTriggerType: ChunkTriggerConfigTypeEnum.maxSize,
    chunkTriggerMinSize: 1000,
    maxSize: 20000,
    chunkSize: 512,
    backupParse: false
  });

  expect(formatChunks(data)).toEqual(formatResult(mock.result));
});
// maxSize trigger - input longer than maxSize: chunked.
// With maxSize and chunkTriggerMinSize both set to 10 the text exceeds the
// threshold. Expected output is one chunk per heading, each chunk carrying the
// chain of its ancestor headings above its own heading and body.
// The title names rawText2Chunks, the function actually under test.
it(`Test rawText2Chunks 2`, () => {
  const mock = {
    text: `# A
af da da fda a a
## B
阿凡撒发生的都是发大水
### c
dsgsgfsgs22
#### D
dsgsgfsgs22
##### E
dsgsgfsgs22sddddddd`,
    result: [
      `# A
af da da fda a a`,
      `# A
## B
阿凡撒发生的都是发大水`,
      `# A
## B
### c
dsgsgfsgs22`,
      `# A
## B
### c
#### D
dsgsgfsgs22`,
      `# A
## B
### c
#### D
##### E
dsgsgfsgs22sddddddd`
    ]
  };

  const data = rawText2Chunks({
    rawText: mock.text,
    chunkTriggerType: ChunkTriggerConfigTypeEnum.maxSize,
    chunkTriggerMinSize: 10,
    maxSize: 10,
    chunkSize: 512,
    backupParse: false
  });

  expect(formatChunks(data)).toEqual(formatResult(mock.result));
});
// minSize trigger - input SHORTER than chunkTriggerMinSize: not chunked.
// The text is well below chunkTriggerMinSize (1000) and the expected output is
// the whole text as a single chunk.
// NOTE(review): the previous comment said "greater than the minimum", which
// contradicts the parameters used here.
// The title names rawText2Chunks, the function actually under test.
it(`Test rawText2Chunks 3`, () => {
  const mock = {
    text: `# A
af da da fda a a
## B
阿凡撒发生的都是发大水
### c
dsgsgfsgs22
#### D
dsgsgfsgs22
##### E
dsgsgfsgs22sddddddd`,
    result: [
      `# A
af da da fda a a
## B
阿凡撒发生的都是发大水
### c
dsgsgfsgs22
#### D
dsgsgfsgs22
##### E
dsgsgfsgs22sddddddd`
    ]
  };

  const data = rawText2Chunks({
    rawText: mock.text,
    chunkTriggerType: ChunkTriggerConfigTypeEnum.minSize,
    chunkTriggerMinSize: 1000,
    maxSize: 1000,
    chunkSize: 512,
    backupParse: false
  });

  expect(formatChunks(data)).toEqual(formatResult(mock.result));
});
// minSize trigger - input LONGER than chunkTriggerMinSize: chunked.
// The text exceeds chunkTriggerMinSize (10). Expected output is one chunk per
// heading, each chunk carrying the chain of its ancestor headings.
// NOTE(review): the previous comment said "less than the minimum", which
// contradicts the parameters used here.
// The title names rawText2Chunks, the function actually under test.
it(`Test rawText2Chunks 4`, () => {
  const mock = {
    text: `# A
af da da fda a a
## B
阿凡撒发生的都是发大水
### c
dsgsgfsgs22
#### D
dsgsgfsgs22
##### E
dsgsgfsgs22sddddddd`,
    result: [
      `# A
af da da fda a a`,
      `# A
## B
阿凡撒发生的都是发大水`,
      `# A
## B
### c
dsgsgfsgs22`,
      `# A
## B
### c
#### D
dsgsgfsgs22`,
      `# A
## B
### c
#### D
##### E
dsgsgfsgs22sddddddd`
    ]
  };

  const data = rawText2Chunks({
    rawText: mock.text,
    chunkTriggerType: ChunkTriggerConfigTypeEnum.minSize,
    chunkTriggerMinSize: 10,
    maxSize: 10,
    chunkSize: 512,
    backupParse: false
  });

  expect(formatChunks(data)).toEqual(formatResult(mock.result));
});
// forceChunk trigger - input shorter than both chunkTriggerMinSize (1000) and
// maxSize (10000). The expectation is that the text is still split into one
// chunk per heading, each chunk carrying the chain of its ancestor headings.
// The title names rawText2Chunks, the function actually under test.
it(`Test rawText2Chunks 5`, () => {
  const mock = {
    text: `# A
af da da fda a a
## B
阿凡撒发生的都是发大水
### c
dsgsgfsgs22
#### D
dsgsgfsgs22
##### E
dsgsgfsgs22sddddddd`,
    result: [
      `# A
af da da fda a a`,
      `# A
## B
阿凡撒发生的都是发大水`,
      `# A
## B
### c
dsgsgfsgs22`,
      `# A
## B
### c
#### D
dsgsgfsgs22`,
      `# A
## B
### c
#### D
##### E
dsgsgfsgs22sddddddd`
    ]
  };

  const data = rawText2Chunks({
    rawText: mock.text,
    chunkTriggerType: ChunkTriggerConfigTypeEnum.forceChunk,
    chunkTriggerMinSize: 1000,
    maxSize: 10000,
    chunkSize: 512,
    backupParse: false
  });

  expect(formatChunks(data)).toEqual(formatResult(mock.result));
});
// forceChunk trigger - input longer than chunkTriggerMinSize (10).
// Expected output is one chunk per heading, each chunk carrying the chain of
// its ancestor headings.
// The title names rawText2Chunks, the function actually under test.
it(`Test rawText2Chunks 6`, () => {
  const mock = {
    text: `# A
af da da fda a a
## B
阿凡撒发生的都是发大水
### c
dsgsgfsgs22
#### D
dsgsgfsgs22
##### E
dsgsgfsgs22sddddddd`,
    result: [
      `# A
af da da fda a a`,
      `# A
## B
阿凡撒发生的都是发大水`,
      `# A
## B
### c
dsgsgfsgs22`,
      `# A
## B
### c
#### D
dsgsgfsgs22`,
      `# A
## B
### c
#### D
##### E
dsgsgfsgs22sddddddd`
    ]
  };

  const data = rawText2Chunks({
    rawText: mock.text,
    chunkTriggerType: ChunkTriggerConfigTypeEnum.forceChunk,
    chunkTriggerMinSize: 10,
    maxSize: 10000,
    chunkSize: 512,
    backupParse: false
  });

  expect(formatChunks(data)).toEqual(formatResult(mock.result));
});

View File

@@ -4,7 +4,8 @@ import { FlowNodeTypeEnum } from '@fastgpt/global/core/workflow/node/constant';
import type { ChatItemType } from '@fastgpt/global/core/chat/type'; import type { ChatItemType } from '@fastgpt/global/core/chat/type';
import { import {
transformPreviewHistories, transformPreviewHistories,
addStatisticalDataToHistoryItem addStatisticalDataToHistoryItem,
getFlatAppResponses
} from '@/global/core/chat/utils'; } from '@/global/core/chat/utils';
const mockResponseData = { const mockResponseData = {
@@ -14,6 +15,70 @@ const mockResponseData = {
moduleType: FlowNodeTypeEnum.chatNode moduleType: FlowNodeTypeEnum.chatNode
}; };
// Tests for getFlatAppResponses: empty input, a flat list, one level of
// nesting under each of pluginDetail / toolDetail / loopDetail, and a chain
// that nests through all three.
describe('getFlatAppResponses', () => {
  it('should return empty array for empty input', () => {
    const flattened = getFlatAppResponses([]);

    expect(flattened).toEqual([]);
  });

  it('should handle single level responses', () => {
    const chatNodeResponse = { ...mockResponseData, moduleType: FlowNodeTypeEnum.chatNode };
    const toolsResponse = { ...mockResponseData, moduleType: FlowNodeTypeEnum.tools };
    const responses = [chatNodeResponse, toolsResponse];

    // Without nested details the output is expected to equal the input list.
    expect(getFlatAppResponses(responses)).toEqual(responses);
  });

  it('should handle nested pluginDetail', () => {
    const child = { ...mockResponseData, moduleType: FlowNodeTypeEnum.tools };
    const responses = [{ ...mockResponseData, pluginDetail: [child] }];

    // One parent plus one nested child.
    expect(getFlatAppResponses(responses)).toHaveLength(2);
  });

  it('should handle nested toolDetail', () => {
    const child = { ...mockResponseData, moduleType: FlowNodeTypeEnum.chatNode };
    const responses = [{ ...mockResponseData, toolDetail: [child] }];

    // One parent plus one nested child.
    expect(getFlatAppResponses(responses)).toHaveLength(2);
  });

  it('should handle nested loopDetail', () => {
    const child = { ...mockResponseData, moduleType: FlowNodeTypeEnum.datasetSearchNode };
    const responses = [{ ...mockResponseData, loopDetail: [child] }];

    // One parent plus one nested child.
    expect(getFlatAppResponses(responses)).toHaveLength(2);
  });

  it('should handle multiple levels of nesting', () => {
    // Built inside-out: root -> pluginDetail -> toolDetail -> loopDetail.
    const loopLevel = { ...mockResponseData };
    const toolLevel = { ...mockResponseData, loopDetail: [loopLevel] };
    const pluginLevel = { ...mockResponseData, toolDetail: [toolLevel] };
    const responses = [{ ...mockResponseData, pluginDetail: [pluginLevel] }];

    // Four responses in total, one per nesting level.
    expect(getFlatAppResponses(responses)).toHaveLength(4);
  });
});
describe('transformPreviewHistories', () => { describe('transformPreviewHistories', () => {
it('should transform histories correctly with responseDetail=true', () => { it('should transform histories correctly with responseDetail=true', () => {
const histories: ChatItemType[] = [ const histories: ChatItemType[] = [